@tangle-network/agent-eval 0.59.1 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/http.js +1 -1
  4. package/dist/adapters/langchain.d.ts +1 -1
  5. package/dist/adapters/langchain.js +1 -1
  6. package/dist/adapters/otel.d.ts +5 -5
  7. package/dist/adapters/otel.js +1 -1
  8. package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
  9. package/dist/benchmarks/index.d.ts +3 -3
  10. package/dist/benchmarks/index.js +2 -2
  11. package/dist/builder-eval/index.js +3 -3
  12. package/dist/campaign/index.d.ts +153 -9
  13. package/dist/campaign/index.js +229 -23
  14. package/dist/campaign/index.js.map +1 -1
  15. package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
  16. package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
  17. package/dist/chunk-3BFEG2F6.js.map +1 -0
  18. package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
  19. package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
  20. package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
  21. package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
  22. package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
  23. package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
  24. package/dist/{chunk-N4SBKEPJ.js → chunk-GMXHLSLL.js} +107 -2
  25. package/dist/chunk-GMXHLSLL.js.map +1 -0
  26. package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
  27. package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
  28. package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
  29. package/dist/{chunk-74Y2EMNH.js → chunk-OLULBECP.js} +18 -6
  30. package/dist/chunk-OLULBECP.js.map +1 -0
  31. package/dist/chunk-PQV2TKC3.js +27 -0
  32. package/dist/chunk-PQV2TKC3.js.map +1 -0
  33. package/dist/chunk-PZ5AY32C.js +10 -0
  34. package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
  35. package/dist/chunk-SHTXZ4O2.js +113 -0
  36. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  37. package/dist/{chunk-JB4UWIM6.js → chunk-SUGME4OT.js} +266 -15
  38. package/dist/chunk-SUGME4OT.js.map +1 -0
  39. package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
  40. package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
  41. package/dist/cli.js +4 -4
  42. package/dist/contract/index.d.ts +48 -16
  43. package/dist/contract/index.js +59 -19
  44. package/dist/contract/index.js.map +1 -1
  45. package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
  46. package/dist/control.d.ts +5 -5
  47. package/dist/control.js +4 -4
  48. package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
  49. package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
  50. package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
  51. package/dist/governance/index.d.ts +3 -3
  52. package/dist/governance/index.js +1 -1
  53. package/dist/hosted/index.d.ts +5 -5
  54. package/dist/hosted/index.js +1 -1
  55. package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
  56. package/dist/{index-D2nT6_KT.d.ts → index-D9dwa00f.d.ts} +2 -2
  57. package/dist/index.d.ts +24 -132
  58. package/dist/index.js +23 -36
  59. package/dist/index.js.map +1 -1
  60. package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
  61. package/dist/knowledge/index.js +1 -1
  62. package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
  63. package/dist/matrix/index.js +1 -1
  64. package/dist/meta-eval/index.d.ts +3 -3
  65. package/dist/meta-eval/index.js +1 -1
  66. package/dist/multishot/index.js +1 -1
  67. package/dist/openapi.json +1 -1
  68. package/dist/pipelines/index.js +4 -4
  69. package/dist/prm/index.js +1 -1
  70. package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-D0WeCXt1.d.ts} +208 -6
  71. package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
  72. package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
  73. package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
  74. package/dist/reporting.d.ts +6 -6
  75. package/dist/reporting.js +5 -5
  76. package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
  77. package/dist/rl.d.ts +9 -9
  78. package/dist/rl.js +8 -8
  79. package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
  80. package/dist/run-campaign-HXPJAUZ3.js +10 -0
  81. package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
  82. package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
  83. package/dist/telemetry/file.js +1 -1
  84. package/dist/telemetry/index.js +1 -1
  85. package/dist/traces.d.ts +2 -2
  86. package/dist/traces.js +4 -4
  87. package/dist/{types-BgrxOJSf.d.ts → types-Beb6KPqZ.d.ts} +52 -4
  88. package/dist/wire/index.d.ts +3 -3
  89. package/dist/wire/index.js +4 -4
  90. package/package.json +1 -1
  91. package/dist/chunk-74Y2EMNH.js.map +0 -1
  92. package/dist/chunk-JB4UWIM6.js.map +0 -1
  93. package/dist/chunk-N4SBKEPJ.js.map +0 -1
  94. package/dist/chunk-NSBPE2FW.js +0 -17
  95. package/dist/chunk-QYJT52YW.js.map +0 -1
  96. package/dist/chunk-ZWEQJIM6.js +0 -220
  97. package/dist/chunk-ZWEQJIM6.js.map +0 -1
  98. package/dist/run-campaign-ZURVWMMI.js +0 -10
  99. /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
  100. /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
  101. /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
  102. /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
  103. /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
  104. /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
  105. /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
  106. /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
  107. /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
  108. /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
  109. /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
  110. /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
  111. /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
  112. /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
  113. /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
@@ -1,9 +1,9 @@
1
- import { a as FeedbackLabel, p as ProposedSideEffect } from './feedback-trajectory-DpUmE90J.js';
1
+ import { a as FeedbackLabel, p as ProposedSideEffect } from './feedback-trajectory-8hKC5EOb.js';
2
2
  import { C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig } from './control-runtime-DuFBYg7A.js';
3
3
  import { T as TraceEmitter } from './emitter-DEZwY14K.js';
4
4
  import { F as FailureClass } from './schema-m0gsnbt3.js';
5
5
  import { T as TraceStore } from './store-CKUAgsJz.js';
6
- import { a as RunSplitTag, b as RunTokenUsage, R as RunRecord } from './run-record-etiCMsUq.js';
6
+ import { a as RunSplitTag, b as RunTokenUsage, R as RunRecord } from './run-record-DgUVo5pw.js';
7
7
 
8
8
  interface ActionExecutionPolicy {
9
9
  allowedTypes?: string[];
package/dist/control.d.ts CHANGED
@@ -1,9 +1,9 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DjEgwWNo.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-Bf8owbuG.js';
2
2
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
3
- import './feedback-trajectory-DpUmE90J.js';
4
- import './dataset-BlwAtYYf.js';
5
- import './errors-mje_cKOs.js';
3
+ import './feedback-trajectory-8hKC5EOb.js';
4
+ import './dataset-B2kL-fSM.js';
5
+ import './errors-Dwqw-T_m.js';
6
6
  import './emitter-DEZwY14K.js';
7
7
  import './schema-m0gsnbt3.js';
8
8
  import './store-CKUAgsJz.js';
9
- import './run-record-etiCMsUq.js';
9
+ import './run-record-DgUVo5pw.js';
package/dist/control.js CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  runProposeReview,
5
5
  runProposeReviewAsControlLoop,
6
6
  scoreFromEvals
7
- } from "./chunk-J4DIMSRK.js";
7
+ } from "./chunk-6EKXFFGQ.js";
8
8
  import {
9
9
  allCriticalPassed,
10
10
  objectiveEval,
@@ -13,11 +13,11 @@ import {
13
13
  stopOnRepeatedAction,
14
14
  subjectiveEval
15
15
  } from "./chunk-NCRFYPS3.js";
16
- import "./chunk-NCK5QLGT.js";
16
+ import "./chunk-F3SRAAZO.js";
17
17
  import "./chunk-TVVP3ZZQ.js";
18
18
  import "./chunk-VSMTAMNK.js";
19
- import "./chunk-QYJT52YW.js";
20
- import "./chunk-NSBPE2FW.js";
19
+ import "./chunk-3BFEG2F6.js";
20
+ import "./chunk-PZ5AY32C.js";
21
21
  export {
22
22
  allCriticalPassed,
23
23
  controlRunToRunRecord,
@@ -1,4 +1,4 @@
1
- import { V as ValidationError } from './errors-mje_cKOs.js';
1
+ import { V as ValidationError } from './errors-Dwqw-T_m.js';
2
2
 
3
3
  /**
4
4
  * Dataset — versioned, sliceable, content-hashed scenario collection.
@@ -12,7 +12,7 @@
12
12
  * remain plain `Error`s on purpose — they're programmer-mistake assertions,
13
13
  * not consumer-catchable contract failures.
14
14
  */
15
- type AgentEvalErrorCode = 'validation' | 'not_found' | 'config' | 'capture_integrity' | 'judge' | 'verification' | 'replay' | 'backend_integrity';
15
+ type AgentEvalErrorCode = 'validation' | 'not_found' | 'config' | 'capture_integrity' | 'judge' | 'verification' | 'replay' | 'backend_integrity' | 'profile_matrix';
16
16
  declare class AgentEvalError extends Error {
17
17
  /** Stable string code. Survives minification; safe to switch on. */
18
18
  readonly code: AgentEvalErrorCode;
@@ -1,5 +1,5 @@
1
1
  import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-DuFBYg7A.js';
2
- import { D as DatasetSplit, a as DatasetScenario } from './dataset-BlwAtYYf.js';
2
+ import { D as DatasetSplit, a as DatasetScenario } from './dataset-B2kL-fSM.js';
3
3
 
4
4
  type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
5
5
  type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
@@ -1,9 +1,9 @@
1
- import { c as DatasetManifest } from '../dataset-BlwAtYYf.js';
1
+ import { c as DatasetManifest } from '../dataset-B2kL-fSM.js';
2
2
  import { b as CalibrationResult } from '../judge-calibration-DilmB3Ml.js';
3
3
  import { O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
- import { d as RedTeamReport } from '../red-team-CrC5MZYd.js';
4
+ import { d as RedTeamReport } from '../red-team-DW9Ca_tj.js';
5
5
  import { T as TraceStore } from '../store-CKUAgsJz.js';
6
- import '../errors-mje_cKOs.js';
6
+ import '../errors-Dwqw-T_m.js';
7
7
  import '../schema-m0gsnbt3.js';
8
8
 
9
9
  /**
@@ -6,7 +6,7 @@ import {
6
6
  soc2Report,
7
7
  summarize
8
8
  } from "../chunk-KKHDIONI.js";
9
- import "../chunk-NSBPE2FW.js";
9
+ import "../chunk-PZ5AY32C.js";
10
10
  export {
11
11
  classifyEuAiRisk,
12
12
  euAiActReport,
@@ -1,8 +1,8 @@
1
- export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D2nT6_KT.js';
2
- import '../types-BgrxOJSf.js';
3
- import '../summary-report-DLxh4yWk.js';
4
- import '../run-record-etiCMsUq.js';
5
- import '../errors-mje_cKOs.js';
1
+ export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D9dwa00f.js';
2
+ import '../types-Beb6KPqZ.js';
3
+ import '../summary-report-BQvXpvaR.js';
4
+ import '../run-record-DgUVo5pw.js';
5
+ import '../errors-Dwqw-T_m.js';
6
6
  import '../schema-m0gsnbt3.js';
7
7
  import '../failure-cluster-CL7IVgkJ.js';
8
8
  import '../store-CKUAgsJz.js';
@@ -2,7 +2,7 @@ import {
2
2
  HOSTED_WIRE_VERSION,
3
3
  createHostedClient
4
4
  } from "../chunk-FQK2CCIM.js";
5
- import "../chunk-NSBPE2FW.js";
5
+ import "../chunk-PZ5AY32C.js";
6
6
  export {
7
7
  HOSTED_WIRE_VERSION,
8
8
  createHostedClient
@@ -1,4 +1,4 @@
1
- import { a as RunSplitTag } from './run-record-etiCMsUq.js';
1
+ import { a as RunSplitTag } from './run-record-DgUVo5pw.js';
2
2
 
3
3
  /**
4
4
  * Shared types for the reference benchmark wrappers under
@@ -1,5 +1,5 @@
1
- import { M as MutableSurface, n as GateDecision } from './types-BgrxOJSf.js';
2
- import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-DLxh4yWk.js';
1
+ import { M as MutableSurface, p as GateDecision } from './types-Beb6KPqZ.js';
2
+ import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-BQvXpvaR.js';
3
3
  import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
4
4
 
5
5
  /**
package/dist/index.d.ts CHANGED
@@ -1,38 +1,40 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DjEgwWNo.js';
2
- import { R as RunRecord } from './run-record-etiCMsUq.js';
3
- export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-etiCMsUq.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-Bf8owbuG.js';
2
+ import { R as RunRecord } from './run-record-DgUVo5pw.js';
3
+ export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-DgUVo5pw.js';
4
4
  import { AxAIService, AxFunction } from '@ax-llm/ax';
5
- import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-JP8EvnLv.js';
6
- export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-JP8EvnLv.js';
5
+ import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-BaVsy0sW.js';
6
+ export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-BaVsy0sW.js';
7
7
  import { R as Run$1, S as Span, a as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, h as BudgetSpec, L as LlmSpan } from './schema-m0gsnbt3.js';
8
8
  export { E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, M as Message, d as RetrievalSpan, g as RunLayer, f as RunStatus, e as SandboxSpan, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
9
9
  import { T as TraceStore, R as RunFilter } from './store-CKUAgsJz.js';
10
10
  export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, S as SpanFilter } from './store-CKUAgsJz.js';
11
- import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
12
- export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
11
+ import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
12
+ export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-DbjLfz-K.js';
13
13
  import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
14
14
  export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
15
15
  import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
16
16
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
17
17
  import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
18
18
  export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
19
- import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-DK9kqXvb.js';
20
- export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-DK9kqXvb.js';
19
+ import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-qmbYT3Eo.js';
20
+ export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-qmbYT3Eo.js';
21
21
  import { TCloud } from '@tangle-network/tcloud';
22
22
  import { z } from 'zod';
23
23
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
24
- import { A as AgentEvalError } from './errors-mje_cKOs.js';
25
- export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
26
- import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-DpUmE90J.js';
27
- export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DpUmE90J.js';
24
+ import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
25
+ export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-Dwqw-T_m.js';
26
+ import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-8hKC5EOb.js';
27
+ export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-8hKC5EOb.js';
28
+ import { A as AgentProfile } from './agent-profile-9J9hxdm2.js';
29
+ export { a as BackendIntegrityError, B as BackendIntegrityReport, b as agentProfileHash, c as assertRealBackend, s as summarizeBackendIntegrity } from './agent-profile-9J9hxdm2.js';
28
30
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
29
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DmPjIce3.js';
30
- export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CliffsMagnitude, t as CorpusAgreementOptions, u as CorpusAgreementPerDimension, v as CorpusAgreementReport, x as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, W as WeightedCompositeInput, y as WeightedCompositeResult, l as assertReleaseConfidence, m as benjaminiHochberg, z as bonferroni, n as bootstrapCi, D as cliffsDelta, E as cohensD, F as confidenceInterval, G as corpusInterRaterAgreement, H as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, I as interRaterReliability, K as interpretCliffs, p as judgeReplayGate, L as mannWhitneyU, M as normalizeScores, q as pairedBootstrap, N as pairedMde, O as pairedTTest, Q as partialCredit, r as renderReleaseReport, S as requiredSampleSize, T as weightedComposite, U as weightedMean, w as wilcoxonSignedRank } from './release-report-DmPjIce3.js';
31
+ import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DszkgvJ3.js';
32
+ export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CliffsMagnitude, t as CorpusAgreementOptions, u as CorpusAgreementPerDimension, v as CorpusAgreementReport, x as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, W as WeightedCompositeInput, y as WeightedCompositeResult, l as assertReleaseConfidence, m as benjaminiHochberg, z as bonferroni, n as bootstrapCi, D as cliffsDelta, E as cohensD, F as confidenceInterval, G as corpusInterRaterAgreement, H as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, I as interRaterReliability, K as interpretCliffs, p as judgeReplayGate, L as mannWhitneyU, M as normalizeScores, q as pairedBootstrap, N as pairedMde, O as pairedTTest, Q as partialCredit, r as renderReleaseReport, S as requiredSampleSize, T as weightedComposite, U as weightedMean, w as wilcoxonSignedRank } from './release-report-DszkgvJ3.js';
31
33
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
32
34
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
33
35
  import { T as TraceEmitter } from './emitter-DEZwY14K.js';
34
36
  export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
35
- export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
37
+ export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CJzrpUua.js';
36
38
  export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
37
39
  export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, d as RawProviderEvent, R as RawProviderSink, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
38
40
  export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-CL7IVgkJ.js';
@@ -41,14 +43,14 @@ export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as Tool
41
43
  import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
42
44
  export { b as buildTrajectory } from './trajectory-GEdXJCL5.js';
43
45
  export { D as DefaultVerdict } from './verdict-CeEgtjyI.js';
44
- import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
45
- export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
46
+ import { a as DatasetScenario, b as Dataset } from './dataset-B2kL-fSM.js';
47
+ export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B2kL-fSM.js';
46
48
  export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
47
- export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-CrC5MZYd.js';
49
+ export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-DW9Ca_tj.js';
48
50
  import { a as PrmGrader } from './rubric-BOfxn4ja.js';
49
51
  export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
50
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-wlaiph9Y.js';
51
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
52
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-Bvk35ils.js';
53
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-BQvXpvaR.js';
52
54
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
53
55
  import './outcome-store-D6KWmYvj.js';
54
56
 
@@ -1121,76 +1123,6 @@ interface ExecutorConfig {
1121
1123
  */
1122
1124
  declare function executeScenario(tc: TCloud, scenario: Scenario, config: ExecutorConfig): Promise<ScenarioResult>;
1123
1125
 
1124
- /**
1125
- * Backend-integrity guard: distinguish "agent failed" from "eval ran against
1126
- * a stub / unconfigured backend." Without this guard a canonical eval can
1127
- * silently report `0/N passed` and look like an agent-quality problem when
1128
- * the LLM was never actually called — the failure mode we just hit running
1129
- * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
1130
- * char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
1131
- *
1132
- * The shape:
1133
- *
1134
- * const report = summarizeBackendIntegrity(records)
1135
- * assertRealBackend(records) // throws BackendIntegrityError if 100% stub
1136
- *
1137
- * A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
1138
- * (`costUsd` alone is unreliable — some backends successfully call LLMs but
1139
- * don't propagate pricing, producing real tokens with $0 cost.)
1140
- *
1141
- * Verdicts:
1142
- * - `real` — at least one record has nonzero token usage
1143
- * - `stub` — every record is stub-mode (eval ran blind)
1144
- * - `mixed` — some records real, some stub (partial backend failure;
1145
- * often the 429-cascade or auth-half-failed case)
1146
- */
1147
-
1148
- interface BackendIntegrityReport {
1149
- /** Total records inspected. */
1150
- totalRecords: number;
1151
- /** Records with input=0 AND output=0 (a stub fingerprint). */
1152
- stubRecords: number;
1153
- /** Records with nonzero token usage (real LLM activity). */
1154
- realRecords: number;
1155
- /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */
1156
- uncostedRecords: number;
1157
- /** Sum of input tokens across all records. */
1158
- totalInputTokens: number;
1159
- /** Sum of output tokens across all records. */
1160
- totalOutputTokens: number;
1161
- /** Sum of costUsd across all records. */
1162
- totalCostUsd: number;
1163
- /** Worst-case integrity verdict. */
1164
- verdict: 'real' | 'mixed' | 'stub';
1165
- /** Human-readable diagnosis suitable for terminal output. */
1166
- diagnosis: string;
1167
- }
1168
- /**
1169
- * Error thrown when an integrity assertion fails. Caller can pattern-match
1170
- * by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
1171
- * errors.
1172
- */
1173
- declare class BackendIntegrityError extends AgentEvalError {
1174
- readonly report: BackendIntegrityReport;
1175
- constructor(message: string, report: BackendIntegrityReport);
1176
- }
1177
- /**
1178
- * Inspect a batch of RunRecords and return an integrity report. Pure
1179
- * function — no I/O, no logging. The caller decides what to do with the
1180
- * verdict (print warning, throw, gate CI, etc.).
1181
- */
1182
- declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
1183
- /**
1184
- * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
1185
- * shows zero LLM activity. Non-strict callers can pass `{ allowMixed: false }`
1186
- * to also reject mixed verdicts (recommended for CI gates).
1187
- *
1188
- * Real backends pass through silently.
1189
- */
1190
- declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
1191
- allowMixed?: boolean;
1192
- }): BackendIntegrityReport;
1193
-
1194
1126
  /**
1195
1127
  * Single-backend guard: assert the agent and the rubric judge run through the
1196
1128
  * SAME backend config, so the judge can't silently re-route through a
@@ -2667,46 +2599,6 @@ declare class BudgetGuard {
2667
2599
  get state(): Record<keyof BudgetSpec, number>;
2668
2600
  }
2669
2601
 
2670
- /**
2671
- * @stable
2672
- *
2673
- * AgentProfile — the eval harness's unit of variation.
2674
- *
2675
- * A profile pins everything that changes agent behaviour for a benchmark
2676
- * cell: the model, the active skills, the prompt version, the available
2677
- * tools. Vary the profile — swap a model, add a skill — and re-run the suite
2678
- * to benchmark the change. The scorecard keys a cell on
2679
- * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
2680
- * inside the profile, and two profiles with the same model but different
2681
- * skills are different cells.
2682
- *
2683
- * `agentProfileHash` is the profile's behaviour identity. Two profiles that
2684
- * produce the same agent behaviour share a hash (and a scorecard cell);
2685
- * reordering `skills` or `tools` does not change it; the human-facing `id`
2686
- * label does not affect it.
2687
- */
2688
- interface AgentProfile {
2689
- /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
2690
- id: string;
2691
- /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
2692
- model: string;
2693
- /** Skill ids/versions active in this profile — the primary behaviour lever. */
2694
- skills?: string[];
2695
- /** Prompt version identifier. */
2696
- promptVersion?: string;
2697
- /** Tool ids available to the agent. */
2698
- tools?: string[];
2699
- /** Any other behaviour-bearing knobs that should fingerprint into the hash. */
2700
- metadata?: Record<string, string | number | boolean>;
2701
- }
2702
- /**
2703
- * Deterministic behaviour identity of a profile — a sha256 over the
2704
- * behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
2705
- * `id` label is excluded. Throws on a profile with no `model` — an unkeyable
2706
- * profile must fail loud rather than collapse into a blank-model cell.
2707
- */
2708
- declare function agentProfileHash(profile: AgentProfile): string;
2709
-
2710
2602
  /**
2711
2603
  * Cost tracker — token + USD accounting per scenario and per run.
2712
2604
  *
@@ -5601,4 +5493,4 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
5601
5493
  */
5602
5494
  declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
5603
5495
 
5604
- export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertRealBackend, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
5496
+ export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
package/dist/index.js CHANGED
@@ -1,18 +1,20 @@
1
1
  import {
2
- BackendIntegrityError,
2
+ agentProfileHash
3
+ } from "./chunk-PQV2TKC3.js";
4
+ import {
3
5
  HoldoutAuditor,
4
- assertRealBackend,
5
6
  canaryLeakView,
6
7
  checkBehavioralCanary,
7
8
  checkCanaries,
8
- runBehavioralCanaries,
9
- summarizeBackendIntegrity
10
- } from "./chunk-ZWEQJIM6.js";
9
+ runBehavioralCanaries
10
+ } from "./chunk-SHTXZ4O2.js";
11
11
  import {
12
+ BackendIntegrityError,
12
13
  DEFAULT_MUTATION_PRIMITIVES,
13
14
  DEFAULT_RED_TEAM_CORPUS,
14
15
  Dataset,
15
16
  HoldoutLockedError,
17
+ assertRealBackend,
16
18
  buildReflectionPrompt,
17
19
  hashScenarios,
18
20
  parseReflectionResponse,
@@ -20,13 +22,14 @@ import {
20
22
  redTeamReport,
21
23
  runCanaries,
22
24
  scoreRedTeamOutput,
25
+ summarizeBackendIntegrity,
23
26
  toolNamesForRun
24
- } from "./chunk-N4SBKEPJ.js";
27
+ } from "./chunk-GMXHLSLL.js";
25
28
  import {
26
29
  BENCHMARK_SPLIT_SEED,
27
30
  benchmarks_exports,
28
31
  deterministicSplit
29
- } from "./chunk-MHQPVHXU.js";
32
+ } from "./chunk-6QDKWHLS.js";
30
33
  import {
31
34
  DEFAULT_RULES,
32
35
  classifyFailure,
@@ -34,7 +37,7 @@ import {
34
37
  computeToolUseMetrics,
35
38
  iqr,
36
39
  welchsTTest
37
- } from "./chunk-QDOSODID.js";
40
+ } from "./chunk-3B7Y5AUR.js";
38
41
  import {
39
42
  exportTrainingData,
40
43
  toNdjson
@@ -51,7 +54,7 @@ import {
51
54
  pytestTestParser,
52
55
  runTestGradedScenario,
53
56
  vitestTestParser
54
- } from "./chunk-YTMXBHFM.js";
57
+ } from "./chunk-T375SUOZ.js";
55
58
  import {
56
59
  classifyEuAiRisk,
57
60
  euAiActReport,
@@ -77,7 +80,7 @@ import {
77
80
  runProposeReview,
78
81
  runProposeReviewAsControlLoop,
79
82
  scoreFromEvals
80
- } from "./chunk-J4DIMSRK.js";
83
+ } from "./chunk-6EKXFFGQ.js";
81
84
  import {
82
85
  allCriticalPassed,
83
86
  objectiveEval,
@@ -92,10 +95,10 @@ import {
92
95
  evaluateReleaseConfidence,
93
96
  judgeReplayGate,
94
97
  renderReleaseReport
95
- } from "./chunk-AIXHUIHG.js";
98
+ } from "./chunk-B26KI423.js";
96
99
  import {
97
100
  runEvalCampaign
98
- } from "./chunk-GM476SZU.js";
101
+ } from "./chunk-AIWHLG7J.js";
99
102
  import {
100
103
  AGENT_PROFILE_KINDS,
101
104
  AgentProfileCellValidationError,
@@ -114,7 +117,7 @@ import {
114
117
  validateAgentProfileCell,
115
118
  validateRunRecord,
116
119
  verifyAgentProfileCell
117
- } from "./chunk-NCK5QLGT.js";
120
+ } from "./chunk-F3SRAAZO.js";
118
121
  import {
119
122
  evaluateInterimReleaseConfidence,
120
123
  pairedEvalueSequence
@@ -125,7 +128,7 @@ import {
125
128
  paretoChart,
126
129
  researchReport,
127
130
  summaryTable
128
- } from "./chunk-OLIBRKRD.js";
131
+ } from "./chunk-KX6F6NCG.js";
129
132
  import {
130
133
  benjaminiHochberg,
131
134
  bonferroni,
@@ -152,7 +155,7 @@ import {
152
155
  weightedComposite,
153
156
  weightedMean,
154
157
  wilcoxonSignedRank
155
- } from "./chunk-S3SDD56V.js";
158
+ } from "./chunk-ITBRCT73.js";
156
159
  import {
157
160
  DEFAULT_TRACE_ANALYST_BUDGETS,
158
161
  FileSystemTraceStore,
@@ -189,7 +192,7 @@ import {
189
192
  tokenizeDomainWords,
190
193
  traceAnalystFunctionGroup,
191
194
  traceAnalystOnRunComplete
192
- } from "./chunk-PIEAE33T.js";
195
+ } from "./chunk-Z4ZCBC7M.js";
193
196
  import {
194
197
  DEFAULT_REDACTION_RULES,
195
198
  REDACTION_VERSION,
@@ -219,7 +222,7 @@ import {
219
222
  RunIntegrityError,
220
223
  assertRunCaptured,
221
224
  throwIfRunIncomplete
222
- } from "./chunk-UBPIXOC4.js";
225
+ } from "./chunk-SBCB6VZY.js";
223
226
  import {
224
227
  TraceEmitter,
225
228
  llmSpanFromProvider
@@ -242,7 +245,7 @@ import {
242
245
  isTransientLlmError,
243
246
  probeLlm,
244
247
  stripFencedJson
245
- } from "./chunk-VXNVVBZO.js";
248
+ } from "./chunk-IHDHUN2X.js";
246
249
  import {
247
250
  FileSystemRawProviderSink,
248
251
  InMemoryRawProviderSink,
@@ -259,8 +262,8 @@ import {
259
262
  ReplayError,
260
263
  ValidationError,
261
264
  VerificationError
262
- } from "./chunk-QYJT52YW.js";
263
- import "./chunk-NSBPE2FW.js";
265
+ } from "./chunk-3BFEG2F6.js";
266
+ import "./chunk-PZ5AY32C.js";
264
267
 
265
268
  // src/run-score.ts
266
269
  var DEFAULT_RUN_SCORE_WEIGHTS = {
@@ -5953,22 +5956,6 @@ var BudgetGuard = class {
5953
5956
  }
5954
5957
  };
5955
5958
 
5956
- // src/agent-profile.ts
5957
- import { createHash as createHash2 } from "crypto";
5958
- function agentProfileHash(profile) {
5959
- if (typeof profile.model !== "string" || profile.model.trim().length === 0) {
5960
- throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
5961
- }
5962
- const behaviour = {
5963
- model: profile.model.trim(),
5964
- skills: [...profile.skills ?? []].sort(),
5965
- promptVersion: profile.promptVersion ?? null,
5966
- tools: [...profile.tools ?? []].sort(),
5967
- metadata: profile.metadata ?? {}
5968
- };
5969
- return createHash2("sha256").update(JSON.stringify(canonicalize(behaviour))).digest("hex");
5970
- }
5971
-
5972
5959
  // src/cost-tracker.ts
5973
5960
  var CostTracker = class {
5974
5961
  byScenario = /* @__PURE__ */ new Map();