@tangle-network/agent-eval 0.29.1 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  2. package/dist/benchmarks/index.d.ts +3 -3
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/builder-eval/index.js +2 -2
  5. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  6. package/dist/{chunk-RUI6SIHY.js → chunk-75ZREHD7.js} +4 -4
  7. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  8. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  9. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  10. package/dist/chunk-HIO4UIS5.js.map +1 -0
  11. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  12. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  13. package/dist/chunk-QYJT52YW.js.map +1 -0
  14. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  15. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  16. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  17. package/dist/{chunk-NLMNWKVM.js → chunk-WSI4K3WB.js} +2 -2
  18. package/dist/{chunk-PALJO75S.js → chunk-XEL6UP7C.js} +2 -2
  19. package/dist/{chunk-SZSBQUIJ.js → chunk-Y2CPBYKH.js} +3 -3
  20. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  21. package/dist/cli.js +3 -3
  22. package/dist/{control-rJhEDdpy.d.ts → control-BFpqHFV2.d.ts} +5 -5
  23. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  24. package/dist/control.d.ts +8 -8
  25. package/dist/control.js +3 -3
  26. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  27. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  28. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  29. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  30. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  31. package/dist/governance/index.d.ts +4 -4
  32. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  33. package/dist/{index--fVrWDiR.d.ts → index-TVjRYWRm.d.ts} +1 -1
  34. package/dist/index.d.ts +254 -38
  35. package/dist/index.js +378 -26
  36. package/dist/index.js.map +1 -1
  37. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  38. package/dist/knowledge/index.d.ts +3 -3
  39. package/dist/meta-eval/index.d.ts +4 -4
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +11 -11
  42. package/dist/optimization.js +8 -8
  43. package/dist/pipelines/index.d.ts +6 -6
  44. package/dist/pipelines/index.js +3 -3
  45. package/dist/prm/index.d.ts +4 -4
  46. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  47. package/dist/{release-report-PWhGlpfO.d.ts → release-report-C8r4Vben.d.ts} +3 -3
  48. package/dist/reporting.d.ts +8 -8
  49. package/dist/reporting.js +4 -4
  50. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BmgJ_901.d.ts} +6 -6
  51. package/dist/rl.d.ts +10 -10
  52. package/dist/rl.js +6 -6
  53. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  54. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-Bm-CbN46.d.ts} +1 -1
  55. package/dist/{run-record-CqzahIbx.d.ts → run-record-nYf9x2hU.d.ts} +1 -1
  56. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  57. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-dir7A-eQ.d.ts} +2 -2
  58. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  59. package/dist/traces.d.ts +533 -10
  60. package/dist/traces.js +14 -300
  61. package/dist/traces.js.map +1 -1
  62. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  63. package/dist/wire/index.d.ts +6 -6
  64. package/dist/wire/index.js +3 -3
  65. package/package.json +1 -1
  66. package/dist/chunk-NG236HPC.js.map +0 -1
  67. package/dist/chunk-UW4NOOZI.js.map +0 -1
  68. package/dist/replay-BX5Fm8en.d.ts +0 -529
  69. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  70. /package/dist/{chunk-RUI6SIHY.js.map → chunk-75ZREHD7.js.map} +0 -0
  71. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  72. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  73. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  74. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  75. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  76. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  77. /package/dist/{chunk-NLMNWKVM.js.map → chunk-WSI4K3WB.js.map} +0 -0
  78. /package/dist/{chunk-PALJO75S.js.map → chunk-XEL6UP7C.js.map} +0 -0
  79. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-Y2CPBYKH.js.map} +0 -0
  80. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
package/dist/control.d.ts CHANGED
@@ -1,8 +1,8 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-rJhEDdpy.js';
2
- export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BRdQ0wrx.js';
3
- import './feedback-trajectory-j0nJFgC6.js';
4
- import './dataset-CiK_3LDr.js';
5
- import './errors-BZ9sTdz7.js';
6
- import './emitter-BqjeOvJh.js';
7
- import './store-BP5be6s7.js';
8
- import './run-record-CqzahIbx.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BFpqHFV2.js';
2
+ export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
3
+ import './feedback-trajectory-iATEAHmc.js';
4
+ import './dataset-ueRVTUoY.js';
5
+ import './errors-mje_cKOs.js';
6
+ import './emitter-DP_cSSiw.js';
7
+ import './store-Db2Bv8Cf.js';
8
+ import './run-record-nYf9x2hU.js';
package/dist/control.js CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  runProposeReview,
5
5
  runProposeReviewAsControlLoop,
6
6
  scoreFromEvals
7
- } from "./chunk-PALJO75S.js";
7
+ } from "./chunk-XEL6UP7C.js";
8
8
  import {
9
9
  allCriticalPassed,
10
10
  objectiveEval,
@@ -13,9 +13,9 @@ import {
13
13
  stopOnRepeatedAction,
14
14
  subjectiveEval
15
15
  } from "./chunk-NCRFYPS3.js";
16
- import "./chunk-NLMNWKVM.js";
16
+ import "./chunk-WSI4K3WB.js";
17
17
  import "./chunk-TVVP3ZZQ.js";
18
- import "./chunk-NG236HPC.js";
18
+ import "./chunk-QYJT52YW.js";
19
19
  import "./chunk-PZ5AY32C.js";
20
20
  export {
21
21
  allCriticalPassed,
@@ -1,4 +1,4 @@
1
- import { V as ValidationError } from './errors-BZ9sTdz7.js';
1
+ import { V as ValidationError } from './errors-mje_cKOs.js';
2
2
 
3
3
  /**
4
4
  * Dataset — versioned, sliceable, content-hashed scenario collection.
@@ -1,4 +1,4 @@
1
- import { T as TraceStore, c as RunOutcome, R as Run, S as Span, d as SpanKind, L as LlmSpan, a as ToolSpan, e as RetrievalSpan, J as JudgeSpan, f as SandboxSpan, E as EventKind, b as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-BP5be6s7.js';
1
+ import { T as TraceStore, c as RunOutcome, R as Run, S as Span, d as SpanKind, L as LlmSpan, a as ToolSpan, e as RetrievalSpan, J as JudgeSpan, f as SandboxSpan, E as EventKind, b as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-Db2Bv8Cf.js';
2
2
 
3
3
  /**
4
4
  * TraceEmitter — hierarchical span builder that auto-parents using an
@@ -12,7 +12,7 @@
12
12
  * remain plain `Error`s on purpose — they're programmer-mistake assertions,
13
13
  * not consumer-catchable contract failures.
14
14
  */
15
- type AgentEvalErrorCode = 'validation' | 'not_found' | 'config' | 'capture_integrity' | 'judge' | 'verification' | 'replay';
15
+ type AgentEvalErrorCode = 'validation' | 'not_found' | 'config' | 'capture_integrity' | 'judge' | 'verification' | 'replay' | 'backend_integrity';
16
16
  declare class AgentEvalError extends Error {
17
17
  /** Stable string code. Survives minification; safe to switch on. */
18
18
  readonly code: AgentEvalErrorCode;
@@ -1,4 +1,4 @@
1
- import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
1
+ import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
2
2
 
3
3
  /**
4
4
  * Failure taxonomy — canonical classes + a default classifier.
@@ -1,5 +1,5 @@
1
- import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BRdQ0wrx.js';
2
- import { D as DatasetSplit, a as DatasetScenario } from './dataset-CiK_3LDr.js';
1
+ import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BZ_lVLYW.js';
2
+ import { D as DatasetSplit, a as DatasetScenario } from './dataset-ueRVTUoY.js';
3
3
 
4
4
  type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
5
5
  type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
@@ -1,5 +1,5 @@
1
- export { E as EuRiskClass, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, U as UseCaseSignals, n as classifyEuAiRisk, p as euAiActReport, q as nistAiRmfReport, u as renderMarkdown, x as soc2Report, y as summarize } from '../index-Cgt3DKXr.js';
2
- import '../dataset-CiK_3LDr.js';
3
- import '../errors-BZ9sTdz7.js';
1
+ export { E as EuRiskClass, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, U as UseCaseSignals, n as classifyEuAiRisk, p as euAiActReport, q as nistAiRmfReport, u as renderMarkdown, x as soc2Report, y as summarize } from '../index-DPILdKbP.js';
2
+ import '../dataset-ueRVTUoY.js';
3
+ import '../errors-mje_cKOs.js';
4
4
  import '../outcome-store-D6KWmYvj.js';
5
- import '../store-BP5be6s7.js';
5
+ import '../store-Db2Bv8Cf.js';
@@ -1,6 +1,6 @@
1
- import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-CiK_3LDr.js';
1
+ import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-ueRVTUoY.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
- import { T as TraceStore } from './store-BP5be6s7.js';
3
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
4
4
 
5
5
  /**
6
6
  * Judge calibration — measure judge quality against human gold + bias.
@@ -1,4 +1,4 @@
1
- import { a as RunSplitTag } from './run-record-CqzahIbx.js';
1
+ import { a as RunSplitTag } from './run-record-nYf9x2hU.js';
2
2
 
3
3
  /**
4
4
  * Shared types for the reference benchmark wrappers under
package/dist/index.d.ts CHANGED
@@ -1,46 +1,46 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-rJhEDdpy.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BFpqHFV2.js';
2
2
  import { AxAIService, AxFunction } from '@ax-llm/ax';
3
3
  import { S as Severity, M as MultiLayerVerifier, a as VerifyOptions, L as Layer, b as LayerResult, c as VerifyContext } from './multi-layer-verifier-BNi4-8lR.js';
4
4
  export { F as Finding, d as LayerStatus, V as VerificationReport, g as gradeSemanticStatus } from './multi-layer-verifier-BNi4-8lR.js';
5
- import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, l as RunFilter, L as LlmSpan } from './store-BP5be6s7.js';
6
- export { h as EventFilter, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, j as FileSystemTraceStore, k as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-BP5be6s7.js';
7
- import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-ClDX3KZx.js';
8
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-ClDX3KZx.js';
9
- import { T as TraceAnalysisStore, A as AnalyzeTracesOptions } from './replay-BX5Fm8en.js';
10
- export { g as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, i as OtlpExport, j as OtlpResourceSpans, k as OtlpSpan, R as REDACTION_VERSION, l as RedactionReport, m as RedactionRule, n as ReplayCache, o as ReplayCacheEntry, p as ReplayCacheMissError, q as ReplayCacheStats, r as ReplayFetchOptions, C as createReplayFetch, E as exportRunAsOtlp, F as iterateRawCalls, G as redactString, H as redactValue } from './replay-BX5Fm8en.js';
5
+ import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
6
+ export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
7
+ import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-BmgJ_901.js';
8
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-BmgJ_901.js';
9
+ import { TraceAnalysisStore, AnalyzeTracesOptions } from './traces.js';
10
+ export { AnalyzeTracesInput, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
11
11
  import { TCloud } from '@tangle-network/tcloud';
12
- import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
13
- export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CqzahIbx.js';
12
+ import { R as RunRecord, a as RunSplitTag } from './run-record-nYf9x2hU.js';
13
+ export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-nYf9x2hU.js';
14
14
  import { z } from 'zod';
15
- import { C as ControlEvalResult } from './control-runtime-BRdQ0wrx.js';
16
- export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BRdQ0wrx.js';
17
- import { A as AgentEvalError } from './errors-BZ9sTdz7.js';
18
- export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-BZ9sTdz7.js';
19
- import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-j0nJFgC6.js';
20
- export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-j0nJFgC6.js';
21
- import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-jrSGb2xZ.js';
22
- export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-jrSGb2xZ.js';
15
+ import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
16
+ export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
17
+ import { A as AgentEvalError } from './errors-mje_cKOs.js';
18
+ export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
19
+ import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-iATEAHmc.js';
20
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
21
+ import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-dir7A-eQ.js';
22
+ export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-dir7A-eQ.js';
23
23
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
24
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-PWhGlpfO.js';
25
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-PWhGlpfO.js';
26
- import { a as FailureCluster } from './failure-cluster-D1NZKqYu.js';
27
- export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-D1NZKqYu.js';
28
- import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-Cgt3DKXr.js';
29
- export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-Cgt3DKXr.js';
30
- import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BJ54PDan.js';
31
- export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BJ54PDan.js';
32
- import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
33
- export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-BqjeOvJh.js';
34
- export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, f as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-BAxLGJ9I.js';
35
- export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-BFDT0kX_.js';
36
- import { a as BaselineReport } from './baseline-BwdCXUS8.js';
37
- export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-BwdCXUS8.js';
38
- import { T as Trajectory, a as TrajectoryStep } from './trajectory-BFmveYZt.js';
39
- export { b as buildTrajectory } from './trajectory-BFmveYZt.js';
40
- import { a as DatasetScenario, c as Dataset } from './dataset-CiK_3LDr.js';
41
- export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-CiK_3LDr.js';
42
- import { a as PrmGrader } from './rubric-DgSqjqqj.js';
43
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index--fVrWDiR.js';
24
+ import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-C8r4Vben.js';
25
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-C8r4Vben.js';
26
+ import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
27
+ export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
28
+ import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-DPILdKbP.js';
29
+ export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-DPILdKbP.js';
30
+ import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
31
+ export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
32
+ import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
33
+ export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
34
+ export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, f as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DYR5gWlb.js';
35
+ export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
36
+ import { a as BaselineReport } from './baseline-4R5deP0N.js';
37
+ export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
38
+ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
39
+ export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
40
+ import { a as DatasetScenario, c as Dataset } from './dataset-ueRVTUoY.js';
41
+ export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-ueRVTUoY.js';
42
+ import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
43
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-TVjRYWRm.js';
44
44
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
45
45
  import './outcome-store-D6KWmYvj.js';
46
46
 
@@ -936,6 +936,152 @@ declare const RAW_FINDING_SCHEMA_PROMPT = "Each finding MUST be a JSON object wi
936
936
  */
937
937
  declare function parseRawFinding(row: unknown, log?: (msg: string, fields?: Record<string, unknown>) => void): RawAnalystFinding | null;
938
938
 
939
+ /**
940
+ * Typed `FindingSubject` — the canonical grammar every analyst kind emits.
941
+ *
942
+ * Background: kind actor prompts have always documented a subject grammar
943
+ * (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`) but the
944
+ * LLM was unconstrained — it could emit `subject: "fix the prompt"`
945
+ * (prose) and downstream adapters that routed on `startsWith(...)` would
946
+ * silently skip it. Every per-vertical `ImprovementAdapter` had a
947
+ * routing table that mostly caught nothing.
948
+ *
949
+ * This module fixes that:
950
+ * - `parseFindingSubject(raw)` — returns the typed `FindingSubject`
951
+ * when `raw` matches the grammar, else `null`. Used at the
952
+ * `RawAnalystFindingSchema` boundary so malformed subjects are
953
+ * rejected loudly instead of silently lifted into the registry.
954
+ * - `FindingSubjectKind` — the union of valid locus categories. Each
955
+ * variant carries the typed components downstream adapters resolve
956
+ * against the agent's surface manifest (no string parsing in the
957
+ * adapter).
958
+ * - `FINDING_SUBJECT_GRAMMAR_PROMPT` — single source of truth for the
959
+ * grammar string embedded in kind actor prompts. Drift between
960
+ * prompt and parser is impossible if every kind imports this.
961
+ *
962
+ * The grammar is intentionally NARROW — only loci the substrate's
963
+ * default `ImprovementAdapter` / `KnowledgeAdapter` can act on. A
964
+ * finding with a subject outside this set fails the parser; the kind
965
+ * author either extends the grammar here (and adds adapter routing)
966
+ * or rephrases the prompt to map onto an existing variant.
967
+ *
968
+ * `failure-mode` is the one exception — its subjects are free-form
969
+ * cluster labels, not loci. The schema preserves them as
970
+ * `{ kind: 'cluster', label }` and the adapters skip them (cluster
971
+ * findings are evidence, not actionable mutations).
972
+ */
973
+
974
+ /**
975
+ * Discriminated union of every locus the substrate can route findings to.
976
+ *
977
+ * Adapters narrow on `kind` and use the typed components (no string
978
+ * parsing). Adding a variant here REQUIRES updating the parser, the
979
+ * grammar prompt, and at least one adapter — by design.
980
+ */
981
+ type FindingSubject = {
982
+ kind: 'knowledge.wiki';
983
+ slug: string;
984
+ heading?: string;
985
+ } | {
986
+ kind: 'knowledge.claim';
987
+ topic: string;
988
+ } | {
989
+ kind: 'knowledge.raw';
990
+ sourceId: string;
991
+ } | {
992
+ kind: 'knowledge.stale';
993
+ slug: string;
994
+ } | {
995
+ kind: 'system-prompt';
996
+ section: string;
997
+ } | {
998
+ kind: 'tool-doc';
999
+ tool: string;
1000
+ aspect?: string;
1001
+ } | {
1002
+ kind: 'new-tool';
1003
+ name: string;
1004
+ } | {
1005
+ kind: 'rag';
1006
+ corpus: string;
1007
+ docId: string;
1008
+ } | {
1009
+ kind: 'memory';
1010
+ key: string;
1011
+ } | {
1012
+ kind: 'scaffolding';
1013
+ concern: string;
1014
+ } | {
1015
+ kind: 'output-schema';
1016
+ field: string;
1017
+ } | {
1018
+ kind: 'websearch.outdated';
1019
+ topic: string;
1020
+ } | {
1021
+ kind: 'prior-run-summary';
1022
+ topic: string;
1023
+ } | {
1024
+ kind: 'cluster';
1025
+ label: string;
1026
+ };
1027
+ type FindingSubjectKind = FindingSubject['kind'];
1028
+ declare const FINDING_SUBJECT_KINDS: ReadonlyArray<FindingSubjectKind>;
1029
+ /**
1030
+ * Parse a raw subject string emitted by an analyst kind's actor.
1031
+ *
1032
+ * Returns the typed `FindingSubject` when `raw` matches the grammar,
1033
+ * else `null`. Callers use the `null` return as a signal to either
1034
+ * (a) reject the finding at parse time (kinds that emit typed loci —
1035
+ * knowledge-gap, improvement, knowledge-poisoning) or (b) lift it as
1036
+ * a cluster label (failure-mode).
1037
+ *
1038
+ * Slugs are constrained to `[a-z0-9-]+` (lowercase kebab) to keep file
1039
+ * paths sane downstream. Topics / keys / sections allow any non-empty
1040
+ * string (free-form for the LLM's voice) but get trimmed.
1041
+ *
1042
+ * Empty / whitespace-only inputs return `null`. `undefined` returns
1043
+ * `null`. Both are surfaced by the caller as a rejected subject.
1044
+ */
1045
+ declare function parseFindingSubject(raw: string | null | undefined): FindingSubject | null;
1046
+ /**
1047
+ * Render the parsed subject back to its canonical string form. Inverse
1048
+ * of `parseFindingSubject`; useful when the substrate constructs new
1049
+ * findings programmatically (e.g. for tests, replays, or
1050
+ * `id_basis` carry-forward).
1051
+ */
1052
+ declare function renderFindingSubject(s: FindingSubject): string;
1053
+ /**
1054
+ * The grammar text embedded into kind actor prompts. Kinds opt into
1055
+ * the subset of variants they emit (e.g. `improvement` excludes the
1056
+ * cluster variant; `failure-mode` includes ONLY the cluster variant).
1057
+ *
1058
+ * Drift between prompt and parser is impossible: every kind imports
1059
+ * this constant + the matching `expects` set, and the unit tests below
1060
+ * lock the table to the parser.
1061
+ */
1062
+ declare const FINDING_SUBJECT_GRAMMAR_PROMPT: string;
1063
+ /**
1064
+ * The variants each kind is allowed to emit. Used at the kind factory
1065
+ * boundary so a knowledge-gap finding can't sneak in a `system-prompt:*`
1066
+ * subject (the improvement-analyst's job) and vice versa.
1067
+ *
1068
+ * `failure-mode` is restricted to `cluster` — the only kind that emits
1069
+ * a non-locus subject.
1070
+ */
1071
+ declare const KIND_EXPECTED_SUBJECTS: Record<string, ReadonlyArray<FindingSubjectKind>>;
1072
+ /**
1073
+ * Zod schema that validates a raw subject string and returns the parsed
1074
+ * `FindingSubject`. Embedded in `RawAnalystFindingSchema` via
1075
+ * `transform`, so `subject` arrives at the kind factory either as a
1076
+ * typed locus or as a parse error attached to a single Zod issue.
1077
+ *
1078
+ * Optionality is preserved: subjects ARE optional on the wire (some
1079
+ * findings are descriptive, not actionable). When present, they MUST
1080
+ * parse — emitting a malformed subject is a contract violation, not a
1081
+ * soft signal.
1082
+ */
1083
+ declare const FindingSubjectStringSchema: z.ZodString;
1084
+
939
1085
  /**
940
1086
  * FindingsStore — durable persistence for AnalystFinding rows + a diff
941
1087
  * helper so we can answer "what changed since the last run?" without
@@ -1671,6 +1817,76 @@ declare function integrationInvokeFailedPayload(input: IntegrationInvokeFailureI
1671
1817
  declare function integrationGateEvals(input: IntegrationManifestGateInput): ControlEvalResult[];
1672
1818
  declare function integrationAsi(input: IntegrationManifestGateInput | IntegrationInvokeFailureInput): ActionableSideInfo;
1673
1819
 
1820
+ /**
1821
+ * Backend-integrity guard: distinguish "agent failed" from "eval ran against
1822
+ * a stub / unconfigured backend." Without this guard a canonical eval can
1823
+ * silently report `0/N passed` and look like an agent-quality problem when
1824
+ * the LLM was never actually called — the failure mode we just hit running
1825
+ * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
1826
+ * char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
1827
+ *
1828
+ * The shape:
1829
+ *
1830
+ * const report = summarizeBackendIntegrity(records)
1831
+ * assertRealBackend(records) // throws BackendIntegrityError if 100% stub
1832
+ *
1833
+ * A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
1834
+ * (`costUsd` alone is unreliable — some backends successfully call LLMs but
1835
+ * don't propagate pricing, producing real tokens with $0 cost.)
1836
+ *
1837
+ * Verdicts:
1838
+ * - `real` — every record has nonzero token usage (no stub records)
1839
+ * - `stub` — every record is stub-mode (eval ran blind)
1840
+ * - `mixed` — some records real, some stub (partial backend failure;
1841
+ * often the 429-cascade or auth-half-failed case)
1842
+ */
1843
+
1844
+ interface BackendIntegrityReport {
1845
+ /** Total records inspected. */
1846
+ totalRecords: number;
1847
+ /** Records with input=0 AND output=0 (a stub fingerprint). */
1848
+ stubRecords: number;
1849
+ /** Records with nonzero token usage (real LLM activity). */
1850
+ realRecords: number;
1851
+ /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */
1852
+ uncostedRecords: number;
1853
+ /** Sum of input tokens across all records. */
1854
+ totalInputTokens: number;
1855
+ /** Sum of output tokens across all records. */
1856
+ totalOutputTokens: number;
1857
+ /** Sum of costUsd across all records. */
1858
+ totalCostUsd: number;
1859
+ /** Worst-case integrity verdict. */
1860
+ verdict: 'real' | 'mixed' | 'stub';
1861
+ /** Human-readable diagnosis suitable for terminal output. */
1862
+ diagnosis: string;
1863
+ }
1864
+ /**
1865
+ * Error thrown when an integrity assertion fails. Caller can pattern-match
1866
+ * by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
1867
+ * errors.
1868
+ */
1869
+ declare class BackendIntegrityError extends AgentEvalError {
1870
+ readonly report: BackendIntegrityReport;
1871
+ constructor(message: string, report: BackendIntegrityReport);
1872
+ }
1873
+ /**
1874
+ * Inspect a batch of RunRecords and return an integrity report. Pure
1875
+ * function — no I/O, no logging. The caller decides what to do with the
1876
+ * verdict (print warning, throw, gate CI, etc.).
1877
+ */
1878
+ declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
1879
+ /**
1880
+ * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
1881
+ * shows zero LLM activity. Strict callers can pass `{ allowMixed: false }`
1882
+ * to also reject mixed verdicts (recommended for CI gates).
1883
+ *
1884
+ * Real backends pass through silently.
1885
+ */
1886
+ declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
1887
+ allowMixed?: boolean;
1888
+ }): BackendIntegrityReport;
1889
+
1674
1890
  /**
1675
1891
  * Create a domain expert judge with a configurable domain.
1676
1892
  *
@@ -6027,4 +6243,4 @@ declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
6027
6243
  mode: AggregatorMode;
6028
6244
  }): TrialAggregate;
6029
6245
 
6030
- export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, type CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, 
CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type 
HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, 
Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RouterTransportOpts, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, 
type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type 
WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, 
loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, parseRawFinding, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
6246
+ export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesOptions, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, type CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, 
CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, 
type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, 
MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type 
ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RouterTransportOpts, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type 
VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, assertRealBackend, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, 
inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, parseFindingSubject, parseRawFinding, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };