@tangle-network/agent-eval 0.52.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/CHANGELOG.md +23 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +7 -6
  5. package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
  6. package/dist/benchmarks/index.d.ts +3 -2
  7. package/dist/builder-eval/index.d.ts +4 -3
  8. package/dist/campaign/index.d.ts +9 -7
  9. package/dist/campaign/index.js +33 -4
  10. package/dist/campaign/index.js.map +1 -1
  11. package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
  12. package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
  13. package/dist/chunk-NCK5QLGT.js.map +1 -0
  14. package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
  15. package/dist/contract/index.d.ts +25 -12
  16. package/dist/contract/index.js +171 -0
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
  19. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
  20. package/dist/control.d.ts +7 -6
  21. package/dist/control.js +2 -2
  22. package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
  23. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
  24. package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
  25. package/dist/governance/index.d.ts +3 -2
  26. package/dist/hosted/index.d.ts +7 -6
  27. package/dist/{index-DQHtWQ57.d.ts → index-D2nT6_KT.d.ts} +66 -2
  28. package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
  29. package/dist/index.d.ts +31 -29
  30. package/dist/index.js +3 -3
  31. package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
  32. package/dist/knowledge/index.d.ts +4 -3
  33. package/dist/meta-eval/index.d.ts +4 -3
  34. package/dist/openapi.json +1 -1
  35. package/dist/pipelines/index.d.ts +7 -6
  36. package/dist/prm/index.d.ts +5 -4
  37. package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
  38. package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
  39. package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
  40. package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
  41. package/dist/reporting.d.ts +7 -6
  42. package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
  43. package/dist/rl.d.ts +11 -10
  44. package/dist/rl.js +2 -2
  45. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
  46. package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
  47. package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
  48. package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
  49. package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
  50. package/dist/store-CKUAgsJz.d.ts +101 -0
  51. package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
  52. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
  53. package/dist/traces.d.ts +7 -6
  54. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
  55. package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
  56. package/dist/wire/index.d.ts +5 -4
  57. package/docs/design/self-improvement-protocol.md +223 -0
  58. package/docs/pilot/README.md +62 -0
  59. package/docs/pilot/customer-checklist.md +90 -0
  60. package/docs/pilot/integration-foreign-stack.md +296 -0
  61. package/docs/pilot/integration-tangle-stack.md +248 -0
  62. package/docs/pilot/one-pager.md +161 -0
  63. package/docs/pilot/sample-insight-report.json +172 -0
  64. package/docs/research/research-roadmap.md +204 -0
  65. package/package.json +1 -1
  66. package/dist/chunk-BWZEGTES.js.map +0 -1
  67. package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
  68. package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0
@@ -1,9 +1,10 @@
1
1
  import { c as DatasetManifest } from '../dataset-BlwAtYYf.js';
2
2
  import { b as CalibrationResult } from '../judge-calibration-DilmB3Ml.js';
3
3
  import { O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
- import { d as RedTeamReport } from '../red-team-30II1T4o.js';
5
- import { T as TraceStore } from '../store-Db2Bv8Cf.js';
4
+ import { d as RedTeamReport } from '../red-team-CrC5MZYd.js';
5
+ import { T as TraceStore } from '../store-CKUAgsJz.js';
6
6
  import '../errors-mje_cKOs.js';
7
+ import '../schema-m0gsnbt3.js';
7
8
 
8
9
  /**
9
10
  * Governance reporting — shared types.
@@ -1,8 +1,9 @@
1
- export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-DQHtWQ57.js';
2
- import '../types-Dbj5gu8n.js';
3
- import '../summary-report-B7gNRX-r.js';
4
- import '../run-record-BGY6bHRh.js';
1
+ export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D2nT6_KT.js';
2
+ import '../types-BgrxOJSf.js';
3
+ import '../summary-report-DLxh4yWk.js';
4
+ import '../run-record-etiCMsUq.js';
5
5
  import '../errors-mje_cKOs.js';
6
- import '../failure-cluster-Cw65_5FY.js';
7
- import '../store-Db2Bv8Cf.js';
6
+ import '../schema-m0gsnbt3.js';
7
+ import '../failure-cluster-CL7IVgkJ.js';
8
+ import '../store-CKUAgsJz.js';
8
9
  import '../judge-calibration-DilmB3Ml.js';
@@ -1,5 +1,5 @@
1
- import { M as MutableSurface, m as GateDecision } from './types-Dbj5gu8n.js';
2
- import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-B7gNRX-r.js';
1
+ import { M as MutableSurface, n as GateDecision } from './types-BgrxOJSf.js';
2
+ import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-DLxh4yWk.js';
3
3
  import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
4
4
 
5
5
  /**
@@ -81,6 +81,17 @@ interface InsightReport {
81
81
  * ActionableSideInfo bag) calls `evaluateReleaseConfidence()` directly;
82
82
  * this summary captures the analyzeRuns-derived axes. */
83
83
  release: ReleaseSummary;
84
+ /** Delta vs a prior period when `baselineRuns` is passed. Per-metric
85
+ * current vs baseline with Welch CI + Cohen's d + significance flag.
86
+ * Answers "did my last change help?" — the customer-conversion question.
87
+ * Surfaced metrics: composite, cost, duration, tokenUsage, plus any
88
+ * per-dimension judge metric present in both windows. */
89
+ priorPeriodComparison?: PriorPeriodComparison;
90
+ /** Model-free failure-mode breakdown from `RunRecord.failureMode`, ranked
91
+ * by count descending. Present when any run carries a `failureMode`.
92
+ * Complements `failureClusters` (LLM-semantic) with the structured tags
93
+ * the harness already recorded — actionable with no analyst wired. */
94
+ failureModes?: FailureModeTally[];
84
95
  /** Top-N actionable recommendations, ranked by priority. The packet's
85
96
  * human-readable layer; the numeric sections are the evidence. */
86
97
  recommendations: Recommendation[];
@@ -175,6 +186,19 @@ interface FailureClusterInsight {
175
186
  }>;
176
187
  totalFailures: number;
177
188
  }
189
+ /** Model-free failure breakdown over the structured `RunRecord.failureMode`
190
+ * enum. Unlike `failureClusters` (semantic, requires an LLM analyst), this
191
+ * is computed directly from the tags the harness already recorded — so a
192
+ * customer ingesting one batch with no judge/analyst still learns which
193
+ * named failure dominates. */
194
+ interface FailureModeTally {
195
+ /** The `failureMode` tag. */
196
+ mode: string;
197
+ /** Number of runs carrying this tag. */
198
+ count: number;
199
+ /** Share of the whole corpus, 0..1. */
200
+ share: number;
201
+ }
178
202
  interface ContaminationInsight {
179
203
  /** Canary phrases that leaked into outputs. */
180
204
  leaks: number;
@@ -217,6 +241,46 @@ interface ReleaseSummary {
217
241
  * consumers can post-process to populate. */
218
242
  issues: string[];
219
243
  }
244
+ interface MetricDelta {
245
+ /** Current-period mean. */
246
+ current: number;
247
+ /** Baseline-period mean. */
248
+ baseline: number;
249
+ /** current - baseline. Positive means improved (or, for cost/duration,
250
+ * the consumer-side interpretation: "higher current" — semantic
251
+ * direction depends on the metric). */
252
+ delta: number;
253
+ /** Welch 95% confidence interval on the delta. Two-sample, unpaired —
254
+ * the baseline and current run sets may have different scenarios. */
255
+ ci95: [number, number];
256
+ /** Welch t-test p-value (two-sided). */
257
+ pValue: number;
258
+ /** Cohen's d (pooled stddev). Effect size, signed. */
259
+ cohensD: number;
260
+ /** Sample sizes. */
261
+ baselineN: number;
262
+ currentN: number;
263
+ /** True when p < 0.05 AND |d| >= 0.2 (small-effect threshold). The
264
+ * conjunction prevents large-effect-but-noisy and significant-but-
265
+ * tiny from triggering recommendations. */
266
+ significant: boolean;
267
+ }
268
+ interface PriorPeriodComparison {
269
+ /** Sample counts. */
270
+ baselineN: number;
271
+ currentN: number;
272
+ /** Optional human-readable label — "vs prior 7 days", "vs v3 release". */
273
+ windowLabel?: string;
274
+ /** Every metric we could compare. Keys: 'composite', 'cost', 'duration',
275
+ * 'tokenUsage' for always-present ones; per-dimension keys when both
276
+ * windows have judge scores on the same dimension. */
277
+ metrics: Record<string, MetricDelta>;
278
+ /** Metric names where current is significantly WORSE than baseline.
279
+ * Direction-aware: for cost/duration, higher current = worse. */
280
+ regressedMetrics: string[];
281
+ /** Metric names where current is significantly BETTER than baseline. */
282
+ improvedMetrics: string[];
283
+ }
220
284
  interface Recommendation {
221
285
  priority: 'critical' | 'high' | 'medium' | 'low';
222
286
  kind: 'ship' | 'hold' | 'investigate' | 'fix' | 'recalibrate' | 'expand-corpus';
@@ -1,4 +1,4 @@
1
- import { a as RunSplitTag } from './run-record-BGY6bHRh.js';
1
+ import { a as RunSplitTag } from './run-record-etiCMsUq.js';
2
2
 
3
3
  /**
4
4
  * Shared types for the reference benchmark wrappers under
package/dist/index.d.ts CHANGED
@@ -1,11 +1,13 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-ojEWkMfJ.js';
2
- import { R as RunRecord } from './run-record-BGY6bHRh.js';
3
- export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DjEgwWNo.js';
2
+ import { R as RunRecord } from './run-record-etiCMsUq.js';
3
+ export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-etiCMsUq.js';
4
4
  import { AxAIService, AxFunction } from '@ax-llm/ax';
5
- import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-LZD0qHEa.js';
6
- export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-LZD0qHEa.js';
7
- import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
8
- export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
5
+ import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-D4AZjxNa.js';
6
+ export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-D4AZjxNa.js';
7
+ import { R as Run$1, S as Span, a as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, h as BudgetSpec, L as LlmSpan } from './schema-m0gsnbt3.js';
8
+ export { E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, M as Message, d as RetrievalSpan, g as RunLayer, f as RunStatus, e as SandboxSpan, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
9
+ import { T as TraceStore, R as RunFilter } from './store-CKUAgsJz.js';
10
+ export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, S as SpanFilter } from './store-CKUAgsJz.js';
9
11
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
10
12
  export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
11
13
  import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
@@ -14,39 +16,39 @@ import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
14
16
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
15
17
  import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
16
18
  export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
17
- import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-8KAs18kY.js';
18
- export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-8KAs18kY.js';
19
+ import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-BSWy0rvH.js';
20
+ export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BSWy0rvH.js';
19
21
  import { TCloud } from '@tangle-network/tcloud';
20
22
  import { z } from 'zod';
21
- export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
23
+ export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
22
24
  import { A as AgentEvalError } from './errors-mje_cKOs.js';
23
25
  export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
24
- import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-BSxqEpu7.js';
25
- export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-BSxqEpu7.js';
26
+ import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-DpUmE90J.js';
27
+ export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DpUmE90J.js';
26
28
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
27
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DSu0DWy8.js';
28
- export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-DSu0DWy8.js';
29
- import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
30
- export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
31
- import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
32
- export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
33
- export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
34
- export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
29
+ import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-B6l5fi7T.js';
30
+ export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-B6l5fi7T.js';
31
+ import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
32
+ export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
33
+ import { T as TraceEmitter } from './emitter-DEZwY14K.js';
34
+ export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
35
+ export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
36
+ export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
35
37
  export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, d as RawProviderEvent, R as RawProviderSink, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
36
- export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
37
- import { a as BaselineReport } from './baseline-4R5deP0N.js';
38
- export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
39
- import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
40
- export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
38
+ export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-CL7IVgkJ.js';
39
+ import { a as BaselineReport } from './baseline-DE36-Np7.js';
40
+ export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-DE36-Np7.js';
41
+ import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
42
+ export { b as buildTrajectory } from './trajectory-GEdXJCL5.js';
41
43
  export { D as DefaultVerdict } from './verdict-CeEgtjyI.js';
42
44
  import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
43
45
  export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
44
46
  export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
45
- export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-30II1T4o.js';
46
- import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
47
+ export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-CrC5MZYd.js';
48
+ import { a as PrmGrader } from './rubric-BOfxn4ja.js';
47
49
  export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
48
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
49
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-B7gNRX-r.js';
50
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-wlaiph9Y.js';
51
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
50
52
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
51
53
  import './outcome-store-D6KWmYvj.js';
52
54
 
package/dist/index.js CHANGED
@@ -74,7 +74,7 @@ import {
74
74
  runProposeReview,
75
75
  runProposeReviewAsControlLoop,
76
76
  scoreFromEvals
77
- } from "./chunk-L7XMNXLO.js";
77
+ } from "./chunk-J4DIMSRK.js";
78
78
  import {
79
79
  allCriticalPassed,
80
80
  objectiveEval,
@@ -92,7 +92,7 @@ import {
92
92
  } from "./chunk-UBQGWD3O.js";
93
93
  import {
94
94
  runEvalCampaign
95
- } from "./chunk-5KSDYBYH.js";
95
+ } from "./chunk-YXTT6GSZ.js";
96
96
  import {
97
97
  AGENT_PROFILE_KINDS,
98
98
  AgentProfileCellValidationError,
@@ -111,7 +111,7 @@ import {
111
111
  validateAgentProfileCell,
112
112
  validateRunRecord,
113
113
  verifyAgentProfileCell
114
- } from "./chunk-BWZEGTES.js";
114
+ } from "./chunk-NCK5QLGT.js";
115
115
  import {
116
116
  evaluateInterimReleaseConfidence,
117
117
  pairedEvalueSequence
@@ -1,6 +1,6 @@
1
1
  import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
2
2
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
3
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
3
+ import { T as TraceStore } from './store-CKUAgsJz.js';
4
4
 
5
5
  /**
6
6
  * Run-completion integrity check — at end of run, verify the expected event
@@ -1,6 +1,7 @@
1
- import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
2
- import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
- import '../store-Db2Bv8Cf.js';
1
+ import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-DuFBYg7A.js';
2
+ import { T as TraceEmitter } from '../emitter-DEZwY14K.js';
3
+ import '../schema-m0gsnbt3.js';
4
+ import '../store-CKUAgsJz.js';
4
5
 
5
6
  type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
6
7
  type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
@@ -1,8 +1,9 @@
1
- import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
1
+ import { T as TraceStore } from '../store-CKUAgsJz.js';
2
+ import { R as Run } from '../schema-m0gsnbt3.js';
2
3
  import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
3
4
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-ByZEC3BX.js';
5
- import '../run-record-BGY6bHRh.js';
5
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-B3qNa4aY.js';
6
+ import '../run-record-etiCMsUq.js';
6
7
  import '../errors-mje_cKOs.js';
7
8
 
8
9
  /**
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.52.0",
5
+ "version": "0.54.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,9 +1,10 @@
1
- import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
2
- export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
3
- import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
4
- import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
5
- export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
6
- import { l as llmSpans } from '../query-DODUYdPg.js';
1
+ import { h as BudgetSpec, R as Run, T as ToolSpan } from '../schema-m0gsnbt3.js';
2
+ import { T as TraceStore, R as RunFilter } from '../store-CKUAgsJz.js';
3
+ export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-CL7IVgkJ.js';
4
+ import { a as TrajectoryStep } from '../trajectory-GEdXJCL5.js';
5
+ import { B as BaselineOptions, a as BaselineReport } from '../baseline-DE36-Np7.js';
6
+ export { c as computeToolUseMetrics } from '../baseline-DE36-Np7.js';
7
+ import { l as llmSpans } from '../query-CqTxMwDw.js';
7
8
 
8
9
  /**
9
10
  * BudgetBreachView — aggregates breach events across the corpus.
@@ -1,7 +1,8 @@
1
- import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
2
- export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
3
- import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
4
- import '../trajectory-CnoBo-JY.js';
1
+ import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-BOfxn4ja.js';
2
+ export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-BOfxn4ja.js';
3
+ import { T as TraceStore } from '../store-CKUAgsJz.js';
4
+ import { S as Span } from '../schema-m0gsnbt3.js';
5
+ import '../trajectory-GEdXJCL5.js';
5
6
 
6
7
  /**
7
8
  * Export PRM-graded traces as training data for downstream reward-model
@@ -1,4 +1,5 @@
1
- import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
1
+ import { L as LlmSpan, J as JudgeSpan, R as Run, F as FailureClass, T as ToolSpan } from './schema-m0gsnbt3.js';
2
+ import { T as TraceStore } from './store-CKUAgsJz.js';
2
3
 
3
4
  /**
4
5
  * Typed query helpers over TraceStore.
@@ -1,5 +1,5 @@
1
1
  import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
2
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
+ import { T as TraceStore } from './store-CKUAgsJz.js';
3
3
 
4
4
  /**
5
5
  * Red-team battery — adversarial scenario corpus with per-category
@@ -1,5 +1,5 @@
1
1
  import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
2
- import { R as RunRecord } from './run-record-BGY6bHRh.js';
2
+ import { R as RunRecord } from './run-record-etiCMsUq.js';
3
3
  import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
4
4
  import { J as JudgeInput } from './types-DhqpAi_z.js';
5
5
 
@@ -1,8 +1,8 @@
1
1
  import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
2
  import { a as JudgeScore } from './types-DhqpAi_z.js';
3
3
  import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
4
- import { m as GateDecision } from './summary-report-B7gNRX-r.js';
5
- import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
4
+ import { m as GateDecision } from './summary-report-DLxh4yWk.js';
5
+ import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
6
6
 
7
7
  /**
8
8
  * Release confidence gate.
@@ -1,13 +1,14 @@
1
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DSu0DWy8.js';
1
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-B3qNa4aY.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-B6l5fi7T.js';
3
3
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
- export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-B7gNRX-r.js';
5
- import './run-record-BGY6bHRh.js';
4
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
5
+ import './run-record-etiCMsUq.js';
6
6
  import './errors-mje_cKOs.js';
7
+ import './schema-m0gsnbt3.js';
7
8
  import './outcome-store-D6KWmYvj.js';
8
9
  import './judge-calibration-DilmB3Ml.js';
9
10
  import './types-DhqpAi_z.js';
10
11
  import '@tangle-network/tcloud';
11
12
  import './dataset-BlwAtYYf.js';
12
- import './failure-cluster-Cw65_5FY.js';
13
- import './store-Db2Bv8Cf.js';
13
+ import './failure-cluster-CL7IVgkJ.js';
14
+ import './store-CKUAgsJz.js';
@@ -1,10 +1,10 @@
1
- import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
1
+ import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-etiCMsUq.js';
2
2
  import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
3
- import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-B7gNRX-r.js';
4
- import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
5
- import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
3
+ import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-DLxh4yWk.js';
4
+ import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
5
+ import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CfXjSqEv.js';
6
6
  import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
7
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
+ import { T as TraceStore } from './store-CKUAgsJz.js';
8
8
 
9
9
  /**
10
10
  * Multi-layer verifier — ordered pipeline of verification layers.
package/dist/rl.d.ts CHANGED
@@ -1,19 +1,20 @@
1
- import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
- import { j as CampaignResult } from './types-Dbj5gu8n.js';
3
- import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-LZD0qHEa.js';
4
- export { r as runEvalCampaign } from './researcher-LZD0qHEa.js';
5
- import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
2
+ import { k as CampaignResult } from './types-BgrxOJSf.js';
3
+ import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-D4AZjxNa.js';
4
+ export { r as runEvalCampaign } from './researcher-D4AZjxNa.js';
5
+ import { S as Span } from './schema-m0gsnbt3.js';
6
+ import { T as TraceStore } from './store-CKUAgsJz.js';
6
7
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
7
8
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
8
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-ByZEC3BX.js';
9
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-B3qNa4aY.js';
9
10
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
10
11
  import './errors-mje_cKOs.js';
11
12
  import './llm-client-BXVRUZyX.js';
12
13
  import './raw-provider-sink-C46HDghv.js';
13
- import './summary-report-B7gNRX-r.js';
14
- import './failure-cluster-Cw65_5FY.js';
15
- import './emitter-DP_cSSiw.js';
16
- import './integrity-CTDhR1Sg.js';
14
+ import './summary-report-DLxh4yWk.js';
15
+ import './failure-cluster-CL7IVgkJ.js';
16
+ import './emitter-DEZwY14K.js';
17
+ import './integrity-CfXjSqEv.js';
17
18
 
18
19
  /**
19
20
  * Test-time compute scaling curves.
package/dist/rl.js CHANGED
@@ -10,8 +10,8 @@ import {
10
10
  } from "./chunk-3RF76KTD.js";
11
11
  import {
12
12
  runEvalCampaign
13
- } from "./chunk-5KSDYBYH.js";
14
- import "./chunk-BWZEGTES.js";
13
+ } from "./chunk-YXTT6GSZ.js";
14
+ import "./chunk-NCK5QLGT.js";
15
15
  import {
16
16
  rubricPredictiveValidity
17
17
  } from "./chunk-YRZ4M5GS.js";
@@ -1,5 +1,6 @@
1
- import { S as Span, T as TraceStore, J as JudgeSpan } from './store-Db2Bv8Cf.js';
2
- import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
1
+ import { S as Span, J as JudgeSpan } from './schema-m0gsnbt3.js';
2
+ import { T as TraceStore } from './store-CKUAgsJz.js';
3
+ import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
3
4
 
4
5
  /**
5
6
  * Process Reward Modeling — per-step rubric grading.
@@ -1,4 +1,4 @@
1
- import { R as RunRecord } from './run-record-BGY6bHRh.js';
1
+ import { R as RunRecord } from './run-record-etiCMsUq.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
3
 
4
4
  /**
@@ -1,7 +1,7 @@
1
- import { S as Scenario, j as CampaignResult, n as GateResult, t as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, k as CampaignTraceWriter, M as MutableSurface, p as GenerationRecord } from './types-Dbj5gu8n.js';
1
+ import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord } from './types-BgrxOJSf.js';
2
2
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
- import { R as RedTeamCase } from './red-team-30II1T4o.js';
4
- import { R as RunRecord } from './run-record-BGY6bHRh.js';
3
+ import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
4
+ import { R as RunRecord } from './run-record-etiCMsUq.js';
5
5
 
6
6
  /**
7
7
  * @experimental
@@ -1,4 +1,5 @@
1
1
  import { V as ValidationError } from './errors-mje_cKOs.js';
2
+ import { F as FailureClass } from './schema-m0gsnbt3.js';
2
3
 
3
4
  type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1';
4
5
  type AgentProfileJson = string | number | boolean | null | AgentProfileJson[] | {
@@ -249,9 +250,16 @@ interface RunRecord {
249
250
  judgeMetadata?: RunJudgeMetadata;
250
251
  /** Per-split scores + raw bag. */
251
252
  outcome: RunOutcome;
252
- /** Categorical failure tag, when the run failed and the harness
253
- * classified it. Free-form string; standard tags live in
254
- * `failure-taxonomy.ts`. */
253
+ /** Canonical, cross-agent failure class drawn from the shared
254
+ * `FAILURE_CLASSES` taxonomy. This is the aggregation key that makes
255
+ * "which failure dominates across the whole fleet" answerable in ONE
256
+ * vocabulary — every agent classifies against the same enum. Producers
257
+ * set it via the substrate classifier; leave unset only when the failure
258
+ * genuinely can't be classified. */
259
+ failureClass?: FailureClass;
260
+ /** Free-form domain-specific failure detail, scoped UNDER `failureClass`
261
+ * (e.g. failureClass='tool_recovery_failure', failureMode='forge_build_unsatisfied').
262
+ * The within-agent drill-down; `failureClass` is the cross-agent key. */
255
263
  failureMode?: string;
256
264
  /** Which split this run was drawn from. */
257
265
  splitTag: RunSplitTag;