npm - @tangle-network/agent-eval - Versions diffs - 0.52.0 → 0.54.0 - Mend

@tangle-network/agent-eval 0.52.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

package/CHANGELOG.md +23 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +7 -6
package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
package/dist/benchmarks/index.d.ts +3 -2
package/dist/builder-eval/index.d.ts +4 -3
package/dist/campaign/index.d.ts +9 -7
package/dist/campaign/index.js +33 -4
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
package/dist/chunk-NCK5QLGT.js.map +1 -0
package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
package/dist/contract/index.d.ts +25 -12
package/dist/contract/index.js +171 -0
package/dist/contract/index.js.map +1 -1
package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
package/dist/control.d.ts +7 -6
package/dist/control.js +2 -2
package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
package/dist/governance/index.d.ts +3 -2
package/dist/hosted/index.d.ts +7 -6
package/dist/{index-DQHtWQ57.d.ts → index-D2nT6_KT.d.ts} +66 -2
package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
package/dist/index.d.ts +31 -29
package/dist/index.js +3 -3
package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
package/dist/knowledge/index.d.ts +4 -3
package/dist/meta-eval/index.d.ts +4 -3
package/dist/openapi.json +1 -1
package/dist/pipelines/index.d.ts +7 -6
package/dist/prm/index.d.ts +5 -4
package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
package/dist/reporting.d.ts +7 -6
package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
package/dist/rl.d.ts +11 -10
package/dist/rl.js +2 -2
package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
package/dist/store-CKUAgsJz.d.ts +101 -0
package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
package/dist/traces.d.ts +7 -6
package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
package/dist/wire/index.d.ts +5 -4
package/docs/design/self-improvement-protocol.md +223 -0
package/docs/pilot/README.md +62 -0
package/docs/pilot/customer-checklist.md +90 -0
package/docs/pilot/integration-foreign-stack.md +296 -0
package/docs/pilot/integration-tangle-stack.md +248 -0
package/docs/pilot/one-pager.md +161 -0
package/docs/pilot/sample-insight-report.json +172 -0
package/docs/research/research-roadmap.md +204 -0
package/package.json +1 -1
package/dist/chunk-BWZEGTES.js.map +0 -1
/package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
/package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0

package/dist/governance/index.d.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 import { c as DatasetManifest } from '../dataset-BlwAtYYf.js';
 import { b as CalibrationResult } from '../judge-calibration-DilmB3Ml.js';
 import { O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
-import { d as RedTeamReport } from '../red-team-30II1T4o.js';
-import { T as TraceStore } from '../store-Db2Bv8Cf.js';
+import { d as RedTeamReport } from '../red-team-CrC5MZYd.js';
+import { T as TraceStore } from '../store-CKUAgsJz.js';
 import '../errors-mje_cKOs.js';
+import '../schema-m0gsnbt3.js';
 /**
  * Governance reporting — shared types.

package/dist/hosted/index.d.ts CHANGED Viewed

@@ -1,8 +1,9 @@
-export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-DQHtWQ57.js';
-import '../types-Dbj5gu8n.js';
-import '../summary-report-B7gNRX-r.js';
-import '../run-record-BGY6bHRh.js';
+export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D2nT6_KT.js';
+import '../types-BgrxOJSf.js';
+import '../summary-report-DLxh4yWk.js';
+import '../run-record-etiCMsUq.js';
 import '../errors-mje_cKOs.js';
-import '../failure-cluster-Cw65_5FY.js';
-import '../store-Db2Bv8Cf.js';
+import '../schema-m0gsnbt3.js';
+import '../failure-cluster-CL7IVgkJ.js';
+import '../store-CKUAgsJz.js';
 import '../judge-calibration-DilmB3Ml.js';

package/dist/{index-DQHtWQ57.d.ts → index-D2nT6_KT.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { M as MutableSurface, m as GateDecision } from './types-Dbj5gu8n.js';
-import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-B7gNRX-r.js';
+import { M as MutableSurface, n as GateDecision } from './types-BgrxOJSf.js';
+import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-DLxh4yWk.js';
 import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
 /**
@@ -81,6 +81,17 @@ interface InsightReport {
      *  ActionableSideInfo bag) calls `evaluateReleaseConfidence()` directly;
      *  this summary captures the analyzeRuns-derived axes. */
     release: ReleaseSummary;
+    /** Delta vs a prior period when `baselineRuns` is passed. Per-metric
+     *  current vs baseline with Welch CI + Cohen's d + significance flag.
+     *  Answers "did my last change help?" — the customer-conversion question.
+     *  Surfaced metrics: composite, cost, duration, tokenUsage, plus any
+     *  per-dimension judge metric present in both windows. */
+    priorPeriodComparison?: PriorPeriodComparison;
+    /** Model-free failure-mode breakdown from `RunRecord.failureMode`, ranked
+     *  by count descending. Present when any run carries a `failureMode`.
+     *  Complements `failureClusters` (LLM-semantic) with the structured tags
+     *  the harness already recorded — actionable with no analyst wired. */
+    failureModes?: FailureModeTally[];
     /** Top-N actionable recommendations, ranked by priority. The packet's
      *  human-readable layer; the numeric sections are the evidence. */
     recommendations: Recommendation[];
@@ -175,6 +186,19 @@ interface FailureClusterInsight {
     }>;
     totalFailures: number;
 }
+/** Model-free failure breakdown over the structured `RunRecord.failureMode`
+ *  enum. Unlike `failureClusters` (semantic, requires an LLM analyst), this
+ *  is computed directly from the tags the harness already recorded — so a
+ *  customer ingesting one batch with no judge/analyst still learns which
+ *  named failure dominates. */
+interface FailureModeTally {
+    /** The `failureMode` tag. */
+    mode: string;
+    /** Number of runs carrying this tag. */
+    count: number;
+    /** Share of the whole corpus, 0..1. */
+    share: number;
+}
 interface ContaminationInsight {
     /** Canary phrases that leaked into outputs. */
     leaks: number;
@@ -217,6 +241,46 @@ interface ReleaseSummary {
      *  consumers can post-process to populate. */
     issues: string[];
 }
+interface MetricDelta {
+    /** Current-period mean. */
+    current: number;
+    /** Baseline-period mean. */
+    baseline: number;
+    /** current - baseline. Positive means improved (or, for cost/duration,
+     *  the consumer-side interpretation: "higher current" — semantic
+     *  direction depends on the metric). */
+    delta: number;
+    /** Welch 95% confidence interval on the delta. Two-sample, unpaired —
+     *  the baseline and current run sets may have different scenarios. */
+    ci95: [number, number];
+    /** Welch t-test p-value (two-sided). */
+    pValue: number;
+    /** Cohen's d (pooled stddev). Effect size, signed. */
+    cohensD: number;
+    /** Sample sizes. */
+    baselineN: number;
+    currentN: number;
+    /** True when p < 0.05 AND |d| >= 0.2 (small-effect threshold). The
+     *  conjunction prevents large-effect-but-noisy and significant-but-
+     *  tiny from triggering recommendations. */
+    significant: boolean;
+}
+interface PriorPeriodComparison {
+    /** Sample counts. */
+    baselineN: number;
+    currentN: number;
+    /** Optional human-readable label — "vs prior 7 days", "vs v3 release". */
+    windowLabel?: string;
+    /** Every metric we could compare. Keys: 'composite', 'cost', 'duration',
+     *  'tokenUsage' for always-present ones; per-dimension keys when both
+     *  windows have judge scores on the same dimension. */
+    metrics: Record<string, MetricDelta>;
+    /** Metric names where current is significantly WORSE than baseline.
+     *  Direction-aware: for cost/duration, higher current = worse. */
+    regressedMetrics: string[];
+    /** Metric names where current is significantly BETTER than baseline. */
+    improvedMetrics: string[];
+}
 interface Recommendation {
     priority: 'critical' | 'high' | 'medium' | 'low';
     kind: 'ship' | 'hold' | 'investigate' | 'fix' | 'recalibrate' | 'expand-corpus';

package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { a as RunSplitTag } from './run-record-BGY6bHRh.js';
+import { a as RunSplitTag } from './run-record-etiCMsUq.js';
 /**
  * Shared types for the reference benchmark wrappers under

package/dist/index.d.ts CHANGED Viewed

@@ -1,11 +1,13 @@
-export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-ojEWkMfJ.js';
-import { R as RunRecord } from './run-record-BGY6bHRh.js';
-export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
+export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DjEgwWNo.js';
+import { R as RunRecord } from './run-record-etiCMsUq.js';
+export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-etiCMsUq.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
-import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-LZD0qHEa.js';
-export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-LZD0qHEa.js';
-import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
-export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
+import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-D4AZjxNa.js';
+export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-D4AZjxNa.js';
+import { R as Run$1, S as Span, a as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, h as BudgetSpec, L as LlmSpan } from './schema-m0gsnbt3.js';
+export { E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, M as Message, d as RetrievalSpan, g as RunLayer, f as RunStatus, e as SandboxSpan, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
+import { T as TraceStore, R as RunFilter } from './store-CKUAgsJz.js';
+export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, S as SpanFilter } from './store-CKUAgsJz.js';
 import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
 export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
 import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
@@ -14,39 +16,39 @@ import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
 export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
 import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
 export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
-import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-8KAs18kY.js';
-export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-8KAs18kY.js';
+import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-BSWy0rvH.js';
+export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BSWy0rvH.js';
 import { TCloud } from '@tangle-network/tcloud';
 import { z } from 'zod';
-export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
+export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
 import { A as AgentEvalError } from './errors-mje_cKOs.js';
 export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
-import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-BSxqEpu7.js';
-export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-BSxqEpu7.js';
+import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-DpUmE90J.js';
+export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DpUmE90J.js';
 export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
-import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DSu0DWy8.js';
-export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-DSu0DWy8.js';
-import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
-export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
-import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
-export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
-export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
-export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
+import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-B6l5fi7T.js';
+export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-B6l5fi7T.js';
+import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
+export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
+import { T as TraceEmitter } from './emitter-DEZwY14K.js';
+export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
+export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
+export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
 export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, d as RawProviderEvent, R as RawProviderSink, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
-export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
-import { a as BaselineReport } from './baseline-4R5deP0N.js';
-export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
-import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
-export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
+export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-CL7IVgkJ.js';
+import { a as BaselineReport } from './baseline-DE36-Np7.js';
+export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-DE36-Np7.js';
+import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
+export { b as buildTrajectory } from './trajectory-GEdXJCL5.js';
 export { D as DefaultVerdict } from './verdict-CeEgtjyI.js';
 import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
 export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
 export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
-export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-30II1T4o.js';
-import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
+export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-CrC5MZYd.js';
+import { a as PrmGrader } from './rubric-BOfxn4ja.js';
 export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
-export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-B7gNRX-r.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-wlaiph9Y.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
 import './outcome-store-D6KWmYvj.js';

package/dist/index.js CHANGED Viewed

@@ -74,7 +74,7 @@ import {
   runProposeReview,
   runProposeReviewAsControlLoop,
   scoreFromEvals
-} from "./chunk-L7XMNXLO.js";
+} from "./chunk-J4DIMSRK.js";
 import {
   allCriticalPassed,
   objectiveEval,
@@ -92,7 +92,7 @@ import {
 } from "./chunk-UBQGWD3O.js";
 import {
   runEvalCampaign
-} from "./chunk-5KSDYBYH.js";
+} from "./chunk-YXTT6GSZ.js";
 import {
   AGENT_PROFILE_KINDS,
   AgentProfileCellValidationError,
@@ -111,7 +111,7 @@ import {
   validateAgentProfileCell,
   validateRunRecord,
   verifyAgentProfileCell
-} from "./chunk-BWZEGTES.js";
+} from "./chunk-NCK5QLGT.js";
 import {
   evaluateInterimReleaseConfidence,
   pairedEvalueSequence

package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
 import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
 import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
-import { T as TraceStore } from './store-Db2Bv8Cf.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
 /**
  * Run-completion integrity check — at end of run, verify the expected event

package/dist/knowledge/index.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
-import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
-import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
-import '../store-Db2Bv8Cf.js';
+import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-DuFBYg7A.js';
+import { T as TraceEmitter } from '../emitter-DEZwY14K.js';
+import '../schema-m0gsnbt3.js';
+import '../store-CKUAgsJz.js';
 type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
 type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';

package/dist/meta-eval/index.d.ts CHANGED Viewed

@@ -1,8 +1,9 @@
-import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
+import { T as TraceStore } from '../store-CKUAgsJz.js';
+import { R as Run } from '../schema-m0gsnbt3.js';
 import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-ByZEC3BX.js';
-import '../run-record-BGY6bHRh.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-B3qNa4aY.js';
+import '../run-record-etiCMsUq.js';
 import '../errors-mje_cKOs.js';
 /**

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.52.0",
+    "version": "0.54.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/pipelines/index.d.ts CHANGED Viewed

@@ -1,9 +1,10 @@
-import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
-export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
-import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
-import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
-export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
-import { l as llmSpans } from '../query-DODUYdPg.js';
+import { h as BudgetSpec, R as Run, T as ToolSpan } from '../schema-m0gsnbt3.js';
+import { T as TraceStore, R as RunFilter } from '../store-CKUAgsJz.js';
+export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-CL7IVgkJ.js';
+import { a as TrajectoryStep } from '../trajectory-GEdXJCL5.js';
+import { B as BaselineOptions, a as BaselineReport } from '../baseline-DE36-Np7.js';
+export { c as computeToolUseMetrics } from '../baseline-DE36-Np7.js';
+import { l as llmSpans } from '../query-CqTxMwDw.js';
 /**
  * BudgetBreachView — aggregates breach events across the corpus.

package/dist/prm/index.d.ts CHANGED Viewed

@@ -1,7 +1,8 @@
-import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
-export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
-import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
-import '../trajectory-CnoBo-JY.js';
+import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-BOfxn4ja.js';
+export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-BOfxn4ja.js';
+import { T as TraceStore } from '../store-CKUAgsJz.js';
+import { S as Span } from '../schema-m0gsnbt3.js';
+import '../trajectory-GEdXJCL5.js';
 /**
  * Export PRM-graded traces as training data for downstream reward-model

package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} RENAMED Viewed

@@ -1,4 +1,5 @@
-import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
+import { L as LlmSpan, J as JudgeSpan, R as Run, F as FailureClass, T as ToolSpan } from './schema-m0gsnbt3.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
 /**
  * Typed query helpers over TraceStore.

package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
 import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
-import { T as TraceStore } from './store-Db2Bv8Cf.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
 /**
  * Red-team battery — adversarial scenario corpus with per-category

package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
 import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
-import { R as RunRecord } from './run-record-BGY6bHRh.js';
+import { R as RunRecord } from './run-record-etiCMsUq.js';
 import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
 import { J as JudgeInput } from './types-DhqpAi_z.js';

package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} RENAMED Viewed

@@ -1,8 +1,8 @@
 import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
 import { a as JudgeScore } from './types-DhqpAi_z.js';
 import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
-import { m as GateDecision } from './summary-report-B7gNRX-r.js';
-import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
+import { m as GateDecision } from './summary-report-DLxh4yWk.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
 /**
  * Release confidence gate.

package/dist/reporting.d.ts CHANGED Viewed

@@ -1,13 +1,14 @@
-export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-DSu0DWy8.js';
+export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-B3qNa4aY.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-B6l5fi7T.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
-export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-B7gNRX-r.js';
-import './run-record-BGY6bHRh.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
+import './run-record-etiCMsUq.js';
 import './errors-mje_cKOs.js';
+import './schema-m0gsnbt3.js';
 import './outcome-store-D6KWmYvj.js';
 import './judge-calibration-DilmB3Ml.js';
 import './types-DhqpAi_z.js';
 import '@tangle-network/tcloud';
 import './dataset-BlwAtYYf.js';
-import './failure-cluster-Cw65_5FY.js';
-import './store-Db2Bv8Cf.js';
+import './failure-cluster-CL7IVgkJ.js';
+import './store-CKUAgsJz.js';

package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} RENAMED Viewed

@@ -1,10 +1,10 @@
-import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
+import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-etiCMsUq.js';
 import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
-import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-B7gNRX-r.js';
-import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
-import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
+import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-DLxh4yWk.js';
+import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
+import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CfXjSqEv.js';
 import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
-import { T as TraceStore } from './store-Db2Bv8Cf.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
 /**
  * Multi-layer verifier — ordered pipeline of verification layers.

package/dist/rl.d.ts CHANGED Viewed

@@ -1,19 +1,20 @@
-import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
-import { j as CampaignResult } from './types-Dbj5gu8n.js';
-import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-LZD0qHEa.js';
-export { r as runEvalCampaign } from './researcher-LZD0qHEa.js';
-import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
+import { k as CampaignResult } from './types-BgrxOJSf.js';
+import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-D4AZjxNa.js';
+export { r as runEvalCampaign } from './researcher-D4AZjxNa.js';
+import { S as Span } from './schema-m0gsnbt3.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
-import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-ByZEC3BX.js';
+import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-B3qNa4aY.js';
 import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
 import './errors-mje_cKOs.js';
 import './llm-client-BXVRUZyX.js';
 import './raw-provider-sink-C46HDghv.js';
-import './summary-report-B7gNRX-r.js';
-import './failure-cluster-Cw65_5FY.js';
-import './emitter-DP_cSSiw.js';
-import './integrity-CTDhR1Sg.js';
+import './summary-report-DLxh4yWk.js';
+import './failure-cluster-CL7IVgkJ.js';
+import './emitter-DEZwY14K.js';
+import './integrity-CfXjSqEv.js';
 /**
  * Test-time compute scaling curves.

package/dist/rl.js CHANGED Viewed

@@ -10,8 +10,8 @@ import {
 } from "./chunk-3RF76KTD.js";
 import {
   runEvalCampaign
-} from "./chunk-5KSDYBYH.js";
-import "./chunk-BWZEGTES.js";
+} from "./chunk-YXTT6GSZ.js";
+import "./chunk-NCK5QLGT.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";

package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} RENAMED Viewed

@@ -1,5 +1,6 @@
-import { S as Span, T as TraceStore, J as JudgeSpan } from './store-Db2Bv8Cf.js';
-import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
+import { S as Span, J as JudgeSpan } from './schema-m0gsnbt3.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
+import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
 /**
  * Process Reward Modeling — per-step rubric grading.

package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { R as RunRecord } from './run-record-BGY6bHRh.js';
+import { R as RunRecord } from './run-record-etiCMsUq.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 /**

package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} RENAMED Viewed

@@ -1,7 +1,7 @@
-import { S as Scenario, j as CampaignResult, n as GateResult, t as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, k as CampaignTraceWriter, M as MutableSurface, p as GenerationRecord } from './types-Dbj5gu8n.js';
+import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord } from './types-BgrxOJSf.js';
 import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
-import { R as RedTeamCase } from './red-team-30II1T4o.js';
-import { R as RunRecord } from './run-record-BGY6bHRh.js';
+import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
+import { R as RunRecord } from './run-record-etiCMsUq.js';
 /**
  * @experimental

package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} RENAMED Viewed

@@ -1,4 +1,5 @@
 import { V as ValidationError } from './errors-mje_cKOs.js';
+import { F as FailureClass } from './schema-m0gsnbt3.js';
 type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1';
 type AgentProfileJson = string | number | boolean | null | AgentProfileJson[] | {
@@ -249,9 +250,16 @@ interface RunRecord {
     judgeMetadata?: RunJudgeMetadata;
     /** Per-split scores + raw bag. */
     outcome: RunOutcome;
-    /** Categorical failure tag, when the run failed and the harness
-     *  classified it. Free-form string; standard tags live in
-     *  `failure-taxonomy.ts`. */
+    /** Canonical, cross-agent failure class drawn from the shared
+     *  `FAILURE_CLASSES` taxonomy. This is the aggregation key that makes
+     *  "which failure dominates across the whole fleet" answerable in ONE
+     *  vocabulary — every agent classifies against the same enum. Producers
+     *  set it via the substrate classifier; leave unset only when the failure
+     *  genuinely can't be classified. */
+    failureClass?: FailureClass;
+    /** Free-form domain-specific failure detail, scoped UNDER `failureClass`
+     *  (e.g. failureClass='tool_recovery_failure', failureMode='forge_build_unsatisfied').
+     *  The within-agent drill-down; `failureClass` is the cross-agent key. */
     failureMode?: string;
     /** Which split this run was drawn from. */
     splitTag: RunSplitTag;