@tangle-network/agent-eval 0.53.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +7 -6
- package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/builder-eval/index.d.ts +4 -3
- package/dist/campaign/index.d.ts +9 -7
- package/dist/campaign/index.js +33 -4
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
- package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
- package/dist/chunk-NCK5QLGT.js.map +1 -0
- package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
- package/dist/contract/index.d.ts +13 -12
- package/dist/contract/index.js +25 -0
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
- package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
- package/dist/control.d.ts +7 -6
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
- package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
- package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -2
- package/dist/hosted/index.d.ts +7 -6
- package/dist/{index-C7RhhEME.d.ts → index-D2nT6_KT.d.ts} +20 -2
- package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
- package/dist/index.d.ts +31 -29
- package/dist/index.js +3 -3
- package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
- package/dist/knowledge/index.d.ts +4 -3
- package/dist/meta-eval/index.d.ts +4 -3
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.d.ts +7 -6
- package/dist/prm/index.d.ts +5 -4
- package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
- package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
- package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
- package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
- package/dist/reporting.d.ts +7 -6
- package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
- package/dist/rl.d.ts +11 -10
- package/dist/rl.js +2 -2
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
- package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
- package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
- package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
- package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
- package/dist/store-CKUAgsJz.d.ts +101 -0
- package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
- package/dist/traces.d.ts +7 -6
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
- package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
- package/dist/wire/index.d.ts +5 -4
- package/docs/pilot/README.md +62 -0
- package/docs/pilot/customer-checklist.md +90 -0
- package/docs/pilot/integration-foreign-stack.md +296 -0
- package/docs/pilot/integration-tangle-stack.md +248 -0
- package/docs/pilot/one-pager.md +161 -0
- package/docs/pilot/sample-insight-report.json +172 -0
- package/docs/research/research-roadmap.md +204 -0
- package/package.json +1 -1
- package/dist/chunk-BWZEGTES.js.map +0 -1
- package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
- package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { b as RunOutcome, R as Run, S as Span, c as SpanKind, L as LlmSpan, T as ToolSpan, d as RetrievalSpan, J as JudgeSpan, e as SandboxSpan, E as EventKind, a as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './schema-m0gsnbt3.js';
|
|
2
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* TraceEmitter — hierarchical span builder that auto-parents using an
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { R as Run, S as Span,
|
|
1
|
+
import { R as Run, S as Span, a as TraceEvent, F as FailureClass } from './schema-m0gsnbt3.js';
|
|
2
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Failure taxonomy — canonical classes + a default classifier.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BZ_lVLYW.js';
|
|
1
|
+
import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-DuFBYg7A.js';
|
|
2
2
|
import { D as DatasetSplit, a as DatasetScenario } from './dataset-BlwAtYYf.js';
|
|
3
3
|
|
|
4
4
|
type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import { c as DatasetManifest } from '../dataset-BlwAtYYf.js';
|
|
2
2
|
import { b as CalibrationResult } from '../judge-calibration-DilmB3Ml.js';
|
|
3
3
|
import { O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
|
-
import { d as RedTeamReport } from '../red-team-30II1T4o.js';
|
|
5
|
-
import { T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
4
|
+
import { d as RedTeamReport } from '../red-team-CrC5MZYd.js';
|
|
5
|
+
import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
6
6
|
import '../errors-mje_cKOs.js';
|
|
7
|
+
import '../schema-m0gsnbt3.js';
|
|
7
8
|
|
|
8
9
|
/**
|
|
9
10
|
* Governance reporting — shared types.
|
package/dist/hosted/index.d.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-C7RhhEME.js';
|
|
2
|
-
import '../types-Dbj5gu8n.js';
|
|
3
|
-
import '../summary-report-B7gNRX-r.js';
|
|
4
|
-
import '../run-record-BGY6bHRh.js';
|
|
1
|
+
export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D2nT6_KT.js';
|
|
2
|
+
import '../types-BgrxOJSf.js';
|
|
3
|
+
import '../summary-report-DLxh4yWk.js';
|
|
4
|
+
import '../run-record-etiCMsUq.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
|
6
|
-
import '../
|
|
7
|
-
import '../
|
|
6
|
+
import '../schema-m0gsnbt3.js';
|
|
7
|
+
import '../failure-cluster-CL7IVgkJ.js';
|
|
8
|
+
import '../store-CKUAgsJz.js';
|
|
8
9
|
import '../judge-calibration-DilmB3Ml.js';
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { M as MutableSurface,
|
|
2
|
-
import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-
|
|
1
|
+
import { M as MutableSurface, n as GateDecision } from './types-BgrxOJSf.js';
|
|
2
|
+
import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-DLxh4yWk.js';
|
|
3
3
|
import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
@@ -87,6 +87,11 @@ interface InsightReport {
|
|
|
87
87
|
* Surfaced metrics: composite, cost, duration, tokenUsage, plus any
|
|
88
88
|
* per-dimension judge metric present in both windows. */
|
|
89
89
|
priorPeriodComparison?: PriorPeriodComparison;
|
|
90
|
+
/** Model-free failure-mode breakdown from `RunRecord.failureMode`, ranked
|
|
91
|
+
* by count descending. Present when any run carries a `failureMode`.
|
|
92
|
+
* Complements `failureClusters` (LLM-semantic) with the structured tags
|
|
93
|
+
* the harness already recorded — actionable with no analyst wired. */
|
|
94
|
+
failureModes?: FailureModeTally[];
|
|
90
95
|
/** Top-N actionable recommendations, ranked by priority. The packet's
|
|
91
96
|
* human-readable layer; the numeric sections are the evidence. */
|
|
92
97
|
recommendations: Recommendation[];
|
|
@@ -181,6 +186,19 @@ interface FailureClusterInsight {
|
|
|
181
186
|
}>;
|
|
182
187
|
totalFailures: number;
|
|
183
188
|
}
|
|
189
|
+
/** Model-free failure breakdown over the structured `RunRecord.failureMode`
|
|
190
|
+
* enum. Unlike `failureClusters` (semantic, requires an LLM analyst), this
|
|
191
|
+
* is computed directly from the tags the harness already recorded — so a
|
|
192
|
+
* customer ingesting one batch with no judge/analyst still learns which
|
|
193
|
+
* named failure dominates. */
|
|
194
|
+
interface FailureModeTally {
|
|
195
|
+
/** The `failureMode` tag. */
|
|
196
|
+
mode: string;
|
|
197
|
+
/** Number of runs carrying this tag. */
|
|
198
|
+
count: number;
|
|
199
|
+
/** Share of the whole corpus, 0..1. */
|
|
200
|
+
share: number;
|
|
201
|
+
}
|
|
184
202
|
interface ContaminationInsight {
|
|
185
203
|
/** Canary phrases that leaked into outputs. */
|
|
186
204
|
leaks: number;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
2
|
-
import { R as RunRecord } from './run-record-
|
|
3
|
-
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DjEgwWNo.js';
|
|
2
|
+
import { R as RunRecord } from './run-record-etiCMsUq.js';
|
|
3
|
+
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-etiCMsUq.js';
|
|
4
4
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
5
|
-
import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-
|
|
6
|
-
export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-
|
|
7
|
-
import { R as Run$1, S as Span,
|
|
8
|
-
export {
|
|
5
|
+
import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-D4AZjxNa.js';
|
|
6
|
+
export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-D4AZjxNa.js';
|
|
7
|
+
import { R as Run$1, S as Span, a as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, h as BudgetSpec, L as LlmSpan } from './schema-m0gsnbt3.js';
|
|
8
|
+
export { E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, M as Message, d as RetrievalSpan, g as RunLayer, f as RunStatus, e as SandboxSpan, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
|
|
9
|
+
import { T as TraceStore, R as RunFilter } from './store-CKUAgsJz.js';
|
|
10
|
+
export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, S as SpanFilter } from './store-CKUAgsJz.js';
|
|
9
11
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
10
12
|
export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
|
|
11
13
|
import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
|
|
@@ -14,39 +16,39 @@ import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
|
|
|
14
16
|
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
|
|
15
17
|
import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
|
|
16
18
|
export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
|
|
17
|
-
import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-
|
|
18
|
-
export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-
|
|
19
|
+
import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-BSWy0rvH.js';
|
|
20
|
+
export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BSWy0rvH.js';
|
|
19
21
|
import { TCloud } from '@tangle-network/tcloud';
|
|
20
22
|
import { z } from 'zod';
|
|
21
|
-
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-
|
|
23
|
+
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
|
|
22
24
|
import { A as AgentEvalError } from './errors-mje_cKOs.js';
|
|
23
25
|
export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
|
|
24
|
-
import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-
|
|
25
|
-
export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
26
|
+
import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-DpUmE90J.js';
|
|
27
|
+
export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DpUmE90J.js';
|
|
26
28
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
27
|
-
import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-
|
|
28
|
-
export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-
|
|
29
|
-
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-
|
|
30
|
-
export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-
|
|
31
|
-
import { T as TraceEmitter } from './emitter-
|
|
32
|
-
export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-
|
|
33
|
-
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-
|
|
34
|
-
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-
|
|
29
|
+
import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-B6l5fi7T.js';
|
|
30
|
+
export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-B6l5fi7T.js';
|
|
31
|
+
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
|
|
32
|
+
export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
|
|
33
|
+
import { T as TraceEmitter } from './emitter-DEZwY14K.js';
|
|
34
|
+
export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
|
|
35
|
+
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
|
|
36
|
+
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
|
|
35
37
|
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, d as RawProviderEvent, R as RawProviderSink, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
36
|
-
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-
|
|
37
|
-
import { a as BaselineReport } from './baseline-
|
|
38
|
-
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-
|
|
39
|
-
import { T as Trajectory, a as TrajectoryStep } from './trajectory-
|
|
40
|
-
export { b as buildTrajectory } from './trajectory-
|
|
38
|
+
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-CL7IVgkJ.js';
|
|
39
|
+
import { a as BaselineReport } from './baseline-DE36-Np7.js';
|
|
40
|
+
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-DE36-Np7.js';
|
|
41
|
+
import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
|
|
42
|
+
export { b as buildTrajectory } from './trajectory-GEdXJCL5.js';
|
|
41
43
|
export { D as DefaultVerdict } from './verdict-CeEgtjyI.js';
|
|
42
44
|
import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
|
|
43
45
|
export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
|
|
44
46
|
export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
|
|
45
|
-
export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-
|
|
46
|
-
import { a as PrmGrader } from './rubric-
|
|
47
|
+
export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-CrC5MZYd.js';
|
|
48
|
+
import { a as PrmGrader } from './rubric-BOfxn4ja.js';
|
|
47
49
|
export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
|
|
48
|
-
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-
|
|
49
|
-
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-
|
|
50
|
+
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-wlaiph9Y.js';
|
|
51
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
|
|
50
52
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
51
53
|
import './outcome-store-D6KWmYvj.js';
|
|
52
54
|
|
package/dist/index.js
CHANGED
|
@@ -74,7 +74,7 @@ import {
|
|
|
74
74
|
runProposeReview,
|
|
75
75
|
runProposeReviewAsControlLoop,
|
|
76
76
|
scoreFromEvals
|
|
77
|
-
} from "./chunk-L7XMNXLO.js";
|
|
77
|
+
} from "./chunk-J4DIMSRK.js";
|
|
78
78
|
import {
|
|
79
79
|
allCriticalPassed,
|
|
80
80
|
objectiveEval,
|
|
@@ -92,7 +92,7 @@ import {
|
|
|
92
92
|
} from "./chunk-UBQGWD3O.js";
|
|
93
93
|
import {
|
|
94
94
|
runEvalCampaign
|
|
95
|
-
} from "./chunk-5KSDYBYH.js";
|
|
95
|
+
} from "./chunk-YXTT6GSZ.js";
|
|
96
96
|
import {
|
|
97
97
|
AGENT_PROFILE_KINDS,
|
|
98
98
|
AgentProfileCellValidationError,
|
|
@@ -111,7 +111,7 @@ import {
|
|
|
111
111
|
validateAgentProfileCell,
|
|
112
112
|
validateRunRecord,
|
|
113
113
|
verifyAgentProfileCell
|
|
114
|
-
} from "./chunk-BWZEGTES.js";
|
|
114
|
+
} from "./chunk-NCK5QLGT.js";
|
|
115
115
|
import {
|
|
116
116
|
evaluateInterimReleaseConfidence,
|
|
117
117
|
pairedEvalueSequence
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { C as CaptureIntegrityError } from './errors-mje_cKOs.js';
|
|
2
2
|
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
|
3
|
-
import { T as TraceStore } from './store-
|
|
3
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Run-completion integrity check — at end of run, verify the expected event
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
|
|
2
|
-
import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
|
|
3
|
-
import '../
|
|
1
|
+
import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-DuFBYg7A.js';
|
|
2
|
+
import { T as TraceEmitter } from '../emitter-DEZwY14K.js';
|
|
3
|
+
import '../schema-m0gsnbt3.js';
|
|
4
|
+
import '../store-CKUAgsJz.js';
|
|
4
5
|
|
|
5
6
|
type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
|
|
6
7
|
type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
2
|
+
import { R as Run } from '../schema-m0gsnbt3.js';
|
|
2
3
|
import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
3
4
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-
|
|
5
|
-
import '../run-record-
|
|
5
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-B3qNa4aY.js';
|
|
6
|
+
import '../run-record-etiCMsUq.js';
|
|
6
7
|
import '../errors-mje_cKOs.js';
|
|
7
8
|
|
|
8
9
|
/**
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.54.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
import { h as BudgetSpec, R as Run, T as ToolSpan } from '../schema-m0gsnbt3.js';
|
|
2
|
+
import { T as TraceStore, R as RunFilter } from '../store-CKUAgsJz.js';
|
|
3
|
+
export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-CL7IVgkJ.js';
|
|
4
|
+
import { a as TrajectoryStep } from '../trajectory-GEdXJCL5.js';
|
|
5
|
+
import { B as BaselineOptions, a as BaselineReport } from '../baseline-DE36-Np7.js';
|
|
6
|
+
export { c as computeToolUseMetrics } from '../baseline-DE36-Np7.js';
|
|
7
|
+
import { l as llmSpans } from '../query-CqTxMwDw.js';
|
|
7
8
|
|
|
8
9
|
/**
|
|
9
10
|
* BudgetBreachView — aggregates breach events across the corpus.
|
package/dist/prm/index.d.ts
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-
|
|
2
|
-
export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-
|
|
3
|
-
import {
|
|
4
|
-
import '../
|
|
1
|
+
import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-BOfxn4ja.js';
|
|
2
|
+
export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-BOfxn4ja.js';
|
|
3
|
+
import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
4
|
+
import { S as Span } from '../schema-m0gsnbt3.js';
|
|
5
|
+
import '../trajectory-GEdXJCL5.js';
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* Export PRM-graded traces as training data for downstream reward-model
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { L as LlmSpan,
|
|
1
|
+
import { L as LlmSpan, J as JudgeSpan, R as Run, F as FailureClass, T as ToolSpan } from './schema-m0gsnbt3.js';
|
|
2
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Typed query helpers over TraceStore.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
|
|
2
|
-
import { R as RunRecord } from './run-record-
|
|
2
|
+
import { R as RunRecord } from './run-record-etiCMsUq.js';
|
|
3
3
|
import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
|
|
4
4
|
import { J as JudgeInput } from './types-DhqpAi_z.js';
|
|
5
5
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
2
|
import { a as JudgeScore } from './types-DhqpAi_z.js';
|
|
3
3
|
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
|
|
4
|
-
import { m as GateDecision } from './summary-report-
|
|
5
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-
|
|
4
|
+
import { m as GateDecision } from './summary-report-DLxh4yWk.js';
|
|
5
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
|
|
6
6
|
|
|
7
7
|
/**
|
|
8
8
|
* Release confidence gate.
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
|
-
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-
|
|
1
|
+
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-B3qNa4aY.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-B6l5fi7T.js';
|
|
3
3
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
4
|
-
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-
|
|
5
|
-
import './run-record-
|
|
4
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-DLxh4yWk.js';
|
|
5
|
+
import './run-record-etiCMsUq.js';
|
|
6
6
|
import './errors-mje_cKOs.js';
|
|
7
|
+
import './schema-m0gsnbt3.js';
|
|
7
8
|
import './outcome-store-D6KWmYvj.js';
|
|
8
9
|
import './judge-calibration-DilmB3Ml.js';
|
|
9
10
|
import './types-DhqpAi_z.js';
|
|
10
11
|
import '@tangle-network/tcloud';
|
|
11
12
|
import './dataset-BlwAtYYf.js';
|
|
12
|
-
import './failure-cluster-Cw65_5FY.js';
|
|
13
|
-
import './store-
|
|
13
|
+
import './failure-cluster-CL7IVgkJ.js';
|
|
14
|
+
import './store-CKUAgsJz.js';
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-
|
|
1
|
+
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-etiCMsUq.js';
|
|
2
2
|
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
|
|
3
|
-
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-
|
|
4
|
-
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
5
|
-
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
|
|
3
|
+
import { h as ResearchReportOptions, d as ResearchReport, m as GateDecision } from './summary-report-DLxh4yWk.js';
|
|
4
|
+
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DEZwY14K.js';
|
|
5
|
+
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CfXjSqEv.js';
|
|
6
6
|
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
|
7
|
-
import { T as TraceStore } from './store-
|
|
7
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
8
8
|
|
|
9
9
|
/**
|
|
10
10
|
* Multi-layer verifier — ordered pipeline of verification layers.
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,19 +1,20 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-
|
|
2
|
-
import {
|
|
3
|
-
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-
|
|
4
|
-
export { r as runEvalCampaign } from './researcher-
|
|
5
|
-
import { S as Span
|
|
1
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
|
|
2
|
+
import { k as CampaignResult } from './types-BgrxOJSf.js';
|
|
3
|
+
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-D4AZjxNa.js';
|
|
4
|
+
export { r as runEvalCampaign } from './researcher-D4AZjxNa.js';
|
|
5
|
+
import { S as Span } from './schema-m0gsnbt3.js';
|
|
6
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
6
7
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
7
8
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
8
|
-
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-
|
|
9
|
+
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-B3qNa4aY.js';
|
|
9
10
|
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
10
11
|
import './errors-mje_cKOs.js';
|
|
11
12
|
import './llm-client-BXVRUZyX.js';
|
|
12
13
|
import './raw-provider-sink-C46HDghv.js';
|
|
13
|
-
import './summary-report-
|
|
14
|
-
import './failure-cluster-Cw65_5FY.js';
|
|
15
|
-
import './emitter-DP_cSSiw.js';
|
|
16
|
-
import './integrity-CTDhR1Sg.js';
|
|
14
|
+
import './summary-report-DLxh4yWk.js';
|
|
15
|
+
import './failure-cluster-CL7IVgkJ.js';
|
|
16
|
+
import './emitter-DEZwY14K.js';
|
|
17
|
+
import './integrity-CfXjSqEv.js';
|
|
17
18
|
|
|
18
19
|
/**
|
|
19
20
|
* Test-time compute scaling curves.
|
package/dist/rl.js
CHANGED
|
@@ -10,8 +10,8 @@ import {
|
|
|
10
10
|
} from "./chunk-3RF76KTD.js";
|
|
11
11
|
import {
|
|
12
12
|
runEvalCampaign
|
|
13
|
-
} from "./chunk-5KSDYBYH.js";
|
|
14
|
-
import "./chunk-BWZEGTES.js";
|
|
13
|
+
} from "./chunk-YXTT6GSZ.js";
|
|
14
|
+
import "./chunk-NCK5QLGT.js";
|
|
15
15
|
import {
|
|
16
16
|
rubricPredictiveValidity
|
|
17
17
|
} from "./chunk-YRZ4M5GS.js";
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { S as Span,
|
|
2
|
-
import { T as
|
|
1
|
+
import { S as Span, J as JudgeSpan } from './schema-m0gsnbt3.js';
|
|
2
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
3
|
+
import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* Process Reward Modeling — per-step rubric grading.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { S as Scenario,
|
|
1
|
+
import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord } from './types-BgrxOJSf.js';
|
|
2
2
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
|
-
import { R as RedTeamCase } from './red-team-
|
|
4
|
-
import { R as RunRecord } from './run-record-
|
|
3
|
+
import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
|
|
4
|
+
import { R as RunRecord } from './run-record-etiCMsUq.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* @experimental
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { V as ValidationError } from './errors-mje_cKOs.js';
|
|
2
|
+
import { F as FailureClass } from './schema-m0gsnbt3.js';
|
|
2
3
|
|
|
3
4
|
type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1';
|
|
4
5
|
type AgentProfileJson = string | number | boolean | null | AgentProfileJson[] | {
|
|
@@ -249,9 +250,16 @@ interface RunRecord {
|
|
|
249
250
|
judgeMetadata?: RunJudgeMetadata;
|
|
250
251
|
/** Per-split scores + raw bag. */
|
|
251
252
|
outcome: RunOutcome;
|
|
252
|
-
/**
|
|
253
|
-
*
|
|
254
|
-
*
|
|
253
|
+
/** Canonical, cross-agent failure class drawn from the shared
|
|
254
|
+
* `FAILURE_CLASSES` taxonomy. This is the aggregation key that makes
|
|
255
|
+
* "which failure dominates across the whole fleet" answerable in ONE
|
|
256
|
+
* vocabulary — every agent classifies against the same enum. Producers
|
|
257
|
+
* set it via the substrate classifier; leave unset only when the failure
|
|
258
|
+
* genuinely can't be classified. */
|
|
259
|
+
failureClass?: FailureClass;
|
|
260
|
+
/** Free-form domain-specific failure detail, scoped UNDER `failureClass`
|
|
261
|
+
* (e.g. failureClass='tool_recovery_failure', failureMode='forge_build_unsatisfied').
|
|
262
|
+
* The within-agent drill-down; `failureClass` is the cross-agent key. */
|
|
255
263
|
failureMode?: string;
|
|
256
264
|
/** Which split this run was drawn from. */
|
|
257
265
|
splitTag: RunSplitTag;
|