@tangle-network/agent-eval 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +72 -0
- package/README.md +4 -5
- package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
- package/dist/builder-eval/index.d.ts +3 -3
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/chunk-UW4NOOZI.js +1561 -0
- package/dist/chunk-UW4NOOZI.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-rJhEDdpy.d.ts} +4 -4
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BRdQ0wrx.d.ts} +3 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-D1NZKqYu.d.ts} +2 -3
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
- package/dist/governance/index.d.ts +2 -2
- package/dist/{index-D3iBCjdF.d.ts → index-Cgt3DKXr.d.ts} +2 -2
- package/dist/index.d.ts +1279 -468
- package/dist/index.js +1992 -1259
- package/dist/index.js.map +1 -1
- package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
- package/dist/knowledge/index.d.ts +3 -3
- package/dist/knowledge/index.js +2 -2
- package/dist/meta-eval/index.d.ts +1 -1
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +8 -8
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/prm/index.d.ts +4 -4
- package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
- package/dist/{release-report-wfUySN5F.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
- package/dist/replay-BX5Fm8en.d.ts +529 -0
- package/dist/reporting.d.ts +5 -5
- package/dist/reporting.js +5 -5
- package/dist/{researcher-bGkI7vCl.d.ts → researcher-ClDX3KZx.d.ts} +13 -14
- package/dist/rl.d.ts +29 -47
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
- package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-jrSGb2xZ.d.ts} +5 -5
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
- package/dist/traces.d.ts +9 -311
- package/dist/traces.js +16 -987
- package/dist/traces.js.map +1 -1
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
- package/dist/wire/index.d.ts +4 -4
- package/dist/wire/index.js +1 -1
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js +0 -569
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- package/dist/chunk-WHZMVFUV.js.map +0 -1
- package/dist/replay-BL96gCEP.d.ts +0 -226
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
2
|
-
import { F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
1
|
+
import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
|
|
2
|
+
import { F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Policy-based agent control runtime.
|
|
@@ -120,6 +120,7 @@ interface ControlRunResult<TState, TAction, TActionResult, TEval extends Control
|
|
|
120
120
|
finalEvals: TEval[];
|
|
121
121
|
wallMs: number;
|
|
122
122
|
spentCostUsd: number;
|
|
123
|
+
/** null when the run executed without a TraceEmitter wired (no run record was persisted). */
|
|
123
124
|
runId: string | null;
|
|
124
125
|
failureClass?: FailureClass;
|
|
125
126
|
runtimeErrors: ControlRuntimeError[];
|
package/dist/control.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CBShYYA6.js';
|
|
2
|
-
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BuJHoLg0.js';
|
|
3
|
-
import './feedback-trajectory-DfFdrraJ.js';
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-rJhEDdpy.js';
|
|
2
|
+
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BRdQ0wrx.js';
|
|
3
|
+
import './feedback-trajectory-j0nJFgC6.js';
|
|
4
4
|
import './dataset-CiK_3LDr.js';
|
|
5
5
|
import './errors-BZ9sTdz7.js';
|
|
6
|
-
import './emitter-DP_cSSiw.js';
|
|
7
|
-
import './store-Db2Bv8Cf.js';
|
|
6
|
+
import './emitter-BqjeOvJh.js';
|
|
7
|
+
import './store-BP5be6s7.js';
|
|
8
8
|
import './run-record-CqzahIbx.js';
|
package/dist/control.js
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
runProposeReview,
|
|
5
5
|
runProposeReviewAsControlLoop,
|
|
6
6
|
scoreFromEvals
|
|
7
|
-
} from "./chunk-ZN274SWR.js";
|
|
7
|
+
} from "./chunk-PALJO75S.js";
|
|
8
8
|
import {
|
|
9
9
|
allCriticalPassed,
|
|
10
10
|
objectiveEval,
|
|
@@ -12,7 +12,7 @@ import {
|
|
|
12
12
|
stopOnNoProgress,
|
|
13
13
|
stopOnRepeatedAction,
|
|
14
14
|
subjectiveEval
|
|
15
|
-
} from "./chunk-LSH4MMOZ.js";
|
|
15
|
+
} from "./chunk-NCRFYPS3.js";
|
|
16
16
|
import "./chunk-NLMNWKVM.js";
|
|
17
17
|
import "./chunk-TVVP3ZZQ.js";
|
|
18
18
|
import "./chunk-NG236HPC.js";
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { T as TraceStore, c as RunOutcome, R as Run, S as Span, d as SpanKind, L as LlmSpan, a as ToolSpan, e as RetrievalSpan, J as JudgeSpan, f as SandboxSpan, E as EventKind, b as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-Db2Bv8Cf.js';
|
|
1
|
+
import { T as TraceStore, c as RunOutcome, R as Run, S as Span, d as SpanKind, L as LlmSpan, a as ToolSpan, e as RetrievalSpan, J as JudgeSpan, f as SandboxSpan, E as EventKind, b as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-BP5be6s7.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* TraceEmitter — hierarchical span builder that auto-parents using an
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
1
|
+
import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Failure taxonomy — canonical classes + a default classifier.
|
|
@@ -54,8 +54,7 @@ interface FailureCluster {
|
|
|
54
54
|
* Source dimension when the trigger was a judge span (e.g. `'format'`,
|
|
55
55
|
* `'safety'`, `'correctness'`). Lets cross-template aggregators
|
|
56
56
|
* group failures by the dimension that fired without overloading
|
|
57
|
-
* `argPrefix`. Optional —
|
|
58
|
-
* deserialize cleanly.
|
|
57
|
+
* `argPrefix`. Optional — clusters without this field deserialize cleanly.
|
|
59
58
|
*/
|
|
60
59
|
dimension?: string;
|
|
61
60
|
runCount: number;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BuJHoLg0.js';
|
|
1
|
+
import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BRdQ0wrx.js';
|
|
2
2
|
import { D as DatasetSplit, a as DatasetScenario } from './dataset-CiK_3LDr.js';
|
|
3
3
|
|
|
4
4
|
type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export { E as EuRiskClass, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, U as UseCaseSignals, n as classifyEuAiRisk, p as euAiActReport, q as nistAiRmfReport, u as renderMarkdown, x as soc2Report, y as summarize } from '../index-D3iBCjdF.js';
|
|
1
|
+
export { E as EuRiskClass, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, U as UseCaseSignals, n as classifyEuAiRisk, p as euAiActReport, q as nistAiRmfReport, u as renderMarkdown, x as soc2Report, y as summarize } from '../index-Cgt3DKXr.js';
|
|
2
2
|
import '../dataset-CiK_3LDr.js';
|
|
3
3
|
import '../errors-BZ9sTdz7.js';
|
|
4
4
|
import '../outcome-store-D6KWmYvj.js';
|
|
5
|
-
import '../store-Db2Bv8Cf.js';
|
|
5
|
+
import '../store-BP5be6s7.js';
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-CiK_3LDr.js';
|
|
2
2
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
3
|
-
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
|
+
import { T as TraceStore } from './store-BP5be6s7.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Judge calibration — measure judge quality against human gold + bias.
|
|
@@ -328,4 +328,4 @@ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceRepo
|
|
|
328
328
|
|
|
329
329
|
declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
330
330
|
|
|
331
|
-
export { verbosityBias as A, type
|
|
331
|
+
export { verbosityBias as A, type ContinuousAgreementOptions as C, DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GoldenItem as G, type PositionalBiasResult as P, type RedTeamCase as R, type SelfPreferenceResult as S, type UseCaseSignals as U, type VerbosityBiasResult as V, type ContinuousAgreement as a, type CalibrationResult as b, type CandidateScore as c, type ContinuousCalibrationResult as d, type GovernanceContext as e, type GovernanceFinding as f, type GovernanceReport as g, type RedTeamCategory as h, type RedTeamFinding as i, type RedTeamPayload as j, type RedTeamReport as k, calibrateJudge as l, calibrateJudgeContinuous as m, classifyEuAiRisk as n, continuousAgreement as o, euAiActReport as p, nistAiRmfReport as q, positionalBias as r, redTeamDataset as s, redTeamReport as t, renderMarkdown as u, scoreRedTeamOutput as v, selfPreference as w, soc2Report as x, summarize as y, toolNamesForRun as z };
|