@tangle-network/agent-eval 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +4 -5
  3. package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
  4. package/dist/builder-eval/index.d.ts +3 -3
  5. package/dist/builder-eval/index.js +1 -1
  6. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/chunk-UW4NOOZI.js +1561 -0
  26. package/dist/chunk-UW4NOOZI.js.map +1 -0
  27. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  28. package/dist/chunk-VSMTAMNK.js.map +1 -0
  29. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  30. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  31. package/dist/cli.js +1 -1
  32. package/dist/{control-CBShYYA6.d.ts → control-rJhEDdpy.d.ts} +4 -4
  33. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BRdQ0wrx.d.ts} +3 -2
  34. package/dist/control.d.ts +5 -5
  35. package/dist/control.js +2 -2
  36. package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
  37. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-D1NZKqYu.d.ts} +2 -3
  38. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
  39. package/dist/governance/index.d.ts +2 -2
  40. package/dist/{index-D3iBCjdF.d.ts → index-Cgt3DKXr.d.ts} +2 -2
  41. package/dist/index.d.ts +1279 -468
  42. package/dist/index.js +1992 -1259
  43. package/dist/index.js.map +1 -1
  44. package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
  45. package/dist/knowledge/index.d.ts +3 -3
  46. package/dist/knowledge/index.js +2 -2
  47. package/dist/meta-eval/index.d.ts +1 -1
  48. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +2 -2
  49. package/dist/openapi.json +1 -1
  50. package/dist/optimization.d.ts +8 -8
  51. package/dist/optimization.js +5 -5
  52. package/dist/pipelines/index.d.ts +6 -6
  53. package/dist/pipelines/index.js +2 -2
  54. package/dist/prm/index.d.ts +4 -4
  55. package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
  56. package/dist/{release-report-wfUySN5F.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
  57. package/dist/replay-BX5Fm8en.d.ts +529 -0
  58. package/dist/reporting.d.ts +5 -5
  59. package/dist/reporting.js +5 -5
  60. package/dist/{researcher-bGkI7vCl.d.ts → researcher-ClDX3KZx.d.ts} +13 -14
  61. package/dist/rl.d.ts +29 -47
  62. package/dist/rl.js +5 -5
  63. package/dist/rl.js.map +1 -1
  64. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
  65. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  66. package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
  67. package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-jrSGb2xZ.d.ts} +5 -5
  68. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
  69. package/dist/traces.d.ts +9 -311
  70. package/dist/traces.js +16 -987
  71. package/dist/traces.js.map +1 -1
  72. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
  73. package/dist/wire/index.d.ts +4 -4
  74. package/dist/wire/index.js +1 -1
  75. package/docs/research-report-methodology.md +4 -4
  76. package/docs/three-package-architecture.md +12 -24
  77. package/package.json +1 -1
  78. package/dist/chunk-2A5XJB43.js.map +0 -1
  79. package/dist/chunk-4F5DQN55.js.map +0 -1
  80. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  81. package/dist/chunk-I4MBDTY5.js +0 -272
  82. package/dist/chunk-I4MBDTY5.js.map +0 -1
  83. package/dist/chunk-JLZQWFV3.js.map +0 -1
  84. package/dist/chunk-K2TPS5LB.js +0 -569
  85. package/dist/chunk-K2TPS5LB.js.map +0 -1
  86. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  87. package/dist/chunk-NU65VQ7M.js.map +0 -1
  88. package/dist/chunk-OWLAAMME.js.map +0 -1
  89. package/dist/chunk-SESZDQPX.js.map +0 -1
  90. package/dist/chunk-WHZMVFUV.js.map +0 -1
  91. package/dist/replay-BL96gCEP.d.ts +0 -226
  92. package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  93. package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  94. package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
@@ -1,5 +1,5 @@
1
- import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
2
- import { F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
2
+ import { F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
3
3
 
4
4
  /**
5
5
  * Policy-based agent control runtime.
@@ -120,6 +120,7 @@ interface ControlRunResult<TState, TAction, TActionResult, TEval extends Control
120
120
  finalEvals: TEval[];
121
121
  wallMs: number;
122
122
  spentCostUsd: number;
123
+ /** null when the run executed without a TraceEmitter wired (no run record was persisted). */
123
124
  runId: string | null;
124
125
  failureClass?: FailureClass;
125
126
  runtimeErrors: ControlRuntimeError[];
package/dist/control.d.ts CHANGED
@@ -1,8 +1,8 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CBShYYA6.js';
2
- export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BuJHoLg0.js';
3
- import './feedback-trajectory-DfFdrraJ.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-rJhEDdpy.js';
2
+ export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BRdQ0wrx.js';
3
+ import './feedback-trajectory-j0nJFgC6.js';
4
4
  import './dataset-CiK_3LDr.js';
5
5
  import './errors-BZ9sTdz7.js';
6
- import './emitter-DP_cSSiw.js';
7
- import './store-Db2Bv8Cf.js';
6
+ import './emitter-BqjeOvJh.js';
7
+ import './store-BP5be6s7.js';
8
8
  import './run-record-CqzahIbx.js';
package/dist/control.js CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  runProposeReview,
5
5
  runProposeReviewAsControlLoop,
6
6
  scoreFromEvals
7
- } from "./chunk-ZN274SWR.js";
7
+ } from "./chunk-PALJO75S.js";
8
8
  import {
9
9
  allCriticalPassed,
10
10
  objectiveEval,
@@ -12,7 +12,7 @@ import {
12
12
  stopOnNoProgress,
13
13
  stopOnRepeatedAction,
14
14
  subjectiveEval
15
- } from "./chunk-LSH4MMOZ.js";
15
+ } from "./chunk-NCRFYPS3.js";
16
16
  import "./chunk-NLMNWKVM.js";
17
17
  import "./chunk-TVVP3ZZQ.js";
18
18
  import "./chunk-NG236HPC.js";
@@ -1,4 +1,4 @@
1
- import { T as TraceStore, c as RunOutcome, R as Run, S as Span, d as SpanKind, L as LlmSpan, a as ToolSpan, e as RetrievalSpan, J as JudgeSpan, f as SandboxSpan, E as EventKind, b as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-Db2Bv8Cf.js';
1
+ import { T as TraceStore, c as RunOutcome, R as Run, S as Span, d as SpanKind, L as LlmSpan, a as ToolSpan, e as RetrievalSpan, J as JudgeSpan, f as SandboxSpan, E as EventKind, b as TraceEvent, B as BudgetLedgerEntry, A as Artifact, M as Message } from './store-BP5be6s7.js';
2
2
 
3
3
  /**
4
4
  * TraceEmitter — hierarchical span builder that auto-parents using an
@@ -1,4 +1,4 @@
1
- import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-BP5be6s7.js';
2
2
 
3
3
  /**
4
4
  * Failure taxonomy — canonical classes + a default classifier.
@@ -54,8 +54,7 @@ interface FailureCluster {
54
54
  * Source dimension when the trigger was a judge span (e.g. `'format'`,
55
55
  * `'safety'`, `'correctness'`). Lets cross-template aggregators
56
56
  * group failures by the dimension that fired without overloading
57
- * `argPrefix`. Optional — legacy clusters without this field
58
- * deserialize cleanly.
57
+ * `argPrefix`. Optional — clusters without this field deserialize cleanly.
59
58
  */
60
59
  dimension?: string;
61
60
  runCount: number;
@@ -1,4 +1,4 @@
1
- import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BuJHoLg0.js';
1
+ import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BRdQ0wrx.js';
2
2
  import { D as DatasetSplit, a as DatasetScenario } from './dataset-CiK_3LDr.js';
3
3
 
4
4
  type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
@@ -1,5 +1,5 @@
1
- export { E as EuRiskClass, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, U as UseCaseSignals, n as classifyEuAiRisk, p as euAiActReport, q as nistAiRmfReport, u as renderMarkdown, x as soc2Report, y as summarize } from '../index-D3iBCjdF.js';
1
+ export { E as EuRiskClass, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, U as UseCaseSignals, n as classifyEuAiRisk, p as euAiActReport, q as nistAiRmfReport, u as renderMarkdown, x as soc2Report, y as summarize } from '../index-Cgt3DKXr.js';
2
2
  import '../dataset-CiK_3LDr.js';
3
3
  import '../errors-BZ9sTdz7.js';
4
4
  import '../outcome-store-D6KWmYvj.js';
5
- import '../store-Db2Bv8Cf.js';
5
+ import '../store-BP5be6s7.js';
@@ -1,6 +1,6 @@
1
1
  import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-CiK_3LDr.js';
2
2
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
3
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
3
+ import { T as TraceStore } from './store-BP5be6s7.js';
4
4
 
5
5
  /**
6
6
  * Judge calibration — measure judge quality against human gold + bias.
@@ -328,4 +328,4 @@ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceRepo
328
328
 
329
329
  declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
330
330
 
331
- export { verbosityBias as A, type CalibrationResult as C, DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GoldenItem as G, type PositionalBiasResult as P, type RedTeamCase as R, type SelfPreferenceResult as S, type UseCaseSignals as U, type VerbosityBiasResult as V, type CandidateScore as a, type ContinuousAgreement as b, type ContinuousAgreementOptions as c, type ContinuousCalibrationResult as d, type GovernanceContext as e, type GovernanceFinding as f, type GovernanceReport as g, type RedTeamCategory as h, type RedTeamFinding as i, type RedTeamPayload as j, type RedTeamReport as k, calibrateJudge as l, calibrateJudgeContinuous as m, classifyEuAiRisk as n, continuousAgreement as o, euAiActReport as p, nistAiRmfReport as q, positionalBias as r, redTeamDataset as s, redTeamReport as t, renderMarkdown as u, scoreRedTeamOutput as v, selfPreference as w, soc2Report as x, summarize as y, toolNamesForRun as z };
331
+ export { verbosityBias as A, type ContinuousAgreementOptions as C, DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GoldenItem as G, type PositionalBiasResult as P, type RedTeamCase as R, type SelfPreferenceResult as S, type UseCaseSignals as U, type VerbosityBiasResult as V, type ContinuousAgreement as a, type CalibrationResult as b, type CandidateScore as c, type ContinuousCalibrationResult as d, type GovernanceContext as e, type GovernanceFinding as f, type GovernanceReport as g, type RedTeamCategory as h, type RedTeamFinding as i, type RedTeamPayload as j, type RedTeamReport as k, calibrateJudge as l, calibrateJudgeContinuous as m, classifyEuAiRisk as n, continuousAgreement as o, euAiActReport as p, nistAiRmfReport as q, positionalBias as r, redTeamDataset as s, redTeamReport as t, renderMarkdown as u, scoreRedTeamOutput as v, selfPreference as w, soc2Report as x, summarize as y, toolNamesForRun as z };