@tangle-network/agent-eval 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +4 -5
  3. package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
  4. package/dist/builder-eval/index.d.ts +3 -3
  5. package/dist/builder-eval/index.js +1 -1
  6. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/chunk-UW4NOOZI.js +1561 -0
  26. package/dist/chunk-UW4NOOZI.js.map +1 -0
  27. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  28. package/dist/chunk-VSMTAMNK.js.map +1 -0
  29. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  30. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  31. package/dist/cli.js +1 -1
  32. package/dist/{control-CBShYYA6.d.ts → control-rJhEDdpy.d.ts} +4 -4
  33. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BRdQ0wrx.d.ts} +3 -2
  34. package/dist/control.d.ts +5 -5
  35. package/dist/control.js +2 -2
  36. package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
  37. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-D1NZKqYu.d.ts} +2 -3
  38. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
  39. package/dist/governance/index.d.ts +2 -2
  40. package/dist/{index-D3iBCjdF.d.ts → index-Cgt3DKXr.d.ts} +2 -2
  41. package/dist/index.d.ts +1279 -468
  42. package/dist/index.js +1992 -1259
  43. package/dist/index.js.map +1 -1
  44. package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
  45. package/dist/knowledge/index.d.ts +3 -3
  46. package/dist/knowledge/index.js +2 -2
  47. package/dist/meta-eval/index.d.ts +1 -1
  48. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +2 -2
  49. package/dist/openapi.json +1 -1
  50. package/dist/optimization.d.ts +8 -8
  51. package/dist/optimization.js +5 -5
  52. package/dist/pipelines/index.d.ts +6 -6
  53. package/dist/pipelines/index.js +2 -2
  54. package/dist/prm/index.d.ts +4 -4
  55. package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
  56. package/dist/{release-report-wfUySN5F.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
  57. package/dist/replay-BX5Fm8en.d.ts +529 -0
  58. package/dist/reporting.d.ts +5 -5
  59. package/dist/reporting.js +5 -5
  60. package/dist/{researcher-bGkI7vCl.d.ts → researcher-ClDX3KZx.d.ts} +13 -14
  61. package/dist/rl.d.ts +29 -47
  62. package/dist/rl.js +5 -5
  63. package/dist/rl.js.map +1 -1
  64. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
  65. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  66. package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
  67. package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-jrSGb2xZ.d.ts} +5 -5
  68. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
  69. package/dist/traces.d.ts +9 -311
  70. package/dist/traces.js +16 -987
  71. package/dist/traces.js.map +1 -1
  72. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
  73. package/dist/wire/index.d.ts +4 -4
  74. package/dist/wire/index.js +1 -1
  75. package/docs/research-report-methodology.md +4 -4
  76. package/docs/three-package-architecture.md +12 -24
  77. package/package.json +1 -1
  78. package/dist/chunk-2A5XJB43.js.map +0 -1
  79. package/dist/chunk-4F5DQN55.js.map +0 -1
  80. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  81. package/dist/chunk-I4MBDTY5.js +0 -272
  82. package/dist/chunk-I4MBDTY5.js.map +0 -1
  83. package/dist/chunk-JLZQWFV3.js.map +0 -1
  84. package/dist/chunk-K2TPS5LB.js +0 -569
  85. package/dist/chunk-K2TPS5LB.js.map +0 -1
  86. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  87. package/dist/chunk-NU65VQ7M.js.map +0 -1
  88. package/dist/chunk-OWLAAMME.js.map +0 -1
  89. package/dist/chunk-SESZDQPX.js.map +0 -1
  90. package/dist/chunk-WHZMVFUV.js.map +0 -1
  91. package/dist/replay-BL96gCEP.d.ts +0 -226
  92. package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  93. package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  94. package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/rl.d.ts CHANGED
@@ -1,16 +1,16 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
2
- import { V as VerificationReport } from './multi-layer-verifier-LkP3LVKj.js';
3
- import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-DZVXOCK_.js';
2
+ import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
3
+ import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-jrSGb2xZ.js';
4
4
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
5
5
  import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
6
- import { I as InterimReleaseConfidence } from './sequential-Dgz1n51-.js';
7
- import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
8
- import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-bGkI7vCl.js';
9
- export { r as runEvalCampaign } from './researcher-bGkI7vCl.js';
6
+ import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
7
+ import { S as Span, T as TraceStore } from './store-BP5be6s7.js';
8
+ import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-ClDX3KZx.js';
9
+ export { r as runEvalCampaign } from './researcher-ClDX3KZx.js';
10
10
  import './errors-BZ9sTdz7.js';
11
- import './failure-cluster-C2EGSDiT.js';
12
- import './integrity-DK2EBVZC.js';
13
- import './emitter-DP_cSSiw.js';
11
+ import './failure-cluster-D1NZKqYu.js';
12
+ import './integrity-BAxLGJ9I.js';
13
+ import './emitter-BqjeOvJh.js';
14
14
 
15
15
  /**
16
16
  * Test-time compute scaling curves.
@@ -529,17 +529,12 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
529
529
  }>;
530
530
 
531
531
  /**
532
- * Adapters: convert legacy optimization outputs into the canonical
533
- * `RunRecord[]` artifact that 0.22+ primitives consume.
532
+ * Adapters: convert `TrialResult[]` (from `runMultiShotOptimization`,
533
+ * `runPromptEvolution`) into the canonical `RunRecord[]` artifact that
534
+ * `replayCache`, `pairedEvalueSequence`, and `rubricPredictiveValidity`
535
+ * consume.
534
536
  *
535
- * The 0.22 release standardized the campaign artifact: every cell of an
536
- * eval matrix produces one `RunRecord`. The pre-0.22 optimization
537
- * primitives (`runMultiShotOptimization`, `runPromptEvolution`) produce
538
- * `TrialResult[]` with a different shape. This file bridges the two so
539
- * the new primitives (`replayCache`, `pairedEvalueSequence`,
540
- * `rubricPredictiveValidity`) compose cleanly with the existing RL stack.
541
- *
542
- * The adapters are thin and explicit — every mandatory `RunRecord` field
537
+ * Adapters are thin and explicit — every mandatory `RunRecord` field
543
538
  * comes from a caller-supplied context (`commitSha`, `model`,
544
539
  * `promptHash`, `configHash`) plus the trial's runtime data. Defaults
545
540
  * exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
@@ -1505,18 +1500,16 @@ interface DetectRewardHackingInput {
1505
1500
  declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
1506
1501
 
1507
1502
  /**
1508
- * `analyzeOptimizationResult` — unifies the pre-0.22 auto-research stack
1503
+ * `analyzeOptimizationResult` — unifies the auto-research stack
1509
1504
  * (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
1510
- * Ax/AxRLM trace analyst) with the 0.23 RL bridge in a single call.
1505
+ * Ax/AxRLM trace analyst) with the RL bridge in a single call.
1511
1506
  *
1512
- * What this fixes: until 0.23 the optimization stack and the RL bridge
1513
- * lived in parallel namespaces. The optimization primitives produced
1514
- * `TrialResult[]`; the RL bridge consumed `RunRecord[]`. Trace-analyst
1515
- * was decoupled from both. `analyzeOptimizationResult` does the wiring
1516
- * once so consumers don't have to:
1507
+ * The optimization primitives produce `TrialResult[]`; the RL bridge
1508
+ * consumes `RunRecord[]`. Trace-analyst is independent of both. This
1509
+ * function does the wiring once so consumers don't have to:
1517
1510
  *
1518
- * Optimization (existing primitives) RL bridge (0.23)
1519
- * ────────────────────────────────── ────────────────
1511
+ * Optimization (existing primitives) RL bridge
1512
+ * ────────────────────────────────── ────────
1520
1513
  * runPromptEvolution → TrialResult[] →
1521
1514
  * runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
1522
1515
  * reflective-mutation → mutations.jsonl → ↓
@@ -1527,10 +1520,10 @@ declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHac
1527
1520
  * ↓ │
1528
1521
  * TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
1529
1522
  *
1530
- * The output of this function is the canonical RL artifact set:
1531
- * `RunRecord[]` (so every other 0.22+ primitive composes), preference
1532
- * triples, verifiable reward signals, reward-hacking diagnosis,
1533
- * sequential interim verdict, and (when wired) trace-analyst summary.
1523
+ * The output is the canonical RL artifact set: `RunRecord[]` (so every
1524
+ * other RL primitive composes), preference triples, verifiable reward
1525
+ * signals, reward-hacking diagnosis, sequential interim verdict, and
1526
+ * (when wired) trace-analyst summary.
1534
1527
  *
1535
1528
  * What this primitive does NOT do: it does not modify the optimization
1536
1529
  * primitives' internals. They keep producing `TrialResult` and emitting
@@ -1609,11 +1602,7 @@ declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOption
1609
1602
  * `PredictiveValidityResearcher` — concrete `Researcher` implementation
1610
1603
  * that drives selection from outcome-anchored predictive validity.
1611
1604
  *
1612
- * `Researcher` was a placeholder interface plus `NoopResearcher` until
1613
- * 0.23. The 0.23 panel critique called this out: shipping the interface
1614
- * without a default implementation that drives the loop is incomplete.
1615
- *
1616
- * This researcher answers each method:
1605
+ * Each method:
1617
1606
  *
1618
1607
  * - `inspectFailures(runs)` — synthesizes failure modes from the
1619
1608
  * bottom-quartile of `RunRecord`s on the configured proxy reward.
@@ -1676,14 +1665,10 @@ declare class PredictiveValidityResearcher implements Researcher {
1676
1665
  }
1677
1666
 
1678
1667
  /**
1679
- * `runRLCampaign` — the missing top-level orchestrator.
1668
+ * `runRLCampaign` — top-level orchestrator that runs the matrix and
1669
+ * produces every RL-ready artifact in one call.
1680
1670
  *
1681
- * `runEvalCampaign` runs the matrix and produces `RunRecord[]`. The 0.23
1682
- * RL primitives consume that artifact in different ways. Until 0.24 they
1683
- * had to be wired together by hand at every consumer; that defeats the
1684
- * cohesion the package is supposed to provide.
1685
- *
1686
- * `runRLCampaign` wires:
1671
+ * Wires:
1687
1672
  * 1. `runEvalCampaign` for the matrix run (capture, integrity, hooks)
1688
1673
  * 2. `extractVerifiableReward` over each run, separating deterministic
1689
1674
  * from probabilistic reward sources for the trainer
@@ -1697,9 +1682,6 @@ declare class PredictiveValidityResearcher implements Researcher {
1697
1682
  * stage's output is in there. The consumer's downstream fits in a single
1698
1683
  * line: pass `result.preferences` to their DPO trainer, `result.grpoRows`
1699
1684
  * to GRPO, `result.runs` plus `result.rewardSignals` to a custom RL loop.
1700
- *
1701
- * This is what the 0.23 panel critique called the "missing top-level
1702
- * primitive." Now shipped.
1703
1685
  */
1704
1686
 
1705
1687
  interface RunRLCampaignOptions<V> extends EvalCampaignOptions<V> {
package/dist/rl.js CHANGED
@@ -1,23 +1,23 @@
1
1
  import {
2
2
  runEvalCampaign
3
- } from "./chunk-SESZDQPX.js";
3
+ } from "./chunk-RUI6SIHY.js";
4
4
  import "./chunk-4S4BM3QQ.js";
5
5
  import {
6
6
  rubricPredictiveValidity
7
7
  } from "./chunk-YRZ4M5GS.js";
8
8
  import {
9
9
  evaluateInterimReleaseConfidence
10
- } from "./chunk-NU65VQ7M.js";
10
+ } from "./chunk-MAZ26DC7.js";
11
11
  import {
12
12
  benjaminiHochberg
13
- } from "./chunk-2A5XJB43.js";
13
+ } from "./chunk-5AKPEK5L.js";
14
14
  import {
15
15
  wilcoxonSignedRank
16
- } from "./chunk-I4MBDTY5.js";
16
+ } from "./chunk-R5UQJNKC.js";
17
17
  import "./chunk-KTGTIOFD.js";
18
18
  import "./chunk-PC4UYEBM.js";
19
19
  import "./chunk-TVVP3ZZQ.js";
20
- import "./chunk-4F5DQN55.js";
20
+ import "./chunk-VSMTAMNK.js";
21
21
  import {
22
22
  ValidationError
23
23
  } from "./chunk-NG236HPC.js";