@tangle-network/agent-eval 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +72 -0
- package/README.md +4 -5
- package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
- package/dist/builder-eval/index.d.ts +3 -3
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/chunk-UW4NOOZI.js +1561 -0
- package/dist/chunk-UW4NOOZI.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-rJhEDdpy.d.ts} +4 -4
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BRdQ0wrx.d.ts} +3 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-D1NZKqYu.d.ts} +2 -3
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
- package/dist/governance/index.d.ts +2 -2
- package/dist/{index-D3iBCjdF.d.ts → index-Cgt3DKXr.d.ts} +2 -2
- package/dist/index.d.ts +1279 -468
- package/dist/index.js +1992 -1259
- package/dist/index.js.map +1 -1
- package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
- package/dist/knowledge/index.d.ts +3 -3
- package/dist/knowledge/index.js +2 -2
- package/dist/meta-eval/index.d.ts +1 -1
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +8 -8
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/prm/index.d.ts +4 -4
- package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
- package/dist/{release-report-wfUySN5F.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
- package/dist/replay-BX5Fm8en.d.ts +529 -0
- package/dist/reporting.d.ts +5 -5
- package/dist/reporting.js +5 -5
- package/dist/{researcher-bGkI7vCl.d.ts → researcher-ClDX3KZx.d.ts} +13 -14
- package/dist/rl.d.ts +29 -47
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
- package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-jrSGb2xZ.d.ts} +5 -5
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
- package/dist/traces.d.ts +9 -311
- package/dist/traces.js +16 -987
- package/dist/traces.js.map +1 -1
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
- package/dist/wire/index.d.ts +4 -4
- package/dist/wire/index.js +1 -1
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js +0 -569
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- package/dist/chunk-WHZMVFUV.js.map +0 -1
- package/dist/replay-BL96gCEP.d.ts +0 -226
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/rl.d.ts
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
|
|
2
|
-
import { V as VerificationReport } from './multi-layer-verifier-LkP3LVKj.js';
|
|
3
|
-
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-DZVXOCK_.js';
|
|
2
|
+
import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
|
|
3
|
+
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-jrSGb2xZ.js';
|
|
4
4
|
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
5
5
|
import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
|
|
6
|
-
import { I as InterimReleaseConfidence } from './sequential-Dgz1n51-.js';
|
|
7
|
-
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
8
|
-
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-bGkI7vCl.js';
|
|
9
|
-
export { r as runEvalCampaign } from './researcher-bGkI7vCl.js';
|
|
6
|
+
import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
|
|
7
|
+
import { S as Span, T as TraceStore } from './store-BP5be6s7.js';
|
|
8
|
+
import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-ClDX3KZx.js';
|
|
9
|
+
export { r as runEvalCampaign } from './researcher-ClDX3KZx.js';
|
|
10
10
|
import './errors-BZ9sTdz7.js';
|
|
11
|
-
import './failure-cluster-C2EGSDiT.js';
|
|
12
|
-
import './integrity-DK2EBVZC.js';
|
|
13
|
-
import './emitter-DP_cSSiw.js';
|
|
11
|
+
import './failure-cluster-D1NZKqYu.js';
|
|
12
|
+
import './integrity-BAxLGJ9I.js';
|
|
13
|
+
import './emitter-BqjeOvJh.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* Test-time compute scaling curves.
|
|
@@ -529,17 +529,12 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
|
|
|
529
529
|
}>;
|
|
530
530
|
|
|
531
531
|
/**
|
|
532
|
-
* Adapters: convert
|
|
533
|
-
* `RunRecord[]` artifact that
|
|
532
|
+
* Adapters: convert `TrialResult[]` (from `runMultiShotOptimization`,
|
|
533
|
+
* `runPromptEvolution`) into the canonical `RunRecord[]` artifact that
|
|
534
|
+
* `replayCache`, `pairedEvalueSequence`, and `rubricPredictiveValidity`
|
|
535
|
+
* consume.
|
|
534
536
|
*
|
|
535
|
-
*
|
|
536
|
-
* eval matrix produces one `RunRecord`. The pre-0.22 optimization
|
|
537
|
-
* primitives (`runMultiShotOptimization`, `runPromptEvolution`) produce
|
|
538
|
-
* `TrialResult[]` with a different shape. This file bridges the two so
|
|
539
|
-
* the new primitives (`replayCache`, `pairedEvalueSequence`,
|
|
540
|
-
* `rubricPredictiveValidity`) compose cleanly with the existing RL stack.
|
|
541
|
-
*
|
|
542
|
-
* The adapters are thin and explicit — every mandatory `RunRecord` field
|
|
537
|
+
* Adapters are thin and explicit — every mandatory `RunRecord` field
|
|
543
538
|
* comes from a caller-supplied context (`commitSha`, `model`,
|
|
544
539
|
* `promptHash`, `configHash`) plus the trial's runtime data. Defaults
|
|
545
540
|
* exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
|
|
@@ -1505,18 +1500,16 @@ interface DetectRewardHackingInput {
|
|
|
1505
1500
|
declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
|
|
1506
1501
|
|
|
1507
1502
|
/**
|
|
1508
|
-
* `analyzeOptimizationResult` — unifies the
|
|
1503
|
+
* `analyzeOptimizationResult` — unifies the auto-research stack
|
|
1509
1504
|
* (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
|
|
1510
|
-
* Ax/AxRLM trace analyst) with the
|
|
1505
|
+
* Ax/AxRLM trace analyst) with the RL bridge in a single call.
|
|
1511
1506
|
*
|
|
1512
|
-
*
|
|
1513
|
-
*
|
|
1514
|
-
*
|
|
1515
|
-
* was decoupled from both. `analyzeOptimizationResult` does the wiring
|
|
1516
|
-
* once so consumers don't have to:
|
|
1507
|
+
* The optimization primitives produce `TrialResult[]`; the RL bridge
|
|
1508
|
+
* consumes `RunRecord[]`. Trace-analyst is independent of both. This
|
|
1509
|
+
* function does the wiring once so consumers don't have to:
|
|
1517
1510
|
*
|
|
1518
|
-
* Optimization (existing primitives) RL bridge
|
|
1519
|
-
* ──────────────────────────────────
|
|
1511
|
+
* Optimization (existing primitives) RL bridge
|
|
1512
|
+
* ────────────────────────────────── ────────
|
|
1520
1513
|
* runPromptEvolution → TrialResult[] →
|
|
1521
1514
|
* runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
|
|
1522
1515
|
* reflective-mutation → mutations.jsonl → ↓
|
|
@@ -1527,10 +1520,10 @@ declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHac
|
|
|
1527
1520
|
* ↓ │
|
|
1528
1521
|
* TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
|
|
1529
1522
|
*
|
|
1530
|
-
* The output
|
|
1531
|
-
*
|
|
1532
|
-
*
|
|
1533
|
-
*
|
|
1523
|
+
* The output is the canonical RL artifact set: `RunRecord[]` (so every
|
|
1524
|
+
* other RL primitive composes), preference triples, verifiable reward
|
|
1525
|
+
* signals, reward-hacking diagnosis, sequential interim verdict, and
|
|
1526
|
+
* (when wired) trace-analyst summary.
|
|
1534
1527
|
*
|
|
1535
1528
|
* What this primitive does NOT do: it does not modify the optimization
|
|
1536
1529
|
* primitives' internals. They keep producing `TrialResult` and emitting
|
|
@@ -1609,11 +1602,7 @@ declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOption
|
|
|
1609
1602
|
* `PredictiveValidityResearcher` — concrete `Researcher` implementation
|
|
1610
1603
|
* that drives selection from outcome-anchored predictive validity.
|
|
1611
1604
|
*
|
|
1612
|
-
*
|
|
1613
|
-
* 0.23. The 0.23 panel critique called this out: shipping the interface
|
|
1614
|
-
* without a default implementation that drives the loop is incomplete.
|
|
1615
|
-
*
|
|
1616
|
-
* This researcher answers each method:
|
|
1605
|
+
* Each method:
|
|
1617
1606
|
*
|
|
1618
1607
|
* - `inspectFailures(runs)` — synthesizes failure modes from the
|
|
1619
1608
|
* bottom-quartile of `RunRecord`s on the configured proxy reward.
|
|
@@ -1676,14 +1665,10 @@ declare class PredictiveValidityResearcher implements Researcher {
|
|
|
1676
1665
|
}
|
|
1677
1666
|
|
|
1678
1667
|
/**
|
|
1679
|
-
* `runRLCampaign` —
|
|
1668
|
+
* `runRLCampaign` — top-level orchestrator that runs the matrix and
|
|
1669
|
+
* produces every RL-ready artifact in one call.
|
|
1680
1670
|
*
|
|
1681
|
-
*
|
|
1682
|
-
* RL primitives consume that artifact in different ways. Until 0.24 they
|
|
1683
|
-
* had to be wired together by hand at every consumer; that defeats the
|
|
1684
|
-
* cohesion the package is supposed to provide.
|
|
1685
|
-
*
|
|
1686
|
-
* `runRLCampaign` wires:
|
|
1671
|
+
* Wires:
|
|
1687
1672
|
* 1. `runEvalCampaign` for the matrix run (capture, integrity, hooks)
|
|
1688
1673
|
* 2. `extractVerifiableReward` over each run, separating deterministic
|
|
1689
1674
|
* from probabilistic reward sources for the trainer
|
|
@@ -1697,9 +1682,6 @@ declare class PredictiveValidityResearcher implements Researcher {
|
|
|
1697
1682
|
* stage's output is in there. The consumer's downstream fits in a single
|
|
1698
1683
|
* line: pass `result.preferences` to their DPO trainer, `result.grpoRows`
|
|
1699
1684
|
* to GRPO, `result.runs` plus `result.rewardSignals` to a custom RL loop.
|
|
1700
|
-
*
|
|
1701
|
-
* This is what the 0.23 panel critique called the "missing top-level
|
|
1702
|
-
* primitive." Now shipped.
|
|
1703
1685
|
*/
|
|
1704
1686
|
|
|
1705
1687
|
interface RunRLCampaignOptions<V> extends EvalCampaignOptions<V> {
|
package/dist/rl.js
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runEvalCampaign
|
|
3
|
-
} from "./chunk-SESZDQPX.js";
|
|
3
|
+
} from "./chunk-RUI6SIHY.js";
|
|
4
4
|
import "./chunk-4S4BM3QQ.js";
|
|
5
5
|
import {
|
|
6
6
|
rubricPredictiveValidity
|
|
7
7
|
} from "./chunk-YRZ4M5GS.js";
|
|
8
8
|
import {
|
|
9
9
|
evaluateInterimReleaseConfidence
|
|
10
|
-
} from "./chunk-NU65VQ7M.js";
|
|
10
|
+
} from "./chunk-MAZ26DC7.js";
|
|
11
11
|
import {
|
|
12
12
|
benjaminiHochberg
|
|
13
|
-
} from "./chunk-2A5XJB43.js";
|
|
13
|
+
} from "./chunk-5AKPEK5L.js";
|
|
14
14
|
import {
|
|
15
15
|
wilcoxonSignedRank
|
|
16
|
-
} from "./chunk-
|
|
16
|
+
} from "./chunk-R5UQJNKC.js";
|
|
17
17
|
import "./chunk-KTGTIOFD.js";
|
|
18
18
|
import "./chunk-PC4UYEBM.js";
|
|
19
19
|
import "./chunk-TVVP3ZZQ.js";
|
|
20
|
-
import "./chunk-4F5DQN55.js";
|
|
20
|
+
import "./chunk-VSMTAMNK.js";
|
|
21
21
|
import {
|
|
22
22
|
ValidationError
|
|
23
23
|
} from "./chunk-NG236HPC.js";
|