npm - @tangle-network/agent-eval - Versions diffs - 0.27.0 → 0.28.0 - Mend

@tangle-network/agent-eval 0.27.0 → 0.28.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (94)

package/CHANGELOG.md +72 -0
package/README.md +4 -5
package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
package/dist/builder-eval/index.d.ts +3 -3
package/dist/builder-eval/index.js +1 -1
package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
package/dist/chunk-5AKPEK5L.js.map +1 -0
package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
package/dist/chunk-K33INZHH.js.map +1 -0
package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
package/dist/chunk-MAZ26DC7.js.map +1 -0
package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
package/dist/chunk-NCRFYPS3.js.map +1 -0
package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
package/dist/chunk-QHF6EQKK.js.map +1 -0
package/dist/chunk-R5UQJNKC.js +722 -0
package/dist/chunk-R5UQJNKC.js.map +1 -0
package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
package/dist/chunk-RUI6SIHY.js.map +1 -0
package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
package/dist/chunk-SZSBQUIJ.js.map +1 -0
package/dist/chunk-UW4NOOZI.js +1561 -0
package/dist/chunk-UW4NOOZI.js.map +1 -0
package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
package/dist/chunk-VSMTAMNK.js.map +1 -0
package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
package/dist/chunk-XFZCM5Z3.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/{control-CBShYYA6.d.ts → control-rJhEDdpy.d.ts} +4 -4
package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BRdQ0wrx.d.ts} +3 -2
package/dist/control.d.ts +5 -5
package/dist/control.js +2 -2
package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-D1NZKqYu.d.ts} +2 -3
package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
package/dist/governance/index.d.ts +2 -2
package/dist/{index-D3iBCjdF.d.ts → index-Cgt3DKXr.d.ts} +2 -2
package/dist/index.d.ts +1279 -468
package/dist/index.js +1992 -1259
package/dist/index.js.map +1 -1
package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
package/dist/knowledge/index.d.ts +3 -3
package/dist/knowledge/index.js +2 -2
package/dist/meta-eval/index.d.ts +1 -1
package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +2 -2
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +8 -8
package/dist/optimization.js +5 -5
package/dist/pipelines/index.d.ts +6 -6
package/dist/pipelines/index.js +2 -2
package/dist/prm/index.d.ts +4 -4
package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
package/dist/{release-report-wfUySN5F.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
package/dist/replay-BX5Fm8en.d.ts +529 -0
package/dist/reporting.d.ts +5 -5
package/dist/reporting.js +5 -5
package/dist/{researcher-bGkI7vCl.d.ts → researcher-ClDX3KZx.d.ts} +13 -14
package/dist/rl.d.ts +29 -47
package/dist/rl.js +5 -5
package/dist/rl.js.map +1 -1
package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-jrSGb2xZ.d.ts} +5 -5
package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
package/dist/traces.d.ts +9 -311
package/dist/traces.js +16 -987
package/dist/traces.js.map +1 -1
package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
package/dist/wire/index.d.ts +4 -4
package/dist/wire/index.js +1 -1
package/docs/research-report-methodology.md +4 -4
package/docs/three-package-architecture.md +12 -24
package/package.json +1 -1
package/dist/chunk-2A5XJB43.js.map +0 -1
package/dist/chunk-4F5DQN55.js.map +0 -1
package/dist/chunk-5LBB5B3Z.js.map +0 -1
package/dist/chunk-I4MBDTY5.js +0 -272
package/dist/chunk-I4MBDTY5.js.map +0 -1
package/dist/chunk-JLZQWFV3.js.map +0 -1
package/dist/chunk-K2TPS5LB.js +0 -569
package/dist/chunk-K2TPS5LB.js.map +0 -1
package/dist/chunk-LSH4MMOZ.js.map +0 -1
package/dist/chunk-NU65VQ7M.js.map +0 -1
package/dist/chunk-OWLAAMME.js.map +0 -1
package/dist/chunk-SESZDQPX.js.map +0 -1
package/dist/chunk-WHZMVFUV.js.map +0 -1
package/dist/replay-BL96gCEP.d.ts +0 -226
/package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
/package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
/package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0

package/dist/rl.d.ts CHANGED

@@ -1,16 +1,16 @@
 import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
-import { V as VerificationReport } from './multi-layer-verifier-LkP3LVKj.js';
-import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-DZVXOCK_.js';
+import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
+import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-jrSGb2xZ.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-C0uDYwG6.js';
-import { I as InterimReleaseConfidence } from './sequential-Dgz1n51-.js';
-import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
-import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-bGkI7vCl.js';
-export { r as runEvalCampaign } from './researcher-bGkI7vCl.js';
+import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
+import { S as Span, T as TraceStore } from './store-BP5be6s7.js';
+import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-ClDX3KZx.js';
+export { r as runEvalCampaign } from './researcher-ClDX3KZx.js';
 import './errors-BZ9sTdz7.js';
-import './failure-cluster-C2EGSDiT.js';
-import './integrity-DK2EBVZC.js';
-import './emitter-DP_cSSiw.js';
+import './failure-cluster-D1NZKqYu.js';
+import './integrity-BAxLGJ9I.js';
+import './emitter-BqjeOvJh.js';
 /**
  * Test-time compute scaling curves.
@@ -529,17 +529,12 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
 }>;
 /**
- * Adapters: convert legacy optimization outputs into the canonical
- * `RunRecord[]` artifact that 0.22+ primitives consume.
+ * Adapters: convert `TrialResult[]` (from `runMultiShotOptimization`,
+ * `runPromptEvolution`) into the canonical `RunRecord[]` artifact that
+ * `replayCache`, `pairedEvalueSequence`, and `rubricPredictiveValidity`
+ * consume.
  *
- * The 0.22 release standardized the campaign artifact: every cell of an
- * eval matrix produces one `RunRecord`. The pre-0.22 optimization
- * primitives (`runMultiShotOptimization`, `runPromptEvolution`) produce
- * `TrialResult[]` with a different shape. This file bridges the two so
- * the new primitives (`replayCache`, `pairedEvalueSequence`,
- * `rubricPredictiveValidity`) compose cleanly with the existing RL stack.
- *
- * The adapters are thin and explicit — every mandatory `RunRecord` field
+ * Adapters are thin and explicit — every mandatory `RunRecord` field
  * comes from a caller-supplied context (`commitSha`, `model`,
  * `promptHash`, `configHash`) plus the trial's runtime data. Defaults
  * exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
@@ -1505,18 +1500,16 @@ interface DetectRewardHackingInput {
 declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
 /**
- * `analyzeOptimizationResult` — unifies the pre-0.22 auto-research stack
+ * `analyzeOptimizationResult` — unifies the auto-research stack
  * (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
- * Ax/AxRLM trace analyst) with the 0.23 RL bridge in a single call.
+ * Ax/AxRLM trace analyst) with the RL bridge in a single call.
  *
- * What this fixes: until 0.23 the optimization stack and the RL bridge
- * lived in parallel namespaces. The optimization primitives produced
- * `TrialResult[]`; the RL bridge consumed `RunRecord[]`. Trace-analyst
- * was decoupled from both. `analyzeOptimizationResult` does the wiring
- * once so consumers don't have to:
+ * The optimization primitives produce `TrialResult[]`; the RL bridge
+ * consumes `RunRecord[]`. Trace-analyst is independent of both. This
+ * function does the wiring once so consumers don't have to:
  *
- *    Optimization (existing primitives)           RL bridge (0.23)
- *    ──────────────────────────────────           ────────────────
+ *    Optimization (existing primitives)           RL bridge
+ *    ──────────────────────────────────           ────────
  *    runPromptEvolution → TrialResult[]    →
  *    runMultiShotOptimization → MSTrial[]  → analyzeOptimizationResult →
  *    reflective-mutation → mutations.jsonl →                             ↓
@@ -1527,10 +1520,10 @@ declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHac
  *    ↓                                                                   │
  *    TraceAnalyst.analyze(progressLog)         ←─────────────────────────┘
  *
- * The output of this function is the canonical RL artifact set:
- * `RunRecord[]` (so every other 0.22+ primitive composes), preference
- * triples, verifiable reward signals, reward-hacking diagnosis,
- * sequential interim verdict, and (when wired) trace-analyst summary.
+ * The output is the canonical RL artifact set: `RunRecord[]` (so every
+ * other RL primitive composes), preference triples, verifiable reward
+ * signals, reward-hacking diagnosis, sequential interim verdict, and
+ * (when wired) trace-analyst summary.
  *
  * What this primitive does NOT do: it does not modify the optimization
  * primitives' internals. They keep producing `TrialResult` and emitting
@@ -1609,11 +1602,7 @@ declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOption
  * `PredictiveValidityResearcher` — concrete `Researcher` implementation
  * that drives selection from outcome-anchored predictive validity.
  *
- * `Researcher` was a placeholder interface plus `NoopResearcher` until
- * 0.23. The 0.23 panel critique called this out: shipping the interface
- * without a default implementation that drives the loop is incomplete.
- *
- * This researcher answers each method:
+ * Each method:
  *
  *   - `inspectFailures(runs)` — synthesizes failure modes from the
  *     bottom-quartile of `RunRecord`s on the configured proxy reward.
@@ -1676,14 +1665,10 @@ declare class PredictiveValidityResearcher implements Researcher {
 }
 /**
- * `runRLCampaign` — the missing top-level orchestrator.
+ * `runRLCampaign` — top-level orchestrator that runs the matrix and
+ * produces every RL-ready artifact in one call.
  *
- * `runEvalCampaign` runs the matrix and produces `RunRecord[]`. The 0.23
- * RL primitives consume that artifact in different ways. Until 0.24 they
- * had to be wired together by hand at every consumer; that defeats the
- * cohesion the package is supposed to provide.
- *
- * `runRLCampaign` wires:
+ * Wires:
  *   1. `runEvalCampaign` for the matrix run (capture, integrity, hooks)
  *   2. `extractVerifiableReward` over each run, separating deterministic
  *      from probabilistic reward sources for the trainer
@@ -1697,9 +1682,6 @@ declare class PredictiveValidityResearcher implements Researcher {
  * stage's output is in there. The consumer's downstream fits in a single
  * line: pass `result.preferences` to their DPO trainer, `result.grpoRows`
  * to GRPO, `result.runs` plus `result.rewardSignals` to a custom RL loop.
- *
- * This is what the 0.23 panel critique called the "missing top-level
- * primitive." Now shipped.
  */
 interface RunRLCampaignOptions<V> extends EvalCampaignOptions<V> {

package/dist/rl.js CHANGED

@@ -1,23 +1,23 @@
 import {
   runEvalCampaign
-} from "./chunk-SESZDQPX.js";
+} from "./chunk-RUI6SIHY.js";
 import "./chunk-4S4BM3QQ.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";
 import {
   evaluateInterimReleaseConfidence
-} from "./chunk-NU65VQ7M.js";
+} from "./chunk-MAZ26DC7.js";
 import {
   benjaminiHochberg
-} from "./chunk-2A5XJB43.js";
+} from "./chunk-5AKPEK5L.js";
 import {
   wilcoxonSignedRank
-} from "./chunk-I4MBDTY5.js";
+} from "./chunk-R5UQJNKC.js";
 import "./chunk-KTGTIOFD.js";
 import "./chunk-PC4UYEBM.js";
 import "./chunk-TVVP3ZZQ.js";
-import "./chunk-4F5DQN55.js";
+import "./chunk-VSMTAMNK.js";
 import {
   ValidationError
 } from "./chunk-NG236HPC.js";