@tangle-network/agent-eval 0.79.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +50 -19
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +1 -1
  5. package/dist/analyst/index.d.ts +3 -3
  6. package/dist/belief-state/index.d.ts +188 -0
  7. package/dist/belief-state/index.js +486 -0
  8. package/dist/belief-state/index.js.map +1 -0
  9. package/dist/calibration-Cpr3WaX3.d.ts +101 -0
  10. package/dist/campaign/index.d.ts +5 -5
  11. package/dist/chunk-4DIJWVUT.js +131 -0
  12. package/dist/chunk-4DIJWVUT.js.map +1 -0
  13. package/dist/chunk-NPCTHQIO.js +91 -0
  14. package/dist/chunk-NPCTHQIO.js.map +1 -0
  15. package/dist/contract/index.d.ts +123 -10
  16. package/dist/contract/index.js +116 -0
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/governance/index.d.ts +1 -1
  19. package/dist/hosted/index.d.ts +1 -1
  20. package/dist/index.d.ts +5 -5
  21. package/dist/meta-eval/index.d.ts +5 -98
  22. package/dist/meta-eval/index.js +7 -76
  23. package/dist/meta-eval/index.js.map +1 -1
  24. package/dist/off-policy-DiwuKKg7.d.ts +132 -0
  25. package/dist/openapi.json +1 -1
  26. package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
  27. package/dist/{provenance-CEAJI9rm.d.ts → provenance-jG-Gngg8.d.ts} +2 -2
  28. package/dist/{registry-BmEuU94S.d.ts → registry-BK0Zee01.d.ts} +1 -1
  29. package/dist/reporting.d.ts +2 -2
  30. package/dist/rl.d.ts +6 -136
  31. package/dist/rl.js +6 -120
  32. package/dist/rl.js.map +1 -1
  33. package/dist/{rubric-predictive-validity-CWyWWLBg.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +1 -1
  34. package/dist/{run-improvement-loop-Bgu4C59E.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +1 -1
  35. package/dist/{semantic-concept-judge-Du4ZVyef.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +1 -1
  36. package/dist/{types-QHG0KnkF.d.ts → types-4mm2msnR.d.ts} +1 -1
  37. package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
  38. package/docs/research/research-roadmap.md +1 -0
  39. package/package.json +7 -2
package/dist/rl.d.ts CHANGED
@@ -1,11 +1,12 @@
1
+ export { O as OffPolicyEstimate, a as OffPolicyOptions, b as OffPolicyTrajectory, d as doublyRobust, i as inverseProbabilityWeighting, o as offPolicyEstimateAll, s as selfNormalizedImportanceWeighting } from './off-policy-DiwuKKg7.js';
1
2
  import { R as RunRecord, b as RunSplitTag } from './run-record-sItO5ftF.js';
2
- import { f as CampaignResult } from './types-QHG0KnkF.js';
3
+ import { g as CampaignResult } from './types-4mm2msnR.js';
3
4
  import { a as VerificationReport } from './multi-layer-verifier-DlWCXuxL.js';
4
5
  import { S as Span } from './schema-m0gsnbt3.js';
5
6
  import { T as TraceStore } from './store-CKUAgsJz.js';
6
- import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
7
- export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-D6KWmYvj.js';
8
- import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CWyWWLBg.js';
7
+ import { b as OutcomeStore } from './outcome-store-rnXLEqSn.js';
8
+ export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from './outcome-store-rnXLEqSn.js';
9
+ import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-CLPuwiUw.js';
9
10
  import { R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-rInLj9De.js';
10
11
  export { r as runEvalCampaign } from './researcher-rInLj9De.js';
11
12
  import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
@@ -261,137 +262,6 @@ declare function injectIrrelevantClause<S extends {
261
262
  prompt: string;
262
263
  }>(clause: string, position?: 'prefix' | 'suffix'): ScenarioPerturbation<S>;
263
264
 
264
- /**
265
- * Off-policy evaluation primitives.
266
- *
267
- * Standard inverse-probability-weighted (IPS), self-normalized
268
- * importance-weighted (SNIPS), and doubly-robust (DR) estimators for the
269
- * value of a *target* policy given trajectories collected under a
270
- * *behavior* policy. This is the canonical RL eval task: "we have last
271
- * week's runs, we changed the policy — how would the new one do without
272
- * re-running?"
273
- *
274
- * The math here is textbook (Dudík, Langford, Li 2011 for DR; Swaminathan
275
- * & Joachims 2015 for SNIPS) but the *application* to LLM-agent
276
- * evaluation needs care:
277
- *
278
- * - The "policy" is the (prompt, tool config, model snapshot) triple.
279
- * Two policies have the same probability over an action *iff* their
280
- * LLM call would emit the same token with the same probability —
281
- * which is generally unknowable without the model log-probs.
282
- * - For LLM agents, propensity scores must be supplied by the caller
283
- * (logged in the trace, recovered from token log-probs, or estimated
284
- * via a learned propensity model). We do NOT estimate propensity here.
285
- * - Doubly-robust requires a Q-function (model-based reward predictor).
286
- * We accept any callable; consumers pass either a tabular average,
287
- * a regression fit, or a learned reward model.
288
- *
289
- * Bias / variance tradeoffs:
290
- * - IPS: unbiased; high variance for small overlap, infinite variance
291
- * when target has support outside behavior.
292
- * - SNIPS: lower variance, slight bias; usually preferred in practice.
293
- * - DR: doubly-robust — unbiased if either propensity OR Q-function is
294
- * correct. Lowest practical variance when Q is decent. Use this.
295
- *
296
- * Caveat the panel will land: on the LLM-agent setting, propensity scores
297
- * recovered from token log-probs are noisy, the action space is enormous,
298
- * and overlap is often poor. These estimators are useful but not magic;
299
- * complement with `replayCampaign` (exact replay where the request hashes
300
- * match) for high-confidence answers and OPE for the gap.
301
- */
302
- interface OffPolicyTrajectory {
303
- /** Stable id, for traceability through the dataset. */
304
- runId: string;
305
- /** Reward observed under the behavior policy (the realized outcome). */
306
- reward: number;
307
- /**
308
- * Behavior-policy probability of the action that was taken. For LLM
309
- * agents this is typically `exp(sum(token_log_probs))` over the chosen
310
- * trajectory. Must be in (0, 1].
311
- */
312
- behaviorProb: number;
313
- /**
314
- * Target-policy probability of the same action. For replay-style
315
- * counterfactual evaluation this is what the *new* policy would have
316
- * assigned to the *old* trajectory. Must be in [0, 1].
317
- */
318
- targetProb: number;
319
- /**
320
- * Optional model-based reward prediction at the same context. Used by
321
- * `doublyRobust`. Set to `null` for IPS-only evaluation.
322
- */
323
- qHat?: number | null;
324
- }
325
- interface OffPolicyEstimate {
326
- /** Estimated value of the target policy. */
327
- value: number;
328
- /** Standard error of the estimate. */
329
- standardError: number;
330
- /** Effective sample size (Kong 1992). Lower = more reliance on a few high-weight samples. */
331
- effectiveSampleSize: number;
332
- /** Number of trajectories used. */
333
- n: number;
334
- /**
335
- * Diagnostic: maximum importance weight observed. Large values (>>10x
336
- * mean) are a red flag — variance is dominated by a few outliers.
337
- */
338
- maxImportanceWeight: number;
339
- }
340
- interface OffPolicyOptions {
341
- /**
342
- * Cap importance weights at this value (Ionides 2008 truncated IS) to
343
- * trade unbiasedness for variance reduction. Default `Infinity` (no cap).
344
- * Set e.g. `10` for stable estimates when the policies are close.
345
- */
346
- weightCap?: number;
347
- /** Reward clipping range. Default `[0, 1]`. */
348
- rewardClip?: {
349
- low: number;
350
- high: number;
351
- };
352
- }
353
- /**
354
- * Inverse Probability Weighting (Horvitz-Thompson). Unbiased estimator
355
- * of E[reward under target policy]. Variance scales with the spread of
356
- * target/behavior ratios.
357
- */
358
- declare function inverseProbabilityWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
359
- /**
360
- * Self-Normalized Importance Sampling. Lower variance than vanilla IPS at
361
- * the cost of small bias (vanishing as N grows). The right default for
362
- * LLM-agent evaluation where overlap is often poor.
363
- */
364
- declare function selfNormalizedImportanceWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
365
- /**
366
- * Doubly-robust off-policy estimator (Dudík, Langford, Li 2011).
367
- *
368
- * V_DR = (1/N) * sum_i [ q_hat_i + (target_prob_i / behavior_prob_i) * (r_i - q_hat_i) ]
369
- *
370
- * Unbiased if EITHER:
371
- * - the importance ratios are correct (IPS-style validity), OR
372
- * - the Q-hat function is correct (model-based validity).
373
- *
374
- * In practice both are imperfect, but the residual bias is the *product*
375
- * of both errors — much smaller than either alone. This is why DR is the
376
- * default in production OPE pipelines.
377
- *
378
- * Requires `qHat` on every trajectory. If any are `null`, the estimator
379
- * falls back to SNIPS for those entries (loud-fallback behavior; the
380
- * report's `n` reflects the full set but `effectiveSampleSize` accounts
381
- * for the lost variance reduction).
382
- */
383
- declare function doublyRobust(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
384
- /**
385
- * Convenience: run all three estimators and return them side-by-side.
386
- * The recommended diagnostic — agreement across estimators is a much
387
- * stronger signal than any single one.
388
- */
389
- declare function offPolicyEstimateAll(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): {
390
- ips: OffPolicyEstimate;
391
- snips: OffPolicyEstimate;
392
- dr: OffPolicyEstimate;
393
- };
394
-
395
265
  /**
396
266
  * Preference dataset extraction — bridge from `RunRecord[]` to RL training.
397
267
  *
@@ -1781,4 +1651,4 @@ interface RLCampaignResult<V> {
1781
1651
  }
1782
1652
  declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
1783
1653
 
1784
- export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CorpusAppendResult, type CorpusRecord, type CurriculumAllocation, type DatasetFormat, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type HarvestOptions, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, OutcomeStore, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RewardKind, type RewardStats, type RlDatasetBundle, type RlDatasetConfig, type RlDatasetManifest, type RlDatasetStats, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, appendToCorpus, applyEloUpdate, bestOfN, buildDatasetFromCorpus, buildPairwiseFromCampaign, buildRlDataset, campaignToRunRecords, compareAdaptationCurves, datasheetToMarkdown, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, readCorpus, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };
1654
+ export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CorpusAppendResult, type CorpusRecord, type CurriculumAllocation, type DatasetFormat, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type HarvestOptions, OutcomeStore, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RewardKind, type RewardStats, type RlDatasetBundle, type RlDatasetConfig, type RlDatasetManifest, type RlDatasetStats, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, appendToCorpus, applyEloUpdate, bestOfN, buildDatasetFromCorpus, buildPairwiseFromCampaign, buildRlDataset, campaignToRunRecords, compareAdaptationCurves, datasheetToMarkdown, detectRewardHacking, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, fitBradleyTerry, injectIrrelevantClause, observationsFromRunRecords, paretoFrontier, prmTrainingPairs, readCorpus, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };
package/dist/rl.js CHANGED
@@ -1,3 +1,9 @@
1
+ import {
2
+ doublyRobust,
3
+ inverseProbabilityWeighting,
4
+ offPolicyEstimateAll,
5
+ selfNormalizedImportanceWeighting
6
+ } from "./chunk-4DIJWVUT.js";
1
7
  import {
2
8
  detectRewardHacking,
3
9
  extractVerifiableReward,
@@ -231,126 +237,6 @@ function escapeRegex(s) {
231
237
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
232
238
  }
233
239
 
234
- // src/rl/off-policy.ts
235
- function inverseProbabilityWeighting(trajectories, opts = {}) {
236
- const cap = opts.weightCap ?? Infinity;
237
- const clip = opts.rewardClip ?? { low: 0, high: 1 };
238
- if (trajectories.length === 0) {
239
- return zeroEstimate();
240
- }
241
- const weights = [];
242
- const weightedRewards = [];
243
- let maxW = 0;
244
- for (const t of trajectories) {
245
- if (t.behaviorProb <= 0) {
246
- throw new ValidationError(
247
- `inverseProbabilityWeighting: behaviorProb must be > 0 (runId=${t.runId})`
248
- );
249
- }
250
- const w = Math.min(cap, t.targetProb / t.behaviorProb);
251
- const r = clamp(t.reward, clip.low, clip.high);
252
- weights.push(w);
253
- weightedRewards.push(w * r);
254
- if (w > maxW) maxW = w;
255
- }
256
- const n = weights.length;
257
- const value = weightedRewards.reduce((s, x) => s + x, 0) / n;
258
- const variance = weightedRewards.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
259
- const sumW = weights.reduce((s, w) => s + w, 0);
260
- const sumW2 = weights.reduce((s, w) => s + w * w, 0);
261
- const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
262
- return {
263
- value,
264
- standardError: Math.sqrt(variance / n),
265
- effectiveSampleSize: effN,
266
- n,
267
- maxImportanceWeight: maxW
268
- };
269
- }
270
- function selfNormalizedImportanceWeighting(trajectories, opts = {}) {
271
- const cap = opts.weightCap ?? Infinity;
272
- const clip = opts.rewardClip ?? { low: 0, high: 1 };
273
- if (trajectories.length === 0) return zeroEstimate();
274
- const weights = [];
275
- const rewards = [];
276
- let maxW = 0;
277
- for (const t of trajectories) {
278
- if (t.behaviorProb <= 0) {
279
- throw new ValidationError(
280
- `selfNormalizedImportanceWeighting: behaviorProb must be > 0 (runId=${t.runId})`
281
- );
282
- }
283
- const w = Math.min(cap, t.targetProb / t.behaviorProb);
284
- weights.push(w);
285
- rewards.push(clamp(t.reward, clip.low, clip.high));
286
- if (w > maxW) maxW = w;
287
- }
288
- const sumW = weights.reduce((s, w) => s + w, 0);
289
- const sumWR = weights.reduce((s, w, i) => s + w * rewards[i], 0);
290
- const value = sumW === 0 ? 0 : sumWR / sumW;
291
- const sumW2 = weights.reduce((s, w) => s + w * w, 0);
292
- const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
293
- const phi = weights.map((w, i) => w * (rewards[i] - value));
294
- const variance = phi.reduce((s, x) => s + x * x, 0) / Math.max(1, sumW * sumW);
295
- return {
296
- value,
297
- standardError: Math.sqrt(variance),
298
- effectiveSampleSize: effN,
299
- n: trajectories.length,
300
- maxImportanceWeight: maxW
301
- };
302
- }
303
- function doublyRobust(trajectories, opts = {}) {
304
- const cap = opts.weightCap ?? Infinity;
305
- const clip = opts.rewardClip ?? { low: 0, high: 1 };
306
- if (trajectories.length === 0) return zeroEstimate();
307
- const contributions = [];
308
- let maxW = 0;
309
- let sumW = 0;
310
- let sumW2 = 0;
311
- for (const t of trajectories) {
312
- if (t.behaviorProb <= 0) {
313
- throw new ValidationError(`doublyRobust: behaviorProb must be > 0 (runId=${t.runId})`);
314
- }
315
- const w = Math.min(cap, t.targetProb / t.behaviorProb);
316
- const r = clamp(t.reward, clip.low, clip.high);
317
- const q = typeof t.qHat === "number" && Number.isFinite(t.qHat) ? clamp(t.qHat, clip.low, clip.high) : null;
318
- if (q === null) {
319
- contributions.push(w * r);
320
- } else {
321
- contributions.push(q + w * (r - q));
322
- }
323
- if (w > maxW) maxW = w;
324
- sumW += w;
325
- sumW2 += w * w;
326
- }
327
- const n = contributions.length;
328
- const value = contributions.reduce((s, x) => s + x, 0) / n;
329
- const variance = contributions.reduce((s, x) => s + (x - value) ** 2, 0) / Math.max(1, n - 1);
330
- const effN = sumW === 0 ? 0 : sumW * sumW / sumW2;
331
- return {
332
- value,
333
- standardError: Math.sqrt(variance / n),
334
- effectiveSampleSize: effN,
335
- n,
336
- maxImportanceWeight: maxW
337
- };
338
- }
339
- function offPolicyEstimateAll(trajectories, opts = {}) {
340
- return {
341
- ips: inverseProbabilityWeighting(trajectories, opts),
342
- snips: selfNormalizedImportanceWeighting(trajectories, opts),
343
- dr: doublyRobust(trajectories, opts)
344
- };
345
- }
346
- function zeroEstimate() {
347
- return { value: 0, standardError: 0, effectiveSampleSize: 0, n: 0, maxImportanceWeight: 0 };
348
- }
349
- function clamp(x, lo, hi) {
350
- if (!Number.isFinite(x)) return lo;
351
- return Math.max(lo, Math.min(hi, x));
352
- }
353
-
354
240
  // src/rl/preferences.ts
355
241
  var SPLIT_TAG_DEFAULT = "holdout";
356
242
  var DEFAULT_REWARD = (run) => {