@tangle-network/agent-eval 0.20.12 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/CHANGELOG.md +76 -0
  2. package/README.md +39 -1
  3. package/dist/{chunk-75MCTH7P.js → chunk-3GN6U53I.js} +198 -3
  4. package/dist/chunk-3GN6U53I.js.map +1 -0
  5. package/dist/chunk-3IX6QTB7.js +1349 -0
  6. package/dist/chunk-3IX6QTB7.js.map +1 -0
  7. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  8. package/dist/chunk-5IIQKMD5.js.map +1 -0
  9. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  10. package/dist/{chunk-HKYRWNHV.js → chunk-HRZELXCR.js} +2 -2
  11. package/dist/{chunk-ODFINDLQ.js → chunk-KRR4VMH7.js} +11 -1
  12. package/dist/chunk-KRR4VMH7.js.map +1 -0
  13. package/dist/chunk-SNUHRBDL.js +154 -0
  14. package/dist/chunk-SNUHRBDL.js.map +1 -0
  15. package/dist/{chunk-KWUAAIHR.js → chunk-WOK2RTWG.js} +157 -1
  16. package/dist/chunk-WOK2RTWG.js.map +1 -0
  17. package/dist/{chunk-HNJLMAJ2.js → chunk-WOPGKVN4.js} +2 -2
  18. package/dist/cli.js +3 -2
  19. package/dist/cli.js.map +1 -1
  20. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  21. package/dist/control.d.ts +4 -3
  22. package/dist/control.js +2 -2
  23. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  24. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  25. package/dist/index.d.ts +71 -83
  26. package/dist/index.js +48 -60
  27. package/dist/index.js.map +1 -1
  28. package/dist/openapi.json +1 -1
  29. package/dist/optimization.d.ts +3 -2
  30. package/dist/optimization.js +2 -2
  31. package/dist/reporting-Da2ihlcM.d.ts +672 -0
  32. package/dist/reporting.d.ts +5 -426
  33. package/dist/reporting.js +6 -2
  34. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  35. package/dist/traces.d.ts +259 -3
  36. package/dist/traces.js +24 -4
  37. package/dist/wire/index.js +3 -2
  38. package/docs/research-report-methodology.md +155 -0
  39. package/package.json +10 -12
  40. package/dist/chunk-75MCTH7P.js.map +0 -1
  41. package/dist/chunk-IKFVX537.js +0 -717
  42. package/dist/chunk-IKFVX537.js.map +0 -1
  43. package/dist/chunk-KWUAAIHR.js.map +0 -1
  44. package/dist/chunk-ODFINDLQ.js.map +0 -1
  45. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  46. package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
  47. package/dist/{chunk-HKYRWNHV.js.map → chunk-HRZELXCR.js.map} +0 -0
  48. package/dist/{chunk-HNJLMAJ2.js.map → chunk-WOPGKVN4.js.map} +0 -0
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  stopOnNoProgress,
20
20
  stopOnRepeatedAction,
21
21
  subjectiveEval
22
- } from "./chunk-MCMV7DUL.js";
22
+ } from "./chunk-ARZ6BEV6.js";
23
23
  import {
24
24
  CallbackResearcher,
25
25
  DEFAULT_MUTATION_PRIMITIVES,
@@ -53,7 +53,7 @@ import {
53
53
  summarizePreferenceMemory,
54
54
  trialTraceFromMultiShotTrial,
55
55
  withAssignedFeedbackSplit
56
- } from "./chunk-HKYRWNHV.js";
56
+ } from "./chunk-HRZELXCR.js";
57
57
  import {
58
58
  RunRecordValidationError,
59
59
  isRunRecord,
@@ -62,16 +62,23 @@ import {
62
62
  validateRunRecord
63
63
  } from "./chunk-YUFXO3TU.js";
64
64
  import {
65
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
65
66
  assertReleaseConfidence,
66
67
  bootstrapCi,
68
+ canonicalize,
69
+ evaluateHypothesis,
67
70
  evaluateReleaseConfidence,
68
71
  gainHistogram,
72
+ hashJson,
69
73
  judgeReplayGate,
70
74
  paretoChart,
71
75
  releaseTraceEvidenceFromMultiShotTrials,
72
76
  renderReleaseReport,
73
- summaryTable
74
- } from "./chunk-IKFVX537.js";
77
+ researchReport,
78
+ signManifest,
79
+ summaryTable,
80
+ verifyManifest
81
+ } from "./chunk-3IX6QTB7.js";
75
82
  import {
76
83
  benjaminiHochberg,
77
84
  bhAdjust,
@@ -88,7 +95,7 @@ import {
88
95
  requiredSampleSize,
89
96
  weightedMean,
90
97
  wilcoxonSignedRank
91
- } from "./chunk-ODFINDLQ.js";
98
+ } from "./chunk-KRR4VMH7.js";
92
99
  import {
93
100
  DEFAULT_REDACTION_RULES,
94
101
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -98,6 +105,7 @@ import {
98
105
  OTEL_AGENT_EVAL_SCOPE,
99
106
  OtlpFileTraceStore,
100
107
  REDACTION_VERSION,
108
+ RunIntegrityError,
101
109
  SpanNotFoundError,
102
110
  TRACE_ANALYST_ACTOR_DESCRIPTION,
103
111
  TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -109,6 +117,7 @@ import {
109
117
  aggregateLlm,
110
118
  analyzeTraces,
111
119
  argHash,
120
+ assertRunCaptured,
112
121
  buildTraceAnalystTools,
113
122
  buildTraceInsightContext,
114
123
  buildTraceInsightPrompt,
@@ -131,22 +140,33 @@ import {
131
140
  runFailureClass,
132
141
  runsForScenario,
133
142
  scoreTraceInsightReadiness,
143
+ throwIfRunIncomplete,
134
144
  tokenizeDomainWords,
135
145
  toolSpans,
136
- traceAnalystFunctionGroup
137
- } from "./chunk-KWUAAIHR.js";
146
+ traceAnalystFunctionGroup,
147
+ traceAnalystOnRunComplete
148
+ } from "./chunk-WOK2RTWG.js";
138
149
  import {
139
150
  TraceEmitter,
140
151
  llmSpanFromProvider
141
- } from "./chunk-PKCVBYTQ.js";
152
+ } from "./chunk-5IIQKMD5.js";
142
153
  import {
143
154
  LlmCallError,
144
155
  LlmClient,
156
+ LlmRouteAssertionError,
157
+ assertLlmRoute,
145
158
  callLlm,
146
159
  callLlmJson,
147
160
  probeLlm,
148
161
  stripFencedJson
149
- } from "./chunk-75MCTH7P.js";
162
+ } from "./chunk-3GN6U53I.js";
163
+ import {
164
+ FileSystemRawProviderSink,
165
+ InMemoryRawProviderSink,
166
+ NoopRawProviderSink,
167
+ defaultProviderRedactor,
168
+ providerFromBaseUrl
169
+ } from "./chunk-SNUHRBDL.js";
150
170
  import "./chunk-PZ5AY32C.js";
151
171
 
152
172
  // src/client.ts
@@ -4847,7 +4867,7 @@ var Dataset = class _Dataset {
4847
4867
  * Write to disk for contamination-verifiable archives.
4848
4868
  */
4849
4869
  toJsonl() {
4850
- return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
4870
+ return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
4851
4871
  }
4852
4872
  static fromJsonl(jsonl, manifest) {
4853
4873
  const scenarios = [];
@@ -4860,18 +4880,18 @@ var Dataset = class _Dataset {
4860
4880
  }
4861
4881
  };
4862
4882
  async function hashScenarios(scenarios) {
4863
- const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
4883
+ const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
4864
4884
  const text = JSON.stringify(canonical);
4865
4885
  const bytes = new TextEncoder().encode(text);
4866
4886
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
4867
4887
  return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
4868
4888
  }
4869
- function canonicalize(v) {
4889
+ function canonicalize2(v) {
4870
4890
  if (v === null || typeof v !== "object") return v;
4871
- if (Array.isArray(v)) return v.map(canonicalize);
4891
+ if (Array.isArray(v)) return v.map(canonicalize2);
4872
4892
  const keys = Object.keys(v).sort();
4873
4893
  const out = {};
4874
- for (const k of keys) out[k] = canonicalize(v[k]);
4894
+ for (const k of keys) out[k] = canonicalize2(v[k]);
4875
4895
  return out;
4876
4896
  }
4877
4897
  function seededShuffle(items, seed) {
@@ -6978,51 +6998,6 @@ function attributeStep(op, prmA, prmB) {
6978
6998
  };
6979
6999
  }
6980
7000
 
6981
- // src/pre-registration.ts
6982
- function canonicalize2(v) {
6983
- if (v === null || typeof v !== "object") return v;
6984
- if (Array.isArray(v)) return v.map(canonicalize2);
6985
- const keys = Object.keys(v).sort();
6986
- const out = {};
6987
- for (const k of keys) out[k] = canonicalize2(v[k]);
6988
- return out;
6989
- }
6990
- async function hashJson(obj) {
6991
- const canonical = canonicalize2(obj);
6992
- const bytes = new TextEncoder().encode(JSON.stringify(canonical));
6993
- const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
6994
- return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
6995
- }
6996
- async function signManifest(m) {
6997
- const hash = await hashJson(m);
6998
- return { ...m, contentHash: hash, algo: "sha256-content" };
6999
- }
7000
- async function verifyManifest(m) {
7001
- const { contentHash, algo: _algo, ...rest } = m;
7002
- void _algo;
7003
- const resigned = await signManifest(rest);
7004
- return resigned.contentHash === contentHash;
7005
- }
7006
- async function evaluateHypothesis(manifest, observed) {
7007
- if (!await verifyManifest(manifest)) {
7008
- throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
7009
- }
7010
- const reasons = [];
7011
- const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
7012
- if (!directionOk) reasons.push("wrong_direction");
7013
- if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
7014
- if (observed.pValue >= manifest.alpha) reasons.push("not_significant");
7015
- if (observed.n < manifest.preRegisteredN) reasons.push("undersampled");
7016
- return {
7017
- manifest,
7018
- observedN: observed.n,
7019
- observedEffect: observed.effect,
7020
- observedPValue: observed.pValue,
7021
- confirmed: reasons.length === 0,
7022
- rejectionReasons: reasons
7023
- };
7024
- }
7025
-
7026
7001
  // src/self-play.ts
7027
7002
  async function runSelfPlay(proposer, scorer, targets, options = {}) {
7028
7003
  if (targets.length < 2) throw new Error("runSelfPlay: at least 2 targets required (need a difference to measure)");
@@ -10481,6 +10456,7 @@ export {
10481
10456
  FileSystemExperimentStore,
10482
10457
  FileSystemFeedbackTrajectoryStore,
10483
10458
  FileSystemOutcomeStore,
10459
+ FileSystemRawProviderSink,
10484
10460
  FileSystemTraceStore,
10485
10461
  HeldOutGate,
10486
10462
  HoldoutAuditor,
@@ -10489,6 +10465,7 @@ export {
10489
10465
  InMemoryExperimentStore,
10490
10466
  InMemoryFeedbackTrajectoryStore,
10491
10467
  InMemoryOutcomeStore,
10468
+ InMemoryRawProviderSink,
10492
10469
  InMemoryTraceStore,
10493
10470
  InMemoryTrialCache,
10494
10471
  InMemoryWorkspaceInspector,
@@ -10497,12 +10474,14 @@ export {
10497
10474
  LineageRecorder,
10498
10475
  LlmCallError,
10499
10476
  LlmClient,
10477
+ LlmRouteAssertionError,
10500
10478
  LockedJsonlAppender,
10501
10479
  MODEL_PRICING,
10502
10480
  MetricsCollector,
10503
10481
  MultiLayerVerifier,
10504
10482
  MutationTelemetry,
10505
10483
  Mutex,
10484
+ NoopRawProviderSink,
10506
10485
  NoopResearcher,
10507
10486
  OTEL_AGENT_EVAL_SCOPE,
10508
10487
  OtlpFileTraceStore,
@@ -10512,7 +10491,9 @@ export {
10512
10491
  ProjectRegistry,
10513
10492
  PromptRegistry,
10514
10493
  REDACTION_VERSION,
10494
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
10515
10495
  RunCritic,
10496
+ RunIntegrityError,
10516
10497
  RunRecordValidationError,
10517
10498
  SEMANTIC_CONCEPT_JUDGE_VERSION,
10518
10499
  SandboxHarness,
@@ -10539,7 +10520,9 @@ export {
10539
10520
  analyzeSeries,
10540
10521
  analyzeTraces,
10541
10522
  argHash,
10523
+ assertLlmRoute,
10542
10524
  assertReleaseConfidence,
10525
+ assertRunCaptured,
10543
10526
  assignFeedbackSplit,
10544
10527
  attributeCounterfactuals,
10545
10528
  deterministicSplit as benchmarkDeterministicSplit,
@@ -10563,7 +10546,7 @@ export {
10563
10546
  callLlm,
10564
10547
  callLlmJson,
10565
10548
  canaryLeakView,
10566
- canonicalize2 as canonicalize,
10549
+ canonicalize,
10567
10550
  causalAttribution,
10568
10551
  checkBehavioralCanary,
10569
10552
  checkCanaries,
@@ -10606,6 +10589,7 @@ export {
10606
10589
  decideReferenceReplayRunPromotion,
10607
10590
  defaultJudges,
10608
10591
  defaultMultiShotObjectives,
10592
+ defaultProviderRedactor,
10609
10593
  defaultReferenceReplayMatcher,
10610
10594
  defaultTraceInsightPanel,
10611
10595
  deployGateLayer,
@@ -10720,6 +10704,7 @@ export {
10720
10704
  probeLlm,
10721
10705
  promptBisect,
10722
10706
  proposeSynthesisTargets,
10707
+ providerFromBaseUrl,
10723
10708
  pytestTestParser,
10724
10709
  redTeamDataset,
10725
10710
  redTeamReport,
@@ -10742,6 +10727,7 @@ export {
10742
10727
  replayScorerOverCorpus,
10743
10728
  replayTraceThroughJudge,
10744
10729
  requiredSampleSize,
10730
+ researchReport,
10745
10731
  resetLockedAppendersForTesting,
10746
10732
  resumeBuilderSession,
10747
10733
  roundTripRunRecord,
@@ -10799,6 +10785,7 @@ export {
10799
10785
  summaryTable,
10800
10786
  testJudge,
10801
10787
  textInSnapshot,
10788
+ throwIfRunIncomplete,
10802
10789
  toLangfuseEnvelope,
10803
10790
  toNdjson,
10804
10791
  toPrometheusText,
@@ -10810,6 +10797,7 @@ export {
10810
10797
  toolSuccessRubric,
10811
10798
  toolWasteView,
10812
10799
  traceAnalystFunctionGroup,
10800
+ traceAnalystOnRunComplete,
10813
10801
  trialTraceFromMultiShotTrial,
10814
10802
  typoMutator,
10815
10803
  urlContains,