@tangle-network/agent-eval 0.20.12 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/README.md +39 -1
- package/dist/{chunk-75MCTH7P.js → chunk-3GN6U53I.js} +198 -3
- package/dist/chunk-3GN6U53I.js.map +1 -0
- package/dist/chunk-3IX6QTB7.js +1349 -0
- package/dist/chunk-3IX6QTB7.js.map +1 -0
- package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
- package/dist/{chunk-HKYRWNHV.js → chunk-HRZELXCR.js} +2 -2
- package/dist/{chunk-ODFINDLQ.js → chunk-KRR4VMH7.js} +11 -1
- package/dist/chunk-KRR4VMH7.js.map +1 -0
- package/dist/chunk-SNUHRBDL.js +154 -0
- package/dist/chunk-SNUHRBDL.js.map +1 -0
- package/dist/{chunk-KWUAAIHR.js → chunk-WOK2RTWG.js} +157 -1
- package/dist/chunk-WOK2RTWG.js.map +1 -0
- package/dist/{chunk-HNJLMAJ2.js → chunk-WOPGKVN4.js} +2 -2
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
- package/dist/control.d.ts +4 -3
- package/dist/control.js +2 -2
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
- package/dist/index.d.ts +71 -83
- package/dist/index.js +48 -60
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -2
- package/dist/optimization.js +2 -2
- package/dist/reporting-Da2ihlcM.d.ts +672 -0
- package/dist/reporting.d.ts +5 -426
- package/dist/reporting.js +6 -2
- package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
- package/dist/traces.d.ts +259 -3
- package/dist/traces.js +24 -4
- package/dist/wire/index.js +3 -2
- package/docs/research-report-methodology.md +155 -0
- package/package.json +10 -12
- package/dist/chunk-75MCTH7P.js.map +0 -1
- package/dist/chunk-IKFVX537.js +0 -717
- package/dist/chunk-IKFVX537.js.map +0 -1
- package/dist/chunk-KWUAAIHR.js.map +0 -1
- package/dist/chunk-ODFINDLQ.js.map +0 -1
- package/dist/chunk-PKCVBYTQ.js.map +0 -1
- /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
- /package/dist/{chunk-HKYRWNHV.js.map → chunk-HRZELXCR.js.map} +0 -0
- /package/dist/{chunk-HNJLMAJ2.js.map → chunk-WOPGKVN4.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
stopOnNoProgress,
|
|
20
20
|
stopOnRepeatedAction,
|
|
21
21
|
subjectiveEval
|
|
22
|
-
} from "./chunk-MCMV7DUL.js";
|
|
22
|
+
} from "./chunk-ARZ6BEV6.js";
|
|
23
23
|
import {
|
|
24
24
|
CallbackResearcher,
|
|
25
25
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
@@ -53,7 +53,7 @@ import {
|
|
|
53
53
|
summarizePreferenceMemory,
|
|
54
54
|
trialTraceFromMultiShotTrial,
|
|
55
55
|
withAssignedFeedbackSplit
|
|
56
|
-
} from "./chunk-HKYRWNHV.js";
|
|
56
|
+
} from "./chunk-HRZELXCR.js";
|
|
57
57
|
import {
|
|
58
58
|
RunRecordValidationError,
|
|
59
59
|
isRunRecord,
|
|
@@ -62,16 +62,23 @@ import {
|
|
|
62
62
|
validateRunRecord
|
|
63
63
|
} from "./chunk-YUFXO3TU.js";
|
|
64
64
|
import {
|
|
65
|
+
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
65
66
|
assertReleaseConfidence,
|
|
66
67
|
bootstrapCi,
|
|
68
|
+
canonicalize,
|
|
69
|
+
evaluateHypothesis,
|
|
67
70
|
evaluateReleaseConfidence,
|
|
68
71
|
gainHistogram,
|
|
72
|
+
hashJson,
|
|
69
73
|
judgeReplayGate,
|
|
70
74
|
paretoChart,
|
|
71
75
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
72
76
|
renderReleaseReport,
|
|
73
|
-
|
|
74
|
-
|
|
77
|
+
researchReport,
|
|
78
|
+
signManifest,
|
|
79
|
+
summaryTable,
|
|
80
|
+
verifyManifest
|
|
81
|
+
} from "./chunk-3IX6QTB7.js";
|
|
75
82
|
import {
|
|
76
83
|
benjaminiHochberg,
|
|
77
84
|
bhAdjust,
|
|
@@ -88,7 +95,7 @@ import {
|
|
|
88
95
|
requiredSampleSize,
|
|
89
96
|
weightedMean,
|
|
90
97
|
wilcoxonSignedRank
|
|
91
|
-
} from "./chunk-ODFINDLQ.js";
|
|
98
|
+
} from "./chunk-KRR4VMH7.js";
|
|
92
99
|
import {
|
|
93
100
|
DEFAULT_REDACTION_RULES,
|
|
94
101
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
@@ -98,6 +105,7 @@ import {
|
|
|
98
105
|
OTEL_AGENT_EVAL_SCOPE,
|
|
99
106
|
OtlpFileTraceStore,
|
|
100
107
|
REDACTION_VERSION,
|
|
108
|
+
RunIntegrityError,
|
|
101
109
|
SpanNotFoundError,
|
|
102
110
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
103
111
|
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
@@ -109,6 +117,7 @@ import {
|
|
|
109
117
|
aggregateLlm,
|
|
110
118
|
analyzeTraces,
|
|
111
119
|
argHash,
|
|
120
|
+
assertRunCaptured,
|
|
112
121
|
buildTraceAnalystTools,
|
|
113
122
|
buildTraceInsightContext,
|
|
114
123
|
buildTraceInsightPrompt,
|
|
@@ -131,22 +140,33 @@ import {
|
|
|
131
140
|
runFailureClass,
|
|
132
141
|
runsForScenario,
|
|
133
142
|
scoreTraceInsightReadiness,
|
|
143
|
+
throwIfRunIncomplete,
|
|
134
144
|
tokenizeDomainWords,
|
|
135
145
|
toolSpans,
|
|
136
|
-
traceAnalystFunctionGroup
|
|
137
|
-
|
|
146
|
+
traceAnalystFunctionGroup,
|
|
147
|
+
traceAnalystOnRunComplete
|
|
148
|
+
} from "./chunk-WOK2RTWG.js";
|
|
138
149
|
import {
|
|
139
150
|
TraceEmitter,
|
|
140
151
|
llmSpanFromProvider
|
|
141
|
-
} from "./chunk-PKCVBYTQ.js";
|
|
152
|
+
} from "./chunk-5IIQKMD5.js";
|
|
142
153
|
import {
|
|
143
154
|
LlmCallError,
|
|
144
155
|
LlmClient,
|
|
156
|
+
LlmRouteAssertionError,
|
|
157
|
+
assertLlmRoute,
|
|
145
158
|
callLlm,
|
|
146
159
|
callLlmJson,
|
|
147
160
|
probeLlm,
|
|
148
161
|
stripFencedJson
|
|
149
|
-
} from "./chunk-75MCTH7P.js";
|
|
162
|
+
} from "./chunk-3GN6U53I.js";
|
|
163
|
+
import {
|
|
164
|
+
FileSystemRawProviderSink,
|
|
165
|
+
InMemoryRawProviderSink,
|
|
166
|
+
NoopRawProviderSink,
|
|
167
|
+
defaultProviderRedactor,
|
|
168
|
+
providerFromBaseUrl
|
|
169
|
+
} from "./chunk-SNUHRBDL.js";
|
|
150
170
|
import "./chunk-PZ5AY32C.js";
|
|
151
171
|
|
|
152
172
|
// src/client.ts
|
|
@@ -4847,7 +4867,7 @@ var Dataset = class _Dataset {
|
|
|
4847
4867
|
* Write to disk for contamination-verifiable archives.
|
|
4848
4868
|
*/
|
|
4849
4869
|
toJsonl() {
|
|
4850
|
-
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
|
|
4870
|
+
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
|
|
4851
4871
|
}
|
|
4852
4872
|
static fromJsonl(jsonl, manifest) {
|
|
4853
4873
|
const scenarios = [];
|
|
@@ -4860,18 +4880,18 @@ var Dataset = class _Dataset {
|
|
|
4860
4880
|
}
|
|
4861
4881
|
};
|
|
4862
4882
|
async function hashScenarios(scenarios) {
|
|
4863
|
-
const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
|
|
4883
|
+
const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
|
|
4864
4884
|
const text = JSON.stringify(canonical);
|
|
4865
4885
|
const bytes = new TextEncoder().encode(text);
|
|
4866
4886
|
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
4867
4887
|
return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
4868
4888
|
}
|
|
4869
|
-
function canonicalize(v) {
|
|
4889
|
+
function canonicalize2(v) {
|
|
4870
4890
|
if (v === null || typeof v !== "object") return v;
|
|
4871
|
-
if (Array.isArray(v)) return v.map(canonicalize);
|
|
4891
|
+
if (Array.isArray(v)) return v.map(canonicalize2);
|
|
4872
4892
|
const keys = Object.keys(v).sort();
|
|
4873
4893
|
const out = {};
|
|
4874
|
-
for (const k of keys) out[k] = canonicalize(v[k]);
|
|
4894
|
+
for (const k of keys) out[k] = canonicalize2(v[k]);
|
|
4875
4895
|
return out;
|
|
4876
4896
|
}
|
|
4877
4897
|
function seededShuffle(items, seed) {
|
|
@@ -6978,51 +6998,6 @@ function attributeStep(op, prmA, prmB) {
|
|
|
6978
6998
|
};
|
|
6979
6999
|
}
|
|
6980
7000
|
|
|
6981
|
-
// src/pre-registration.ts
|
|
6982
|
-
function canonicalize2(v) {
|
|
6983
|
-
if (v === null || typeof v !== "object") return v;
|
|
6984
|
-
if (Array.isArray(v)) return v.map(canonicalize2);
|
|
6985
|
-
const keys = Object.keys(v).sort();
|
|
6986
|
-
const out = {};
|
|
6987
|
-
for (const k of keys) out[k] = canonicalize2(v[k]);
|
|
6988
|
-
return out;
|
|
6989
|
-
}
|
|
6990
|
-
async function hashJson(obj) {
|
|
6991
|
-
const canonical = canonicalize2(obj);
|
|
6992
|
-
const bytes = new TextEncoder().encode(JSON.stringify(canonical));
|
|
6993
|
-
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
6994
|
-
return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
6995
|
-
}
|
|
6996
|
-
async function signManifest(m) {
|
|
6997
|
-
const hash = await hashJson(m);
|
|
6998
|
-
return { ...m, contentHash: hash, algo: "sha256-content" };
|
|
6999
|
-
}
|
|
7000
|
-
async function verifyManifest(m) {
|
|
7001
|
-
const { contentHash, algo: _algo, ...rest } = m;
|
|
7002
|
-
void _algo;
|
|
7003
|
-
const resigned = await signManifest(rest);
|
|
7004
|
-
return resigned.contentHash === contentHash;
|
|
7005
|
-
}
|
|
7006
|
-
async function evaluateHypothesis(manifest, observed) {
|
|
7007
|
-
if (!await verifyManifest(manifest)) {
|
|
7008
|
-
throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
|
|
7009
|
-
}
|
|
7010
|
-
const reasons = [];
|
|
7011
|
-
const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
|
|
7012
|
-
if (!directionOk) reasons.push("wrong_direction");
|
|
7013
|
-
if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
|
|
7014
|
-
if (observed.pValue >= manifest.alpha) reasons.push("not_significant");
|
|
7015
|
-
if (observed.n < manifest.preRegisteredN) reasons.push("undersampled");
|
|
7016
|
-
return {
|
|
7017
|
-
manifest,
|
|
7018
|
-
observedN: observed.n,
|
|
7019
|
-
observedEffect: observed.effect,
|
|
7020
|
-
observedPValue: observed.pValue,
|
|
7021
|
-
confirmed: reasons.length === 0,
|
|
7022
|
-
rejectionReasons: reasons
|
|
7023
|
-
};
|
|
7024
|
-
}
|
|
7025
|
-
|
|
7026
7001
|
// src/self-play.ts
|
|
7027
7002
|
async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
7028
7003
|
if (targets.length < 2) throw new Error("runSelfPlay: at least 2 targets required (need a difference to measure)");
|
|
@@ -10481,6 +10456,7 @@ export {
|
|
|
10481
10456
|
FileSystemExperimentStore,
|
|
10482
10457
|
FileSystemFeedbackTrajectoryStore,
|
|
10483
10458
|
FileSystemOutcomeStore,
|
|
10459
|
+
FileSystemRawProviderSink,
|
|
10484
10460
|
FileSystemTraceStore,
|
|
10485
10461
|
HeldOutGate,
|
|
10486
10462
|
HoldoutAuditor,
|
|
@@ -10489,6 +10465,7 @@ export {
|
|
|
10489
10465
|
InMemoryExperimentStore,
|
|
10490
10466
|
InMemoryFeedbackTrajectoryStore,
|
|
10491
10467
|
InMemoryOutcomeStore,
|
|
10468
|
+
InMemoryRawProviderSink,
|
|
10492
10469
|
InMemoryTraceStore,
|
|
10493
10470
|
InMemoryTrialCache,
|
|
10494
10471
|
InMemoryWorkspaceInspector,
|
|
@@ -10497,12 +10474,14 @@ export {
|
|
|
10497
10474
|
LineageRecorder,
|
|
10498
10475
|
LlmCallError,
|
|
10499
10476
|
LlmClient,
|
|
10477
|
+
LlmRouteAssertionError,
|
|
10500
10478
|
LockedJsonlAppender,
|
|
10501
10479
|
MODEL_PRICING,
|
|
10502
10480
|
MetricsCollector,
|
|
10503
10481
|
MultiLayerVerifier,
|
|
10504
10482
|
MutationTelemetry,
|
|
10505
10483
|
Mutex,
|
|
10484
|
+
NoopRawProviderSink,
|
|
10506
10485
|
NoopResearcher,
|
|
10507
10486
|
OTEL_AGENT_EVAL_SCOPE,
|
|
10508
10487
|
OtlpFileTraceStore,
|
|
@@ -10512,7 +10491,9 @@ export {
|
|
|
10512
10491
|
ProjectRegistry,
|
|
10513
10492
|
PromptRegistry,
|
|
10514
10493
|
REDACTION_VERSION,
|
|
10494
|
+
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
10515
10495
|
RunCritic,
|
|
10496
|
+
RunIntegrityError,
|
|
10516
10497
|
RunRecordValidationError,
|
|
10517
10498
|
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
10518
10499
|
SandboxHarness,
|
|
@@ -10539,7 +10520,9 @@ export {
|
|
|
10539
10520
|
analyzeSeries,
|
|
10540
10521
|
analyzeTraces,
|
|
10541
10522
|
argHash,
|
|
10523
|
+
assertLlmRoute,
|
|
10542
10524
|
assertReleaseConfidence,
|
|
10525
|
+
assertRunCaptured,
|
|
10543
10526
|
assignFeedbackSplit,
|
|
10544
10527
|
attributeCounterfactuals,
|
|
10545
10528
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
@@ -10563,7 +10546,7 @@ export {
|
|
|
10563
10546
|
callLlm,
|
|
10564
10547
|
callLlmJson,
|
|
10565
10548
|
canaryLeakView,
|
|
10566
|
-
|
|
10549
|
+
canonicalize,
|
|
10567
10550
|
causalAttribution,
|
|
10568
10551
|
checkBehavioralCanary,
|
|
10569
10552
|
checkCanaries,
|
|
@@ -10606,6 +10589,7 @@ export {
|
|
|
10606
10589
|
decideReferenceReplayRunPromotion,
|
|
10607
10590
|
defaultJudges,
|
|
10608
10591
|
defaultMultiShotObjectives,
|
|
10592
|
+
defaultProviderRedactor,
|
|
10609
10593
|
defaultReferenceReplayMatcher,
|
|
10610
10594
|
defaultTraceInsightPanel,
|
|
10611
10595
|
deployGateLayer,
|
|
@@ -10720,6 +10704,7 @@ export {
|
|
|
10720
10704
|
probeLlm,
|
|
10721
10705
|
promptBisect,
|
|
10722
10706
|
proposeSynthesisTargets,
|
|
10707
|
+
providerFromBaseUrl,
|
|
10723
10708
|
pytestTestParser,
|
|
10724
10709
|
redTeamDataset,
|
|
10725
10710
|
redTeamReport,
|
|
@@ -10742,6 +10727,7 @@ export {
|
|
|
10742
10727
|
replayScorerOverCorpus,
|
|
10743
10728
|
replayTraceThroughJudge,
|
|
10744
10729
|
requiredSampleSize,
|
|
10730
|
+
researchReport,
|
|
10745
10731
|
resetLockedAppendersForTesting,
|
|
10746
10732
|
resumeBuilderSession,
|
|
10747
10733
|
roundTripRunRecord,
|
|
@@ -10799,6 +10785,7 @@ export {
|
|
|
10799
10785
|
summaryTable,
|
|
10800
10786
|
testJudge,
|
|
10801
10787
|
textInSnapshot,
|
|
10788
|
+
throwIfRunIncomplete,
|
|
10802
10789
|
toLangfuseEnvelope,
|
|
10803
10790
|
toNdjson,
|
|
10804
10791
|
toPrometheusText,
|
|
@@ -10810,6 +10797,7 @@ export {
|
|
|
10810
10797
|
toolSuccessRubric,
|
|
10811
10798
|
toolWasteView,
|
|
10812
10799
|
traceAnalystFunctionGroup,
|
|
10800
|
+
traceAnalystOnRunComplete,
|
|
10813
10801
|
trialTraceFromMultiShotTrial,
|
|
10814
10802
|
typoMutator,
|
|
10815
10803
|
urlContains,
|