@tangle-network/agent-eval 0.29.1 → 0.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -3
- package/dist/builder-eval/index.d.ts +3 -3
- package/dist/builder-eval/index.js +2 -2
- package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
- package/dist/{chunk-RUI6SIHY.js → chunk-75ZREHD7.js} +4 -4
- package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
- package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
- package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
- package/dist/chunk-HIO4UIS5.js.map +1 -0
- package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
- package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
- package/dist/chunk-QYJT52YW.js.map +1 -0
- package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
- package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
- package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
- package/dist/{chunk-NLMNWKVM.js → chunk-WSI4K3WB.js} +2 -2
- package/dist/{chunk-PALJO75S.js → chunk-XEL6UP7C.js} +2 -2
- package/dist/{chunk-SZSBQUIJ.js → chunk-Y2CPBYKH.js} +3 -3
- package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
- package/dist/cli.js +3 -3
- package/dist/{control-rJhEDdpy.d.ts → control-BFpqHFV2.d.ts} +5 -5
- package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
- package/dist/control.d.ts +8 -8
- package/dist/control.js +3 -3
- package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
- package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
- package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
- package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
- package/dist/governance/index.d.ts +4 -4
- package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
- package/dist/{index--fVrWDiR.d.ts → index-TVjRYWRm.d.ts} +1 -1
- package/dist/index.d.ts +254 -38
- package/dist/index.js +378 -26
- package/dist/index.js.map +1 -1
- package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
- package/dist/knowledge/index.d.ts +3 -3
- package/dist/meta-eval/index.d.ts +4 -4
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -11
- package/dist/optimization.js +8 -8
- package/dist/pipelines/index.d.ts +6 -6
- package/dist/pipelines/index.js +3 -3
- package/dist/prm/index.d.ts +4 -4
- package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
- package/dist/{release-report-PWhGlpfO.d.ts → release-report-C8r4Vben.d.ts} +3 -3
- package/dist/reporting.d.ts +8 -8
- package/dist/reporting.js +4 -4
- package/dist/{researcher-ClDX3KZx.d.ts → researcher-BmgJ_901.d.ts} +6 -6
- package/dist/rl.d.ts +10 -10
- package/dist/rl.js +6 -6
- package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
- package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-Bm-CbN46.d.ts} +1 -1
- package/dist/{run-record-CqzahIbx.d.ts → run-record-nYf9x2hU.d.ts} +1 -1
- package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-dir7A-eQ.d.ts} +2 -2
- package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
- package/dist/traces.d.ts +533 -10
- package/dist/traces.js +14 -300
- package/dist/traces.js.map +1 -1
- package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
- package/dist/wire/index.d.ts +6 -6
- package/dist/wire/index.js +3 -3
- package/package.json +1 -1
- package/dist/chunk-NG236HPC.js.map +0 -1
- package/dist/chunk-UW4NOOZI.js.map +0 -1
- package/dist/replay-BX5Fm8en.d.ts +0 -529
- /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
- /package/dist/{chunk-RUI6SIHY.js.map → chunk-75ZREHD7.js.map} +0 -0
- /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
- /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
- /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
- /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
- /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
- /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
- /package/dist/{chunk-NLMNWKVM.js.map → chunk-WSI4K3WB.js.map} +0 -0
- /package/dist/{chunk-PALJO75S.js.map → chunk-XEL6UP7C.js.map} +0 -0
- /package/dist/{chunk-SZSBQUIJ.js.map → chunk-Y2CPBYKH.js.map} +0 -0
- /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
failureClusterView,
|
|
12
12
|
iqr,
|
|
13
13
|
welchsTTest
|
|
14
|
-
} from "./chunk-K33INZHH.js";
|
|
14
|
+
} from "./chunk-GVQT44CS.js";
|
|
15
15
|
import {
|
|
16
16
|
exportTrainingData,
|
|
17
17
|
toNdjson
|
|
@@ -28,7 +28,7 @@ import {
|
|
|
28
28
|
pytestTestParser,
|
|
29
29
|
runTestGradedScenario,
|
|
30
30
|
vitestTestParser
|
|
31
|
-
} from "./chunk-QHF6EQKK.js";
|
|
31
|
+
} from "./chunk-YTMXBHFM.js";
|
|
32
32
|
import {
|
|
33
33
|
classifyEuAiRisk,
|
|
34
34
|
euAiActReport,
|
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-PALJO75S.js";
|
|
57
|
+
} from "./chunk-XEL6UP7C.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -96,14 +96,14 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-SZSBQUIJ.js";
|
|
99
|
+
} from "./chunk-Y2CPBYKH.js";
|
|
100
100
|
import {
|
|
101
101
|
RunRecordValidationError,
|
|
102
102
|
isRunRecord,
|
|
103
103
|
parseRunRecordSafe,
|
|
104
104
|
roundTripRunRecord,
|
|
105
105
|
validateRunRecord
|
|
106
|
-
} from "./chunk-NLMNWKVM.js";
|
|
106
|
+
} from "./chunk-WSI4K3WB.js";
|
|
107
107
|
import {
|
|
108
108
|
assertReleaseConfidence,
|
|
109
109
|
bootstrapCi,
|
|
@@ -111,10 +111,10 @@ import {
|
|
|
111
111
|
judgeReplayGate,
|
|
112
112
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
113
113
|
renderReleaseReport
|
|
114
|
-
} from "./chunk-DBIGN5MJ.js";
|
|
114
|
+
} from "./chunk-WGXZAQLR.js";
|
|
115
115
|
import {
|
|
116
116
|
runEvalCampaign
|
|
117
|
-
} from "./chunk-RUI6SIHY.js";
|
|
117
|
+
} from "./chunk-75ZREHD7.js";
|
|
118
118
|
import {
|
|
119
119
|
LlmCallError,
|
|
120
120
|
LlmClient,
|
|
@@ -124,7 +124,7 @@ import {
|
|
|
124
124
|
callLlmJson,
|
|
125
125
|
probeLlm,
|
|
126
126
|
stripFencedJson
|
|
127
|
-
} from "./chunk-4S4BM3QQ.js";
|
|
127
|
+
} from "./chunk-M6RZ5LJN.js";
|
|
128
128
|
import {
|
|
129
129
|
evaluateInterimReleaseConfidence,
|
|
130
130
|
pairedEvalueSequence
|
|
@@ -141,7 +141,7 @@ import {
|
|
|
141
141
|
requiredSampleSize,
|
|
142
142
|
researchReport,
|
|
143
143
|
summaryTable
|
|
144
|
-
} from "./chunk-5AKPEK5L.js";
|
|
144
|
+
} from "./chunk-CXJOVDJR.js";
|
|
145
145
|
import {
|
|
146
146
|
calibrateJudge,
|
|
147
147
|
calibrateJudgeContinuous,
|
|
@@ -160,24 +160,43 @@ import {
|
|
|
160
160
|
verbosityBias,
|
|
161
161
|
weightedMean,
|
|
162
162
|
wilcoxonSignedRank
|
|
163
|
-
} from "./chunk-R5UQJNKC.js";
|
|
163
|
+
} from "./chunk-4L3WJXQJ.js";
|
|
164
164
|
import {
|
|
165
165
|
DEFAULT_REDACTION_RULES,
|
|
166
|
+
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
166
167
|
FileSystemTraceStore,
|
|
167
168
|
InMemoryTraceStore,
|
|
168
169
|
OTEL_AGENT_EVAL_SCOPE,
|
|
170
|
+
OtlpFileTraceStore,
|
|
169
171
|
REDACTION_VERSION,
|
|
170
172
|
ReplayCache,
|
|
171
173
|
ReplayCacheMissError,
|
|
174
|
+
SpanNotFoundError,
|
|
175
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
176
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
177
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
178
|
+
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
172
179
|
TraceFileMissingError,
|
|
180
|
+
TraceNotFoundError,
|
|
173
181
|
analyzeTraces,
|
|
174
182
|
buildTraceAnalystTools,
|
|
183
|
+
buildTraceInsightContext,
|
|
184
|
+
buildTraceInsightPrompt,
|
|
175
185
|
createReplayFetch,
|
|
186
|
+
defaultTraceInsightPanel,
|
|
187
|
+
describeTraceInsightScope,
|
|
188
|
+
domainEvidencePattern,
|
|
176
189
|
exportRunAsOtlp,
|
|
190
|
+
inferDomainKeywords,
|
|
177
191
|
iterateRawCalls,
|
|
192
|
+
planTraceInsightQuestions,
|
|
178
193
|
redactString,
|
|
179
|
-
redactValue
|
|
180
|
-
|
|
194
|
+
redactValue,
|
|
195
|
+
scoreTraceInsightReadiness,
|
|
196
|
+
tokenizeDomainWords,
|
|
197
|
+
traceAnalystFunctionGroup,
|
|
198
|
+
traceAnalystOnRunComplete
|
|
199
|
+
} from "./chunk-HIO4UIS5.js";
|
|
181
200
|
import {
|
|
182
201
|
aggregateLlm,
|
|
183
202
|
argHash,
|
|
@@ -201,7 +220,7 @@ import {
|
|
|
201
220
|
RunIntegrityError,
|
|
202
221
|
assertRunCaptured,
|
|
203
222
|
throwIfRunIncomplete
|
|
204
|
-
} from "./chunk-KTGTIOFD.js";
|
|
223
|
+
} from "./chunk-UBPIXOC4.js";
|
|
205
224
|
import {
|
|
206
225
|
FileSystemRawProviderSink,
|
|
207
226
|
InMemoryRawProviderSink,
|
|
@@ -229,7 +248,7 @@ import {
|
|
|
229
248
|
ReplayError,
|
|
230
249
|
ValidationError,
|
|
231
250
|
VerificationError
|
|
232
|
-
} from "./chunk-NG236HPC.js";
|
|
251
|
+
} from "./chunk-QYJT52YW.js";
|
|
233
252
|
import "./chunk-PZ5AY32C.js";
|
|
234
253
|
|
|
235
254
|
// src/run-score.ts
|
|
@@ -968,17 +987,194 @@ function resolveModel(req, defaultModel) {
|
|
|
968
987
|
}
|
|
969
988
|
|
|
970
989
|
// src/analyst/finding-signature.ts
|
|
990
|
+
import { z as z2 } from "zod";
|
|
991
|
+
|
|
992
|
+
// src/analyst/finding-subject.ts
|
|
971
993
|
import { z } from "zod";
|
|
994
|
+
var FINDING_SUBJECT_KINDS = [
|
|
995
|
+
"knowledge.wiki",
|
|
996
|
+
"knowledge.claim",
|
|
997
|
+
"knowledge.raw",
|
|
998
|
+
"knowledge.stale",
|
|
999
|
+
"system-prompt",
|
|
1000
|
+
"tool-doc",
|
|
1001
|
+
"new-tool",
|
|
1002
|
+
"rag",
|
|
1003
|
+
"memory",
|
|
1004
|
+
"scaffolding",
|
|
1005
|
+
"output-schema",
|
|
1006
|
+
"websearch.outdated",
|
|
1007
|
+
"prior-run-summary",
|
|
1008
|
+
"cluster"
|
|
1009
|
+
];
|
|
1010
|
+
function parseFindingSubject(raw) {
|
|
1011
|
+
if (raw === null || raw === void 0) return null;
|
|
1012
|
+
const trimmed = raw.trim();
|
|
1013
|
+
if (trimmed.length === 0) return null;
|
|
1014
|
+
const wiki = trimmed.match(
|
|
1015
|
+
/^agent-knowledge:wiki:([a-z0-9][a-z0-9-]*)(?:#([a-z0-9][a-z0-9-]*))?$/
|
|
1016
|
+
);
|
|
1017
|
+
if (wiki)
|
|
1018
|
+
return { kind: "knowledge.wiki", slug: wiki[1], ...wiki[2] ? { heading: wiki[2] } : {} };
|
|
1019
|
+
const claim = trimmed.match(/^agent-knowledge:claim:(.+)$/);
|
|
1020
|
+
if (claim && claim[1].trim().length > 0)
|
|
1021
|
+
return { kind: "knowledge.claim", topic: claim[1].trim() };
|
|
1022
|
+
const raw_ = trimmed.match(/^agent-knowledge:raw:(.+)$/);
|
|
1023
|
+
if (raw_ && raw_[1].trim().length > 0)
|
|
1024
|
+
return { kind: "knowledge.raw", sourceId: raw_[1].trim() };
|
|
1025
|
+
const stale = trimmed.match(/^agent-knowledge:stale:([a-z0-9][a-z0-9-]*)$/);
|
|
1026
|
+
if (stale) return { kind: "knowledge.stale", slug: stale[1] };
|
|
1027
|
+
const sp = trimmed.match(/^system-prompt:(.+)$/);
|
|
1028
|
+
if (sp && sp[1].trim().length > 0) return { kind: "system-prompt", section: sp[1].trim() };
|
|
1029
|
+
const tdAspect = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*):(.+)$/);
|
|
1030
|
+
if (tdAspect && tdAspect[2].trim().length > 0) {
|
|
1031
|
+
return { kind: "tool-doc", tool: tdAspect[1], aspect: tdAspect[2].trim() };
|
|
1032
|
+
}
|
|
1033
|
+
const td = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*)$/);
|
|
1034
|
+
if (td) return { kind: "tool-doc", tool: td[1] };
|
|
1035
|
+
const nt = trimmed.match(/^new-tool:([a-z0-9][a-z0-9_-]*)$/);
|
|
1036
|
+
if (nt) return { kind: "new-tool", name: nt[1] };
|
|
1037
|
+
const rag = trimmed.match(/^rag:([a-z0-9][a-z0-9_-]*):(.+)$/);
|
|
1038
|
+
if (rag && rag[2].trim().length > 0) {
|
|
1039
|
+
return { kind: "rag", corpus: rag[1], docId: rag[2].trim() };
|
|
1040
|
+
}
|
|
1041
|
+
const mem = trimmed.match(/^memory:(.+)$/);
|
|
1042
|
+
if (mem && mem[1].trim().length > 0) return { kind: "memory", key: mem[1].trim() };
|
|
1043
|
+
const sc = trimmed.match(/^scaffolding:(.+)$/);
|
|
1044
|
+
if (sc && sc[1].trim().length > 0) return { kind: "scaffolding", concern: sc[1].trim() };
|
|
1045
|
+
const os = trimmed.match(/^output-schema:(.+)$/);
|
|
1046
|
+
if (os && os[1].trim().length > 0) return { kind: "output-schema", field: os[1].trim() };
|
|
1047
|
+
const ws = trimmed.match(/^websearch:outdated:(.+)$/);
|
|
1048
|
+
if (ws && ws[1].trim().length > 0) return { kind: "websearch.outdated", topic: ws[1].trim() };
|
|
1049
|
+
const prs = trimmed.match(/^prior-run-summary:(.+)$/);
|
|
1050
|
+
if (prs && prs[1].trim().length > 0) return { kind: "prior-run-summary", topic: prs[1].trim() };
|
|
1051
|
+
if (/^[a-z0-9][a-z0-9-]*$/.test(trimmed) && trimmed.length <= 80) {
|
|
1052
|
+
return { kind: "cluster", label: trimmed };
|
|
1053
|
+
}
|
|
1054
|
+
return null;
|
|
1055
|
+
}
|
|
1056
|
+
function renderFindingSubject(s) {
|
|
1057
|
+
switch (s.kind) {
|
|
1058
|
+
case "knowledge.wiki":
|
|
1059
|
+
return s.heading ? `agent-knowledge:wiki:${s.slug}#${s.heading}` : `agent-knowledge:wiki:${s.slug}`;
|
|
1060
|
+
case "knowledge.claim":
|
|
1061
|
+
return `agent-knowledge:claim:${s.topic}`;
|
|
1062
|
+
case "knowledge.raw":
|
|
1063
|
+
return `agent-knowledge:raw:${s.sourceId}`;
|
|
1064
|
+
case "knowledge.stale":
|
|
1065
|
+
return `agent-knowledge:stale:${s.slug}`;
|
|
1066
|
+
case "system-prompt":
|
|
1067
|
+
return `system-prompt:${s.section}`;
|
|
1068
|
+
case "tool-doc":
|
|
1069
|
+
return s.aspect ? `tool-doc:${s.tool}:${s.aspect}` : `tool-doc:${s.tool}`;
|
|
1070
|
+
case "new-tool":
|
|
1071
|
+
return `new-tool:${s.name}`;
|
|
1072
|
+
case "rag":
|
|
1073
|
+
return `rag:${s.corpus}:${s.docId}`;
|
|
1074
|
+
case "memory":
|
|
1075
|
+
return `memory:${s.key}`;
|
|
1076
|
+
case "scaffolding":
|
|
1077
|
+
return `scaffolding:${s.concern}`;
|
|
1078
|
+
case "output-schema":
|
|
1079
|
+
return `output-schema:${s.field}`;
|
|
1080
|
+
case "websearch.outdated":
|
|
1081
|
+
return `websearch:outdated:${s.topic}`;
|
|
1082
|
+
case "prior-run-summary":
|
|
1083
|
+
return `prior-run-summary:${s.topic}`;
|
|
1084
|
+
case "cluster":
|
|
1085
|
+
return s.label;
|
|
1086
|
+
}
|
|
1087
|
+
}
|
|
1088
|
+
var FINDING_SUBJECT_GRAMMAR_PROMPT = [
|
|
1089
|
+
"Subjects MUST match this grammar \u2014 anything else is rejected at parse time and your work is wasted:",
|
|
1090
|
+
"",
|
|
1091
|
+
" Knowledge loci (write to the agent-knowledge base):",
|
|
1092
|
+
" agent-knowledge:wiki:<slug>[#<heading>] create / update a wiki page",
|
|
1093
|
+
" agent-knowledge:claim:<topic> draft a claim / relation triple",
|
|
1094
|
+
" agent-knowledge:raw:<source-id> lift a raw source into a curated page",
|
|
1095
|
+
" agent-knowledge:stale:<slug> mark a page superseded",
|
|
1096
|
+
"",
|
|
1097
|
+
" Runtime mutable surfaces (write to prompts / tools / scaffolding):",
|
|
1098
|
+
" system-prompt:<section> add / replace a system-prompt section",
|
|
1099
|
+
" tool-doc:<tool>[:<aspect>] rewrite a tool description",
|
|
1100
|
+
" new-tool:<name> propose a new tool surface",
|
|
1101
|
+
" rag:<corpus>:<doc-id> ingest / correct a RAG document",
|
|
1102
|
+
" memory:<key> invalidate / set a memory entry",
|
|
1103
|
+
" scaffolding:<concern> change a precondition / retry / verifier",
|
|
1104
|
+
" output-schema:<field> constrain the agent output shape",
|
|
1105
|
+
"",
|
|
1106
|
+
" Stale signals (knowledge-poisoning only):",
|
|
1107
|
+
" websearch:outdated:<topic> stale web result",
|
|
1108
|
+
" prior-run-summary:<topic> stale prior-run summary",
|
|
1109
|
+
"",
|
|
1110
|
+
" Cluster label (failure-mode only):",
|
|
1111
|
+
' <kebab-case-label> short cluster id, e.g. "tool-call-loop"',
|
|
1112
|
+
"",
|
|
1113
|
+
"Slugs / tool ids: [a-z0-9-]+ (lowercase kebab). Topics / keys / sections: free-form, trimmed."
|
|
1114
|
+
].join("\n");
|
|
1115
|
+
var KIND_EXPECTED_SUBJECTS = {
|
|
1116
|
+
"failure-mode": ["cluster"],
|
|
1117
|
+
"knowledge-gap": [
|
|
1118
|
+
"knowledge.wiki",
|
|
1119
|
+
"knowledge.claim",
|
|
1120
|
+
"knowledge.raw",
|
|
1121
|
+
"knowledge.stale",
|
|
1122
|
+
"tool-doc",
|
|
1123
|
+
"system-prompt",
|
|
1124
|
+
"memory",
|
|
1125
|
+
"websearch.outdated",
|
|
1126
|
+
"prior-run-summary"
|
|
1127
|
+
],
|
|
1128
|
+
"knowledge-poisoning": [
|
|
1129
|
+
"knowledge.wiki",
|
|
1130
|
+
"knowledge.claim",
|
|
1131
|
+
"knowledge.raw",
|
|
1132
|
+
"tool-doc",
|
|
1133
|
+
"system-prompt",
|
|
1134
|
+
"memory",
|
|
1135
|
+
"websearch.outdated",
|
|
1136
|
+
"prior-run-summary"
|
|
1137
|
+
],
|
|
1138
|
+
improvement: [
|
|
1139
|
+
"system-prompt",
|
|
1140
|
+
"tool-doc",
|
|
1141
|
+
"new-tool",
|
|
1142
|
+
"rag",
|
|
1143
|
+
"memory",
|
|
1144
|
+
"scaffolding",
|
|
1145
|
+
"output-schema",
|
|
1146
|
+
"knowledge.wiki",
|
|
1147
|
+
"knowledge.claim"
|
|
1148
|
+
]
|
|
1149
|
+
};
|
|
1150
|
+
var FindingSubjectStringSchema = z.string().refine((s) => parseFindingSubject(s) !== null, {
|
|
1151
|
+
message: "subject does not match the finding-subject grammar"
|
|
1152
|
+
});
|
|
1153
|
+
|
|
1154
|
+
// src/analyst/finding-signature.ts
|
|
972
1155
|
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
|
|
973
|
-
var RawAnalystFindingSchema =
|
|
974
|
-
severity:
|
|
975
|
-
claim:
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
1156
|
+
var RawAnalystFindingSchema = z2.object({
|
|
1157
|
+
severity: z2.enum(ANALYST_SEVERITIES),
|
|
1158
|
+
claim: z2.string().min(1).max(2e3),
|
|
1159
|
+
/**
|
|
1160
|
+
* Subject locus the finding is about. Validated at parse time
|
|
1161
|
+
* against the documented grammar (`finding-subject.ts`). Findings
|
|
1162
|
+
* with a malformed subject are rejected — they would have been
|
|
1163
|
+
* silently skipped by every downstream adapter, so failing loud at
|
|
1164
|
+
* parse time turns a hidden no-op into a kind-prompt audit signal.
|
|
1165
|
+
*
|
|
1166
|
+
* Optional because purely descriptive findings (no actionable
|
|
1167
|
+
* locus) are legitimate; they just don't route through the
|
|
1168
|
+
* KnowledgeAdapter / ImprovementAdapter.
|
|
1169
|
+
*/
|
|
1170
|
+
subject: z2.string().max(400).refine((s) => parseFindingSubject(s) !== null, {
|
|
1171
|
+
message: "subject does not match the finding-subject grammar"
|
|
1172
|
+
}).optional(),
|
|
1173
|
+
evidence_uri: z2.string().min(1).max(2e3),
|
|
1174
|
+
evidence_excerpt: z2.string().max(2e3).optional(),
|
|
1175
|
+
confidence: z2.number().min(0).max(1),
|
|
1176
|
+
rationale: z2.string().max(4e3).optional(),
|
|
1177
|
+
recommended_action: z2.string().max(2e3).optional()
|
|
982
1178
|
}).strict();
|
|
983
1179
|
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
|
|
984
1180
|
- severity: one of "critical" | "high" | "medium" | "low" | "info"
|
|
@@ -1212,18 +1408,42 @@ function createTraceAnalystKind(spec, opts) {
|
|
|
1212
1408
|
tags: ctx.tags
|
|
1213
1409
|
});
|
|
1214
1410
|
const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
|
|
1411
|
+
const expectedSubjects = KIND_EXPECTED_SUBJECTS[spec.id];
|
|
1215
1412
|
const out = [];
|
|
1216
1413
|
const rawRows = Array.isArray(result.findings) ? result.findings : [];
|
|
1414
|
+
let rejectedWrongKind = 0;
|
|
1217
1415
|
for (const row of rawRows) {
|
|
1218
1416
|
const parsed = parseRawFinding(row, ctx.log);
|
|
1219
1417
|
if (!parsed) continue;
|
|
1418
|
+
if (expectedSubjects && parsed.subject !== void 0) {
|
|
1419
|
+
const parsedSubject = parseFindingSubject(parsed.subject);
|
|
1420
|
+
if (parsedSubject === null) {
|
|
1421
|
+
ctx.log?.("finding rejected: subject failed to parse", {
|
|
1422
|
+
kind: spec.id,
|
|
1423
|
+
subject: parsed.subject
|
|
1424
|
+
});
|
|
1425
|
+
rejectedWrongKind += 1;
|
|
1426
|
+
continue;
|
|
1427
|
+
}
|
|
1428
|
+
if (!expectedSubjects.includes(parsedSubject.kind)) {
|
|
1429
|
+
ctx.log?.("finding rejected: subject variant not allowed for this kind", {
|
|
1430
|
+
kind: spec.id,
|
|
1431
|
+
subject_kind: parsedSubject.kind,
|
|
1432
|
+
subject: parsed.subject,
|
|
1433
|
+
allowed: expectedSubjects
|
|
1434
|
+
});
|
|
1435
|
+
rejectedWrongKind += 1;
|
|
1436
|
+
continue;
|
|
1437
|
+
}
|
|
1438
|
+
}
|
|
1220
1439
|
const postProcessed = spec.postProcess?.(parsed, ctx) ?? parsed;
|
|
1221
1440
|
if (!postProcessed) continue;
|
|
1222
1441
|
out.push(toAnalystFinding(spec, postProcessed));
|
|
1223
1442
|
}
|
|
1224
1443
|
ctx.log?.(`analyst.kind ${spec.id} done`, {
|
|
1225
1444
|
emitted: rawRows.length,
|
|
1226
|
-
accepted: out.length
|
|
1445
|
+
accepted: out.length,
|
|
1446
|
+
rejected_wrong_subject: rejectedWrongKind
|
|
1227
1447
|
});
|
|
1228
1448
|
return out;
|
|
1229
1449
|
}
|
|
@@ -3036,6 +3256,107 @@ function suggestionForManifest(input) {
|
|
|
3036
3256
|
return "No action required.";
|
|
3037
3257
|
}
|
|
3038
3258
|
|
|
3259
|
+
// src/integrity/backend-integrity.ts
|
|
3260
|
+
var BackendIntegrityError = class extends AgentEvalError {
|
|
3261
|
+
constructor(message, report) {
|
|
3262
|
+
super("backend_integrity", message);
|
|
3263
|
+
this.report = report;
|
|
3264
|
+
}
|
|
3265
|
+
report;
|
|
3266
|
+
};
|
|
3267
|
+
function isStubRecord(rec) {
|
|
3268
|
+
return rec.tokenUsage.input === 0 && rec.tokenUsage.output === 0;
|
|
3269
|
+
}
|
|
3270
|
+
function isUncostedRecord(rec) {
|
|
3271
|
+
return rec.tokenUsage.output > 0 && rec.costUsd === 0;
|
|
3272
|
+
}
|
|
3273
|
+
function summarizeBackendIntegrity(records) {
|
|
3274
|
+
const totalRecords = records.length;
|
|
3275
|
+
let stubRecords = 0;
|
|
3276
|
+
let realRecords = 0;
|
|
3277
|
+
let uncostedRecords = 0;
|
|
3278
|
+
let totalInputTokens = 0;
|
|
3279
|
+
let totalOutputTokens = 0;
|
|
3280
|
+
let totalCostUsd = 0;
|
|
3281
|
+
for (const rec of records) {
|
|
3282
|
+
totalInputTokens += rec.tokenUsage.input;
|
|
3283
|
+
totalOutputTokens += rec.tokenUsage.output;
|
|
3284
|
+
totalCostUsd += rec.costUsd;
|
|
3285
|
+
if (isStubRecord(rec)) stubRecords++;
|
|
3286
|
+
else realRecords++;
|
|
3287
|
+
if (isUncostedRecord(rec)) uncostedRecords++;
|
|
3288
|
+
}
|
|
3289
|
+
const verdict = totalRecords === 0 ? "stub" : stubRecords === totalRecords ? "stub" : stubRecords === 0 ? "real" : "mixed";
|
|
3290
|
+
const diagnosis = buildDiagnosis({
|
|
3291
|
+
totalRecords,
|
|
3292
|
+
stubRecords,
|
|
3293
|
+
realRecords,
|
|
3294
|
+
uncostedRecords,
|
|
3295
|
+
totalInputTokens,
|
|
3296
|
+
totalOutputTokens,
|
|
3297
|
+
totalCostUsd,
|
|
3298
|
+
verdict
|
|
3299
|
+
});
|
|
3300
|
+
return {
|
|
3301
|
+
totalRecords,
|
|
3302
|
+
stubRecords,
|
|
3303
|
+
realRecords,
|
|
3304
|
+
uncostedRecords,
|
|
3305
|
+
totalInputTokens,
|
|
3306
|
+
totalOutputTokens,
|
|
3307
|
+
totalCostUsd,
|
|
3308
|
+
verdict,
|
|
3309
|
+
diagnosis
|
|
3310
|
+
};
|
|
3311
|
+
}
|
|
3312
|
+
function buildDiagnosis(r) {
|
|
3313
|
+
if (r.totalRecords === 0) {
|
|
3314
|
+
return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
|
|
3315
|
+
}
|
|
3316
|
+
if (r.verdict === "stub") {
|
|
3317
|
+
return [
|
|
3318
|
+
`all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
|
|
3319
|
+
"common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
|
|
3320
|
+
"auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
|
|
3321
|
+
"or boot the cli-bridge / sandbox before invoking the eval."
|
|
3322
|
+
].join(" ");
|
|
3323
|
+
}
|
|
3324
|
+
if (r.verdict === "mixed") {
|
|
3325
|
+
const pct = (r.stubRecords / r.totalRecords * 100).toFixed(0);
|
|
3326
|
+
return [
|
|
3327
|
+
`${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage \u2014 the backend partially failed.`,
|
|
3328
|
+
"common causes: rate-limit cascade (429s after the first N personas);",
|
|
3329
|
+
"transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
|
|
3330
|
+
].join(" ");
|
|
3331
|
+
}
|
|
3332
|
+
if (r.uncostedRecords > 0) {
|
|
3333
|
+
const pct = (r.uncostedRecords / r.totalRecords * 100).toFixed(0);
|
|
3334
|
+
return [
|
|
3335
|
+
`${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
|
|
3336
|
+
`${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
|
|
3337
|
+
"propagation from the runtime stream into RunRecord)."
|
|
3338
|
+
].join(" ");
|
|
3339
|
+
}
|
|
3340
|
+
return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
|
|
3341
|
+
}
|
|
3342
|
+
function assertRealBackend(records, opts = {}) {
|
|
3343
|
+
const report = summarizeBackendIntegrity(records);
|
|
3344
|
+
const allowMixed = opts.allowMixed ?? true;
|
|
3345
|
+
if (report.verdict === "stub") {
|
|
3346
|
+
throw new BackendIntegrityError(
|
|
3347
|
+
`backend-integrity: ran against a stub or unconfigured backend \u2014 ${report.diagnosis}`,
|
|
3348
|
+
report
|
|
3349
|
+
);
|
|
3350
|
+
}
|
|
3351
|
+
if (!allowMixed && report.verdict === "mixed") {
|
|
3352
|
+
throw new BackendIntegrityError(
|
|
3353
|
+
`backend-integrity: partial backend failure rejected \u2014 ${report.diagnosis}`,
|
|
3354
|
+
report
|
|
3355
|
+
);
|
|
3356
|
+
}
|
|
3357
|
+
return report;
|
|
3358
|
+
}
|
|
3359
|
+
|
|
3039
3360
|
// src/judges.ts
|
|
3040
3361
|
function createDomainExpertJudge(domain) {
|
|
3041
3362
|
return async (tc, { scenario, turns }) => {
|
|
@@ -9174,8 +9495,8 @@ function chiSquareCritical(df, alpha) {
|
|
|
9174
9495
|
if (TABLE[df]) return TABLE[df][idx];
|
|
9175
9496
|
if (df > 30) {
|
|
9176
9497
|
const zMap = { 0: 1.282, 1: 1.645, 2: 1.96, 3: 2.326 };
|
|
9177
|
-
const
|
|
9178
|
-
const term = 1 - 2 / (9 * df) +
|
|
9498
|
+
const z3 = zMap[idx] ?? 1.96;
|
|
9499
|
+
const term = 1 - 2 / (9 * df) + z3 * Math.sqrt(2 / (9 * df));
|
|
9179
9500
|
return df * term ** 3;
|
|
9180
9501
|
}
|
|
9181
9502
|
const keys = Object.keys(TABLE).map((k) => Number(k)).sort((a, b) => a - b);
|
|
@@ -10098,6 +10419,7 @@ export {
|
|
|
10098
10419
|
AnalystRegistry,
|
|
10099
10420
|
AxGepaSteeringOptimizer,
|
|
10100
10421
|
BENCHMARK_SPLIT_SEED,
|
|
10422
|
+
BackendIntegrityError,
|
|
10101
10423
|
BenchmarkRunner,
|
|
10102
10424
|
BudgetBreachError,
|
|
10103
10425
|
BudgetGuard,
|
|
@@ -10119,6 +10441,7 @@ export {
|
|
|
10119
10441
|
DEFAULT_RED_TEAM_CORPUS,
|
|
10120
10442
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
10121
10443
|
DEFAULT_SEVERITY_WEIGHTS,
|
|
10444
|
+
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
10122
10445
|
DEFAULT_TRACE_ANALYST_KINDS,
|
|
10123
10446
|
Dataset,
|
|
10124
10447
|
DockerSandboxDriver,
|
|
@@ -10127,10 +10450,13 @@ export {
|
|
|
10127
10450
|
ExperimentTracker,
|
|
10128
10451
|
FAILURE_CLASSES,
|
|
10129
10452
|
FAILURE_MODE_KIND_SPEC,
|
|
10453
|
+
FINDING_SUBJECT_GRAMMAR_PROMPT,
|
|
10454
|
+
FINDING_SUBJECT_KINDS,
|
|
10130
10455
|
FileSystemExperimentStore,
|
|
10131
10456
|
FileSystemFeedbackTrajectoryStore,
|
|
10132
10457
|
FileSystemRawProviderSink,
|
|
10133
10458
|
FileSystemTraceStore,
|
|
10459
|
+
FindingSubjectStringSchema,
|
|
10134
10460
|
FindingsStore,
|
|
10135
10461
|
HeldOutGate,
|
|
10136
10462
|
HoldoutAuditor,
|
|
@@ -10146,6 +10472,7 @@ export {
|
|
|
10146
10472
|
JsonlTrialCache,
|
|
10147
10473
|
JudgeError,
|
|
10148
10474
|
JudgeRunner,
|
|
10475
|
+
KIND_EXPECTED_SUBJECTS,
|
|
10149
10476
|
KNOWLEDGE_GAP_KIND_SPEC,
|
|
10150
10477
|
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
10151
10478
|
LineageRecorder,
|
|
@@ -10162,6 +10489,7 @@ export {
|
|
|
10162
10489
|
NoopResearcher,
|
|
10163
10490
|
NotFoundError,
|
|
10164
10491
|
OTEL_AGENT_EVAL_SCOPE,
|
|
10492
|
+
OtlpFileTraceStore,
|
|
10165
10493
|
PairwiseSteeringOptimizer,
|
|
10166
10494
|
ProductClient,
|
|
10167
10495
|
PromptRegistry,
|
|
@@ -10178,10 +10506,17 @@ export {
|
|
|
10178
10506
|
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
10179
10507
|
SandboxHarness,
|
|
10180
10508
|
ScenarioRegistry,
|
|
10509
|
+
SpanNotFoundError,
|
|
10181
10510
|
SubprocessSandboxDriver,
|
|
10511
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
10512
|
+
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
10513
|
+
TRACE_ANALYST_SUBAGENT_DESCRIPTION,
|
|
10514
|
+
TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
|
|
10182
10515
|
TRACE_SCHEMA_VERSION,
|
|
10183
10516
|
TokenCounter,
|
|
10184
10517
|
TraceEmitter,
|
|
10518
|
+
TraceFileMissingError,
|
|
10519
|
+
TraceNotFoundError,
|
|
10185
10520
|
TrialTelemetry,
|
|
10186
10521
|
UNIVERSAL_FINDERS,
|
|
10187
10522
|
ValidationError,
|
|
@@ -10194,8 +10529,10 @@ export {
|
|
|
10194
10529
|
allCriticalPassed,
|
|
10195
10530
|
analyzeAntiSlop,
|
|
10196
10531
|
analyzeSeries,
|
|
10532
|
+
analyzeTraces,
|
|
10197
10533
|
argHash,
|
|
10198
10534
|
assertLlmRoute,
|
|
10535
|
+
assertRealBackend,
|
|
10199
10536
|
assertReleaseConfidence,
|
|
10200
10537
|
assertRunCaptured,
|
|
10201
10538
|
assignFeedbackSplit,
|
|
@@ -10210,6 +10547,9 @@ export {
|
|
|
10210
10547
|
bootstrapCi,
|
|
10211
10548
|
buildReflectionPrompt,
|
|
10212
10549
|
buildReviewerPrompt,
|
|
10550
|
+
buildTraceAnalystTools,
|
|
10551
|
+
buildTraceInsightContext,
|
|
10552
|
+
buildTraceInsightPrompt,
|
|
10213
10553
|
buildTraceToolsForGroup,
|
|
10214
10554
|
buildTrajectory,
|
|
10215
10555
|
byteLengthRange,
|
|
@@ -10274,10 +10614,13 @@ export {
|
|
|
10274
10614
|
defaultMultiShotObjectives,
|
|
10275
10615
|
defaultProviderRedactor,
|
|
10276
10616
|
defaultReferenceReplayMatcher,
|
|
10617
|
+
defaultTraceInsightPanel,
|
|
10277
10618
|
deployGateLayer,
|
|
10619
|
+
describeTraceInsightScope,
|
|
10278
10620
|
diffFindings,
|
|
10279
10621
|
discoverPersonas,
|
|
10280
10622
|
distillPlaybook,
|
|
10623
|
+
domainEvidencePattern,
|
|
10281
10624
|
dominates,
|
|
10282
10625
|
estimateCost,
|
|
10283
10626
|
estimateTokens,
|
|
@@ -10321,6 +10664,7 @@ export {
|
|
|
10321
10664
|
httpGithubClient,
|
|
10322
10665
|
inMemoryReferenceReplayStore,
|
|
10323
10666
|
inMemoryReviewStore,
|
|
10667
|
+
inferDomainKeywords,
|
|
10324
10668
|
integrationAsi,
|
|
10325
10669
|
integrationGateEvals,
|
|
10326
10670
|
integrationInvokeFailedPayload,
|
|
@@ -10371,12 +10715,14 @@ export {
|
|
|
10371
10715
|
paretoFrontier,
|
|
10372
10716
|
paretoFrontierWithCrowding,
|
|
10373
10717
|
parseFeedbackTrajectoriesJsonl,
|
|
10718
|
+
parseFindingSubject,
|
|
10374
10719
|
parseRawFinding,
|
|
10375
10720
|
parseReflectionResponse,
|
|
10376
10721
|
parseRunRecordSafe,
|
|
10377
10722
|
partialCredit,
|
|
10378
10723
|
passOrthogonality,
|
|
10379
10724
|
pixelDeltaRatio,
|
|
10725
|
+
planTraceInsightQuestions,
|
|
10380
10726
|
politenessPrefixMutator,
|
|
10381
10727
|
positionalBias,
|
|
10382
10728
|
printDriverSummary,
|
|
@@ -10395,6 +10741,7 @@ export {
|
|
|
10395
10741
|
regexMatch,
|
|
10396
10742
|
regexMatches,
|
|
10397
10743
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
10744
|
+
renderFindingSubject,
|
|
10398
10745
|
renderMarkdown,
|
|
10399
10746
|
renderMarkdownReport,
|
|
10400
10747
|
renderPlaybookMarkdown,
|
|
@@ -10444,6 +10791,7 @@ export {
|
|
|
10444
10791
|
scoreKnowledgeReadiness,
|
|
10445
10792
|
scoreRedTeamOutput,
|
|
10446
10793
|
scoreReferenceReplay,
|
|
10794
|
+
scoreTraceInsightReadiness,
|
|
10447
10795
|
securityJudge,
|
|
10448
10796
|
selectHarnessVariant,
|
|
10449
10797
|
selfPreference,
|
|
@@ -10457,6 +10805,7 @@ export {
|
|
|
10457
10805
|
stripFencedJson,
|
|
10458
10806
|
subjectiveEval,
|
|
10459
10807
|
summarize,
|
|
10808
|
+
summarizeBackendIntegrity,
|
|
10460
10809
|
summarizeHarnessResults,
|
|
10461
10810
|
summarizePreferenceMemory,
|
|
10462
10811
|
summaryTable,
|
|
@@ -10465,8 +10814,11 @@ export {
|
|
|
10465
10814
|
throwIfRunIncomplete,
|
|
10466
10815
|
toLangfuseEnvelope,
|
|
10467
10816
|
toPrometheusText,
|
|
10817
|
+
tokenizeDomainWords,
|
|
10468
10818
|
toolNamesForRun,
|
|
10469
10819
|
toolSpans,
|
|
10820
|
+
traceAnalystFunctionGroup,
|
|
10821
|
+
traceAnalystOnRunComplete,
|
|
10470
10822
|
trialTraceFromMultiShotTrial,
|
|
10471
10823
|
typoMutator,
|
|
10472
10824
|
urlContains,
|