@tangle-network/agent-eval 0.29.1 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  2. package/dist/benchmarks/index.d.ts +3 -3
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/builder-eval/index.js +2 -2
  5. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  6. package/dist/{chunk-RUI6SIHY.js → chunk-75ZREHD7.js} +4 -4
  7. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  8. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  9. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  10. package/dist/chunk-HIO4UIS5.js.map +1 -0
  11. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  12. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  13. package/dist/chunk-QYJT52YW.js.map +1 -0
  14. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  15. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  16. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  17. package/dist/{chunk-NLMNWKVM.js → chunk-WSI4K3WB.js} +2 -2
  18. package/dist/{chunk-PALJO75S.js → chunk-XEL6UP7C.js} +2 -2
  19. package/dist/{chunk-SZSBQUIJ.js → chunk-Y2CPBYKH.js} +3 -3
  20. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  21. package/dist/cli.js +3 -3
  22. package/dist/{control-rJhEDdpy.d.ts → control-BFpqHFV2.d.ts} +5 -5
  23. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  24. package/dist/control.d.ts +8 -8
  25. package/dist/control.js +3 -3
  26. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  27. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  28. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  29. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  30. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  31. package/dist/governance/index.d.ts +4 -4
  32. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  33. package/dist/{index--fVrWDiR.d.ts → index-TVjRYWRm.d.ts} +1 -1
  34. package/dist/index.d.ts +254 -38
  35. package/dist/index.js +378 -26
  36. package/dist/index.js.map +1 -1
  37. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  38. package/dist/knowledge/index.d.ts +3 -3
  39. package/dist/meta-eval/index.d.ts +4 -4
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +11 -11
  42. package/dist/optimization.js +8 -8
  43. package/dist/pipelines/index.d.ts +6 -6
  44. package/dist/pipelines/index.js +3 -3
  45. package/dist/prm/index.d.ts +4 -4
  46. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  47. package/dist/{release-report-PWhGlpfO.d.ts → release-report-C8r4Vben.d.ts} +3 -3
  48. package/dist/reporting.d.ts +8 -8
  49. package/dist/reporting.js +4 -4
  50. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BmgJ_901.d.ts} +6 -6
  51. package/dist/rl.d.ts +10 -10
  52. package/dist/rl.js +6 -6
  53. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  54. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-Bm-CbN46.d.ts} +1 -1
  55. package/dist/{run-record-CqzahIbx.d.ts → run-record-nYf9x2hU.d.ts} +1 -1
  56. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  57. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-dir7A-eQ.d.ts} +2 -2
  58. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  59. package/dist/traces.d.ts +533 -10
  60. package/dist/traces.js +14 -300
  61. package/dist/traces.js.map +1 -1
  62. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  63. package/dist/wire/index.d.ts +6 -6
  64. package/dist/wire/index.js +3 -3
  65. package/package.json +1 -1
  66. package/dist/chunk-NG236HPC.js.map +0 -1
  67. package/dist/chunk-UW4NOOZI.js.map +0 -1
  68. package/dist/replay-BX5Fm8en.d.ts +0 -529
  69. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  70. /package/dist/{chunk-RUI6SIHY.js.map → chunk-75ZREHD7.js.map} +0 -0
  71. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  72. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  73. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  74. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  75. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  76. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  77. /package/dist/{chunk-NLMNWKVM.js.map → chunk-WSI4K3WB.js.map} +0 -0
  78. /package/dist/{chunk-PALJO75S.js.map → chunk-XEL6UP7C.js.map} +0 -0
  79. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-Y2CPBYKH.js.map} +0 -0
  80. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import {
11
11
  failureClusterView,
12
12
  iqr,
13
13
  welchsTTest
14
- } from "./chunk-K33INZHH.js";
14
+ } from "./chunk-GVQT44CS.js";
15
15
  import {
16
16
  exportTrainingData,
17
17
  toNdjson
@@ -28,7 +28,7 @@ import {
28
28
  pytestTestParser,
29
29
  runTestGradedScenario,
30
30
  vitestTestParser
31
- } from "./chunk-QHF6EQKK.js";
31
+ } from "./chunk-YTMXBHFM.js";
32
32
  import {
33
33
  classifyEuAiRisk,
34
34
  euAiActReport,
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-PALJO75S.js";
57
+ } from "./chunk-XEL6UP7C.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -96,14 +96,14 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-SZSBQUIJ.js";
99
+ } from "./chunk-Y2CPBYKH.js";
100
100
  import {
101
101
  RunRecordValidationError,
102
102
  isRunRecord,
103
103
  parseRunRecordSafe,
104
104
  roundTripRunRecord,
105
105
  validateRunRecord
106
- } from "./chunk-NLMNWKVM.js";
106
+ } from "./chunk-WSI4K3WB.js";
107
107
  import {
108
108
  assertReleaseConfidence,
109
109
  bootstrapCi,
@@ -111,10 +111,10 @@ import {
111
111
  judgeReplayGate,
112
112
  releaseTraceEvidenceFromMultiShotTrials,
113
113
  renderReleaseReport
114
- } from "./chunk-DBIGN5MJ.js";
114
+ } from "./chunk-WGXZAQLR.js";
115
115
  import {
116
116
  runEvalCampaign
117
- } from "./chunk-RUI6SIHY.js";
117
+ } from "./chunk-75ZREHD7.js";
118
118
  import {
119
119
  LlmCallError,
120
120
  LlmClient,
@@ -124,7 +124,7 @@ import {
124
124
  callLlmJson,
125
125
  probeLlm,
126
126
  stripFencedJson
127
- } from "./chunk-4S4BM3QQ.js";
127
+ } from "./chunk-M6RZ5LJN.js";
128
128
  import {
129
129
  evaluateInterimReleaseConfidence,
130
130
  pairedEvalueSequence
@@ -141,7 +141,7 @@ import {
141
141
  requiredSampleSize,
142
142
  researchReport,
143
143
  summaryTable
144
- } from "./chunk-5AKPEK5L.js";
144
+ } from "./chunk-CXJOVDJR.js";
145
145
  import {
146
146
  calibrateJudge,
147
147
  calibrateJudgeContinuous,
@@ -160,24 +160,43 @@ import {
160
160
  verbosityBias,
161
161
  weightedMean,
162
162
  wilcoxonSignedRank
163
- } from "./chunk-R5UQJNKC.js";
163
+ } from "./chunk-4L3WJXQJ.js";
164
164
  import {
165
165
  DEFAULT_REDACTION_RULES,
166
+ DEFAULT_TRACE_ANALYST_BUDGETS,
166
167
  FileSystemTraceStore,
167
168
  InMemoryTraceStore,
168
169
  OTEL_AGENT_EVAL_SCOPE,
170
+ OtlpFileTraceStore,
169
171
  REDACTION_VERSION,
170
172
  ReplayCache,
171
173
  ReplayCacheMissError,
174
+ SpanNotFoundError,
175
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
176
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
177
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
178
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
172
179
  TraceFileMissingError,
180
+ TraceNotFoundError,
173
181
  analyzeTraces,
174
182
  buildTraceAnalystTools,
183
+ buildTraceInsightContext,
184
+ buildTraceInsightPrompt,
175
185
  createReplayFetch,
186
+ defaultTraceInsightPanel,
187
+ describeTraceInsightScope,
188
+ domainEvidencePattern,
176
189
  exportRunAsOtlp,
190
+ inferDomainKeywords,
177
191
  iterateRawCalls,
192
+ planTraceInsightQuestions,
178
193
  redactString,
179
- redactValue
180
- } from "./chunk-UW4NOOZI.js";
194
+ redactValue,
195
+ scoreTraceInsightReadiness,
196
+ tokenizeDomainWords,
197
+ traceAnalystFunctionGroup,
198
+ traceAnalystOnRunComplete
199
+ } from "./chunk-HIO4UIS5.js";
181
200
  import {
182
201
  aggregateLlm,
183
202
  argHash,
@@ -201,7 +220,7 @@ import {
201
220
  RunIntegrityError,
202
221
  assertRunCaptured,
203
222
  throwIfRunIncomplete
204
- } from "./chunk-KTGTIOFD.js";
223
+ } from "./chunk-UBPIXOC4.js";
205
224
  import {
206
225
  FileSystemRawProviderSink,
207
226
  InMemoryRawProviderSink,
@@ -229,7 +248,7 @@ import {
229
248
  ReplayError,
230
249
  ValidationError,
231
250
  VerificationError
232
- } from "./chunk-NG236HPC.js";
251
+ } from "./chunk-QYJT52YW.js";
233
252
  import "./chunk-PZ5AY32C.js";
234
253
 
235
254
  // src/run-score.ts
@@ -968,17 +987,194 @@ function resolveModel(req, defaultModel) {
968
987
  }
969
988
 
970
989
  // src/analyst/finding-signature.ts
990
+ import { z as z2 } from "zod";
991
+
992
+ // src/analyst/finding-subject.ts
971
993
  import { z } from "zod";
994
/**
 * Every `kind` discriminator that `parseFindingSubject` can produce, in
 * grammar order: knowledge loci, runtime surfaces, stale signals, and the
 * bare cluster label.
 */
var FINDING_SUBJECT_KINDS = [
  // Knowledge loci share the `knowledge.` prefix.
  ...["wiki", "claim", "raw", "stale"].map((locus) => `knowledge.${locus}`),
  // Runtime mutable surfaces.
  "system-prompt",
  "tool-doc",
  "new-tool",
  "rag",
  "memory",
  "scaffolding",
  "output-schema",
  // Stale signals.
  "websearch.outdated",
  "prior-run-summary",
  // Bare kebab-case cluster label.
  "cluster"
];
1010
/**
 * Parse a finding-subject string into its structured `{ kind, ... }` form.
 *
 * The input is trimmed and then matched, in order, against:
 *   - knowledge loci (`agent-knowledge:wiki|claim|raw|stale:`),
 *   - runtime surfaces (`system-prompt:`, `tool-doc:`, `new-tool:`, `rag:`,
 *     `memory:`, `scaffolding:`, `output-schema:`),
 *   - stale signals (`websearch:outdated:`, `prior-run-summary:`),
 *   - a bare lowercase kebab-case cluster label of at most 80 characters.
 *
 * @param {unknown} raw - Candidate subject. Anything that is not a string
 *   (including `null` / `undefined`) yields `null` rather than throwing.
 * @returns {object | null} The parsed subject, or `null` when the input is
 *   empty or does not match any of the patterns above.
 */
function parseFindingSubject(raw) {
  // Previously only null / undefined were guarded, so a number or object
  // reached `.trim()` and threw; every non-string is simply "not a subject".
  if (typeof raw !== "string") return null;
  const trimmed = raw.trim();
  if (trimmed.length === 0) return null;

  // Free-form payloads: whatever follows the prefix, trimmed, must be non-empty.
  const freeForm = (pattern) => {
    const found = trimmed.match(pattern);
    if (!found) return null;
    const payload = found[1].trim();
    return payload.length > 0 ? payload : null;
  };

  // --- Knowledge loci ---
  const wiki = trimmed.match(
    /^agent-knowledge:wiki:([a-z0-9][a-z0-9-]*)(?:#([a-z0-9][a-z0-9-]*))?$/
  );
  if (wiki) {
    // The heading key is only present when a `#heading` suffix was given.
    return wiki[2]
      ? { kind: "knowledge.wiki", slug: wiki[1], heading: wiki[2] }
      : { kind: "knowledge.wiki", slug: wiki[1] };
  }
  const claimTopic = freeForm(/^agent-knowledge:claim:(.+)$/);
  if (claimTopic !== null) return { kind: "knowledge.claim", topic: claimTopic };
  const rawSourceId = freeForm(/^agent-knowledge:raw:(.+)$/);
  if (rawSourceId !== null) return { kind: "knowledge.raw", sourceId: rawSourceId };
  const stale = trimmed.match(/^agent-knowledge:stale:([a-z0-9][a-z0-9-]*)$/);
  if (stale) return { kind: "knowledge.stale", slug: stale[1] };

  // --- Runtime mutable surfaces ---
  const section = freeForm(/^system-prompt:(.+)$/);
  if (section !== null) return { kind: "system-prompt", section };
  // The `tool:aspect` form is tried before the bare `tool` form.
  const toolWithAspect = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*):(.+)$/);
  if (toolWithAspect && toolWithAspect[2].trim().length > 0) {
    return { kind: "tool-doc", tool: toolWithAspect[1], aspect: toolWithAspect[2].trim() };
  }
  const toolOnly = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*)$/);
  if (toolOnly) return { kind: "tool-doc", tool: toolOnly[1] };
  const newTool = trimmed.match(/^new-tool:([a-z0-9][a-z0-9_-]*)$/);
  if (newTool) return { kind: "new-tool", name: newTool[1] };
  const rag = trimmed.match(/^rag:([a-z0-9][a-z0-9_-]*):(.+)$/);
  if (rag && rag[2].trim().length > 0) {
    return { kind: "rag", corpus: rag[1], docId: rag[2].trim() };
  }
  const memoryKey = freeForm(/^memory:(.+)$/);
  if (memoryKey !== null) return { kind: "memory", key: memoryKey };
  const concern = freeForm(/^scaffolding:(.+)$/);
  if (concern !== null) return { kind: "scaffolding", concern };
  const field = freeForm(/^output-schema:(.+)$/);
  if (field !== null) return { kind: "output-schema", field };

  // --- Stale signals ---
  const outdatedTopic = freeForm(/^websearch:outdated:(.+)$/);
  if (outdatedTopic !== null) return { kind: "websearch.outdated", topic: outdatedTopic };
  const priorRunTopic = freeForm(/^prior-run-summary:(.+)$/);
  if (priorRunTopic !== null) return { kind: "prior-run-summary", topic: priorRunTopic };

  // --- Bare cluster label (no colon, at most 80 characters) ---
  if (/^[a-z0-9][a-z0-9-]*$/.test(trimmed) && trimmed.length <= 80) {
    return { kind: "cluster", label: trimmed };
  }
  return null;
}
1056
/**
 * Serialize a parsed finding subject back into its string form.
 *
 * @param {object} s - A subject record carrying a `kind` plus that kind's
 *   payload fields (e.g. `slug`, `topic`, `tool`).
 * @returns {string | undefined} The subject string; `undefined` when `kind`
 *   is not one of the handled variants.
 */
function renderFindingSubject(s) {
  const kind = s.kind;

  // Variants with an optional suffix.
  if (kind === "knowledge.wiki") {
    return s.heading ? `agent-knowledge:wiki:${s.slug}#${s.heading}` : `agent-knowledge:wiki:${s.slug}`;
  }
  if (kind === "tool-doc") {
    return s.aspect ? `tool-doc:${s.tool}:${s.aspect}` : `tool-doc:${s.tool}`;
  }

  // Knowledge loci.
  if (kind === "knowledge.claim") return `agent-knowledge:claim:${s.topic}`;
  if (kind === "knowledge.raw") return `agent-knowledge:raw:${s.sourceId}`;
  if (kind === "knowledge.stale") return `agent-knowledge:stale:${s.slug}`;

  // Runtime surfaces.
  if (kind === "system-prompt") return `system-prompt:${s.section}`;
  if (kind === "new-tool") return `new-tool:${s.name}`;
  if (kind === "rag") return `rag:${s.corpus}:${s.docId}`;
  if (kind === "memory") return `memory:${s.key}`;
  if (kind === "scaffolding") return `scaffolding:${s.concern}`;
  if (kind === "output-schema") return `output-schema:${s.field}`;

  // Stale signals.
  if (kind === "websearch.outdated") return `websearch:outdated:${s.topic}`;
  if (kind === "prior-run-summary") return `prior-run-summary:${s.topic}`;

  // A cluster label is rendered bare, without any prefix.
  if (kind === "cluster") return s.label;

  return undefined;
}
1088
/**
 * Prompt fragment describing the finding-subject grammar. It is assembled
 * from a header line, four titled sections each followed by a blank line,
 * and a closing character-set note, all joined with newlines.
 */
var FINDING_SUBJECT_GRAMMAR_PROMPT = (() => {
  const header =
    "Subjects MUST match this grammar \u2014 anything else is rejected at parse time and your work is wasted:";
  const footer =
    "Slugs / tool ids: [a-z0-9-]+ (lowercase kebab). Topics / keys / sections: free-form, trimmed.";

  // Each entry is [section title, grammar lines].
  const sections = [
    [
      " Knowledge loci (write to the agent-knowledge base):",
      [
        " agent-knowledge:wiki:<slug>[#<heading>] create / update a wiki page",
        " agent-knowledge:claim:<topic> draft a claim / relation triple",
        " agent-knowledge:raw:<source-id> lift a raw source into a curated page",
        " agent-knowledge:stale:<slug> mark a page superseded"
      ]
    ],
    [
      " Runtime mutable surfaces (write to prompts / tools / scaffolding):",
      [
        " system-prompt:<section> add / replace a system-prompt section",
        " tool-doc:<tool>[:<aspect>] rewrite a tool description",
        " new-tool:<name> propose a new tool surface",
        " rag:<corpus>:<doc-id> ingest / correct a RAG document",
        " memory:<key> invalidate / set a memory entry",
        " scaffolding:<concern> change a precondition / retry / verifier",
        " output-schema:<field> constrain the agent output shape"
      ]
    ],
    [
      " Stale signals (knowledge-poisoning only):",
      [
        " websearch:outdated:<topic> stale web result",
        " prior-run-summary:<topic> stale prior-run summary"
      ]
    ],
    [
      " Cluster label (failure-mode only):",
      [' <kebab-case-label> short cluster id, e.g. "tool-call-loop"']
    ]
  ];

  // Every section is followed by one blank separator line.
  const body = sections.flatMap(([title, entries]) => [title, ...entries, ""]);
  return [header, "", ...body, footer].join("\n");
})();
1115
/**
 * Subject variants accepted for each analyst kind id. A finding whose parsed
 * subject kind is not listed for the emitting analyst kind is rejected by the
 * analyst loop.
 */
var KIND_EXPECTED_SUBJECTS = (() => {
  const knowledgeGapSubjects = [
    "knowledge.wiki",
    "knowledge.claim",
    "knowledge.raw",
    "knowledge.stale",
    "tool-doc",
    "system-prompt",
    "memory",
    "websearch.outdated",
    "prior-run-summary"
  ];
  // Poisoning accepts the same loci as knowledge-gap, minus the stale marker.
  const knowledgePoisoningSubjects = knowledgeGapSubjects.filter(
    (subjectKind) => subjectKind !== "knowledge.stale"
  );
  const improvementSubjects = [
    "system-prompt",
    "tool-doc",
    "new-tool",
    "rag",
    "memory",
    "scaffolding",
    "output-schema",
    "knowledge.wiki",
    "knowledge.claim"
  ];
  return {
    "failure-mode": ["cluster"],
    "knowledge-gap": knowledgeGapSubjects,
    "knowledge-poisoning": knowledgePoisoningSubjects,
    improvement: improvementSubjects
  };
})();
1150
/**
 * Zod string schema that accepts only strings `parseFindingSubject` can
 * parse; anything else fails with a grammar-mismatch message.
 */
var FindingSubjectStringSchema = z.string().refine(
  function matchesFindingSubjectGrammar(candidate) {
    return parseFindingSubject(candidate) !== null;
  },
  { message: "subject does not match the finding-subject grammar" }
);
1153
+
1154
+ // src/analyst/finding-signature.ts
972
1155
  var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
973
- var RawAnalystFindingSchema = z.object({
974
- severity: z.enum(ANALYST_SEVERITIES),
975
- claim: z.string().min(1).max(2e3),
976
- subject: z.string().max(400).optional(),
977
- evidence_uri: z.string().min(1).max(2e3),
978
- evidence_excerpt: z.string().max(2e3).optional(),
979
- confidence: z.number().min(0).max(1),
980
- rationale: z.string().max(4e3).optional(),
981
- recommended_action: z.string().max(2e3).optional()
1156
/**
 * Strict Zod schema for one raw analyst finding as emitted by the model.
 * Unknown keys are rejected (`.strict()`).
 */
var RawAnalystFindingSchema = (() => {
  // Optional free text capped at `maxLength` characters.
  const optionalText = (maxLength) => z2.string().max(maxLength).optional();

  /*
   * Subject locus the finding is about. It is checked at parse time against
   * the finding-subject grammar, so a malformed subject fails validation
   * loudly instead of being silently skipped by downstream adapters.
   *
   * It stays optional because purely descriptive findings (no actionable
   * locus) are legitimate; they just don't route through the
   * KnowledgeAdapter / ImprovementAdapter.
   */
  const subject = z2
    .string()
    .max(400)
    .refine((candidate) => parseFindingSubject(candidate) !== null, {
      message: "subject does not match the finding-subject grammar"
    })
    .optional();

  return z2
    .object({
      severity: z2.enum(ANALYST_SEVERITIES),
      claim: z2.string().min(1).max(2000),
      subject,
      evidence_uri: z2.string().min(1).max(2000),
      evidence_excerpt: optionalText(2000),
      confidence: z2.number().min(0).max(1),
      rationale: optionalText(4000),
      recommended_action: optionalText(2000)
    })
    .strict();
})();
983
1179
  var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
984
1180
  - severity: one of "critical" | "high" | "medium" | "low" | "info"
@@ -1212,18 +1408,42 @@ function createTraceAnalystKind(spec, opts) {
1212
1408
  tags: ctx.tags
1213
1409
  });
1214
1410
  const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
1411
+ const expectedSubjects = KIND_EXPECTED_SUBJECTS[spec.id];
1215
1412
  const out = [];
1216
1413
  const rawRows = Array.isArray(result.findings) ? result.findings : [];
1414
+ let rejectedWrongKind = 0;
1217
1415
  for (const row of rawRows) {
1218
1416
  const parsed = parseRawFinding(row, ctx.log);
1219
1417
  if (!parsed) continue;
1418
+ if (expectedSubjects && parsed.subject !== void 0) {
1419
+ const parsedSubject = parseFindingSubject(parsed.subject);
1420
+ if (parsedSubject === null) {
1421
+ ctx.log?.("finding rejected: subject failed to parse", {
1422
+ kind: spec.id,
1423
+ subject: parsed.subject
1424
+ });
1425
+ rejectedWrongKind += 1;
1426
+ continue;
1427
+ }
1428
+ if (!expectedSubjects.includes(parsedSubject.kind)) {
1429
+ ctx.log?.("finding rejected: subject variant not allowed for this kind", {
1430
+ kind: spec.id,
1431
+ subject_kind: parsedSubject.kind,
1432
+ subject: parsed.subject,
1433
+ allowed: expectedSubjects
1434
+ });
1435
+ rejectedWrongKind += 1;
1436
+ continue;
1437
+ }
1438
+ }
1220
1439
  const postProcessed = spec.postProcess?.(parsed, ctx) ?? parsed;
1221
1440
  if (!postProcessed) continue;
1222
1441
  out.push(toAnalystFinding(spec, postProcessed));
1223
1442
  }
1224
1443
  ctx.log?.(`analyst.kind ${spec.id} done`, {
1225
1444
  emitted: rawRows.length,
1226
- accepted: out.length
1445
+ accepted: out.length,
1446
+ rejected_wrong_subject: rejectedWrongKind
1227
1447
  });
1228
1448
  return out;
1229
1449
  }
@@ -3036,6 +3256,107 @@ function suggestionForManifest(input) {
3036
3256
  return "No action required.";
3037
3257
  }
3038
3258
 
3259
+ // src/integrity/backend-integrity.ts
3260
/**
 * Error thrown by `assertRealBackend` when the run records show a stub or
 * (when disallowed) partially failed backend. The integrity summary that
 * triggered the failure is exposed as `report`.
 */
var BackendIntegrityError = class extends AgentEvalError {
  /** Backend-integrity summary attached to this failure. */
  report;

  /**
   * @param {string} message - Human-readable failure description.
   * @param {object} report - Summary produced by `summarizeBackendIntegrity`.
   */
  constructor(message, report) {
    // NOTE(review): "backend_integrity" is presumably the error code slot of
    // AgentEvalError — confirm against that class.
    super("backend_integrity", message);
    this.report = report;
  }
};
3267
/**
 * Tell whether a run record shows no token activity at all.
 *
 * @param {object} rec - Run record with `tokenUsage.input` / `tokenUsage.output`.
 * @returns {boolean} `true` when both input and output token counts are exactly 0.
 */
function isStubRecord(rec) {
  const { input, output } = rec.tokenUsage;
  return input === 0 && output === 0;
}
3270
/**
 * Tell whether a run record produced output tokens but recorded no cost.
 *
 * @param {object} rec - Run record with `tokenUsage.output` and `costUsd`.
 * @returns {boolean} `true` when output tokens are positive and `costUsd` is exactly 0.
 */
function isUncostedRecord(rec) {
  const producedOutput = rec.tokenUsage.output > 0;
  return producedOutput && rec.costUsd === 0;
}
3273
/**
 * Aggregate run records into a backend-integrity report.
 *
 * Token and cost totals are summed across all records. Each record is
 * classified as stub (zero tokens) or real, and uncosted records are counted.
 * The verdict is "stub" when there are no records or every record is a stub,
 * "real" when none are stubs, and "mixed" otherwise.
 *
 * @param {Array<object>} records - Run records with `tokenUsage` and `costUsd`.
 * @returns {object} Counts, totals, `verdict`, and a human-readable `diagnosis`.
 */
function summarizeBackendIntegrity(records) {
  const tally = {
    totalRecords: records.length,
    stubRecords: 0,
    realRecords: 0,
    uncostedRecords: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCostUsd: 0
  };

  for (const record of records) {
    tally.totalInputTokens += record.tokenUsage.input;
    tally.totalOutputTokens += record.tokenUsage.output;
    tally.totalCostUsd += record.costUsd;
    if (isStubRecord(record)) {
      tally.stubRecords += 1;
    } else {
      tally.realRecords += 1;
    }
    if (isUncostedRecord(record)) {
      tally.uncostedRecords += 1;
    }
  }

  let verdict;
  if (tally.totalRecords === 0 || tally.stubRecords === tally.totalRecords) {
    // An empty run is treated the same as an all-stub run.
    verdict = "stub";
  } else if (tally.stubRecords === 0) {
    verdict = "real";
  } else {
    verdict = "mixed";
  }

  const summary = { ...tally, verdict };
  return { ...summary, diagnosis: buildDiagnosis(summary) };
}
3312
/**
 * Build the human-readable diagnosis line for a backend-integrity summary.
 *
 * Cases are checked in this order: no records, "stub" verdict, "mixed"
 * verdict, real activity with uncosted records, and plain real activity.
 *
 * @param {object} r - Summary fields: record counts, token/cost totals, and `verdict`.
 * @returns {string} A single-line diagnosis.
 */
function buildDiagnosis(r) {
  // Whole-number percentage of `count` relative to all records.
  const percentOfRecords = (count) => (count / r.totalRecords * 100).toFixed(0);

  if (r.totalRecords === 0) {
    return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
  }

  if (r.verdict === "stub") {
    const parts = [
      `all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
      "common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
      "auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
      "or boot the cli-bridge / sandbox before invoking the eval."
    ];
    return parts.join(" ");
  }

  if (r.verdict === "mixed") {
    const stubShare = percentOfRecords(r.stubRecords);
    const parts = [
      `${r.stubRecords}/${r.totalRecords} records (${stubShare}%) have zero token usage \u2014 the backend partially failed.`,
      "common causes: rate-limit cascade (429s after the first N personas);",
      "transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
    ];
    return parts.join(" ");
  }

  const activity = `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens`;

  if (r.uncostedRecords > 0) {
    const uncostedShare = percentOfRecords(r.uncostedRecords);
    const parts = [
      `${activity}).`,
      `${r.uncostedRecords} (${uncostedShare}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
      "propagation from the runtime stream into RunRecord)."
    ];
    return parts.join(" ");
  }

  return `${activity}, $${r.totalCostUsd.toFixed(4)}).`;
}
3342
/**
 * Summarize the records and fail loudly when the backend was not real.
 *
 * @param {Array<object>} records - Run records to inspect.
 * @param {{ allowMixed?: boolean }} [opts] - `allowMixed` (default `true`)
 *   controls whether a "mixed" verdict is tolerated.
 * @returns {object} The backend-integrity report when the check passes.
 * @throws {BackendIntegrityError} On a "stub" verdict, or on a "mixed"
 *   verdict when `allowMixed` is false.
 */
function assertRealBackend(records, opts = {}) {
  const report = summarizeBackendIntegrity(records);
  const allowMixed = opts.allowMixed ?? true;

  // Pick the rejection reason, if any; a stub verdict is always rejected.
  let rejection = null;
  if (report.verdict === "stub") {
    rejection = "ran against a stub or unconfigured backend";
  } else if (report.verdict === "mixed" && !allowMixed) {
    rejection = "partial backend failure rejected";
  }

  if (rejection !== null) {
    throw new BackendIntegrityError(
      `backend-integrity: ${rejection} \u2014 ${report.diagnosis}`,
      report
    );
  }
  return report;
}
3359
+
3039
3360
  // src/judges.ts
3040
3361
  function createDomainExpertJudge(domain) {
3041
3362
  return async (tc, { scenario, turns }) => {
@@ -9174,8 +9495,8 @@ function chiSquareCritical(df, alpha) {
9174
9495
  if (TABLE[df]) return TABLE[df][idx];
9175
9496
  if (df > 30) {
9176
9497
  const zMap = { 0: 1.282, 1: 1.645, 2: 1.96, 3: 2.326 };
9177
- const z2 = zMap[idx] ?? 1.96;
9178
- const term = 1 - 2 / (9 * df) + z2 * Math.sqrt(2 / (9 * df));
9498
+ const z3 = zMap[idx] ?? 1.96;
9499
+ const term = 1 - 2 / (9 * df) + z3 * Math.sqrt(2 / (9 * df));
9179
9500
  return df * term ** 3;
9180
9501
  }
9181
9502
  const keys = Object.keys(TABLE).map((k) => Number(k)).sort((a, b) => a - b);
@@ -10098,6 +10419,7 @@ export {
10098
10419
  AnalystRegistry,
10099
10420
  AxGepaSteeringOptimizer,
10100
10421
  BENCHMARK_SPLIT_SEED,
10422
+ BackendIntegrityError,
10101
10423
  BenchmarkRunner,
10102
10424
  BudgetBreachError,
10103
10425
  BudgetGuard,
@@ -10119,6 +10441,7 @@ export {
10119
10441
  DEFAULT_RED_TEAM_CORPUS,
10120
10442
  DEFAULT_RUN_SCORE_WEIGHTS,
10121
10443
  DEFAULT_SEVERITY_WEIGHTS,
10444
+ DEFAULT_TRACE_ANALYST_BUDGETS,
10122
10445
  DEFAULT_TRACE_ANALYST_KINDS,
10123
10446
  Dataset,
10124
10447
  DockerSandboxDriver,
@@ -10127,10 +10450,13 @@ export {
10127
10450
  ExperimentTracker,
10128
10451
  FAILURE_CLASSES,
10129
10452
  FAILURE_MODE_KIND_SPEC,
10453
+ FINDING_SUBJECT_GRAMMAR_PROMPT,
10454
+ FINDING_SUBJECT_KINDS,
10130
10455
  FileSystemExperimentStore,
10131
10456
  FileSystemFeedbackTrajectoryStore,
10132
10457
  FileSystemRawProviderSink,
10133
10458
  FileSystemTraceStore,
10459
+ FindingSubjectStringSchema,
10134
10460
  FindingsStore,
10135
10461
  HeldOutGate,
10136
10462
  HoldoutAuditor,
@@ -10146,6 +10472,7 @@ export {
10146
10472
  JsonlTrialCache,
10147
10473
  JudgeError,
10148
10474
  JudgeRunner,
10475
+ KIND_EXPECTED_SUBJECTS,
10149
10476
  KNOWLEDGE_GAP_KIND_SPEC,
10150
10477
  KNOWLEDGE_POISONING_KIND_SPEC,
10151
10478
  LineageRecorder,
@@ -10162,6 +10489,7 @@ export {
10162
10489
  NoopResearcher,
10163
10490
  NotFoundError,
10164
10491
  OTEL_AGENT_EVAL_SCOPE,
10492
+ OtlpFileTraceStore,
10165
10493
  PairwiseSteeringOptimizer,
10166
10494
  ProductClient,
10167
10495
  PromptRegistry,
@@ -10178,10 +10506,17 @@ export {
10178
10506
  SEMANTIC_CONCEPT_JUDGE_VERSION,
10179
10507
  SandboxHarness,
10180
10508
  ScenarioRegistry,
10509
+ SpanNotFoundError,
10181
10510
  SubprocessSandboxDriver,
10511
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
10512
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
10513
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
10514
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
10182
10515
  TRACE_SCHEMA_VERSION,
10183
10516
  TokenCounter,
10184
10517
  TraceEmitter,
10518
+ TraceFileMissingError,
10519
+ TraceNotFoundError,
10185
10520
  TrialTelemetry,
10186
10521
  UNIVERSAL_FINDERS,
10187
10522
  ValidationError,
@@ -10194,8 +10529,10 @@ export {
10194
10529
  allCriticalPassed,
10195
10530
  analyzeAntiSlop,
10196
10531
  analyzeSeries,
10532
+ analyzeTraces,
10197
10533
  argHash,
10198
10534
  assertLlmRoute,
10535
+ assertRealBackend,
10199
10536
  assertReleaseConfidence,
10200
10537
  assertRunCaptured,
10201
10538
  assignFeedbackSplit,
@@ -10210,6 +10547,9 @@ export {
10210
10547
  bootstrapCi,
10211
10548
  buildReflectionPrompt,
10212
10549
  buildReviewerPrompt,
10550
+ buildTraceAnalystTools,
10551
+ buildTraceInsightContext,
10552
+ buildTraceInsightPrompt,
10213
10553
  buildTraceToolsForGroup,
10214
10554
  buildTrajectory,
10215
10555
  byteLengthRange,
@@ -10274,10 +10614,13 @@ export {
10274
10614
  defaultMultiShotObjectives,
10275
10615
  defaultProviderRedactor,
10276
10616
  defaultReferenceReplayMatcher,
10617
+ defaultTraceInsightPanel,
10277
10618
  deployGateLayer,
10619
+ describeTraceInsightScope,
10278
10620
  diffFindings,
10279
10621
  discoverPersonas,
10280
10622
  distillPlaybook,
10623
+ domainEvidencePattern,
10281
10624
  dominates,
10282
10625
  estimateCost,
10283
10626
  estimateTokens,
@@ -10321,6 +10664,7 @@ export {
10321
10664
  httpGithubClient,
10322
10665
  inMemoryReferenceReplayStore,
10323
10666
  inMemoryReviewStore,
10667
+ inferDomainKeywords,
10324
10668
  integrationAsi,
10325
10669
  integrationGateEvals,
10326
10670
  integrationInvokeFailedPayload,
@@ -10371,12 +10715,14 @@ export {
10371
10715
  paretoFrontier,
10372
10716
  paretoFrontierWithCrowding,
10373
10717
  parseFeedbackTrajectoriesJsonl,
10718
+ parseFindingSubject,
10374
10719
  parseRawFinding,
10375
10720
  parseReflectionResponse,
10376
10721
  parseRunRecordSafe,
10377
10722
  partialCredit,
10378
10723
  passOrthogonality,
10379
10724
  pixelDeltaRatio,
10725
+ planTraceInsightQuestions,
10380
10726
  politenessPrefixMutator,
10381
10727
  positionalBias,
10382
10728
  printDriverSummary,
@@ -10395,6 +10741,7 @@ export {
10395
10741
  regexMatch,
10396
10742
  regexMatches,
10397
10743
  releaseTraceEvidenceFromMultiShotTrials,
10744
+ renderFindingSubject,
10398
10745
  renderMarkdown,
10399
10746
  renderMarkdownReport,
10400
10747
  renderPlaybookMarkdown,
@@ -10444,6 +10791,7 @@ export {
10444
10791
  scoreKnowledgeReadiness,
10445
10792
  scoreRedTeamOutput,
10446
10793
  scoreReferenceReplay,
10794
+ scoreTraceInsightReadiness,
10447
10795
  securityJudge,
10448
10796
  selectHarnessVariant,
10449
10797
  selfPreference,
@@ -10457,6 +10805,7 @@ export {
10457
10805
  stripFencedJson,
10458
10806
  subjectiveEval,
10459
10807
  summarize,
10808
+ summarizeBackendIntegrity,
10460
10809
  summarizeHarnessResults,
10461
10810
  summarizePreferenceMemory,
10462
10811
  summaryTable,
@@ -10465,8 +10814,11 @@ export {
10465
10814
  throwIfRunIncomplete,
10466
10815
  toLangfuseEnvelope,
10467
10816
  toPrometheusText,
10817
+ tokenizeDomainWords,
10468
10818
  toolNamesForRun,
10469
10819
  toolSpans,
10820
+ traceAnalystFunctionGroup,
10821
+ traceAnalystOnRunComplete,
10470
10822
  trialTraceFromMultiShotTrial,
10471
10823
  typoMutator,
10472
10824
  urlContains,