@tangle-network/agent-eval 0.30.0 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  2. package/dist/benchmarks/index.d.ts +3 -3
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/builder-eval/index.js +2 -2
  5. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  6. package/dist/{chunk-RUI6SIHY.js → chunk-75ZREHD7.js} +4 -4
  7. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  8. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  9. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  10. package/dist/chunk-HIO4UIS5.js.map +1 -0
  11. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  12. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  13. package/dist/chunk-QYJT52YW.js.map +1 -0
  14. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  15. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  16. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  17. package/dist/{chunk-NLMNWKVM.js → chunk-WSI4K3WB.js} +2 -2
  18. package/dist/{chunk-PALJO75S.js → chunk-XEL6UP7C.js} +2 -2
  19. package/dist/{chunk-SZSBQUIJ.js → chunk-Y2CPBYKH.js} +3 -3
  20. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  21. package/dist/cli.js +3 -3
  22. package/dist/{control-rJhEDdpy.d.ts → control-BFpqHFV2.d.ts} +5 -5
  23. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  24. package/dist/control.d.ts +8 -8
  25. package/dist/control.js +3 -3
  26. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  27. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  28. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  29. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  30. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  31. package/dist/governance/index.d.ts +4 -4
  32. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  33. package/dist/{index--fVrWDiR.d.ts → index-TVjRYWRm.d.ts} +1 -1
  34. package/dist/index.d.ts +108 -38
  35. package/dist/index.js +159 -14
  36. package/dist/index.js.map +1 -1
  37. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  38. package/dist/knowledge/index.d.ts +3 -3
  39. package/dist/meta-eval/index.d.ts +4 -4
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +11 -11
  42. package/dist/optimization.js +8 -8
  43. package/dist/pipelines/index.d.ts +6 -6
  44. package/dist/pipelines/index.js +3 -3
  45. package/dist/prm/index.d.ts +4 -4
  46. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  47. package/dist/{release-report-PWhGlpfO.d.ts → release-report-C8r4Vben.d.ts} +3 -3
  48. package/dist/reporting.d.ts +8 -8
  49. package/dist/reporting.js +4 -4
  50. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BmgJ_901.d.ts} +6 -6
  51. package/dist/rl.d.ts +10 -10
  52. package/dist/rl.js +6 -6
  53. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  54. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-Bm-CbN46.d.ts} +1 -1
  55. package/dist/{run-record-CqzahIbx.d.ts → run-record-nYf9x2hU.d.ts} +1 -1
  56. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  57. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-dir7A-eQ.d.ts} +2 -2
  58. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  59. package/dist/traces.d.ts +533 -10
  60. package/dist/traces.js +14 -300
  61. package/dist/traces.js.map +1 -1
  62. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  63. package/dist/wire/index.d.ts +6 -6
  64. package/dist/wire/index.js +3 -3
  65. package/package.json +12 -21
  66. package/dist/chunk-NG236HPC.js.map +0 -1
  67. package/dist/chunk-UW4NOOZI.js.map +0 -1
  68. package/dist/replay-BX5Fm8en.d.ts +0 -529
  69. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  70. /package/dist/{chunk-RUI6SIHY.js.map → chunk-75ZREHD7.js.map} +0 -0
  71. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  72. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  73. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  74. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  75. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  76. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  77. /package/dist/{chunk-NLMNWKVM.js.map → chunk-WSI4K3WB.js.map} +0 -0
  78. /package/dist/{chunk-PALJO75S.js.map → chunk-XEL6UP7C.js.map} +0 -0
  79. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-Y2CPBYKH.js.map} +0 -0
  80. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
@@ -1,4 +1,4 @@
1
- import { T as TraceStore } from './store-BP5be6s7.js';
1
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
2
 
3
3
  /**
4
4
  * Tool-use metrics — derived purely from trace data.
@@ -1,3 +1,3 @@
1
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index--fVrWDiR.js';
2
- import '../run-record-CqzahIbx.js';
3
- import '../errors-BZ9sTdz7.js';
1
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-TVjRYWRm.js';
2
+ import '../run-record-nYf9x2hU.js';
3
+ import '../errors-mje_cKOs.js';
@@ -1,6 +1,6 @@
1
- import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-BJ54PDan.js';
2
- import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
3
- import { T as TraceStore, R as Run } from '../store-BP5be6s7.js';
1
+ import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-B2kWEdh9.js';
2
+ import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
+ import { T as TraceStore, R as Run } from '../store-Db2Bv8Cf.js';
4
4
 
5
5
  /**
6
6
  * BuilderSession — ties a builder-of-builders workflow together.
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  SandboxHarness,
3
3
  runTestGradedScenario
4
- } from "../chunk-QHF6EQKK.js";
4
+ } from "../chunk-YTMXBHFM.js";
5
5
  import {
6
6
  judgeSpans
7
7
  } from "../chunk-47X6LRCE.js";
@@ -9,7 +9,7 @@ import "../chunk-5BKGXME7.js";
9
9
  import {
10
10
  TraceEmitter
11
11
  } from "../chunk-TVVP3ZZQ.js";
12
- import "../chunk-NG236HPC.js";
12
+ import "../chunk-QYJT52YW.js";
13
13
  import "../chunk-PZ5AY32C.js";
14
14
 
15
15
  // src/builder-eval/builder-session.ts
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  ValidationError
3
- } from "./chunk-NG236HPC.js";
3
+ } from "./chunk-QYJT52YW.js";
4
4
 
5
5
  // src/judge-calibration.ts
6
6
  function calibrateJudge(golden, candidate) {
@@ -719,4 +719,4 @@ export {
719
719
  corpusInterRaterAgreement,
720
720
  corpusInterRaterAgreementFromJudgeScores
721
721
  };
722
- //# sourceMappingURL=chunk-R5UQJNKC.js.map
722
+ //# sourceMappingURL=chunk-4L3WJXQJ.js.map
@@ -1,13 +1,13 @@
1
1
  import {
2
2
  assertLlmRoute
3
- } from "./chunk-4S4BM3QQ.js";
3
+ } from "./chunk-M6RZ5LJN.js";
4
4
  import {
5
5
  researchReport
6
- } from "./chunk-5AKPEK5L.js";
6
+ } from "./chunk-CXJOVDJR.js";
7
7
  import {
8
8
  RunIntegrityError,
9
9
  assertRunCaptured
10
- } from "./chunk-KTGTIOFD.js";
10
+ } from "./chunk-UBPIXOC4.js";
11
11
  import {
12
12
  FileSystemRawProviderSink
13
13
  } from "./chunk-PC4UYEBM.js";
@@ -284,4 +284,4 @@ function defaultRunId(params) {
284
284
  export {
285
285
  runEvalCampaign
286
286
  };
287
- //# sourceMappingURL=chunk-RUI6SIHY.js.map
287
+ //# sourceMappingURL=chunk-75ZREHD7.js.map
@@ -2,7 +2,7 @@ import {
2
2
  cohensD,
3
3
  confidenceInterval,
4
4
  wilcoxonSignedRank
5
- } from "./chunk-R5UQJNKC.js";
5
+ } from "./chunk-4L3WJXQJ.js";
6
6
  import {
7
7
  canonicalize,
8
8
  hashJson
@@ -1047,4 +1047,4 @@ export {
1047
1047
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
1048
1048
  researchReport
1049
1049
  };
1050
- //# sourceMappingURL=chunk-5AKPEK5L.js.map
1050
+ //# sourceMappingURL=chunk-CXJOVDJR.js.map
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  cohensD
3
- } from "./chunk-R5UQJNKC.js";
3
+ } from "./chunk-4L3WJXQJ.js";
4
4
  import {
5
5
  argHash,
6
6
  groupBy,
@@ -615,4 +615,4 @@ export {
615
615
  iqr,
616
616
  welchsTTest
617
617
  };
618
- //# sourceMappingURL=chunk-K33INZHH.js.map
618
+ //# sourceMappingURL=chunk-GVQT44CS.js.map
@@ -5,7 +5,7 @@ import {
5
5
  import {
6
6
  NotFoundError,
7
7
  ReplayError
8
- } from "./chunk-NG236HPC.js";
8
+ } from "./chunk-QYJT52YW.js";
9
9
 
10
10
  // src/trace-analyst/prompts.ts
11
11
  var TRACE_ANALYST_ACTOR_DESCRIPTION = `You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the \`traces\` namespace.
@@ -986,6 +986,302 @@ function normalizeRecordArray(value) {
986
986
  );
987
987
  }
988
988
 
989
+ // src/trace-analyst/hook.ts
990
+ var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";
991
+ function traceAnalystOnRunComplete(opts) {
992
+ return async (ctx) => {
993
+ if (opts.shouldRun && !opts.shouldRun(ctx)) return;
994
+ const source = opts.analyze.source;
995
+ if (source === void 0) {
996
+ await ctx.store.appendEvent({
997
+ eventId: `analyst-skip-${ctx.runId}`,
998
+ runId: ctx.runId,
999
+ kind: "log",
1000
+ timestamp: Date.now(),
1001
+ payload: { source: "trace_analyst_hook", reason: "no source configured" }
1002
+ });
1003
+ return;
1004
+ }
1005
+ const result = await analyzeTraces({ question: opts.question ?? DEFAULT_QUESTION }, {
1006
+ ...opts.analyze,
1007
+ source
1008
+ });
1009
+ if (opts.save) await opts.save(result, ctx);
1010
+ if (opts.gateOn && !opts.gateOn(result, ctx)) {
1011
+ await ctx.store.appendEvent({
1012
+ eventId: `analyst-gate-${ctx.runId}`,
1013
+ runId: ctx.runId,
1014
+ kind: "log",
1015
+ timestamp: Date.now(),
1016
+ payload: {
1017
+ source: "trace_analyst_hook",
1018
+ reason: "analyst_gate_failed",
1019
+ findings: result.findings
1020
+ }
1021
+ });
1022
+ }
1023
+ };
1024
+ }
1025
+
1026
+ // src/trace-analyst/insights.ts
1027
+ var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
1028
+ "and",
1029
+ "advanced",
1030
+ "app",
1031
+ "build",
1032
+ "create",
1033
+ "easy",
1034
+ "expert",
1035
+ "extreme",
1036
+ "for",
1037
+ "from",
1038
+ "hard",
1039
+ "implementation",
1040
+ "integrate",
1041
+ "medium",
1042
+ "project",
1043
+ "task",
1044
+ "the",
1045
+ "this",
1046
+ "with",
1047
+ "workflow"
1048
+ ]);
1049
+ function tokenizeDomainWords(value) {
1050
+ return [...value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)].map((match) => match[0].toLowerCase()).filter((word) => !DOMAIN_STOP_WORDS.has(word));
1051
+ }
1052
+ function inferDomainKeywords(suite) {
1053
+ const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ""}`));
1054
+ const source = [
1055
+ suite.name,
1056
+ suite.collectionId ?? "",
1057
+ ...suite.tasks.flatMap((task) => [
1058
+ task.id,
1059
+ task.name,
1060
+ task.prompt ?? "",
1061
+ task.difficulty ?? "",
1062
+ ...task.tags ?? [],
1063
+ ...task.gaps ?? []
1064
+ ])
1065
+ ].join(" ");
1066
+ const counts = /* @__PURE__ */ new Map();
1067
+ for (const word of tokenizeDomainWords(source)) counts.set(word, (counts.get(word) ?? 0) + 1);
1068
+ return [...counts.entries()].filter(([word, count]) => count >= 2 || suiteWords.has(word)).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).map(([word]) => word).slice(0, 18);
1069
+ }
1070
+ function domainEvidencePattern(keywords) {
1071
+ const escaped = keywords.filter((keyword) => keyword.length >= 3).map((keyword) => keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
1072
+ return escaped.length > 0 ? new RegExp(`(?<![A-Za-z0-9])(?:${escaped.join("|")})(?![A-Za-z0-9])`, "i") : /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i;
1073
+ }
1074
+ function describeTraceInsightScope(suite) {
1075
+ const taskLabel = suite.tasks.length === 1 ? "1 implementation task" : `${suite.tasks.length} implementation tasks`;
1076
+ const tags = /* @__PURE__ */ new Map();
1077
+ for (const task of suite.tasks) {
1078
+ for (const tag of task.tags ?? []) tags.set(tag, (tags.get(tag) ?? 0) + 1);
1079
+ }
1080
+ const topTags = [...tags.entries()].sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).slice(0, 8).map(([tag]) => tag);
1081
+ if (topTags.length > 0) return `${taskLabel} across ${topTags.join(", ")}.`;
1082
+ const difficulties = [
1083
+ ...new Set(
1084
+ suite.tasks.map((task) => task.difficulty).filter((value) => Boolean(value))
1085
+ )
1086
+ ].join(", ");
1087
+ return `${taskLabel} across ${difficulties || "the selected benchmark scope"}.`;
1088
+ }
1089
+ function planTraceInsightQuestions(input) {
1090
+ const hasFailures = input.suite.tasks.some((task) => task.outcome && task.outcome !== "satisfied");
1091
+ const hasMultipleShots = input.suite.tasks.some(
1092
+ (task) => (task.gaps ?? []).some((gap) => /shot|review|retry|continue/i.test(gap))
1093
+ );
1094
+ const questions = [
1095
+ {
1096
+ id: "execution-path",
1097
+ question: "What did the worker actually do before the first meaningful implementation edit?",
1098
+ why: "Separates grounded execution from polished but shallow output."
1099
+ },
1100
+ {
1101
+ id: "research-grounding",
1102
+ question: "Did the worker inspect docs, source, examples, or package references before committing to an implementation path?",
1103
+ why: "Identifies whether failures came from weak retrieval, weak examples, or premature coding."
1104
+ },
1105
+ {
1106
+ id: "domain-proof",
1107
+ question: "Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?",
1108
+ why: "Keeps product-quality claims tied to concrete evidence."
1109
+ },
1110
+ {
1111
+ id: "root-cause",
1112
+ question: "For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?",
1113
+ why: "Turns trace observations into actionable ownership."
1114
+ },
1115
+ {
1116
+ id: "evidence-quality",
1117
+ question: "Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?",
1118
+ why: "Prevents unsupported customer-report conclusions."
1119
+ }
1120
+ ];
1121
+ if (hasMultipleShots) {
1122
+ questions.push({
1123
+ id: "reviewer-lift",
1124
+ question: "Where did reviewer feedback improve score, stall, or regress across shots?",
1125
+ why: "Shows whether the driver loop is learning or merely repeating work."
1126
+ });
1127
+ }
1128
+ if (hasFailures) {
1129
+ questions.push({
1130
+ id: "optimization-targets",
1131
+ question: "Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?",
1132
+ why: "Connects benchmark evidence to the optimization loop."
1133
+ });
1134
+ }
1135
+ return questions;
1136
+ }
1137
+ function buildTraceInsightContext(input) {
1138
+ return {
1139
+ suite: input.suite,
1140
+ scope: describeTraceInsightScope(input.suite),
1141
+ keywords: inferDomainKeywords(input.suite),
1142
+ questions: planTraceInsightQuestions(input),
1143
+ panel: defaultTraceInsightPanel(),
1144
+ findings: input.findings ?? [],
1145
+ agent: input.agent ?? null,
1146
+ totals: input.totals ?? null
1147
+ };
1148
+ }
1149
+ function scoreTraceInsightReadiness(context) {
1150
+ const failedTasks = context.suite.tasks.filter(
1151
+ (task) => task.outcome && task.outcome !== "satisfied"
1152
+ );
1153
+ const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds));
1154
+ const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id));
1155
+ const tasksWithGaps = context.suite.tasks.filter((task) => (task.gaps ?? []).length > 0);
1156
+ const gates = [
1157
+ {
1158
+ id: "domain-context",
1159
+ label: "Domain context inferred",
1160
+ passed: context.keywords.length > 0,
1161
+ severity: "high",
1162
+ detail: context.keywords.length > 0 ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(", ")}` : "No domain terms were inferred from suite, tasks, prompts, tags, or gaps."
1163
+ },
1164
+ {
1165
+ id: "panel-coverage",
1166
+ label: "Analyst panel planned",
1167
+ passed: context.panel.length >= 4 && context.questions.length >= 5,
1168
+ severity: "high",
1169
+ detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`
1170
+ },
1171
+ {
1172
+ id: "failure-coverage",
1173
+ label: "Failures mapped to findings",
1174
+ passed: failedTasks.length === 0 || failedTasksWithFindings.length / failedTasks.length >= 0.5,
1175
+ severity: "critical",
1176
+ detail: failedTasks.length === 0 ? "No failed tasks in suite." : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`
1177
+ },
1178
+ {
1179
+ id: "gap-evidence",
1180
+ label: "Task gaps captured",
1181
+ passed: failedTasks.length === 0 || tasksWithGaps.length / failedTasks.length >= 0.5,
1182
+ severity: "medium",
1183
+ detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`
1184
+ }
1185
+ ];
1186
+ const penalty = gates.reduce((sum, gate) => {
1187
+ if (gate.passed) return sum;
1188
+ if (gate.severity === "critical") return sum + 35;
1189
+ if (gate.severity === "high") return sum + 20;
1190
+ if (gate.severity === "medium") return sum + 10;
1191
+ return sum + 5;
1192
+ }, 0);
1193
+ const score = Math.max(0, Math.min(1, 1 - penalty / 100));
1194
+ return {
1195
+ score,
1196
+ grade: score >= 0.9 ? "external-ready" : score >= 0.7 ? "internal-review" : "raw-analysis",
1197
+ gates
1198
+ };
1199
+ }
1200
+ function defaultTraceInsightPanel() {
1201
+ return [
1202
+ {
1203
+ id: "trace-forensics",
1204
+ name: "Trace Forensics",
1205
+ responsibility: "Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason."
1206
+ },
1207
+ {
1208
+ id: "root-cause",
1209
+ name: "Root Cause",
1210
+ responsibility: "Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior."
1211
+ },
1212
+ {
1213
+ id: "optimization",
1214
+ name: "Optimization",
1215
+ responsibility: "Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next."
1216
+ },
1217
+ {
1218
+ id: "external-evidence",
1219
+ name: "External Evidence",
1220
+ responsibility: "Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence."
1221
+ }
1222
+ ];
1223
+ }
1224
+ function buildTraceInsightPrompt(input) {
1225
+ const context = buildTraceInsightContext(input);
1226
+ const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6;
1227
+ return `Analyze this benchmark run and produce evidence-backed trace intelligence.
1228
+
1229
+ Audience:
1230
+ - internal AI/product leadership
1231
+ - possible customer-facing report for ${input.suite.name}
1232
+
1233
+ Investigation plan:
1234
+ ${context.questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join("\n")}
1235
+
1236
+ Analyst panel:
1237
+ ${context.panel.map((role) => `- ${role.name}: ${role.responsibility}`).join("\n")}
1238
+
1239
+ If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.
1240
+
1241
+ Required output:
1242
+ 1. Executive verdict: what this run proves and does not prove.
1243
+ 2. The investigation questions you answered and the evidence used.
1244
+ 3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.
1245
+ 4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.
1246
+ 5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.
1247
+ 6. What is safe for an external report versus what must stay internal.
1248
+ 7. One rerun plan that would validate lift after optimization.
1249
+
1250
+ Budget:
1251
+ - Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.
1252
+ - Prefer traces named in the failure summary over broad exploration.
1253
+ - Do not do exhaustive trace sweeps.
1254
+ - Return the final report as soon as the taxonomy and examples are supported.
1255
+
1256
+ Run summary:
1257
+ ${JSON.stringify(
1258
+ {
1259
+ suite: input.suite.name,
1260
+ scope: context.scope,
1261
+ inferredKeywords: context.keywords,
1262
+ agent: context.agent,
1263
+ totals: context.totals,
1264
+ findings: context.findings.map((finding) => ({
1265
+ kind: finding.kind,
1266
+ severity: finding.severity,
1267
+ taskCount: finding.taskIds.length,
1268
+ proposedFixClass: finding.proposedFixClass
1269
+ })),
1270
+ failures: input.suite.tasks.filter((task) => task.outcome && task.outcome !== "satisfied").map((task) => ({
1271
+ task: task.id,
1272
+ difficulty: task.difficulty,
1273
+ outcome: task.outcome,
1274
+ score: task.score,
1275
+ gaps: task.gaps ?? []
1276
+ }))
1277
+ },
1278
+ null,
1279
+ 2
1280
+ )}
1281
+
1282
+ Use the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`;
1283
+ }
1284
+
989
1285
  // src/trace/store.ts
990
1286
  var InMemoryTraceStore = class {
991
1287
  runs = /* @__PURE__ */ new Map();
@@ -1545,6 +1841,16 @@ export {
1545
1841
  buildTraceAnalystTools,
1546
1842
  traceAnalystFunctionGroup,
1547
1843
  analyzeTraces,
1844
+ traceAnalystOnRunComplete,
1845
+ tokenizeDomainWords,
1846
+ inferDomainKeywords,
1847
+ domainEvidencePattern,
1848
+ describeTraceInsightScope,
1849
+ planTraceInsightQuestions,
1850
+ buildTraceInsightContext,
1851
+ scoreTraceInsightReadiness,
1852
+ defaultTraceInsightPanel,
1853
+ buildTraceInsightPrompt,
1548
1854
  InMemoryTraceStore,
1549
1855
  FileSystemTraceStore,
1550
1856
  OTEL_AGENT_EVAL_SCOPE,
@@ -1558,4 +1864,4 @@ export {
1558
1864
  createReplayFetch,
1559
1865
  iterateRawCalls
1560
1866
  };
1561
- //# sourceMappingURL=chunk-UW4NOOZI.js.map
1867
+ //# sourceMappingURL=chunk-HIO4UIS5.js.map