@tangle-network/agent-eval 0.30.0 → 0.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/CHANGELOG.md +79 -0
  2. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  3. package/dist/benchmarks/index.d.ts +3 -3
  4. package/dist/builder-eval/index.d.ts +3 -3
  5. package/dist/builder-eval/index.js +2 -2
  6. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  7. package/dist/{chunk-SZSBQUIJ.js → chunk-B73G44OH.js} +3 -3
  8. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  9. package/dist/{chunk-RUI6SIHY.js → chunk-DTEJNZYK.js} +5 -4
  10. package/dist/chunk-DTEJNZYK.js.map +1 -0
  11. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  12. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  13. package/dist/chunk-HIO4UIS5.js.map +1 -0
  14. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  15. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  16. package/dist/chunk-QYJT52YW.js.map +1 -0
  17. package/dist/{chunk-PALJO75S.js → chunk-S4Y5VXMS.js} +2 -2
  18. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  19. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  20. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  21. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  22. package/dist/{chunk-NLMNWKVM.js → chunk-ZN2CMQIW.js} +54 -2
  23. package/dist/chunk-ZN2CMQIW.js.map +1 -0
  24. package/dist/cli.js +3 -3
  25. package/dist/{control-rJhEDdpy.d.ts → control-p2ns7elI.d.ts} +5 -5
  26. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  27. package/dist/control.d.ts +8 -8
  28. package/dist/control.js +3 -3
  29. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  30. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  31. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  32. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  33. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  34. package/dist/governance/index.d.ts +4 -4
  35. package/dist/{index--fVrWDiR.d.ts → index-BTqhGHJT.d.ts} +1 -1
  36. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  37. package/dist/index.d.ts +108 -38
  38. package/dist/index.js +159 -14
  39. package/dist/index.js.map +1 -1
  40. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  41. package/dist/knowledge/index.d.ts +3 -3
  42. package/dist/meta-eval/index.d.ts +4 -4
  43. package/dist/openapi.json +1 -1
  44. package/dist/optimization.d.ts +11 -11
  45. package/dist/optimization.js +8 -8
  46. package/dist/pipelines/index.d.ts +6 -6
  47. package/dist/pipelines/index.js +3 -3
  48. package/dist/prm/index.d.ts +4 -4
  49. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  50. package/dist/{release-report-PWhGlpfO.d.ts → release-report-DLWbBPtH.d.ts} +3 -3
  51. package/dist/reporting.d.ts +8 -8
  52. package/dist/reporting.js +4 -4
  53. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BRHa5Jxo.d.ts} +12 -6
  54. package/dist/rl.d.ts +10 -10
  55. package/dist/rl.js +6 -6
  56. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  57. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-CMHypZ_M.d.ts} +1 -1
  58. package/dist/{run-record-CqzahIbx.d.ts → run-record-BfX5y68A.d.ts} +43 -2
  59. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  60. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-D7AQS7eB.d.ts} +2 -2
  61. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  62. package/dist/traces.d.ts +533 -10
  63. package/dist/traces.js +14 -300
  64. package/dist/traces.js.map +1 -1
  65. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  66. package/dist/wire/index.d.ts +6 -6
  67. package/dist/wire/index.js +3 -3
  68. package/package.json +12 -21
  69. package/dist/chunk-NG236HPC.js.map +0 -1
  70. package/dist/chunk-NLMNWKVM.js.map +0 -1
  71. package/dist/chunk-RUI6SIHY.js.map +0 -1
  72. package/dist/chunk-UW4NOOZI.js.map +0 -1
  73. package/dist/replay-BX5Fm8en.d.ts +0 -529
  74. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  75. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-B73G44OH.js.map} +0 -0
  76. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  77. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  78. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  79. /package/dist/{chunk-PALJO75S.js.map → chunk-S4Y5VXMS.js.map} +0 -0
  80. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  81. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  82. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  83. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
package/dist/traces.js CHANGED
@@ -17,13 +17,23 @@ import {
17
17
  TraceNotFoundError,
18
18
  analyzeTraces,
19
19
  buildTraceAnalystTools,
20
+ buildTraceInsightContext,
21
+ buildTraceInsightPrompt,
20
22
  createReplayFetch,
23
+ defaultTraceInsightPanel,
24
+ describeTraceInsightScope,
25
+ domainEvidencePattern,
21
26
  exportRunAsOtlp,
27
+ inferDomainKeywords,
22
28
  iterateRawCalls,
29
+ planTraceInsightQuestions,
23
30
  redactString,
24
31
  redactValue,
25
- traceAnalystFunctionGroup
26
- } from "./chunk-UW4NOOZI.js";
32
+ scoreTraceInsightReadiness,
33
+ tokenizeDomainWords,
34
+ traceAnalystFunctionGroup,
35
+ traceAnalystOnRunComplete
36
+ } from "./chunk-HIO4UIS5.js";
27
37
  import {
28
38
  aggregateLlm,
29
39
  argHash,
@@ -47,7 +57,7 @@ import {
47
57
  RunIntegrityError,
48
58
  assertRunCaptured,
49
59
  throwIfRunIncomplete
50
- } from "./chunk-KTGTIOFD.js";
60
+ } from "./chunk-UBPIXOC4.js";
51
61
  import {
52
62
  FileSystemRawProviderSink,
53
63
  InMemoryRawProviderSink,
@@ -60,304 +70,8 @@ import {
60
70
  llmSpanFromProvider
61
71
  } from "./chunk-TVVP3ZZQ.js";
62
72
  import "./chunk-VSMTAMNK.js";
63
- import "./chunk-NG236HPC.js";
73
+ import "./chunk-QYJT52YW.js";
64
74
  import "./chunk-PZ5AY32C.js";
65
-
66
- // src/trace-analyst/hook.ts
67
- var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";
68
- function traceAnalystOnRunComplete(opts) {
69
- return async (ctx) => {
70
- if (opts.shouldRun && !opts.shouldRun(ctx)) return;
71
- const source = opts.analyze.source;
72
- if (source === void 0) {
73
- await ctx.store.appendEvent({
74
- eventId: `analyst-skip-${ctx.runId}`,
75
- runId: ctx.runId,
76
- kind: "log",
77
- timestamp: Date.now(),
78
- payload: { source: "trace_analyst_hook", reason: "no source configured" }
79
- });
80
- return;
81
- }
82
- const result = await analyzeTraces({ question: opts.question ?? DEFAULT_QUESTION }, {
83
- ...opts.analyze,
84
- source
85
- });
86
- if (opts.save) await opts.save(result, ctx);
87
- if (opts.gateOn && !opts.gateOn(result, ctx)) {
88
- await ctx.store.appendEvent({
89
- eventId: `analyst-gate-${ctx.runId}`,
90
- runId: ctx.runId,
91
- kind: "log",
92
- timestamp: Date.now(),
93
- payload: {
94
- source: "trace_analyst_hook",
95
- reason: "analyst_gate_failed",
96
- findings: result.findings
97
- }
98
- });
99
- }
100
- };
101
- }
102
-
103
- // src/trace-analyst/insights.ts
104
- var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
105
- "and",
106
- "advanced",
107
- "app",
108
- "build",
109
- "create",
110
- "easy",
111
- "expert",
112
- "extreme",
113
- "for",
114
- "from",
115
- "hard",
116
- "implementation",
117
- "integrate",
118
- "medium",
119
- "project",
120
- "task",
121
- "the",
122
- "this",
123
- "with",
124
- "workflow"
125
- ]);
126
- function tokenizeDomainWords(value) {
127
- return [...value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)].map((match) => match[0].toLowerCase()).filter((word) => !DOMAIN_STOP_WORDS.has(word));
128
- }
129
- function inferDomainKeywords(suite) {
130
- const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ""}`));
131
- const source = [
132
- suite.name,
133
- suite.collectionId ?? "",
134
- ...suite.tasks.flatMap((task) => [
135
- task.id,
136
- task.name,
137
- task.prompt ?? "",
138
- task.difficulty ?? "",
139
- ...task.tags ?? [],
140
- ...task.gaps ?? []
141
- ])
142
- ].join(" ");
143
- const counts = /* @__PURE__ */ new Map();
144
- for (const word of tokenizeDomainWords(source)) counts.set(word, (counts.get(word) ?? 0) + 1);
145
- return [...counts.entries()].filter(([word, count]) => count >= 2 || suiteWords.has(word)).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).map(([word]) => word).slice(0, 18);
146
- }
147
- function domainEvidencePattern(keywords) {
148
- const escaped = keywords.filter((keyword) => keyword.length >= 3).map((keyword) => keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
149
- return escaped.length > 0 ? new RegExp(`(?<![A-Za-z0-9])(?:${escaped.join("|")})(?![A-Za-z0-9])`, "i") : /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i;
150
- }
151
- function describeTraceInsightScope(suite) {
152
- const taskLabel = suite.tasks.length === 1 ? "1 implementation task" : `${suite.tasks.length} implementation tasks`;
153
- const tags = /* @__PURE__ */ new Map();
154
- for (const task of suite.tasks) {
155
- for (const tag of task.tags ?? []) tags.set(tag, (tags.get(tag) ?? 0) + 1);
156
- }
157
- const topTags = [...tags.entries()].sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).slice(0, 8).map(([tag]) => tag);
158
- if (topTags.length > 0) return `${taskLabel} across ${topTags.join(", ")}.`;
159
- const difficulties = [
160
- ...new Set(
161
- suite.tasks.map((task) => task.difficulty).filter((value) => Boolean(value))
162
- )
163
- ].join(", ");
164
- return `${taskLabel} across ${difficulties || "the selected benchmark scope"}.`;
165
- }
166
- function planTraceInsightQuestions(input) {
167
- const hasFailures = input.suite.tasks.some((task) => task.outcome && task.outcome !== "satisfied");
168
- const hasMultipleShots = input.suite.tasks.some(
169
- (task) => (task.gaps ?? []).some((gap) => /shot|review|retry|continue/i.test(gap))
170
- );
171
- const questions = [
172
- {
173
- id: "execution-path",
174
- question: "What did the worker actually do before the first meaningful implementation edit?",
175
- why: "Separates grounded execution from polished but shallow output."
176
- },
177
- {
178
- id: "research-grounding",
179
- question: "Did the worker inspect docs, source, examples, or package references before committing to an implementation path?",
180
- why: "Identifies whether failures came from weak retrieval, weak examples, or premature coding."
181
- },
182
- {
183
- id: "domain-proof",
184
- question: "Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?",
185
- why: "Keeps product-quality claims tied to concrete evidence."
186
- },
187
- {
188
- id: "root-cause",
189
- question: "For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?",
190
- why: "Turns trace observations into actionable ownership."
191
- },
192
- {
193
- id: "evidence-quality",
194
- question: "Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?",
195
- why: "Prevents unsupported customer-report conclusions."
196
- }
197
- ];
198
- if (hasMultipleShots) {
199
- questions.push({
200
- id: "reviewer-lift",
201
- question: "Where did reviewer feedback improve score, stall, or regress across shots?",
202
- why: "Shows whether the driver loop is learning or merely repeating work."
203
- });
204
- }
205
- if (hasFailures) {
206
- questions.push({
207
- id: "optimization-targets",
208
- question: "Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?",
209
- why: "Connects benchmark evidence to the optimization loop."
210
- });
211
- }
212
- return questions;
213
- }
214
- function buildTraceInsightContext(input) {
215
- return {
216
- suite: input.suite,
217
- scope: describeTraceInsightScope(input.suite),
218
- keywords: inferDomainKeywords(input.suite),
219
- questions: planTraceInsightQuestions(input),
220
- panel: defaultTraceInsightPanel(),
221
- findings: input.findings ?? [],
222
- agent: input.agent ?? null,
223
- totals: input.totals ?? null
224
- };
225
- }
226
- function scoreTraceInsightReadiness(context) {
227
- const failedTasks = context.suite.tasks.filter(
228
- (task) => task.outcome && task.outcome !== "satisfied"
229
- );
230
- const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds));
231
- const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id));
232
- const tasksWithGaps = context.suite.tasks.filter((task) => (task.gaps ?? []).length > 0);
233
- const gates = [
234
- {
235
- id: "domain-context",
236
- label: "Domain context inferred",
237
- passed: context.keywords.length > 0,
238
- severity: "high",
239
- detail: context.keywords.length > 0 ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(", ")}` : "No domain terms were inferred from suite, tasks, prompts, tags, or gaps."
240
- },
241
- {
242
- id: "panel-coverage",
243
- label: "Analyst panel planned",
244
- passed: context.panel.length >= 4 && context.questions.length >= 5,
245
- severity: "high",
246
- detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`
247
- },
248
- {
249
- id: "failure-coverage",
250
- label: "Failures mapped to findings",
251
- passed: failedTasks.length === 0 || failedTasksWithFindings.length / failedTasks.length >= 0.5,
252
- severity: "critical",
253
- detail: failedTasks.length === 0 ? "No failed tasks in suite." : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`
254
- },
255
- {
256
- id: "gap-evidence",
257
- label: "Task gaps captured",
258
- passed: failedTasks.length === 0 || tasksWithGaps.length / failedTasks.length >= 0.5,
259
- severity: "medium",
260
- detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`
261
- }
262
- ];
263
- const penalty = gates.reduce((sum, gate) => {
264
- if (gate.passed) return sum;
265
- if (gate.severity === "critical") return sum + 35;
266
- if (gate.severity === "high") return sum + 20;
267
- if (gate.severity === "medium") return sum + 10;
268
- return sum + 5;
269
- }, 0);
270
- const score = Math.max(0, Math.min(1, 1 - penalty / 100));
271
- return {
272
- score,
273
- grade: score >= 0.9 ? "external-ready" : score >= 0.7 ? "internal-review" : "raw-analysis",
274
- gates
275
- };
276
- }
277
- function defaultTraceInsightPanel() {
278
- return [
279
- {
280
- id: "trace-forensics",
281
- name: "Trace Forensics",
282
- responsibility: "Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason."
283
- },
284
- {
285
- id: "root-cause",
286
- name: "Root Cause",
287
- responsibility: "Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior."
288
- },
289
- {
290
- id: "optimization",
291
- name: "Optimization",
292
- responsibility: "Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next."
293
- },
294
- {
295
- id: "external-evidence",
296
- name: "External Evidence",
297
- responsibility: "Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence."
298
- }
299
- ];
300
- }
301
- function buildTraceInsightPrompt(input) {
302
- const context = buildTraceInsightContext(input);
303
- const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6;
304
- return `Analyze this benchmark run and produce evidence-backed trace intelligence.
305
-
306
- Audience:
307
- - internal AI/product leadership
308
- - possible customer-facing report for ${input.suite.name}
309
-
310
- Investigation plan:
311
- ${context.questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join("\n")}
312
-
313
- Analyst panel:
314
- ${context.panel.map((role) => `- ${role.name}: ${role.responsibility}`).join("\n")}
315
-
316
- If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.
317
-
318
- Required output:
319
- 1. Executive verdict: what this run proves and does not prove.
320
- 2. The investigation questions you answered and the evidence used.
321
- 3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.
322
- 4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.
323
- 5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.
324
- 6. What is safe for an external report versus what must stay internal.
325
- 7. One rerun plan that would validate lift after optimization.
326
-
327
- Budget:
328
- - Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.
329
- - Prefer traces named in the failure summary over broad exploration.
330
- - Do not do exhaustive trace sweeps.
331
- - Return the final report as soon as the taxonomy and examples are supported.
332
-
333
- Run summary:
334
- ${JSON.stringify(
335
- {
336
- suite: input.suite.name,
337
- scope: context.scope,
338
- inferredKeywords: context.keywords,
339
- agent: context.agent,
340
- totals: context.totals,
341
- findings: context.findings.map((finding) => ({
342
- kind: finding.kind,
343
- severity: finding.severity,
344
- taskCount: finding.taskIds.length,
345
- proposedFixClass: finding.proposedFixClass
346
- })),
347
- failures: input.suite.tasks.filter((task) => task.outcome && task.outcome !== "satisfied").map((task) => ({
348
- task: task.id,
349
- difficulty: task.difficulty,
350
- outcome: task.outcome,
351
- score: task.score,
352
- gaps: task.gaps ?? []
353
- }))
354
- },
355
- null,
356
- 2
357
- )}
358
-
359
- Use the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`;
360
- }
361
75
  export {
362
76
  DEFAULT_REDACTION_RULES,
363
77
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/trace-analyst/hook.ts","../src/trace-analyst/insights.ts"],"sourcesContent":["/**\n * Trace-analyst auto-execution hook.\n *\n * Wires `analyzeTraces` into a `TraceEmitter`'s `onRunComplete` so a\n * direct matrix run produces an analysis artifact without an out-of-band\n * step. Designed for the case where the consumer reports \"the analyst\n * never ran\" — the cause is almost always orchestration, not the analyst.\n *\n * Usage:\n *\n * const emitter = new TraceEmitter(store, {\n * onRunComplete: [traceAnalystOnRunComplete({ analyze: opts, save })],\n * })\n *\n * Hooks are best-effort by default — they never crash the underlying run.\n * The caller decides whether to gate the run on the analysis result via\n * the `gateOn` callback.\n */\n\nimport type { RunCompleteHook, RunCompleteHookContext } from '../trace/emitter'\nimport { type AnalyzeTracesOptions, type AnalyzeTracesResult, analyzeTraces } from './analyst'\n\nexport interface TraceAnalystHookOptions {\n /**\n * Options forwarded to `analyzeTraces`. The hook supplies the question\n * if you don't pass one — defaulting to a launch-grade prompt that asks\n * for failure modes, surprising findings, and a recommendation.\n */\n analyze: Omit<AnalyzeTracesOptions, 'source'> & { source?: AnalyzeTracesOptions['source'] }\n /**\n * Override the question. The default is intentionally generic:\n * \"Summarise what happened in this run, surface any failure modes,\n * surprising findings, or evidence the verdict is wrong.\"\n */\n question?: string\n /**\n * Persist the result. The hook calls this with the analysis output and\n * the run context. Common implementations write to a TraceAnalysisStore\n * or append to a per-run JSONL.\n */\n save?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => Promise<void>\n /**\n * Predicate gating execution per run. Default: every completed run.\n * Use to skip aborted runs, debug runs, or runs without LLM activity.\n */\n shouldRun?: (ctx: RunCompleteHookContext) => boolean\n /**\n * Optional gate: if set and returns false, the hook records the failure\n * as a log event on the run instead of staying quiet. The caller can\n * then trigger downstream alerts off `analyst_gate_failed` log events.\n */\n gateOn?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => boolean\n}\n\nconst DEFAULT_QUESTION =\n \"Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.\"\n\nexport function traceAnalystOnRunComplete(opts: TraceAnalystHookOptions): RunCompleteHook {\n return async (ctx: RunCompleteHookContext) => {\n if (opts.shouldRun && !opts.shouldRun(ctx)) return\n const source = opts.analyze.source\n if (source === undefined) {\n // The analyst needs a source. If the caller didn't supply one we don't\n // run — but we do leave a breadcrumb so the absence is visible.\n await ctx.store.appendEvent({\n eventId: `analyst-skip-${ctx.runId}`,\n runId: ctx.runId,\n kind: 'log',\n timestamp: Date.now(),\n payload: { source: 'trace_analyst_hook', reason: 'no source configured' },\n })\n return\n }\n const result = await analyzeTraces({ question: opts.question ?? DEFAULT_QUESTION }, {\n ...opts.analyze,\n source,\n } as AnalyzeTracesOptions)\n if (opts.save) await opts.save(result, ctx)\n if (opts.gateOn && !opts.gateOn(result, ctx)) {\n await ctx.store.appendEvent({\n eventId: `analyst-gate-${ctx.runId}`,\n runId: ctx.runId,\n kind: 'log',\n timestamp: Date.now(),\n payload: {\n source: 'trace_analyst_hook',\n reason: 'analyst_gate_failed',\n findings: result.findings,\n },\n })\n }\n }\n}\n","export interface TraceInsightTask {\n id: string\n name: string\n prompt?: string\n difficulty?: string\n tags?: string[]\n outcome?: string\n score?: number\n gaps?: string[]\n}\n\nexport interface TraceInsightSuite {\n name: string\n collectionId?: string\n tasks: TraceInsightTask[]\n}\n\nexport interface TraceInsightFinding {\n kind: string\n severity?: string\n taskIds: string[]\n evidence?: string\n proposedFixClass?: string\n}\n\nexport interface TraceInsightQuestion {\n id: string\n question: string\n why: string\n}\n\nexport interface TraceInsightPanelRole {\n id: string\n name: string\n responsibility: string\n}\n\nexport interface TraceInsightPromptInput {\n suite: TraceInsightSuite\n findings?: TraceInsightFinding[]\n agent?: Record<string, unknown>\n totals?: Record<string, unknown>\n maxRepresentativeTraces?: number\n}\n\nexport interface TraceInsightContext {\n suite: TraceInsightSuite\n scope: string\n keywords: string[]\n questions: TraceInsightQuestion[]\n panel: TraceInsightPanelRole[]\n findings: TraceInsightFinding[]\n agent: Record<string, unknown> | null\n totals: Record<string, unknown> | null\n}\n\nexport interface TraceInsightQualityGate {\n id: string\n label: string\n passed: boolean\n severity: 'critical' | 'high' | 'medium' | 'low'\n detail: string\n}\n\nexport interface TraceInsightReadiness {\n score: number\n grade: 'external-ready' | 'internal-review' | 'raw-analysis'\n gates: TraceInsightQualityGate[]\n}\n\nconst DOMAIN_STOP_WORDS = new Set([\n 'and',\n 'advanced',\n 'app',\n 'build',\n 'create',\n 'easy',\n 'expert',\n 'extreme',\n 'for',\n 'from',\n 'hard',\n 'implementation',\n 'integrate',\n 'medium',\n 'project',\n 'task',\n 'the',\n 'this',\n 'with',\n 'workflow',\n])\n\nexport function tokenizeDomainWords(value: string): string[] {\n return [...value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)]\n .map((match) => match[0].toLowerCase())\n .filter((word) => !DOMAIN_STOP_WORDS.has(word))\n}\n\nexport function inferDomainKeywords(suite: TraceInsightSuite): string[] {\n const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ''}`))\n const source = [\n suite.name,\n suite.collectionId ?? '',\n ...suite.tasks.flatMap((task) => [\n task.id,\n task.name,\n task.prompt ?? '',\n task.difficulty ?? '',\n ...(task.tags ?? []),\n ...(task.gaps ?? []),\n ]),\n ].join(' ')\n const counts = new Map<string, number>()\n for (const word of tokenizeDomainWords(source)) counts.set(word, (counts.get(word) ?? 0) + 1)\n return [...counts.entries()]\n .filter(([word, count]) => count >= 2 || suiteWords.has(word))\n .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))\n .map(([word]) => word)\n .slice(0, 18)\n}\n\nexport function domainEvidencePattern(keywords: string[]): RegExp {\n const escaped = keywords\n .filter((keyword) => keyword.length >= 3)\n .map((keyword) => keyword.replace(/[.*+?^${}()|[\\]\\\\]/g, '\\\\$&'))\n return escaped.length > 0\n ? new RegExp(`(?<![A-Za-z0-9])(?:${escaped.join('|')})(?![A-Za-z0-9])`, 'i')\n : /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i\n}\n\nexport function describeTraceInsightScope(suite: TraceInsightSuite): string {\n const taskLabel =\n suite.tasks.length === 1\n ? '1 implementation task'\n : `${suite.tasks.length} implementation tasks`\n const tags = new Map<string, number>()\n for (const task of suite.tasks) {\n for (const tag of task.tags ?? []) tags.set(tag, (tags.get(tag) ?? 0) + 1)\n }\n const topTags = [...tags.entries()]\n .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))\n .slice(0, 8)\n .map(([tag]) => tag)\n if (topTags.length > 0) return `${taskLabel} across ${topTags.join(', ')}.`\n const difficulties = [\n ...new Set(\n suite.tasks.map((task) => task.difficulty).filter((value): value is string => Boolean(value)),\n ),\n ].join(', ')\n return `${taskLabel} across ${difficulties || 'the selected benchmark scope'}.`\n}\n\nexport function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[] {\n const hasFailures = input.suite.tasks.some((task) => task.outcome && task.outcome !== 'satisfied')\n const hasMultipleShots = input.suite.tasks.some((task) =>\n (task.gaps ?? []).some((gap) => /shot|review|retry|continue/i.test(gap)),\n )\n const questions: TraceInsightQuestion[] = [\n {\n id: 'execution-path',\n question: 'What did the worker actually do before the first meaningful implementation edit?',\n why: 'Separates grounded execution from polished but shallow output.',\n },\n {\n id: 'research-grounding',\n question:\n 'Did the worker inspect docs, source, examples, or package references before committing to an implementation path?',\n why: 'Identifies whether failures came from weak retrieval, weak examples, or premature coding.',\n },\n {\n id: 'domain-proof',\n question:\n 'Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?',\n why: 'Keeps product-quality claims tied to concrete evidence.',\n },\n {\n id: 'root-cause',\n question:\n 'For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?',\n why: 'Turns trace observations into actionable ownership.',\n },\n {\n id: 'evidence-quality',\n question:\n 'Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?',\n why: 'Prevents unsupported customer-report conclusions.',\n },\n ]\n if (hasMultipleShots) {\n questions.push({\n id: 'reviewer-lift',\n question: 'Where did reviewer feedback improve score, stall, or regress across shots?',\n why: 'Shows whether the driver loop is learning or merely repeating work.',\n })\n }\n if (hasFailures) {\n questions.push({\n id: 'optimization-targets',\n question:\n 'Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?',\n why: 'Connects benchmark evidence to the optimization loop.',\n })\n }\n return questions\n}\n\nexport function buildTraceInsightContext(input: TraceInsightPromptInput): TraceInsightContext {\n return {\n suite: input.suite,\n scope: describeTraceInsightScope(input.suite),\n keywords: inferDomainKeywords(input.suite),\n questions: planTraceInsightQuestions(input),\n panel: defaultTraceInsightPanel(),\n findings: input.findings ?? [],\n agent: input.agent ?? null,\n totals: input.totals ?? null,\n }\n}\n\nexport function scoreTraceInsightReadiness(context: TraceInsightContext): TraceInsightReadiness {\n const failedTasks = context.suite.tasks.filter(\n (task) => task.outcome && task.outcome !== 'satisfied',\n )\n const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds))\n const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id))\n const tasksWithGaps = context.suite.tasks.filter((task) => (task.gaps ?? []).length > 0)\n const gates: TraceInsightQualityGate[] = [\n {\n id: 'domain-context',\n label: 'Domain context inferred',\n passed: context.keywords.length > 0,\n severity: 'high',\n detail:\n context.keywords.length > 0\n ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(', ')}`\n : 'No domain terms were inferred from suite, tasks, prompts, tags, or gaps.',\n },\n {\n id: 'panel-coverage',\n label: 'Analyst panel planned',\n passed: context.panel.length >= 4 && context.questions.length >= 5,\n severity: 'high',\n detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`,\n },\n {\n id: 'failure-coverage',\n label: 'Failures mapped to findings',\n passed:\n failedTasks.length === 0 || failedTasksWithFindings.length / failedTasks.length >= 0.5,\n severity: 'critical',\n detail:\n failedTasks.length === 0\n ? 'No failed tasks in suite.'\n : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`,\n },\n {\n id: 'gap-evidence',\n label: 'Task gaps captured',\n passed: failedTasks.length === 0 || tasksWithGaps.length / failedTasks.length >= 0.5,\n severity: 'medium',\n detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`,\n },\n ]\n const penalty = gates.reduce((sum, gate) => {\n if (gate.passed) return sum\n if (gate.severity === 'critical') return sum + 35\n if (gate.severity === 'high') return sum + 20\n if (gate.severity === 'medium') return sum + 10\n return sum + 5\n }, 0)\n const score = Math.max(0, Math.min(1, 1 - penalty / 100))\n return {\n score,\n grade: score >= 0.9 ? 'external-ready' : score >= 0.7 ? 'internal-review' : 'raw-analysis',\n gates,\n }\n}\n\nexport function defaultTraceInsightPanel(): TraceInsightPanelRole[] {\n return [\n {\n id: 'trace-forensics',\n name: 'Trace Forensics',\n responsibility:\n 'Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason.',\n },\n {\n id: 'root-cause',\n name: 'Root Cause',\n responsibility:\n 'Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior.',\n },\n {\n id: 'optimization',\n name: 'Optimization',\n responsibility:\n 'Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next.',\n },\n {\n id: 'external-evidence',\n name: 'External Evidence',\n responsibility:\n 'Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence.',\n },\n ]\n}\n\nexport function buildTraceInsightPrompt(input: TraceInsightPromptInput): string {\n const context = buildTraceInsightContext(input)\n const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6\n return `Analyze this benchmark run and produce evidence-backed trace intelligence.\n\nAudience:\n- internal AI/product leadership\n- possible customer-facing report for ${input.suite.name}\n\nInvestigation plan:\n${context.questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join('\\n')}\n\nAnalyst panel:\n${context.panel.map((role) => `- ${role.name}: ${role.responsibility}`).join('\\n')}\n\nIf the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.\n\nRequired output:\n1. Executive verdict: what this run proves and does not prove.\n2. The investigation questions you answered and the evidence used.\n3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.\n4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.\n5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.\n6. What is safe for an external report versus what must stay internal.\n7. One rerun plan that would validate lift after optimization.\n\nBudget:\n- Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.\n- Prefer traces named in the failure summary over broad exploration.\n- Do not do exhaustive trace sweeps.\n- Return the final report as soon as the taxonomy and examples are supported.\n\nRun summary:\n${JSON.stringify(\n {\n suite: input.suite.name,\n scope: context.scope,\n inferredKeywords: context.keywords,\n agent: context.agent,\n totals: context.totals,\n findings: context.findings.map((finding) => ({\n kind: finding.kind,\n severity: finding.severity,\n taskCount: finding.taskIds.length,\n proposedFixClass: finding.proposedFixClass,\n })),\n failures: input.suite.tasks\n .filter((task) => task.outcome && task.outcome !== 'satisfied')\n .map((task) => ({\n task: task.id,\n difficulty: task.difficulty,\n outcome: task.outcome,\n score: task.score,\n gaps: task.gaps ?? [],\n })),\n },\n null,\n 2,\n)}\n\nUse the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAsDA,IAAM,mBACJ;AAEK,SAAS,0BAA0B,MAAgD;AACxF,SAAO,OAAO,QAAgC;AAC5C,QAAI,KAAK,aAAa,CAAC,KAAK,UAAU,GAAG,EAAG;AAC5C,UAAM,SAAS,KAAK,QAAQ;AAC5B,QAAI,WAAW,QAAW;AAGxB,YAAM,IAAI,MAAM,YAAY;AAAA,QAC1B,SAAS,gBAAgB,IAAI,KAAK;AAAA,QAClC,OAAO,IAAI;AAAA,QACX,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,SAAS,EAAE,QAAQ,sBAAsB,QAAQ,uBAAuB;AAAA,MAC1E,CAAC;AACD;AAAA,IACF;AACA,UAAM,SAAS,MAAM,cAAc,EAAE,UAAU,KAAK,YAAY,iBAAiB,GAAG;AAAA,MAClF,GAAG,KAAK;AAAA,MACR;AAAA,IACF,CAAyB;AACzB,QAAI,KAAK,KAAM,OAAM,KAAK,KAAK,QAAQ,GAAG;AAC1C,QAAI,KAAK,UAAU,CAAC,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC5C,YAAM,IAAI,MAAM,YAAY;AAAA,QAC1B,SAAS,gBAAgB,IAAI,KAAK;AAAA,QAClC,OAAO,IAAI;AAAA,QACX,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,SAAS;AAAA,UACP,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,UAAU,OAAO;AAAA,QACnB;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACF;;;ACtBA,IAAM,oBAAoB,oBAAI,IAAI;AAAA,EAChC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,SAAS,oBAAoB,OAAyB;AAC3D,SAAO,CAAC,GAAG,MAAM,SAAS,8BAA8B,CAAC,EACtD,IAAI,CAAC,UAAU,MAAM,CAAC,EAAE,YAAY,CAAC,EACrC,OAAO,CAAC,SAAS,CAAC,kBAAkB,IAAI,IAAI,CAAC;AAClD;AAEO,SAAS,oBAAoB,OAAoC;AACtE,QAAM,aAAa,IAAI,IAAI,oBAAoB,GAAG,MAAM,IAAI,IAAI,MAAM,gBAAgB,EAAE,EAAE,CAAC;AAC3F,QAAM,SAAS;AAAA,IACb,MAAM;AAAA,IACN,MAAM,gBAAgB;AAAA,IACtB,GAAG,MAAM,MAAM,QAAQ,CAAC,SAAS;AAAA,MAC/B,KAAK;AAAA,MACL,KAAK;AAAA,MACL,KAAK,UAAU;AAAA,MACf,KAAK,cAAc;AAAA,MACnB,GAAI,KAAK,QAAQ,CAAC;AAAA,MAClB,GAAI,KAAK,QAAQ,CAAC;AAAA,IACpB,CAAC;AAAA,EACH,EAAE,KAAK,GAAG;AACV,QAAM,SAAS,oBAAI,IAAoB;AACvC,aAAW,QAAQ,oBAAoB,MAAM,EAAG,QAAO,IAAI,OAAO,OAAO,IAAI,IAAI,KAAK,KAAK,CAAC;AAC5F,SAAO,CAAC,GAAG,OAAO,QAAQ,CAAC,EACxB,OAAO,CAAC,CAAC,MAAM,KAAK,MAAM,SAAS,KAAK,WAAW,IAAI,IAAI,CAAC,EAC5D,KAAK,CAAC,GAAG,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC,EACtD,IAAI,CAAC,CAAC,IAAI,MAAM,IAAI,EACpB,MAAM,GAAG,EAAE;AAChB;AAEO,SAAS,sBAAsB,UAA4B;AAChE,QAAM,UAAU,SACb,OAAO,CAAC,YAAY,QAAQ,UAAU,CAAC,EACvC,IAAI,CAAC,YAAY,QAAQ,QAAQ,uBAAuB,MAAM,CAAC;AAClE,SAAO,QAAQ,SAAS,IACpB,IAAI,OAAO,sBAAsB,QAAQ,KAAK,GAAG,CAAC,oBAAoB,GAAG,IACzE;AACN;AAEO,SAAS,0BAA0B,OAAkC;AAC1E,QAAM,YACJ,MAAM,MAAM,WAAW,IACnB,0BACA,GAAG,MAAM,MAAM,MAAM;AAC3B,QAAM,OAAO,oBAAI,IAAoB;AACrC,aAAW,QAAQ,MAAM,OAAO;AAC9B,eAAW,OAAO,KAAK,QAAQ,CAAC,EAAG,MAAK,IAAI,MAAM,KAAK,IAAI,GAAG,KAAK,KAAK,CAAC;AAAA,EAC3E;AACA,QAAM,UAAU,CAAC,GAAG,KAAK,QAAQ,CAAC,EAC/B,KAAK,CAAC,GAAG,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC,EACtD,MAAM,GAAG,CAAC,EACV,IAAI,CAAC,CAAC,GAAG,MAAM,GAAG;AACrB,MAAI,QAAQ,SAAS,EAAG,QAAO,GAAG,SAAS,WAAW,QAAQ,KAAK,IAAI,CAAC;AACxE,QAAM,eAAe;AAAA,IACnB,GAAG,IAAI;AAAA,MACL,MAAM,MAAM,IAAI,CAAC,SAAS,KAAK,UAAU,EAAE,OAAO,CAAC,UAA2B,QAAQ,KAAK,CAAC;AAAA,IAC9F;AAAA,EACF,EAAE,KAAK,IAAI;AACX,SAAO,GAAG,SAAS,WAAW,gBAAgB,8BAA8B;AAC9E;AAEO,SAAS,0BAA0B,OAAwD;AAChG,QAAM,cAAc,MAAM,MAAM,MAAM,KAAK,CAAC,SAAS,KAAK,WAAW,KAAK,YAAY,WAAW;AACjG,QAAM,mBAAmB,MAAM,MAAM,MAAM;AAAA,IAAK,CAAC,UAC9C,KAAK,QAAQ,CAAC,GAAG,KAAK,CAAC,QAAQ,8BAA8B,KAAK,GAAG,CAAC;AAAA,EACzE;AACA,QAAM,YAAoC;AAAA,IACxC;AAAA,MACE,IAAI;AAAA,MACJ,UAAU;AAAA,MACV,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,EACF;AACA,MAAI,kBAAkB;AACpB,cAAU,KAAK;AAAA,MACb,IAAI;AAAA,MACJ,UAAU;AAAA,MACV,KAAK;AAAA,IACP,CAAC;AAAA,EACH;AACA,MAAI,aAAa;AACf,cAAU,KAAK;AAAA,MACb,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP,CAAC;AAAA,EACH;AACA,SAAO;AACT;AAEO,SAAS,yBAAyB,OAAqD;AAC5F,SAAO;AAAA,IACL,OAAO,MAAM;AAAA,IACb,OAAO,0BAA0B,MAAM,KAAK;AAAA,IAC5C,UAAU,oBAAoB,MAAM,KAAK;AAAA,IACzC,WAAW,0BAA0B,KAAK;AAAA,IAC1C,OAAO,yBAAyB;AAAA,IAChC,UAAU,MAAM,YAAY,CAAC;AAAA,IAC7B,OAAO,MAAM,SAAS;AAAA,IACtB,QAAQ,MAAM,UAAU;AAAA,EAC1B;AACF;AAEO,SAAS,2BAA2B,SAAqD;AAC9F,QAAM,cAAc,QAAQ,MAAM,MAAM;AAAA,IACtC,CAAC,SAAS,KAAK,WAAW,KAAK,YAAY;AAAA,EAC7C;AACA,QAAM,iBAAiB,IAAI,IAAI,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,OAAO,CAAC;AACrF,QAAM,0BAA0B,YAAY,OAAO,CAAC,SAAS,eAAe,IAAI,KAAK,EAAE,CAAC;AACxF,QAAM,gBAAgB,QAAQ,MAAM,MAAM,OAAO,CAAC,UAAU,KAAK,QAAQ,CAAC,GAAG,SAAS,CAAC;AACvF,QAAM,QAAmC;AAAA,IACvC;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QAAQ,QAAQ,SAAS,SAAS;AAAA,MAClC,UAAU;AAAA,MACV,QACE,QAAQ,SAAS,SAAS,IACtB,GAAG,QAAQ,SAAS,MAAM,2BAA2B,QAAQ,SAAS,MAAM,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,KAC5F;AAAA,IACR;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QAAQ,QAAQ,MAAM,UAAU,KAAK,QAAQ,UAAU,UAAU;AAAA,MACjE,UAAU;AAAA,MACV,QAAQ,GAAG,QAAQ,MAAM,MAAM,oBAAoB,QAAQ,UAAU,MAAM;AAAA,IAC7E;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QACE,YAAY,WAAW,KAAK,wBAAwB,SAAS,YAAY,UAAU;AAAA,MACrF,UAAU;AAAA,MACV,QACE,YAAY,WAAW,IACnB,8BACA,GAAG,wBAAwB,MAAM,IAAI,YAAY,MAAM;AAAA,IAC/D;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QAAQ,YAAY,WAAW,KAAK,cAAc,SAAS,YAAY,UAAU;AAAA,MACjF,UAAU;AAAA,MACV,QAAQ,GAAG,cAAc,MAAM;AAAA,IACjC;AAAA,EACF;AACA,QAAM,UAAU,MAAM,OAAO,CAAC,KAAK,SAAS;AAC1C,QAAI,KAAK,OAAQ,QAAO;AACxB,QAAI,KAAK,aAAa,WAAY,QAAO,MAAM;AAC/C,QAAI,KAAK,aAAa,OAAQ,QAAO,MAAM;AAC3C,QAAI,KAAK,aAAa,SAAU,QAAO,MAAM;AAC7C,WAAO,MAAM;AAAA,EACf,GAAG,CAAC;AACJ,QAAM,QAAQ,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,IAAI,UAAU,GAAG,CAAC;AACxD,SAAO;AAAA,IACL;AAAA,IACA,OAAO,SAAS,MAAM,mBAAmB,SAAS,MAAM,oBAAoB;AAAA,IAC5E;AAAA,EACF;AACF;AAEO,SAAS,2BAAoD;AAClE,SAAO;AAAA,IACL;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,EACF;AACF;AAEO,SAAS,wBAAwB,OAAwC;AAC9E,QAAM,UAAU,yBAAyB,KAAK;AAC9C,QAAM,0BAA0B,MAAM,2BAA2B;AACjE,SAAO;AAAA;AAAA;AAAA;AAAA,wCAI+B,MAAM,MAAM,IAAI;AAAA;AAAA;AAAA,EAGtD,QAAQ,UAAU,IAAI,CAAC,MAAM,UAAU,GAAG,QAAQ,CAAC,KAAK,KAAK,QAAQ,KAAK,KAAK,GAAG,GAAG,EAAE,KAAK,IAAI,CAAC;AAAA;AAAA;AAAA,EAGjG,QAAQ,MAAM,IAAI,CAAC,SAAS,KAAK,KAAK,IAAI,KAAK,KAAK,cAAc,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,mEAcf,uBAAuB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMxF,KAAK;AAAA,IACL;AAAA,MACE,OAAO,MAAM,MAAM;AAAA,MACnB,OAAO,QAAQ;AAAA,MACf,kBAAkB,QAAQ;AAAA,MAC1B,OAAO,QAAQ;AAAA,MACf,QAAQ,QAAQ;AAAA,MAChB,UAAU,QAAQ,SAAS,IAAI,CAAC,aAAa;AAAA,QAC3C,MAAM,QAAQ;AAAA,QACd,UAAU,QAAQ;AAAA,QAClB,WAAW,QAAQ,QAAQ;AAAA,QAC3B,kBAAkB,QAAQ;AAAA,MAC5B,EAAE;AAAA,MACF,UAAU,MAAM,MAAM,MACnB,OAAO,CAAC,SAAS,KAAK,WAAW,KAAK,YAAY,WAAW,EAC7D,IAAI,CAAC,UAAU;AAAA,QACd,MAAM,KAAK;AAAA,QACX,YAAY,KAAK;AAAA,QACjB,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ,MAAM,KAAK,QAAQ,CAAC;AAAA,MACtB,EAAE;AAAA,IACN;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AAAA;AAAA;AAGD;","names":[]}
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -1,4 +1,4 @@
1
- import { S as Span, b as TraceEvent, T as TraceStore } from './store-BP5be6s7.js';
1
+ import { S as Span, b as TraceEvent, T as TraceStore } from './store-Db2Bv8Cf.js';
2
2
 
3
3
  /**
4
4
  * Trajectory — ordered, structured view over a run's spans.
@@ -1,14 +1,14 @@
1
- import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-j0nJFgC6.js';
2
- import { T as TraceStore } from '../store-BP5be6s7.js';
1
+ import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-iATEAHmc.js';
2
+ import { T as TraceStore } from '../store-Db2Bv8Cf.js';
3
3
  import { z } from 'zod';
4
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
5
5
  import * as hono_types from 'hono/types';
6
6
  import { ServerType } from '@hono/node-server';
7
7
  import { Hono } from 'hono';
8
- import '../control-runtime-BRdQ0wrx.js';
9
- import '../emitter-BqjeOvJh.js';
10
- import '../dataset-CiK_3LDr.js';
11
- import '../errors-BZ9sTdz7.js';
8
+ import '../control-runtime-BZ_lVLYW.js';
9
+ import '../emitter-DP_cSSiw.js';
10
+ import '../dataset-ueRVTUoY.js';
11
+ import '../errors-mje_cKOs.js';
12
12
 
13
13
  declare const RubricDimensionSchema: z.ZodObject<{
14
14
  id: z.ZodString;
@@ -33,10 +33,10 @@ import {
33
33
  runRpcBatch,
34
34
  runRpcOnce,
35
35
  startServer
36
- } from "../chunk-XFZCM5Z3.js";
37
- import "../chunk-4S4BM3QQ.js";
36
+ } from "../chunk-SMSGXM74.js";
37
+ import "../chunk-M6RZ5LJN.js";
38
38
  import "../chunk-PC4UYEBM.js";
39
- import "../chunk-NG236HPC.js";
39
+ import "../chunk-QYJT52YW.js";
40
40
  import "../chunk-PZ5AY32C.js";
41
41
  export {
42
42
  BUILTIN_RUBRICS,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.30.0",
3
+ "version": "0.31.1",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -109,17 +109,6 @@
109
109
  "publishConfig": {
110
110
  "access": "public"
111
111
  },
112
- "scripts": {
113
- "build": "tsup && pnpm openapi",
114
- "dev": "tsup --watch",
115
- "prepare": "pnpm build",
116
- "test": "vitest run",
117
- "test:watch": "vitest",
118
- "typecheck": "tsc --noEmit",
119
- "lint": "biome check src",
120
- "format": "biome format --write src",
121
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
122
- },
123
112
  "dependencies": {
124
113
  "@asteasolutions/zod-to-openapi": "^8.5.0",
125
114
  "@ax-llm/ax": "^19.0.25",
@@ -136,16 +125,18 @@
136
125
  "typescript": "^5.7.0",
137
126
  "vitest": "^3.0.0"
138
127
  },
139
- "pnpm": {
140
- "minimumReleaseAge": 4320,
141
- "minimumReleaseAgeExclude": [],
142
- "overrides": {
143
- "postcss@<8.5.10": "^8.5.10"
144
- }
145
- },
146
128
  "engines": {
147
129
  "node": ">=20"
148
130
  },
149
131
  "license": "MIT",
150
- "packageManager": "pnpm@10.22.0"
151
- }
132
+ "scripts": {
133
+ "build": "tsup && pnpm openapi",
134
+ "dev": "tsup --watch",
135
+ "test": "vitest run",
136
+ "test:watch": "vitest",
137
+ "typecheck": "tsc --noEmit",
138
+ "lint": "biome check src",
139
+ "format": "biome format --write src",
140
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
141
+ }
142
+ }
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/errors.ts"],"sourcesContent":["/**\n * Error taxonomy for `@tangle-network/agent-eval`.\n *\n * Every error this package throws as part of its *public contract* extends\n * `AgentEvalError`. Consumers can pattern-match by `instanceof <Subclass>` or\n * by the stable string `code` carried on the base class.\n *\n * The codes are stable across minor versions; new codes can be added, but\n * existing codes never change meaning. New subclasses are non-breaking.\n *\n * Internal invariant guards (`throw new Error('this should never happen')`)\n * remain plain `Error`s on purpose — they're programmer-mistake assertions,\n * not consumer-catchable contract failures.\n */\n\nexport type AgentEvalErrorCode =\n | 'validation'\n | 'not_found'\n | 'config'\n | 'capture_integrity'\n | 'judge'\n | 'verification'\n | 'replay'\n\nexport class AgentEvalError extends Error {\n /** Stable string code. Survives minification; safe to switch on. */\n readonly code: AgentEvalErrorCode\n\n constructor(code: AgentEvalErrorCode, message: string, options?: { cause?: unknown }) {\n super(message, options)\n this.name = this.constructor.name\n this.code = code\n }\n}\n\n/** Caller passed invalid arguments (out of range, mutually-exclusive options, bad shape). */\nexport class ValidationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('validation', message, options)\n }\n}\n\n/** A named resource (run, span, rubric, scenario, dataset row, route) does not exist. */\nexport class NotFoundError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('not_found', message, options)\n }\n}\n\n/** Configuration missing or malformed (`HOME` unset, required image not supplied, env var absent). */\nexport class ConfigError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('config', message, options)\n }\n}\n\n/**\n * A run is missing the artifacts a launch-grade check requires:\n * raw HTTP capture absent, no LLM spans, route assertion failed, run-end\n * assertion tripped. Block ship on this; do not catch and move on.\n */\nexport class CaptureIntegrityError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('capture_integrity', message, options)\n }\n}\n\n/** A judge call failed in a way that's not retryable: schema parse failure, bad rubric, conflicting dimensions. */\nexport class JudgeError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('judge', message, options)\n }\n}\n\n/** A verifier signalled a hard failure (compile, test, schema) — distinct from a low judge score. */\nexport class VerificationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('verification', message, options)\n }\n}\n\n/** Replay cache cannot satisfy a request: miss with no fallback, sink lacks list(), unsupported URL. */\nexport class ReplayError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('replay', message, options)\n }\n}\n"],"mappings":";AAwBO,IAAM,iBAAN,cAA6B,MAAM;AAAA;AAAA,EAE/B;AAAA,EAET,YAAY,MAA0B,SAAiB,SAA+B;AACpF,UAAM,SAAS,OAAO;AACtB,SAAK,OAAO,KAAK,YAAY;AAC7B,SAAK,OAAO;AAAA,EACd;AACF;AAGO,IAAM,kBAAN,cAA8B,eAAe;AAAA,EAClD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,cAAc,SAAS,OAAO;AAAA,EACtC;AACF;AAGO,IAAM,gBAAN,cAA4B,eAAe;AAAA,EAChD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,aAAa,SAAS,OAAO;AAAA,EACrC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;AAOO,IAAM,wBAAN,cAAoC,eAAe;AAAA,EACxD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,qBAAqB,SAAS,OAAO;AAAA,EAC7C;AACF;AAGO,IAAM,aAAN,cAAyB,eAAe;AAAA,EAC7C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,SAAS,SAAS,OAAO;AAAA,EACjC;AACF;AAGO,IAAM,oBAAN,cAAgC,eAAe;AAAA,EACpD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,gBAAgB,SAAS,OAAO;AAAA,EACxC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;","names":[]}
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/run-record.ts"],"sourcesContent":["/**\n * Paper-grade RunRecord schema + runtime validator.\n *\n * Every run that participates in a promotion gate, paper table, or\n * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory\n * fields are exactly those the paper \"Two Loops, Three Roles\" requires\n * for reproducibility: who/what/when/cost/seed/hash, plus the search vs\n * holdout split tag and either a `searchScore` or a `holdoutScore`.\n *\n * This is intentionally NOT a replacement for the rich `Run` /\n * `ProposeReviewReport` / `ScenarioResult` types already in the\n * package. Those are runtime structures with full provenance. A\n * `RunRecord` is the analysis-time projection — the JSON-friendly\n * row you'd put in a parquet file or paste into a notebook.\n *\n * Validate at the boundary:\n *\n * const rec = validateRunRecord(rawJson) // throws on missing\n * const ok = isRunRecord(rawJson) // boolean check\n * const rec = parseRunRecordSafe(rawJson) // { ok, value | error }\n *\n * The validator runs in pure TS — zod is intentionally NOT a\n * dependency. Round-trip tested in `tests/run-record.test.ts`.\n */\n\n/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the\n * combined train+test pool that the optimizer is allowed to read. */\nexport type RunSplitTag = 'search' | 'dev' | 'holdout'\n\nexport interface RunTokenUsage {\n input: number\n output: number\n cached?: number\n}\n\nexport interface RunJudgeMetadata {\n model: string\n promptVersion: string\n /** [0,1] confidence the judge declared. Constant judge confidence\n * across many runs is a fallback signal (see `canary.ts`). */\n confidence: number\n /** True if the judge degraded to a fallback path (rules-only,\n * prior-call cache, etc.). The canary uses this to alert. */\n fallback: boolean\n}\n\nexport interface RunOutcome {\n /** Score on the search/optimization split. Optional because a\n * holdout-only evaluation only fills `holdoutScore`. */\n searchScore?: number\n /** Score on the held-out split. Optional because a search-only run\n * only fills `searchScore`. At least one must be present. */\n holdoutScore?: number\n /** Bag of any other metric the run produced — judge dimensions,\n * pass/fail counters, latency stats, etc. Numeric only — keeps\n * reporters honest. */\n raw: Record<string, number>\n}\n\n/**\n * Mandatory paper-grade fields for a single evaluation run. Optional\n * fields are extension points; mandatory fields throw if missing.\n *\n * Hash discipline:\n * - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the\n * model (after any steering bundle merge).\n * - `configHash` is the sha256 of the effective run config (model,\n * temperature, tools, judges, splits). The pair (promptHash,\n * configHash) uniquely identifies an experimental cell.\n *\n * Model snapshot discipline:\n * - `model` MUST encode a snapshot version. Bare aliases like\n * `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.\n * Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.\n */\nexport interface RunRecord {\n /** UUID for the run. */\n runId: string\n /** Logical experiment grouping (a treatment vs a baseline within\n * the same sweep should share `experimentId`). */\n experimentId: string\n /** Stable identifier for the candidate (variant) being run. The\n * promotion gate compares two `candidateId`s on matched items. */\n candidateId: string\n /** RNG seed for the run. Always recorded — silent re-seeding is\n * the most common cause of non-reproducible numbers. */\n seed: number\n /** Model identifier WITH snapshot version. */\n model: string\n /** sha256 of the effective prompt (post-steering). */\n promptHash: string\n /** sha256 of the effective config. */\n configHash: string\n /** Git SHA the harness was run from. */\n commitSha: string\n /** End-to-end wall-clock duration in milliseconds. */\n wallMs: number\n /** Time spent queued before execution started, if known. */\n queueMs?: number\n /** Total USD cost. Mandatory — runs without a cost number are\n * unbounded by definition and must not be admitted into the gate. */\n costUsd: number\n /** Token usage breakdown. */\n tokenUsage: RunTokenUsage\n /** Judge-side metadata, if a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n /** Per-split scores + raw bag. */\n outcome: RunOutcome\n /** Categorical failure tag, when the run failed and the harness\n * classified it. Free-form string; standard tags live in\n * `failure-taxonomy.ts`. */\n failureMode?: string\n /** Which split this run was drawn from. */\n splitTag: RunSplitTag\n /**\n * Stable scenario identifier the run was scored against. Optional for\n * backwards compatibility, but **strongly recommended**: every primitive\n * that pairs runs by scenario (preferences, paired stats, BT tournament)\n * keys on this. The campaign artifact populates it canonically; legacy\n * runs without it fall back to inference from `outcome.raw.scenario_id`\n * or `experimentId`.\n */\n scenarioId?: string\n}\n\n// ── Validation ───────────────────────────────────────────────────────\n\nconst MANDATORY_TOP_LEVEL = [\n 'runId',\n 'experimentId',\n 'candidateId',\n 'seed',\n 'model',\n 'promptHash',\n 'configHash',\n 'commitSha',\n 'wallMs',\n 'costUsd',\n 'tokenUsage',\n 'outcome',\n 'splitTag',\n] as const\n\nimport { ValidationError } from './errors'\n\nconst SPLIT_TAGS: ReadonlyArray<RunSplitTag> = ['search', 'dev', 'holdout']\n\nexport class RunRecordValidationError extends ValidationError {\n readonly path: string\n constructor(message: string, path = '') {\n super(path ? `${message} (at ${path})` : message)\n this.path = path\n }\n}\n\n/**\n * Strict validator. Throws `RunRecordValidationError` on the first\n * missing or wrongly-typed field. Returns the input cast to\n * `RunRecord` on success — the validator does not coerce.\n */\nexport function validateRunRecord(input: unknown): RunRecord {\n if (input === null || typeof input !== 'object') {\n throw new RunRecordValidationError('expected object')\n }\n const obj = input as Record<string, unknown>\n\n for (const key of MANDATORY_TOP_LEVEL) {\n if (!(key in obj)) {\n throw new RunRecordValidationError(`missing mandatory field \"${key}\"`)\n }\n }\n\n expectString(obj.runId, 'runId')\n expectString(obj.experimentId, 'experimentId')\n expectString(obj.candidateId, 'candidateId')\n expectFiniteNumber(obj.seed, 'seed')\n expectString(obj.model, 'model')\n expectString(obj.promptHash, 'promptHash')\n expectString(obj.configHash, 'configHash')\n expectString(obj.commitSha, 'commitSha')\n expectFiniteNumber(obj.wallMs, 'wallMs')\n if (obj.queueMs !== undefined) expectFiniteNumber(obj.queueMs, 'queueMs')\n expectFiniteNumber(obj.costUsd, 'costUsd')\n\n // Snapshot discipline: bare model aliases are not paper-grade.\n if (!modelHasSnapshot(obj.model as string)) {\n throw new RunRecordValidationError(\n `model \"${obj.model}\" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,\n 'model',\n )\n }\n\n // Token usage.\n const tu = obj.tokenUsage\n if (tu === null || typeof tu !== 'object') {\n throw new RunRecordValidationError('tokenUsage must be an object', 'tokenUsage')\n }\n const tuRec = tu as Record<string, unknown>\n expectFiniteNumber(tuRec.input, 'tokenUsage.input')\n expectFiniteNumber(tuRec.output, 'tokenUsage.output')\n if (tuRec.cached !== undefined) expectFiniteNumber(tuRec.cached, 'tokenUsage.cached')\n\n // Judge metadata, optional.\n if (obj.judgeMetadata !== undefined) {\n const jm = obj.judgeMetadata\n if (jm === null || typeof jm !== 'object') {\n throw new RunRecordValidationError('judgeMetadata must be an object', 'judgeMetadata')\n }\n const jmRec = jm as Record<string, unknown>\n expectString(jmRec.model, 'judgeMetadata.model')\n expectString(jmRec.promptVersion, 'judgeMetadata.promptVersion')\n expectFiniteNumber(jmRec.confidence, 'judgeMetadata.confidence')\n if (typeof jmRec.fallback !== 'boolean') {\n throw new RunRecordValidationError(\n 'judgeMetadata.fallback must be boolean',\n 'judgeMetadata.fallback',\n )\n }\n }\n\n // Outcome.\n const out = obj.outcome\n if (out === null || typeof out !== 'object') {\n throw new RunRecordValidationError('outcome must be an object', 'outcome')\n }\n const outRec = out as Record<string, unknown>\n if (outRec.searchScore !== undefined)\n expectFiniteNumber(outRec.searchScore, 'outcome.searchScore')\n if (outRec.holdoutScore !== undefined)\n expectFiniteNumber(outRec.holdoutScore, 'outcome.holdoutScore')\n if (outRec.searchScore === undefined && outRec.holdoutScore === undefined) {\n throw new RunRecordValidationError(\n 'outcome must define searchScore or holdoutScore (or both)',\n 'outcome',\n )\n }\n const raw = outRec.raw\n if (raw === null || typeof raw !== 'object') {\n throw new RunRecordValidationError('outcome.raw must be an object', 'outcome.raw')\n }\n for (const [k, v] of Object.entries(raw as Record<string, unknown>)) {\n expectFiniteNumber(v, `outcome.raw.${k}`)\n }\n\n // Failure mode optional.\n if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode')\n\n // Split tag.\n if (typeof obj.splitTag !== 'string' || !SPLIT_TAGS.includes(obj.splitTag as RunSplitTag)) {\n throw new RunRecordValidationError(\n `splitTag must be one of ${SPLIT_TAGS.join(', ')}, got ${String(obj.splitTag)}`,\n 'splitTag',\n )\n }\n\n return input as RunRecord\n}\n\n/** Boolean validator — convenience for filtering arrays. */\nexport function isRunRecord(input: unknown): input is RunRecord {\n try {\n validateRunRecord(input)\n return true\n } catch {\n return false\n }\n}\n\n/** Non-throwing validator — returns a discriminated union. */\nexport function parseRunRecordSafe(\n input: unknown,\n): { ok: true; value: RunRecord } | { ok: false; error: RunRecordValidationError } {\n try {\n return { ok: true, value: validateRunRecord(input) }\n } catch (e) {\n if (e instanceof RunRecordValidationError) return { ok: false, error: e }\n throw e\n }\n}\n\n/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */\nexport function roundTripRunRecord(record: RunRecord): RunRecord {\n const json = JSON.stringify(record)\n return validateRunRecord(JSON.parse(json))\n}\n\n// ── Internals ────────────────────────────────────────────────────────\n\nfunction expectString(value: unknown, path: string): void {\n if (typeof value !== 'string' || value.length === 0) {\n throw new RunRecordValidationError(`expected non-empty string`, path)\n }\n}\n\nfunction expectFiniteNumber(value: unknown, path: string): void {\n if (typeof value !== 'number' || !Number.isFinite(value)) {\n throw new RunRecordValidationError(`expected finite number`, path)\n }\n}\n\n/**\n * Heuristic snapshot check. Accepts:\n * - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)\n * - `name-YYYYMMDD` (OpenAI style: `gpt-4o-2024-11-20`)\n * - `name@<arbitrary-token>` (allow opaque snapshots like `@v3`)\n * - explicit `:date-...` Vertex-style tags\n *\n * Rejects bare aliases like `claude-sonnet-4` or `gpt-4o` that remap\n * silently as providers ship new snapshots.\n */\nfunction modelHasSnapshot(model: string): boolean {\n if (model.includes('@')) return true\n if (/-\\d{8}$/.test(model)) return true\n if (/-\\d{4}-\\d{2}-\\d{2}$/.test(model)) return true\n if (/:date-/.test(model)) return true\n return false\n}\n"],"mappings":";;;;;AA+HA,IAAM,sBAAsB;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAIA,IAAM,aAAyC,CAAC,UAAU,OAAO,SAAS;AAEnE,IAAM,2BAAN,cAAuC,gBAAgB;AAAA,EACnD;AAAA,EACT,YAAY,SAAiB,OAAO,IAAI;AACtC,UAAM,OAAO,GAAG,OAAO,QAAQ,IAAI,MAAM,OAAO;AAChD,SAAK,OAAO;AAAA,EACd;AACF;AAOO,SAAS,kBAAkB,OAA2B;AAC3D,MAAI,UAAU,QAAQ,OAAO,UAAU,UAAU;AAC/C,UAAM,IAAI,yBAAyB,iBAAiB;AAAA,EACtD;AACA,QAAM,MAAM;AAEZ,aAAW,OAAO,qBAAqB;AACrC,QAAI,EAAE,OAAO,MAAM;AACjB,YAAM,IAAI,yBAAyB,4BAA4B,GAAG,GAAG;AAAA,IACvE;AAAA,EACF;AAEA,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,cAAc,cAAc;AAC7C,eAAa,IAAI,aAAa,aAAa;AAC3C,qBAAmB,IAAI,MAAM,MAAM;AACnC,eAAa,IAAI,OAAO,OAAO;AAC/B,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,YAAY,YAAY;AACzC,eAAa,IAAI,WAAW,WAAW;AACvC,qBAAmB,IAAI,QAAQ,QAAQ;AACvC,MAAI,IAAI,YAAY,OAAW,oBAAmB,IAAI,SAAS,SAAS;AACxE,qBAAmB,IAAI,SAAS,SAAS;AAGzC,MAAI,CAAC,iBAAiB,IAAI,KAAe,GAAG;AAC1C,UAAM,IAAI;AAAA,MACR,UAAU,IAAI,KAAK;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AAGA,QAAM,KAAK,IAAI;AACf,MAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,UAAM,IAAI,yBAAyB,gCAAgC,YAAY;AAAA,EACjF;AACA,QAAM,QAAQ;AACd,qBAAmB,MAAM,OAAO,kBAAkB;AAClD,qBAAmB,MAAM,QAAQ,mBAAmB;AACpD,MAAI,MAAM,WAAW,OAAW,oBAAmB,MAAM,QAAQ,mBAAmB;AAGpF,MAAI,IAAI,kBAAkB,QAAW;AACnC,UAAM,KAAK,IAAI;AACf,QAAI,OAAO,QAAQ,OAAO,OAAO,UAAU;AACzC,YAAM,IAAI,yBAAyB,mCAAmC,eAAe;AAAA,IACvF;AACA,UAAM,QAAQ;AACd,iBAAa,MAAM,OAAO,qBAAqB;AAC/C,iBAAa,MAAM,eAAe,6BAA6B;AAC/D,uBAAmB,MAAM,YAAY,0BAA0B;AAC/D,QAAI,OAAO,MAAM,aAAa,WAAW;AACvC,YAAM,IAAI;AAAA,QACR;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAGA,QAAM,MAAM,IAAI;AAChB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,6BAA6B,SAAS;AAAA,EAC3E;AACA,QAAM,SAAS;AACf,MAAI,OAAO,gBAAgB;AACzB,uBAAmB,OAAO,aAAa,qBAAqB;AAC9D,MAAI,OAAO,iBAAiB;AAC1B,uBAAmB,OAAO,cAAc,sBAAsB;AAChE,MAAI,OAAO,gBAAgB,UAAa,OAAO,iBAAiB,QAAW;AACzE,UAAM,IAAI;AAAA,MACR;AAAA,MACA;AAAA,IACF;AAAA,EACF;AACA,QAAM,MAAM,OAAO;AACnB,MAAI,QAAQ,QAAQ,OAAO,QAAQ,UAAU;AAC3C,UAAM,IAAI,yBAAyB,iCAAiC,aAAa;AAAA,EACnF;AACA,aAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,GAA8B,GAAG;AACnE,uBAAmB,GAAG,eAAe,CAAC,EAAE;AAAA,EAC1C;AAGA,MAAI,IAAI,gBAAgB,OAAW,cAAa,IAAI,aAAa,aAAa;AAG9E,MAAI,OAAO,IAAI,aAAa,YAAY,CAAC,WAAW,SAAS,IAAI,QAAuB,GAAG;AACzF,UAAM,IAAI;AAAA,MACR,2BAA2B,WAAW,KAAK,IAAI,CAAC,SAAS,OAAO,IAAI,QAAQ,CAAC;AAAA,MAC7E;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAGO,SAAS,YAAY,OAAoC;AAC9D,MAAI;AACF,sBAAkB,KAAK;AACvB,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAGO,SAAS,mBACd,OACiF;AACjF,MAAI;AACF,WAAO,EAAE,IAAI,MAAM,OAAO,kBAAkB,KAAK,EAAE;AAAA,EACrD,SAAS,GAAG;AACV,QAAI,aAAa,yBAA0B,QAAO,EAAE,IAAI,OAAO,OAAO,EAAE;AACxE,UAAM;AAAA,EACR;AACF;AAGO,SAAS,mBAAmB,QAA8B;AAC/D,QAAM,OAAO,KAAK,UAAU,MAAM;AAClC,SAAO,kBAAkB,KAAK,MAAM,IAAI,CAAC;AAC3C;AAIA,SAAS,aAAa,OAAgB,MAAoB;AACxD,MAAI,OAAO,UAAU,YAAY,MAAM,WAAW,GAAG;AACnD,UAAM,IAAI,yBAAyB,6BAA6B,IAAI;AAAA,EACtE;AACF;AAEA,SAAS,mBAAmB,OAAgB,MAAoB;AAC9D,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,GAAG;AACxD,UAAM,IAAI,yBAAyB,0BAA0B,IAAI;AAAA,EACnE;AACF;AAYA,SAAS,iBAAiB,OAAwB;AAChD,MAAI,MAAM,SAAS,GAAG,EAAG,QAAO;AAChC,MAAI,UAAU,KAAK,KAAK,EAAG,QAAO;AAClC,MAAI,sBAAsB,KAAK,KAAK,EAAG,QAAO;AAC9C,MAAI,SAAS,KAAK,KAAK,EAAG,QAAO;AACjC,SAAO;AACT;","names":[]}
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/eval-campaign.ts"],"sourcesContent":["/**\n * EvalCampaign — opinionated matrix runner that wires the four\n * capture-integrity directives by construction.\n *\n * The canonical benchmark shape — matrix runner → for each\n * (variant, scenario, seed) → start a TraceEmitter → call LLMs → end the\n * run → analyze — has a bug class at the integration boundary: raw\n * events not captured, route silently wrong, integrity not asserted,\n * analyst never run. The directives in `SKILL.md § Capture integrity`\n * are the mitigations.\n *\n * `EvalCampaign` is the structural fix — consumers don't wire the\n * integrity surface themselves; the campaign owns it. Specifically:\n *\n * - calls `assertLlmRoute` once at preflight before any work runs\n * - constructs a per-run `TraceStore` and `RawProviderSink` via factories\n * - constructs the `TraceEmitter` with `onRunComplete: [analyst hook]`\n * - hands the runner an `LlmClientOptions` pre-wired with the sink and\n * trace context — the runner can't accidentally call an LLM without\n * capturing the raw HTTP envelope\n * - calls `assertRunCaptured` after every `endRun` and routes failures\n * through a configurable policy (`throw` / `mark_failed` / `log`)\n * - assembles per-run `RunRecord`s and runs `researchReport` at the end\n * so the campaign artifact is launch-decision-grade by default\n * - embeds the campaign fingerprint (a SHA-256 over the canonicalised\n * run set) and optional `preregistrationHash` in the report\n *\n * The runner contract is intentionally narrow: produce a `CampaignRunOutcome`\n * given a fully-wired `CampaignRunContext`. Everything orchestration-shaped\n * lives in the campaign. This is the inversion-of-control point — consumers\n * stop writing matrix runners and start writing scenario-runners.\n *\n * Out of scope for v1 (tracked in `docs/research-report-methodology.md`):\n *\n * - Distributed/cluster execution (concurrency is local async)\n * - Adaptive sampling / sequential interim looks\n * - Resume from partial state across crashes\n * - LLM-call retry beyond what `LlmClient` already does\n */\n\nimport { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client'\nimport { canonicalize, hashJson } from './pre-registration'\nimport type {\n RunJudgeMetadata,\n RunOutcome,\n RunRecord,\n RunSplitTag,\n RunTokenUsage,\n} from './run-record'\nimport { type ResearchReport, type ResearchReportOptions, researchReport } from './summary-report'\nimport type { RunCompleteHook } from './trace/emitter'\nimport { TraceEmitter } from './trace/emitter'\nimport {\n assertRunCaptured,\n RunIntegrityError,\n type RunIntegrityExpectations,\n type RunIntegrityReport,\n} from './trace/integrity'\nimport { FileSystemRawProviderSink, type RawProviderSink } from './trace/raw-provider-sink'\nimport type { TraceStore } from './trace/store'\n\n// ── Public types ─────────────────────────────────────────────────────────\n\nexport interface CampaignVariant<V> {\n id: string\n payload: V\n}\n\nexport interface CampaignScenario {\n scenarioId: string\n /** Free-form metadata propagated to runs and reports. */\n tags?: Record<string, string>\n}\n\nexport interface CampaignRunContext<V> {\n /** Stable run id. The campaign generates this; the runner does not. */\n runId: string\n /** Logical experiment id (campaignId by default; overridable per-run via opts). */\n experimentId: string\n variant: V\n variantId: string\n scenarioId: string\n scenarioTags: Record<string, string>\n seed: number\n splitTag: RunSplitTag\n /**\n * The TraceEmitter for this run, with `onRunComplete` hooks pre-wired\n * (analyst auto-execution if configured, plus integrity check). The\n * runner MUST call `emitter.startRun` before doing any work and either\n * `emitter.endRun` or `emitter.abortRun` before returning.\n */\n emitter: TraceEmitter\n store: TraceStore\n rawSink: RawProviderSink\n /**\n * Pre-wired LLM client options — `rawSink` and `traceContext` are populated\n * so any `callLlm(req, ctx.llmOpts)` automatically captures raw HTTP. The\n * runner can spread additional fields if needed.\n */\n llmOpts: LlmClientOptions\n}\n\nexport interface CampaignRunOutcome {\n /** Did the run pass? Mirrors `RunOutcome.pass` semantics. */\n pass: boolean\n /** Score for the run on its split. Maps to `searchScore` or `holdoutScore`. */\n score: number\n /** Mandatory cost in USD. Use 0 + raw.cost_unknown=1 only if truly unknown. */\n costUsd: number\n tokenUsage: RunTokenUsage\n /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */\n model: string\n /** sha256 of the effective prompt sent to the model. */\n promptHash: string\n /** sha256 of the effective config (model, temperature, tools, judges, splits). */\n configHash: string\n /** Optional extra numeric metrics to land in `outcome.raw`. */\n raw?: Record<string, number>\n /** Optional failure-taxonomy tag if the run failed. */\n failureMode?: string\n /** Optional judge metadata when a judge was used. */\n judgeMetadata?: RunJudgeMetadata\n}\n\nexport type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>\n\nexport type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log'\n\nexport interface EvalCampaignOptions<V> {\n /**\n * Stable id for the campaign. Used as the default `experimentId` on\n * every run, and folded into the campaign fingerprint.\n */\n campaignId: string\n variants: CampaignVariant<V>[]\n scenarios: CampaignScenario[]\n /** Default `[0, 1, 2]`. */\n seeds?: number[]\n /** Default `'holdout'` — the split that anchors a launch decision. */\n splitTag?: RunSplitTag\n /** Git SHA the campaign is run against. Mandatory; `RunRecord` rejects unset. */\n commitSha: string\n /**\n * LLM client config. Augmented per-run with `rawSink` and `traceContext`\n * before being passed to the runner. The campaign asserts this config\n * matches `routeRequirements` once at preflight.\n */\n llmOpts: LlmClientOptions\n /**\n * Default `{ requireExplicitBaseUrl: true, requireAuth: true }` — fail\n * loud if the campaign would silently fall back to the public router or\n * run unauthenticated. Override with an empty object to disable.\n */\n routeRequirements?: LlmRouteRequirements\n /**\n * Per-run TraceStore factory. Common shape: a fresh store per run keyed\n * on `runId`. Implementations that share a store across the campaign\n * are valid — the campaign only writes through `emitter`.\n */\n storeFactory: (params: CampaignFactoryParams) => TraceStore\n /**\n * Per-run RawProviderSink factory. Defaults to `FileSystemRawProviderSink`\n * rooted at `${workDir}/raw-events/${runId}` if `workDir` is supplied;\n * otherwise required. Forensic capture is non-negotiable in a campaign\n * run — pass `NoopRawProviderSink` explicitly if you want to opt out.\n */\n rawSinkFactory?: (params: CampaignFactoryParams) => RawProviderSink\n /**\n * Filesystem root for default `rawSinkFactory`. Ignored if\n * `rawSinkFactory` is supplied.\n */\n workDir?: string\n /**\n * Extra `onRunComplete` hooks the campaign appends (after its own\n * integrity-check hook). Pass `traceAnalystOnRunComplete(...)` here.\n */\n onRunComplete?: RunCompleteHook[]\n /**\n * Per-run integrity expectations. Defaults to:\n * `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }`.\n * Override (e.g. `{ llmSpansMin: 0 }`) for runs that don't call LLMs.\n */\n integrity?: RunIntegrityExpectations\n /** Behaviour when integrity fails. Default `'mark_failed'`. */\n onIntegrityFailure?: CampaignIntegrityPolicy\n /**\n * Per-run runner. Receives a fully-wired context; produces an outcome\n * the campaign converts into a `RunRecord`.\n */\n runner: CampaignRunner<V>\n /**\n * If set, the campaign computes `researchReport` at the end. `comparator`\n * is a `variantId`. Other fields are forwarded verbatim.\n */\n report?: { comparator?: string } & Omit<\n ResearchReportOptions,\n 'comparator' | 'preregistrationHash' | 'generatedAt'\n >\n /**\n * Hash of a signed `HypothesisManifest` (see `pre-registration.ts`).\n * Embedded in the campaign fingerprint and the research report.\n */\n preregistrationHash?: string\n /** Local concurrency. Default `1` (sequential). */\n concurrency?: number\n /**\n * Override the time source. Tests pass a mock to make wallMs deterministic.\n */\n now?: () => number\n /** Override the runId generator. Tests pin this. */\n runId?: (params: CampaignFactoryParams) => string\n}\n\nexport interface CampaignFactoryParams {\n campaignId: string\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n}\n\nexport interface FailedRun {\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n reason: string\n error?: string\n}\n\nexport interface EvalCampaignResult {\n campaignId: string\n /** SHA-256 over canonicalised `(variantIds, scenarioIds, seeds, comparator, splitTag, baseUrl, provider, preregistrationHash)`. */\n campaignFingerprint: string\n preregistrationHash: string | null\n /** Successful runs only. Failed runs land in `failedRuns`. */\n runs: RunRecord[]\n /** Integrity reports for every successful run. */\n integrityReports: RunIntegrityReport[]\n failedRuns: FailedRun[]\n /** Computed when `report` is set on options. */\n report?: ResearchReport\n startedAt: string\n endedAt: string\n}\n\n// ── Implementation ───────────────────────────────────────────────────────\n\nconst DEFAULT_INTEGRITY: RunIntegrityExpectations = {\n llmSpansMin: 1,\n requireRawCoverageOfLlmSpans: true,\n requireOutcome: true,\n}\n\nconst DEFAULT_ROUTE: LlmRouteRequirements = {\n requireExplicitBaseUrl: true,\n requireAuth: true,\n}\n\nexport async function runEvalCampaign<V>(\n opts: EvalCampaignOptions<V>,\n): Promise<EvalCampaignResult> {\n // ── Preflight ──────────────────────────────────────────────────────\n assertLlmRoute(opts.llmOpts, opts.routeRequirements ?? DEFAULT_ROUTE)\n\n if (opts.variants.length === 0) {\n throw new Error('runEvalCampaign: variants must be non-empty.')\n }\n if (opts.scenarios.length === 0) {\n throw new Error('runEvalCampaign: scenarios must be non-empty.')\n }\n const variantIds = new Set<string>()\n for (const v of opts.variants) {\n if (variantIds.has(v.id)) {\n throw new Error(`runEvalCampaign: duplicate variant id \"${v.id}\".`)\n }\n variantIds.add(v.id)\n }\n const scenarioIds = new Set<string>()\n for (const s of opts.scenarios) {\n if (scenarioIds.has(s.scenarioId)) {\n throw new Error(`runEvalCampaign: duplicate scenarioId \"${s.scenarioId}\".`)\n }\n scenarioIds.add(s.scenarioId)\n }\n if (opts.report?.comparator && !variantIds.has(opts.report.comparator)) {\n throw new Error(\n `runEvalCampaign: report.comparator \"${opts.report.comparator}\" is not a configured variantId.`,\n )\n }\n if (!opts.commitSha) {\n throw new Error('runEvalCampaign: commitSha is required (every RunRecord needs it).')\n }\n\n const seeds = opts.seeds ?? [0, 1, 2]\n const splitTag: RunSplitTag = opts.splitTag ?? 'holdout'\n const concurrency = Math.max(1, opts.concurrency ?? 1)\n const integrity = { ...DEFAULT_INTEGRITY, ...(opts.integrity ?? {}) }\n const onIntegrityFailure: CampaignIntegrityPolicy = opts.onIntegrityFailure ?? 'mark_failed'\n const now = opts.now ?? (() => Date.now())\n const baseUrl = (opts.llmOpts.baseUrl ?? '').replace(/\\/+$/, '')\n const provider = opts.llmOpts.provider ?? null\n const preregistrationHash = opts.preregistrationHash ?? null\n\n const rawSinkFactory = opts.rawSinkFactory ?? defaultRawSinkFactory(opts.workDir)\n\n // ── Fingerprint ────────────────────────────────────────────────────\n const campaignFingerprint = await hashJson(\n canonicalize({\n campaignId: opts.campaignId,\n variants: opts.variants.map((v) => v.id).sort(),\n scenarios: opts.scenarios.map((s) => s.scenarioId).sort(),\n seeds: [...seeds].sort((a, b) => a - b),\n splitTag,\n comparator: opts.report?.comparator ?? null,\n baseUrl,\n provider,\n preregistrationHash,\n }),\n )\n\n // ── Plan the matrix ────────────────────────────────────────────────\n type Cell = { variant: CampaignVariant<V>; scenario: CampaignScenario; seed: number }\n const cells: Cell[] = []\n for (const variant of opts.variants) {\n for (const scenario of opts.scenarios) {\n for (const seed of seeds) {\n cells.push({ variant, scenario, seed })\n }\n }\n }\n\n const startedAt = new Date(now()).toISOString()\n const runs: RunRecord[] = []\n const integrityReports: RunIntegrityReport[] = []\n const failedRuns: FailedRun[] = []\n\n // ── Execute (bounded-concurrency worker pool) ──────────────────────\n let cursor = 0\n async function worker(): Promise<void> {\n while (true) {\n const i = cursor++\n if (i >= cells.length) return\n const cell = cells[i]!\n try {\n const result = await runOneCell(cell)\n runs.push(result.record)\n integrityReports.push(result.integrity)\n } catch (err) {\n if (err instanceof CellExecutionError) {\n failedRuns.push(err.failed)\n if (err.integrity) integrityReports.push(err.integrity)\n } else {\n // Genuine bug — not a runner failure, not an integrity failure.\n // Surface it; don't silently mask.\n throw err\n }\n }\n }\n }\n\n async function runOneCell(\n cell: Cell,\n ): Promise<{ record: RunRecord; integrity: RunIntegrityReport }> {\n const runId = (opts.runId ?? defaultRunId)({\n campaignId: opts.campaignId,\n runId: '', // unused by default generator\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n })\n const factoryParams: CampaignFactoryParams = {\n campaignId: opts.campaignId,\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n }\n const store = opts.storeFactory(factoryParams)\n const rawSink = rawSinkFactory(factoryParams)\n\n const emitter = new TraceEmitter(store, {\n runId,\n now: opts.now,\n onRunComplete: opts.onRunComplete,\n })\n\n const llmOpts: LlmClientOptions = {\n ...opts.llmOpts,\n rawSink,\n traceContext: { runId },\n }\n\n const ctx: CampaignRunContext<V> = {\n runId,\n experimentId: opts.campaignId,\n variant: cell.variant.payload,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n scenarioTags: cell.scenario.tags ?? {},\n seed: cell.seed,\n splitTag,\n emitter,\n store,\n rawSink,\n llmOpts,\n }\n\n const wallStart = now()\n let outcome: CampaignRunOutcome\n try {\n outcome = await opts.runner(ctx)\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err)\n // The runner threw mid-execution; give it a chance to have aborted.\n try {\n await emitter.abortRun(message)\n } catch {\n // Already aborted/ended; ignore.\n }\n throw new CellExecutionError({\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'runner_threw',\n error: message,\n })\n }\n const wallMs = now() - wallStart\n\n const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink })\n if (!integrityReport.ok) {\n switch (onIntegrityFailure) {\n case 'throw':\n throw new RunIntegrityError(integrityReport)\n case 'mark_failed':\n throw new CellExecutionError(\n {\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'integrity_failed',\n error: integrityReport.issues.map((i) => i.code).join(', '),\n },\n integrityReport,\n )\n case 'log':\n // Caller wants the run admitted with a flagged report; fall through.\n break\n }\n }\n\n const recordOutcome: RunOutcome = {\n raw: outcome.raw ?? {},\n }\n if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score\n else recordOutcome.searchScore = outcome.score\n\n const record: RunRecord = {\n runId,\n experimentId: opts.campaignId,\n candidateId: cell.variant.id,\n seed: cell.seed,\n model: outcome.model,\n promptHash: outcome.promptHash,\n configHash: outcome.configHash,\n commitSha: opts.commitSha,\n wallMs,\n costUsd: outcome.costUsd,\n tokenUsage: outcome.tokenUsage,\n judgeMetadata: outcome.judgeMetadata,\n outcome: recordOutcome,\n failureMode: outcome.failureMode,\n splitTag,\n scenarioId: cell.scenario.scenarioId,\n }\n return { record, integrity: integrityReport }\n }\n\n const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker())\n await Promise.all(workers)\n\n // ── Optional research report ───────────────────────────────────────\n let report: ResearchReport | undefined\n if (opts.report) {\n const reportOpts: ResearchReportOptions = {\n ...opts.report,\n comparator: opts.report.comparator,\n split: splitTag === 'dev' ? 'search' : splitTag,\n generatedAt: new Date(now()).toISOString(),\n preregistrationHash: preregistrationHash ?? undefined,\n }\n report = await researchReport(runs, reportOpts)\n }\n\n const endedAt = new Date(now()).toISOString()\n\n return {\n campaignId: opts.campaignId,\n campaignFingerprint,\n preregistrationHash,\n runs,\n integrityReports,\n failedRuns,\n report,\n startedAt,\n endedAt,\n }\n}\n\n// ── Internal ─────────────────────────────────────────────────────────────\n\nclass CellExecutionError extends Error {\n readonly failed: FailedRun\n readonly integrity?: RunIntegrityReport\n constructor(failed: FailedRun, integrity?: RunIntegrityReport) {\n super(`cell ${failed.variantId}/${failed.scenarioId}@${failed.seed} failed: ${failed.reason}`)\n this.failed = failed\n this.integrity = integrity\n }\n}\n\nfunction defaultRawSinkFactory(workDir: string | undefined) {\n return (params: CampaignFactoryParams): RawProviderSink => {\n if (!workDir) {\n throw new Error(\n 'runEvalCampaign: rawSinkFactory not supplied and workDir not set. Pass either to enable raw provider capture, or pass `new NoopRawProviderSink()` via rawSinkFactory to opt out explicitly.',\n )\n }\n return new FileSystemRawProviderSink({\n dir: `${workDir}/raw-events/${params.runId}`,\n })\n }\n}\n\nfunction defaultRunId(params: CampaignFactoryParams): string {\n // Stable across re-runs: fingerprint of (campaignId, variantId, scenarioId, seed).\n // Caller can override via opts.runId for non-deterministic IDs.\n const base = `${params.campaignId}::${params.variantId}::${params.scenarioId}::${params.seed}`\n // Lightweight hex: we don't need crypto-grade here, just stability + uniqueness.\n let h1 = 0x811c9dc5\n let h2 = 0x12345678\n for (let i = 0; i < base.length; i++) {\n const c = base.charCodeAt(i)\n h1 = Math.imul(h1 ^ c, 0x01000193) >>> 0\n h2 = Math.imul(h2 ^ c, 0x9e3779b1) >>> 0\n }\n return `run-${h1.toString(16).padStart(8, '0')}${h2.toString(16).padStart(8, '0')}`\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;AAwPA,IAAM,oBAA8C;AAAA,EAClD,aAAa;AAAA,EACb,8BAA8B;AAAA,EAC9B,gBAAgB;AAClB;AAEA,IAAM,gBAAsC;AAAA,EAC1C,wBAAwB;AAAA,EACxB,aAAa;AACf;AAEA,eAAsB,gBACpB,MAC6B;AAE7B,iBAAe,KAAK,SAAS,KAAK,qBAAqB,aAAa;AAEpE,MAAI,KAAK,SAAS,WAAW,GAAG;AAC9B,UAAM,IAAI,MAAM,8CAA8C;AAAA,EAChE;AACA,MAAI,KAAK,UAAU,WAAW,GAAG;AAC/B,UAAM,IAAI,MAAM,+CAA+C;AAAA,EACjE;AACA,QAAM,aAAa,oBAAI,IAAY;AACnC,aAAW,KAAK,KAAK,UAAU;AAC7B,QAAI,WAAW,IAAI,EAAE,EAAE,GAAG;AACxB,YAAM,IAAI,MAAM,0CAA0C,EAAE,EAAE,IAAI;AAAA,IACpE;AACA,eAAW,IAAI,EAAE,EAAE;AAAA,EACrB;AACA,QAAM,cAAc,oBAAI,IAAY;AACpC,aAAW,KAAK,KAAK,WAAW;AAC9B,QAAI,YAAY,IAAI,EAAE,UAAU,GAAG;AACjC,YAAM,IAAI,MAAM,0CAA0C,EAAE,UAAU,IAAI;AAAA,IAC5E;AACA,gBAAY,IAAI,EAAE,UAAU;AAAA,EAC9B;AACA,MAAI,KAAK,QAAQ,cAAc,CAAC,WAAW,IAAI,KAAK,OAAO,UAAU,GAAG;AACtE,UAAM,IAAI;AAAA,MACR,uCAAuC,KAAK,OAAO,UAAU;AAAA,IAC/D;AAAA,EACF;AACA,MAAI,CAAC,KAAK,WAAW;AACnB,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AAEA,QAAM,QAAQ,KAAK,SAAS,CAAC,GAAG,GAAG,CAAC;AACpC,QAAM,WAAwB,KAAK,YAAY;AAC/C,QAAM,cAAc,KAAK,IAAI,GAAG,KAAK,eAAe,CAAC;AACrD,QAAM,YAAY,EAAE,GAAG,mBAAmB,GAAI,KAAK,aAAa,CAAC,EAAG;AACpE,QAAM,qBAA8C,KAAK,sBAAsB;AAC/E,QAAM,MAAM,KAAK,QAAQ,MAAM,KAAK,IAAI;AACxC,QAAM,WAAW,KAAK,QAAQ,WAAW,IAAI,QAAQ,QAAQ,EAAE;AAC/D,QAAM,WAAW,KAAK,QAAQ,YAAY;AAC1C,QAAM,sBAAsB,KAAK,uBAAuB;AAExD,QAAM,iBAAiB,KAAK,kBAAkB,sBAAsB,KAAK,OAAO;AAGhF,QAAM,sBAAsB,MAAM;AAAA,IAChC,aAAa;AAAA,MACX,YAAY,KAAK;AAAA,MACjB,UAAU,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,KAAK;AAAA,MAC9C,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,KAAK;AAAA,MACxD,OAAO,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAAA,MACtC;AAAA,MACA,YAAY,KAAK,QAAQ,cAAc;AAAA,MACvC;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAIA,QAAM,QAAgB,CAAC;AACvB,aAAW,WAAW,KAAK,UAAU;AACnC,eAAW,YAAY,KAAK,WAAW;AACrC,iBAAW,QAAQ,OAAO;AACxB,cAAM,KAAK,EAAE,SAAS,UAAU,KAAK,CAAC;AAAA,MACxC;AAAA,IACF;AAAA,EACF;AAEA,QAAM,YAAY,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAC9C,QAAM,OAAoB,CAAC;AAC3B,QAAM,mBAAyC,CAAC;AAChD,QAAM,aAA0B,CAAC;AAGjC,MAAI,SAAS;AACb,iBAAe,SAAwB;AACrC,WAAO,MAAM;AACX,YAAM,IAAI;AACV,UAAI,KAAK,MAAM,OAAQ;AACvB,YAAM,OAAO,MAAM,CAAC;AACpB,UAAI;AACF,cAAM,SAAS,MAAM,WAAW,IAAI;AACpC,aAAK,KAAK,OAAO,MAAM;AACvB,yBAAiB,KAAK,OAAO,SAAS;AAAA,MACxC,SAAS,KAAK;AACZ,YAAI,eAAe,oBAAoB;AACrC,qBAAW,KAAK,IAAI,MAAM;AAC1B,cAAI,IAAI,UAAW,kBAAiB,KAAK,IAAI,SAAS;AAAA,QACxD,OAAO;AAGL,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,iBAAe,WACb,MAC+D;AAC/D,UAAM,SAAS,KAAK,SAAS,cAAc;AAAA,MACzC,YAAY,KAAK;AAAA,MACjB,OAAO;AAAA;AAAA,MACP,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb,CAAC;AACD,UAAM,gBAAuC;AAAA,MAC3C,YAAY,KAAK;AAAA,MACjB;AAAA,MACA,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb;AACA,UAAM,QAAQ,KAAK,aAAa,aAAa;AAC7C,UAAM,UAAU,eAAe,aAAa;AAE5C,UAAM,UAAU,IAAI,aAAa,OAAO;AAAA,MACtC;AAAA,MACA,KAAK,KAAK;AAAA,MACV,eAAe,KAAK;AAAA,IACtB,CAAC;AAED,UAAM,UAA4B;AAAA,MAChC,GAAG,KAAK;AAAA,MACR;AAAA,MACA,cAAc,EAAE,MAAM;AAAA,IACxB;AAEA,UAAM,MAA6B;AAAA,MACjC;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,SAAS,KAAK,QAAQ;AAAA,MACtB,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,cAAc,KAAK,SAAS,QAAQ,CAAC;AAAA,MACrC,MAAM,KAAK;AAAA,MACX;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAEA,UAAM,YAAY,IAAI;AACtB,QAAI;AACJ,QAAI;AACF,gBAAU,MAAM,KAAK,OAAO,GAAG;AAAA,IACjC,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE/D,UAAI;AACF,cAAM,QAAQ,SAAS,OAAO;AAAA,MAChC,QAAQ;AAAA,MAER;AACA,YAAM,IAAI,mBAAmB;AAAA,QAC3B;AAAA,QACA,WAAW,KAAK,QAAQ;AAAA,QACxB,YAAY,KAAK,SAAS;AAAA,QAC1B,MAAM,KAAK;AAAA,QACX,QAAQ;AAAA,QACR,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AACA,UAAM,SAAS,IAAI,IAAI;AAEvB,UAAM,kBAAkB,MAAM,kBAAkB,OAAO,OAAO,EAAE,GAAG,WAAW,QAAQ,CAAC;AACvF,QAAI,CAAC,gBAAgB,IAAI;AACvB,cAAQ,oBAAoB;AAAA,QAC1B,KAAK;AACH,gBAAM,IAAI,kBAAkB,eAAe;AAAA,QAC7C,KAAK;AACH,gBAAM,IAAI;AAAA,YACR;AAAA,cACE;AAAA,cACA,WAAW,KAAK,QAAQ;AAAA,cACxB,YAAY,KAAK,SAAS;AAAA,cAC1B,MAAM,KAAK;AAAA,cACX,QAAQ;AAAA,cACR,OAAO,gBAAgB,OAAO,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,IAAI;AAAA,YAC5D;AAAA,YACA;AAAA,UACF;AAAA,QACF,KAAK;AAEH;AAAA,MACJ;AAAA,IACF;AAEA,UAAM,gBAA4B;AAAA,MAChC,KAAK,QAAQ,OAAO,CAAC;AAAA,IACvB;AACA,QAAI,aAAa,UAAW,eAAc,eAAe,QAAQ;AAAA,QAC5D,eAAc,cAAc,QAAQ;AAEzC,UAAM,SAAoB;AAAA,MACxB;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,aAAa,KAAK,QAAQ;AAAA,MAC1B,MAAM,KAAK;AAAA,MACX,OAAO,QAAQ;AAAA,MACf,YAAY,QAAQ;AAAA,MACpB,YAAY,QAAQ;AAAA,MACpB,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,SAAS,QAAQ;AAAA,MACjB,YAAY,QAAQ;AAAA,MACpB,eAAe,QAAQ;AAAA,MACvB,SAAS;AAAA,MACT,aAAa,QAAQ;AAAA,MACrB;AAAA,MACA,YAAY,KAAK,SAAS;AAAA,IAC5B;AACA,WAAO,EAAE,QAAQ,WAAW,gBAAgB;AAAA,EAC9C;AAEA,QAAM,UAAU,MAAM,KAAK,EAAE,QAAQ,KAAK,IAAI,aAAa,MAAM,MAAM,EAAE,GAAG,MAAM,OAAO,CAAC;AAC1F,QAAM,QAAQ,IAAI,OAAO;AAGzB,MAAI;AACJ,MAAI,KAAK,QAAQ;AACf,UAAM,aAAoC;AAAA,MACxC,GAAG,KAAK;AAAA,MACR,YAAY,KAAK,OAAO;AAAA,MACxB,OAAO,aAAa,QAAQ,WAAW;AAAA,MACvC,aAAa,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAAA,MACzC,qBAAqB,uBAAuB;AAAA,IAC9C;AACA,aAAS,MAAM,eAAe,MAAM,UAAU;AAAA,EAChD;AAEA,QAAM,UAAU,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAE5C,SAAO;AAAA,IACL,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAIA,IAAM,qBAAN,cAAiC,MAAM;AAAA,EAC5B;AAAA,EACA;AAAA,EACT,YAAY,QAAmB,WAAgC;AAC7D,UAAM,QAAQ,OAAO,SAAS,IAAI,OAAO,UAAU,IAAI,OAAO,IAAI,YAAY,OAAO,MAAM,EAAE;AAC7F,SAAK,SAAS;AACd,SAAK,YAAY;AAAA,EACnB;AACF;AAEA,SAAS,sBAAsB,SAA6B;AAC1D,SAAO,CAAC,WAAmD;AACzD,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,WAAO,IAAI,0BAA0B;AAAA,MACnC,KAAK,GAAG,OAAO,eAAe,OAAO,KAAK;AAAA,IAC5C,CAAC;AAAA,EACH;AACF;AAEA,SAAS,aAAa,QAAuC;AAG3D,QAAM,OAAO,GAAG,OAAO,UAAU,KAAK,OAAO,SAAS,KAAK,OAAO,UAAU,KAAK,OAAO,IAAI;AAE5F,MAAI,KAAK;AACT,MAAI,KAAK;AACT,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,WAAW,CAAC;AAC3B,SAAK,KAAK,KAAK,KAAK,GAAG,QAAU,MAAM;AACvC,SAAK,KAAK,KAAK,KAAK,GAAG,UAAU,MAAM;AAAA,EACzC;AACA,SAAO,OAAO,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC,GAAG,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC;AACnF;","names":[]}