@tangle-network/agent-eval 0.27.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. package/CHANGELOG.md +87 -0
  2. package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/chunk-UW4NOOZI.js +1561 -0
  5. package/dist/chunk-UW4NOOZI.js.map +1 -0
  6. package/dist/{control-BT4qnXiS.d.ts → control-rJhEDdpy.d.ts} +4 -4
  7. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-BRdQ0wrx.d.ts} +2 -2
  8. package/dist/control.d.ts +5 -5
  9. package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
  10. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-D1NZKqYu.d.ts} +1 -1
  11. package/dist/{feedback-trajectory-D1aGKusy.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
  12. package/dist/governance/index.d.ts +2 -2
  13. package/dist/{index-BhLlu-qO.d.ts → index-Cgt3DKXr.d.ts} +1 -1
  14. package/dist/index.d.ts +1190 -335
  15. package/dist/index.js +1580 -489
  16. package/dist/index.js.map +1 -1
  17. package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
  18. package/dist/knowledge/index.d.ts +3 -3
  19. package/dist/meta-eval/index.d.ts +1 -1
  20. package/dist/{multi-layer-verifier-U-c8ge1k.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +1 -1
  21. package/dist/openapi.json +1 -1
  22. package/dist/optimization.d.ts +8 -8
  23. package/dist/pipelines/index.d.ts +6 -6
  24. package/dist/prm/index.d.ts +4 -4
  25. package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
  26. package/dist/{release-report-CCQqnK46.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
  27. package/dist/replay-BX5Fm8en.d.ts +529 -0
  28. package/dist/reporting.d.ts +4 -4
  29. package/dist/{researcher-G81CWc0q.d.ts → researcher-ClDX3KZx.d.ts} +5 -5
  30. package/dist/rl.d.ts +8 -8
  31. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
  32. package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
  33. package/dist/{summary-report-Dl4akLKX.d.ts → summary-report-jrSGb2xZ.d.ts} +1 -1
  34. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
  35. package/dist/traces.d.ts +9 -311
  36. package/dist/traces.js +15 -986
  37. package/dist/traces.js.map +1 -1
  38. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
  39. package/dist/wire/index.d.ts +4 -4
  40. package/package.json +1 -1
  41. package/dist/chunk-4U4BKCXK.js +0 -569
  42. package/dist/chunk-4U4BKCXK.js.map +0 -1
  43. package/dist/replay-D7z0J43-.d.ts +0 -225
package/dist/index.js CHANGED
@@ -169,12 +169,15 @@ import {
169
169
  REDACTION_VERSION,
170
170
  ReplayCache,
171
171
  ReplayCacheMissError,
172
+ TraceFileMissingError,
173
+ analyzeTraces,
174
+ buildTraceAnalystTools,
172
175
  createReplayFetch,
173
176
  exportRunAsOtlp,
174
177
  iterateRawCalls,
175
178
  redactString,
176
179
  redactValue
177
- } from "./chunk-4U4BKCXK.js";
180
+ } from "./chunk-UW4NOOZI.js";
178
181
  import {
179
182
  aggregateLlm,
180
183
  argHash,
@@ -229,6 +232,1492 @@ import {
229
232
  } from "./chunk-NG236HPC.js";
230
233
  import "./chunk-PZ5AY32C.js";
231
234
 
235
+ // src/run-score.ts
236
/**
 * Default weight for every run-score dimension.
 *
 * Positive weights reward a dimension, negative weights penalise it.
 */
var DEFAULT_RUN_SCORE_WEIGHTS = {
  success: 4,
  goalProgress: 2,
  repoGroundedness: 1.5,
  driftPenalty: -1.5,
  toolUseQuality: 1,
  patchQuality: 1.25,
  testReality: 1.5,
  finalGate: 3,
  reviewerBlockers: -2,
  costUsd: -0.2,
  wallSeconds: -0.1
};

/**
 * Collapse a per-dimension run score into a single weighted scalar.
 *
 * Quality dimensions are clamped to [0, 1] before weighting. `costUsd` is
 * weighted as-is (floored at 0) and `wallSeconds` is converted to minutes
 * before its weight is applied.
 *
 * @param {object} score - Per-dimension score (missing / non-finite fields count as 0).
 * @param {object} [weights] - Partial overrides for DEFAULT_RUN_SCORE_WEIGHTS.
 *   Overrides that are not finite numbers (e.g. an explicit `undefined`) are
 *   ignored so they cannot turn the aggregate into NaN.
 * @returns {number} Weighted sum of all dimensions.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS };
  for (const [dimension, weight] of Object.entries(weights ?? {})) {
    // Only a usable number may replace a default weight.
    if (Number.isFinite(weight)) w[dimension] = weight;
  }
  return (
    w.success * clamp01(score.success) +
    w.goalProgress * clamp01(score.goalProgress) +
    w.repoGroundedness * clamp01(score.repoGroundedness) +
    w.driftPenalty * clamp01(score.driftPenalty) +
    w.toolUseQuality * clamp01(score.toolUseQuality) +
    w.patchQuality * clamp01(score.patchQuality) +
    w.testReality * clamp01(score.testReality) +
    w.finalGate * clamp01(score.finalGate) +
    w.reviewerBlockers * clamp01(score.reviewerBlockers) +
    w.costUsd * Math.max(0, finiteOrZero(score.costUsd)) +
    w.wallSeconds * Math.max(0, finiteOrZero(score.wallSeconds) / 60)
  );
}

/**
 * Clamp a value into [0, 1]; anything that is not a finite number becomes 0.
 *
 * @param {*} value
 * @returns {number}
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return Math.max(0, Math.min(1, value));
}

/**
 * Return `value` when it is a finite number, otherwise 0.
 *
 * @param {*} value
 * @returns {number}
 */
function finiteOrZero(value) {
  return Number.isFinite(value) ? value : 0;
}
260
+
261
+ // src/run-critic.ts
262
/** Patterns whose presence in LLM output or event payloads counts as a drift signal. */
var DEFAULT_DRIFT_PATTERNS = [
  /https?:\/\//i,
  /\btitle:\s/i,
  /\bsummary:\s/i,
  /\burl:\s/i,
  /\bnpm package usage\b/i,
  /\bnews\b/i
];

/**
 * Scores a single run from its recorded spans, events, artifacts and budget.
 *
 * Options:
 *  - `weights`: partial weight overrides forwarded to aggregateRunScore() by rank().
 *  - `driftPatterns`: regexes that replace DEFAULT_DRIFT_PATTERNS.
 */
var RunCritic = class {
  weights;
  driftPatterns;
  constructor(options = {}) {
    this.weights = options.weights;
    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
  }

  /**
   * Load everything recorded for `runId` from `store` and score it.
   *
   * @throws NotFoundError when the store has no such run.
   */
  async score(store, runId) {
    const run = await store.getRun(runId);
    if (!run) throw new NotFoundError(`run ${runId} not found`);
    const [spans, events, artifacts, budget] = await Promise.all([
      store.spans({ runId }),
      store.events({ runId }),
      store.artifacts(runId),
      store.budget(runId)
    ]);
    return this.scoreTrace({ run, spans, events, artifacts, budget });
  }

  /**
   * Score an already-loaded trace.
   *
   * @param {{run: object, spans: object[], events: object[], artifacts: object[], budget: object[]}} trace
   * @returns {object} One value per score dimension plus human-readable `notes`.
   */
  scoreTrace(trace) {
    const notes = [];
    const llmSpans2 = trace.spans.filter((s) => s.kind === "llm");
    const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
    const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
    const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
    const finalGateSpans = judgeSpans2.filter(
      (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
    );

    // 1 = passed, 0.5 = completed without an explicit pass, 0 = anything else.
    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
    if (!success) notes.push("run did not complete with pass=true");

    // Goal progress: explicit outcome score first, then mean judge score, then success.
    const judgeAverage = judgeSpans2.length
      ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length
      : void 0;
    // Outcome scores above 1 are read as percentages.
    const outcomeScore = typeof trace.run.outcome?.score === "number"
      ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score)
      : void 0;
    const goalProgress = outcomeScore ?? judgeAverage ?? success;

    const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
    const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
    if (toolSpans2.length === 0) notes.push("no tool spans recorded");

    // Patch evidence = artifacts plus tool calls whose name looks like an edit; saturates at 4.
    const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
    if (!patchQuality) notes.push("no artifact or edit evidence recorded");

    // Test reality: mean pass ratio over sandbox spans that ran tests; otherwise a
    // flat 0.4 when any tool call's arguments mention a test/build command.
    const sandboxTests = sandboxSpans.filter(
      (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
    );
    // NOTE(review): in the regex below `\b` binds only to the first and last
    // alternative (`\btest` ... `tsc\b`); confirm whether whole-word matching
    // of every alternative was intended before tightening it.
    const testReality = sandboxTests.length
      ? sandboxTests.reduce(
          (sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
          0
        ) / sandboxTests.length
      : toolSpans2.some(
          (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
        ) ? 0.4 : 0;
    if (!testReality) notes.push("no real test/build evidence recorded");

    const blockerSpans = judgeSpans2.filter((span) => isBlockingJudge(span));
    const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
    // Without any final-gate judgment the gate falls back to the success value.
    const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
    if (finalGateBlockers.length)
      notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
    else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
    const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
    if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);

    // Groundedness and drift are complementary shares of the same signal pool.
    const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
    const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
    const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
    const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);

    // NOTE(review): when budget entries exist but none has dimension "usd" the
    // cost is 0 and LLM span costs are ignored — confirm that is intended.
    const costUsd = trace.budget.length
      ? Math.max(
          ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
          0
        )
      : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
    const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;

    return {
      success,
      goalProgress,
      repoGroundedness,
      driftPenalty,
      toolUseQuality,
      patchQuality,
      testReality,
      finalGate,
      reviewerBlockers,
      costUsd,
      wallSeconds,
      notes
    };
  }

  /** Collapse a score produced by scoreTrace() into one number using this critic's weights. */
  rank(score) {
    return aggregateRunScore(score, this.weights);
  }

  /**
   * True when any drift pattern matches `text`.
   *
   * `lastIndex` is reset before every test: a caller-supplied pattern with the
   * `g` or `y` flag is stateful across `.test()` calls and would otherwise
   * give alternating answers for the same input.
   */
  isDrift(text) {
    return this.driftPatterns.some((pattern) => {
      pattern.lastIndex = 0;
      return pattern.test(text);
    });
  }
};
368
/**
 * Map a judge score onto [0, 1].
 *
 * Scores above 1 are divided by 10 first; the result is then clamped by clamp01().
 */
function normalizeJudgeScore(score) {
  const scaled = score > 1 ? score / 10 : score;
  return clamp01(scaled);
}
371
/**
 * Heuristic: does `text` mention repository paths, config files, TypeScript
 * sources, or common package-manager / test-runner commands?
 *
 * @param {string} text
 * @returns {boolean}
 */
function looksRepoGrounded(text) {
  const repoSignal = /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i;
  return repoSignal.test(text);
}
376
/**
 * Decide whether a judge span is a blocking reviewer signal.
 *
 * A span blocks when its attributes say so (`blocking === true`,
 * `verdict === "BLOCKING"`, or a positive `blockingFindings` / `highFindings`
 * count) or when its score is low.
 *
 * "Low" follows the scale convention normalizeJudgeScore() uses in this file:
 * scores above 1 are on a 0-10 scale (blocking at <= 2), scores at or below 1
 * are fractions (blocking at <= 0.2). Comparing a fractional score directly
 * against 2 would flag every 0..1 score as blocking. A missing or NaN score is
 * never blocking on its own.
 *
 * @param {{score?: number, attributes?: object}} span
 * @returns {boolean}
 */
function isBlockingJudge(span) {
  const attrs = span.attributes;
  if (attrs?.blocking === true || attrs?.verdict === "BLOCKING") return true;
  if (positiveNumber(attrs?.blockingFindings) || positiveNumber(attrs?.highFindings)) return true;
  const score = span.score;
  if (typeof score !== "number" || Number.isNaN(score)) return false;
  return score > 1 ? score <= 2 : score <= 0.2;
}

/** True only for values of type number that are greater than zero. */
function positiveNumber(value) {
  return typeof value === "number" && value > 0;
}
382
+
383
+ // src/semantic-concept-judge.ts
384
/** Per-complexity concept weights used when concepts are weighted by "complexity". */
var DEFAULT_COMPLEXITY_WEIGHTS = { render: 1, integrate: 2, compute: 2.5 };

/** Version tag stamped on every semantic-concept judge result. */
var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";

// Default prompt-size caps (characters) and request defaults for the judge.
var DEFAULT_MAX_SOURCE = 45_000;
var DEFAULT_MAX_HTML = 30_000;
var DEFAULT_MAX_PER_FILE = 20_000;
var DEFAULT_TIMEOUT = 180_000;
var DEFAULT_MODEL = "claude-sonnet-4-6";

/** JSON schema the judge's structured response is requested against. */
var SEMANTIC_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["summary", "concepts"],
  properties: {
    summary: { type: "string", minLength: 20, maxLength: 600 },
    concepts: {
      type: "array",
      minItems: 1,
      items: {
        // One graded entry per expected concept.
        type: "object",
        additionalProperties: false,
        required: ["concept", "present", "score", "evidence", "severity"],
        properties: {
          concept: { type: "string", minLength: 1, maxLength: 120 },
          present: { type: "boolean" },
          score: { type: "number", minimum: 0, maximum: 10 },
          evidence: { type: "string", minLength: 5, maxLength: 400 },
          severity: { type: "string", enum: ["critical", "major", "minor", "info"] }
        }
      }
    }
  }
};
419
/**
 * Cap `body` at `cap` UTF-16 code units and append a marker stating how many
 * units of `label` were dropped. Bodies that already fit are returned unchanged.
 *
 * The cut never lands between the two halves of a surrogate pair: if the last
 * kept unit is a high surrogate the cut moves back by one, so the result never
 * ends in a lone surrogate (which would be mangled when the prompt is encoded).
 *
 * @param {string} body
 * @param {number} cap - Maximum number of code units to keep.
 * @param {string} label - Name of the content shown in the truncation marker.
 * @returns {string}
 */
function truncate(body, cap, label) {
  if (body.length <= cap) return body;
  let cut = cap;
  if (cut > 0) {
    const lastKept = body.charCodeAt(cut - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
  }
  return `${body.slice(0, cut)}\n\u2026 [truncated ${body.length - cut} chars of ${label}]`;
}
424
// Builds the user-message prompt for the semantic concept judge.
//
// - Source files are concatenated under "--- FILE: <path> ---" headers. Files whose
//   content is longer than opts.maxPerFileChars are filtered out entirely (they are
//   not truncated); the joined blob is then capped at opts.maxSourceChars via truncate().
// - input.servedHtml, when non-empty, is included capped at opts.maxHtmlChars.
// - The artifact metadata section is emitted only when input.artifactLabel is truthy.
// - Each expected concept is listed with up to 6 of its keywords as hints.
//
// Returns the prompt as a single string. The text below is runtime content sent
// to the model, so it must not be edited for style.
+ function buildPrompt(input, opts) {
425
+ const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
426
+ ${f.content}`).join("\n\n");
427
+ const html = input.servedHtml ?? "";
428
+ return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.
429
+
430
+ You MUST distinguish:
431
+ (a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
432
+ (b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
433
+ (c) ABSENT (concept nowhere).
434
+
435
+ A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.
436
+
437
+ USER REQUEST (what the agent was asked to build):
438
+ ${input.userRequest}
439
+
440
+ ${input.artifactLabel ? `ARTIFACT METADATA:
441
+ name: ${input.artifactLabel}
442
+ description: ${input.artifactDescription ?? ""}
443
+
444
+ ` : ""}EXPECTED CONCEPTS (each must be graded independently):
445
+ ${input.expectedConcepts.map(
446
+ (c, i) => ` ${i + 1}. "${c.name}"${c.keywords?.length ? ` \u2014 hints: [${c.keywords.slice(0, 6).join(" | ")}]` : ""}`
447
+ ).join("\n")}
448
+
449
+ ${html ? `SERVED HTML (what the preview returns when hit):
450
+ ${truncate(html, opts.maxHtmlChars, "HTML")}
451
+
452
+ ` : ""}SOURCE FILES (the agent's workdir):
453
+ ${truncate(sourceBlob, opts.maxSourceChars, "source")}
454
+
455
+ For EACH concept, return:
456
+ - concept: the concept name as given (match exactly)
457
+ - present: boolean \u2014 does a working implementation exist?
458
+ - score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
459
+ - evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
460
+ - severity:
461
+ "info" when present: true AND score >= 7
462
+ "minor" when present: true AND 4 <= score < 7
463
+ "major" when present: false OR score < 4
464
+ "critical" when the concept is not only absent but a core user flow depends on it
465
+
466
+ Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.
467
+
468
+ BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.
469
+
470
+ Return STRICT JSON. No prose outside the JSON.`;
471
+ }
472
/**
 * Ask an LLM judge whether each expected concept is really implemented and
 * fold its per-concept grades into one result.
 *
 * Never throws: when no concepts are declared, or when the LLM call or response
 * validation fails, the result has `available: false` and an `error` message.
 *
 * Scoring notes:
 *  - Each concept score is clamped to [0, 10]; a score the model returned in a
 *    non-numeric form counts as 0 instead of turning the average into NaN.
 *  - The overall `score` is the weighted mean of the returned findings divided
 *    by 10 and rounded to 3 decimals. Findings whose concept name matches no
 *    expected concept get weight 1.
 *  - `presentCount` counts findings with `present` true and score >= 7, while
 *    `totalCount` is the number of expected concepts.
 *
 * @param {object} input - userRequest, expectedConcepts, sourceFiles and optional
 *   servedHtml / artifact metadata (see buildPrompt()).
 * @param {object} [options] - model, timeoutMs, size caps, llm client options,
 *   weightConcepts ("mean" by default, or "complexity") and complexityWeights.
 * @returns {Promise<object>} The semantic-concept judge result.
 */
async function runSemanticConceptJudge(input, options = {}) {
  const start = Date.now();
  const totalCount = input.expectedConcepts.length;
  if (totalCount === 0) {
    // Nothing to grade: report the judge as unavailable without calling the LLM.
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      summary: "no expected concepts declared",
      durationMs: 0,
      costUsd: null,
      available: false,
      error: "no expected concepts declared"
    };
  }
  const opts = {
    model: options.model ?? DEFAULT_MODEL,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
    llm: options.llm ?? {},
    weightConcepts: options.weightConcepts ?? "mean",
    complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
  };
  // "mean" weights every concept equally; otherwise an explicit per-concept
  // weight wins, then the complexity table (when enabled), then 1.
  const weightForConcept = (spec) => {
    if (opts.weightConcepts === "mean") return 1;
    if (spec.weight != null) return spec.weight;
    if (opts.weightConcepts === "complexity") {
      return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
    }
    return 1;
  };
  const weightByName = new Map(
    input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
  );
  try {
    const { value, result } = await callLlmJson(
      {
        model: opts.model,
        messages: [
          {
            role: "system",
            content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
          },
          { role: "user", content: buildPrompt(input, opts) }
        ],
        jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
        temperature: 0,
        timeoutMs: opts.timeoutMs
      },
      opts.llm
    );
    if (!value?.concepts || !Array.isArray(value.concepts)) {
      throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
    }
    const findings = value.concepts.map((c) => {
      const rawScore = Number(c.score ?? 0);
      return {
        concept: String(c.concept),
        present: Boolean(c.present),
        // Math.max/min propagate NaN, so an unparseable score is mapped to 0 first.
        score: Number.isNaN(rawScore) ? 0 : Math.max(0, Math.min(10, rawScore)),
        evidence: String(c.evidence ?? ""),
        severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
      };
    });
    const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
    let weightSum = 0;
    let weightedScoreSum = 0;
    for (const f of findings) {
      const w = weightByName.get(f.concept) ?? 1;
      weightSum += w;
      weightedScoreSum += w * f.score;
    }
    // Fall back to the plain mean when the weights sum to zero or less.
    const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: Number((scoreAvg / 10).toFixed(3)),
      presentCount,
      totalCount,
      findings,
      summary: String(value.summary ?? ""),
      durationMs: Date.now() - start,
      costUsd: result.costUsd ?? null,
      available: true
    };
  } catch (err) {
    // Deliberate best-effort: surface the failure in the result instead of throwing.
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount,
      findings: [],
      summary: "",
      durationMs: Date.now() - start,
      costUsd: null,
      available: false,
      error: err instanceof Error ? err.message : String(err)
    };
  }
}
575
/**
 * Bind `options` once and return a judge function that takes only the input.
 *
 * @param {object} [options] - Forwarded unchanged to runSemanticConceptJudge().
 * @returns {(input: object) => Promise<object>}
 */
function createSemanticConceptJudge(options = {}) {
  return (input) => {
    return runSemanticConceptJudge(input, options);
  };
}
578
+
579
+ // src/analyst/types.ts
580
+ import { createHash } from "crypto";
581
/**
 * Derive a deterministic finding id from the analyst, area, subject and claim.
 *
 * The id is "f_" followed by the first 20 hex characters of a SHA-256 over a
 * JSON basis. `id_basis`, when present, replaces the claim in that basis; the
 * chosen text is passed through normalizeClaim() first.
 *
 * @param {{analyst_id: string, area: string, subject?: string, claim: string, id_basis?: string}} input
 * @returns {string}
 */
function computeFindingId(input) {
  const { analyst_id, area, subject, id_basis, claim } = input;
  const basis = JSON.stringify({
    a: analyst_id,
    r: area,
    s: subject ?? "",
    c: normalizeClaim(id_basis ?? claim)
  });
  const digest = createHash("sha256").update(basis).digest("hex");
  return `f_${digest.slice(0, 20)}`;
}
590
/**
 * Canonicalise a claim so that cosmetic variants hash to the same finding id:
 * lower-case it, collapse whitespace runs to single spaces, trim, and drop
 * trailing punctuation (. ! ? ; : ,).
 *
 * Trimming happens before the punctuation is stripped and trailing whitespace
 * is stripped together with it, so "Claim. " and "claim. !" normalise to the
 * same text as "claim." — stripping first would leave the period in place
 * whenever whitespace followed it.
 *
 * @param {string} c
 * @returns {string}
 */
function normalizeClaim(c) {
  return c.toLowerCase().replace(/\s+/g, " ").trim().replace(/[\s.!?;:,]+$/, "");
}
593
/**
 * Build a finding envelope from `init`.
 *
 * `id_basis` and `produced_at` are consumed here: the former only feeds the id
 * hash, the latter defaults to the current time as an ISO string. Every other
 * field of `init` is copied onto the result after the generated fields, so a
 * same-named field in `init` takes precedence.
 *
 * @param {object} init
 * @returns {object}
 */
function makeFinding(init) {
  const { id_basis: idBasis, produced_at: producedAt, ...fields } = init;
  const findingId = computeFindingId({
    analyst_id: fields.analyst_id,
    area: fields.area,
    subject: fields.subject,
    claim: fields.claim,
    id_basis: idBasis
  });
  const timestamp = producedAt ?? new Date().toISOString();
  return {
    schema_version: "1.0.0",
    finding_id: findingId,
    produced_at: timestamp,
    ...fields
  };
}
608
+
609
+ // src/analyst/adapters.ts
610
/** Revision suffix embedded in every adapter's `version` string. */
var ADAPTER_REV = "1";

/**
 * Translate a judge/verifier severity into the analyst severity vocabulary.
 *
 * critical -> critical, major -> high, minor -> medium, info -> info.
 * Values that are already analyst severities ("high", "medium", "low") pass
 * through unchanged. Anything else falls back to "info" — the same fallback
 * the semantic-concept judge applies to unrecognised severities — so a finding
 * is never emitted with an undefined severity.
 *
 * @param {string} s
 * @returns {"critical"|"high"|"medium"|"low"|"info"}
 */
function liftSeverity(s) {
  switch (s) {
    case "critical":
      return "critical";
    case "major":
      return "high";
    case "minor":
      return "medium";
    case "info":
      return "info";
    case "high":
    case "medium":
    case "low":
      return s;
    default:
      return "info";
  }
}
623
/**
 * Wrap the trace analyst as an analyst adapter.
 *
 * For every question in `opts.questions` the adapter runs analyzeTraces()
 * against the given store and lifts the result into findings:
 *  - no bulleted findings -> one "info" finding built from the answer
 *    (claim = first 200 characters, confidence 0.5);
 *  - otherwise one "medium" finding per bullet (confidence 0.6); only the
 *    first carries the full answer as rationale.
 *
 * The subject is `ctx.tags.subject` when provided, else the first 60
 * characters of the question. Questions are processed sequentially and the
 * loop stops early once `ctx.signal` is aborted. `ctx` is optional.
 *
 * @param {{questions: string[], ai: *, model?: string, id?: string, area?: string, extra?: object}} opts
 * @returns {object} Analyst adapter with an async `analyze(store, ctx)` method.
 */
function createTraceAnalystAdapter(opts) {
  const id = opts.id ?? "trace-analyst";
  const area = opts.area ?? "agent-reasoning";
  return {
    id,
    description: "Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.",
    inputKind: "trace-store",
    cost: { kind: "llm", models: opts.model ? [opts.model] : void 0 },
    version: `trace-analyst-${ADAPTER_REV}`,
    async analyze(store, ctx) {
      const out = [];
      for (const question of opts.questions) {
        // Optional chaining on ctx: callers that pass no context must not crash.
        if (ctx?.signal?.aborted) break;
        const result = await analyzeTraces(
          { question },
          { source: store, ai: opts.ai, model: opts.model, ...opts.extra }
        );
        const subject = ctx?.tags?.subject ?? question.slice(0, 60);
        if (result.findings.length === 0) {
          out.push(
            makeFinding({
              analyst_id: id,
              area,
              subject,
              claim: result.answer.slice(0, 200),
              rationale: result.answer,
              severity: "info",
              confidence: 0.5,
              evidence_refs: [],
              metadata: {
                // Recorded here as well so both finding shapes can be traced
                // back to the question that produced them.
                question,
                actor_prompt_version: result.actorPromptVersion,
                turns: result.turnCount
              }
            })
          );
          continue;
        }
        result.findings.forEach((claim, i) => {
          out.push(
            makeFinding({
              analyst_id: id,
              area,
              subject,
              claim,
              rationale: i === 0 ? result.answer : void 0,
              severity: "medium",
              confidence: 0.6,
              evidence_refs: [],
              metadata: { question, turns: result.turnCount, finding_index: i }
            })
          );
        });
      }
      return out;
    }
  };
}
680
/**
 * Wrap a MultiLayerVerifier as an analyst adapter.
 *
 * `analyze(env, ctx)` runs `opts.verifier.run({ env, ...opts.options })`, lifts
 * every layer finding via liftLayerFinding(), and adds one extra finding for
 * each layer whose status is "fail", "error" or "timeout" ("medium" for a
 * timeout, "high" otherwise). A summary is sent to `ctx.log` when the caller
 * supplied one; `ctx` itself is optional.
 *
 * @param {{verifier: {run: Function}, options?: object, id?: string, area?: string}} opts
 * @returns {object} Analyst adapter with an async `analyze(env, ctx)` method.
 */
function createVerifierAdapter(opts) {
  const id = opts.id ?? "multi-layer-verifier";
  const area = opts.area ?? "verification";
  return {
    id,
    description: "Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `verifier-${ADAPTER_REV}`,
    async analyze(env, ctx) {
      const report = await opts.verifier.run({ env, ...opts.options });
      const out = [];
      for (const layer of report.layers) {
        for (const finding of layer.findings) {
          out.push(liftLayerFinding(id, area, layer.layer, finding));
        }
        if (layer.status === "fail" || layer.status === "error" || layer.status === "timeout") {
          out.push(
            makeFinding({
              analyst_id: id,
              area,
              subject: layer.layer,
              claim: `layer "${layer.layer}" ${layer.status}: ${layer.reason ?? "no reason given"}`,
              // Only a timeout is downgraded; "fail" and "error" are both high.
              severity: layer.status === "timeout" ? "medium" : "high",
              confidence: 1,
              evidence_refs: [],
              metadata: {
                layer_status: layer.status,
                duration_ms: layer.durationMs,
                score: layer.score,
                diagnostics: layer.diagnostics
              }
            })
          );
        }
      }
      // Optional chaining on ctx: a missing context must not discard the findings.
      ctx?.log?.("verifier complete", {
        layers: report.layers.length,
        blended: report.blendedScore,
        all_pass: report.allPass
      });
      return out;
    }
  };
}
725
/**
 * Convert one verifier layer finding into an analyst finding.
 *
 * The subject is the finding's own `layer` when set, else the enclosing layer
 * name. Inline evidence, when present, becomes a single artifact reference.
 */
function liftLayerFinding(analyst_id, area, layer, f) {
  const evidenceRefs = f.evidence
    ? [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }]
    : [];
  const subject = f.layer ?? layer;
  return makeFinding({
    analyst_id,
    area,
    subject,
    claim: f.message,
    severity: liftSeverity(f.severity),
    confidence: 0.85,
    evidence_refs: evidenceRefs,
    metadata: f.detail
  });
}
737
/**
 * Wrap a RunCritic as an analyst adapter.
 *
 * `analyze(trace)` scores the trace and emits one finding for every watched
 * dimension whose numeric value is below `threshold` (default 0.5), plus a
 * drift finding when `driftPenalty` exceeds `1 - threshold`.
 *
 * @param {{id?: string, area?: string, critic?: object, threshold?: number}} [opts]
 * @returns {object} Analyst adapter with an async `analyze(trace)` method.
 */
function createRunCriticAdapter(opts = {}) {
  const analystId = opts.id ?? "run-critic";
  const analystArea = opts.area ?? "run-quality";
  const runCritic = opts.critic ?? new RunCritic();
  const minScore = opts.threshold ?? 0.5;
  // [score dimension, severity when below threshold, claim text]
  const watchedDimensions = [
    ["success", "critical", "run did not complete successfully"],
    ["goalProgress", "high", "goal progress is low"],
    ["repoGroundedness", "high", "output is poorly grounded in the repository"],
    ["toolUseQuality", "medium", "tool use quality is low"],
    ["patchQuality", "medium", "no real patch/edit evidence"],
    ["testReality", "high", "no real test/build evidence"],
    ["finalGate", "critical", "final gate is blocking"]
  ];
  return {
    id: analystId,
    description: "Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `run-critic-${ADAPTER_REV}`,
    async analyze(trace) {
      const score = runCritic.scoreTrace(trace);
      const findings = [];
      watchedDimensions.forEach(([dimension, severity, claim]) => {
        const value = score[dimension];
        // Written as !(value < minScore) so that NaN is skipped, not reported.
        if (typeof value !== "number" || !(value < minScore)) return;
        findings.push(
          makeFinding({
            analyst_id: analystId,
            area: analystArea,
            subject: dimension,
            claim,
            rationale: `${dimension}=${value.toFixed(2)} below threshold ${minScore}`,
            severity,
            confidence: 1,
            evidence_refs: [],
            metadata: { dimension, value, threshold: minScore, run_id: trace.run.runId }
          })
        );
      });
      const driftIsHigh = score.driftPenalty > 1 - minScore;
      if (driftIsHigh) {
        findings.push(
          makeFinding({
            analyst_id: analystId,
            area: analystArea,
            subject: "drift",
            claim: "agent output drifted from repository signal",
            rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,
            severity: "medium",
            confidence: 0.9,
            evidence_refs: [],
            metadata: { drift_penalty: score.driftPenalty, notes: score.notes }
          })
        );
      }
      return findings;
    }
  };
}
797
/**
 * Wrap a JudgeFn as an analyst adapter.
 *
 * `analyze(input)` calls `opts.judge(opts.tcloud, input)` and turns every score
 * whose 0-10 normalised value is below `threshold` (default 6) into a finding.
 *
 * @param {{judge: Function, tcloud: *, id?: string, area?: string, threshold?: number, cost?: object}} opts
 * @returns {object} Analyst adapter with an async `analyze(input)` method.
 */
function createJudgeAdapter(opts) {
  const analystId = opts.id ?? "judge";
  const analystArea = opts.area ?? "judge";
  const minScore10 = opts.threshold ?? 6;
  const isBelowThreshold = (s) => normalize10(s.score) < minScore10;
  const toFinding = (s) => liftJudgeScore(analystId, analystArea, s);
  return {
    id: analystId,
    description: "Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.",
    inputKind: "judge-input",
    cost: opts.cost ?? { kind: "llm" },
    version: `judge-${ADAPTER_REV}`,
    async analyze(input) {
      const scores = await opts.judge(opts.tcloud, input);
      const failing = scores.filter(isBelowThreshold);
      return failing.map(toFinding);
    }
  };
}
813
/**
 * Express a judge score on a 0-10 scale: values at or below 1 are treated as
 * fractions and multiplied by 10, larger values are returned unchanged.
 *
 * @param {number} s
 * @returns {number}
 */
function normalize10(s) {
  if (s <= 1) {
    return s * 10;
  }
  return s;
}
816
/**
 * Convert one judge score into an analyst finding.
 *
 * Severity is derived from the 0-10 normalised score: below 3 is "critical",
 * below 5 "high", below 7 "medium", otherwise "low".
 */
function liftJudgeScore(analyst_id, area, s) {
  const score10 = normalize10(s.score);
  let severity = "low";
  if (score10 < 3) {
    severity = "critical";
  } else if (score10 < 5) {
    severity = "high";
  } else if (score10 < 7) {
    severity = "medium";
  }
  const evidenceRefs = s.evidence
    ? [{ kind: "artifact", uri: "inline:evidence", excerpt: s.evidence }]
    : [];
  return makeFinding({
    analyst_id,
    area,
    subject: s.dimension,
    claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,
    rationale: s.reasoning,
    severity,
    confidence: 0.8,
    evidence_refs: evidenceRefs,
    metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 }
  });
}
831
/**
 * Wrap the semantic-concept judge as an analyst adapter.
 *
 * `analyze(input)` runs the judge and returns:
 *  - a single "info" finding when the judge is unavailable, carrying its error;
 *  - otherwise one finding per concept that is missing or weak. Concepts that
 *    are present with a score of at least 7 produce no finding.
 *
 * @param {{id?: string, area?: string, options?: object}} [opts]
 * @returns {object} Analyst adapter with an async `analyze(input)` method.
 */
function createSemanticConceptJudgeAdapter(opts = {}) {
  const analystId = opts.id ?? "semantic-concept-judge";
  const analystArea = opts.area ?? "concept-coverage";
  return {
    id: analystId,
    description: "Runs the semantic-concept judge and surfaces missing / weak concepts as findings.",
    inputKind: "custom",
    cost: { kind: "llm", models: opts.options?.model ? [opts.options.model] : void 0 },
    version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,
    async analyze(input) {
      const result = await runSemanticConceptJudge(input, opts.options);
      if (!result.available) {
        const unavailable = makeFinding({
          analyst_id: analystId,
          area: analystArea,
          claim: "semantic-concept judge unavailable",
          rationale: result.error,
          severity: "info",
          confidence: 1,
          evidence_refs: [],
          metadata: { reason: result.error }
        });
        return [unavailable];
      }
      const isSolid = (f) => f.present && f.score >= 7;
      return result.findings
        .filter((f) => !isSolid(f))
        .map((f) => {
          const claim = f.present
            ? `concept "${f.concept}" is weak (${f.score}/10)`
            : `concept "${f.concept}" is missing`;
          return makeFinding({
            analyst_id: analystId,
            area: analystArea,
            subject: f.concept,
            claim,
            rationale: f.evidence,
            severity: liftSeverity(f.severity),
            confidence: 0.85,
            evidence_refs: [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }],
            metadata: {
              concept: f.concept,
              present: f.present,
              score_10: f.score,
              cost_usd: result.costUsd ?? void 0
            }
          });
        });
    }
  };
}
882
+
883
+ // src/analyst/chat-client.ts
884
/**
 * Build a ChatClient for the requested transport.
 *
 * - "router": LlmClient against `opts.baseUrl` (default https://router.tangle.tools/v1)
 *   with `opts.apiKey`.
 * - "cli-bridge": LlmClient against `opts.baseUrl` (default http://127.0.0.1:3344/v1)
 *   with `opts.bearer` (or an empty key).
 * - "direct-provider": LlmClient against the caller-supplied `opts.baseUrl` / `opts.apiKey`.
 * - "sandbox-sdk": delegates to the caller-supplied `opts.chat`.
 * - "mock": delegates to the caller-supplied `opts.handler`.
 *
 * Every client resolves the request model against `opts.defaultModel` first.
 *
 * @param {object} opts - Transport-specific options; `opts.transport` selects the branch.
 * @returns {{transport: string, defaultModel: (string|undefined), chat: Function}}
 * @throws {Error} When `opts.transport` is not one of the supported transports
 *   (otherwise the function would silently return `undefined`).
 */
function createChatClient(opts) {
  switch (opts.transport) {
    case "router":
      return wrapLlmClient(
        opts.transport,
        opts.defaultModel,
        new LlmClient({
          baseUrl: opts.baseUrl ?? "https://router.tangle.tools/v1",
          apiKey: opts.apiKey
        })
      );
    case "cli-bridge":
      return wrapLlmClient(
        opts.transport,
        opts.defaultModel,
        new LlmClient({
          baseUrl: opts.baseUrl ?? "http://127.0.0.1:3344/v1",
          apiKey: opts.bearer ?? ""
        })
      );
    case "direct-provider":
      return wrapLlmClient(
        opts.transport,
        opts.defaultModel,
        new LlmClient({
          baseUrl: opts.baseUrl,
          apiKey: opts.apiKey
        })
      );
    case "sandbox-sdk":
      return {
        transport: "sandbox-sdk",
        defaultModel: opts.defaultModel,
        chat: async (req, callOpts) => opts.chat(resolveModel(req, opts.defaultModel), callOpts)
      };
    case "mock":
      return {
        transport: "mock",
        defaultModel: opts.defaultModel,
        chat: async (req, callOpts) => opts.handler(resolveModel(req, opts.defaultModel), callOpts)
      };
    default:
      throw new Error(`createChatClient: unknown transport "${String(opts.transport)}"`);
  }
}
927
/**
 * Adapt an LlmClient-like object (`inner.call(request)`) to the ChatClient shape.
 *
 * `chat(req, callOpts)` resolves the model, forwards the request fields to
 * `inner.call`, and — when `callOpts.signal` is given — rejects as soon as the
 * signal aborts.
 *
 * Abort handling:
 *  - a signal that is already aborted rejects before `inner.call` is invoked,
 *    so no request is started for a cancelled call;
 *  - the abort listener is removed once the call settles, so a long-lived
 *    signal shared by many calls does not accumulate listeners.
 * The underlying call itself is not cancelled; only the returned promise is.
 *
 * @param {string} transport
 * @param {string|undefined} defaultModel
 * @param {{call: Function}} inner
 * @returns {{transport: string, defaultModel: (string|undefined), chat: Function}}
 */
function wrapLlmClient(transport, defaultModel, inner) {
  return {
    transport,
    defaultModel,
    chat: async (req, callOpts) => {
      const resolved = resolveModel(req, defaultModel);
      const signal = callOpts?.signal;
      if (signal?.aborted) throw toAbortError(signal);
      const call = inner.call({
        model: resolved.model,
        messages: req.messages,
        jsonMode: req.jsonMode,
        jsonSchema: req.jsonSchema,
        temperature: req.temperature,
        maxTokens: req.maxTokens,
        timeoutMs: req.timeoutMs
      });
      if (!signal) return await call;
      let onAbort;
      const aborted = new Promise((_, reject) => {
        onAbort = () => reject(toAbortError(signal));
        signal.addEventListener("abort", onAbort, { once: true });
      });
      try {
        return await Promise.race([call, aborted]);
      } finally {
        signal.removeEventListener("abort", onAbort);
      }
    }
  };
}

/**
 * Promise that never resolves and rejects when `signal` aborts (immediately if
 * it already has). The listener it registers is only released by the abort
 * itself, so prefer scoped listeners for signals that outlive a single call.
 */
function abortAsRejection(signal) {
  if (signal.aborted) return Promise.reject(toAbortError(signal));
  return new Promise((_, reject) => {
    signal.addEventListener("abort", () => reject(toAbortError(signal)), { once: true });
  });
}

/**
 * Error to reject with for an aborted signal: the signal's own `reason` when it
 * is an Error, otherwise a new Error named "AbortError".
 */
function toAbortError(signal) {
  const reason = signal.reason;
  if (reason instanceof Error) return reason;
  const e = new Error("ChatClient.chat: aborted");
  e.name = "AbortError";
  return e;
}

/**
 * Return `req` with a model guaranteed: the request's own model wins, else
 * `defaultModel` is filled in on a copy.
 *
 * @throws {Error} When neither the request nor the client provides a model.
 */
function resolveModel(req, defaultModel) {
  if (req.model) return req;
  if (!defaultModel) {
    throw new Error(
      "ChatClient.chat: no model on request and no defaultModel on the client. Either pass req.model or bind defaultModel at createChatClient()."
    );
  }
  return { ...req, model: defaultModel };
}
969
+
970
+ // src/analyst/finding-signature.ts
971
+ import { z } from "zod";
972
/** Severity vocabulary accepted from analysts, most to least severe. */
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];

/**
 * Zod schema for one raw finding as emitted by an analyst model.
 * Strict: unknown keys are rejected.
 */
var RawAnalystFindingSchema = z
  .object({
    // Required fields.
    severity: z.enum(ANALYST_SEVERITIES),
    claim: z.string().min(1).max(2e3),
    subject: z.string().max(400).optional(),
    evidence_uri: z.string().min(1).max(2e3),
    evidence_excerpt: z.string().max(2e3).optional(),
    confidence: z.number().min(0).max(1),
    // Optional free-text fields.
    rationale: z.string().max(4e3).optional(),
    recommended_action: z.string().max(2e3).optional()
  })
  .strict();
983
// Prompt fragment describing the raw-finding JSON shape to the analyst model.
// The field list mirrors RawAnalystFindingSchema (same field names, the same
// five severities, the same 2000-character caps on claim and evidence_excerpt).
// The text is runtime content sent to the model, so it must not be edited for style.
+ var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
984
+ - severity: one of "critical" | "high" | "medium" | "low" | "info"
985
+ - claim: one-sentence statement (max 2000 chars)
986
+ - subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about
987
+ - evidence_uri: "span://<trace_id>/<span_id>" for trace evidence, "artifact://<relative-path>" for files, "metric://<name>" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools
988
+ - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact
989
+ - confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative
990
+ - rationale?: one or two sentences explaining the reasoning
991
+ - recommended_action?: concrete change phrased as an imperative ("Add ...", "Replace ...", "Stop ...") \u2014 omit when the finding is purely descriptive
992
+
993
+ Emit an empty array when the question has no findings to report. Do not fabricate evidence.`;
994
/**
 * Validate one raw finding row against `RawAnalystFindingSchema`.
 *
 * The analyst-kind prompts in this file instruct the model to emit an `area`
 * field, but the schema is strict and does not declare `area` (the area is
 * assigned from the kind spec when the finding is built). To avoid rejecting
 * findings that follow the prompt, an `area` key on the row is dropped before
 * validation. Every other unknown key is still rejected by the strict schema.
 *
 * @param {unknown} row - Candidate finding as produced by the model.
 * @param {(message: string, data?: object) => void} [log] - Optional logger;
 *   receives the schema issues when the row is rejected.
 * @returns {object | null} The parsed finding, or `null` when validation fails.
 */
function parseRawFinding(row, log) {
  let candidate = row;
  if (row !== null && typeof row === "object" && !Array.isArray(row) && "area" in row) {
    // Copy instead of mutating the caller's object.
    const { area: _kindAssignedArea, ...withoutArea } = row;
    candidate = withoutArea;
  }
  const result = RawAnalystFindingSchema.safeParse(candidate);
  if (result.success) return result.data;
  log?.("finding rejected: schema failure", {
    issues: result.error.issues.map((issue) => ({
      path: issue.path.join("."),
      code: issue.code,
      message: issue.message
    }))
  });
  return null;
}
1008
+
1009
+ // src/analyst/findings-store.ts
1010
+ import { existsSync as existsSync2, readFileSync } from "fs";
1011
+
1012
+ // src/locked-jsonl-appender.ts
1013
+ import { appendFileSync, existsSync, mkdirSync } from "fs";
1014
+ import { dirname } from "path";
1015
+
1016
+ // src/concurrency.ts
1017
/**
 * Minimal promise-based mutual-exclusion lock with FIFO hand-off.
 *
 * `acquire()` resolves with a release function; `runExclusive(fn)` wraps that
 * in try/finally. When the lock is released while callers are queued, it is
 * passed straight to the oldest waiter without ever being marked free.
 */
var Mutex = class {
  locked = false;
  waiters = [];
  /**
   * Obtain the lock.
   * @returns {Promise<() => void>} Resolves with the function that releases it.
   */
  async acquire() {
    const releaser = () => this.release();
    if (this.locked) {
      // Held by someone else: queue up and wait to be handed the lock.
      return new Promise((grant) => {
        this.waiters.push(() => {
          grant(releaser);
        });
      });
    }
    this.locked = true;
    return releaser;
  }
  /** Hand the lock to the oldest waiter, or mark it free when nobody waits. */
  release() {
    const wakeNext = this.waiters.shift();
    if (!wakeNext) {
      this.locked = false;
      return;
    }
    // `locked` stays true: ownership moves directly to the woken waiter.
    wakeNext();
  }
  /**
   * Run `fn` while holding the lock and release it afterwards, even on throw.
   * @param {() => unknown} fn - Sync or async callback.
   * @returns {Promise<unknown>} Whatever `fn` returns (awaited).
   */
  async runExclusive(fn) {
    const unlock = await this.acquire();
    try {
      return await fn();
    } finally {
      unlock();
    }
  }
  /** True iff someone holds the lock right now. Diagnostics only. */
  get isLocked() {
    return this.locked;
  }
  /** Pending waiter count. Diagnostics only. */
  get pending() {
    return this.waiters.length;
  }
};
1056
+
1057
+ // src/locked-jsonl-appender.ts
1058
// One Mutex per file path, shared by every appender created for that path.
var mutexes = /* @__PURE__ */ new Map();

/**
 * Look up the mutex registered for `path`, creating and registering it on
 * first use.
 *
 * @param {string} path - Map key; compared as given (no normalisation here).
 * @returns {Mutex} The mutex for that path.
 */
function getMutex(path) {
  const existing = mutexes.get(path);
  if (existing) return existing;
  const created = new Mutex();
  mutexes.set(path, created);
  return created;
}
1067
/**
 * Appends JSON-serialised entries, one per line, to a file. Writes go through
 * the per-path mutex from `getMutex`, so appenders created for the same path
 * string take turns.
 */
var LockedJsonlAppender = class {
  path;
  mutex;
  /**
   * @param {string} path - Target file. Its parent directory is created
   *   (recursively) when it does not exist yet.
   */
  constructor(path) {
    this.path = path;
    this.mutex = getMutex(path);
    const parentDir = dirname(path);
    if (!existsSync(parentDir)) {
      mkdirSync(parentDir, { recursive: true });
    }
  }
  /**
   * Serialise `entry` with `JSON.stringify` and append it as a single line.
   * @param {unknown} entry - Value to persist.
   * @returns {Promise<void>}
   */
  async append(entry) {
    const serialized = `${JSON.stringify(entry)}\n`;
    const writeLine = () => {
      appendFileSync(this.path, serialized);
    };
    await this.mutex.runExclusive(writeLine);
  }
};
1085
/**
 * Test helper: empty the per-path mutex registry so that the next lookup for
 * any path creates a fresh mutex. Objects that already hold a mutex keep it.
 *
 * @returns {void}
 */
function resetLockedAppendersForTesting() {
  mutexes.clear();
}
1088
+
1089
+ // src/analyst/findings-store.ts
1090
/**
 * JSONL-backed store of analyst findings. Each persisted row is a finding
 * with the `run_id` it was appended under.
 */
var FindingsStore = class {
  /**
   * @param {string} path - JSONL file that holds the findings.
   */
  constructor(path) {
    this.path = path;
    this.appender = new LockedJsonlAppender(path);
  }
  path;
  appender;
  /**
   * Persist `findings`, one line each, tagged with `runId`.
   * Rows are written in order, one awaited append at a time.
   *
   * @param {string} runId - Stored as `run_id` on every row (overrides any
   *   `run_id` already on the finding).
   * @param {Iterable<object>} findings - Findings to persist.
   * @returns {Promise<void>}
   */
  async append(runId, findings) {
    for (const finding of findings) {
      await this.appender.append({ ...finding, run_id: runId });
    }
  }
  /**
   * Load every persisted finding.
   *
   * Lines that are empty, are not valid JSON, or parse to something other
   * than a plain object (`null`, numbers, strings, arrays) are discarded
   * silently: they cannot be findings and would break consumers that read
   * fields such as `run_id`.
   *
   * @returns {object[]} Rows in file order; `[]` when the file is missing or empty.
   */
  loadAll() {
    if (!existsSync2(this.path)) return [];
    const raw = readFileSync(this.path, "utf8");
    if (!raw) return [];
    const rows = [];
    for (const line of raw.split("\n")) {
      if (!line) continue;
      let row;
      try {
        row = JSON.parse(line);
      } catch {
        // Deliberate best-effort: a torn or corrupt line must not hide the rest.
        continue;
      }
      if (row === null || typeof row !== "object" || Array.isArray(row)) continue;
      rows.push(row);
    }
    return rows;
  }
  /**
   * Filter to a single run.
   * @param {string} runId - Run identifier to match against `run_id`.
   * @returns {object[]} Rows whose `run_id` strictly equals `runId`.
   */
  loadRun(runId) {
    return this.loadAll().filter((row) => row.run_id === runId);
  }
};
1123
/**
 * Default "did this finding change materially?" predicate used by
 * `diffFindings`. Two findings differ materially when any of these holds,
 * checked in order (later checks are skipped once one matches):
 *   1. their `severity` differs;
 *   2. their `confidence` (missing treated as 0) differs by more than 0.05;
 *   3. they carry a different number of `evidence_refs`.
 *
 * @param {object} a - Earlier version of the finding.
 * @param {object} b - Later version of the finding.
 * @returns {boolean} `true` when the change is material.
 */
function defaultIsMaterial(a, b) {
  return (
    a.severity !== b.severity ||
    Math.abs((a.confidence ?? 0) - (b.confidence ?? 0)) > 0.05 ||
    a.evidence_refs.length !== b.evidence_refs.length
  );
}
1129
/**
 * Compare two sets of findings by `finding_id`.
 *
 * Both inputs are indexed into Maps keyed by `finding_id`, so when an id
 * occurs more than once the last occurrence wins.
 *
 * @param {object[]} previous - Findings from the earlier run.
 * @param {object[]} current - Findings from the later run.
 * @param {{ isMaterial?: (prev: object, cur: object) => boolean }} [policy]
 *   Optional override for the material-change predicate; defaults to
 *   `defaultIsMaterial`.
 * @returns {{ appeared: object[], disappeared: object[], persisted: object[],
 *   changed: { previous: object, current: object }[] }}
 *   `appeared`: ids only in `current`; `disappeared`: ids only in `previous`;
 *   `changed`: ids in both whose pair is material; `persisted`: the rest.
 */
function diffFindings(previous, current, policy = {}) {
  const isMaterial = policy.isMaterial ?? defaultIsMaterial;
  const indexById = (findings) => new Map(findings.map((finding) => [finding.finding_id, finding]));
  const prevById = indexById(previous);
  const curById = indexById(current);

  const appeared = [];
  const persisted = [];
  const changed = [];
  curById.forEach((cur, id) => {
    const prev = prevById.get(id);
    if (!prev) {
      appeared.push(cur);
    } else if (isMaterial(prev, cur)) {
      changed.push({ previous: prev, current: cur });
    } else {
      persisted.push(cur);
    }
  });

  const disappeared = [...prevById]
    .filter(([id]) => !curById.has(id))
    .map(([, prev]) => prev);

  return { appeared, disappeared, persisted, changed };
}
1154
+
1155
+ // src/analyst/kind-factory.ts
1156
+ import { AxJSRuntime, agent } from "@ax-llm/ax";
1157
/**
 * Build an analyst (id / description / version / cost / `analyze`) from a
 * trace-analyst kind spec.
 *
 * `analyze(store, ctx)`:
 *   1. builds the tool list via `spec.buildTools(store)`;
 *   2. assembles the actor description from the spec prompt, any prior
 *      findings in `ctx.priorFindings`, and the raw-finding schema prompt;
 *   3. constructs an ax agent with a locked-down `AxJSRuntime` and runs it
 *      with `opts.ai` on the question from `deriveQuestion`;
 *   4. validates every returned row with `parseRawFinding`, lets
 *      `spec.postProcess` rewrite or drop it, and converts survivors with
 *      `toAnalystFinding`.
 *
 * @param {object} spec - Kind spec (`id`, `description`, `area`, `version`,
 *   `actorDescription`, `buildTools`, optional `recursion`, `maxTurns`,
 *   `maxRuntimeChars`, `responderDescription`, `postProcess`, `cost`).
 * @param {object} opts - `ai` (passed to `forward`), optional `model`
 *   (applied to actor and responder) and optional `versionSuffix`
 *   (appended to the spec version as `+suffix`).
 * @returns {object} Analyst with `inputKind: "trace-store"`.
 */
function createTraceAnalystKind(spec, opts) {
  const version = opts.versionSuffix ? `${spec.version}+${opts.versionSuffix}` : spec.version;
  return {
    id: spec.id,
    description: spec.description,
    inputKind: "trace-store",
    cost: spec.cost,
    version,
    async analyze(store, ctx) {
      const tools = spec.buildTools(store);
      const maxDepth = spec.recursion?.maxDepth ?? 0;
      const maxParallel = spec.recursion?.maxParallelSubagents ?? 2;
      const priorContext = renderPriorFindings(ctx.priorFindings);
      const actorDescription = spec.actorDescription.trim() + priorContext + "\n\n" + RAW_FINDING_SCHEMA_PROMPT + "\n\nReturn the array in the `findings` output field. Use `final(...)` with the structured `{ findings }` payload when you are done.";
      // Model override is applied identically to actor and responder.
      const modelOverride = opts.model ? { model: opts.model } : {};
      const ax = agent(
        "question:string -> findings:json[]",
        {
          agentIdentity: {
            name: spec.id,
            description: spec.description
          },
          contextFields: ["question"],
          runtime: new AxJSRuntime({
            permissions: [],
            blockDynamicImport: true,
            allowedModules: [],
            freezeIntrinsics: true,
            blockShadowRealm: true,
            preventGlobalThisExtensions: false
          }),
          // Recursion is only switched on when the spec asks for depth > 0.
          mode: maxDepth > 0 ? "advanced" : "simple",
          recursionOptions: maxDepth > 0 ? { maxDepth } : void 0,
          maxTurns: spec.maxTurns ?? 12,
          maxRuntimeChars: spec.maxRuntimeChars ?? 6e3,
          maxBatchedLlmQueryConcurrency: maxParallel,
          promptLevel: "detailed",
          contextPolicy: { preset: "full", budget: "balanced" },
          functions: { local: tools },
          actorOptions: {
            description: actorDescription,
            ...modelOverride,
            showThoughts: false,
            thinkingTokenBudget: "none"
          },
          responderOptions: {
            description: spec.responderDescription ?? "Format the structured `findings` array exactly as the actor produced it. Do not add, drop, or summarize entries.",
            ...modelOverride,
            showThoughts: false
          },
          bubbleErrors: [TraceFileMissingError]
        }
      );
      ctx.log?.(`analyst.kind ${spec.id} forward`, {
        max_depth: maxDepth,
        tool_count: tools.length,
        tags: ctx.tags
      });
      const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
      const out = [];
      // Anything other than an array in `findings` is treated as "no rows".
      const rawRows = Array.isArray(result.findings) ? result.findings : [];
      for (const row of rawRows) {
        const parsed = parseRawFinding(row, ctx.log);
        if (!parsed) continue;
        const hookResult = spec.postProcess?.(parsed, ctx);
        // NOTE(review): `null` is treated as "drop this finding". Previously
        // `?? parsed` converted a `null` return back into the parsed row, so a
        // hook could not drop with `null` despite the falsy check below.
        // Confirm against the postProcess contract.
        if (hookResult === null) continue;
        // `undefined` (no hook, or a hook that returns nothing) keeps the row.
        const postProcessed = hookResult ?? parsed;
        if (!postProcessed) continue;
        out.push(toAnalystFinding(spec, postProcessed));
      }
      ctx.log?.(`analyst.kind ${spec.id} done`, {
        emitted: rawRows.length,
        accepted: out.length
      });
      return out;
    }
  };
}
1232
/**
 * Build the question string handed to the analyst agent.
 *
 * @param {{ tags?: { focus?: string } }} ctx - Run context; `tags.focus` is trimmed.
 * @param {{ id: string }} spec - Kind spec.
 * @returns {string} `"<id>: <focus>"` when a non-blank focus tag is present,
 *   otherwise just the spec id.
 */
function deriveQuestion(ctx, spec) {
  const focus = ctx.tags?.focus?.trim();
  return focus ? `${spec.id}: ${focus}` : spec.id;
}
1237
/**
 * Convert a validated raw finding into an analyst finding via `makeFinding`.
 *
 * `analyst_id` and `area` come from the spec, not from the raw row. The single
 * evidence URI/excerpt pair becomes a one-element `evidence_refs` list, and the
 * spec version is recorded under `metadata.kind_version`.
 *
 * @param {object} spec - Kind spec (`id`, `area`, `version` are read).
 * @param {object} raw - Raw finding that already passed schema validation.
 * @returns {object} Whatever `makeFinding` returns for the assembled input.
 */
function toAnalystFinding(spec, raw) {
  const evidence = {
    kind: evidenceKindFromUri(raw.evidence_uri),
    uri: raw.evidence_uri,
    excerpt: raw.evidence_excerpt
  };
  const findingInput = {
    analyst_id: spec.id,
    area: spec.area,
    subject: raw.subject,
    claim: raw.claim,
    rationale: raw.rationale,
    severity: raw.severity,
    confidence: raw.confidence,
    evidence_refs: [evidence],
    recommended_action: raw.recommended_action,
    metadata: { kind_version: spec.version }
  };
  return makeFinding(findingInput);
}
1257
/**
 * Derive the evidence kind from the scheme of an evidence URI.
 *
 * @param {string} uri - Evidence URI such as `span://trace/span`.
 * @returns {"span" | "artifact" | "metric" | "event" | "finding"} The matching
 *   kind; URIs with any other (or no) scheme fall back to `"artifact"`.
 */
function evidenceKindFromUri(uri) {
  const knownKinds = ["span", "artifact", "metric", "event", "finding"];
  const matched = knownKinds.find((kind) => uri.startsWith(`${kind}://`));
  return matched ?? "artifact";
}
1265
/**
 * Render prior findings as a prompt section to append to an actor description.
 *
 * The section starts with two newlines so that it is separated by a blank
 * line from the text it is appended to. (The previous implementation built
 * the same array but ran it through `.filter(Boolean)`, which removed the two
 * leading empty strings and glued the header directly onto the preceding
 * text.)
 *
 * At most 40 rows are listed; when more exist, a trailing note states how
 * many were left out. Each claim is shortened to 160 characters.
 *
 * @param {object[] | undefined | null} prior - Findings from an earlier run.
 * @returns {string} The section text, or `""` when there are no prior findings.
 */
function renderPriorFindings(prior) {
  if (!prior || prior.length === 0) return "";
  const MAX_ROWS = 40;
  const rows = prior.slice(0, MAX_ROWS).map((f) => {
    const subject = f.subject ? ` [${f.subject}]` : "";
    return ` - id=${f.finding_id} ${f.severity}${subject} ${truncateForContext(f.claim, 160)}`;
  });
  const overflow = prior.length > MAX_ROWS ? `\n... +${prior.length - MAX_ROWS} more prior findings (older history truncated)` : "";
  const lines = [
    // Two empty entries => the joined text begins with "\n\n".
    "",
    "",
    "PRIOR FINDINGS (from a previous run on related data):",
    "When the work you do now matches a row below, REUSE the `finding_id` (pass it as `id_basis`) so the cross-run diff stays stable.",
    "A finding that reappears with no remediation evidence SHOULD raise its `confidence` and may justify a higher `severity`.",
    ...rows
  ];
  // Only the optional overflow note is conditional; nothing else is filtered.
  if (overflow) lines.push(overflow);
  return lines.join("\n");
}

/**
 * Shorten `s` to at most `max` characters, ending with an ellipsis when cut.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum length (in UTF-16 code units) of the result.
 * @returns {string} `s` unchanged when it fits; otherwise the first
 *   `max - 1` characters, right-trimmed, followed by "\u2026".
 */
function truncateForContext(s, max) {
  if (s.length <= max) return s;
  return `${s.slice(0, max - 1).trimEnd()}\u2026`;
}
1288
+
1289
+ // src/analyst/tool-groups.ts
1290
// Tool names exposed per tool group. "all" is a sentinel: its set is empty
// because that group bypasses filtering and returns every tool.
var TOOL_NAMES_BY_GROUP = {
  all: /* @__PURE__ */ new Set(),
  discovery: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "countTraces"]),
  discoveryAndRead: /* @__PURE__ */ new Set([
    "getDatasetOverview",
    "queryTraces",
    "countTraces",
    "viewTrace",
    "viewSpans"
  ]),
  discoveryAndSearch: /* @__PURE__ */ new Set([
    "getDatasetOverview",
    "queryTraces",
    "countTraces",
    "searchTrace",
    "searchSpan"
  ]),
  targeted: /* @__PURE__ */ new Set(["getDatasetOverview", "queryTraces", "viewSpans", "searchSpan"])
};

/**
 * Build the trace-analyst tools for `store`, restricted to one tool group.
 *
 * The group is validated with an own-property check before any tools are
 * built. A plain `TOOL_NAMES_BY_GROUP[group]` lookup would resolve inherited
 * names such as "toString" or "constructor" to `Object.prototype` members and
 * fail later with an unrelated TypeError instead of the intended error.
 *
 * @param {string} group - One of the keys of `TOOL_NAMES_BY_GROUP`.
 * @param {object} store - Trace store handed to `buildTraceAnalystTools`.
 * @returns {object[]} Every tool for "all"; otherwise only the tools whose
 *   `name` is in the group's allow-list.
 * @throws {Error} `unknown trace tool group: <group>` for any other group.
 */
function buildTraceToolsForGroup(group, store) {
  if (!Object.hasOwn(TOOL_NAMES_BY_GROUP, group)) {
    throw new Error(`unknown trace tool group: ${group}`);
  }
  const all = buildTraceAnalystTools({ store });
  if (group === "all") return all;
  const allow = TOOL_NAMES_BY_GROUP[group];
  return all.filter((tool) => allow.has(tool.name));
}
1316
+
1317
+ // src/analyst/kinds/failure-mode.ts
1318
+ var ACTOR_PROMPT = `You are a failure-mode classifier for an OTLP trace dataset. Your job is to identify the **distinct ways agents failed** in this dataset, not to grade individual runs.
1319
+
1320
+ DISCOVERY \u2192 CLUSTER \u2192 CITE protocol:
1321
+
1322
+ 1. Call \`traces.getDatasetOverview({})\` first. Use \`has_errors\`, \`models\`, \`agent_names\`, \`tools\`, and \`sample_trace_ids\` to size the failure surface.
1323
+ 2. Use \`traces.queryTraces({ filters: { has_errors: true }, limit })\` to pull error-bearing traces. Combine with \`traces.countTraces\` to see what fraction of the dataset failed.
1324
+ 3. For each candidate failure cluster, use \`traces.searchTrace\` with regex like \`STATUS_CODE_ERROR\`, \`MaxTurnsExceeded\`, \`assertion\`, \`unauthorized\`, \`timeout\`, \`429\`, \`5\\d\\d\`, the agent's specific error strings, or the names of its tools. Pull one or two representative traces per cluster, **not all** of them.
1325
+ 4. **Cluster, do not enumerate.** Two failures with the same root cause should be ONE finding citing both traces, not two findings. The point of this analyst is to compress N runs into K modes.
1326
+ 5. For each cluster you can defend with evidence, emit ONE finding with:
1327
+ - \`area\` = "failure-mode"
1328
+ - \`subject\` = a short label for the cluster ("tool-call-loop", "auth-revoked-mid-run", "agent-asked-clarification-too-late", ...)
1329
+ - \`claim\` = one sentence stating the mode
1330
+ - \`severity\` = "critical" when it blocks the run, "high" when the run finished degraded, "medium" when it slowed convergence
1331
+ - \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the most representative span
1332
+ - \`evidence_excerpt\` = the exact quote (e.g. error message, stuck tool call payload, contradictory turn output)
1333
+ - \`confidence\` = 0.85+ when multiple traces show the same shape; 0.6-0.8 for a single-trace inference; <0.5 for speculative.
1334
+ - \`recommended_action\` = imperative-phrased fix idea (kept short \u2014 the improvement-analyst will expand on these)
1335
+
1336
+ If the dataset has no failures, return an empty findings array \u2014 do NOT pad with low-confidence speculation.
1337
+
1338
+ **Delegate aggressively.** The recursion budget is there to be used:
1339
+ - After your first \`getDatasetOverview\` + \`queryTraces\` calls, you should have 3-6 candidate failure clusters in mind. Spawn one \`llmQuery\` per cluster in a single batch \u2014 they investigate in parallel.
1340
+ - A sub-investigator that finds its cluster is actually two distinct modes should split again at its own level. Recursion is meant to discover sub-modes, not to do trivial drilling that the parent could do in-line.
1341
+ - Pass narrow context to each subagent: { question: 'investigate the auth-revoked-mid-run cluster', context: { trace_ids: ['abc', 'def'], suspected_root_cause: 'token refresh skipped on idle sessions' } }. Subagents need enough context to skip re-discovery but not the whole conversation.
1342
+ - Each subagent returns its findings as JSON; the parent merges them. Do NOT have subagents call \`final()\` \u2014 they return their findings list to you, and you call \`final()\` once at the top.
1343
+
1344
+ OBSERVABILITY rules:
1345
+ - Each non-final turn must emit at least one \`console.log\` for evidence.
1346
+ - Reuse runtime variables across turns; don't recompute.
1347
+ - Call \`final({ findings: [...] })\` exactly once, after you've gathered evidence for every cluster you intend to report.`;
1348
+ var FAILURE_MODE_KIND_SPEC = {
1349
+ id: "failure-mode",
1350
+ description: "Clusters trace-dataset failures into distinct failure modes with cited evidence and a short recommended action.",
1351
+ area: "failure-mode",
1352
+ version: "1.0.0",
1353
+ actorDescription: ACTOR_PROMPT,
1354
+ buildTools: (store) => buildTraceToolsForGroup("all", store),
1355
+ recursion: { maxDepth: 3, maxParallelSubagents: 4 },
1356
+ maxTurns: 24,
1357
+ cost: { kind: "llm" }
1358
+ };
1359
+
1360
+ // src/analyst/kinds/improvement.ts
1361
+ var ACTOR_PROMPT2 = `You are a recursive-self-improvement analyst. Your job is to propose **concrete, locus-named edits** the agent's runtime should adopt to fix the failure modes, knowledge gaps, and poisonings present in this dataset.
1362
+
1363
+ Upstream analysts have already classified the problems. Your job is to convert each problem into a *change to make* and grade its expected leverage. Each finding is one proposed edit.
1364
+
1365
+ DISCOVERY \u2192 CANDIDATE-FIXES \u2192 COMPETE \u2192 CITE protocol:
1366
+
1367
+ 1. \`traces.getDatasetOverview({})\` first. Note the agents, tools, and any system-prompt fingerprints (look for the prompt text echoed in early spans).
1368
+ 2. For each high-severity failure pattern, generate 2-3 candidate fixes. Real candidate axes:
1369
+ - **System-prompt edit** \u2014 add an instruction, remove a misleading one, restructure precedence
1370
+ - **Tool description edit** \u2014 rewrite a tool's description so the agent picks it correctly / passes valid args
1371
+ - **New tool** \u2014 add a tool the agent kept emulating in code
1372
+ - **RAG ingestion** \u2014 add a document or correct a stale one
1373
+ - **Memory invalidation** \u2014 clear cached prior-run decisions that no longer apply
1374
+ - **Scaffolding** \u2014 add a precondition check, a retry policy, a turn budget, a verification step
1375
+ - **Output schema** \u2014 narrow the agent's output to forbid the failure shape
1376
+ 3. **Compete candidate fixes via subagents.** For each failure cluster, spawn one \`llmQuery\` per candidate-fix axis you want to evaluate. Each subagent's job: simulate the fix on the cited traces and report (i) likely effect, (ii) side effects, (iii) implementation cost as small/medium/large. Pass the cluster's failing trace_ids and the candidate axis as context.
1377
+ 4. After subagents return, **pick the winning candidate per cluster** based on (effect / cost) and emit ONE finding. Discard the losing candidates \u2014 the output is the recommendation, not the candidate set.
1378
+ 5. **Cross-reference upstream findings.** If a finding cites a prior failure-mode or knowledge-gap finding, use \`evidence_uri = "finding://<prior-finding-id>"\` (the registry supports this kind). This builds the dependency graph that lets the dashboard show "fix #X resolves failure modes A, B, C."
1379
+
1380
+ For each winning recommendation, emit ONE finding with:
1381
+ - \`area\` = "improvement"
1382
+ - \`subject\` = the locus to edit: \`system-prompt:<section>\`, \`tool-doc:<tool-name>\`, \`new-tool:<proposed-name>\`, \`rag:<corpus>:<doc-id>\`, \`memory:<key>\`, \`scaffolding:<concern>\`, \`output-schema:<field>\`
1383
+ - \`claim\` = one sentence stating the edit ("Add a precondition check to refuse tool X calls without arg Y")
1384
+ - \`severity\` = leverage rating: "critical" when fix resolves a critical failure mode; "high" when it resolves a high; "medium" when it's a quality-of-life win; "info" when it's a cleanup with no behavioral effect
1385
+ - \`evidence_uri\` = the failure-mode finding id this fix targets (\`finding://<id>\`) when it exists; else the most representative span
1386
+ - \`evidence_excerpt\` = a fragment showing the problem the fix targets
1387
+ - \`confidence\` = 0.85+ when the fix is mechanical and the failure mode is well-evidenced; 0.6-0.8 when the fix requires judgment; <0.5 for speculative
1388
+ - \`rationale\` = why this candidate beat its alternatives (2 sentences max)
1389
+ - \`recommended_action\` = the **literal edit**, phrased as a diff or a quoted replacement: "Replace section X with: '...'" or "Add tool with description: '...'" or "Set retry policy to max_attempts=3 with exponential backoff"
1390
+
1391
+ If no upstream failure findings exist in this run, derive your own from the trace dataset using the failure-mode protocol inline (\`searchTrace\` for STATUS_CODE_ERROR / MaxTurnsExceeded / etc.). But prefer to consume upstream findings when present \u2014 the kinds are designed to chain.
1392
+
1393
+ Do NOT propose a fix you cannot defend with evidence. "Tighten the prompt" is not a finding; "Add 'When the user asks for X, always Y' to the system prompt section "request-classification"" is.
1394
+
1395
+ OBSERVABILITY rules:
1396
+ - Each non-final turn must emit at least one \`console.log\` for evidence.
1397
+ - Call \`final({ findings: [...] })\` exactly once at the top level.`;
1398
+ var IMPROVEMENT_KIND_SPEC = {
1399
+ id: "improvement",
1400
+ description: "Converts upstream failure / gap / poisoning findings into concrete locus-named edits (prompt, tool-doc, RAG, scaffolding) with leverage grades.",
1401
+ area: "improvement",
1402
+ version: "1.0.0",
1403
+ actorDescription: ACTOR_PROMPT2,
1404
+ buildTools: (store) => buildTraceToolsForGroup("all", store),
1405
+ recursion: { maxDepth: 3, maxParallelSubagents: 4 },
1406
+ maxTurns: 30,
1407
+ maxRuntimeChars: 12e3,
1408
+ cost: { kind: "llm" }
1409
+ };
1410
+
1411
+ // src/analyst/kinds/knowledge-gap.ts
1412
+ var ACTOR_PROMPT3 = `You are a knowledge-gap analyst for an OTLP trace dataset. Your job is to identify the **specific pieces of information the agent lacked, or that were stale**, that caused poor decisions.
1413
+
1414
+ The agent under analysis maintains a curated knowledge base via \`@tangle-network/agent-knowledge\` \u2014 a wiki of \`KnowledgePage\`s with raw source anchors, claims, and relations. The primary expected store of agent-knowable facts IS that wiki. A "knowledge gap" is anything the agent had to discover or guess at run-time that the wiki should have held \u2014 or an outdated/contradictory fact the agent picked up from a non-wiki source.
1415
+
1416
+ DISCOVERY \u2192 ATTRIBUTE-TO-LAYER \u2192 CITE protocol:
1417
+
1418
+ 1. \`traces.getDatasetOverview({})\` first. Note which agents, tools, and models appear.
1419
+ 2. Pull traces where the agent shows gap signals. The strongest signals are:
1420
+ - Self-correction turns ("I assumed X but\u2026", "let me re-check", "actually,")
1421
+ - Clarifying-question turns where the agent asked the user something the runtime should have surfaced
1422
+ - Repeated retrieval / lookup calls for the same artifact with slightly varied queries
1423
+ - Tool errors that name a missing argument or unknown resource
1424
+ - Web-search calls returning pages dated before a known cutoff for content that changes (versioned APIs, schemas, policies)
1425
+ - Agent quoting a tool's docs / system prompt incorrectly because the actual text was insufficient
1426
+ - Fabricated identifiers that don't appear in dataset \`sample_trace_ids\`
1427
+ Use \`traces.searchTrace\` with patterns like \`I (don.?t|do not) know\`, \`assumed\`, \`unclear\`, \`could you (clarify|tell me|provide)\`, \`not found\`, \`undefined\`, \`unknown\`, \`null\`, dates older than the analysis window, or the agent's specific clarification phrases.
1428
+ 3. For each gap, identify the **layer of the runtime that should have prevented it**. The locus is the value of \`subject\` on the finding. Use one of:
1429
+ - \`agent-knowledge:wiki:<page-slug>\` \u2014 the wiki page that should exist but doesn't, or exists but lacks the claim
1430
+ - \`agent-knowledge:wiki:<page-slug>#<heading>\` \u2014 wiki page exists but a specific section is missing
1431
+ - \`agent-knowledge:claim:<topic>\` \u2014 a specific claim/relation triple that should be in the wiki
1432
+ - \`agent-knowledge:raw:<source-id>\` \u2014 raw source captured but never lifted into a curated page
1433
+ - \`agent-knowledge:stale:<page-slug>\` \u2014 wiki page exists but contradicts ground-truth evidence in this trace (the wiki itself drifted)
1434
+ - \`websearch:outdated:<topic>\` \u2014 agent relied on a web result that was stale; wiki should have superseded it
1435
+ - \`tool-doc:<tool-name>:<aspect>\` \u2014 tool description missed a behavior aspect (return shape, failure modes, side effects)
1436
+ - \`system-prompt:<section>\` \u2014 system prompt should have stated the rule directly
1437
+ - \`memory:<key>\` \u2014 prior-run memory should have surfaced an earlier decision
1438
+ 4. For each gap you can defend with evidence, emit ONE finding with:
1439
+ - \`area\` = "knowledge-gap"
1440
+ - \`subject\` = the locus string from the list above
1441
+ - \`claim\` = a sentence naming the missing or stale knowledge ("wiki has no page on invoice line-item shape, agent had to re-derive it from raw spans")
1442
+ - \`severity\` = "high" when the gap caused a failure or a clarifying question; "medium" when it caused unnecessary turns; "low" when it caused minor inefficiency
1443
+ - \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the moment the gap surfaced (the question, the self-correction, the retrieval miss, the stale web result)
1444
+ - \`evidence_excerpt\` = exact quote where the agent showed the gap
1445
+ - \`confidence\` = 0.85+ when the agent itself articulated the gap; 0.6-0.8 when inferred from behavior
1446
+ - \`recommended_action\` = phrased as a wiki edit when the locus is \`agent-knowledge:*\` ("Create wiki page \`invoice-line-items\` with claims: ..."), or as a prompt/tool-doc edit otherwise
1447
+
1448
+ **Delegate per layer.** After your first scan, you should have candidates spread across \`agent-knowledge:*\`, \`websearch:outdated\`, \`tool-doc:*\`, \`system-prompt:*\`, and \`memory:*\`. Spawn one \`llmQuery\` per layer in parallel \u2014 each subagent runs a focused detection (e.g. the \`agent-knowledge\` subagent looks for both missing-pages AND stale-pages; the \`websearch\` subagent looks specifically for date staleness signals; the \`tool-doc\` subagent looks for tool-call argument errors a fuller description would have prevented). Subagents return findings; you merge and emit one \`final({ findings })\` at the top.
1449
+
1450
+ Do NOT report a gap that the agent later recovered from cleanly within the same turn \u2014 that's resilience, not a gap. Cite the *non-recovery* version when both exist.
1451
+
1452
+ OBSERVABILITY rules:
1453
+ - Each non-final turn must emit at least one \`console.log\` for evidence.
1454
+ - Call \`final({ findings: [...] })\` exactly once at the top level.`;
1455
+ var KNOWLEDGE_GAP_KIND_SPEC = {
1456
+ id: "knowledge-gap",
1457
+ description: "Identifies missing or stale pieces of knowledge \u2014 primarily against the agent-knowledge wiki \u2014 and attributes each to the runtime layer (wiki page, claim, raw source, websearch, tool-doc, system-prompt, memory) that should have held it.",
1458
+ area: "knowledge-gap",
1459
+ version: "1.0.0",
1460
+ actorDescription: ACTOR_PROMPT3,
1461
+ buildTools: (store) => buildTraceToolsForGroup("discoveryAndSearch", store),
1462
+ recursion: { maxDepth: 2, maxParallelSubagents: 4 },
1463
+ maxTurns: 18,
1464
+ cost: { kind: "llm" }
1465
+ };
1466
+
1467
+ // src/analyst/kinds/knowledge-poisoning.ts
1468
+ var ACTOR_PROMPT4 = `You are a knowledge-poisoning analyst for an OTLP trace dataset. Your job is to identify cases where the agent **confidently used wrong information** \u2014 not where it lacked information (that's the knowledge-gap analyst).
1469
+
1470
+ DISCOVERY \u2192 DUAL-VERIFY \u2192 CITE protocol:
1471
+
1472
+ 1. \`traces.getDatasetOverview({})\` first. Identify the agents, models, and tools.
1473
+ 2. Pull traces where the agent's confident action was later contradicted. Strongest signals:
1474
+ - Agent stated a fact in one span; a later span surfaced contradictory evidence; the agent then proceeded anyway or fabricated reconciliation.
1475
+ - Tool call with stale arguments (an id that no longer exists, an API shape that changed).
1476
+ - Agent cited an \`agent-knowledge\` wiki page or claim whose content contradicts the trace's own evidence \u2014 the wiki itself drifted.
1477
+ - Web-search result the agent cited that returned an outdated page; agent treated it as canonical.
1478
+ - System-prompt instruction the agent followed that ground-truth evidence in the trace contradicts (e.g. prompt says "use endpoint A"; tool reply says "endpoint A deprecated, use B").
1479
+ - Repeated wrong-shape parsing despite the tool's actual output proving the shape.
1480
+ 3. Use \`traces.searchTrace\` with regex on phrases like \`actually\`, \`turns out\`, \`previously assumed\`, \`old version\`, \`deprecated\`, \`updated to\`, \`now uses\`, or specific entity names you suspect have changed.
1481
+ 4. For each candidate poisoning, **DUAL-VERIFY**:
1482
+ - Confirm the agent actually acted on the false belief (cite the span where it did)
1483
+ - Confirm the belief is actually false in this trace's own evidence (cite the span that contradicts it)
1484
+ Only emit a finding when both halves are nailed down. If you can only nail one, drop it \u2014 single-evidence poisoning findings are too speculative to be useful.
1485
+
1486
+ **Delegate the dual-verify.** Use the recursion budget so each candidate poisoning gets one subagent investigating "did the agent act?" and one investigating "is the belief false?". After your first scan, fire off N parallel \`llmQuery\` pairs (one cluster per pair). Subagents return their findings; you accept only the ones where BOTH halves of the pair were confirmed.
1487
+
1488
+ For each confirmed poisoning, emit ONE finding with:
1489
+ - \`area\` = "knowledge-poisoning"
1490
+ - \`subject\` = the source of the false belief, one of: \`agent-knowledge:wiki:<page-slug>\` (wiki page contradicts current ground truth), \`agent-knowledge:claim:<topic>\` (a specific claim/relation went stale), \`agent-knowledge:raw:<source-id>\` (the raw source is outdated and the wiki inherited the drift), \`websearch:outdated:<url-or-topic>\`, \`tool-doc:<tool>\`, \`system-prompt:<section>\`, \`memory:<key>\`, \`prior-run-summary:<topic>\`
1491
+ - \`claim\` = one sentence: "agent believed X (from source S); evidence in trace shows X is false"
1492
+ - \`severity\` = "critical" when poisoning caused a wrong user-visible action; "high" when caught internally but wasted significant work; "medium" for inefficiency only
1493
+ - \`evidence_uri\` = \`span://<trace_id>/<span_id>\` of the action span (the moment the agent acted on the false belief)
1494
+ - \`evidence_excerpt\` = exact quote of the confident-but-wrong claim or action
1495
+ - \`confidence\` = 0.85+ when both halves are exact-quote backed; 0.6-0.8 when one half is inferred
1496
+ - \`recommended_action\` = where the source should be updated and how ("Update wiki page \`X\` claim \`Y\` to '...'", "Invalidate raw source \`Z\` and re-curate", "Replace system-prompt section X with 'tool foo now returns Y'")
1497
+
1498
+ Do NOT report a finding if the agent caught and corrected the false belief in the same turn \u2014 that's the system working. Reserve poisoning for cases where the false belief shaped downstream action.
1499
+
1500
+ OBSERVABILITY rules:
1501
+ - Each non-final turn must emit at least one \`console.log\` for evidence.
1502
+ - Call \`final({ findings: [...] })\` exactly once at the top level.`;
1503
+ var KNOWLEDGE_POISONING_KIND_SPEC = {
1504
+ id: "knowledge-poisoning",
1505
+ description: "Identifies confident-but-wrong actions caused by stale memory, contradicting RAG, deprecated tool docs, or outdated system-prompt instructions.",
1506
+ area: "knowledge-poisoning",
1507
+ version: "1.0.0",
1508
+ actorDescription: ACTOR_PROMPT4,
1509
+ buildTools: (store) => buildTraceToolsForGroup("all", store),
1510
+ recursion: { maxDepth: 2, maxParallelSubagents: 4 },
1511
+ maxTurns: 20,
1512
+ cost: { kind: "llm" }
1513
+ };
1514
+
1515
+ // src/analyst/kinds/index.ts
1516
+ var DEFAULT_TRACE_ANALYST_KINDS = [
1517
+ FAILURE_MODE_KIND_SPEC,
1518
+ KNOWLEDGE_GAP_KIND_SPEC,
1519
+ KNOWLEDGE_POISONING_KIND_SPEC,
1520
+ IMPROVEMENT_KIND_SPEC
1521
+ ];
1522
+
1523
+ // src/analyst/registry.ts
1524
+ import { randomUUID } from "crypto";
1525
+ var AnalystRegistry = class {
1526
+ analysts = /* @__PURE__ */ new Map();
1527
+ options;
1528
+ constructor(options = {}) {
1529
+ this.options = options;
1530
+ }
1531
+ register(analyst) {
1532
+ if (!analyst.id) throw new Error("AnalystRegistry.register: analyst.id is required");
1533
+ if (this.analysts.has(analyst.id)) {
1534
+ throw new Error(`AnalystRegistry.register: duplicate analyst id "${analyst.id}"`);
1535
+ }
1536
+ if (!analyst.version) {
1537
+ throw new Error(`AnalystRegistry.register: analyst "${analyst.id}" must declare a version`);
1538
+ }
1539
+ this.analysts.set(analyst.id, analyst);
1540
+ }
1541
+ list() {
1542
+ return Array.from(this.analysts.values()).map((a) => ({
1543
+ id: a.id,
1544
+ description: a.description,
1545
+ version: a.version,
1546
+ cost: a.cost
1547
+ }));
1548
+ }
1549
+ async run(runId, inputs, runOpts = {}) {
1550
+ const correlationId = `ar_${randomUUID().slice(0, 12)}`;
1551
+ const log = this.options.log ?? (() => {
1552
+ });
1553
+ const hooks = this.options.hooks ?? {};
1554
+ const startedAt = (/* @__PURE__ */ new Date()).toISOString();
1555
+ const started = Date.now();
1556
+ const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : void 0;
1557
+ const selected = this.selectAnalysts(runOpts);
1558
+ const budget = runOpts.budget ?? this.options.defaultBudget;
1559
+ const summaries = [];
1560
+ const allFindings = [];
1561
+ let totalCost = 0;
1562
+ let remainingUsd = budget?.totalUsd;
1563
+ for (const analyst of selected) {
1564
+ const t0 = Date.now();
1565
+ const input = this.routeInput(analyst, inputs);
1566
+ if (input.kind === "missing") {
1567
+ const summary = {
1568
+ analyst_id: analyst.id,
1569
+ status: "skipped",
1570
+ reason: `missing input of kind '${analyst.inputKind}'`,
1571
+ findings_count: 0,
1572
+ latency_ms: 0,
1573
+ cost_usd: 0
1574
+ };
1575
+ summaries.push(summary);
1576
+ log(`[analyst] skip ${analyst.id} \u2014 missing input`, { runId, kind: analyst.inputKind });
1577
+ await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId });
1578
+ continue;
1579
+ }
1580
+ const perBudget = allocateBudget(budget, {
1581
+ analyst,
1582
+ remainingUsd,
1583
+ runningCount: selected.length
1584
+ });
1585
+ const ctx = {
1586
+ runId,
1587
+ correlationId,
1588
+ deadlineMs,
1589
+ budgetUsd: perBudget,
1590
+ chat: this.options.chat,
1591
+ tags: runOpts.tags,
1592
+ log: (msg, fields) => log(`[${analyst.id}] ${msg}`, { runId, correlationId, ...fields }),
1593
+ signal: runOpts.signal,
1594
+ priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
1595
+ };
1596
+ await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
1597
+ try {
1598
+ const findings = await analyst.analyze(input.value, ctx);
1599
+ const latency = Date.now() - t0;
1600
+ const cost = sumFindingCost(findings);
1601
+ totalCost += cost;
1602
+ if (typeof remainingUsd === "number") remainingUsd = Math.max(0, remainingUsd - cost);
1603
+ allFindings.push(...findings);
1604
+ const summary = {
1605
+ analyst_id: analyst.id,
1606
+ status: "ok",
1607
+ findings_count: findings.length,
1608
+ latency_ms: latency,
1609
+ cost_usd: cost
1610
+ };
1611
+ summaries.push(summary);
1612
+ log(`[analyst] ok ${analyst.id}`, {
1613
+ runId,
1614
+ findings: findings.length,
1615
+ latency_ms: latency,
1616
+ cost_usd: cost
1617
+ });
1618
+ await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId });
1619
+ } catch (err) {
1620
+ const latency = Date.now() - t0;
1621
+ const e = err instanceof Error ? err : new Error(String(err));
1622
+ const hookFindings = await hooks.onError?.({ analyst, error: e, runId }) ?? [];
1623
+ if (hookFindings.length) allFindings.push(...hookFindings);
1624
+ const summary = {
1625
+ analyst_id: analyst.id,
1626
+ status: "failed",
1627
+ findings_count: hookFindings.length,
1628
+ latency_ms: latency,
1629
+ cost_usd: 0,
1630
+ error: { class: e.constructor.name, message: e.message }
1631
+ };
1632
+ summaries.push(summary);
1633
+ log(`[analyst] FAIL ${analyst.id}`, {
1634
+ runId,
1635
+ error_class: e.constructor.name,
1636
+ error: e.message
1637
+ });
1638
+ await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId });
1639
+ }
1640
+ }
1641
+ const result = {
1642
+ run_id: runId,
1643
+ correlation_id: correlationId,
1644
+ started_at: startedAt,
1645
+ ended_at: (/* @__PURE__ */ new Date()).toISOString(),
1646
+ findings: allFindings,
1647
+ per_analyst: summaries,
1648
+ total_cost_usd: totalCost
1649
+ };
1650
+ await hooks.onComplete?.({ result });
1651
+ return result;
1652
+ }
1653
+ selectAnalysts(opts) {
1654
+ let candidates = Array.from(this.analysts.values());
1655
+ if (opts.only?.length) {
1656
+ const only = new Set(opts.only);
1657
+ candidates = candidates.filter((a) => only.has(a.id));
1658
+ }
1659
+ if (opts.skip?.length) {
1660
+ const skip = new Set(opts.skip);
1661
+ candidates = candidates.filter((a) => !skip.has(a.id));
1662
+ }
1663
+ return candidates;
1664
+ }
1665
+ routeInput(analyst, inputs) {
1666
+ switch (analyst.inputKind) {
1667
+ case "trace-store":
1668
+ return inputs.traceStore ? { kind: "present", value: inputs.traceStore } : { kind: "missing" };
1669
+ case "artifact-dir":
1670
+ return inputs.artifactDir ? { kind: "present", value: inputs.artifactDir } : { kind: "missing" };
1671
+ case "run-record":
1672
+ return inputs.runRecord ? { kind: "present", value: inputs.runRecord } : { kind: "missing" };
1673
+ case "judge-input":
1674
+ return inputs.judgeInput ? { kind: "present", value: inputs.judgeInput } : { kind: "missing" };
1675
+ case "custom": {
1676
+ const v = inputs.custom?.[analyst.id];
1677
+ return v !== void 0 ? { kind: "present", value: v } : { kind: "missing" };
1678
+ }
1679
+ }
1680
+ }
1681
+ };
1682
+ function allocateBudget(policy, args) {
1683
+ if (!policy) return void 0;
1684
+ if (policy.allocate) {
1685
+ return policy.allocate({
1686
+ analyst: args.analyst,
1687
+ totalUsd: policy.totalUsd,
1688
+ remainingUsd: args.remainingUsd,
1689
+ runningCount: args.runningCount
1690
+ });
1691
+ }
1692
+ if (policy.totalUsd == null) return void 0;
1693
+ if (policy.weights) {
1694
+ const w = policy.weights[args.analyst.id] ?? 1;
1695
+ const totalWeight = Math.max(1, args.runningCount);
1696
+ return policy.totalUsd * w / totalWeight;
1697
+ }
1698
+ return policy.totalUsd / Math.max(1, args.runningCount);
1699
+ }
1700
+ function sumFindingCost(findings) {
1701
+ let sum2 = 0;
1702
+ for (const f of findings) {
1703
+ const c = f.metadata?.cost_usd;
1704
+ if (typeof c === "number" && Number.isFinite(c)) sum2 += c;
1705
+ }
1706
+ return sum2;
1707
+ }
1708
+ function selectPriorFindings(source, analystId) {
1709
+ if (!source) return void 0;
1710
+ if (Array.isArray(source)) {
1711
+ const own2 = source.filter((f) => f.analyst_id === analystId);
1712
+ return own2.length > 0 ? own2 : void 0;
1713
+ }
1714
+ const record = source;
1715
+ const own = record[analystId] ?? [];
1716
+ const wildcard = record["*"] ?? [];
1717
+ const merged = [...own, ...wildcard];
1718
+ return merged.length > 0 ? merged : void 0;
1719
+ }
1720
+
232
1721
  // src/auto-pr.ts
233
1722
  async function proposeAutomatedPullRequest(client, input) {
234
1723
  validate(input);
@@ -3135,154 +4624,6 @@ var FileSystemExperimentStore = class {
3135
4624
  }
3136
4625
  };
3137
4626
 
3138
- // src/run-score.ts
3139
- var DEFAULT_RUN_SCORE_WEIGHTS = {
3140
- success: 4,
3141
- goalProgress: 2,
3142
- repoGroundedness: 1.5,
3143
- driftPenalty: -1.5,
3144
- toolUseQuality: 1,
3145
- patchQuality: 1.25,
3146
- testReality: 1.5,
3147
- finalGate: 3,
3148
- reviewerBlockers: -2,
3149
- costUsd: -0.2,
3150
- wallSeconds: -0.1
3151
- };
3152
- function aggregateRunScore(score, weights = {}) {
3153
- const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
3154
- return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, finiteOrZero(score.costUsd)) + w.wallSeconds * Math.max(0, finiteOrZero(score.wallSeconds) / 60);
3155
- }
3156
- function clamp01(value) {
3157
- if (!Number.isFinite(value)) return 0;
3158
- return Math.max(0, Math.min(1, value));
3159
- }
3160
- function finiteOrZero(value) {
3161
- return Number.isFinite(value) ? value : 0;
3162
- }
3163
-
3164
- // src/run-critic.ts
3165
- var DEFAULT_DRIFT_PATTERNS = [
3166
- /https?:\/\//i,
3167
- /\btitle:\s/i,
3168
- /\bsummary:\s/i,
3169
- /\burl:\s/i,
3170
- /\bnpm package usage\b/i,
3171
- /\bnews\b/i
3172
- ];
3173
- var RunCritic = class {
3174
- weights;
3175
- driftPatterns;
3176
- constructor(options = {}) {
3177
- this.weights = options.weights;
3178
- this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
3179
- }
3180
- async score(store, runId) {
3181
- const run = await store.getRun(runId);
3182
- if (!run) throw new NotFoundError(`run ${runId} not found`);
3183
- const [spans, events, artifacts, budget] = await Promise.all([
3184
- store.spans({ runId }),
3185
- store.events({ runId }),
3186
- store.artifacts(runId),
3187
- store.budget(runId)
3188
- ]);
3189
- return this.scoreTrace({ run, spans, events, artifacts, budget });
3190
- }
3191
- scoreTrace(trace) {
3192
- const notes = [];
3193
- const llmSpans2 = trace.spans.filter(
3194
- (s) => s.kind === "llm"
3195
- );
3196
- const toolSpans2 = trace.spans.filter(
3197
- (s) => s.kind === "tool"
3198
- );
3199
- const judgeSpans2 = trace.spans.filter(
3200
- (s) => s.kind === "judge"
3201
- );
3202
- const sandboxSpans = trace.spans.filter(
3203
- (s) => s.kind === "sandbox"
3204
- );
3205
- const finalGateSpans = judgeSpans2.filter(
3206
- (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
3207
- );
3208
- const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
3209
- if (!success) notes.push("run did not complete with pass=true");
3210
- const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
3211
- const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
3212
- trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
3213
- ) : void 0;
3214
- const goalProgress = outcomeScore ?? judgeAverage ?? success;
3215
- const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
3216
- const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
3217
- if (toolSpans2.length === 0) notes.push("no tool spans recorded");
3218
- const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
3219
- const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
3220
- if (!patchQuality) notes.push("no artifact or edit evidence recorded");
3221
- const sandboxTests = sandboxSpans.filter(
3222
- (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
3223
- );
3224
- const testReality = sandboxTests.length ? sandboxTests.reduce(
3225
- (sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
3226
- 0
3227
- ) / sandboxTests.length : toolSpans2.some(
3228
- (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
3229
- ) ? 0.4 : 0;
3230
- if (!testReality) notes.push("no real test/build evidence recorded");
3231
- const blockerSpans = judgeSpans2.filter((span) => isBlockingJudge(span));
3232
- const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
3233
- const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
3234
- if (finalGateBlockers.length)
3235
- notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
3236
- else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
3237
- const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
3238
- if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
3239
- const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
3240
- const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
3241
- const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
3242
- const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
3243
- if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
3244
- const costUsd = trace.budget.length ? Math.max(
3245
- ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
3246
- 0
3247
- ) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
3248
- const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
3249
- return {
3250
- success,
3251
- goalProgress,
3252
- repoGroundedness,
3253
- driftPenalty,
3254
- toolUseQuality,
3255
- patchQuality,
3256
- testReality,
3257
- finalGate,
3258
- reviewerBlockers,
3259
- costUsd,
3260
- wallSeconds,
3261
- notes
3262
- };
3263
- }
3264
- rank(score) {
3265
- return aggregateRunScore(score, this.weights);
3266
- }
3267
- isDrift(text) {
3268
- return this.driftPatterns.some((pattern) => pattern.test(text));
3269
- }
3270
- };
3271
- function normalizeJudgeScore(score) {
3272
- return score > 1 ? clamp01(score / 10) : clamp01(score);
3273
- }
3274
- function looksRepoGrounded(text) {
3275
- return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(
3276
- text
3277
- );
3278
- }
3279
- function isBlockingJudge(span) {
3280
- return span.attributes?.blocking === true || span.attributes?.verdict === "BLOCKING" || positiveNumber(span.attributes?.blockingFindings) || positiveNumber(span.attributes?.highFindings) || span.score <= 2;
3281
- }
3282
- function positiveNumber(value) {
3283
- return typeof value === "number" && value > 0;
3284
- }
3285
-
3286
4627
  // src/harness-optimizer.ts
3287
4628
  var DEFAULT_HARNESS_OBJECTIVES = [
3288
4629
  { name: "aggregate", direction: "maximize", value: (r) => r.aggregateMean },
@@ -3952,7 +5293,7 @@ function assertNonNegative(n, name) {
3952
5293
  }
3953
5294
 
3954
5295
  // src/muffled-gate-scanner.ts
3955
- import { existsSync, readdirSync, readFileSync, statSync } from "fs";
5296
+ import { existsSync as existsSync3, readdirSync, readFileSync as readFileSync2, statSync } from "fs";
3956
5297
  import { join } from "path";
3957
5298
  function codeOf(line) {
3958
5299
  return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
@@ -4066,7 +5407,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
4066
5407
  const matches = [];
4067
5408
  const walk = (rel) => {
4068
5409
  const abs = join(repoRoot, rel);
4069
- if (!existsSync(abs)) return;
5410
+ if (!existsSync3(abs)) return;
4070
5411
  for (const entry of readdirSync(abs)) {
4071
5412
  const sub = join(rel, entry);
4072
5413
  const subAbs = join(repoRoot, sub);
@@ -4085,7 +5426,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
4085
5426
  continue;
4086
5427
  let text;
4087
5428
  try {
4088
- text = readFileSync(subAbs, "utf8");
5429
+ text = readFileSync2(subAbs, "utf8");
4089
5430
  } catch {
4090
5431
  continue;
4091
5432
  }
@@ -4101,8 +5442,8 @@ function scanForMuffledGates(opts) {
4101
5442
  const scanned = /* @__PURE__ */ new Set();
4102
5443
  for (const file of opts.scanFiles) {
4103
5444
  const abs = join(opts.repoRoot, file);
4104
- if (!existsSync(abs)) continue;
4105
- const text = readFileSync(abs, "utf8");
5445
+ if (!existsSync3(abs)) continue;
5446
+ const text = readFileSync2(abs, "utf8");
4106
5447
  for (const find of opts.finders) findings.push(...find(file, text));
4107
5448
  scanned.add(file);
4108
5449
  }
@@ -4116,8 +5457,8 @@ function scanForMuffledGates(opts) {
4116
5457
  for (const file of importers) {
4117
5458
  if (scanned.has(file)) continue;
4118
5459
  const abs = join(opts.repoRoot, file);
4119
- if (!existsSync(abs)) continue;
4120
- const text = readFileSync(abs, "utf8");
5460
+ if (!existsSync3(abs)) continue;
5461
+ const text = readFileSync2(abs, "utf8");
4121
5462
  for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
4122
5463
  }
4123
5464
  }
@@ -6014,7 +7355,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
6014
7355
 
6015
7356
  // src/command-runner.ts
6016
7357
  import { spawnSync } from "child_process";
6017
- import { existsSync as existsSync2, readdirSync as readdirSync2, readFileSync as readFileSync2, statSync as statSync2 } from "fs";
7358
+ import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
6018
7359
  import { join as join2 } from "path";
6019
7360
  var localCommandRunner = {
6020
7361
  name: "local",
@@ -6043,11 +7384,11 @@ var localCommandRunner = {
6043
7384
  return r.status === 0 && (r.stdout ?? "").trim().length > 0;
6044
7385
  },
6045
7386
  async fileExists(path) {
6046
- return existsSync2(path);
7387
+ return existsSync4(path);
6047
7388
  },
6048
7389
  async readFile(path) {
6049
7390
  try {
6050
- return readFileSync2(path, "utf8");
7391
+ return readFileSync3(path, "utf8");
6051
7392
  } catch {
6052
7393
  return null;
6053
7394
  }
@@ -6401,11 +7742,11 @@ function flowLayer(input) {
6401
7742
 
6402
7743
  // src/intent-match-judge.ts
6403
7744
  var INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
6404
- var DEFAULT_MODEL = "claude-sonnet-4-6";
6405
- var DEFAULT_TIMEOUT = 9e4;
6406
- var DEFAULT_MAX_SOURCE = 25e3;
6407
- var DEFAULT_MAX_PER_FILE = 12e3;
6408
- var DEFAULT_MAX_HTML = 2e4;
7745
+ var DEFAULT_MODEL2 = "claude-sonnet-4-6";
7746
+ var DEFAULT_TIMEOUT2 = 9e4;
7747
+ var DEFAULT_MAX_SOURCE2 = 25e3;
7748
+ var DEFAULT_MAX_PER_FILE2 = 12e3;
7749
+ var DEFAULT_MAX_HTML2 = 2e4;
6409
7750
  var INTENT_SCHEMA = {
6410
7751
  type: "object",
6411
7752
  additionalProperties: false,
@@ -6415,12 +7756,12 @@ var INTENT_SCHEMA = {
6415
7756
  evidence: { type: "string", minLength: 10, maxLength: 400 }
6416
7757
  }
6417
7758
  };
6418
- function truncate(body, cap, label) {
7759
+ function truncate2(body, cap, label) {
6419
7760
  if (body.length <= cap) return body;
6420
7761
  return `${body.slice(0, cap)}
6421
7762
  \u2026 [truncated ${body.length - cap} chars of ${label}]`;
6422
7763
  }
6423
- function buildPrompt(input, opts) {
7764
+ function buildPrompt2(input, opts) {
6424
7765
  const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
6425
7766
  ${f.content}`).join("\n\n");
6426
7767
  const html = input.servedHtml ?? "";
@@ -6439,10 +7780,10 @@ ${input.artifactLabel ? `ARTIFACT METADATA:
6439
7780
  description: ${input.artifactDescription ?? ""}
6440
7781
 
6441
7782
  ` : ""}${html ? `SERVED HTML (what the preview returns):
6442
- ${truncate(html, opts.maxHtmlChars, "HTML")}
7783
+ ${truncate2(html, opts.maxHtmlChars, "HTML")}
6443
7784
 
6444
7785
  ` : ""}SOURCE FILES (the agent's workdir):
6445
- ${truncate(sourceBlob, opts.maxSourceChars, "source")}
7786
+ ${truncate2(sourceBlob, opts.maxSourceChars, "source")}
6446
7787
 
6447
7788
  Score 0\u20131:
6448
7789
  1.0 \u2014 unmistakably the right app. Even with bugs, gaps, or missing
@@ -6470,11 +7811,11 @@ Return STRICT JSON. No prose outside.`;
6470
7811
  async function runIntentMatchJudge(input, options = {}) {
6471
7812
  const start = Date.now();
6472
7813
  const opts = {
6473
- model: options.model ?? DEFAULT_MODEL,
6474
- timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
6475
- maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
6476
- maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
6477
- maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
7814
+ model: options.model ?? DEFAULT_MODEL2,
7815
+ timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT2,
7816
+ maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE2,
7817
+ maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE2,
7818
+ maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML2,
6478
7819
  llm: options.llm ?? {}
6479
7820
  };
6480
7821
  if (input.sourceFiles.length === 0 && !input.servedHtml) {
@@ -6498,7 +7839,7 @@ async function runIntentMatchJudge(input, options = {}) {
6498
7839
  role: "system",
6499
7840
  content: "You are a holistic code reviewer answering one question: did the agent build the right app for the user. Return strict JSON. No prose outside."
6500
7841
  },
6501
- { role: "user", content: buildPrompt(input, opts) }
7842
+ { role: "user", content: buildPrompt2(input, opts) }
6502
7843
  ],
6503
7844
  jsonSchema: { name: "intent_match_judge", schema: INTENT_SCHEMA },
6504
7845
  temperature: 0,
@@ -6958,72 +8299,29 @@ function multiToolchainLayer(config) {
6958
8299
  {
6959
8300
  severity: "major",
6960
8301
  layer: config.name,
6961
- message: err instanceof Error ? err.message : String(err),
6962
- detail: { adapter: adapterName }
6963
- }
6964
- ],
6965
- reason: err instanceof Error ? err.message : String(err)
6966
- }
6967
- };
6968
- }
6969
- };
6970
- const results = [];
6971
- for (let i = 0; i < config.adapters.length; i += maxParallel) {
6972
- const chunk = config.adapters.slice(i, i + maxParallel);
6973
- const chunkResults = await Promise.all(chunk.map(runOne));
6974
- results.push(...chunkResults);
6975
- }
6976
- return mergeLayerResults(config.name, results);
6977
- }
6978
- };
6979
- }
6980
-
6981
- // src/reference-replay.ts
6982
- import { appendFileSync, existsSync as existsSync3, mkdirSync, readFileSync as readFileSync3 } from "fs";
6983
- import { dirname } from "path";
6984
-
6985
- // src/concurrency.ts
6986
- var Mutex = class {
6987
- locked = false;
6988
- waiters = [];
6989
- async acquire() {
6990
- if (!this.locked) {
6991
- this.locked = true;
6992
- return () => this.release();
6993
- }
6994
- return new Promise((resolve) => {
6995
- this.waiters.push(() => {
6996
- resolve(() => this.release());
6997
- });
6998
- });
6999
- }
7000
- release() {
7001
- const next = this.waiters.shift();
7002
- if (next) {
7003
- next();
7004
- } else {
7005
- this.locked = false;
7006
- }
7007
- }
7008
- async runExclusive(fn) {
7009
- const release = await this.acquire();
7010
- try {
7011
- return await fn();
7012
- } finally {
7013
- release();
8302
+ message: err instanceof Error ? err.message : String(err),
8303
+ detail: { adapter: adapterName }
8304
+ }
8305
+ ],
8306
+ reason: err instanceof Error ? err.message : String(err)
8307
+ }
8308
+ };
8309
+ }
8310
+ };
8311
+ const results = [];
8312
+ for (let i = 0; i < config.adapters.length; i += maxParallel) {
8313
+ const chunk = config.adapters.slice(i, i + maxParallel);
8314
+ const chunkResults = await Promise.all(chunk.map(runOne));
8315
+ results.push(...chunkResults);
8316
+ }
8317
+ return mergeLayerResults(config.name, results);
7014
8318
  }
7015
- }
7016
- /** True iff someone holds the lock right now. Diagnostics only. */
7017
- get isLocked() {
7018
- return this.locked;
7019
- }
7020
- /** Pending waiter count. Diagnostics only. */
7021
- get pending() {
7022
- return this.waiters.length;
7023
- }
7024
- };
8319
+ };
8320
+ }
7025
8321
 
7026
8322
  // src/reference-replay.ts
8323
+ import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
8324
+ import { dirname as dirname2 } from "path";
7027
8325
  var DEFAULT_MATCH_THRESHOLD = 0.55;
7028
8326
  var ALL_SPLITS = ["train", "dev", "test", "holdout"];
7029
8327
  async function runReferenceReplay(cases, options) {
@@ -7141,14 +8439,14 @@ function jsonlReferenceReplayStore(path) {
7141
8439
  return {
7142
8440
  async save(run) {
7143
8441
  await lock.runExclusive(() => {
7144
- mkdirSync(dirname(path), { recursive: true });
7145
- appendFileSync(path, `${JSON.stringify(run)}
8442
+ mkdirSync2(dirname2(path), { recursive: true });
8443
+ appendFileSync2(path, `${JSON.stringify(run)}
7146
8444
  `);
7147
8445
  });
7148
8446
  },
7149
8447
  async list() {
7150
8448
  return lock.runExclusive(() => {
7151
- if (!existsSync3(path)) return [];
8449
+ if (!existsSync5(path)) return [];
7152
8450
  return readJsonl(path);
7153
8451
  });
7154
8452
  }
@@ -7491,7 +8789,7 @@ function throwIfAborted(signal) {
7491
8789
  throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
7492
8790
  }
7493
8791
  function readJsonl(path) {
7494
- const raw = readFileSync3(path, "utf8");
8792
+ const raw = readFileSync4(path, "utf8");
7495
8793
  const out = [];
7496
8794
  for (const line of raw.split("\n")) {
7497
8795
  const trimmed = line.trim();
@@ -7646,202 +8944,6 @@ function createDefaultReviewer(options) {
7646
8944
  };
7647
8945
  }
7648
8946
 
7649
- // src/semantic-concept-judge.ts
7650
- var DEFAULT_COMPLEXITY_WEIGHTS = {
7651
- render: 1,
7652
- integrate: 2,
7653
- compute: 2.5
7654
- };
7655
- var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
7656
- var DEFAULT_MAX_SOURCE2 = 45e3;
7657
- var DEFAULT_MAX_HTML2 = 3e4;
7658
- var DEFAULT_MAX_PER_FILE2 = 2e4;
7659
- var DEFAULT_TIMEOUT2 = 18e4;
7660
- var DEFAULT_MODEL2 = "claude-sonnet-4-6";
7661
- var SEMANTIC_SCHEMA = {
7662
- type: "object",
7663
- additionalProperties: false,
7664
- required: ["summary", "concepts"],
7665
- properties: {
7666
- summary: { type: "string", minLength: 20, maxLength: 600 },
7667
- concepts: {
7668
- type: "array",
7669
- minItems: 1,
7670
- items: {
7671
- type: "object",
7672
- additionalProperties: false,
7673
- required: ["concept", "present", "score", "evidence", "severity"],
7674
- properties: {
7675
- concept: { type: "string", minLength: 1, maxLength: 120 },
7676
- present: { type: "boolean" },
7677
- score: { type: "number", minimum: 0, maximum: 10 },
7678
- evidence: { type: "string", minLength: 5, maxLength: 400 },
7679
- severity: { type: "string", enum: ["critical", "major", "minor", "info"] }
7680
- }
7681
- }
7682
- }
7683
- }
7684
- };
7685
- function truncate2(body, cap, label) {
7686
- if (body.length <= cap) return body;
7687
- return `${body.slice(0, cap)}
7688
- \u2026 [truncated ${body.length - cap} chars of ${label}]`;
7689
- }
7690
- function buildPrompt2(input, opts) {
7691
- const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
7692
- ${f.content}`).join("\n\n");
7693
- const html = input.servedHtml ?? "";
7694
- return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.
7695
-
7696
- You MUST distinguish:
7697
- (a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
7698
- (b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
7699
- (c) ABSENT (concept nowhere).
7700
-
7701
- A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.
7702
-
7703
- USER REQUEST (what the agent was asked to build):
7704
- ${input.userRequest}
7705
-
7706
- ${input.artifactLabel ? `ARTIFACT METADATA:
7707
- name: ${input.artifactLabel}
7708
- description: ${input.artifactDescription ?? ""}
7709
-
7710
- ` : ""}EXPECTED CONCEPTS (each must be graded independently):
7711
- ${input.expectedConcepts.map(
7712
- (c, i) => ` ${i + 1}. "${c.name}"${c.keywords?.length ? ` \u2014 hints: [${c.keywords.slice(0, 6).join(" | ")}]` : ""}`
7713
- ).join("\n")}
7714
-
7715
- ${html ? `SERVED HTML (what the preview returns when hit):
7716
- ${truncate2(html, opts.maxHtmlChars, "HTML")}
7717
-
7718
- ` : ""}SOURCE FILES (the agent's workdir):
7719
- ${truncate2(sourceBlob, opts.maxSourceChars, "source")}
7720
-
7721
- For EACH concept, return:
7722
- - concept: the concept name as given (match exactly)
7723
- - present: boolean \u2014 does a working implementation exist?
7724
- - score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
7725
- - evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
7726
- - severity:
7727
- "info" when present: true AND score >= 7
7728
- "minor" when present: true AND 4 <= score < 7
7729
- "major" when present: false OR score < 4
7730
- "critical" when the concept is not only absent but a core user flow depends on it
7731
-
7732
- Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.
7733
-
7734
- BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.
7735
-
7736
- Return STRICT JSON. No prose outside the JSON.`;
7737
- }
7738
- async function runSemanticConceptJudge(input, options = {}) {
7739
- const start = Date.now();
7740
- const totalCount = input.expectedConcepts.length;
7741
- if (totalCount === 0) {
7742
- return {
7743
- kind: "semantic-concept",
7744
- version: SEMANTIC_CONCEPT_JUDGE_VERSION,
7745
- score: 0,
7746
- presentCount: 0,
7747
- totalCount: 0,
7748
- findings: [],
7749
- summary: "no expected concepts declared",
7750
- durationMs: 0,
7751
- costUsd: null,
7752
- available: false,
7753
- error: "no expected concepts declared"
7754
- };
7755
- }
7756
- const opts = {
7757
- model: options.model ?? DEFAULT_MODEL2,
7758
- timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT2,
7759
- maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE2,
7760
- maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE2,
7761
- maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML2,
7762
- llm: options.llm ?? {},
7763
- weightConcepts: options.weightConcepts ?? "mean",
7764
- complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
7765
- };
7766
- const weightForConcept = (spec) => {
7767
- if (opts.weightConcepts === "mean") return 1;
7768
- if (spec.weight != null) return spec.weight;
7769
- if (opts.weightConcepts === "complexity") {
7770
- return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
7771
- }
7772
- return 1;
7773
- };
7774
- const weightByName = new Map(
7775
- input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
7776
- );
7777
- try {
7778
- const { value, result } = await callLlmJson(
7779
- {
7780
- model: opts.model,
7781
- messages: [
7782
- {
7783
- role: "system",
7784
- content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
7785
- },
7786
- { role: "user", content: buildPrompt2(input, opts) }
7787
- ],
7788
- jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
7789
- temperature: 0,
7790
- timeoutMs: opts.timeoutMs
7791
- },
7792
- opts.llm
7793
- );
7794
- if (!value?.concepts || !Array.isArray(value.concepts)) {
7795
- throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
7796
- }
7797
- const findings = value.concepts.map((c) => ({
7798
- concept: String(c.concept),
7799
- present: Boolean(c.present),
7800
- score: Math.max(0, Math.min(10, Number(c.score ?? 0))),
7801
- evidence: String(c.evidence ?? ""),
7802
- severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
7803
- }));
7804
- const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
7805
- let weightSum = 0;
7806
- let weightedScoreSum = 0;
7807
- for (const f of findings) {
7808
- const w = weightByName.get(f.concept) ?? 1;
7809
- weightSum += w;
7810
- weightedScoreSum += w * f.score;
7811
- }
7812
- const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
7813
- return {
7814
- kind: "semantic-concept",
7815
- version: SEMANTIC_CONCEPT_JUDGE_VERSION,
7816
- score: Number((scoreAvg / 10).toFixed(3)),
7817
- presentCount,
7818
- totalCount,
7819
- findings,
7820
- summary: String(value.summary ?? ""),
7821
- durationMs: Date.now() - start,
7822
- costUsd: result.costUsd ?? null,
7823
- available: true
7824
- };
7825
- } catch (err) {
7826
- return {
7827
- kind: "semantic-concept",
7828
- version: SEMANTIC_CONCEPT_JUDGE_VERSION,
7829
- score: 0,
7830
- presentCount: 0,
7831
- totalCount,
7832
- findings: [],
7833
- summary: "",
7834
- durationMs: Date.now() - start,
7835
- costUsd: null,
7836
- available: false,
7837
- error: err instanceof Error ? err.message : String(err)
7838
- };
7839
- }
7840
- }
7841
- function createSemanticConceptJudge(options = {}) {
7842
- return (input) => runSemanticConceptJudge(input, options);
7843
- }
7844
-
7845
8947
  // src/canary.ts
7846
8948
  function runCanaries(runs, opts = {}) {
7847
8949
  const alerts = [
@@ -8040,8 +9142,8 @@ function chiSquareCritical(df, alpha) {
8040
9142
  if (TABLE[df]) return TABLE[df][idx];
8041
9143
  if (df > 30) {
8042
9144
  const zMap = { 0: 1.282, 1: 1.645, 2: 1.96, 3: 2.326 };
8043
- const z = zMap[idx] ?? 1.96;
8044
- const term = 1 - 2 / (9 * df) + z * Math.sqrt(2 / (9 * df));
9145
+ const z2 = zMap[idx] ?? 1.96;
9146
+ const term = 1 - 2 / (9 * df) + z2 * Math.sqrt(2 / (9 * df));
8045
9147
  return df * term ** 3;
8046
9148
  }
8047
9149
  const keys = Object.keys(TABLE).map((k) => Number(k)).sort((a, b) => a - b);
@@ -8255,44 +9357,8 @@ async function discoverPersonas(dir, opts = {}) {
8255
9357
  }
8256
9358
 
8257
9359
  // src/evolution-telemetry.ts
8258
- import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
9360
+ import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5, writeFileSync } from "fs";
8259
9361
  import { dirname as dirname3 } from "path";
8260
-
8261
- // src/locked-jsonl-appender.ts
8262
- import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2 } from "fs";
8263
- import { dirname as dirname2 } from "path";
8264
- var mutexes = /* @__PURE__ */ new Map();
8265
- function getMutex(path) {
8266
- let m = mutexes.get(path);
8267
- if (!m) {
8268
- m = new Mutex();
8269
- mutexes.set(path, m);
8270
- }
8271
- return m;
8272
- }
8273
- var LockedJsonlAppender = class {
8274
- constructor(path) {
8275
- this.path = path;
8276
- this.mutex = getMutex(path);
8277
- if (!existsSync4(dirname2(path))) {
8278
- mkdirSync2(dirname2(path), { recursive: true });
8279
- }
8280
- }
8281
- path;
8282
- mutex;
8283
- async append(entry) {
8284
- const line = `${JSON.stringify(entry)}
8285
- `;
8286
- await this.mutex.runExclusive(() => {
8287
- appendFileSync2(this.path, line);
8288
- });
8289
- }
8290
- };
8291
- function resetLockedAppendersForTesting() {
8292
- mutexes.clear();
8293
- }
8294
-
8295
- // src/evolution-telemetry.ts
8296
9362
  var MutationTelemetry = class {
8297
9363
  appender;
8298
9364
  constructor(path) {
@@ -8322,16 +9388,16 @@ var LineageRecorder = class {
8322
9388
  this.snapshotPath = `${path}.snapshot`;
8323
9389
  this.kindOf = kindOf ?? defaultKindOf;
8324
9390
  mkdirSync3(dirname3(path), { recursive: true });
8325
- if (existsSync5(this.snapshotPath)) {
9391
+ if (existsSync6(this.snapshotPath)) {
8326
9392
  try {
8327
- const parsed = JSON.parse(readFileSync4(this.snapshotPath, "utf-8"));
9393
+ const parsed = JSON.parse(readFileSync5(this.snapshotPath, "utf-8"));
8328
9394
  for (const n of parsed) this.nodes.set(n.id, n);
8329
9395
  } catch {
8330
9396
  }
8331
9397
  }
8332
- if (existsSync5(path)) {
9398
+ if (existsSync6(path)) {
8333
9399
  try {
8334
- for (const line of readFileSync4(path, "utf-8").split("\n")) {
9400
+ for (const line of readFileSync5(path, "utf-8").split("\n")) {
8335
9401
  if (!line.trim()) continue;
8336
9402
  try {
8337
9403
  const entry = JSON.parse(line);
@@ -8343,9 +9409,9 @@ var LineageRecorder = class {
8343
9409
  } catch {
8344
9410
  }
8345
9411
  }
8346
- if (existsSync5(path) && this.nodes.size === 0) {
9412
+ if (existsSync6(path) && this.nodes.size === 0) {
8347
9413
  try {
8348
- const raw = readFileSync4(path, "utf-8").trim();
9414
+ const raw = readFileSync5(path, "utf-8").trim();
8349
9415
  if (raw.startsWith("[")) {
8350
9416
  const parsed = JSON.parse(raw);
8351
9417
  for (const n of parsed) this.nodes.set(n.id, n);
@@ -8359,8 +9425,8 @@ var LineageRecorder = class {
8359
9425
  const prev = this.nodes.get(node.id);
8360
9426
  this.nodes.set(node.id, { ...prev, ...node });
8361
9427
  try {
8362
- if (existsSync5(this.path)) {
8363
- const head = readFileSync4(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
9428
+ if (existsSync6(this.path)) {
9429
+ const head = readFileSync5(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
8364
9430
  if (head === "[") {
8365
9431
  writeFileSync(this.path, "");
8366
9432
  }
@@ -8426,9 +9492,9 @@ var CostLedger = class {
8426
9492
  mutex = new Mutex();
8427
9493
  constructor(path) {
8428
9494
  this.path = path;
8429
- if (existsSync5(path)) {
9495
+ if (existsSync6(path)) {
8430
9496
  try {
8431
- const loaded = JSON.parse(readFileSync4(path, "utf-8"));
9497
+ const loaded = JSON.parse(readFileSync5(path, "utf-8"));
8432
9498
  for (const k of Object.keys(this.totals)) {
8433
9499
  if (k === "byGeneration") {
8434
9500
  if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -8597,7 +9663,7 @@ function precision(goldens, candidates, options = {}) {
8597
9663
  }
8598
9664
 
8599
9665
  // src/jsonl-trial-cache.ts
8600
- import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
9666
+ import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6 } from "fs";
8601
9667
  import { dirname as dirname4 } from "path";
8602
9668
  var JsonlTrialCache = class {
8603
9669
  map = /* @__PURE__ */ new Map();
@@ -8605,8 +9671,8 @@ var JsonlTrialCache = class {
8605
9671
  appender;
8606
9672
  constructor(path) {
8607
9673
  this.path = path;
8608
- if (existsSync6(path)) {
8609
- for (const line of readFileSync5(path, "utf-8").split("\n")) {
9674
+ if (existsSync7(path)) {
9675
+ for (const line of readFileSync6(path, "utf-8").split("\n")) {
8610
9676
  if (!line.trim()) continue;
8611
9677
  try {
8612
9678
  const entry = JSON.parse(line);
@@ -8994,8 +10060,10 @@ function aggregateTrialsByMode(trials, opts) {
8994
10060
  };
8995
10061
  }
8996
10062
  export {
10063
+ ANALYST_SEVERITIES,
8997
10064
  AgentDriver,
8998
10065
  AgentEvalError,
10066
+ AnalystRegistry,
8999
10067
  AxGepaSteeringOptimizer,
9000
10068
  BENCHMARK_SPLIT_SEED,
9001
10069
  BenchmarkRunner,
@@ -9019,19 +10087,23 @@ export {
9019
10087
  DEFAULT_RED_TEAM_CORPUS,
9020
10088
  DEFAULT_RUN_SCORE_WEIGHTS,
9021
10089
  DEFAULT_SEVERITY_WEIGHTS,
10090
+ DEFAULT_TRACE_ANALYST_KINDS,
9022
10091
  Dataset,
9023
10092
  DockerSandboxDriver,
9024
10093
  DualAgentBench,
9025
10094
  ERROR_COUNT_PATTERNS,
9026
10095
  ExperimentTracker,
9027
10096
  FAILURE_CLASSES,
10097
+ FAILURE_MODE_KIND_SPEC,
9028
10098
  FileSystemExperimentStore,
9029
10099
  FileSystemFeedbackTrajectoryStore,
9030
10100
  FileSystemRawProviderSink,
9031
10101
  FileSystemTraceStore,
10102
+ FindingsStore,
9032
10103
  HeldOutGate,
9033
10104
  HoldoutAuditor,
9034
10105
  HoldoutLockedError,
10106
+ IMPROVEMENT_KIND_SPEC,
9035
10107
  INTENT_MATCH_JUDGE_VERSION,
9036
10108
  InMemoryExperimentStore,
9037
10109
  InMemoryFeedbackTrajectoryStore,
@@ -9042,6 +10114,8 @@ export {
9042
10114
  JsonlTrialCache,
9043
10115
  JudgeError,
9044
10116
  JudgeRunner,
10117
+ KNOWLEDGE_GAP_KIND_SPEC,
10118
+ KNOWLEDGE_POISONING_KIND_SPEC,
9045
10119
  LineageRecorder,
9046
10120
  LlmCallError,
9047
10121
  LlmClient,
@@ -9059,8 +10133,10 @@ export {
9059
10133
  PairwiseSteeringOptimizer,
9060
10134
  ProductClient,
9061
10135
  PromptRegistry,
10136
+ RAW_FINDING_SCHEMA_PROMPT,
9062
10137
  REDACTION_VERSION,
9063
10138
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
10139
+ RawAnalystFindingSchema,
9064
10140
  ReplayCache,
9065
10141
  ReplayCacheMissError,
9066
10142
  ReplayError,
@@ -9102,6 +10178,7 @@ export {
9102
10178
  bootstrapCi,
9103
10179
  buildReflectionPrompt,
9104
10180
  buildReviewerPrompt,
10181
+ buildTraceToolsForGroup,
9105
10182
  buildTrajectory,
9106
10183
  byteLengthRange,
9107
10184
  calibrateJudge,
@@ -9127,6 +10204,7 @@ export {
9127
10204
  compilerJudge,
9128
10205
  composeParsers,
9129
10206
  composeValidators,
10207
+ computeFindingId,
9130
10208
  computeToolUseMetrics,
9131
10209
  confidenceInterval,
9132
10210
  containsAll,
@@ -9137,26 +10215,35 @@ export {
9137
10215
  corpusInterRaterAgreement,
9138
10216
  corpusInterRaterAgreementFromJudgeScores,
9139
10217
  createAntiSlopJudge,
10218
+ createChatClient,
9140
10219
  createCompositeMutator,
9141
10220
  createCustomJudge,
9142
10221
  createDefaultReviewer,
9143
10222
  createDomainExpertJudge,
9144
10223
  createFeedbackTrajectory,
9145
10224
  createIntentMatchJudge,
10225
+ createJudgeAdapter,
9146
10226
  createLlmReviewer,
9147
10227
  createReplayFetch,
10228
+ createRunCriticAdapter,
9148
10229
  createSandboxCodeMutator,
9149
10230
  createSandboxPool,
9150
10231
  createSemanticConceptJudge,
10232
+ createSemanticConceptJudgeAdapter,
10233
+ createTraceAnalystAdapter,
10234
+ createTraceAnalystKind,
10235
+ createVerifierAdapter,
9151
10236
  crossTraceDiff,
9152
10237
  crowdingDistance,
9153
10238
  decideReferenceReplayPromotion,
9154
10239
  decideReferenceReplayRunPromotion,
10240
+ defaultIsMaterial,
9155
10241
  defaultJudges,
9156
10242
  defaultMultiShotObjectives,
9157
10243
  defaultProviderRedactor,
9158
10244
  defaultReferenceReplayMatcher,
9159
10245
  deployGateLayer,
10246
+ diffFindings,
9160
10247
  discoverPersonas,
9161
10248
  distillPlaybook,
9162
10249
  dominates,
@@ -9225,12 +10312,14 @@ export {
9225
10312
  judgeSpans,
9226
10313
  keyPreserved,
9227
10314
  knowledgeReadinessTracePayload,
10315
+ liftSeverity,
9228
10316
  linterJudge,
9229
10317
  llmSpanFromProvider,
9230
10318
  llmSpans,
9231
10319
  loadScorerFromGrader,
9232
10320
  localCommandRunner,
9233
10321
  lowercaseMutator,
10322
+ makeFinding,
9234
10323
  mannWhitneyU,
9235
10324
  matchGoldens,
9236
10325
  mergeLayerResults,
@@ -9250,6 +10339,7 @@ export {
9250
10339
  paretoFrontier,
9251
10340
  paretoFrontierWithCrowding,
9252
10341
  parseFeedbackTrajectoriesJsonl,
10342
+ parseRawFinding,
9253
10343
  parseReflectionResponse,
9254
10344
  parseRunRecordSafe,
9255
10345
  partialCredit,
@@ -9277,6 +10367,7 @@ export {
9277
10367
  renderMarkdownReport,
9278
10368
  renderPlaybookMarkdown,
9279
10369
  renderPreferenceMemoryMarkdown,
10370
+ renderPriorFindings,
9280
10371
  renderReleaseReport,
9281
10372
  renderSteeringText,
9282
10373
  replayFeedbackTrajectories,