npm - @tangle-network/agent-eval - Versions diffs - 0.29.1 → 0.31.0 - Mend

@tangle-network/agent-eval 0.29.1 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80) hide show

package/dist/traces.js CHANGED Viewed

@@ -17,13 +17,23 @@ import {
   TraceNotFoundError,
   analyzeTraces,
   buildTraceAnalystTools,
+  buildTraceInsightContext,
+  buildTraceInsightPrompt,
   createReplayFetch,
+  defaultTraceInsightPanel,
+  describeTraceInsightScope,
+  domainEvidencePattern,
   exportRunAsOtlp,
+  inferDomainKeywords,
   iterateRawCalls,
+  planTraceInsightQuestions,
   redactString,
   redactValue,
-  traceAnalystFunctionGroup
-} from "./chunk-UW4NOOZI.js";
+  scoreTraceInsightReadiness,
+  tokenizeDomainWords,
+  traceAnalystFunctionGroup,
+  traceAnalystOnRunComplete
+} from "./chunk-HIO4UIS5.js";
 import {
   aggregateLlm,
   argHash,
@@ -47,7 +57,7 @@ import {
   RunIntegrityError,
   assertRunCaptured,
   throwIfRunIncomplete
-} from "./chunk-KTGTIOFD.js";
+} from "./chunk-UBPIXOC4.js";
 import {
   FileSystemRawProviderSink,
   InMemoryRawProviderSink,
@@ -60,304 +70,8 @@ import {
   llmSpanFromProvider
 } from "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
-import "./chunk-NG236HPC.js";
+import "./chunk-QYJT52YW.js";
 import "./chunk-PZ5AY32C.js";
-// src/trace-analyst/hook.ts
-var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";
-function traceAnalystOnRunComplete(opts) {
-  return async (ctx) => {
-    if (opts.shouldRun && !opts.shouldRun(ctx)) return;
-    const source = opts.analyze.source;
-    if (source === void 0) {
-      await ctx.store.appendEvent({
-        eventId: `analyst-skip-${ctx.runId}`,
-        runId: ctx.runId,
-        kind: "log",
-        timestamp: Date.now(),
-        payload: { source: "trace_analyst_hook", reason: "no source configured" }
-      });
-      return;
-    }
-    const result = await analyzeTraces({ question: opts.question ?? DEFAULT_QUESTION }, {
-      ...opts.analyze,
-      source
-    });
-    if (opts.save) await opts.save(result, ctx);
-    if (opts.gateOn && !opts.gateOn(result, ctx)) {
-      await ctx.store.appendEvent({
-        eventId: `analyst-gate-${ctx.runId}`,
-        runId: ctx.runId,
-        kind: "log",
-        timestamp: Date.now(),
-        payload: {
-          source: "trace_analyst_hook",
-          reason: "analyst_gate_failed",
-          findings: result.findings
-        }
-      });
-    }
-  };
-}
-// src/trace-analyst/insights.ts
-var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
-  "and",
-  "advanced",
-  "app",
-  "build",
-  "create",
-  "easy",
-  "expert",
-  "extreme",
-  "for",
-  "from",
-  "hard",
-  "implementation",
-  "integrate",
-  "medium",
-  "project",
-  "task",
-  "the",
-  "this",
-  "with",
-  "workflow"
-]);
-function tokenizeDomainWords(value) {
-  return [...value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)].map((match) => match[0].toLowerCase()).filter((word) => !DOMAIN_STOP_WORDS.has(word));
-}
-function inferDomainKeywords(suite) {
-  const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ""}`));
-  const source = [
-    suite.name,
-    suite.collectionId ?? "",
-    ...suite.tasks.flatMap((task) => [
-      task.id,
-      task.name,
-      task.prompt ?? "",
-      task.difficulty ?? "",
-      ...task.tags ?? [],
-      ...task.gaps ?? []
-    ])
-  ].join(" ");
-  const counts = /* @__PURE__ */ new Map();
-  for (const word of tokenizeDomainWords(source)) counts.set(word, (counts.get(word) ?? 0) + 1);
-  return [...counts.entries()].filter(([word, count]) => count >= 2 || suiteWords.has(word)).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).map(([word]) => word).slice(0, 18);
-}
-function domainEvidencePattern(keywords) {
-  const escaped = keywords.filter((keyword) => keyword.length >= 3).map((keyword) => keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
-  return escaped.length > 0 ? new RegExp(`(?<![A-Za-z0-9])(?:${escaped.join("|")})(?![A-Za-z0-9])`, "i") : /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i;
-}
-function describeTraceInsightScope(suite) {
-  const taskLabel = suite.tasks.length === 1 ? "1 implementation task" : `${suite.tasks.length} implementation tasks`;
-  const tags = /* @__PURE__ */ new Map();
-  for (const task of suite.tasks) {
-    for (const tag of task.tags ?? []) tags.set(tag, (tags.get(tag) ?? 0) + 1);
-  }
-  const topTags = [...tags.entries()].sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0])).slice(0, 8).map(([tag]) => tag);
-  if (topTags.length > 0) return `${taskLabel} across ${topTags.join(", ")}.`;
-  const difficulties = [
-    ...new Set(
-      suite.tasks.map((task) => task.difficulty).filter((value) => Boolean(value))
-    )
-  ].join(", ");
-  return `${taskLabel} across ${difficulties || "the selected benchmark scope"}.`;
-}
-function planTraceInsightQuestions(input) {
-  const hasFailures = input.suite.tasks.some((task) => task.outcome && task.outcome !== "satisfied");
-  const hasMultipleShots = input.suite.tasks.some(
-    (task) => (task.gaps ?? []).some((gap) => /shot|review|retry|continue/i.test(gap))
-  );
-  const questions = [
-    {
-      id: "execution-path",
-      question: "What did the worker actually do before the first meaningful implementation edit?",
-      why: "Separates grounded execution from polished but shallow output."
-    },
-    {
-      id: "research-grounding",
-      question: "Did the worker inspect docs, source, examples, or package references before committing to an implementation path?",
-      why: "Identifies whether failures came from weak retrieval, weak examples, or premature coding."
-    },
-    {
-      id: "domain-proof",
-      question: "Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?",
-      why: "Keeps product-quality claims tied to concrete evidence."
-    },
-    {
-      id: "root-cause",
-      question: "For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?",
-      why: "Turns trace observations into actionable ownership."
-    },
-    {
-      id: "evidence-quality",
-      question: "Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?",
-      why: "Prevents unsupported customer-report conclusions."
-    }
-  ];
-  if (hasMultipleShots) {
-    questions.push({
-      id: "reviewer-lift",
-      question: "Where did reviewer feedback improve score, stall, or regress across shots?",
-      why: "Shows whether the driver loop is learning or merely repeating work."
-    });
-  }
-  if (hasFailures) {
-    questions.push({
-      id: "optimization-targets",
-      question: "Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?",
-      why: "Connects benchmark evidence to the optimization loop."
-    });
-  }
-  return questions;
-}
-function buildTraceInsightContext(input) {
-  return {
-    suite: input.suite,
-    scope: describeTraceInsightScope(input.suite),
-    keywords: inferDomainKeywords(input.suite),
-    questions: planTraceInsightQuestions(input),
-    panel: defaultTraceInsightPanel(),
-    findings: input.findings ?? [],
-    agent: input.agent ?? null,
-    totals: input.totals ?? null
-  };
-}
-function scoreTraceInsightReadiness(context) {
-  const failedTasks = context.suite.tasks.filter(
-    (task) => task.outcome && task.outcome !== "satisfied"
-  );
-  const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds));
-  const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id));
-  const tasksWithGaps = context.suite.tasks.filter((task) => (task.gaps ?? []).length > 0);
-  const gates = [
-    {
-      id: "domain-context",
-      label: "Domain context inferred",
-      passed: context.keywords.length > 0,
-      severity: "high",
-      detail: context.keywords.length > 0 ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(", ")}` : "No domain terms were inferred from suite, tasks, prompts, tags, or gaps."
-    },
-    {
-      id: "panel-coverage",
-      label: "Analyst panel planned",
-      passed: context.panel.length >= 4 && context.questions.length >= 5,
-      severity: "high",
-      detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`
-    },
-    {
-      id: "failure-coverage",
-      label: "Failures mapped to findings",
-      passed: failedTasks.length === 0 || failedTasksWithFindings.length / failedTasks.length >= 0.5,
-      severity: "critical",
-      detail: failedTasks.length === 0 ? "No failed tasks in suite." : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`
-    },
-    {
-      id: "gap-evidence",
-      label: "Task gaps captured",
-      passed: failedTasks.length === 0 || tasksWithGaps.length / failedTasks.length >= 0.5,
-      severity: "medium",
-      detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`
-    }
-  ];
-  const penalty = gates.reduce((sum, gate) => {
-    if (gate.passed) return sum;
-    if (gate.severity === "critical") return sum + 35;
-    if (gate.severity === "high") return sum + 20;
-    if (gate.severity === "medium") return sum + 10;
-    return sum + 5;
-  }, 0);
-  const score = Math.max(0, Math.min(1, 1 - penalty / 100));
-  return {
-    score,
-    grade: score >= 0.9 ? "external-ready" : score >= 0.7 ? "internal-review" : "raw-analysis",
-    gates
-  };
-}
-function defaultTraceInsightPanel() {
-  return [
-    {
-      id: "trace-forensics",
-      name: "Trace Forensics",
-      responsibility: "Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason."
-    },
-    {
-      id: "root-cause",
-      name: "Root Cause",
-      responsibility: "Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior."
-    },
-    {
-      id: "optimization",
-      name: "Optimization",
-      responsibility: "Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next."
-    },
-    {
-      id: "external-evidence",
-      name: "External Evidence",
-      responsibility: "Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence."
-    }
-  ];
-}
-function buildTraceInsightPrompt(input) {
-  const context = buildTraceInsightContext(input);
-  const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6;
-  return `Analyze this benchmark run and produce evidence-backed trace intelligence.
-Audience:
-- internal AI/product leadership
-- possible customer-facing report for ${input.suite.name}
-Investigation plan:
-${context.questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join("\n")}
-Analyst panel:
-${context.panel.map((role) => `- ${role.name}: ${role.responsibility}`).join("\n")}
-If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.
-Required output:
-1. Executive verdict: what this run proves and does not prove.
-2. The investigation questions you answered and the evidence used.
-3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.
-4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.
-5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.
-6. What is safe for an external report versus what must stay internal.
-7. One rerun plan that would validate lift after optimization.
-Budget:
-- Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.
-- Prefer traces named in the failure summary over broad exploration.
-- Do not do exhaustive trace sweeps.
-- Return the final report as soon as the taxonomy and examples are supported.
-Run summary:
-${JSON.stringify(
-    {
-      suite: input.suite.name,
-      scope: context.scope,
-      inferredKeywords: context.keywords,
-      agent: context.agent,
-      totals: context.totals,
-      findings: context.findings.map((finding) => ({
-        kind: finding.kind,
-        severity: finding.severity,
-        taskCount: finding.taskIds.length,
-        proposedFixClass: finding.proposedFixClass
-      })),
-      failures: input.suite.tasks.filter((task) => task.outcome && task.outcome !== "satisfied").map((task) => ({
-        task: task.id,
-        difficulty: task.difficulty,
-        outcome: task.outcome,
-        score: task.score,
-        gaps: task.gaps ?? []
-      }))
-    },
-    null,
-    2
-  )}
-Use the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`;
-}
 export {
   DEFAULT_REDACTION_RULES,
   DEFAULT_TRACE_ANALYST_BUDGETS,

package/dist/traces.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/trace-analyst/hook.ts","../src/trace-analyst/insights.ts"],"sourcesContent":["/*\n Trace-analyst auto-execution hook.\n \n Wires `analyzeTraces` into a `TraceEmitter`'s `onRunComplete` so a\n * direct matrix run produces an analysis artifact without an out-of-band\n * step. Designed for the case where the consumer reports \"the analyst\n * never ran\" — the cause is almost always orchestration, not the analyst.\n \n Usage:\n \n const emitter = new TraceEmitter(store, {\n * onRunComplete: [traceAnalystOnRunComplete({ analyze: opts, save })],\n * })\n \n Hooks are best-effort by default — they never crash the underlying run.\n * The caller decides whether to gate the run on the analysis result via\n * the `gateOn` callback.\n /\n\nimport type { RunCompleteHook, RunCompleteHookContext } from '../trace/emitter'\nimport { type AnalyzeTracesOptions, type AnalyzeTracesResult, analyzeTraces } from './analyst'\n\nexport interface TraceAnalystHookOptions {\n /\n Options forwarded to `analyzeTraces`. The hook supplies the question\n * if you don't pass one — defaulting to a launch-grade prompt that asks\n * for failure modes, surprising findings, and a recommendation.\n /\n analyze: Omit<AnalyzeTracesOptions, 'source'> & { source?: AnalyzeTracesOptions['source'] }\n /\n Override the question. The default is intentionally generic:\n * \"Summarise what happened in this run, surface any failure modes,\n * surprising findings, or evidence the verdict is wrong.\"\n /\n question?: string\n /\n Persist the result. The hook calls this with the analysis output and\n * the run context. Common implementations write to a TraceAnalysisStore\n * or append to a per-run JSONL.\n /\n save?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => Promise<void>\n /\n Predicate gating execution per run. Default: every completed run.\n * Use to skip aborted runs, debug runs, or runs without LLM activity.\n /\n shouldRun?: (ctx: RunCompleteHookContext) => boolean\n /\n Optional gate: if set and returns false, the hook records the failure\n * as a log event on the run instead of staying quiet. The caller can\n * then trigger downstream alerts off `analyst_gate_failed` log events.\n /\n gateOn?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => boolean\n}\n\nconst DEFAULT_QUESTION =\n \"Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.\"\n\nexport function traceAnalystOnRunComplete(opts: TraceAnalystHookOptions): RunCompleteHook {\n return async (ctx: RunCompleteHookContext) => {\n if (opts.shouldRun && !opts.shouldRun(ctx)) return\n const source = opts.analyze.source\n if (source === undefined) {\n // The analyst needs a source. If the caller didn't supply one we don't\n // run — but we do leave a breadcrumb so the absence is visible.\n await ctx.store.appendEvent({\n eventId: `analyst-skip-${ctx.runId}`,\n runId: ctx.runId,\n kind: 'log',\n timestamp: Date.now(),\n payload: { source: 'trace_analyst_hook', reason: 'no source configured' },\n })\n return\n }\n const result = await analyzeTraces({ question: opts.question ?? DEFAULT_QUESTION }, {\n ...opts.analyze,\n source,\n } as AnalyzeTracesOptions)\n if (opts.save) await opts.save(result, ctx)\n if (opts.gateOn && !opts.gateOn(result, ctx)) {\n await ctx.store.appendEvent({\n eventId: `analyst-gate-${ctx.runId}`,\n runId: ctx.runId,\n kind: 'log',\n timestamp: Date.now(),\n payload: {\n source: 'trace_analyst_hook',\n reason: 'analyst_gate_failed',\n findings: result.findings,\n },\n })\n }\n }\n}\n","export interface TraceInsightTask {\n id: string\n name: string\n prompt?: string\n difficulty?: string\n tags?: string[]\n outcome?: string\n score?: number\n gaps?: string[]\n}\n\nexport interface TraceInsightSuite {\n name: string\n collectionId?: string\n tasks: TraceInsightTask[]\n}\n\nexport interface TraceInsightFinding {\n kind: string\n severity?: string\n taskIds: string[]\n evidence?: string\n proposedFixClass?: string\n}\n\nexport interface TraceInsightQuestion {\n id: string\n question: string\n why: string\n}\n\nexport interface TraceInsightPanelRole {\n id: string\n name: string\n responsibility: string\n}\n\nexport interface TraceInsightPromptInput {\n suite: TraceInsightSuite\n findings?: TraceInsightFinding[]\n agent?: Record<string, unknown>\n totals?: Record<string, unknown>\n maxRepresentativeTraces?: number\n}\n\nexport interface TraceInsightContext {\n suite: TraceInsightSuite\n scope: string\n keywords: string[]\n questions: TraceInsightQuestion[]\n panel: TraceInsightPanelRole[]\n findings: TraceInsightFinding[]\n agent: Record<string, unknown> \| null\n totals: Record<string, unknown> \| null\n}\n\nexport interface TraceInsightQualityGate {\n id: string\n label: string\n passed: boolean\n severity: 'critical' \| 'high' \| 'medium' \| 'low'\n detail: string\n}\n\nexport interface TraceInsightReadiness {\n score: number\n grade: 'external-ready' \| 'internal-review' \| 'raw-analysis'\n gates: TraceInsightQualityGate[]\n}\n\nconst DOMAIN_STOP_WORDS = new Set([\n 'and',\n 'advanced',\n 'app',\n 'build',\n 'create',\n 'easy',\n 'expert',\n 'extreme',\n 'for',\n 'from',\n 'hard',\n 'implementation',\n 'integrate',\n 'medium',\n 'project',\n 'task',\n 'the',\n 'this',\n 'with',\n 'workflow',\n])\n\nexport function tokenizeDomainWords(value: string): string[] {\n return [...value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)]\n .map((match) => match[0].toLowerCase())\n .filter((word) => !DOMAIN_STOP_WORDS.has(word))\n}\n\nexport function inferDomainKeywords(suite: TraceInsightSuite): string[] {\n const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${suite.collectionId ?? ''}`))\n const source = [\n suite.name,\n suite.collectionId ?? '',\n ...suite.tasks.flatMap((task) => [\n task.id,\n task.name,\n task.prompt ?? '',\n task.difficulty ?? '',\n ...(task.tags ?? []),\n ...(task.gaps ?? []),\n ]),\n ].join(' ')\n const counts = new Map<string, number>()\n for (const word of tokenizeDomainWords(source)) counts.set(word, (counts.get(word) ?? 0) + 1)\n return [...counts.entries()]\n .filter(([word, count]) => count >= 2 \|\| suiteWords.has(word))\n .sort((a, b) => b[1] - a[1] \|\| a[0].localeCompare(b[0]))\n .map(([word]) => word)\n .slice(0, 18)\n}\n\nexport function domainEvidencePattern(keywords: string[]): RegExp {\n const escaped = keywords\n .filter((keyword) => keyword.length >= 3)\n .map((keyword) => keyword.replace(/[.+?^${}()\|[\\]\\\\]/g, '\\\\$&'))\n return escaped.length > 0\n ? new RegExp(`(?<![A-Za-z0-9])(?:${escaped.join('\|')})(?![A-Za-z0-9])`, 'i')\n : /(?<![A-Za-z0-9])(?:sdk\|api\|css\|dns\|xml\|provider\|client\|service\|integration\|webhook\|transaction\|auth\|oauth\|graphql\|rest)(?![A-Za-z0-9])/i\n}\n\nexport function describeTraceInsightScope(suite: TraceInsightSuite): string {\n const taskLabel =\n suite.tasks.length === 1\n ? '1 implementation task'\n : `${suite.tasks.length} implementation tasks`\n const tags = new Map<string, number>()\n for (const task of suite.tasks) {\n for (const tag of task.tags ?? []) tags.set(tag, (tags.get(tag) ?? 0) + 1)\n }\n const topTags = [...tags.entries()]\n .sort((a, b) => b[1] - a[1] \|\| a[0].localeCompare(b[0]))\n .slice(0, 8)\n .map(([tag]) => tag)\n if (topTags.length > 0) return `${taskLabel} across ${topTags.join(', ')}.`\n const difficulties = [\n ...new Set(\n suite.tasks.map((task) => task.difficulty).filter((value): value is string => Boolean(value)),\n ),\n ].join(', ')\n return `${taskLabel} across ${difficulties \|\| 'the selected benchmark scope'}.`\n}\n\nexport function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[] {\n const hasFailures = input.suite.tasks.some((task) => task.outcome && task.outcome !== 'satisfied')\n const hasMultipleShots = input.suite.tasks.some((task) =>\n (task.gaps ?? []).some((gap) => /shot\|review\|retry\|continue/i.test(gap)),\n )\n const questions: TraceInsightQuestion[] = [\n {\n id: 'execution-path',\n question: 'What did the worker actually do before the first meaningful implementation edit?',\n why: 'Separates grounded execution from polished but shallow output.',\n },\n {\n id: 'research-grounding',\n question:\n 'Did the worker inspect docs, source, examples, or package references before committing to an implementation path?',\n why: 'Identifies whether failures came from weak retrieval, weak examples, or premature coding.',\n },\n {\n id: 'domain-proof',\n question:\n 'Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?',\n why: 'Keeps product-quality claims tied to concrete evidence.',\n },\n {\n id: 'root-cause',\n question:\n 'For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?',\n why: 'Turns trace observations into actionable ownership.',\n },\n {\n id: 'evidence-quality',\n question:\n 'Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?',\n why: 'Prevents unsupported customer-report conclusions.',\n },\n ]\n if (hasMultipleShots) {\n questions.push({\n id: 'reviewer-lift',\n question: 'Where did reviewer feedback improve score, stall, or regress across shots?',\n why: 'Shows whether the driver loop is learning or merely repeating work.',\n })\n }\n if (hasFailures) {\n questions.push({\n id: 'optimization-targets',\n question:\n 'Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?',\n why: 'Connects benchmark evidence to the optimization loop.',\n })\n }\n return questions\n}\n\nexport function buildTraceInsightContext(input: TraceInsightPromptInput): TraceInsightContext {\n return {\n suite: input.suite,\n scope: describeTraceInsightScope(input.suite),\n keywords: inferDomainKeywords(input.suite),\n questions: planTraceInsightQuestions(input),\n panel: defaultTraceInsightPanel(),\n findings: input.findings ?? [],\n agent: input.agent ?? null,\n totals: input.totals ?? null,\n }\n}\n\nexport function scoreTraceInsightReadiness(context: TraceInsightContext): TraceInsightReadiness {\n const failedTasks = context.suite.tasks.filter(\n (task) => task.outcome && task.outcome !== 'satisfied',\n )\n const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds))\n const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id))\n const tasksWithGaps = context.suite.tasks.filter((task) => (task.gaps ?? []).length > 0)\n const gates: TraceInsightQualityGate[] = [\n {\n id: 'domain-context',\n label: 'Domain context inferred',\n passed: context.keywords.length > 0,\n severity: 'high',\n detail:\n context.keywords.length > 0\n ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(', ')}`\n : 'No domain terms were inferred from suite, tasks, prompts, tags, or gaps.',\n },\n {\n id: 'panel-coverage',\n label: 'Analyst panel planned',\n passed: context.panel.length >= 4 && context.questions.length >= 5,\n severity: 'high',\n detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`,\n },\n {\n id: 'failure-coverage',\n label: 'Failures mapped to findings',\n passed:\n failedTasks.length === 0 \|\| failedTasksWithFindings.length / failedTasks.length >= 0.5,\n severity: 'critical',\n detail:\n failedTasks.length === 0\n ? 'No failed tasks in suite.'\n : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`,\n },\n {\n id: 'gap-evidence',\n label: 'Task gaps captured',\n passed: failedTasks.length === 0 \|\| tasksWithGaps.length / failedTasks.length >= 0.5,\n severity: 'medium',\n detail: `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.`,\n },\n ]\n const penalty = gates.reduce((sum, gate) => {\n if (gate.passed) return sum\n if (gate.severity === 'critical') return sum + 35\n if (gate.severity === 'high') return sum + 20\n if (gate.severity === 'medium') return sum + 10\n return sum + 5\n }, 0)\n const score = Math.max(0, Math.min(1, 1 - penalty / 100))\n return {\n score,\n grade: score >= 0.9 ? 'external-ready' : score >= 0.7 ? 'internal-review' : 'raw-analysis',\n gates,\n }\n}\n\nexport function defaultTraceInsightPanel(): TraceInsightPanelRole[] {\n return [\n {\n id: 'trace-forensics',\n name: 'Trace Forensics',\n responsibility:\n 'Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason.',\n },\n {\n id: 'root-cause',\n name: 'Root Cause',\n responsibility:\n 'Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior.',\n },\n {\n id: 'optimization',\n name: 'Optimization',\n responsibility:\n 'Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next.',\n },\n {\n id: 'external-evidence',\n name: 'External Evidence',\n responsibility:\n 'Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence.',\n },\n ]\n}\n\nexport function buildTraceInsightPrompt(input: TraceInsightPromptInput): string {\n const context = buildTraceInsightContext(input)\n const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6\n return `Analyze this benchmark run and produce evidence-backed trace intelligence.\n\nAudience:\n- internal AI/product leadership\n- possible customer-facing report for ${input.suite.name}\n\nInvestigation plan:\n${context.questions.map((item, index) => `${index + 1}. ${item.question} (${item.why})`).join('\\n')}\n\nAnalyst panel:\n${context.panel.map((role) => `- ${role.name}: ${role.responsibility}`).join('\\n')}\n\nIf the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.\n\nRequired output:\n1. Executive verdict: what this run proves and does not prove.\n2. The investigation questions you answered and the evidence used.\n3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.\n4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.\n5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.\n6. What is safe for an external report versus what must stay internal.\n7. One rerun plan that would validate lift after optimization.\n\nBudget:\n- Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.\n- Prefer traces named in the failure summary over broad exploration.\n- Do not do exhaustive trace sweeps.\n- Return the final report as soon as the taxonomy and examples are supported.\n\nRun summary:\n${JSON.stringify(\n {\n suite: input.suite.name,\n scope: context.scope,\n inferredKeywords: context.keywords,\n agent: context.agent,\n totals: context.totals,\n findings: context.findings.map((finding) => ({\n kind: finding.kind,\n severity: finding.severity,\n taskCount: finding.taskIds.length,\n proposedFixClass: finding.proposedFixClass,\n })),\n failures: input.suite.tasks\n .filter((task) => task.outcome && task.outcome !== 'satisfied')\n .map((task) => ({\n task: task.id,\n difficulty: task.difficulty,\n outcome: task.outcome,\n score: task.score,\n gaps: task.gaps ?? [],\n })),\n },\n null,\n 2,\n)}\n\nUse the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAsDA,IAAM,mBACJ;AAEK,SAAS,0BAA0B,MAAgD;AACxF,SAAO,OAAO,QAAgC;AAC5C,QAAI,KAAK,aAAa,CAAC,KAAK,UAAU,GAAG,EAAG;AAC5C,UAAM,SAAS,KAAK,QAAQ;AAC5B,QAAI,WAAW,QAAW;AAGxB,YAAM,IAAI,MAAM,YAAY;AAAA,QAC1B,SAAS,gBAAgB,IAAI,KAAK;AAAA,QAClC,OAAO,IAAI;AAAA,QACX,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,SAAS,EAAE,QAAQ,sBAAsB,QAAQ,uBAAuB;AAAA,MAC1E,CAAC;AACD;AAAA,IACF;AACA,UAAM,SAAS,MAAM,cAAc,EAAE,UAAU,KAAK,YAAY,iBAAiB,GAAG;AAAA,MAClF,GAAG,KAAK;AAAA,MACR;AAAA,IACF,CAAyB;AACzB,QAAI,KAAK,KAAM,OAAM,KAAK,KAAK,QAAQ,GAAG;AAC1C,QAAI,KAAK,UAAU,CAAC,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC5C,YAAM,IAAI,MAAM,YAAY;AAAA,QAC1B,SAAS,gBAAgB,IAAI,KAAK;AAAA,QAClC,OAAO,IAAI;AAAA,QACX,MAAM;AAAA,QACN,WAAW,KAAK,IAAI;AAAA,QACpB,SAAS;AAAA,UACP,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,UAAU,OAAO;AAAA,QACnB;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACF;;;ACtBA,IAAM,oBAAoB,oBAAI,IAAI;AAAA,EAChC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,SAAS,oBAAoB,OAAyB;AAC3D,SAAO,CAAC,GAAG,MAAM,SAAS,8BAA8B,CAAC,EACtD,IAAI,CAAC,UAAU,MAAM,CAAC,EAAE,YAAY,CAAC,EACrC,OAAO,CAAC,SAAS,CAAC,kBAAkB,IAAI,IAAI,CAAC;AAClD;AAEO,SAAS,oBAAoB,OAAoC;AACtE,QAAM,aAAa,IAAI,IAAI,oBAAoB,GAAG,MAAM,IAAI,IAAI,MAAM,gBAAgB,EAAE,EAAE,CAAC;AAC3F,QAAM,SAAS;AAAA,IACb,MAAM;AAAA,IACN,MAAM,gBAAgB;AAAA,IACtB,GAAG,MAAM,MAAM,QAAQ,CAAC,SAAS;AAAA,MAC/B,KAAK;AAAA,MACL,KAAK;AAAA,MACL,KAAK,UAAU;AAAA,MACf,KAAK,cAAc;AAAA,MACnB,GAAI,KAAK,QAAQ,CAAC;AAAA,MAClB,GAAI,KAAK,QAAQ,CAAC;AAAA,IACpB,CAAC;AAAA,EACH,EAAE,KAAK,GAAG;AACV,QAAM,SAAS,oBAAI,IAAoB;AACvC,aAAW,QAAQ,oBAAoB,MAAM,EAAG,QAAO,IAAI,OAAO,OAAO,IAAI,IAAI,KAAK,KAAK,CAAC;AAC5F,SAAO,CAAC,GAAG,OAAO,QAAQ,CAAC,EACxB,OAAO,CAAC,CAAC,MAAM,KAAK,MAAM,SAAS,KAAK,WAAW,IAAI,IAAI,CAAC,EAC5D,KAAK,CAAC,GAAG,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC,EACtD,IAAI,CAAC,CAAC,IAAI,MAAM,IAAI,EACpB,MAAM,GAAG,EAAE;AAChB;AAEO,SAAS,sBAAsB,UAA4B;AAChE,QAAM,UAAU,SACb,OAAO,CAAC,YAAY,QAAQ,UAAU,CAAC,EACvC,IAAI,CAAC,YAAY,QAAQ,QAAQ,uBAAuB,MAAM,CAAC;AAClE,SAAO,QAAQ,SAAS,IACpB,IAAI,OAAO,sBAAsB,QAAQ,KAAK,GAAG,CAAC,oBAAoB,GAAG,IACzE;AACN;AAEO,SAAS,0BAA0B,OAAkC;AAC1E,QAAM,YACJ,MAAM,MAAM,WAAW,IACnB,0BACA,GAAG,MAAM,MAAM,MAAM;AAC3B,QAAM,OAAO,oBAAI,IAAoB;AACrC,aAAW,QAAQ,MAAM,OAAO;AAC9B,eAAW,OAAO,KAAK,QAAQ,CAAC,EAAG,MAAK,IAAI,MAAM,KAAK,IAAI,GAAG,KAAK,KAAK,CAAC;AAAA,EAC3E;AACA,QAAM,UAAU,CAAC,GAAG,KAAK,QAAQ,CAAC,EAC/B,KAAK,CAAC,GAAG,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC,EACtD,MAAM,GAAG,CAAC,EACV,IAAI,CAAC,CAAC,GAAG,MAAM,GAAG;AACrB,MAAI,QAAQ,SAAS,EAAG,QAAO,GAAG,SAAS,WAAW,QAAQ,KAAK,IAAI,CAAC;AACxE,QAAM,eAAe;AAAA,IACnB,GAAG,IAAI;AAAA,MACL,MAAM,MAAM,IAAI,CAAC,SAAS,KAAK,UAAU,EAAE,OAAO,CAAC,UAA2B,QAAQ,KAAK,CAAC;AAAA,IAC9F;AAAA,EACF,EAAE,KAAK,IAAI;AACX,SAAO,GAAG,SAAS,WAAW,gBAAgB,8BAA8B;AAC9E;AAEO,SAAS,0BAA0B,OAAwD;AAChG,QAAM,cAAc,MAAM,MAAM,MAAM,KAAK,CAAC,SAAS,KAAK,WAAW,KAAK,YAAY,WAAW;AACjG,QAAM,mBAAmB,MAAM,MAAM,MAAM;AAAA,IAAK,CAAC,UAC9C,KAAK,QAAQ,CAAC,GAAG,KAAK,CAAC,QAAQ,8BAA8B,KAAK,GAAG,CAAC;AAAA,EACzE;AACA,QAAM,YAAoC;AAAA,IACxC;AAAA,MACE,IAAI;AAAA,MACJ,UAAU;AAAA,MACV,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP;AAAA,EACF;AACA,MAAI,kBAAkB;AACpB,cAAU,KAAK;AAAA,MACb,IAAI;AAAA,MACJ,UAAU;AAAA,MACV,KAAK;AAAA,IACP,CAAC;AAAA,EACH;AACA,MAAI,aAAa;AACf,cAAU,KAAK;AAAA,MACb,IAAI;AAAA,MACJ,UACE;AAAA,MACF,KAAK;AAAA,IACP,CAAC;AAAA,EACH;AACA,SAAO;AACT;AAEO,SAAS,yBAAyB,OAAqD;AAC5F,SAAO;AAAA,IACL,OAAO,MAAM;AAAA,IACb,OAAO,0BAA0B,MAAM,KAAK;AAAA,IAC5C,UAAU,oBAAoB,MAAM,KAAK;AAAA,IACzC,WAAW,0BAA0B,KAAK;AAAA,IAC1C,OAAO,yBAAyB;AAAA,IAChC,UAAU,MAAM,YAAY,CAAC;AAAA,IAC7B,OAAO,MAAM,SAAS;AAAA,IACtB,QAAQ,MAAM,UAAU;AAAA,EAC1B;AACF;AAEO,SAAS,2BAA2B,SAAqD;AAC9F,QAAM,cAAc,QAAQ,MAAM,MAAM;AAAA,IACtC,CAAC,SAAS,KAAK,WAAW,KAAK,YAAY;AAAA,EAC7C;AACA,QAAM,iBAAiB,IAAI,IAAI,QAAQ,SAAS,QAAQ,CAAC,YAAY,QAAQ,OAAO,CAAC;AACrF,QAAM,0BAA0B,YAAY,OAAO,CAAC,SAAS,eAAe,IAAI,KAAK,EAAE,CAAC;AACxF,QAAM,gBAAgB,QAAQ,MAAM,MAAM,OAAO,CAAC,UAAU,KAAK,QAAQ,CAAC,GAAG,SAAS,CAAC;AACvF,QAAM,QAAmC;AAAA,IACvC;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QAAQ,QAAQ,SAAS,SAAS;AAAA,MAClC,UAAU;AAAA,MACV,QACE,QAAQ,SAAS,SAAS,IACtB,GAAG,QAAQ,SAAS,MAAM,2BAA2B,QAAQ,SAAS,MAAM,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,KAC5F;AAAA,IACR;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QAAQ,QAAQ,MAAM,UAAU,KAAK,QAAQ,UAAU,UAAU;AAAA,MACjE,UAAU;AAAA,MACV,QAAQ,GAAG,QAAQ,MAAM,MAAM,oBAAoB,QAAQ,UAAU,MAAM;AAAA,IAC7E;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QACE,YAAY,WAAW,KAAK,wBAAwB,SAAS,YAAY,UAAU;AAAA,MACrF,UAAU;AAAA,MACV,QACE,YAAY,WAAW,IACnB,8BACA,GAAG,wBAAwB,MAAM,IAAI,YAAY,MAAM;AAAA,IAC/D;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,OAAO;AAAA,MACP,QAAQ,YAAY,WAAW,KAAK,cAAc,SAAS,YAAY,UAAU;AAAA,MACjF,UAAU;AAAA,MACV,QAAQ,GAAG,cAAc,MAAM;AAAA,IACjC;AAAA,EACF;AACA,QAAM,UAAU,MAAM,OAAO,CAAC,KAAK,SAAS;AAC1C,QAAI,KAAK,OAAQ,QAAO;AACxB,QAAI,KAAK,aAAa,WAAY,QAAO,MAAM;AAC/C,QAAI,KAAK,aAAa,OAAQ,QAAO,MAAM;AAC3C,QAAI,KAAK,aAAa,SAAU,QAAO,MAAM;AAC7C,WAAO,MAAM;AAAA,EACf,GAAG,CAAC;AACJ,QAAM,QAAQ,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,IAAI,UAAU,GAAG,CAAC;AACxD,SAAO;AAAA,IACL;AAAA,IACA,OAAO,SAAS,MAAM,mBAAmB,SAAS,MAAM,oBAAoB;AAAA,IAC5E;AAAA,EACF;AACF;AAEO,SAAS,2BAAoD;AAClE,SAAO;AAAA,IACL;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,IACA;AAAA,MACE,IAAI;AAAA,MACJ,MAAM;AAAA,MACN,gBACE;AAAA,IACJ;AAAA,EACF;AACF;AAEO,SAAS,wBAAwB,OAAwC;AAC9E,QAAM,UAAU,yBAAyB,KAAK;AAC9C,QAAM,0BAA0B,MAAM,2BAA2B;AACjE,SAAO;AAAA;AAAA;AAAA;AAAA,wCAI+B,MAAM,MAAM,IAAI;AAAA;AAAA;AAAA,EAGtD,QAAQ,UAAU,IAAI,CAAC,MAAM,UAAU,GAAG,QAAQ,CAAC,KAAK,KAAK,QAAQ,KAAK,KAAK,GAAG,GAAG,EAAE,KAAK,IAAI,CAAC;AAAA;AAAA;AAAA,EAGjG,QAAQ,MAAM,IAAI,CAAC,SAAS,KAAK,KAAK,IAAI,KAAK,KAAK,cAAc,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,mEAcf,uBAAuB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMxF,KAAK;AAAA,IACL;AAAA,MACE,OAAO,MAAM,MAAM;AAAA,MACnB,OAAO,QAAQ;AAAA,MACf,kBAAkB,QAAQ;AAAA,MAC1B,OAAO,QAAQ;AAAA,MACf,QAAQ,QAAQ;AAAA,MAChB,UAAU,QAAQ,SAAS,IAAI,CAAC,aAAa;AAAA,QAC3C,MAAM,QAAQ;AAAA,QACd,UAAU,QAAQ;AAAA,QAClB,WAAW,QAAQ,QAAQ;AAAA,QAC3B,kBAAkB,QAAQ;AAAA,MAC5B,EAAE;AAAA,MACF,UAAU,MAAM,MAAM,MACnB,OAAO,CAAC,SAAS,KAAK,WAAW,KAAK,YAAY,WAAW,EAC7D,IAAI,CAAC,UAAU;AAAA,QACd,MAAM,KAAK;AAAA,QACX,YAAY,KAAK;AAAA,QACjB,SAAS,KAAK;AAAA,QACd,OAAO,KAAK;AAAA,QACZ,MAAM,KAAK,QAAQ,CAAC;AAAA,MACtB,EAAE;AAAA,IACN;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AAAA;AAAA;AAGD;","names":[]}
1	+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}

package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { S as Span, b as TraceEvent, T as TraceStore } from './store-BP5be6s7.js';
+import { S as Span, b as TraceEvent, T as TraceStore } from './store-Db2Bv8Cf.js';
 /**
  * Trajectory — ordered, structured view over a run's spans.

package/dist/wire/index.d.ts CHANGED Viewed

@@ -1,14 +1,14 @@
-import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-j0nJFgC6.js';
-import { T as TraceStore } from '../store-BP5be6s7.js';
+import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-iATEAHmc.js';
+import { T as TraceStore } from '../store-Db2Bv8Cf.js';
 import { z } from 'zod';
 import { OpenAPIObject } from 'openapi3-ts/oas31';
 import * as hono_types from 'hono/types';
 import { ServerType } from '@hono/node-server';
 import { Hono } from 'hono';
-import '../control-runtime-BRdQ0wrx.js';
-import '../emitter-BqjeOvJh.js';
-import '../dataset-CiK_3LDr.js';
-import '../errors-BZ9sTdz7.js';
+import '../control-runtime-BZ_lVLYW.js';
+import '../emitter-DP_cSSiw.js';
+import '../dataset-ueRVTUoY.js';
+import '../errors-mje_cKOs.js';
 declare const RubricDimensionSchema: z.ZodObject<{
     id: z.ZodString;

package/dist/wire/index.js CHANGED Viewed

@@ -33,10 +33,10 @@ import {
   runRpcBatch,
   runRpcOnce,
   startServer
-} from "../chunk-XFZCM5Z3.js";
-import "../chunk-4S4BM3QQ.js";
+} from "../chunk-SMSGXM74.js";
+import "../chunk-M6RZ5LJN.js";
 import "../chunk-PC4UYEBM.js";
-import "../chunk-NG236HPC.js";
+import "../chunk-QYJT52YW.js";
 import "../chunk-PZ5AY32C.js";
 export {
   BUILTIN_RUBRICS,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.29.1",
+  "version": "0.31.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {

package/dist/chunk-NG236HPC.js.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"sources":["../src/errors.ts"],"sourcesContent":["/**\n * Error taxonomy for `@tangle-network/agent-eval`.\n *\n * Every error this package throws as part of its *public contract* extends\n * `AgentEvalError`. Consumers can pattern-match by `instanceof <Subclass>` or\n * by the stable string `code` carried on the base class.\n *\n * The codes are stable across minor versions; new codes can be added, but\n * existing codes never change meaning. New subclasses are non-breaking.\n *\n * Internal invariant guards (`throw new Error('this should never happen')`)\n * remain plain `Error`s on purpose — they're programmer-mistake assertions,\n * not consumer-catchable contract failures.\n */\n\nexport type AgentEvalErrorCode =\n | 'validation'\n | 'not_found'\n | 'config'\n | 'capture_integrity'\n | 'judge'\n | 'verification'\n | 'replay'\n\nexport class AgentEvalError extends Error {\n /** Stable string code. Survives minification; safe to switch on. */\n readonly code: AgentEvalErrorCode\n\n constructor(code: AgentEvalErrorCode, message: string, options?: { cause?: unknown }) {\n super(message, options)\n this.name = this.constructor.name\n this.code = code\n }\n}\n\n/** Caller passed invalid arguments (out of range, mutually-exclusive options, bad shape). */\nexport class ValidationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('validation', message, options)\n }\n}\n\n/** A named resource (run, span, rubric, scenario, dataset row, route) does not exist. */\nexport class NotFoundError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('not_found', message, options)\n }\n}\n\n/** Configuration missing or malformed (`HOME` unset, required image not supplied, env var absent). */\nexport class ConfigError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('config', message, options)\n }\n}\n\n/**\n * A run is missing the artifacts a launch-grade check requires:\n * raw HTTP capture absent, no LLM spans, route assertion failed, run-end\n * assertion tripped. Block ship on this; do not catch and move on.\n */\nexport class CaptureIntegrityError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('capture_integrity', message, options)\n }\n}\n\n/** A judge call failed in a way that's not retryable: schema parse failure, bad rubric, conflicting dimensions. */\nexport class JudgeError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('judge', message, options)\n }\n}\n\n/** A verifier signalled a hard failure (compile, test, schema) — distinct from a low judge score. */\nexport class VerificationError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('verification', message, options)\n }\n}\n\n/** Replay cache cannot satisfy a request: miss with no fallback, sink lacks list(), unsupported URL. */\nexport class ReplayError extends AgentEvalError {\n constructor(message: string, options?: { cause?: unknown }) {\n super('replay', message, options)\n }\n}\n"],"mappings":";AAwBO,IAAM,iBAAN,cAA6B,MAAM;AAAA;AAAA,EAE/B;AAAA,EAET,YAAY,MAA0B,SAAiB,SAA+B;AACpF,UAAM,SAAS,OAAO;AACtB,SAAK,OAAO,KAAK,YAAY;AAC7B,SAAK,OAAO;AAAA,EACd;AACF;AAGO,IAAM,kBAAN,cAA8B,eAAe;AAAA,EAClD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,cAAc,SAAS,OAAO;AAAA,EACtC;AACF;AAGO,IAAM,gBAAN,cAA4B,eAAe;AAAA,EAChD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,aAAa,SAAS,OAAO;AAAA,EACrC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;AAOO,IAAM,wBAAN,cAAoC,eAAe;AAAA,EACxD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,qBAAqB,SAAS,OAAO;AAAA,EAC7C;AACF;AAGO,IAAM,aAAN,cAAyB,eAAe;AAAA,EAC7C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,SAAS,SAAS,OAAO;AAAA,EACjC;AACF;AAGO,IAAM,oBAAN,cAAgC,eAAe;AAAA,EACpD,YAAY,SAAiB,SAA+B;AAC1D,UAAM,gBAAgB,SAAS,OAAO;AAAA,EACxC;AACF;AAGO,IAAM,cAAN,cAA0B,eAAe;AAAA,EAC9C,YAAY,SAAiB,SAA+B;AAC1D,UAAM,UAAU,SAAS,OAAO;AAAA,EAClC;AACF;","names":[]}