@tangle-network/agent-eval 0.27.0 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +4 -5
  3. package/dist/builder-eval/index.js +1 -1
  4. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  5. package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
  6. package/dist/chunk-4U4BKCXK.js.map +1 -0
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  26. package/dist/chunk-VSMTAMNK.js.map +1 -0
  27. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  28. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  29. package/dist/cli.js +1 -1
  30. package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
  31. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
  32. package/dist/control.d.ts +3 -3
  33. package/dist/control.js +2 -2
  34. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
  35. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
  36. package/dist/governance/index.d.ts +1 -1
  37. package/dist/{index-D3iBCjdF.d.ts → index-BhLlu-qO.d.ts} +1 -1
  38. package/dist/index.d.ts +157 -167
  39. package/dist/index.js +25 -335
  40. package/dist/index.js.map +1 -1
  41. package/dist/knowledge/index.d.ts +1 -1
  42. package/dist/knowledge/index.js +2 -2
  43. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
  44. package/dist/openapi.json +1 -1
  45. package/dist/optimization.d.ts +5 -5
  46. package/dist/optimization.js +5 -5
  47. package/dist/pipelines/index.d.ts +1 -1
  48. package/dist/pipelines/index.js +2 -2
  49. package/dist/{release-report-wfUySN5F.d.ts → release-report-CCQqnK46.d.ts} +1 -1
  50. package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
  51. package/dist/reporting.d.ts +4 -4
  52. package/dist/reporting.js +5 -5
  53. package/dist/{researcher-bGkI7vCl.d.ts → researcher-G81CWc0q.d.ts} +9 -10
  54. package/dist/rl.d.ts +26 -44
  55. package/dist/rl.js +5 -5
  56. package/dist/rl.js.map +1 -1
  57. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  58. package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-Dl4akLKX.d.ts} +5 -5
  59. package/dist/traces.d.ts +1 -1
  60. package/dist/traces.js +2 -2
  61. package/dist/wire/index.d.ts +2 -2
  62. package/dist/wire/index.js +1 -1
  63. package/docs/research-report-methodology.md +4 -4
  64. package/docs/three-package-architecture.md +12 -24
  65. package/package.json +1 -1
  66. package/dist/chunk-2A5XJB43.js.map +0 -1
  67. package/dist/chunk-4F5DQN55.js.map +0 -1
  68. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  69. package/dist/chunk-I4MBDTY5.js +0 -272
  70. package/dist/chunk-I4MBDTY5.js.map +0 -1
  71. package/dist/chunk-JLZQWFV3.js.map +0 -1
  72. package/dist/chunk-K2TPS5LB.js.map +0 -1
  73. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  74. package/dist/chunk-NU65VQ7M.js.map +0 -1
  75. package/dist/chunk-OWLAAMME.js.map +0 -1
  76. package/dist/chunk-SESZDQPX.js.map +0 -1
  77. package/dist/chunk-WHZMVFUV.js.map +0 -1
  78. /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  79. /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  80. /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/failure-taxonomy.ts","../src/pipelines/failure-cluster.ts","../src/tool-use-metrics.ts","../src/baseline.ts"],"sourcesContent":["/**\n * Failure taxonomy — canonical classes + a default classifier.\n *\n * Every failed run should end up in a named class. The classifier here\n * is rule-based (fast, deterministic); an LLM fallback can be added by\n * the consumer for novel cases and trained into the rule base over time.\n *\n * Consumers call `classifyFailure(run, spans, events)` and persist the\n * returned class as `Run.outcome.failureClass`.\n */\n\nimport type { FailureClass, Run, Span, TraceEvent } from './trace/schema'\nimport { FAILURE_CLASSES } from './trace/schema'\n\nexport { FAILURE_CLASSES, type FailureClass }\n\nexport interface FailureContext {\n run: Run\n spans: Span[]\n events: TraceEvent[]\n}\n\nexport interface FailureClassification {\n failureClass: FailureClass\n reason: string\n triggerSpanId?: string\n triggerEventId?: string\n}\n\n/** Ordered rules — first match wins. */\nexport interface FailureRule {\n id: string\n match: (ctx: FailureContext) => {\n failureClass: FailureClass\n reason: string\n triggerSpanId?: string\n triggerEventId?: string\n } | null\n}\n\nexport const DEFAULT_RULES: FailureRule[] = [\n // Outcome already named? Respect it.\n {\n id: 'explicit-outcome',\n match: ({ run }) => {\n const fc = run.outcome?.failureClass\n if (fc && fc !== 'unknown')\n return { failureClass: fc, reason: 'outcome.failureClass set explicitly' }\n return null\n },\n },\n {\n id: 'knowledge-readiness-blocked',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'readiness_scored' &&\n e.payload.passed === false,\n )\n return event\n ? {\n failureClass: 'knowledge_readiness_blocked',\n reason: 'knowledge readiness report blocked execution',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'bad-integration-manifest',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n ((e.payload.kind === 'integration_manifest_validated' && e.payload.valid === false) ||\n (e.payload.kind === 'integration_invoke_failed' &&\n e.payload.code === 'manifest_invalid')),\n )\n return event\n ? {\n failureClass: 'bad_integration_manifest',\n reason: 'integration manifest validation failed before launch',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'missing-integration-connection',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'integration_manifest_resolved' &&\n hasResolutionStatus(e.payload, 'missing_connection'),\n )\n return event\n ? {\n failureClass: 'missing_integration_connection',\n reason: 'required integration connection was missing',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'missing-integration-scope',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n ((e.payload.kind === 'integration_manifest_resolved' && hasMissingScopes(e.payload)) ||\n (e.payload.kind === 'integration_invoke_failed' && e.payload.code === 'scope_denied')),\n )\n return event\n ? {\n failureClass: 'missing_integration_scope',\n reason: 'integration grant or connection lacks required scopes',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'integration-approval-required',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n ((e.payload.kind === 'integration_invoke' && e.payload.status === 'approval_required') ||\n (e.payload.kind === 'integration_invoke_failed' &&\n e.payload.code === 'approval_required') ||\n e.payload.kind === 'integration_approval_required'),\n )\n return event\n ? {\n failureClass: 'integration_approval_required',\n reason: 'integration write paused for user approval',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'integration-auth-expired',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'integration_invoke_failed' &&\n (e.payload.code === 'auth_expired' ||\n e.payload.code === 'connection_not_active' ||\n e.payload.code === 'capability_expired' ||\n e.payload.status === 'expired'),\n )\n return event\n ? {\n failureClass: 'integration_auth_expired',\n reason: 'integration connection or capability expired',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'unsafe-integration-write-denied',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'integration_invoke_failed' &&\n (e.payload.code === 'unsafe_write_denied' ||\n e.payload.code === 'policy_denied' ||\n e.payload.code === 'action_denied'),\n )\n return event\n ? {\n failureClass: 'unsafe_integration_write_denied',\n reason: 'integration write was denied by policy or capability scope',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'integration-provider-failure',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'integration_invoke_failed' &&\n ![\n 'scope_denied',\n 'approval_required',\n 'auth_expired',\n 'connection_not_active',\n 'capability_expired',\n 'unsafe_write_denied',\n 'policy_denied',\n 'action_denied',\n 'manifest_invalid',\n ].includes(String(e.payload.code)),\n )\n return event\n ? {\n failureClass: 'integration_provider_failure',\n reason: 'integration provider invocation failed',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'missing-credentials',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'knowledge_gap' &&\n e.payload.category === 'credential_or_secret',\n )\n return event\n ? {\n failureClass: 'missing_credentials',\n reason: 'required credential or secret was missing',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'bad-retrieval',\n match: ({ run, spans }) => {\n if (run.outcome?.pass !== false) return null\n const retrieval = spans.find(\n (s) =>\n s.kind === 'retrieval' && (s.hits.length === 0 || s.hits.every((hit) => hit.score <= 0)),\n )\n return retrieval\n ? {\n failureClass: 'bad_retrieval',\n reason: 'retrieval returned no useful hits for a failed run',\n triggerSpanId: retrieval.spanId,\n }\n : null\n },\n },\n {\n id: 'insufficient-evidence',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'knowledge_gap' &&\n e.payload.reason === 'insufficient_evidence',\n )\n return event\n ? {\n failureClass: 'insufficient_evidence',\n reason: 'task proceeded with insufficient supporting evidence',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n {\n id: 'contradictory-evidence',\n match: ({ events }) => {\n const event = events.find(\n (e) =>\n e.kind === 'custom' &&\n e.payload.kind === 'knowledge_gap' &&\n e.payload.reason === 'contradictory_evidence',\n )\n return event\n ? {\n failureClass: 'contradictory_evidence',\n reason: 'supporting evidence contradicted itself',\n triggerEventId: event.eventId,\n }\n : null\n },\n },\n // Budget breach events\n {\n id: 'budget-breach',\n match: ({ events }) => {\n const breach = events.find((e) => e.kind === 'budget_breach')\n return breach\n ? {\n failureClass: 'budget_exceeded',\n reason: `budget breached on ${breach.payload.dimension ?? 'unknown dimension'}`,\n triggerEventId: breach.eventId,\n }\n : null\n },\n },\n // Policy violations\n {\n id: 'policy-violation',\n match: ({ events }) => {\n const e = events.find((x) => x.kind === 'policy_violation')\n return e\n ? {\n failureClass: 'policy_violation',\n reason: 'policy_violation event emitted',\n triggerEventId: e.eventId,\n }\n : null\n },\n },\n // Sandbox non-zero exit code\n {\n id: 'sandbox-failure',\n match: ({ spans }) => {\n const s = spans.find(\n (x) => x.kind === 'sandbox' && typeof x.exitCode === 'number' && x.exitCode !== 0,\n )\n if (!s) return null\n return {\n failureClass: 'sandbox_failure',\n reason: `sandbox exited ${(s as Extract<Span, { kind: 'sandbox' }>).exitCode}`,\n triggerSpanId: s.spanId,\n }\n },\n },\n // Timeout: run aborted by external signal\n {\n id: 'timeout',\n match: ({ run, events }) => {\n if (run.status !== 'aborted') return null\n const hasTimeout = events.some(\n (e) =>\n e.kind === 'error' &&\n String(e.payload.reason ?? '')\n .toLowerCase()\n .includes('timeout'),\n )\n const note = (run.outcome?.notes ?? '').toLowerCase()\n if (hasTimeout || note.includes('timeout') || note.includes('deadline')) {\n return { failureClass: 'timeout', reason: 'timeout signal observed' }\n }\n return null\n },\n },\n // Tool recovery failure: many consecutive tool errors on the same tool\n {\n id: 'tool-recovery-failure',\n match: ({ spans }) => {\n const tools = spans.filter((s) => s.kind === 'tool')\n const byTool = new Map<string, Span[]>()\n for (const t of tools) {\n const name = (t as Extract<Span, { kind: 'tool' }>).toolName\n const arr = byTool.get(name) ?? []\n arr.push(t)\n byTool.set(name, arr)\n }\n for (const [name, arr] of byTool) {\n const errs = arr.filter((s) => s.status === 'error')\n if (errs.length >= 3 && errs.length === arr.length) {\n return {\n failureClass: 'tool_recovery_failure',\n reason: `${errs.length} consecutive errors on tool \"${name}\"`,\n triggerSpanId: errs[errs.length - 1]!.spanId,\n }\n }\n }\n return null\n },\n },\n // Tool selection error: the run failed and agent called zero tools despite having them\n {\n id: 'tool-selection-error',\n match: ({ run, spans }) => {\n if (run.outcome?.pass !== false) return null\n const hasToolsAvailable = spans.some(\n (s) =>\n s.kind === 'agent' &&\n (s.attributes?.toolsAvailable as number | undefined) !== undefined &&\n (s.attributes?.toolsAvailable as number) > 0,\n )\n const tools = spans.filter((s) => s.kind === 'tool')\n if (hasToolsAvailable && tools.length === 0) {\n return {\n failureClass: 'tool_selection_error',\n reason: 'tools were available but none were called',\n }\n }\n return null\n },\n },\n // Format drift: scored by a judge with dimension='format' below threshold\n {\n id: 'format-drift',\n match: ({ spans }) => {\n const judge = spans.find(\n (s) =>\n s.kind === 'judge' &&\n (s as Extract<Span, { kind: 'judge' }>).dimension === 'format' &&\n (s as Extract<Span, { kind: 'judge' }>).score < 0.5,\n )\n return judge\n ? {\n failureClass: 'format_drift',\n reason: 'format judge scored below 0.5',\n triggerSpanId: judge.spanId,\n }\n : null\n },\n },\n]\n\nfunction hasResolutionStatus(payload: Record<string, unknown>, status: string): boolean {\n if (status === 'missing_connection' && stringArray(payload.missingConnections).length > 0)\n return true\n return resolutionItems(payload).some((item) => item.status === status)\n}\n\nfunction hasMissingScopes(payload: Record<string, unknown>): boolean {\n if (stringArray(payload.missingScopes).length > 0) return true\n return resolutionItems(payload).some(\n (item) => Array.isArray(item.missingScopes) && item.missingScopes.length > 0,\n )\n}\n\nfunction resolutionItems(payload: Record<string, unknown>): Array<Record<string, unknown>> {\n return [\n ...records(payload.missing),\n ...records(payload.optionalMissing),\n ...records(payload.ready),\n ]\n}\n\nfunction records(value: unknown): Array<Record<string, unknown>> {\n if (!Array.isArray(value)) return []\n return value.filter(\n (item): item is Record<string, unknown> =>\n Boolean(item) && typeof item === 'object' && !Array.isArray(item),\n )\n}\n\nfunction stringArray(value: unknown): string[] {\n return Array.isArray(value)\n ? value.filter((item): item is string => typeof item === 'string')\n : []\n}\n\n/** Classify the failure mode of a run using an ordered rule list. */\nexport function classifyFailure(\n ctx: FailureContext,\n rules: FailureRule[] = DEFAULT_RULES,\n): FailureClassification {\n if (ctx.run.outcome?.pass !== false && ctx.run.status === 'completed') {\n return { failureClass: 'success', reason: 'run completed with pass=true (or no explicit fail)' }\n }\n for (const rule of rules) {\n const hit = rule.match(ctx)\n if (hit) return hit\n }\n return { failureClass: 'unknown', reason: 'no rule matched; run failed for unclassified reason' }\n}\n","/**\n * FailureClusterView — groups failed runs by (failureClass, triggerTool,\n * argHash-prefix) so weekly reviews can prioritize the top-N clusters.\n *\n * Each cluster includes: N runs, scenarios affected, representative\n * error message, a proposed mitigation hint (rule → action table).\n */\n\nimport { classifyFailure, DEFAULT_RULES, type FailureRule } from '../failure-taxonomy'\nimport { argHash, toolSpans } from '../trace/query'\nimport type { FailureClass, Span } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface FailureCluster {\n failureClass: FailureClass\n /** Tool name when the trigger was a tool span, else undefined. */\n toolName?: string\n /** First 16 chars of argHash — clusters similar args. */\n argPrefix?: string\n /**\n * Source dimension when the trigger was a judge span (e.g. `'format'`,\n * `'safety'`, `'correctness'`). Lets cross-template aggregators\n * group failures by the dimension that fired without overloading\n * `argPrefix`. Optional — clusters without this field deserialize cleanly.\n */\n dimension?: string\n runCount: number\n scenarioIds: string[]\n exampleError?: string\n exampleRunId: string\n}\n\nexport interface FailureClusterReport {\n clusters: FailureCluster[]\n totalFailures: number\n totalRuns: number\n}\n\nexport async function failureClusterView(\n store: TraceStore,\n options: { rules?: FailureRule[]; minClusterSize?: number } = {},\n): Promise<FailureClusterReport> {\n const rules = options.rules ?? DEFAULT_RULES\n const minSize = options.minClusterSize ?? 1\n const runs = await store.listRuns()\n\n type Key = string\n const clusters = new Map<Key, FailureCluster>()\n let totalFailures = 0\n\n for (const run of runs) {\n if (run.status === 'completed' && run.outcome?.pass !== false) continue\n totalFailures++\n const spans = await store.spans({ runId: run.runId })\n const events = await store.events({ runId: run.runId })\n const cls = classifyFailure({ run, spans, events }, rules)\n\n let toolName: string | undefined\n let argPrefix: string | undefined\n let dimension: string | undefined\n if (cls.triggerSpanId) {\n const trig = spans.find((s) => s.spanId === cls.triggerSpanId)\n if (trig?.kind === 'tool') {\n toolName = trig.toolName\n argPrefix = argHash(trig.args).slice(0, 16)\n } else if (trig?.kind === 'judge') {\n dimension = trig.dimension\n }\n }\n // Fallback: look at the last errored tool span\n if (!toolName) {\n const ts = await toolSpans(store, run.runId)\n const errored = ts.filter((t) => t.status === 'error').pop()\n if (errored) {\n toolName = errored.toolName\n argPrefix = argHash(errored.args).slice(0, 16)\n }\n }\n // Secondary signal: any judge span on the failed run carries a\n // dimension. Useful when the rule classified by judge score but\n // didn't surface the trigger span (or surfaced a non-judge span).\n if (!dimension) {\n const judge = spans.find((s) => s.kind === 'judge' && typeof s.dimension === 'string')\n if (judge?.kind === 'judge') dimension = judge.dimension\n }\n\n const key = `${cls.failureClass}|${toolName ?? ''}|${argPrefix ?? ''}|${dimension ?? ''}`\n let cluster = clusters.get(key)\n if (!cluster) {\n cluster = {\n failureClass: cls.failureClass,\n toolName,\n argPrefix,\n dimension,\n runCount: 0,\n scenarioIds: [],\n exampleRunId: run.runId,\n exampleError: firstErrorMessage(spans) ?? cls.reason,\n }\n clusters.set(key, cluster)\n }\n cluster.runCount++\n if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId)\n }\n\n const arr = [...clusters.values()]\n .filter((c) => c.runCount >= minSize)\n .sort((a, b) => b.runCount - a.runCount)\n\n return { clusters: arr, totalFailures, totalRuns: runs.length }\n}\n\nfunction firstErrorMessage(spans: Span[]): string | undefined {\n const errored = spans.find((s) => s.status === 'error')\n return errored?.error\n}\n","/**\n * Tool-use metrics — derived purely from trace data.\n *\n * No scoring assumptions: consumers supply optional ground-truth tool\n * selections per turn + optional \"information used downstream\" signals.\n * Without those, we still compute descriptive metrics (error rate,\n * retry rate, duplicate-call rate) that are useful on their own.\n */\n\nimport { argHash, groupBy, toolSpans } from './trace/query'\nimport type { Span } from './trace/schema'\nimport type { TraceStore } from './trace/store'\n\nexport interface ToolUseMetrics {\n runId: string\n totalCalls: number\n byTool: Record<string, ToolStats>\n errorRate: number\n /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */\n duplicateRate: number\n /** Ratio of error calls followed by ≥1 retry on same tool. */\n retryRate: number\n /** Optional: of the calls agent made, fraction the evaluator marked as \"correct selection\". */\n selectionAccuracy?: number\n}\n\nexport interface ToolStats {\n calls: number\n errors: number\n avgLatencyMs: number\n duplicates: number\n}\n\nexport interface ToolUseOptions {\n /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */\n selectionLabels?: Record<string, boolean>\n}\n\nexport async function computeToolUseMetrics(\n store: TraceStore,\n runId: string,\n options: ToolUseOptions = {},\n): Promise<ToolUseMetrics> {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n return { runId, totalCalls: 0, byTool: {}, errorRate: 0, duplicateRate: 0, retryRate: 0 }\n }\n\n const byTool: Record<string, ToolStats> = {}\n let totalErrors = 0\n let totalDuplicates = 0\n const sortedTools = [...tools].sort((a, b) => a.startedAt - b.startedAt)\n const seenSignatures = new Set<string>()\n\n // duplicate detection + per-tool aggregation\n for (const t of sortedTools) {\n const stat = (byTool[t.toolName] ??= { calls: 0, errors: 0, avgLatencyMs: 0, duplicates: 0 })\n stat.calls += 1\n if (t.status === 'error') {\n stat.errors += 1\n totalErrors += 1\n }\n if (typeof t.latencyMs === 'number') stat.avgLatencyMs += t.latencyMs\n const sig = `${t.toolName}|${argHash(t.args)}`\n if (seenSignatures.has(sig)) {\n stat.duplicates += 1\n totalDuplicates += 1\n }\n seenSignatures.add(sig)\n }\n\n for (const stat of Object.values(byTool)) {\n stat.avgLatencyMs = stat.calls > 0 ? stat.avgLatencyMs / stat.calls : 0\n }\n\n // retry detection: per-tool chronological adjacency where error → next same-tool call\n let retryOpportunities = 0\n let retriesFollowed = 0\n for (const [, arr] of groupBy(sortedTools, (t) => t.toolName)) {\n for (let i = 0; i < arr.length; i++) {\n if (arr[i]!.status !== 'error') continue\n retryOpportunities += 1\n if (arr[i + 1]) retriesFollowed += 1\n }\n }\n const retryRate = retryOpportunities > 0 ? retriesFollowed / retryOpportunities : 0\n\n let selectionAccuracy: number | undefined\n if (options.selectionLabels) {\n const labeled = sortedTools.filter((t) => t.spanId in options.selectionLabels!)\n if (labeled.length > 0) {\n selectionAccuracy =\n labeled.filter((t) => options.selectionLabels![t.spanId]).length / labeled.length\n }\n }\n\n return {\n runId,\n totalCalls: sortedTools.length,\n byTool,\n errorRate: totalErrors / sortedTools.length,\n duplicateRate: totalDuplicates / sortedTools.length,\n retryRate,\n selectionAccuracy,\n }\n}\n\nexport type { Span }\n","/**\n * Baseline regression detection.\n *\n * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down\n * to: \"is this run measurably worse than baseline?\" — with enough\n * statistical rigor to distinguish noise from drift.\n *\n * Uses:\n * - Welch's t-test (unequal variance) for per-metric mean comparison\n * - Cohen's d for effect size magnitude\n * - IQR for stability flag (unstable samples can't be trusted for comparisons)\n *\n * Returns a structured verdict: improved | regressed | stable | unstable.\n */\n\nimport { cohensD } from './statistics'\n\nexport interface MetricSamples {\n /** Stable metric key (e.g. \"overallScore\", \"firstTokenMs\"). */\n metric: string\n /** Whether higher values are better. */\n higherIsBetter: boolean\n baseline: number[]\n candidate: number[]\n}\n\nexport interface MetricVerdict {\n metric: string\n baselineMean: number\n candidateMean: number\n delta: number\n cohensD: number\n welchT: number\n welchDf: number\n welchP: number\n stable: boolean\n /** IQR of the combined samples — used as a rough stability indicator. */\n iqr: number\n verdict: 'improved' | 'regressed' | 'stable' | 'unstable'\n}\n\nexport interface BaselineReport {\n metrics: MetricVerdict[]\n /** True if any critical metric regressed. */\n hasRegression: boolean\n /** True if any metric is unstable (too noisy to judge). */\n hasUnstable: boolean\n}\n\nexport interface BaselineOptions {\n /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */\n effectThreshold?: number\n /** p-value threshold for statistical significance (default 0.05). */\n alpha?: number\n /** IQR/mean ratio above which samples are flagged unstable (default 0.30). */\n unstableCvThreshold?: number\n}\n\n/**\n * Compare candidate samples against baseline per metric. Verdict logic:\n * - unstable: IQR/|mean| > threshold on either set — not enough signal\n * - improved: meaningful effect in the \"better\" direction AND p < alpha\n * - regressed: meaningful effect in the \"worse\" direction AND p < alpha\n * - stable: otherwise (no significant change)\n */\nexport function compareToBaseline(\n samples: MetricSamples[],\n options: BaselineOptions = {},\n): BaselineReport {\n const effectThreshold = options.effectThreshold ?? 0.5\n const alpha = options.alpha ?? 0.05\n const cvThreshold = options.unstableCvThreshold ?? 0.3\n\n const metrics: MetricVerdict[] = samples.map((s) => {\n if (s.baseline.length < 2 || s.candidate.length < 2) {\n throw new Error(`compareToBaseline: need ≥2 samples per side for \"${s.metric}\"`)\n }\n const bMean = mean(s.baseline)\n const cMean = mean(s.candidate)\n const delta = cMean - bMean\n const d = cohensD(s.baseline, s.candidate) // positive = candidate higher\n const { t, df, p } = welchsTTest(s.baseline, s.candidate)\n // Stability is per-side: a comparison is trustworthy only when BOTH\n // samples are internally consistent. Combining the sides would flag\n // large-but-real deltas as \"unstable\" which is exactly what we want\n // to detect.\n const baselineIqr = iqr(s.baseline)\n const candidateIqr = iqr(s.candidate)\n const baselineStable = baselineIqr / Math.max(Math.abs(bMean), 1e-9) <= cvThreshold\n const candidateStable = candidateIqr / Math.max(Math.abs(cMean), 1e-9) <= cvThreshold\n const stable = baselineStable && candidateStable\n const reportedIqr = Math.max(baselineIqr, candidateIqr)\n\n let verdict: MetricVerdict['verdict']\n if (!stable) {\n verdict = 'unstable'\n } else if (p < alpha && Math.abs(d) >= effectThreshold) {\n const candidateIsBetter = s.higherIsBetter ? delta > 0 : delta < 0\n verdict = candidateIsBetter ? 'improved' : 'regressed'\n } else {\n verdict = 'stable'\n }\n\n return {\n metric: s.metric,\n baselineMean: bMean,\n candidateMean: cMean,\n delta,\n cohensD: d,\n welchT: t,\n welchDf: df,\n welchP: p,\n stable,\n iqr: reportedIqr,\n verdict,\n }\n })\n\n return {\n metrics,\n hasRegression: metrics.some((m) => m.verdict === 'regressed'),\n hasUnstable: metrics.some((m) => m.verdict === 'unstable'),\n }\n}\n\nfunction mean(xs: number[]): number {\n return xs.reduce((a, b) => a + b, 0) / xs.length\n}\n\n/** Inter-quartile range; 0 when the sample has no spread. */\nexport function iqr(xs: number[]): number {\n if (xs.length === 0) return 0\n const sorted = [...xs].sort((a, b) => a - b)\n const q = (p: number) => {\n const idx = p * (sorted.length - 1)\n const lo = Math.floor(idx)\n const hi = Math.ceil(idx)\n return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (idx - lo)\n }\n return q(0.75) - q(0.25)\n}\n\n/**\n * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t\n * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail\n * when df is large.\n */\nexport function welchsTTest(a: number[], b: number[]): { t: number; df: number; p: number } {\n if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 }\n const mA = mean(a)\n const mB = mean(b)\n const vA = variance(a, mA)\n const vB = variance(b, mB)\n const seSquared = vA / a.length + vB / b.length\n if (seSquared === 0) return { t: mA === mB ? 0 : Infinity, df: 0, p: mA === mB ? 1 : 0 }\n const t = (mB - mA) / Math.sqrt(seSquared)\n const df =\n (seSquared * seSquared) /\n ((vA / a.length) ** 2 / (a.length - 1) + (vB / b.length) ** 2 / (b.length - 1))\n const p = 2 * (1 - studentTCdf(Math.abs(t), df))\n return { t, df, p }\n}\n\nfunction variance(xs: number[], m: number): number {\n return xs.reduce((acc, x) => acc + (x - m) ** 2, 0) / (xs.length - 1)\n}\n\n// Re-used from statistics.ts via small local copy to avoid exporting internals.\nfunction studentTCdf(t: number, df: number): number {\n if (df <= 0) return 0.5\n if (df > 100) return normalCdf(t)\n const x = df / (df + t * t)\n const ib = incompleteBeta(x, df / 2, 0.5)\n return t >= 0 ? 1 - 0.5 * ib : 0.5 * ib\n}\n\nfunction incompleteBeta(x: number, a: number, b: number): number {\n if (x <= 0) return 0\n if (x >= 1) return 1\n const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b)\n const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a\n let c = 1\n let d = 1 - ((a + b) * x) / (a + 1)\n if (Math.abs(d) < 1e-30) d = 1e-30\n d = 1 / d\n let f = d\n for (let m = 1; m <= 200; m++) {\n const m2 = 2 * m\n let num = (m * (b - m) * x) / ((a + m2 - 1) * (a + m2))\n d = 1 + num * d\n if (Math.abs(d) < 1e-30) d = 1e-30\n c = 1 + num / c\n if (Math.abs(c) < 1e-30) c = 1e-30\n d = 1 / d\n f *= d * c\n num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1))\n d = 1 + num * d\n if (Math.abs(d) < 1e-30) d = 1e-30\n c = 1 + num / c\n if (Math.abs(c) < 1e-30) c = 1e-30\n d = 1 / d\n const delta = d * c\n f *= delta\n if (Math.abs(delta - 1) < 3e-7) break\n }\n return front * f\n}\n\nfunction lnGamma(z: number): number {\n const coefs = [\n 0.99999999999980993, 676.5203681218851, -1259.1392167224028, 771.32342877765313,\n -176.61502916214059, 12.507343278686905, -0.13857109526572012, 9.9843695780195716e-6,\n 1.5056327351493116e-7,\n ]\n if (z < 0.5) return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z)\n z -= 1\n let x = coefs[0]!\n for (let i = 1; i < 9; i++) x += coefs[i]! / (z + i)\n const t = z + 7.5\n return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x)\n}\n\nfunction normalCdf(x: number): number {\n const a1 = 0.254829592\n const a2 = -0.284496736\n const a3 = 1.421413741\n const a4 = -1.453152027\n const a5 = 1.061405429\n const p = 0.3275911\n const sign = x < 0 ? -1 : 1\n const absX = Math.abs(x)\n const t = 1 / (1 + p * absX)\n const y = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp((-absX * absX) / 2)\n return 0.5 * (1 + sign * y)\n}\n"],"mappings":";;;;;;;;;;AAwCO,IAAM,gBAA+B;AAAA;AAAA,EAE1C;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,IAAI,MAAM;AAClB,YAAM,KAAK,IAAI,SAAS;AACxB,UAAI,MAAM,OAAO;AACf,eAAO,EAAE,cAAc,IAAI,QAAQ,sCAAsC;AAC3E,aAAO;AAAA,IACT;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,sBACnB,EAAE,QAAQ,WAAW;AAAA,MACzB;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,aACT,EAAE,QAAQ,SAAS,oCAAoC,EAAE,QAAQ,UAAU,SAC1E,EAAE,QAAQ,SAAS,+BAClB,EAAE,QAAQ,SAAS;AAAA,MAC3B;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,mCACnB,oBAAoB,EAAE,SAAS,oBAAoB;AAAA,MACvD;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,aACT,EAAE,QAAQ,SAAS,mCAAmC,iBAAiB,EAAE,OAAO,KAC/E,EAAE,QAAQ,SAAS,+BAA+B,EAAE,QAAQ,SAAS;AAAA,MAC5E;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,aACT,EAAE,QAAQ,SAAS,wBAAwB,EAAE,QAAQ,WAAW,uBAC/D,EAAE,QAAQ,SAAS,+BAClB,EAAE,QAAQ,SAAS,uBACrB,EAAE,QAAQ,SAAS;AAAA,MACzB;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,gCAClB,EAAE,QAAQ,SAAS,kBAClB,EAAE,QAAQ,SAAS,2BACnB,EAAE,QAAQ,SAAS,wBACnB,EAAE,QAAQ,WAAW;AAAA,MAC3B;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,gCAClB,EAAE,QAAQ,SAAS,yBAClB,EAAE,QAAQ,SAAS,mBACnB,EAAE,QAAQ,SAAS;AAAA,MACzB;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,+BACnB,CAAC;AAAA,UACC;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF,EAAE,SAAS,OAAO,EAAE,QAAQ,IAAI,CAAC;AAAA,MACrC;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,mBACnB,EAAE,QAAQ,aAAa;AAAA,MAC3B;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,KAAK,MAAM,MAAM;AACzB,UAAI,IAAI,SAAS,SAAS,MAAO,QAAO;AACxC,YAAM,YAAY,MAAM;AAAA,QACtB,CAAC,MACC,EAAE,SAAS,gBAAgB,EAAE,KAAK,WAAW,KAAK,EAAE,KAAK,MAAM,CAAC,QAAQ,IAAI,SAAS,CAAC;AAAA,MAC1F;AACA,aAAO,YACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,eAAe,UAAU;AAAA,MAC3B,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,mBACnB,EAAE,QAAQ,WAAW;AAAA,MACzB;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,QAAQ,OAAO;AAAA,QACnB,CAAC,MACC,EAAE,SAAS,YACX,EAAE,QAAQ,SAAS,mBACnB,EAAE,QAAQ,WAAW;AAAA,MACzB;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,MAAM;AAAA,MACxB,IACA;AAAA,IACN;AAAA,EACF;AAAA;AAAA,EAEA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,SAAS,OAAO,KAAK,CAAC,MAAM,EAAE,SAAS,eAAe;AAC5D,aAAO,SACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ,sBAAsB,OAAO,QAAQ,aAAa,mBAAmB;AAAA,QAC7E,gBAAgB,OAAO;AAAA,MACzB,IACA;AAAA,IACN;AAAA,EACF;AAAA;AAAA,EAEA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,OAAO,MAAM;AACrB,YAAM,IAAI,OAAO,KAAK,CAAC,MAAM,EAAE,SAAS,kBAAkB;AAC1D,aAAO,IACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,gBAAgB,EAAE;AAAA,MACpB,IACA;AAAA,IACN;AAAA,EACF;AAAA;AAAA,EAEA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,MAAM,MAAM;AACpB,YAAM,IAAI,MAAM;AAAA,QACd,CAAC,MAAM,EAAE,SAAS,aAAa,OAAO,EAAE,aAAa,YAAY,EAAE,aAAa;AAAA,MAClF;AACA,UAAI,CAAC,EAAG,QAAO;AACf,aAAO;AAAA,QACL,cAAc;AAAA,QACd,QAAQ,kBAAmB,EAAyC,QAAQ;AAAA,QAC5E,eAAe,EAAE;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AAAA;AAAA,EAEA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,KAAK,OAAO,MAAM;AAC1B,UAAI,IAAI,WAAW,UAAW,QAAO;AACrC,YAAM,aAAa,OAAO;AAAA,QACxB,CAAC,MACC,EAAE,SAAS,WACX,OAAO,EAAE,QAAQ,UAAU,EAAE,EAC1B,YAAY,EACZ,SAAS,SAAS;AAAA,MACzB;AACA,YAAM,QAAQ,IAAI,SAAS,SAAS,IAAI,YAAY;AACpD,UAAI,cAAc,KAAK,SAAS,SAAS,KAAK,KAAK,SAAS,UAAU,GAAG;AACvE,eAAO,EAAE,cAAc,WAAW,QAAQ,0BAA0B;AAAA,MACtE;AACA,aAAO;AAAA,IACT;AAAA,EACF;AAAA;AAAA,EAEA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,MAAM,MAAM;AACpB,YAAM,QAAQ,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,MAAM;AACnD,YAAM,SAAS,oBAAI,IAAoB;AACvC,iBAAW,KAAK,OAAO;AACrB,cAAM,OAAQ,EAAsC;AACpD,cAAM,MAAM,OAAO,IAAI,IAAI,KAAK,CAAC;AACjC,YAAI,KAAK,CAAC;AACV,eAAO,IAAI,MAAM,GAAG;AAAA,MACtB;AACA,iBAAW,CAAC,MAAM,GAAG,KAAK,QAAQ;AAChC,cAAM,OAAO,IAAI,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO;AACnD,YAAI,KAAK,UAAU,KAAK,KAAK,WAAW,IAAI,QAAQ;AAClD,iBAAO;AAAA,YACL,cAAc;AAAA,YACd,QAAQ,GAAG,KAAK,MAAM,gCAAgC,IAAI;AAAA,YAC1D,eAAe,KAAK,KAAK,SAAS,CAAC,EAAG;AAAA,UACxC;AAAA,QACF;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AAAA;AAAA,EAEA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,KAAK,MAAM,MAAM;AACzB,UAAI,IAAI,SAAS,SAAS,MAAO,QAAO;AACxC,YAAM,oBAAoB,MAAM;AAAA,QAC9B,CAAC,MACC,EAAE,SAAS,WACV,EAAE,YAAY,mBAA0C,UACxD,EAAE,YAAY,iBAA4B;AAAA,MAC/C;AACA,YAAM,QAAQ,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,MAAM;AACnD,UAAI,qBAAqB,MAAM,WAAW,GAAG;AAC3C,eAAO;AAAA,UACL,cAAc;AAAA,UACd,QAAQ;AAAA,QACV;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AAAA;AAAA,EAEA;AAAA,IACE,IAAI;AAAA,IACJ,OAAO,CAAC,EAAE,MAAM,MAAM;AACpB,YAAM,QAAQ,MAAM;AAAA,QAClB,CAAC,MACC,EAAE,SAAS,WACV,EAAuC,cAAc,YACrD,EAAuC,QAAQ;AAAA,MACpD;AACA,aAAO,QACH;AAAA,QACE,cAAc;AAAA,QACd,QAAQ;AAAA,QACR,eAAe,MAAM;AAAA,MACvB,IACA;AAAA,IACN;AAAA,EACF;AACF;AAEA,SAAS,oBAAoB,SAAkC,QAAyB;AACtF,MAAI,WAAW,wBAAwB,YAAY,QAAQ,kBAAkB,EAAE,SAAS;AACtF,WAAO;AACT,SAAO,gBAAgB,OAAO,EAAE,KAAK,CAAC,SAAS,KAAK,WAAW,MAAM;AACvE;AAEA,SAAS,iBAAiB,SAA2C;AACnE,MAAI,YAAY,QAAQ,aAAa,EAAE,SAAS,EAAG,QAAO;AAC1D,SAAO,gBAAgB,OAAO,EAAE;AAAA,IAC9B,CAAC,SAAS,MAAM,QAAQ,KAAK,aAAa,KAAK,KAAK,cAAc,SAAS;AAAA,EAC7E;AACF;AAEA,SAAS,gBAAgB,SAAkE;AACzF,SAAO;AAAA,IACL,GAAG,QAAQ,QAAQ,OAAO;AAAA,IAC1B,GAAG,QAAQ,QAAQ,eAAe;AAAA,IAClC,GAAG,QAAQ,QAAQ,KAAK;AAAA,EAC1B;AACF;AAEA,SAAS,QAAQ,OAAgD;AAC/D,MAAI,CAAC,MAAM,QAAQ,KAAK,EAAG,QAAO,CAAC;AACnC,SAAO,MAAM;AAAA,IACX,CAAC,SACC,QAAQ,IAAI,KAAK,OAAO,SAAS,YAAY,CAAC,MAAM,QAAQ,IAAI;AAAA,EACpE;AACF;AAEA,SAAS,YAAY,OAA0B;AAC7C,SAAO,MAAM,QAAQ,KAAK,IACtB,MAAM,OAAO,CAAC,SAAyB,OAAO,SAAS,QAAQ,IAC/D,CAAC;AACP;AAGO,SAAS,gBACd,KACA,QAAuB,eACA;AACvB,MAAI,IAAI,IAAI,SAAS,SAAS,SAAS,IAAI,IAAI,WAAW,aAAa;AACrE,WAAO,EAAE,cAAc,WAAW,QAAQ,qDAAqD;AAAA,EACjG;AACA,aAAW,QAAQ,OAAO;AACxB,UAAM,MAAM,KAAK,MAAM,GAAG;AAC1B,QAAI,IAAK,QAAO;AAAA,EAClB;AACA,SAAO,EAAE,cAAc,WAAW,QAAQ,sDAAsD;AAClG;;;ACvaA,eAAsB,mBACpB,OACA,UAA8D,CAAC,GAChC;AAC/B,QAAM,QAAQ,QAAQ,SAAS;AAC/B,QAAM,UAAU,QAAQ,kBAAkB;AAC1C,QAAM,OAAO,MAAM,MAAM,SAAS;AAGlC,QAAM,WAAW,oBAAI,IAAyB;AAC9C,MAAI,gBAAgB;AAEpB,aAAW,OAAO,MAAM;AACtB,QAAI,IAAI,WAAW,eAAe,IAAI,SAAS,SAAS,MAAO;AAC/D;AACA,UAAM,QAAQ,MAAM,MAAM,MAAM,EAAE,OAAO,IAAI,MAAM,CAAC;AACpD,UAAM,SAAS,MAAM,MAAM,OAAO,EAAE,OAAO,IAAI,MAAM,CAAC;AACtD,UAAM,MAAM,gBAAgB,EAAE,KAAK,OAAO,OAAO,GAAG,KAAK;AAEzD,QAAI;AACJ,QAAI;AACJ,QAAI;AACJ,QAAI,IAAI,eAAe;AACrB,YAAM,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,IAAI,aAAa;AAC7D,UAAI,MAAM,SAAS,QAAQ;AACzB,mBAAW,KAAK;AAChB,oBAAY,QAAQ,KAAK,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC5C,WAAW,MAAM,SAAS,SAAS;AACjC,oBAAY,KAAK;AAAA,MACnB;AAAA,IACF;AAEA,QAAI,CAAC,UAAU;AACb,YAAM,KAAK,MAAM,UAAU,OAAO,IAAI,KAAK;AAC3C,YAAM,UAAU,GAAG,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO,EAAE,IAAI;AAC3D,UAAI,SAAS;AACX,mBAAW,QAAQ;AACnB,oBAAY,QAAQ,QAAQ,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC/C;AAAA,IACF;AAIA,QAAI,CAAC,WAAW;AACd,YAAM,QAAQ,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,WAAW,OAAO,EAAE,cAAc,QAAQ;AACrF,UAAI,OAAO,SAAS,QAAS,aAAY,MAAM;AAAA,IACjD;AAEA,UAAM,MAAM,GAAG,IAAI,YAAY,IAAI,YAAY,EAAE,IAAI,aAAa,EAAE,IAAI,aAAa,EAAE;AACvF,QAAI,UAAU,SAAS,IAAI,GAAG;AAC9B,QAAI,CAAC,SAAS;AACZ,gBAAU;AAAA,QACR,cAAc,IAAI;AAAA,QAClB;AAAA,QACA;AAAA,QACA;AAAA,QACA,UAAU;AAAA,QACV,aAAa,CAAC;AAAA,QACd,cAAc,IAAI;AAAA,QAClB,cAAc,kBAAkB,KAAK,KAAK,IAAI;AAAA,MAChD;AACA,eAAS,IAAI,KAAK,OAAO;AAAA,IAC3B;AACA,YAAQ;AACR,QAAI,CAAC,QAAQ,YAAY,SAAS,IAAI,UAAU,EAAG,SAAQ,YAAY,KAAK,IAAI,UAAU;AAAA,EAC5F;AAEA,QAAM,MAAM,CAAC,GAAG,SAAS,OAAO,CAAC,EAC9B,OAAO,CAAC,MAAM,EAAE,YAAY,OAAO,EACnC,KAAK,CAAC,GAAG,MAAM,EAAE,WAAW,EAAE,QAAQ;AAEzC,SAAO,EAAE,UAAU,KAAK,eAAe,WAAW,KAAK,OAAO;AAChE;AAEA,SAAS,kBAAkB,OAAmC;AAC5D,QAAM,UAAU,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,OAAO;AACtD,SAAO,SAAS;AAClB;;;AC7EA,eAAsB,sBACpB,OACA,OACA,UAA0B,CAAC,GACF;AACzB,QAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,MAAI,MAAM,WAAW,GAAG;AACtB,WAAO,EAAE,OAAO,YAAY,GAAG,QAAQ,CAAC,GAAG,WAAW,GAAG,eAAe,GAAG,WAAW,EAAE;AAAA,EAC1F;AAEA,QAAM,SAAoC,CAAC;AAC3C,MAAI,cAAc;AAClB,MAAI,kBAAkB;AACtB,QAAM,cAAc,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AACvE,QAAM,iBAAiB,oBAAI,IAAY;AAGvC,aAAW,KAAK,aAAa;AAC3B,UAAM,OAAQ,OAAO,EAAE,QAAQ,MAAM,EAAE,OAAO,GAAG,QAAQ,GAAG,cAAc,GAAG,YAAY,EAAE;AAC3F,SAAK,SAAS;AACd,QAAI,EAAE,WAAW,SAAS;AACxB,WAAK,UAAU;AACf,qBAAe;AAAA,IACjB;AACA,QAAI,OAAO,EAAE,cAAc,SAAU,MAAK,gBAAgB,EAAE;AAC5D,UAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,QAAQ,EAAE,IAAI,CAAC;AAC5C,QAAI,eAAe,IAAI,GAAG,GAAG;AAC3B,WAAK,cAAc;AACnB,yBAAmB;AAAA,IACrB;AACA,mBAAe,IAAI,GAAG;AAAA,EACxB;AAEA,aAAW,QAAQ,OAAO,OAAO,MAAM,GAAG;AACxC,SAAK,eAAe,KAAK,QAAQ,IAAI,KAAK,eAAe,KAAK,QAAQ;AAAA,EACxE;AAGA,MAAI,qBAAqB;AACzB,MAAI,kBAAkB;AACtB,aAAW,CAAC,EAAE,GAAG,KAAK,QAAQ,aAAa,CAAC,MAAM,EAAE,QAAQ,GAAG;AAC7D,aAAS,IAAI,GAAG,IAAI,IAAI,QAAQ,KAAK;AACnC,UAAI,IAAI,CAAC,EAAG,WAAW,QAAS;AAChC,4BAAsB;AACtB,UAAI,IAAI,IAAI,CAAC,EAAG,oBAAmB;AAAA,IACrC;AAAA,EACF;AACA,QAAM,YAAY,qBAAqB,IAAI,kBAAkB,qBAAqB;AAElF,MAAI;AACJ,MAAI,QAAQ,iBAAiB;AAC3B,UAAM,UAAU,YAAY,OAAO,CAAC,MAAM,EAAE,UAAU,QAAQ,eAAgB;AAC9E,QAAI,QAAQ,SAAS,GAAG;AACtB,0BACE,QAAQ,OAAO,CAAC,MAAM,QAAQ,gBAAiB,EAAE,MAAM,CAAC,EAAE,SAAS,QAAQ;AAAA,IAC/E;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,YAAY,YAAY;AAAA,IACxB;AAAA,IACA,WAAW,cAAc,YAAY;AAAA,IACrC,eAAe,kBAAkB,YAAY;AAAA,IAC7C;AAAA,IACA;AAAA,EACF;AACF;;;ACxCO,SAAS,kBACd,SACA,UAA2B,CAAC,GACZ;AAChB,QAAM,kBAAkB,QAAQ,mBAAmB;AACnD,QAAM,QAAQ,QAAQ,SAAS;AAC/B,QAAM,cAAc,QAAQ,uBAAuB;AAEnD,QAAM,UAA2B,QAAQ,IAAI,CAAC,MAAM;AAClD,QAAI,EAAE,SAAS,SAAS,KAAK,EAAE,UAAU,SAAS,GAAG;AACnD,YAAM,IAAI,MAAM,yDAAoD,EAAE,MAAM,GAAG;AAAA,IACjF;AACA,UAAM,QAAQ,KAAK,EAAE,QAAQ;AAC7B,UAAM,QAAQ,KAAK,EAAE,SAAS;AAC9B,UAAM,QAAQ,QAAQ;AACtB,UAAM,IAAI,QAAQ,EAAE,UAAU,EAAE,SAAS;AACzC,UAAM,EAAE,GAAG,IAAI,EAAE,IAAI,YAAY,EAAE,UAAU,EAAE,SAAS;AAKxD,UAAM,cAAc,IAAI,EAAE,QAAQ;AAClC,UAAM,eAAe,IAAI,EAAE,SAAS;AACpC,UAAM,iBAAiB,cAAc,KAAK,IAAI,KAAK,IAAI,KAAK,GAAG,IAAI,KAAK;AACxE,UAAM,kBAAkB,eAAe,KAAK,IAAI,KAAK,IAAI,KAAK,GAAG,IAAI,KAAK;AAC1E,UAAM,SAAS,kBAAkB;AACjC,UAAM,cAAc,KAAK,IAAI,aAAa,YAAY;AAEtD,QAAI;AACJ,QAAI,CAAC,QAAQ;AACX,gBAAU;AAAA,IACZ,WAAW,IAAI,SAAS,KAAK,IAAI,CAAC,KAAK,iBAAiB;AACtD,YAAM,oBAAoB,EAAE,iBAAiB,QAAQ,IAAI,QAAQ;AACjE,gBAAU,oBAAoB,aAAa;AAAA,IAC7C,OAAO;AACL,gBAAU;AAAA,IACZ;AAEA,WAAO;AAAA,MACL,QAAQ,EAAE;AAAA,MACV,cAAc;AAAA,MACd,eAAe;AAAA,MACf;AAAA,MACA,SAAS;AAAA,MACT,QAAQ;AAAA,MACR,SAAS;AAAA,MACT,QAAQ;AAAA,MACR;AAAA,MACA,KAAK;AAAA,MACL;AAAA,IACF;AAAA,EACF,CAAC;AAED,SAAO;AAAA,IACL;AAAA,IACA,eAAe,QAAQ,KAAK,CAAC,MAAM,EAAE,YAAY,WAAW;AAAA,IAC5D,aAAa,QAAQ,KAAK,CAAC,MAAM,EAAE,YAAY,UAAU;AAAA,EAC3D;AACF;AAEA,SAAS,KAAK,IAAsB;AAClC,SAAO,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAC5C;AAGO,SAAS,IAAI,IAAsB;AACxC,MAAI,GAAG,WAAW,EAAG,QAAO;AAC5B,QAAM,SAAS,CAAC,GAAG,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3C,QAAM,IAAI,CAAC,MAAc;AACvB,UAAM,MAAM,KAAK,OAAO,SAAS;AACjC,UAAM,KAAK,KAAK,MAAM,GAAG;AACzB,UAAM,KAAK,KAAK,KAAK,GAAG;AACxB,WAAO,OAAO,EAAE,KAAM,OAAO,EAAE,IAAK,OAAO,EAAE,MAAO,MAAM;AAAA,EAC5D;AACA,SAAO,EAAE,IAAI,IAAI,EAAE,IAAI;AACzB;AAOO,SAAS,YAAY,GAAa,GAAmD;AAC1F,MAAI,EAAE,SAAS,KAAK,EAAE,SAAS,EAAG,QAAO,EAAE,GAAG,GAAG,IAAI,GAAG,GAAG,EAAE;AAC7D,QAAM,KAAK,KAAK,CAAC;AACjB,QAAM,KAAK,KAAK,CAAC;AACjB,QAAM,KAAK,SAAS,GAAG,EAAE;AACzB,QAAM,KAAK,SAAS,GAAG,EAAE;AACzB,QAAM,YAAY,KAAK,EAAE,SAAS,KAAK,EAAE;AACzC,MAAI,cAAc,EAAG,QAAO,EAAE,GAAG,OAAO,KAAK,IAAI,UAAU,IAAI,GAAG,GAAG,OAAO,KAAK,IAAI,EAAE;AACvF,QAAM,KAAK,KAAK,MAAM,KAAK,KAAK,SAAS;AACzC,QAAM,KACH,YAAY,cACX,KAAK,EAAE,WAAW,KAAK,EAAE,SAAS,MAAM,KAAK,EAAE,WAAW,KAAK,EAAE,SAAS;AAC9E,QAAM,IAAI,KAAK,IAAI,YAAY,KAAK,IAAI,CAAC,GAAG,EAAE;AAC9C,SAAO,EAAE,GAAG,IAAI,EAAE;AACpB;AAEA,SAAS,SAAS,IAAc,GAAmB;AACjD,SAAO,GAAG,OAAO,CAAC,KAAK,MAAM,OAAO,IAAI,MAAM,GAAG,CAAC,KAAK,GAAG,SAAS;AACrE;AAGA,SAAS,YAAY,GAAW,IAAoB;AAClD,MAAI,MAAM,EAAG,QAAO;AACpB,MAAI,KAAK,IAAK,QAAO,UAAU,CAAC;AAChC,QAAM,IAAI,MAAM,KAAK,IAAI;AACzB,QAAM,KAAK,eAAe,GAAG,KAAK,GAAG,GAAG;AACxC,SAAO,KAAK,IAAI,IAAI,MAAM,KAAK,MAAM;AACvC;AAEA,SAAS,eAAe,GAAW,GAAW,GAAmB;AAC/D,MAAI,KAAK,EAAG,QAAO;AACnB,MAAI,KAAK,EAAG,QAAO;AACnB,QAAM,SAAS,QAAQ,CAAC,IAAI,QAAQ,CAAC,IAAI,QAAQ,IAAI,CAAC;AACtD,QAAM,QAAQ,KAAK,IAAI,KAAK,IAAI,CAAC,IAAI,IAAI,KAAK,IAAI,IAAI,CAAC,IAAI,IAAI,MAAM,IAAI;AACzE,MAAI,IAAI;AACR,MAAI,IAAI,KAAM,IAAI,KAAK,KAAM,IAAI;AACjC,MAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,MAAI,IAAI;AACR,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,KAAK,KAAK,KAAK;AAC7B,UAAM,KAAK,IAAI;AACf,QAAI,MAAO,KAAK,IAAI,KAAK,MAAO,IAAI,KAAK,MAAM,IAAI;AACnD,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI;AACR,SAAK,IAAI;AACT,UAAM,GAAG,IAAI,MAAM,IAAI,IAAI,KAAK,OAAO,IAAI,OAAO,IAAI,KAAK;AAC3D,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI;AACR,UAAM,QAAQ,IAAI;AAClB,SAAK;AACL,QAAI,KAAK,IAAI,QAAQ,CAAC,IAAI,KAAM;AAAA,EAClC;AACA,SAAO,QAAQ;AACjB;AAEA,SAAS,QAAQ,GAAmB;AAClC,QAAM,QAAQ;AAAA,IACZ;AAAA,IAAqB;AAAA,IAAmB;AAAA,IAAqB;AAAA,IAC7D;AAAA,IAAqB;AAAA,IAAoB;AAAA,IAAsB;AAAA,IAC/D;AAAA,EACF;AACA,MAAI,IAAI,IAAK,QAAO,KAAK,IAAI,KAAK,KAAK,KAAK,IAAI,KAAK,KAAK,CAAC,CAAC,IAAI,QAAQ,IAAI,CAAC;AAC7E,OAAK;AACL,MAAI,IAAI,MAAM,CAAC;AACf,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,MAAK,MAAM,CAAC,KAAM,IAAI;AAClD,QAAM,IAAI,IAAI;AACd,SAAO,MAAM,KAAK,IAAI,IAAI,KAAK,EAAE,KAAK,IAAI,OAAO,KAAK,IAAI,CAAC,IAAI,IAAI,KAAK,IAAI,CAAC;AAC/E;AAEA,SAAS,UAAU,GAAmB;AACpC,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,IAAI;AACV,QAAM,OAAO,IAAI,IAAI,KAAK;AAC1B,QAAM,OAAO,KAAK,IAAI,CAAC;AACvB,QAAM,IAAI,KAAK,IAAI,IAAI;AACvB,QAAM,IAAI,QAAQ,KAAK,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,IAAI,KAAK,IAAK,CAAC,OAAO,OAAQ,CAAC;AAC9F,SAAO,OAAO,IAAI,OAAO;AAC3B;","names":[]}
@@ -96,4 +96,4 @@ export {
96
96
  pairedEvalueSequence,
97
97
  evaluateInterimReleaseConfidence
98
98
  };
99
- //# sourceMappingURL=chunk-NU65VQ7M.js.map
99
+ //# sourceMappingURL=chunk-MAZ26DC7.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/sequential.ts"],"sourcesContent":["/**\n * Always-valid sequential evaluation.\n *\n * `researchReport` assumes a single pre-specified analysis. Real\n * consumers run campaigns weekly / nightly / per-PR; each new run silently\n * inflates the false-discovery rate, because the BH-FDR guarantee is for\n * the *first* look, not the 47th. Without time-uniform inference,\n * launch-decision teams either (a) don't peek, which forfeits the cost\n * advantage of stop-when-decisive, or (b) peek and pretend they didn't,\n * which forfeits scientific validity.\n *\n * This module ships **e-value-based confidence sequences** for paired\n * bounded outcomes. The methodology is the predictable plug-in betting\n * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*\n * stopping time. Concretely:\n *\n * For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,\n * a betting fraction λ_i is chosen using only D_{1..i-1} (predictable\n * plug-in), and the running e-value is\n *\n * E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)\n *\n * E_t is a non-negative martingale under H_0 with E[E_t] ≤ 1, so by\n * Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null\n * at any time without inflating the type-I error.\n *\n * Combined with `runEvalCampaign`, every consumer running rolling\n * campaigns gains the ability to ship the moment evidence is decisive,\n * stop-early on dead-on-arrival variants, and accumulate evidence across\n * partial runs without spending the FDR budget. No new sweep is wasted.\n *\n * References:\n * - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).\n * Time-uniform, nonparametric, nonasymptotic confidence sequences.\n * Annals of Statistics, 49(2), 1055–1080.\n * - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded\n * random variables by betting. JRSS B, 86(1), 1–27.\n */\n\nexport type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent'\n\nexport interface PairedEvalueOptions {\n /**\n * Bound on |delta|. Default 1 (matching most score scales). Must satisfy\n * c > 0; deltas outside [-c, c] are clipped with a warning attached to\n * the return value.\n */\n bound?: number\n /** Target Type-I error. Default 0.05. */\n alpha?: number\n /**\n * Region of Practical Equivalence on the *mean* paired delta. When\n * supplied, the verdict can return `'equivalent'` once the running\n * confidence sequence on the mean is fully contained in [low, high].\n */\n rope?: { low: number; high: number }\n /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */\n initialBetShrinkage?: number\n}\n\nexport interface PairedEvalueStep {\n /** 1-indexed observation count. */\n t: number\n delta: number\n /** Running e-value E_t = ∏ (1 + λ_i · D_i). */\n evalue: number\n /** Time-uniform p-value at stopping time t. */\n pValue: number\n /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */\n csLow: number\n csHigh: number\n /** Verdict at this stopping time. */\n decision: SequentialDecision\n}\n\nexport interface PairedEvalueSequence {\n steps: PairedEvalueStep[]\n /** The decision at the final step. */\n finalDecision: SequentialDecision\n /** Index (1-based) at which a non-`continue` decision first fired, or null. */\n decisionFiredAt: number | null\n /** True if any deltas were clipped to [-bound, bound]. */\n clipped: boolean\n}\n\n/**\n * Run the paired e-value sequence over an in-order delta stream.\n *\n * Use for *streaming* / interim analyses: pass the deltas you have so\n * far, get the verdict at every prefix length. The decision is\n * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`\n * fires, the verdict at later steps remains decisive (the e-value is a\n * non-negative martingale; once it crosses the threshold, it's crossed).\n */\nexport function pairedEvalueSequence(\n deltas: number[],\n opts: PairedEvalueOptions = {},\n): PairedEvalueSequence {\n const c = opts.bound ?? 1\n const alpha = opts.alpha ?? 0.05\n const initialShrink = opts.initialBetShrinkage ?? 0.5\n const rope = opts.rope ?? null\n if (c <= 0) throw new Error('pairedEvalueSequence: bound must be > 0')\n if (alpha <= 0 || alpha >= 1) throw new Error('pairedEvalueSequence: alpha must be in (0,1)')\n if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {\n throw new Error('pairedEvalueSequence: rope must satisfy low ≤ high')\n }\n\n const steps: PairedEvalueStep[] = []\n let clipped = false\n let evalue = 1\n let decisionFiredAt: number | null = null\n\n // Running statistics (using only D_{1..i-1} for the bet → predictable plug-in).\n let sum = 0\n let sumSq = 0\n let count = 0\n\n for (let i = 0; i < deltas.length; i++) {\n let d = deltas[i]!\n if (d < -c || d > c) {\n d = Math.max(-c, Math.min(c, d))\n clipped = true\n }\n\n // Predictable plug-in bet (positive λ tests for E[D] > 0; we run a two-sided\n // test by tracking the symmetric e-value via |bet|).\n // λ_i ∝ mean / (variance + bound^2). Shrink early to avoid overbetting.\n const muHat = count === 0 ? 0 : sum / count\n const varHat = count === 0 ? c * c : Math.max(1e-12, sumSq / count - muHat * muHat)\n const t = i + 1\n const shrink = initialShrink * Math.min(1, count / 32) // anneal toward 1\n let lambda = (muHat / (varHat + c * c)) * shrink\n // Clip to ensure 1 + λ·D > 0 for all |D| ≤ c (so the e-value stays non-negative).\n const lambdaMax = 0.99 / c\n if (lambda > lambdaMax) lambda = lambdaMax\n if (lambda < -lambdaMax) lambda = -lambdaMax\n\n evalue = evalue * (1 + lambda * d)\n if (!Number.isFinite(evalue) || evalue < 0) evalue = 0\n\n sum += d\n sumSq += d * d\n count += 1\n\n const pValue = Math.min(1, 1 / Math.max(evalue, 1e-300))\n\n // Empirical Bernstein confidence sequence on the mean. Howard et al.\n // (2021), Theorem 4.4 with σ̂² the running sample variance and a\n // calibration constant tuned for two-sided coverage at level 1 - α.\n const cs = empiricalBernsteinCs(sum, sumSq, count, c, alpha)\n\n let decision: SequentialDecision = 'continue'\n if (rope && cs.low >= rope.low && cs.high <= rope.high) decision = 'equivalent'\n else if (evalue >= 2 / alpha && muHat > 0) decision = 'promote_now'\n else if (evalue >= 2 / alpha && muHat < 0) decision = 'reject_now'\n else if (rope && cs.high < rope.low) decision = 'reject_now'\n\n if (decision !== 'continue' && decisionFiredAt === null) decisionFiredAt = t\n\n steps.push({ t, delta: d, evalue, pValue, csLow: cs.low, csHigh: cs.high, decision })\n }\n\n const finalDecision = steps.length === 0 ? 'continue' : steps[steps.length - 1]!.decision\n return { steps, finalDecision, decisionFiredAt, clipped }\n}\n\nexport interface InterimReleaseConfidenceInput {\n /**\n * One delta series per candidate (paired deltas vs comparator). Order\n * within a series is the order the campaigns were run.\n */\n deltaSeries: Array<{ candidateId: string; deltas: number[] }>\n alpha?: number\n bound?: number\n rope?: { low: number; high: number }\n}\n\nexport interface InterimReleaseConfidence {\n candidates: Array<{\n candidateId: string\n decision: SequentialDecision\n decisionFiredAt: number | null\n finalEvalue: number\n finalPValue: number\n pairs: number\n csLow: number\n csHigh: number\n }>\n /**\n * Campaign-level recommendation: pick the strongest 'promote_now', else\n * 'continue' if any candidate is still live, else 'reject_now' if every\n * candidate is dead, else 'equivalent'.\n */\n recommendation: { decision: SequentialDecision; candidateId: string | null }\n}\n\n/**\n * Run interim sequential analyses across many candidates at once,\n * preserving the time-uniform α guarantee for each candidate's series and\n * synthesising a campaign-level recommendation. Designed to be called on\n * every campaign tick — the recommendation is anytime-valid.\n */\nexport function evaluateInterimReleaseConfidence(\n input: InterimReleaseConfidenceInput,\n): InterimReleaseConfidence {\n const candidates = input.deltaSeries.map((s) => {\n const seq = pairedEvalueSequence(s.deltas, {\n alpha: input.alpha,\n bound: input.bound,\n rope: input.rope,\n })\n const last = seq.steps[seq.steps.length - 1]\n return {\n candidateId: s.candidateId,\n decision: seq.finalDecision,\n decisionFiredAt: seq.decisionFiredAt,\n finalEvalue: last?.evalue ?? 1,\n finalPValue: last?.pValue ?? 1,\n pairs: seq.steps.length,\n csLow: last?.csLow ?? Number.NEGATIVE_INFINITY,\n csHigh: last?.csHigh ?? Number.POSITIVE_INFINITY,\n }\n })\n\n const promote = candidates.find((c) => c.decision === 'promote_now')\n if (promote)\n return {\n candidates,\n recommendation: { decision: 'promote_now', candidateId: promote.candidateId },\n }\n const live = candidates.find((c) => c.decision === 'continue')\n if (live) return { candidates, recommendation: { decision: 'continue', candidateId: null } }\n const equiv = candidates.find((c) => c.decision === 'equivalent')\n if (equiv)\n return {\n candidates,\n recommendation: { decision: 'equivalent', candidateId: equiv.candidateId },\n }\n return { candidates, recommendation: { decision: 'reject_now', candidateId: null } }\n}\n\n// ── Internals ────────────────────────────────────────────────────────────\n\n/**\n * Empirical Bernstein confidence sequence on the mean of bounded variables.\n * Adapted from Howard et al. (2021) §4.4. Provides a time-uniform CI on\n * the running mean; valid at every stopping time.\n */\nfunction empiricalBernsteinCs(\n sum: number,\n sumSq: number,\n n: number,\n bound: number,\n alpha: number,\n): { low: number; high: number } {\n if (n === 0) return { low: -bound, high: bound }\n const mean = sum / n\n const variance = Math.max(0, sumSq / n - mean * mean)\n // Iterated-log calibration constant. The 1.7 exponent matches the\n // recommended choice in Howard et al. for two-sided coverage at level\n // 1 - α with mild log-corrections; tightening further requires a\n // tuned mixture and is out of scope.\n const psi = Math.log(2 / alpha) + 1.7 * Math.log(Math.log(Math.max(Math.E, n)) + 1)\n const radius = Math.sqrt((2 * variance * psi) / n) + (3 * bound * psi) / n\n return { low: mean - radius, high: mean + radius }\n}\n"],"mappings":";AA8FO,SAAS,qBACd,QACA,OAA4B,CAAC,GACP;AACtB,QAAM,IAAI,KAAK,SAAS;AACxB,QAAM,QAAQ,KAAK,SAAS;AAC5B,QAAM,gBAAgB,KAAK,uBAAuB;AAClD,QAAM,OAAO,KAAK,QAAQ;AAC1B,MAAI,KAAK,EAAG,OAAM,IAAI,MAAM,yCAAyC;AACrE,MAAI,SAAS,KAAK,SAAS,EAAG,OAAM,IAAI,MAAM,8CAA8C;AAC5F,MAAI,QAAQ,EAAE,OAAO,SAAS,KAAK,GAAG,KAAK,OAAO,SAAS,KAAK,IAAI,KAAK,KAAK,OAAO,KAAK,OAAO;AAC/F,UAAM,IAAI,MAAM,yDAAoD;AAAA,EACtE;AAEA,QAAM,QAA4B,CAAC;AACnC,MAAI,UAAU;AACd,MAAI,SAAS;AACb,MAAI,kBAAiC;AAGrC,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,MAAI,QAAQ;AAEZ,WAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,QAAI,IAAI,OAAO,CAAC;AAChB,QAAI,IAAI,CAAC,KAAK,IAAI,GAAG;AACnB,UAAI,KAAK,IAAI,CAAC,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;AAC/B,gBAAU;AAAA,IACZ;AAKA,UAAM,QAAQ,UAAU,IAAI,IAAI,MAAM;AACtC,UAAM,SAAS,UAAU,IAAI,IAAI,IAAI,KAAK,IAAI,OAAO,QAAQ,QAAQ,QAAQ,KAAK;AAClF,UAAM,IAAI,IAAI;AACd,UAAM,SAAS,gBAAgB,KAAK,IAAI,GAAG,QAAQ,EAAE;AACrD,QAAI,SAAU,SAAS,SAAS,IAAI,KAAM;AAE1C,UAAM,YAAY,OAAO;AACzB,QAAI,SAAS,UAAW,UAAS;AACjC,QAAI,SAAS,CAAC,UAAW,UAAS,CAAC;AAEnC,aAAS,UAAU,IAAI,SAAS;AAChC,QAAI,CAAC,OAAO,SAAS,MAAM,KAAK,SAAS,EAAG,UAAS;AAErD,WAAO;AACP,aAAS,IAAI;AACb,aAAS;AAET,UAAM,SAAS,KAAK,IAAI,GAAG,IAAI,KAAK,IAAI,QAAQ,MAAM,CAAC;AAKvD,UAAM,KAAK,qBAAqB,KAAK,OAAO,OAAO,GAAG,KAAK;AAE3D,QAAI,WAA+B;AACnC,QAAI,QAAQ,GAAG,OAAO,KAAK,OAAO,GAAG,QAAQ,KAAK,KAAM,YAAW;AAAA,aAC1D,UAAU,IAAI,SAAS,QAAQ,EAAG,YAAW;AAAA,aAC7C,UAAU,IAAI,SAAS,QAAQ,EAAG,YAAW;AAAA,aAC7C,QAAQ,GAAG,OAAO,KAAK,IAAK,YAAW;AAEhD,QAAI,aAAa,cAAc,oBAAoB,KAAM,mBAAkB;AAE3E,UAAM,KAAK,EAAE,GAAG,OAAO,GAAG,QAAQ,QAAQ,OAAO,GAAG,KAAK,QAAQ,GAAG,MAAM,SAAS,CAAC;AAAA,EACtF;AAEA,QAAM,gBAAgB,MAAM,WAAW,IAAI,aAAa,MAAM,MAAM,SAAS,CAAC,EAAG;AACjF,SAAO,EAAE,OAAO,eAAe,iBAAiB,QAAQ;AAC1D;AAsCO,SAAS,iCACd,OAC0B;AAC1B,QAAM,aAAa,MAAM,YAAY,IAAI,CAAC,MAAM;AAC9C,UAAM,MAAM,qBAAqB,EAAE,QAAQ;AAAA,MACzC,OAAO,MAAM;AAAA,MACb,OAAO,MAAM;AAAA,MACb,MAAM,MAAM;AAAA,IACd,CAAC;AACD,UAAM,OAAO,IAAI,MAAM,IAAI,MAAM,SAAS,CAAC;AAC3C,WAAO;AAAA,MACL,aAAa,EAAE;AAAA,MACf,UAAU,IAAI;AAAA,MACd,iBAAiB,IAAI;AAAA,MACrB,aAAa,MAAM,UAAU;AAAA,MAC7B,aAAa,MAAM,UAAU;AAAA,MAC7B,OAAO,IAAI,MAAM;AAAA,MACjB,OAAO,MAAM,SAAS,OAAO;AAAA,MAC7B,QAAQ,MAAM,UAAU,OAAO;AAAA,IACjC;AAAA,EACF,CAAC;AAED,QAAM,UAAU,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,aAAa;AACnE,MAAI;AACF,WAAO;AAAA,MACL;AAAA,MACA,gBAAgB,EAAE,UAAU,eAAe,aAAa,QAAQ,YAAY;AAAA,IAC9E;AACF,QAAM,OAAO,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,UAAU;AAC7D,MAAI,KAAM,QAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,YAAY,aAAa,KAAK,EAAE;AAC3F,QAAM,QAAQ,WAAW,KAAK,CAAC,MAAM,EAAE,aAAa,YAAY;AAChE,MAAI;AACF,WAAO;AAAA,MACL;AAAA,MACA,gBAAgB,EAAE,UAAU,cAAc,aAAa,MAAM,YAAY;AAAA,IAC3E;AACF,SAAO,EAAE,YAAY,gBAAgB,EAAE,UAAU,cAAc,aAAa,KAAK,EAAE;AACrF;AASA,SAAS,qBACP,KACA,OACA,GACA,OACA,OAC+B;AAC/B,MAAI,MAAM,EAAG,QAAO,EAAE,KAAK,CAAC,OAAO,MAAM,MAAM;AAC/C,QAAM,OAAO,MAAM;AACnB,QAAM,WAAW,KAAK,IAAI,GAAG,QAAQ,IAAI,OAAO,IAAI;AAKpD,QAAM,MAAM,KAAK,IAAI,IAAI,KAAK,IAAI,MAAM,KAAK,IAAI,KAAK,IAAI,KAAK,IAAI,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC;AAClF,QAAM,SAAS,KAAK,KAAM,IAAI,WAAW,MAAO,CAAC,IAAK,IAAI,QAAQ,MAAO;AACzE,SAAO,EAAE,KAAK,OAAO,QAAQ,MAAM,OAAO,OAAO;AACnD;","names":[]}
@@ -835,4 +835,4 @@ export {
835
835
  subjectiveEval,
836
836
  allCriticalPassed
837
837
  };
838
- //# sourceMappingURL=chunk-LSH4MMOZ.js.map
838
+ //# sourceMappingURL=chunk-NCRFYPS3.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/control-runtime.ts"],"sourcesContent":["/**\n * Policy-based agent control runtime.\n *\n * This is the minimal reusable loop behind driver-agent patterns:\n *\n * observe state -> validate -> decide next action -> act -> observe -> ...\n *\n * It deliberately does not model named \"topologies\". Direct execution,\n * critic/revise, driver intervention, specialist calls, and human escalation\n * are all just actions chosen by the control policy.\n */\n\nimport { type SpanHandle, TraceEmitter } from './trace/emitter'\nimport type { FailureClass } from './trace/schema'\nimport type { TraceStore } from './trace/store'\n\nexport type ControlSeverity = 'info' | 'warning' | 'error' | 'critical'\nexport type ControlActionFailureMode = 'continue' | 'stop'\n\nexport interface ControlEvalResult {\n /** Stable validator or judge id. */\n id: string\n /** Whether this check passed. */\n passed: boolean\n /** Optional normalized score. 1 = best, 0 = worst. */\n score?: number\n /** Objective validators should usually be \"error\" or \"critical\" when failed. */\n severity?: ControlSeverity\n /** Human-readable result. */\n detail?: string\n /** Small evidence string or pointer. Avoid large payloads. */\n evidence?: string\n /** True when the result came from deterministic state, not LLM judgment. */\n objective?: boolean\n /** Structured details for downstream control policies and reports. */\n metadata?: Record<string, unknown>\n}\n\nexport interface ControlBudget {\n maxSteps: number\n maxWallMs?: number\n maxCostUsd?: number\n}\n\nexport interface ControlStopPolicies<TState, TAction> {\n /**\n * Stop after N consecutive steps with no state fingerprint change and\n * less than `minScoreDelta` score movement. Disabled when omitted.\n */\n maxNoProgressSteps?: number\n /**\n * Stop after the same action fingerprint is selected N consecutive\n * times. Disabled when omitted.\n */\n maxRepeatedActions?: number\n /** Minimum score movement that counts as progress. Default 0.001. */\n minScoreDelta?: number\n /** Override the default JSON/string fingerprint for state comparisons. */\n stateFingerprint?: (state: TState) => string\n /** Override the default JSON/string fingerprint for repeated-action checks. */\n actionFingerprint?: (action: TAction) => string\n}\n\nexport interface ControlContext<\n TState,\n TAction,\n TActionResult,\n TEval extends ControlEvalResult = ControlEvalResult,\n> {\n intent: string\n state: TState\n evals: TEval[]\n history: ControlStep<TState, TAction, TActionResult, TEval>[]\n budget: ControlBudget\n stepIndex: number\n wallMs: number\n spentCostUsd: number\n remainingCostUsd?: number\n abortSignal: AbortSignal\n emitter?: TraceEmitter\n}\n\nexport type ControlDecision<TAction> =\n | {\n type: 'continue'\n action: TAction\n reason?: string\n }\n | {\n type: 'stop'\n reason: string\n pass?: boolean\n score?: number\n }\n\nexport interface StopDecision {\n stop: boolean\n pass: boolean\n reason: string\n score?: number\n failureClass?: FailureClass\n}\n\nexport interface ControlActionOutcome<TActionResult> {\n ok: boolean\n result?: TActionResult\n error?: string\n costUsd?: number\n durationMs: number\n}\n\nexport interface ControlRuntimeError {\n phase: 'observe' | 'validate' | 'decide' | 'act' | 'stop-policy' | 'on-step' | 'trace'\n stepIndex: number\n message: string\n}\n\nexport interface ControlStep<\n TState,\n TAction,\n TActionResult,\n TEval extends ControlEvalResult = ControlEvalResult,\n> {\n index: number\n decision: ControlDecision<TAction>\n beforeState: TState\n afterState: TState\n evalsBefore: TEval[]\n evalsAfter: TEval[]\n actionOutcome?: ControlActionOutcome<TActionResult>\n startedAt: string\n endedAt: string\n}\n\nexport interface ControlRunResult<\n TState,\n TAction,\n TActionResult,\n TEval extends ControlEvalResult = ControlEvalResult,\n> {\n intent: string\n pass: boolean\n completed: boolean\n reason: string\n score?: number\n steps: ControlStep<TState, TAction, TActionResult, TEval>[]\n finalState: TState | undefined\n finalEvals: TEval[]\n wallMs: number\n spentCostUsd: number\n /** null when the run executed without a TraceEmitter wired (no run record was persisted). */\n runId: string | null\n failureClass?: FailureClass\n runtimeErrors: ControlRuntimeError[]\n stoppedBy: 'policy' | 'stop-policy' | 'budget' | 'abort' | 'runtime-error'\n}\n\nexport interface ControlRuntimeConfig<\n TState,\n TAction,\n TActionResult,\n TEval extends ControlEvalResult = ControlEvalResult,\n> {\n intent: string\n budget?: Partial<ControlBudget>\n signal?: AbortSignal\n /** Defaults to `continue`: action failures are recorded, then the policy gets another chance. */\n actionFailure?: ControlActionFailureMode\n /**\n * Extract cost from an action result. Used for `maxCostUsd` budget\n * enforcement and trace budget ledger emission.\n */\n getActionCostUsd?: (ctx: {\n action: TAction\n result: TActionResult\n state: TState\n evals: TEval[]\n history: ControlStep<TState, TAction, TActionResult, TEval>[]\n }) => number | undefined\n\n /** Read typed task/product state. Prefer structured state over transcript-only context. */\n observe: (ctx: {\n history: ControlStep<TState, TAction, TActionResult, TEval>[]\n abortSignal: AbortSignal\n }) => Promise<TState> | TState\n\n /** Objective validators first, subjective judges only where objective state is insufficient. */\n validate: (ctx: {\n intent: string\n state: TState\n history: ControlStep<TState, TAction, TActionResult, TEval>[]\n abortSignal: AbortSignal\n }) => Promise<TEval[]> | TEval[]\n\n /** Choose the next control action. Can call a worker, ask user, run critic, inspect state, or stop. */\n decide: (\n ctx: ControlContext<TState, TAction, TActionResult, TEval>,\n ) => Promise<ControlDecision<TAction>> | ControlDecision<TAction>\n\n /** Execute the action selected by the policy. */\n act: (\n action: TAction,\n ctx: ControlContext<TState, TAction, TActionResult, TEval>,\n ) => Promise<TActionResult> | TActionResult\n\n /** Final stopping policy. Called before decide and after each action. */\n shouldStop?: (\n ctx: ControlContext<TState, TAction, TActionResult, TEval>,\n ) => Promise<StopDecision> | StopDecision\n\n /** Optional hook for tracing or live progress updates. */\n onStep?: (step: ControlStep<TState, TAction, TActionResult, TEval>) => Promise<void> | void\n\n /** Optional generic stuck-loop policies. Custom `shouldStop` still runs first. */\n stopPolicies?: ControlStopPolicies<TState, TAction>\n\n /** Optional trace sink. Emits one run plus one span per control step. */\n store?: TraceStore\n scenarioId?: string\n projectId?: string\n variantId?: string\n}\n\nconst DEFAULT_BUDGET: ControlBudget = {\n maxSteps: 8,\n maxWallMs: 5 * 60 * 1000,\n}\n\nexport async function runAgentControlLoop<\n TState,\n TAction,\n TActionResult,\n TEval extends ControlEvalResult = ControlEvalResult,\n>(\n config: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>,\n): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>> {\n const budget = normalizeBudget(config.budget)\n const actionFailure = config.actionFailure ?? 'continue'\n const controller = new AbortController()\n const upstreamAbort = () => controller.abort(config.signal?.reason)\n if (config.signal) {\n if (config.signal.aborted) controller.abort(config.signal.reason)\n else config.signal.addEventListener('abort', upstreamAbort, { once: true })\n }\n\n const started = Date.now()\n const wallTimer = budget.maxWallMs\n ? setTimeout(\n () => controller.abort(new Error('control runtime wall timeout')),\n budget.maxWallMs,\n )\n : undefined\n const history: ControlStep<TState, TAction, TActionResult, TEval>[] = []\n const emitter = config.store ? new TraceEmitter(config.store) : undefined\n let spentCostUsd = 0\n const runtimeErrors: ControlRuntimeError[] = []\n let lastStateFingerprint: string | undefined\n let lastActionFingerprint: string | undefined\n let noProgressStreak = 0\n let repeatedActionStreak = 0\n\n try {\n if (emitter) {\n await runTrace(runtimeErrors, 0, () =>\n emitter.startRun({\n scenarioId: config.scenarioId ?? 'agent-control-loop',\n projectId: config.projectId,\n variantId: config.variantId,\n layer: 'meta',\n tags: {\n intent: config.intent.slice(0, 120),\n maxSteps: String(budget.maxSteps),\n ...(budget.maxCostUsd !== undefined ? { maxCostUsd: String(budget.maxCostUsd) } : {}),\n },\n }),\n )\n }\n\n let state: TState\n let evals: TEval[]\n try {\n state = await config.observe({ history, abortSignal: controller.signal })\n } catch (err) {\n const error = runtimeError('observe', 0, err)\n runtimeErrors.push(error)\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: error.message,\n steps: history,\n finalState: undefined,\n finalEvals: [],\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n try {\n evals = await config.validate({\n intent: config.intent,\n state,\n history,\n abortSignal: controller.signal,\n })\n await recordEvalSpans(emitter, evals, 'initial', runtimeErrors, 0)\n } catch (err) {\n const error = runtimeError('validate', 0, err)\n runtimeErrors.push(error)\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: error.message,\n steps: history,\n finalState: state,\n finalEvals: [],\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n lastStateFingerprint = fingerprintState(state, config.stopPolicies)\n\n for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {\n if (controller.signal.aborted) {\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: abortReason(controller.signal),\n score: undefined,\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'timeout',\n runtimeErrors,\n stoppedBy: 'abort',\n })\n }\n\n const budgetStop = budgetStopDecision(budget, spentCostUsd)\n if (budgetStop.stop) {\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: budgetStop.reason,\n score: averageScore(evals),\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'budget_exceeded',\n runtimeErrors,\n stoppedBy: 'budget',\n })\n }\n\n const ctx = makeContext(\n config.intent,\n state,\n evals,\n history,\n budget,\n stepIndex,\n started,\n spentCostUsd,\n controller.signal,\n emitter,\n )\n let stop: StopDecision\n try {\n stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals)\n } catch (err) {\n runtimeErrors.push(runtimeError('stop-policy', stepIndex, err))\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: runtimeErrors[runtimeErrors.length - 1]!.message,\n score: averageScore(evals),\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n if (stop.stop) {\n return finish(emitter, {\n intent: config.intent,\n pass: stop.pass,\n completed: true,\n reason: stop.reason,\n score: stop.score,\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: stop.failureClass,\n runtimeErrors,\n stoppedBy: 'stop-policy',\n })\n }\n\n let decision: ControlDecision<TAction>\n try {\n decision = await config.decide(ctx)\n } catch (err) {\n runtimeErrors.push(runtimeError('decide', stepIndex, err))\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: runtimeErrors[runtimeErrors.length - 1]!.message,\n score: averageScore(evals),\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n if (decision.type === 'stop') {\n return finish(emitter, {\n intent: config.intent,\n pass: decision.pass ?? false,\n completed: true,\n reason: decision.reason,\n score: decision.score,\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: decision.pass === false ? 'unknown' : undefined,\n runtimeErrors,\n stoppedBy: 'policy',\n })\n }\n\n const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies)\n repeatedActionStreak =\n actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1\n lastActionFingerprint = actionFingerprint\n const repeatedActionStop = repeatedActionStopDecision(\n config.stopPolicies,\n repeatedActionStreak,\n )\n if (repeatedActionStop.stop) {\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: true,\n reason: repeatedActionStop.reason,\n score: averageScore(evals),\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'tool_recovery_failure',\n runtimeErrors,\n stoppedBy: 'stop-policy',\n })\n }\n\n const beforeState = state\n const evalsBefore = evals\n const scoreBefore = averageScore(evals)\n const actionStarted = Date.now()\n const stepHandle = emitter\n ? await runTrace(runtimeErrors, stepIndex, () =>\n emitter.tool({\n name: `control-step-${stepIndex}`,\n toolName: 'agent-control-action',\n args: decision.action,\n attributes: {\n decision: decision.reason ?? 'continue',\n repeatedActionStreak,\n },\n }),\n )\n : undefined\n let actionOutcome: ControlActionOutcome<TActionResult>\n try {\n const result = await config.act(decision.action, ctx)\n const rawCostUsd = config.getActionCostUsd?.({\n action: decision.action,\n result,\n state,\n evals,\n history,\n })\n const costUsd = normalizeActionCostUsd(rawCostUsd, runtimeErrors, stepIndex)\n if (costUsd !== undefined && Number.isFinite(costUsd) && costUsd > 0) {\n spentCostUsd += costUsd\n await recordCostBudget(\n emitter,\n budget,\n spentCostUsd,\n stepHandle,\n runtimeErrors,\n stepIndex,\n )\n }\n actionOutcome = {\n ok: true,\n result,\n ...(costUsd !== undefined ? { costUsd } : {}),\n durationMs: Date.now() - actionStarted,\n }\n } catch (err) {\n runtimeErrors.push(runtimeError('act', stepIndex, err))\n actionOutcome = {\n ok: false,\n error: runtimeErrors[runtimeErrors.length - 1]!.message,\n durationMs: Date.now() - actionStarted,\n }\n if (actionFailure === 'stop') {\n await runTrace(runtimeErrors, stepIndex, () =>\n stepHandle?.fail(actionOutcome.error ?? 'action failed'),\n )\n const step: ControlStep<TState, TAction, TActionResult, TEval> = {\n index: stepIndex,\n decision,\n beforeState,\n afterState: state,\n evalsBefore,\n evalsAfter: evals,\n actionOutcome,\n startedAt: new Date(actionStarted).toISOString(),\n endedAt: new Date().toISOString(),\n }\n history.push(step)\n await runOnStep(config.onStep, step, runtimeErrors)\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: actionOutcome.error ?? 'action failed',\n score: averageScore(evals),\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n }\n\n try {\n state = await config.observe({ history, abortSignal: controller.signal })\n } catch (err) {\n runtimeErrors.push(runtimeError('observe', stepIndex, err))\n const step: ControlStep<TState, TAction, TActionResult, TEval> = {\n index: stepIndex,\n decision,\n beforeState,\n afterState: beforeState,\n evalsBefore,\n evalsAfter: evals,\n actionOutcome,\n startedAt: new Date(actionStarted).toISOString(),\n endedAt: new Date().toISOString(),\n }\n history.push(step)\n await runTrace(runtimeErrors, stepIndex, () =>\n stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1]!.message),\n )\n await runOnStep(config.onStep, step, runtimeErrors)\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: runtimeErrors[runtimeErrors.length - 1]!.message,\n score: averageScore(evals),\n steps: history,\n finalState: beforeState,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n try {\n evals = await config.validate({\n intent: config.intent,\n state,\n history,\n abortSignal: controller.signal,\n })\n await recordEvalSpans(\n emitter,\n evals,\n `step-${stepIndex}`,\n runtimeErrors,\n stepIndex,\n stepHandle?.span.spanId,\n )\n } catch (err) {\n runtimeErrors.push(runtimeError('validate', stepIndex, err))\n const step: ControlStep<TState, TAction, TActionResult, TEval> = {\n index: stepIndex,\n decision,\n beforeState,\n afterState: state,\n evalsBefore,\n evalsAfter: evals,\n actionOutcome,\n startedAt: new Date(actionStarted).toISOString(),\n endedAt: new Date().toISOString(),\n }\n history.push(step)\n await runTrace(runtimeErrors, stepIndex, () =>\n stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1]!.message),\n )\n await runOnStep(config.onStep, step, runtimeErrors)\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: runtimeErrors[runtimeErrors.length - 1]!.message,\n score: averageScore(evals),\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n const scoreAfter = averageScore(evals)\n const stateFingerprint = fingerprintState(state, config.stopPolicies)\n const noProgressStop = noProgressStopDecision({\n policies: config.stopPolicies,\n lastStateFingerprint,\n stateFingerprint,\n scoreBefore,\n scoreAfter,\n currentStreak: noProgressStreak,\n })\n noProgressStreak = noProgressStop.streak\n lastStateFingerprint = stateFingerprint\n\n const step: ControlStep<TState, TAction, TActionResult, TEval> = {\n index: stepIndex,\n decision,\n beforeState,\n afterState: state,\n evalsBefore,\n evalsAfter: evals,\n actionOutcome,\n startedAt: new Date(actionStarted).toISOString(),\n endedAt: new Date().toISOString(),\n }\n history.push(step)\n if (actionOutcome.ok) {\n await runTrace(runtimeErrors, stepIndex, () =>\n stepHandle?.end({\n attributes: {\n actionCostUsd: actionOutcome.costUsd ?? null,\n spentCostUsd,\n scoreBefore: scoreBefore ?? null,\n scoreAfter: scoreAfter ?? null,\n noProgressStreak,\n },\n }),\n )\n } else {\n await runTrace(runtimeErrors, stepIndex, () =>\n stepHandle?.fail(actionOutcome.error ?? 'action failed', {\n attributes: {\n spentCostUsd,\n noProgressStreak,\n },\n }),\n )\n }\n await runOnStep(config.onStep, step, runtimeErrors)\n\n if (noProgressStop.stop) {\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: true,\n reason: noProgressStop.reason,\n score: scoreAfter,\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'tool_recovery_failure',\n runtimeErrors,\n stoppedBy: 'stop-policy',\n })\n }\n\n const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd)\n if (postStepBudgetStop.stop) {\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: postStepBudgetStop.reason,\n score: scoreAfter,\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'budget_exceeded',\n runtimeErrors,\n stoppedBy: 'budget',\n })\n }\n\n const postStepCtx = makeContext(\n config.intent,\n state,\n evals,\n history,\n budget,\n stepIndex + 1,\n started,\n spentCostUsd,\n controller.signal,\n emitter,\n )\n let postStepStop: StopDecision\n try {\n postStepStop = config.shouldStop\n ? await config.shouldStop(postStepCtx)\n : defaultStopDecision(evals)\n } catch (err) {\n runtimeErrors.push(runtimeError('stop-policy', stepIndex + 1, err))\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: runtimeErrors[runtimeErrors.length - 1]!.message,\n score: averageScore(evals),\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n }\n if (postStepStop.stop) {\n return finish(emitter, {\n intent: config.intent,\n pass: postStepStop.pass,\n completed: true,\n reason: postStepStop.reason,\n score: postStepStop.score,\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: postStepStop.failureClass,\n runtimeErrors,\n stoppedBy: 'stop-policy',\n })\n }\n }\n\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: `budget exhausted: maxSteps=${budget.maxSteps}`,\n steps: history,\n finalState: state,\n finalEvals: evals,\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'budget_exceeded',\n runtimeErrors,\n stoppedBy: 'budget',\n })\n } catch (err) {\n runtimeErrors.push(runtimeError('act', history.length, err))\n return finish(emitter, {\n intent: config.intent,\n pass: false,\n completed: false,\n reason: runtimeErrors[runtimeErrors.length - 1]!.message,\n steps: history,\n finalState: undefined,\n finalEvals: [],\n wallMs: Date.now() - started,\n spentCostUsd,\n runId: emitter?.runId ?? null,\n failureClass: 'unknown',\n runtimeErrors,\n stoppedBy: 'runtime-error',\n })\n } finally {\n if (wallTimer) clearTimeout(wallTimer)\n if (config.signal) config.signal.removeEventListener('abort', upstreamAbort)\n }\n}\n\nexport function stopOnNoProgress<TState, TAction>(\n maxNoProgressSteps: number,\n options: Omit<ControlStopPolicies<TState, TAction>, 'maxNoProgressSteps'> = {},\n): ControlStopPolicies<TState, TAction> {\n return { ...options, maxNoProgressSteps }\n}\n\nexport function stopOnRepeatedAction<TState, TAction>(\n maxRepeatedActions: number,\n options: Omit<ControlStopPolicies<TState, TAction>, 'maxRepeatedActions'> = {},\n): ControlStopPolicies<TState, TAction> {\n return { ...options, maxRepeatedActions }\n}\n\nexport function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult {\n return { ...input, objective: true }\n}\n\nexport function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult {\n return { ...input, objective: false }\n}\n\nfunction normalizeBudget(input: Partial<ControlBudget> | undefined): ControlBudget {\n const raw = { ...DEFAULT_BUDGET, ...input } as Record<string, unknown>\n if (!Number.isInteger(raw.maxSteps) || (raw.maxSteps as number) < 1) {\n throw new RangeError(\n `ControlRuntime budget.maxSteps must be an integer >= 1, got ${String(raw.maxSteps)}`,\n )\n }\n const budget: ControlBudget = { maxSteps: raw.maxSteps as number }\n if (raw.maxWallMs !== undefined) {\n if (\n typeof raw.maxWallMs !== 'number' ||\n !Number.isFinite(raw.maxWallMs) ||\n raw.maxWallMs <= 0\n ) {\n throw new RangeError(\n `ControlRuntime budget.maxWallMs must be a positive finite number, got ${String(raw.maxWallMs)}`,\n )\n }\n budget.maxWallMs = raw.maxWallMs\n }\n if (raw.maxCostUsd !== undefined) {\n if (\n typeof raw.maxCostUsd !== 'number' ||\n !Number.isFinite(raw.maxCostUsd) ||\n raw.maxCostUsd < 0\n ) {\n throw new RangeError(\n `ControlRuntime budget.maxCostUsd must be a nonnegative finite number, got ${String(raw.maxCostUsd)}`,\n )\n }\n budget.maxCostUsd = raw.maxCostUsd\n }\n return budget\n}\n\nfunction normalizeActionCostUsd(\n costUsd: number | undefined,\n runtimeErrors: ControlRuntimeError[],\n stepIndex: number,\n): number | undefined {\n if (costUsd === undefined) return undefined\n if (!Number.isFinite(costUsd) || costUsd < 0) {\n runtimeErrors.push(\n runtimeError('act', stepIndex, new Error(`invalid action costUsd: ${String(costUsd)}`)),\n )\n return undefined\n }\n return costUsd\n}\n\nexport function allCriticalPassed(evals: ControlEvalResult[]): boolean {\n return evals.every(\n (result) => result.passed || (result.severity !== 'critical' && result.severity !== 'error'),\n )\n}\n\nfunction makeContext<TState, TAction, TActionResult, TEval extends ControlEvalResult>(\n intent: string,\n state: TState,\n evals: TEval[],\n history: ControlStep<TState, TAction, TActionResult, TEval>[],\n budget: ControlBudget,\n stepIndex: number,\n started: number,\n spentCostUsd: number,\n abortSignal: AbortSignal,\n emitter?: TraceEmitter,\n): ControlContext<TState, TAction, TActionResult, TEval> {\n return {\n intent,\n state,\n evals,\n history,\n budget,\n stepIndex,\n wallMs: Date.now() - started,\n spentCostUsd,\n remainingCostUsd:\n budget.maxCostUsd === undefined ? undefined : Math.max(0, budget.maxCostUsd - spentCostUsd),\n abortSignal,\n emitter,\n }\n}\n\nfunction defaultStopDecision(evals: ControlEvalResult[]): StopDecision {\n if (!evals.length) return { stop: false, pass: false, reason: 'no evals yet' }\n const pass = allCriticalPassed(evals)\n return pass\n ? { stop: true, pass: true, reason: 'all critical evals passed', score: averageScore(evals) }\n : {\n stop: false,\n pass: false,\n reason: 'critical evals still failing',\n score: averageScore(evals),\n }\n}\n\nfunction averageScore(evals: ControlEvalResult[]): number | undefined {\n const scored = evals\n .map((result) => result.score)\n .filter((score): score is number => typeof score === 'number')\n if (!scored.length) return undefined\n return Math.round((scored.reduce((sum, score) => sum + score, 0) / scored.length) * 1000) / 1000\n}\n\nfunction budgetStopDecision(\n budget: ControlBudget,\n spentCostUsd: number,\n): { stop: boolean; reason: string } {\n if (budget.maxCostUsd !== undefined && spentCostUsd >= budget.maxCostUsd) {\n return {\n stop: true,\n reason: `budget exhausted: maxCostUsd=${budget.maxCostUsd}`,\n }\n }\n return { stop: false, reason: '' }\n}\n\nasync function recordCostBudget(\n emitter: TraceEmitter | undefined,\n budget: ControlBudget,\n spentCostUsd: number,\n handle: SpanHandle | undefined,\n runtimeErrors: ControlRuntimeError[],\n stepIndex: number,\n): Promise<void> {\n if (!emitter || budget.maxCostUsd === undefined) return\n const maxCostUsd = budget.maxCostUsd\n await runTrace(runtimeErrors, stepIndex, () =>\n emitter.recordBudget({\n dimension: 'usd',\n limit: maxCostUsd,\n consumed: spentCostUsd,\n remaining: Math.max(0, maxCostUsd - spentCostUsd),\n breached: spentCostUsd >= maxCostUsd,\n spanId: handle?.span.spanId,\n }),\n )\n}\n\nasync function recordEvalSpans(\n emitter: TraceEmitter | undefined,\n evals: ControlEvalResult[],\n phase: string,\n runtimeErrors: ControlRuntimeError[],\n stepIndex: number,\n targetSpanId?: string,\n): Promise<void> {\n if (!emitter) return\n for (const result of evals) {\n await runTrace(runtimeErrors, stepIndex, () =>\n emitter.recordJudge({\n judgeId: result.objective ? 'objective-validator' : 'subjective-judge',\n targetSpanId: targetSpanId ?? emitter.runId,\n name: `control-eval/${result.id}`,\n dimension: result.id,\n score: typeof result.score === 'number' ? result.score : result.passed ? 1 : 0,\n rationale: result.detail,\n evidence: result.evidence,\n attributes: {\n phase,\n passed: result.passed,\n severity: result.severity,\n objective: result.objective,\n },\n }),\n )\n }\n}\n\nasync function runOnStep<TState, TAction, TActionResult, TEval extends ControlEvalResult>(\n onStep: ControlRuntimeConfig<TState, TAction, TActionResult, TEval>['onStep'] | undefined,\n step: ControlStep<TState, TAction, TActionResult, TEval>,\n runtimeErrors: ControlRuntimeError[],\n): Promise<void> {\n if (!onStep) return\n try {\n await onStep(step)\n } catch (err) {\n runtimeErrors.push(runtimeError('on-step', step.index, err))\n }\n}\n\nasync function runTrace<T>(\n runtimeErrors: ControlRuntimeError[],\n stepIndex: number,\n write: () => Promise<T | undefined> | T | undefined,\n): Promise<T | undefined> {\n try {\n return await write()\n } catch (err) {\n runtimeErrors.push(runtimeError('trace', stepIndex, err))\n return undefined\n }\n}\n\nfunction noProgressStopDecision<TState, TAction>(args: {\n policies: ControlStopPolicies<TState, TAction> | undefined\n lastStateFingerprint: string | undefined\n stateFingerprint: string\n scoreBefore: number | undefined\n scoreAfter: number | undefined\n currentStreak: number\n}): { stop: boolean; reason: string; streak: number } {\n const max = args.policies?.maxNoProgressSteps\n if (!max || max <= 0) return { stop: false, reason: '', streak: 0 }\n const minScoreDelta = args.policies?.minScoreDelta ?? 0.001\n const scoreDelta = Math.abs((args.scoreAfter ?? 0) - (args.scoreBefore ?? 0))\n const stateUnchanged =\n args.lastStateFingerprint !== undefined && args.lastStateFingerprint === args.stateFingerprint\n const scoreFlat = scoreDelta < minScoreDelta\n const streak = stateUnchanged && scoreFlat ? args.currentStreak + 1 : 0\n return streak >= max\n ? { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak }\n : { stop: false, reason: '', streak }\n}\n\nfunction repeatedActionStopDecision<TState, TAction>(\n policies: ControlStopPolicies<TState, TAction> | undefined,\n streak: number,\n): { stop: boolean; reason: string } {\n const max = policies?.maxRepeatedActions\n if (!max || max <= 0 || streak < max) return { stop: false, reason: '' }\n return {\n stop: true,\n reason: `stuck: repeated same action for ${streak} step(s)`,\n }\n}\n\nfunction fingerprintState<TState, TAction>(\n state: TState,\n policies?: ControlStopPolicies<TState, TAction>,\n): string {\n if (policies?.stateFingerprint) return policies.stateFingerprint(state)\n return stableFingerprint(state)\n}\n\nfunction fingerprintAction<TState, TAction>(\n action: TAction,\n policies?: ControlStopPolicies<TState, TAction>,\n): string {\n if (policies?.actionFingerprint) return policies.actionFingerprint(action)\n return stableFingerprint(action)\n}\n\nfunction stableFingerprint(value: unknown): string {\n if (typeof value === 'string') return value\n if (typeof value === 'number' || typeof value === 'boolean' || value == null) return String(value)\n try {\n return JSON.stringify(sortForFingerprint(value))\n } catch {\n return String(value)\n }\n}\n\nfunction sortForFingerprint(value: unknown): unknown {\n if (Array.isArray(value)) return value.map(sortForFingerprint)\n if (!value || typeof value !== 'object') return value\n const record = value as Record<string, unknown>\n const sorted: Record<string, unknown> = {}\n for (const key of Object.keys(record).sort()) {\n sorted[key] = sortForFingerprint(record[key])\n }\n return sorted\n}\n\nfunction abortReason(signal: AbortSignal): string {\n const reason = signal.reason\n if (reason instanceof Error) return reason.message\n return reason ? String(reason) : 'aborted'\n}\n\nfunction runtimeError(\n phase: ControlRuntimeError['phase'],\n stepIndex: number,\n err: unknown,\n): ControlRuntimeError {\n const message = err instanceof Error ? err.message : String(err)\n return { phase, stepIndex, message }\n}\n\nasync function finish<TState, TAction, TActionResult, TEval extends ControlEvalResult>(\n emitter: TraceEmitter | undefined,\n result: ControlRunResult<TState, TAction, TActionResult, TEval>,\n): Promise<ControlRunResult<TState, TAction, TActionResult, TEval>> {\n await runTrace(result.runtimeErrors, result.steps.length, () =>\n emitter?.endRun({\n pass: result.pass,\n score: result.score ?? averageScore(result.finalEvals),\n failureClass: result.failureClass,\n notes: result.reason,\n }),\n )\n return result\n}\n"],"mappings":";;;;;AA+NA,IAAM,iBAAgC;AAAA,EACpC,UAAU;AAAA,EACV,WAAW,IAAI,KAAK;AACtB;AAEA,eAAsB,oBAMpB,QACkE;AAClE,QAAM,SAAS,gBAAgB,OAAO,MAAM;AAC5C,QAAM,gBAAgB,OAAO,iBAAiB;AAC9C,QAAM,aAAa,IAAI,gBAAgB;AACvC,QAAM,gBAAgB,MAAM,WAAW,MAAM,OAAO,QAAQ,MAAM;AAClE,MAAI,OAAO,QAAQ;AACjB,QAAI,OAAO,OAAO,QAAS,YAAW,MAAM,OAAO,OAAO,MAAM;AAAA,QAC3D,QAAO,OAAO,iBAAiB,SAAS,eAAe,EAAE,MAAM,KAAK,CAAC;AAAA,EAC5E;AAEA,QAAM,UAAU,KAAK,IAAI;AACzB,QAAM,YAAY,OAAO,YACrB;AAAA,IACE,MAAM,WAAW,MAAM,IAAI,MAAM,8BAA8B,CAAC;AAAA,IAChE,OAAO;AAAA,EACT,IACA;AACJ,QAAM,UAAgE,CAAC;AACvE,QAAM,UAAU,OAAO,QAAQ,IAAI,aAAa,OAAO,KAAK,IAAI;AAChE,MAAI,eAAe;AACnB,QAAM,gBAAuC,CAAC;AAC9C,MAAI;AACJ,MAAI;AACJ,MAAI,mBAAmB;AACvB,MAAI,uBAAuB;AAE3B,MAAI;AACF,QAAI,SAAS;AACX,YAAM;AAAA,QAAS;AAAA,QAAe;AAAA,QAAG,MAC/B,QAAQ,SAAS;AAAA,UACf,YAAY,OAAO,cAAc;AAAA,UACjC,WAAW,OAAO;AAAA,UAClB,WAAW,OAAO;AAAA,UAClB,OAAO;AAAA,UACP,MAAM;AAAA,YACJ,QAAQ,OAAO,OAAO,MAAM,GAAG,GAAG;AAAA,YAClC,UAAU,OAAO,OAAO,QAAQ;AAAA,YAChC,GAAI,OAAO,eAAe,SAAY,EAAE,YAAY,OAAO,OAAO,UAAU,EAAE,IAAI,CAAC;AAAA,UACrF;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI;AACJ,QAAI;AACJ,QAAI;AACF,cAAQ,MAAM,OAAO,QAAQ,EAAE,SAAS,aAAa,WAAW,OAAO,CAAC;AAAA,IAC1E,SAAS,KAAK;AACZ,YAAM,QAAQ,aAAa,WAAW,GAAG,GAAG;AAC5C,oBAAc,KAAK,KAAK;AACxB,aAAO,OAAO,SAAS;AAAA,QACrB,QAAQ,OAAO;AAAA,QACf,MAAM;AAAA,QACN,WAAW;AAAA,QACX,QAAQ,MAAM;AAAA,QACd,OAAO;AAAA,QACP,YAAY;AAAA,QACZ,YAAY,CAAC;AAAA,QACb,QAAQ,KAAK,IAAI,IAAI;AAAA,QACrB;AAAA,QACA,OAAO,SAAS,SAAS;AAAA,QACzB,cAAc;AAAA,QACd;AAAA,QACA,WAAW;AAAA,MACb,CAAC;AAAA,IACH;AACA,QAAI;AACF,cAAQ,MAAM,OAAO,SAAS;AAAA,QAC5B,QAAQ,OAAO;AAAA,QACf;AAAA,QACA;AAAA,QACA,aAAa,WAAW;AAAA,MAC1B,CAAC;AACD,YAAM,gBAAgB,SAAS,OAAO,WAAW,eAAe,CAAC;AAAA,IACnE,SAAS,KAAK;AACZ,YAAM,QAAQ,aAAa,YAAY,GAAG,GAAG;AAC7C,oBAAc,KAAK,KAAK;AACxB,aAAO,OAAO,SAAS;AAAA,QACrB,QAAQ,OAAO;AAAA,QACf,MAAM;AAAA,QACN,WAAW;AAAA,QACX,QAAQ,MAAM;AAAA,QACd,OAAO;AAAA,QACP,YAAY;AAAA,QACZ,YAAY,CAAC;AAAA,QACb,QAAQ,KAAK,IAAI,IAAI;AAAA,QACrB;AAAA,QACA,OAAO,SAAS,SAAS;AAAA,QACzB,cAAc;AAAA,QACd;AAAA,QACA,WAAW;AAAA,MACb,CAAC;AAAA,IACH;AACA,2BAAuB,iBAAiB,OAAO,OAAO,YAAY;AAElE,aAAS,YAAY,GAAG,YAAY,OAAO,UAAU,aAAa;AAChE,UAAI,WAAW,OAAO,SAAS;AAC7B,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,YAAY,WAAW,MAAM;AAAA,UACrC,OAAO;AAAA,UACP,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAEA,YAAM,aAAa,mBAAmB,QAAQ,YAAY;AAC1D,UAAI,WAAW,MAAM;AACnB,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,WAAW;AAAA,UACnB,OAAO,aAAa,KAAK;AAAA,UACzB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAEA,YAAM,MAAM;AAAA,QACV,OAAO;AAAA,QACP;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,WAAW;AAAA,QACX;AAAA,MACF;AACA,UAAI;AACJ,UAAI;AACF,eAAO,OAAO,aAAa,MAAM,OAAO,WAAW,GAAG,IAAI,oBAAoB,KAAK;AAAA,MACrF,SAAS,KAAK;AACZ,sBAAc,KAAK,aAAa,eAAe,WAAW,GAAG,CAAC;AAC9D,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,cAAc,cAAc,SAAS,CAAC,EAAG;AAAA,UACjD,OAAO,aAAa,KAAK;AAAA,UACzB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AACA,UAAI,KAAK,MAAM;AACb,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM,KAAK;AAAA,UACX,WAAW;AAAA,UACX,QAAQ,KAAK;AAAA,UACb,OAAO,KAAK;AAAA,UACZ,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc,KAAK;AAAA,UACnB;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAEA,UAAI;AACJ,UAAI;AACF,mBAAW,MAAM,OAAO,OAAO,GAAG;AAAA,MACpC,SAAS,KAAK;AACZ,sBAAc,KAAK,aAAa,UAAU,WAAW,GAAG,CAAC;AACzD,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,cAAc,cAAc,SAAS,CAAC,EAAG;AAAA,UACjD,OAAO,aAAa,KAAK;AAAA,UACzB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AACA,UAAI,SAAS,SAAS,QAAQ;AAC5B,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM,SAAS,QAAQ;AAAA,UACvB,WAAW;AAAA,UACX,QAAQ,SAAS;AAAA,UACjB,OAAO,SAAS;AAAA,UAChB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc,SAAS,SAAS,QAAQ,YAAY;AAAA,UACpD;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAEA,YAAM,oBAAoB,kBAAkB,SAAS,QAAQ,OAAO,YAAY;AAChF,6BACE,sBAAsB,wBAAwB,uBAAuB,IAAI;AAC3E,8BAAwB;AACxB,YAAM,qBAAqB;AAAA,QACzB,OAAO;AAAA,QACP;AAAA,MACF;AACA,UAAI,mBAAmB,MAAM;AAC3B,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,mBAAmB;AAAA,UAC3B,OAAO,aAAa,KAAK;AAAA,UACzB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAEA,YAAM,cAAc;AACpB,YAAM,cAAc;AACpB,YAAM,cAAc,aAAa,KAAK;AACtC,YAAM,gBAAgB,KAAK,IAAI;AAC/B,YAAM,aAAa,UACf,MAAM;AAAA,QAAS;AAAA,QAAe;AAAA,QAAW,MACvC,QAAQ,KAAK;AAAA,UACX,MAAM,gBAAgB,SAAS;AAAA,UAC/B,UAAU;AAAA,UACV,MAAM,SAAS;AAAA,UACf,YAAY;AAAA,YACV,UAAU,SAAS,UAAU;AAAA,YAC7B;AAAA,UACF;AAAA,QACF,CAAC;AAAA,MACH,IACA;AACJ,UAAI;AACJ,UAAI;AACF,cAAM,SAAS,MAAM,OAAO,IAAI,SAAS,QAAQ,GAAG;AACpD,cAAM,aAAa,OAAO,mBAAmB;AAAA,UAC3C,QAAQ,SAAS;AAAA,UACjB;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF,CAAC;AACD,cAAM,UAAU,uBAAuB,YAAY,eAAe,SAAS;AAC3E,YAAI,YAAY,UAAa,OAAO,SAAS,OAAO,KAAK,UAAU,GAAG;AACpE,0BAAgB;AAChB,gBAAM;AAAA,YACJ;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,UACF;AAAA,QACF;AACA,wBAAgB;AAAA,UACd,IAAI;AAAA,UACJ;AAAA,UACA,GAAI,YAAY,SAAY,EAAE,QAAQ,IAAI,CAAC;AAAA,UAC3C,YAAY,KAAK,IAAI,IAAI;AAAA,QAC3B;AAAA,MACF,SAAS,KAAK;AACZ,sBAAc,KAAK,aAAa,OAAO,WAAW,GAAG,CAAC;AACtD,wBAAgB;AAAA,UACd,IAAI;AAAA,UACJ,OAAO,cAAc,cAAc,SAAS,CAAC,EAAG;AAAA,UAChD,YAAY,KAAK,IAAI,IAAI;AAAA,QAC3B;AACA,YAAI,kBAAkB,QAAQ;AAC5B,gBAAM;AAAA,YAAS;AAAA,YAAe;AAAA,YAAW,MACvC,YAAY,KAAK,cAAc,SAAS,eAAe;AAAA,UACzD;AACA,gBAAMA,QAA2D;AAAA,YAC/D,OAAO;AAAA,YACP;AAAA,YACA;AAAA,YACA,YAAY;AAAA,YACZ;AAAA,YACA,YAAY;AAAA,YACZ;AAAA,YACA,WAAW,IAAI,KAAK,aAAa,EAAE,YAAY;AAAA,YAC/C,UAAS,oBAAI,KAAK,GAAE,YAAY;AAAA,UAClC;AACA,kBAAQ,KAAKA,KAAI;AACjB,gBAAM,UAAU,OAAO,QAAQA,OAAM,aAAa;AAClD,iBAAO,OAAO,SAAS;AAAA,YACrB,QAAQ,OAAO;AAAA,YACf,MAAM;AAAA,YACN,WAAW;AAAA,YACX,QAAQ,cAAc,SAAS;AAAA,YAC/B,OAAO,aAAa,KAAK;AAAA,YACzB,OAAO;AAAA,YACP,YAAY;AAAA,YACZ,YAAY;AAAA,YACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,YACrB;AAAA,YACA,OAAO,SAAS,SAAS;AAAA,YACzB,cAAc;AAAA,YACd;AAAA,YACA,WAAW;AAAA,UACb,CAAC;AAAA,QACH;AAAA,MACF;AAEA,UAAI;AACF,gBAAQ,MAAM,OAAO,QAAQ,EAAE,SAAS,aAAa,WAAW,OAAO,CAAC;AAAA,MAC1E,SAAS,KAAK;AACZ,sBAAc,KAAK,aAAa,WAAW,WAAW,GAAG,CAAC;AAC1D,cAAMA,QAA2D;AAAA,UAC/D,OAAO;AAAA,UACP;AAAA,UACA;AAAA,UACA,YAAY;AAAA,UACZ;AAAA,UACA,YAAY;AAAA,UACZ;AAAA,UACA,WAAW,IAAI,KAAK,aAAa,EAAE,YAAY;AAAA,UAC/C,UAAS,oBAAI,KAAK,GAAE,YAAY;AAAA,QAClC;AACA,gBAAQ,KAAKA,KAAI;AACjB,cAAM;AAAA,UAAS;AAAA,UAAe;AAAA,UAAW,MACvC,YAAY,KAAK,cAAc,cAAc,SAAS,CAAC,EAAG,OAAO;AAAA,QACnE;AACA,cAAM,UAAU,OAAO,QAAQA,OAAM,aAAa;AAClD,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,cAAc,cAAc,SAAS,CAAC,EAAG;AAAA,UACjD,OAAO,aAAa,KAAK;AAAA,UACzB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AACA,UAAI;AACF,gBAAQ,MAAM,OAAO,SAAS;AAAA,UAC5B,QAAQ,OAAO;AAAA,UACf;AAAA,UACA;AAAA,UACA,aAAa,WAAW;AAAA,QAC1B,CAAC;AACD,cAAM;AAAA,UACJ;AAAA,UACA;AAAA,UACA,QAAQ,SAAS;AAAA,UACjB;AAAA,UACA;AAAA,UACA,YAAY,KAAK;AAAA,QACnB;AAAA,MACF,SAAS,KAAK;AACZ,sBAAc,KAAK,aAAa,YAAY,WAAW,GAAG,CAAC;AAC3D,cAAMA,QAA2D;AAAA,UAC/D,OAAO;AAAA,UACP;AAAA,UACA;AAAA,UACA,YAAY;AAAA,UACZ;AAAA,UACA,YAAY;AAAA,UACZ;AAAA,UACA,WAAW,IAAI,KAAK,aAAa,EAAE,YAAY;AAAA,UAC/C,UAAS,oBAAI,KAAK,GAAE,YAAY;AAAA,QAClC;AACA,gBAAQ,KAAKA,KAAI;AACjB,cAAM;AAAA,UAAS;AAAA,UAAe;AAAA,UAAW,MACvC,YAAY,KAAK,cAAc,cAAc,SAAS,CAAC,EAAG,OAAO;AAAA,QACnE;AACA,cAAM,UAAU,OAAO,QAAQA,OAAM,aAAa;AAClD,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,cAAc,cAAc,SAAS,CAAC,EAAG;AAAA,UACjD,OAAO,aAAa,KAAK;AAAA,UACzB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AACA,YAAM,aAAa,aAAa,KAAK;AACrC,YAAM,mBAAmB,iBAAiB,OAAO,OAAO,YAAY;AACpE,YAAM,iBAAiB,uBAAuB;AAAA,QAC5C,UAAU,OAAO;AAAA,QACjB;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,eAAe;AAAA,MACjB,CAAC;AACD,yBAAmB,eAAe;AAClC,6BAAuB;AAEvB,YAAM,OAA2D;AAAA,QAC/D,OAAO;AAAA,QACP;AAAA,QACA;AAAA,QACA,YAAY;AAAA,QACZ;AAAA,QACA,YAAY;AAAA,QACZ;AAAA,QACA,WAAW,IAAI,KAAK,aAAa,EAAE,YAAY;AAAA,QAC/C,UAAS,oBAAI,KAAK,GAAE,YAAY;AAAA,MAClC;AACA,cAAQ,KAAK,IAAI;AACjB,UAAI,cAAc,IAAI;AACpB,cAAM;AAAA,UAAS;AAAA,UAAe;AAAA,UAAW,MACvC,YAAY,IAAI;AAAA,YACd,YAAY;AAAA,cACV,eAAe,cAAc,WAAW;AAAA,cACxC;AAAA,cACA,aAAa,eAAe;AAAA,cAC5B,YAAY,cAAc;AAAA,cAC1B;AAAA,YACF;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF,OAAO;AACL,cAAM;AAAA,UAAS;AAAA,UAAe;AAAA,UAAW,MACvC,YAAY,KAAK,cAAc,SAAS,iBAAiB;AAAA,YACvD,YAAY;AAAA,cACV;AAAA,cACA;AAAA,YACF;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AACA,YAAM,UAAU,OAAO,QAAQ,MAAM,aAAa;AAElD,UAAI,eAAe,MAAM;AACvB,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,eAAe;AAAA,UACvB,OAAO;AAAA,UACP,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAEA,YAAM,qBAAqB,mBAAmB,QAAQ,YAAY;AAClE,UAAI,mBAAmB,MAAM;AAC3B,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,mBAAmB;AAAA,UAC3B,OAAO;AAAA,UACP,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAEA,YAAM,cAAc;AAAA,QAClB,OAAO;AAAA,QACP;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA,YAAY;AAAA,QACZ;AAAA,QACA;AAAA,QACA,WAAW;AAAA,QACX;AAAA,MACF;AACA,UAAI;AACJ,UAAI;AACF,uBAAe,OAAO,aAClB,MAAM,OAAO,WAAW,WAAW,IACnC,oBAAoB,KAAK;AAAA,MAC/B,SAAS,KAAK;AACZ,sBAAc,KAAK,aAAa,eAAe,YAAY,GAAG,GAAG,CAAC;AAClE,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM;AAAA,UACN,WAAW;AAAA,UACX,QAAQ,cAAc,cAAc,SAAS,CAAC,EAAG;AAAA,UACjD,OAAO,aAAa,KAAK;AAAA,UACzB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc;AAAA,UACd;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AACA,UAAI,aAAa,MAAM;AACrB,eAAO,OAAO,SAAS;AAAA,UACrB,QAAQ,OAAO;AAAA,UACf,MAAM,aAAa;AAAA,UACnB,WAAW;AAAA,UACX,QAAQ,aAAa;AAAA,UACrB,OAAO,aAAa;AAAA,UACpB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,YAAY;AAAA,UACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,UACrB;AAAA,UACA,OAAO,SAAS,SAAS;AAAA,UACzB,cAAc,aAAa;AAAA,UAC3B;AAAA,UACA,WAAW;AAAA,QACb,CAAC;AAAA,MACH;AAAA,IACF;AAEA,WAAO,OAAO,SAAS;AAAA,MACrB,QAAQ,OAAO;AAAA,MACf,MAAM;AAAA,MACN,WAAW;AAAA,MACX,QAAQ,8BAA8B,OAAO,QAAQ;AAAA,MACrD,OAAO;AAAA,MACP,YAAY;AAAA,MACZ,YAAY;AAAA,MACZ,QAAQ,KAAK,IAAI,IAAI;AAAA,MACrB;AAAA,MACA,OAAO,SAAS,SAAS;AAAA,MACzB,cAAc;AAAA,MACd;AAAA,MACA,WAAW;AAAA,IACb,CAAC;AAAA,EACH,SAAS,KAAK;AACZ,kBAAc,KAAK,aAAa,OAAO,QAAQ,QAAQ,GAAG,CAAC;AAC3D,WAAO,OAAO,SAAS;AAAA,MACrB,QAAQ,OAAO;AAAA,MACf,MAAM;AAAA,MACN,WAAW;AAAA,MACX,QAAQ,cAAc,cAAc,SAAS,CAAC,EAAG;AAAA,MACjD,OAAO;AAAA,MACP,YAAY;AAAA,MACZ,YAAY,CAAC;AAAA,MACb,QAAQ,KAAK,IAAI,IAAI;AAAA,MACrB;AAAA,MACA,OAAO,SAAS,SAAS;AAAA,MACzB,cAAc;AAAA,MACd;AAAA,MACA,WAAW;AAAA,IACb,CAAC;AAAA,EACH,UAAE;AACA,QAAI,UAAW,cAAa,SAAS;AACrC,QAAI,OAAO,OAAQ,QAAO,OAAO,oBAAoB,SAAS,aAAa;AAAA,EAC7E;AACF;AAEO,SAAS,iBACd,oBACA,UAA4E,CAAC,GACvC;AACtC,SAAO,EAAE,GAAG,SAAS,mBAAmB;AAC1C;AAEO,SAAS,qBACd,oBACA,UAA4E,CAAC,GACvC;AACtC,SAAO,EAAE,GAAG,SAAS,mBAAmB;AAC1C;AAEO,SAAS,cAAc,OAAgE;AAC5F,SAAO,EAAE,GAAG,OAAO,WAAW,KAAK;AACrC;AAEO,SAAS,eAAe,OAAgE;AAC7F,SAAO,EAAE,GAAG,OAAO,WAAW,MAAM;AACtC;AAEA,SAAS,gBAAgB,OAA0D;AACjF,QAAM,MAAM,EAAE,GAAG,gBAAgB,GAAG,MAAM;AAC1C,MAAI,CAAC,OAAO,UAAU,IAAI,QAAQ,KAAM,IAAI,WAAsB,GAAG;AACnE,UAAM,IAAI;AAAA,MACR,+DAA+D,OAAO,IAAI,QAAQ,CAAC;AAAA,IACrF;AAAA,EACF;AACA,QAAM,SAAwB,EAAE,UAAU,IAAI,SAAmB;AACjE,MAAI,IAAI,cAAc,QAAW;AAC/B,QACE,OAAO,IAAI,cAAc,YACzB,CAAC,OAAO,SAAS,IAAI,SAAS,KAC9B,IAAI,aAAa,GACjB;AACA,YAAM,IAAI;AAAA,QACR,yEAAyE,OAAO,IAAI,SAAS,CAAC;AAAA,MAChG;AAAA,IACF;AACA,WAAO,YAAY,IAAI;AAAA,EACzB;AACA,MAAI,IAAI,eAAe,QAAW;AAChC,QACE,OAAO,IAAI,eAAe,YAC1B,CAAC,OAAO,SAAS,IAAI,UAAU,KAC/B,IAAI,aAAa,GACjB;AACA,YAAM,IAAI;AAAA,QACR,6EAA6E,OAAO,IAAI,UAAU,CAAC;AAAA,MACrG;AAAA,IACF;AACA,WAAO,aAAa,IAAI;AAAA,EAC1B;AACA,SAAO;AACT;AAEA,SAAS,uBACP,SACA,eACA,WACoB;AACpB,MAAI,YAAY,OAAW,QAAO;AAClC,MAAI,CAAC,OAAO,SAAS,OAAO,KAAK,UAAU,GAAG;AAC5C,kBAAc;AAAA,MACZ,aAAa,OAAO,WAAW,IAAI,MAAM,2BAA2B,OAAO,OAAO,CAAC,EAAE,CAAC;AAAA,IACxF;AACA,WAAO;AAAA,EACT;AACA,SAAO;AACT;AAEO,SAAS,kBAAkB,OAAqC;AACrE,SAAO,MAAM;AAAA,IACX,CAAC,WAAW,OAAO,UAAW,OAAO,aAAa,cAAc,OAAO,aAAa;AAAA,EACtF;AACF;AAEA,SAAS,YACP,QACA,OACA,OACA,SACA,QACA,WACA,SACA,cACA,aACA,SACuD;AACvD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,QAAQ,KAAK,IAAI,IAAI;AAAA,IACrB;AAAA,IACA,kBACE,OAAO,eAAe,SAAY,SAAY,KAAK,IAAI,GAAG,OAAO,aAAa,YAAY;AAAA,IAC5F;AAAA,IACA;AAAA,EACF;AACF;AAEA,SAAS,oBAAoB,OAA0C;AACrE,MAAI,CAAC,MAAM,OAAQ,QAAO,EAAE,MAAM,OAAO,MAAM,OAAO,QAAQ,eAAe;AAC7E,QAAM,OAAO,kBAAkB,KAAK;AACpC,SAAO,OACH,EAAE,MAAM,MAAM,MAAM,MAAM,QAAQ,6BAA6B,OAAO,aAAa,KAAK,EAAE,IAC1F;AAAA,IACE,MAAM;AAAA,IACN,MAAM;AAAA,IACN,QAAQ;AAAA,IACR,OAAO,aAAa,KAAK;AAAA,EAC3B;AACN;AAEA,SAAS,aAAa,OAAgD;AACpE,QAAM,SAAS,MACZ,IAAI,CAAC,WAAW,OAAO,KAAK,EAC5B,OAAO,CAAC,UAA2B,OAAO,UAAU,QAAQ;AAC/D,MAAI,CAAC,OAAO,OAAQ,QAAO;AAC3B,SAAO,KAAK,MAAO,OAAO,OAAO,CAAC,KAAK,UAAU,MAAM,OAAO,CAAC,IAAI,OAAO,SAAU,GAAI,IAAI;AAC9F;AAEA,SAAS,mBACP,QACA,cACmC;AACnC,MAAI,OAAO,eAAe,UAAa,gBAAgB,OAAO,YAAY;AACxE,WAAO;AAAA,MACL,MAAM;AAAA,MACN,QAAQ,gCAAgC,OAAO,UAAU;AAAA,IAC3D;AAAA,EACF;AACA,SAAO,EAAE,MAAM,OAAO,QAAQ,GAAG;AACnC;AAEA,eAAe,iBACb,SACA,QACA,cACA,QACA,eACA,WACe;AACf,MAAI,CAAC,WAAW,OAAO,eAAe,OAAW;AACjD,QAAM,aAAa,OAAO;AAC1B,QAAM;AAAA,IAAS;AAAA,IAAe;AAAA,IAAW,MACvC,QAAQ,aAAa;AAAA,MACnB,WAAW;AAAA,MACX,OAAO;AAAA,MACP,UAAU;AAAA,MACV,WAAW,KAAK,IAAI,GAAG,aAAa,YAAY;AAAA,MAChD,UAAU,gBAAgB;AAAA,MAC1B,QAAQ,QAAQ,KAAK;AAAA,IACvB,CAAC;AAAA,EACH;AACF;AAEA,eAAe,gBACb,SACA,OACA,OACA,eACA,WACA,cACe;AACf,MAAI,CAAC,QAAS;AACd,aAAW,UAAU,OAAO;AAC1B,UAAM;AAAA,MAAS;AAAA,MAAe;AAAA,MAAW,MACvC,QAAQ,YAAY;AAAA,QAClB,SAAS,OAAO,YAAY,wBAAwB;AAAA,QACpD,cAAc,gBAAgB,QAAQ;AAAA,QACtC,MAAM,gBAAgB,OAAO,EAAE;AAAA,QAC/B,WAAW,OAAO;AAAA,QAClB,OAAO,OAAO,OAAO,UAAU,WAAW,OAAO,QAAQ,OAAO,SAAS,IAAI;AAAA,QAC7E,WAAW,OAAO;AAAA,QAClB,UAAU,OAAO;AAAA,QACjB,YAAY;AAAA,UACV;AAAA,UACA,QAAQ,OAAO;AAAA,UACf,UAAU,OAAO;AAAA,UACjB,WAAW,OAAO;AAAA,QACpB;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACF;AAEA,eAAe,UACb,QACA,MACA,eACe;AACf,MAAI,CAAC,OAAQ;AACb,MAAI;AACF,UAAM,OAAO,IAAI;AAAA,EACnB,SAAS,KAAK;AACZ,kBAAc,KAAK,aAAa,WAAW,KAAK,OAAO,GAAG,CAAC;AAAA,EAC7D;AACF;AAEA,eAAe,SACb,eACA,WACA,OACwB;AACxB,MAAI;AACF,WAAO,MAAM,MAAM;AAAA,EACrB,SAAS,KAAK;AACZ,kBAAc,KAAK,aAAa,SAAS,WAAW,GAAG,CAAC;AACxD,WAAO;AAAA,EACT;AACF;AAEA,SAAS,uBAAwC,MAOK;AACpD,QAAM,MAAM,KAAK,UAAU;AAC3B,MAAI,CAAC,OAAO,OAAO,EAAG,QAAO,EAAE,MAAM,OAAO,QAAQ,IAAI,QAAQ,EAAE;AAClE,QAAM,gBAAgB,KAAK,UAAU,iBAAiB;AACtD,QAAM,aAAa,KAAK,KAAK,KAAK,cAAc,MAAM,KAAK,eAAe,EAAE;AAC5E,QAAM,iBACJ,KAAK,yBAAyB,UAAa,KAAK,yBAAyB,KAAK;AAChF,QAAM,YAAY,aAAa;AAC/B,QAAM,SAAS,kBAAkB,YAAY,KAAK,gBAAgB,IAAI;AACtE,SAAO,UAAU,MACb,EAAE,MAAM,MAAM,QAAQ,sCAAsC,MAAM,YAAY,OAAO,IACrF,EAAE,MAAM,OAAO,QAAQ,IAAI,OAAO;AACxC;AAEA,SAAS,2BACP,UACA,QACmC;AACnC,QAAM,MAAM,UAAU;AACtB,MAAI,CAAC,OAAO,OAAO,KAAK,SAAS,IAAK,QAAO,EAAE,MAAM,OAAO,QAAQ,GAAG;AACvE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,QAAQ,mCAAmC,MAAM;AAAA,EACnD;AACF;AAEA,SAAS,iBACP,OACA,UACQ;AACR,MAAI,UAAU,iBAAkB,QAAO,SAAS,iBAAiB,KAAK;AACtE,SAAO,kBAAkB,KAAK;AAChC;AAEA,SAAS,kBACP,QACA,UACQ;AACR,MAAI,UAAU,kBAAmB,QAAO,SAAS,kBAAkB,MAAM;AACzE,SAAO,kBAAkB,MAAM;AACjC;AAEA,SAAS,kBAAkB,OAAwB;AACjD,MAAI,OAAO,UAAU,SAAU,QAAO;AACtC,MAAI,OAAO,UAAU,YAAY,OAAO,UAAU,aAAa,SAAS,KAAM,QAAO,OAAO,KAAK;AACjG,MAAI;AACF,WAAO,KAAK,UAAU,mBAAmB,KAAK,CAAC;AAAA,EACjD,QAAQ;AACN,WAAO,OAAO,KAAK;AAAA,EACrB;AACF;AAEA,SAAS,mBAAmB,OAAyB;AACnD,MAAI,MAAM,QAAQ,KAAK,EAAG,QAAO,MAAM,IAAI,kBAAkB;AAC7D,MAAI,CAAC,SAAS,OAAO,UAAU,SAAU,QAAO;AAChD,QAAM,SAAS;AACf,QAAM,SAAkC,CAAC;AACzC,aAAW,OAAO,OAAO,KAAK,MAAM,EAAE,KAAK,GAAG;AAC5C,WAAO,GAAG,IAAI,mBAAmB,OAAO,GAAG,CAAC;AAAA,EAC9C;AACA,SAAO;AACT;AAEA,SAAS,YAAY,QAA6B;AAChD,QAAM,SAAS,OAAO;AACtB,MAAI,kBAAkB,MAAO,QAAO,OAAO;AAC3C,SAAO,SAAS,OAAO,MAAM,IAAI;AACnC;AAEA,SAAS,aACP,OACA,WACA,KACqB;AACrB,QAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,SAAO,EAAE,OAAO,WAAW,QAAQ;AACrC;AAEA,eAAe,OACb,SACA,QACkE;AAClE,QAAM;AAAA,IAAS,OAAO;AAAA,IAAe,OAAO,MAAM;AAAA,IAAQ,MACxD,SAAS,OAAO;AAAA,MACd,MAAM,OAAO;AAAA,MACb,OAAO,OAAO,SAAS,aAAa,OAAO,UAAU;AAAA,MACrD,cAAc,OAAO;AAAA,MACrB,OAAO,OAAO;AAAA,IAChB,CAAC;AAAA,EACH;AACA,SAAO;AACT;","names":["step"]}
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  objectiveEval,
3
3
  runAgentControlLoop
4
- } from "./chunk-LSH4MMOZ.js";
4
+ } from "./chunk-NCRFYPS3.js";
5
5
  import {
6
6
  validateRunRecord
7
7
  } from "./chunk-NLMNWKVM.js";
@@ -610,4 +610,4 @@ export {
610
610
  runProposeReviewAsControlLoop,
611
611
  controlFailureClassFromVerification
612
612
  };
613
- //# sourceMappingURL=chunk-ZN274SWR.js.map
613
+ //# sourceMappingURL=chunk-PALJO75S.js.map
@@ -85,7 +85,8 @@ var SubprocessSandboxDriver = class {
85
85
  () => {
86
86
  try {
87
87
  child.kill("SIGKILL");
88
- } catch {
88
+ } catch (err) {
89
+ console.warn("[sandbox-harness] SIGKILL on timeout failed:", err);
89
90
  }
90
91
  },
91
92
  config.timeoutMs ?? 10 * 6e4
@@ -247,4 +248,4 @@ export {
247
248
  SandboxHarness,
248
249
  runTestGradedScenario
249
250
  };
250
- //# sourceMappingURL=chunk-OWLAAMME.js.map
251
+ //# sourceMappingURL=chunk-QHF6EQKK.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/sandbox-harness.ts","../src/test-graded-scenario.ts"],"sourcesContent":["/**\n * SandboxHarness — executes a scenario in an isolated environment and\n * emits a rich SandboxSpan into the trace.\n *\n * Two built-in drivers:\n * - `SubprocessSandboxDriver` — spawn in a local cwd with env vars.\n * Fast, no dependencies, fine for unit tests and most CI gates.\n * - `DockerSandboxDriver` — lifted from tangle-router's sandbox path;\n * shells out to `docker run`. Stronger isolation, slower startup.\n *\n * Consumers implement `SandboxDriver` for custom backends (Firecracker,\n * Cloudflare sandbox product, etc.). The harness doesn't care which.\n */\n\nimport { ConfigError } from './errors'\nimport type { TraceEmitter } from './trace/emitter'\nimport type { SandboxSpan } from './trace/schema'\n\nexport interface HarnessConfig {\n /** Setup command (e.g. \"pnpm install\"). Non-zero exit fails the run. */\n setupCommand?: string\n /** Run command (e.g. \"pnpm build\"). */\n runCommand?: string\n /** Test command (e.g. \"pnpm test --run\"). Drives the test count + pass count. */\n testCommand?: string\n /** Absolute cwd for the subprocess driver. Ignored by docker driver. */\n cwd?: string\n /** Max wall-clock per phase in ms. Default 10 minutes. */\n timeoutMs?: number\n /** Image for the docker driver. */\n image?: string\n /** Extra env vars (validated; shell-escaped). */\n env?: Record<string, string>\n /** Parser for the test output — maps stdout/stderr/exit code → pass count. */\n testParser?: TestOutputParser\n}\n\nexport interface TestOutputParser {\n id: string\n parse(\n stdout: string,\n stderr: string,\n exitCode: number,\n ): { testsTotal: number; testsPassed: number } | undefined\n}\n\nexport interface SandboxResult {\n phase: 'setup' | 'run' | 'test'\n exitCode: number\n stdout: string\n stderr: string\n wallMs: number\n testsTotal?: number\n testsPassed?: number\n}\n\nexport interface SandboxDriver {\n id: string\n exec(\n phase: SandboxResult['phase'],\n command: string,\n config: HarnessConfig,\n ): Promise<SandboxResult>\n}\n\n// ── Parsers ──────────────────────────────────────────────────────────\n\n/** Vitest default summary line: \"Tests X passed | Y failed\". */\nexport const vitestTestParser: TestOutputParser = {\n id: 'vitest',\n parse(stdout) {\n const m = stdout.match(/Tests\\s+(\\d+)\\s+(passed|failed)(?:\\s*\\|\\s*(\\d+)\\s+(passed|failed))?/i)\n if (!m) return undefined\n let passed = 0\n let failed = 0\n const a = parseInt(m[1]!, 10)\n const aLabel = m[2]!.toLowerCase()\n if (aLabel === 'passed') passed += a\n else failed += a\n if (m[3] && m[4]) {\n const b = parseInt(m[3], 10)\n if (m[4].toLowerCase() === 'passed') passed += b\n else failed += b\n }\n return { testsTotal: passed + failed, testsPassed: passed }\n },\n}\n\n/** Pytest default: \"collected N items\" + \" X passed, Y failed\". */\nexport const pytestTestParser: TestOutputParser = {\n id: 'pytest',\n parse(stdout) {\n const total = stdout.match(/collected\\s+(\\d+)\\s+items?/i)\n const passed = stdout.match(/(\\d+)\\s+passed/)\n if (!total || !passed) return undefined\n return { testsTotal: parseInt(total[1]!, 10), testsPassed: parseInt(passed[1]!, 10) }\n },\n}\n\n/** Jest: \"Tests: X passed, Y total\" (and optional failed). */\nexport const jestTestParser: TestOutputParser = {\n id: 'jest',\n parse(stdout) {\n const m = stdout.match(/Tests:\\s+(?:(\\d+)\\s+failed[^,]*,\\s*)?(\\d+)\\s+passed,\\s+(\\d+)\\s+total/i)\n if (!m) return undefined\n return { testsTotal: parseInt(m[3]!, 10), testsPassed: parseInt(m[2]!, 10) }\n },\n}\n\n/** Composite parser — tries a list of parsers in order. */\nexport function composeParsers(...parsers: TestOutputParser[]): TestOutputParser {\n return {\n id: parsers.map((p) => p.id).join('|'),\n parse(stdout, stderr, exitCode) {\n for (const p of parsers) {\n const res = p.parse(stdout, stderr, exitCode)\n if (res) return res\n }\n return undefined\n },\n }\n}\n\n// ── Drivers ──────────────────────────────────────────────────────────\n\nexport interface SubprocessSandboxDriverOptions {\n /**\n * Default cwd for all `exec` calls. Used when the per-call `HarnessConfig`\n * does not set its own `cwd`. Lets callers bind the driver to a working\n * directory once instead of spreading cwd into every harness config —\n * useful when the harness config is constructed far from the call site\n * (e.g. starter-foundry's promoter passes a static HarnessConfig per\n * family taxonomy but needs a per-run composed-scaffold cwd).\n */\n cwd?: string\n /**\n * Default env merged into every `exec` call's env (per-call `HarnessConfig.env`\n * still wins on key collision). Same ergonomic rationale as `cwd` above.\n */\n env?: Record<string, string>\n}\n\nexport class SubprocessSandboxDriver implements SandboxDriver {\n id = 'subprocess'\n private defaultCwd?: string\n private defaultEnv?: Record<string, string>\n\n constructor(options: SubprocessSandboxDriverOptions = {}) {\n this.defaultCwd = options.cwd\n this.defaultEnv = options.env\n }\n\n async exec(\n phase: SandboxResult['phase'],\n command: string,\n config: HarnessConfig,\n ): Promise<SandboxResult> {\n const { spawn } = await import('node:child_process')\n const start = Date.now()\n // Per-call config wins; fall back to constructor defaults. Honoring\n // the constructor `cwd` keeps the subprocess from inheriting Node's\n // cwd when only the constructor arg is supplied.\n const effectiveCwd = config.cwd ?? this.defaultCwd\n const effectiveEnv = { ...process.env, ...(this.defaultEnv ?? {}), ...(config.env ?? {}) }\n return await new Promise<SandboxResult>((resolve) => {\n const child = spawn(command, {\n shell: true,\n cwd: effectiveCwd,\n env: effectiveEnv,\n })\n let stdout = ''\n let stderr = ''\n child.stdout?.on('data', (d) => {\n stdout += String(d)\n })\n child.stderr?.on('data', (d) => {\n stderr += String(d)\n })\n const timeout = setTimeout(\n () => {\n try {\n child.kill('SIGKILL')\n } catch (err) {\n console.warn('[sandbox-harness] SIGKILL on timeout failed:', err)\n }\n },\n config.timeoutMs ?? 10 * 60_000,\n )\n child.on('close', (code) => {\n clearTimeout(timeout)\n const wallMs = Date.now() - start\n const parsed =\n phase === 'test' && config.testParser\n ? config.testParser.parse(stdout, stderr, code ?? 1)\n : undefined\n resolve({\n phase,\n exitCode: code ?? 1,\n stdout,\n stderr,\n wallMs,\n testsTotal: parsed?.testsTotal,\n testsPassed: parsed?.testsPassed,\n })\n })\n child.on('error', (err) => {\n clearTimeout(timeout)\n const wallMs = Date.now() - start\n resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs })\n })\n })\n }\n}\n\nexport class DockerSandboxDriver implements SandboxDriver {\n id = 'docker'\n\n async exec(\n phase: SandboxResult['phase'],\n command: string,\n config: HarnessConfig,\n ): Promise<SandboxResult> {\n if (!config.image) throw new ConfigError('DockerSandboxDriver requires config.image')\n const sub = new SubprocessSandboxDriver()\n const envArgs = Object.entries(config.env ?? {})\n .map(([k, v]) => `-e ${shellQuote(k)}=${shellQuote(v)}`)\n .join(' ')\n const wrapped = `docker run --rm ${envArgs} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`\n return sub.exec(phase, wrapped, { ...config, env: undefined })\n }\n}\n\nfunction shellQuote(v: string): string {\n if (/^[A-Za-z0-9_\\-/.@:=]+$/.test(v)) return v\n return `'${v.replace(/'/g, `'\\\\''`)}'`\n}\n\n// ── Harness orchestration ────────────────────────────────────────────\n\nexport interface SandboxHarnessResult {\n passed: boolean\n setup?: SandboxResult\n run?: SandboxResult\n test?: SandboxResult\n totalWallMs: number\n /** Final score — 0 when no tests; otherwise testsPassed/testsTotal. */\n score: number\n}\n\nexport class SandboxHarness {\n private driver: SandboxDriver\n constructor(driver: SandboxDriver = new SubprocessSandboxDriver()) {\n this.driver = driver\n }\n\n async run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult> {\n const handle = await emitter.sandbox({\n name: `sandbox(${this.driver.id})`,\n image: config.image,\n command: [config.setupCommand, config.runCommand, config.testCommand]\n .filter(Boolean)\n .join(' && '),\n })\n const result: SandboxHarnessResult = { passed: false, totalWallMs: 0, score: 0 }\n try {\n if (config.setupCommand) {\n result.setup = await this.driver.exec('setup', config.setupCommand, config)\n result.totalWallMs += result.setup.wallMs\n if (result.setup.exitCode !== 0) {\n await handle.fail(`setup failed (exit ${result.setup.exitCode})`, {\n exitCode: result.setup.exitCode,\n wallMs: result.totalWallMs,\n } as Partial<SandboxSpan>)\n return result\n }\n }\n if (config.runCommand) {\n result.run = await this.driver.exec('run', config.runCommand, config)\n result.totalWallMs += result.run.wallMs\n if (result.run.exitCode !== 0) {\n await handle.fail(`run failed (exit ${result.run.exitCode})`, {\n exitCode: result.run.exitCode,\n wallMs: result.totalWallMs,\n } as Partial<SandboxSpan>)\n return result\n }\n }\n if (config.testCommand) {\n result.test = await this.driver.exec('test', config.testCommand, config)\n result.totalWallMs += result.test.wallMs\n const passed = result.test.exitCode === 0\n result.passed = passed\n if (result.test.testsTotal !== undefined && result.test.testsTotal > 0) {\n result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal\n } else {\n result.score = passed ? 1 : 0\n }\n await handle.end({\n exitCode: result.test.exitCode,\n testsTotal: result.test.testsTotal,\n testsPassed: result.test.testsPassed,\n wallMs: result.totalWallMs,\n status: passed ? 'ok' : 'error',\n } as Partial<SandboxSpan>)\n } else {\n result.passed = true\n result.score = 1\n await handle.end({ wallMs: result.totalWallMs } as Partial<SandboxSpan>)\n }\n } catch (err) {\n await handle.fail(err instanceof Error ? err : String(err))\n throw err\n }\n return result\n }\n}\n","/**\n * TestGradedScenario — a scenario whose score comes from a test suite.\n *\n * This is the SWE-bench pattern generalized. The scenario ships:\n * - fixture data (setup instructions)\n * - a test command the harness runs\n * - optional assertion overrides\n *\n * The runner emits a run, delegates to SandboxHarness, records the\n * outcome, and returns a structured verdict. Consumers bind their own\n * agent execution to this contract.\n */\n\nimport type { HarnessConfig, SandboxDriver, SandboxHarnessResult } from './sandbox-harness'\nimport { SandboxHarness } from './sandbox-harness'\nimport { TraceEmitter } from './trace/emitter'\nimport type { FailureClass, Run } from './trace/schema'\nimport type { TraceStore } from './trace/store'\n\nexport interface TestGradedScenario {\n id: string\n description?: string\n harness: HarnessConfig\n /** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */\n passThreshold?: number\n /** Provenance for dataset tracking. */\n datasetVersion?: string\n /** Free-form tags (difficulty, category, etc.). */\n tags?: Record<string, string>\n}\n\nexport interface TestGradedRunOptions {\n variantId?: string\n driver?: SandboxDriver\n /** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed). */\n provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>\n}\n\nexport interface TestGradedRunResult {\n runId: string\n scenario: TestGradedScenario\n harness: SandboxHarnessResult\n pass: boolean\n score: number\n failureClass?: FailureClass\n}\n\nexport async function runTestGradedScenario(\n scenario: TestGradedScenario,\n store: TraceStore,\n options: TestGradedRunOptions = {},\n): Promise<TestGradedRunResult> {\n const emitter = new TraceEmitter(store)\n await emitter.startRun({\n scenarioId: scenario.id,\n variantId: options.variantId,\n datasetVersion: scenario.datasetVersion,\n tags: scenario.tags,\n ...options.provenance,\n })\n const harness = new SandboxHarness(options.driver)\n const result = await harness.run(scenario.harness, emitter)\n const threshold = scenario.passThreshold ?? 1.0\n const pass = result.passed && result.score >= threshold\n const setupFailed = result.setup !== undefined && result.setup.exitCode !== 0\n const runFailed = result.run !== undefined && result.run.exitCode !== 0\n const testFailed = result.test !== undefined && result.test.exitCode !== 0\n const failureClass: FailureClass | undefined = pass\n ? 'success'\n : setupFailed || runFailed\n ? 'sandbox_failure'\n : testFailed\n ? 'format_drift'\n : 'unknown'\n await emitter.endRun({\n pass,\n score: result.score,\n failureClass,\n notes: pass ? undefined : reasonForFailure(result),\n })\n return {\n runId: emitter.runId,\n scenario,\n harness: result,\n pass,\n score: result.score,\n failureClass,\n }\n}\n\nfunction reasonForFailure(result: SandboxHarnessResult): string {\n if (result.setup && result.setup.exitCode !== 0)\n return `setup failed: exit ${result.setup.exitCode}`\n if (result.run && result.run.exitCode !== 0) return `run failed: exit ${result.run.exitCode}`\n if (result.test) {\n if (result.test.testsTotal !== undefined) {\n return `tests: ${result.test.testsPassed ?? 0}/${result.test.testsTotal}`\n }\n return `test exit ${result.test.exitCode}`\n }\n return 'no test command'\n}\n"],"mappings":";;;;;;;;AAoEO,IAAM,mBAAqC;AAAA,EAChD,IAAI;AAAA,EACJ,MAAM,QAAQ;AACZ,UAAM,IAAI,OAAO,MAAM,sEAAsE;AAC7F,QAAI,CAAC,EAAG,QAAO;AACf,QAAI,SAAS;AACb,QAAI,SAAS;AACb,UAAM,IAAI,SAAS,EAAE,CAAC,GAAI,EAAE;AAC5B,UAAM,SAAS,EAAE,CAAC,EAAG,YAAY;AACjC,QAAI,WAAW,SAAU,WAAU;AAAA,QAC9B,WAAU;AACf,QAAI,EAAE,CAAC,KAAK,EAAE,CAAC,GAAG;AAChB,YAAM,IAAI,SAAS,EAAE,CAAC,GAAG,EAAE;AAC3B,UAAI,EAAE,CAAC,EAAE,YAAY,MAAM,SAAU,WAAU;AAAA,UAC1C,WAAU;AAAA,IACjB;AACA,WAAO,EAAE,YAAY,SAAS,QAAQ,aAAa,OAAO;AAAA,EAC5D;AACF;AAGO,IAAM,mBAAqC;AAAA,EAChD,IAAI;AAAA,EACJ,MAAM,QAAQ;AACZ,UAAM,QAAQ,OAAO,MAAM,6BAA6B;AACxD,UAAM,SAAS,OAAO,MAAM,gBAAgB;AAC5C,QAAI,CAAC,SAAS,CAAC,OAAQ,QAAO;AAC9B,WAAO,EAAE,YAAY,SAAS,MAAM,CAAC,GAAI,EAAE,GAAG,aAAa,SAAS,OAAO,CAAC,GAAI,EAAE,EAAE;AAAA,EACtF;AACF;AAGO,IAAM,iBAAmC;AAAA,EAC9C,IAAI;AAAA,EACJ,MAAM,QAAQ;AACZ,UAAM,IAAI,OAAO,MAAM,uEAAuE;AAC9F,QAAI,CAAC,EAAG,QAAO;AACf,WAAO,EAAE,YAAY,SAAS,EAAE,CAAC,GAAI,EAAE,GAAG,aAAa,SAAS,EAAE,CAAC,GAAI,EAAE,EAAE;AAAA,EAC7E;AACF;AAGO,SAAS,kBAAkB,SAA+C;AAC/E,SAAO;AAAA,IACL,IAAI,QAAQ,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,KAAK,GAAG;AAAA,IACrC,MAAM,QAAQ,QAAQ,UAAU;AAC9B,iBAAW,KAAK,SAAS;AACvB,cAAM,MAAM,EAAE,MAAM,QAAQ,QAAQ,QAAQ;AAC5C,YAAI,IAAK,QAAO;AAAA,MAClB;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAqBO,IAAM,0BAAN,MAAuD;AAAA,EAC5D,KAAK;AAAA,EACG;AAAA,EACA;AAAA,EAER,YAAY,UAA0C,CAAC,GAAG;AACxD,SAAK,aAAa,QAAQ;AAC1B,SAAK,aAAa,QAAQ;AAAA,EAC5B;AAAA,EAEA,MAAM,KACJ,OACA,SACA,QACwB;AACxB,UAAM,EAAE,MAAM,IAAI,MAAM,OAAO,eAAoB;AACnD,UAAM,QAAQ,KAAK,IAAI;AAIvB,UAAM,eAAe,OAAO,OAAO,KAAK;AACxC,UAAM,eAAe,EAAE,GAAG,QAAQ,KAAK,GAAI,KAAK,cAAc,CAAC,GAAI,GAAI,OAAO,OAAO,CAAC,EAAG;AACzF,WAAO,MAAM,IAAI,QAAuB,CAAC,YAAY;AACnD,YAAM,QAAQ,MAAM,SAAS;AAAA,QAC3B,OAAO;AAAA,QACP,KAAK;AAAA,QACL,KAAK;AAAA,MACP,CAAC;AACD,UAAI,SAAS;AACb,UAAI,SAAS;AACb,YAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM;AAC9B,kBAAU,OAAO,CAAC;AAAA,MACpB,CAAC;AACD,YAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM;AAC9B,kBAAU,OAAO,CAAC;AAAA,MACpB,CAAC;AACD,YAAM,UAAU;AAAA,QACd,MAAM;AACJ,cAAI;AACF,kBAAM,KAAK,SAAS;AAAA,UACtB,SAAS,KAAK;AACZ,oBAAQ,KAAK,gDAAgD,GAAG;AAAA,UAClE;AAAA,QACF;AAAA,QACA,OAAO,aAAa,KAAK;AAAA,MAC3B;AACA,YAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,qBAAa,OAAO;AACpB,cAAM,SAAS,KAAK,IAAI,IAAI;AAC5B,cAAM,SACJ,UAAU,UAAU,OAAO,aACvB,OAAO,WAAW,MAAM,QAAQ,QAAQ,QAAQ,CAAC,IACjD;AACN,gBAAQ;AAAA,UACN;AAAA,UACA,UAAU,QAAQ;AAAA,UAClB;AAAA,UACA;AAAA,UACA;AAAA,UACA,YAAY,QAAQ;AAAA,UACpB,aAAa,QAAQ;AAAA,QACvB,CAAC;AAAA,MACH,CAAC;AACD,YAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,qBAAa,OAAO;AACpB,cAAM,SAAS,KAAK,IAAI,IAAI;AAC5B,gBAAQ,EAAE,OAAO,UAAU,KAAK,QAAQ,QAAQ,SAAS,OAAO,GAAG,GAAG,OAAO,CAAC;AAAA,MAChF,CAAC;AAAA,IACH,CAAC;AAAA,EACH;AACF;AAEO,IAAM,sBAAN,MAAmD;AAAA,EACxD,KAAK;AAAA,EAEL,MAAM,KACJ,OACA,SACA,QACwB;AACxB,QAAI,CAAC,OAAO,MAAO,OAAM,IAAI,YAAY,2CAA2C;AACpF,UAAM,MAAM,IAAI,wBAAwB;AACxC,UAAM,UAAU,OAAO,QAAQ,OAAO,OAAO,CAAC,CAAC,EAC5C,IAAI,CAAC,CAAC,GAAG,CAAC,MAAM,MAAM,WAAW,CAAC,CAAC,IAAI,WAAW,CAAC,CAAC,EAAE,EACtD,KAAK,GAAG;AACX,UAAM,UAAU,mBAAmB,OAAO,IAAI,WAAW,OAAO,KAAK,CAAC,UAAU,WAAW,OAAO,CAAC;AACnG,WAAO,IAAI,KAAK,OAAO,SAAS,EAAE,GAAG,QAAQ,KAAK,OAAU,CAAC;AAAA,EAC/D;AACF;AAEA,SAAS,WAAW,GAAmB;AACrC,MAAI,yBAAyB,KAAK,CAAC,EAAG,QAAO;AAC7C,SAAO,IAAI,EAAE,QAAQ,MAAM,OAAO,CAAC;AACrC;AAcO,IAAM,iBAAN,MAAqB;AAAA,EAClB;AAAA,EACR,YAAY,SAAwB,IAAI,wBAAwB,GAAG;AACjE,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAM,IAAI,QAAuB,SAAsD;AACrF,UAAM,SAAS,MAAM,QAAQ,QAAQ;AAAA,MACnC,MAAM,WAAW,KAAK,OAAO,EAAE;AAAA,MAC/B,OAAO,OAAO;AAAA,MACd,SAAS,CAAC,OAAO,cAAc,OAAO,YAAY,OAAO,WAAW,EACjE,OAAO,OAAO,EACd,KAAK,MAAM;AAAA,IAChB,CAAC;AACD,UAAM,SAA+B,EAAE,QAAQ,OAAO,aAAa,GAAG,OAAO,EAAE;AAC/E,QAAI;AACF,UAAI,OAAO,cAAc;AACvB,eAAO,QAAQ,MAAM,KAAK,OAAO,KAAK,SAAS,OAAO,cAAc,MAAM;AAC1E,eAAO,eAAe,OAAO,MAAM;AACnC,YAAI,OAAO,MAAM,aAAa,GAAG;AAC/B,gBAAM,OAAO,KAAK,sBAAsB,OAAO,MAAM,QAAQ,KAAK;AAAA,YAChE,UAAU,OAAO,MAAM;AAAA,YACvB,QAAQ,OAAO;AAAA,UACjB,CAAyB;AACzB,iBAAO;AAAA,QACT;AAAA,MACF;AACA,UAAI,OAAO,YAAY;AACrB,eAAO,MAAM,MAAM,KAAK,OAAO,KAAK,OAAO,OAAO,YAAY,MAAM;AACpE,eAAO,eAAe,OAAO,IAAI;AACjC,YAAI,OAAO,IAAI,aAAa,GAAG;AAC7B,gBAAM,OAAO,KAAK,oBAAoB,OAAO,IAAI,QAAQ,KAAK;AAAA,YAC5D,UAAU,OAAO,IAAI;AAAA,YACrB,QAAQ,OAAO;AAAA,UACjB,CAAyB;AACzB,iBAAO;AAAA,QACT;AAAA,MACF;AACA,UAAI,OAAO,aAAa;AACtB,eAAO,OAAO,MAAM,KAAK,OAAO,KAAK,QAAQ,OAAO,aAAa,MAAM;AACvE,eAAO,eAAe,OAAO,KAAK;AAClC,cAAM,SAAS,OAAO,KAAK,aAAa;AACxC,eAAO,SAAS;AAChB,YAAI,OAAO,KAAK,eAAe,UAAa,OAAO,KAAK,aAAa,GAAG;AACtE,iBAAO,SAAS,OAAO,KAAK,eAAe,KAAK,OAAO,KAAK;AAAA,QAC9D,OAAO;AACL,iBAAO,QAAQ,SAAS,IAAI;AAAA,QAC9B;AACA,cAAM,OAAO,IAAI;AAAA,UACf,UAAU,OAAO,KAAK;AAAA,UACtB,YAAY,OAAO,KAAK;AAAA,UACxB,aAAa,OAAO,KAAK;AAAA,UACzB,QAAQ,OAAO;AAAA,UACf,QAAQ,SAAS,OAAO;AAAA,QAC1B,CAAyB;AAAA,MAC3B,OAAO;AACL,eAAO,SAAS;AAChB,eAAO,QAAQ;AACf,cAAM,OAAO,IAAI,EAAE,QAAQ,OAAO,YAAY,CAAyB;AAAA,MACzE;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,OAAO,KAAK,eAAe,QAAQ,MAAM,OAAO,GAAG,CAAC;AAC1D,YAAM;AAAA,IACR;AACA,WAAO;AAAA,EACT;AACF;;;AC5QA,eAAsB,sBACpB,UACA,OACA,UAAgC,CAAC,GACH;AAC9B,QAAM,UAAU,IAAI,aAAa,KAAK;AACtC,QAAM,QAAQ,SAAS;AAAA,IACrB,YAAY,SAAS;AAAA,IACrB,WAAW,QAAQ;AAAA,IACnB,gBAAgB,SAAS;AAAA,IACzB,MAAM,SAAS;AAAA,IACf,GAAG,QAAQ;AAAA,EACb,CAAC;AACD,QAAM,UAAU,IAAI,eAAe,QAAQ,MAAM;AACjD,QAAM,SAAS,MAAM,QAAQ,IAAI,SAAS,SAAS,OAAO;AAC1D,QAAM,YAAY,SAAS,iBAAiB;AAC5C,QAAM,OAAO,OAAO,UAAU,OAAO,SAAS;AAC9C,QAAM,cAAc,OAAO,UAAU,UAAa,OAAO,MAAM,aAAa;AAC5E,QAAM,YAAY,OAAO,QAAQ,UAAa,OAAO,IAAI,aAAa;AACtE,QAAM,aAAa,OAAO,SAAS,UAAa,OAAO,KAAK,aAAa;AACzE,QAAM,eAAyC,OAC3C,YACA,eAAe,YACb,oBACA,aACE,iBACA;AACR,QAAM,QAAQ,OAAO;AAAA,IACnB;AAAA,IACA,OAAO,OAAO;AAAA,IACd;AAAA,IACA,OAAO,OAAO,SAAY,iBAAiB,MAAM;AAAA,EACnD,CAAC;AACD,SAAO;AAAA,IACL,OAAO,QAAQ;AAAA,IACf;AAAA,IACA,SAAS;AAAA,IACT;AAAA,IACA,OAAO,OAAO;AAAA,IACd;AAAA,EACF;AACF;AAEA,SAAS,iBAAiB,QAAsC;AAC9D,MAAI,OAAO,SAAS,OAAO,MAAM,aAAa;AAC5C,WAAO,sBAAsB,OAAO,MAAM,QAAQ;AACpD,MAAI,OAAO,OAAO,OAAO,IAAI,aAAa,EAAG,QAAO,oBAAoB,OAAO,IAAI,QAAQ;AAC3F,MAAI,OAAO,MAAM;AACf,QAAI,OAAO,KAAK,eAAe,QAAW;AACxC,aAAO,UAAU,OAAO,KAAK,eAAe,CAAC,IAAI,OAAO,KAAK,UAAU;AAAA,IACzE;AACA,WAAO,aAAa,OAAO,KAAK,QAAQ;AAAA,EAC1C;AACA,SAAO;AACT;","names":[]}