@alis-build/harness-eval 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/README.md +17 -4
  2. package/dist/adapters/claude-code/index.d.ts +1 -1
  3. package/dist/adapters/claude-code/index.js +1 -1
  4. package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
  5. package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
  6. package/dist/cli/bin.js +109 -12
  7. package/dist/cli/bin.js.map +1 -1
  8. package/dist/config/loader.d.ts +1 -1
  9. package/dist/config/loader.js +1 -1
  10. package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
  11. package/dist/index.d.ts +270 -152
  12. package/dist/index.js +124 -5
  13. package/dist/index.js.map +1 -0
  14. package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
  15. package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
  16. package/dist/loader-DcI0KfRX.js.map +1 -0
  17. package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
  18. package/dist/projections-BcX7w-f6.js.map +1 -0
  19. package/dist/runner/suite.d.ts +1 -1
  20. package/dist/runner/suite.js +1 -1
  21. package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
  22. package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
  23. package/dist/suite-Dlzl-HI0.js.map +1 -0
  24. package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
  25. package/package.json +4 -2
  26. package/schemas/eval-interchange-instances.schema.json +196 -0
  27. package/schemas/eval-interchange.schema.json +65 -52
  28. package/schemas/eval-run-envelope.schema.json +182 -425
  29. package/dist/build-DsVJ_UeU.js.map +0 -1
  30. package/dist/loader-BCnFJ8rm.js.map +0 -1
  31. package/dist/suite-chj0j22j.js.map +0 -1
  32. package/schemas/eval-interchange-agent-trace.schema.json +0 -322
  33. package/schemas/eval-interchange-proto-instance.schema.json +0 -106
@@ -0,0 +1 @@
1
+ {"version":3,"file":"suite-Dlzl-HI0.js","names":["predicateMatches","_exhaustive"],"sources":["../src/assertions/patterns.ts","../src/assertions/predicates.ts","../src/assertions/tool-calls.ts","../src/assertions/behavior.ts","../src/assertions/compound.ts","../src/assertions/evaluator.ts","../src/adapters/registry.ts","../src/config/resolve-config.ts","../src/runner/case.ts","../src/runner/limit.ts","../src/runner/suite.ts"],"sourcesContent":["/**\n * Tool name pattern matching.\n *\n * Tool names follow conventions:\n * - Built-in tools: `Bash`, `Read`, `Edit`, `WebSearch`, etc.\n * - MCP tools: `mcp__<server>__<tool>`, e.g. `mcp__api__search_skills`.\n *\n * Patterns support `*` as a glob wildcard. The most useful patterns for\n * the skills-loading problem are namespace globs like `mcp__api__*` —\n * \"did any tool from the alis MCP server get called.\"\n */\n\nimport type { ToolPattern } from \"../types/assertions\";\n\n/**\n * Test whether a fully-qualified tool name matches a pattern.\n *\n * Literal patterns (no `*`) match by string equality. Glob patterns are\n * compiled to a regex on each call — fine for our scale (dozens of patterns,\n * thousands of calls per run). If this becomes a hot path, memoize.\n */\nexport function toolMatches(toolName: string, pattern: ToolPattern): boolean {\n const p = patternString(pattern);\n if (!p.includes(\"*\")) return toolName === p;\n return globToRegex(p).test(toolName);\n}\n\n/** Extract the underlying string from either pattern form. */\nexport function patternString(pattern: ToolPattern): string {\n return typeof pattern === \"string\" ? pattern : pattern.pattern;\n}\n\n/** Human-readable representation for diagnostic messages. */\nexport function describePattern(pattern: ToolPattern): string {\n return patternString(pattern);\n}\n\n/**\n * Convert a glob (with `*` wildcards only) to an anchored regex.\n * Other regex metacharacters in the input are escaped.\n */\nfunction globToRegex(glob: string): RegExp {\n const escaped = glob\n .replace(/[.+?^${}()|[\\]\\\\]/g, \"\\\\$&\") // escape regex specials\n .replace(/\\*/g, \".*\"); // * → .*\n return new RegExp(`^${escaped}$`);\n}\n","/**\n * Predicate engine for matching tool call arguments.\n *\n * Conceptually similar to MongoDB query selectors: a predicate is a tree\n * of conditions, applied recursively to a value. Examples:\n *\n * matches(\"hello world\", { contains: \"world\" }) // true\n * matches({ a: 1 }, { a: { gte: 0 } }) // true\n * matches({ a: { b: \"x\" } }, { a: { b: \"x\" } }) // true (scalar shortcut)\n * matches({ q: \"ab\" }, { any_of: [{equals:\"x\"}, {contains:\"a\"}] }) // ???\n *\n * Last example: the `any_of` applies to the value (`{q:\"ab\"}`), not to a\n * field. `equals:\"x\"` and `contains:\"a\"` are both leaf predicates that\n * apply to the whole value. `contains` requires a string, so it returns\n * false for the object. The whole thing returns false. That's deliberate.\n *\n * Disambiguation rule (single-key objects): a single-key object is interpreted as a leaf or compound predicate IF\n * the key matches a known operator name. Otherwise it falls through to\n * being treated as an object predicate (field name = key).\n *\n * This means a tool argument schema cannot have a top-level field named\n * `equals`, `contains`, `regex`, `any_of`, `all_of`, `not`, etc. — those\n * fields would be shadowed by predicate operators. For MCP tools, this\n * has never been a problem in practice; document it and move on.\n */\n\nimport type { Predicate } from \"../types/assertions\";\n\nconst LEAF_OPS = new Set([\n \"equals\",\n \"contains\",\n \"not_contains\",\n \"regex\",\n \"gte\",\n \"lte\",\n \"gt\",\n \"lt\",\n \"one_of\",\n]);\nconst COMPOUND_OPS = new Set([\"any_of\", \"all_of\", \"not\"]);\n\n/**\n * Apply a predicate to a value. Returns true if the value satisfies the\n * predicate, false otherwise.\n *\n * The `predicate` parameter is typed as `unknown` because YAML deserialization\n * produces unconstrained shapes; runtime dispatch is the validation.\n */\nexport function matches(value: unknown, predicate: unknown): boolean {\n // Scalar shortcut: anything that isn't a plain object (or is an array) is\n // treated as an equality target.\n if (!isPlainObject(predicate)) {\n return deepEquals(value, predicate);\n }\n\n const obj = predicate as Record<string, unknown>;\n const keys = Object.keys(obj);\n\n // Single-key object: check if it's a known operator.\n if (keys.length === 1) {\n const key = keys[0];\n\n if (COMPOUND_OPS.has(key)) {\n switch (key) {\n case \"any_of\":\n return (obj.any_of as Predicate[]).some((sub) => matches(value, sub));\n case \"all_of\":\n return (obj.all_of as Predicate[]).every((sub) =>\n matches(value, sub),\n );\n case \"not\":\n return !matches(value, obj.not);\n }\n }\n\n if (LEAF_OPS.has(key)) {\n return matchesLeaf(value, key, obj[key]);\n }\n\n // Single key but not a known operator → object predicate (field match).\n }\n\n // Object predicate: every key is a field on `value`, every key's value is\n // a sub-predicate that must hold for the corresponding field.\n if (!isPlainObject(value)) return false;\n const valueObj = value as Record<string, unknown>;\n\n for (const [field, subPred] of Object.entries(obj)) {\n if (!matches(valueObj[field], subPred)) return false;\n }\n return true;\n}\n\n/** Apply a single leaf operator to a value. Caller guarantees `op` is in LEAF_OPS. */\nfunction matchesLeaf(value: unknown, op: string, target: unknown): boolean {\n switch (op) {\n case \"equals\":\n return deepEquals(value, target);\n case \"contains\":\n return typeof value === \"string\" && value.includes(target as string);\n case \"not_contains\":\n return typeof value === \"string\" && !value.includes(target as string);\n case \"regex\":\n if (typeof value !== \"string\" || typeof target !== \"string\") {\n return false;\n }\n try {\n return new RegExp(target).test(value);\n } catch {\n return false;\n }\n case \"gte\":\n return typeof value === \"number\" && value >= (target as number);\n case \"lte\":\n return typeof value === \"number\" && value <= (target as number);\n case \"gt\":\n return typeof value === \"number\" && value > (target as number);\n case \"lt\":\n return typeof value === \"number\" && value < (target as number);\n case \"one_of\":\n return (target as unknown[]).some((t) => deepEquals(value, t));\n default:\n throw new Error(`unknown leaf operator: ${op}`);\n }\n}\n\n/** True for non-null, non-array objects. */\nfunction isPlainObject(x: unknown): x is Record<string, unknown> {\n return typeof x === \"object\" && x !== null && !Array.isArray(x);\n}\n\n/**\n * Structural equality for unknown values. Used by `equals` and `one_of`.\n * Strict — no coercions, no NaN-equals-NaN special case (matches `===`).\n */\nfunction deepEquals(a: unknown, b: unknown): boolean {\n if (a === b) return true;\n if (typeof a !== typeof b) return false;\n if (a === null || b === null) return false;\n if (typeof a !== \"object\") return false;\n\n if (Array.isArray(a) !== Array.isArray(b)) return false;\n if (Array.isArray(a) && Array.isArray(b)) {\n if (a.length !== b.length) return false;\n return a.every((v, i) => deepEquals(v, b[i]));\n }\n\n const aObj = a as Record<string, unknown>;\n const bObj = b as Record<string, unknown>;\n const aKeys = Object.keys(aObj);\n const bKeys = Object.keys(bObj);\n if (aKeys.length !== bKeys.length) return false;\n return aKeys.every((k) => deepEquals(aObj[k], bObj[k]));\n}\n","/**\n * Tool-call assertion evaluators.\n *\n * These assertions query the `toolCalls` array on the trajectory view:\n * presence, cardinality, ordering, and argument matching.\n *\n * Ordering is done on `turnIndex`, not wall-clock time. Parallel tool calls\n * within a single assistant turn share a turnIndex, which means \"A came\n * before B\" requires A's turn to *strictly precede* B's turn — calls within\n * the same turn are considered unordered. This is the right default\n * because Claude Code dispatches parallel calls concurrently and the\n * wall-clock ordering is non-deterministic.\n */\n\nimport type { Assertion, AssertionResult } from \"../types/assertions\";\nimport type { ToolCall, TrajectoryView } from \"../types/trajectory\";\nimport { describeCardinality, parseCardinality } from \"./cardinality\";\nimport { describePattern, toolMatches } from \"./patterns\";\nimport { matches as predicateMatches } from \"./predicates\";\n\n// presence\n\n/** Assert a tool was called with optional cardinality (`times`). */\nexport function evaluateCalled(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"called\" }>,\n): AssertionResult {\n const matching = view.toolCalls.filter((c) =>\n toolMatches(c.name, assertion.tool),\n );\n const check = parseCardinality(assertion.times);\n const passed = check(matching.length);\n\n return {\n passed,\n description: `called(${describePattern(assertion.tool)}, ${describeCardinality(assertion.times)})`,\n details: passed\n ? `found ${matching.length} matching call(s)`\n : `found ${matching.length} call(s), expected ${describeCardinality(assertion.times)}`,\n matches: matching,\n };\n}\n\n/** Assert a tool was never called. */\nexport function evaluateNotCalled(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"not_called\" }>,\n): AssertionResult {\n const matching = view.toolCalls.filter((c) =>\n toolMatches(c.name, assertion.tool),\n );\n const passed = matching.length === 0;\n\n return {\n passed,\n description: `not_called(${describePattern(assertion.tool)})`,\n details: passed\n ? \"no matching calls\"\n : `found ${matching.length} forbidden call(s)`,\n matches: matching,\n };\n}\n\n/** Assert at least one of the listed tools was called. */\nexport function evaluateCalledAnyOf(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"called_any_of\" }>,\n): AssertionResult {\n const allMatches: ToolCall[] = [];\n for (const pattern of assertion.tools) {\n allMatches.push(\n ...view.toolCalls.filter((c) => toolMatches(c.name, pattern)),\n );\n }\n const passed = allMatches.length > 0;\n return {\n passed,\n description: `called_any_of(${assertion.tools.map(describePattern).join(\", \")})`,\n details: passed\n ? `${allMatches.length} matching call(s)`\n : \"no calls matched any pattern\",\n matches: allMatches,\n };\n}\n\n/** Assert every listed tool was called at least once. */\nexport function evaluateCalledAllOf(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"called_all_of\" }>,\n): AssertionResult {\n const perPattern = assertion.tools.map((p) => ({\n pattern: p,\n matches: view.toolCalls.filter((c) => toolMatches(c.name, p)),\n }));\n const missing = perPattern.filter((p) => p.matches.length === 0);\n const passed = missing.length === 0;\n\n return {\n passed,\n description: `called_all_of(${assertion.tools.map(describePattern).join(\", \")})`,\n details: passed\n ? \"all patterns matched\"\n : `missing: ${missing.map((m) => describePattern(m.pattern)).join(\", \")}`,\n matches: perPattern.flatMap((p) => p.matches),\n };\n}\n\n// ordering\n\n/** Assert `first` tool's earliest turn strictly precedes `then` tool's earliest turn. */\nexport function evaluateCalledBefore(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"called_before\" }>,\n): AssertionResult {\n const firsts = view.toolCalls.filter((c) =>\n toolMatches(c.name, assertion.first),\n );\n const thens = view.toolCalls.filter((c) =>\n toolMatches(c.name, assertion.then),\n );\n const desc = `called_before(${describePattern(assertion.first)} → ${describePattern(assertion.then)})`;\n\n if (firsts.length === 0) {\n return {\n passed: false,\n description: desc,\n details: `no calls matching first`,\n };\n }\n if (thens.length === 0) {\n return {\n passed: false,\n description: desc,\n details: `no calls matching then`,\n };\n }\n\n // Earliest occurrence of each side, by turn. Strictly less than = \"before\".\n const earliestFirst = Math.min(...firsts.map((c) => c.turnIndex));\n const earliestThen = Math.min(...thens.map((c) => c.turnIndex));\n const passed = earliestFirst < earliestThen;\n\n return {\n passed,\n description: desc,\n details: passed\n ? `first @ turn ${earliestFirst}, then @ turn ${earliestThen}`\n : `first @ turn ${earliestFirst}, then @ turn ${earliestThen} (not before)`,\n matches: [...firsts, ...thens],\n };\n}\n\n/**\n * Assert tools appear in order.\n *\n * Non-strict mode allows interleaved calls; strict mode requires a contiguous subsequence.\n */\nexport function evaluateSequence(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"sequence\" }>,\n): AssertionResult {\n const { tools, strict = false } = assertion;\n const desc = `sequence([${tools.map(describePattern).join(\" → \")}]${strict ? \", strict\" : \"\"})`;\n\n if (tools.length === 0) {\n return {\n passed: true,\n description: desc,\n details: \"empty sequence trivially matches\",\n };\n }\n\n if (strict) {\n // Strict: the tools must appear in exact order with no other tool calls\n // interleaved. We look for a contiguous subsequence of the right shape.\n if (view.toolCalls.length < tools.length) {\n return {\n passed: false,\n description: desc,\n details: \"not enough tool calls\",\n };\n }\n for (\n let start = 0;\n start <= view.toolCalls.length - tools.length;\n start++\n ) {\n let ok = true;\n for (let i = 0; i < tools.length; i++) {\n if (!toolMatches(view.toolCalls[start + i].name, tools[i])) {\n ok = false;\n break;\n }\n }\n if (ok) {\n return {\n passed: true,\n description: desc,\n details: `matched at positions ${start}..${start + tools.length - 1}`,\n matches: view.toolCalls.slice(start, start + tools.length),\n };\n }\n }\n return { passed: false, description: desc, details: \"no contiguous match\" };\n }\n\n // Non-strict: tools must appear in order, interleaved calls allowed.\n // Walk the tool call list once, advancing the sequence pointer on each match.\n let idx = 0;\n const matched: ToolCall[] = [];\n for (const call of view.toolCalls) {\n if (idx < tools.length && toolMatches(call.name, tools[idx])) {\n matched.push(call);\n idx++;\n }\n }\n const passed = idx === tools.length;\n return {\n passed,\n description: desc,\n details: passed ? \"matched in order\" : `matched ${idx}/${tools.length}`,\n matches: matched,\n };\n}\n\n// arguments\n\n/** Assert at least one call to `tool` had arguments matching the predicate. */\nexport function evaluateCalledWith(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"called_with\" }>,\n): AssertionResult {\n const candidates = view.toolCalls.filter((c) =>\n toolMatches(c.name, assertion.tool),\n );\n const matching = candidates.filter((c) =>\n predicateMatches(c.args, assertion.args),\n );\n const passed = matching.length > 0;\n\n let details: string;\n if (passed) {\n details = `${matching.length} call(s) with matching args`;\n } else if (candidates.length === 0) {\n details = `no calls to ${describePattern(assertion.tool)} at all`;\n } else {\n details = `${candidates.length} call(s) but none with matching args`;\n }\n\n return {\n passed,\n description: `called_with(${describePattern(assertion.tool)}, args matching predicate)`,\n details,\n matches: matching,\n };\n}\n","/**\n * Behavior and response-text assertions.\n *\n * Cover everything that isn't a tool-call query:\n * - Did the agent answer without using any tool? (the \"blind answer\" case)\n * - Did it stay within iteration / cost / time budget?\n * - What did it say its stop reason was?\n * - Does the response text contain expected substrings or match a regex?\n * - Arbitrary user-supplied predicate (escape hatch).\n */\n\nimport type { Assertion, AssertionResult } from \"../types/assertions\";\nimport type { TrajectoryView } from \"../types/trajectory\";\n\n// behavior\n\n/**\n * Was the response delivered without using any tool? This is the primary\n * failure mode detector for the skills-loading problem: when the harness\n * ignores the MCP, the trace shows zero tool calls and one terminal\n * assistant turn with finish reason `end_turn`.\n *\n * \"Without tool calls\" is defined as `toolCalls.length === 0` AND the\n * response text is non-empty (so we don't confuse \"answered blind\" with\n * \"session died before producing anything\").\n */\nexport function evaluateRespondedWithoutToolCalls(\n view: TrajectoryView,\n _assertion: Extract<Assertion, { type: \"responded_without_tool_calls\" }>,\n): AssertionResult {\n const passed = view.toolCalls.length === 0 && view.finalResponse.length > 0;\n return {\n passed,\n description: \"responded_without_tool_calls\",\n details: passed\n ? \"no tools called, response non-empty\"\n : view.toolCalls.length > 0\n ? `${view.toolCalls.length} tool call(s) made`\n : \"response was empty (session probably aborted)\",\n };\n}\n\n/** Assert the session stayed within the reported turn count. */\nexport function evaluateIterationsWithin(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"iterations_within\" }>,\n): AssertionResult {\n const n = view.usage.numTurns;\n const passed = n <= assertion.max;\n return {\n passed,\n description: `iterations_within(${assertion.max})`,\n details: `used ${n} turn(s)`,\n };\n}\n\n/** Assert total session cost in USD is within budget. */\nexport function evaluateCostWithinUsd(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"cost_within_usd\" }>,\n): AssertionResult {\n const cost = view.usage.totalCostUsd;\n const passed = cost <= assertion.max;\n return {\n passed,\n description: `cost_within_usd(${assertion.max.toFixed(4)})`,\n details: `used $${cost.toFixed(4)}`,\n };\n}\n\n/** Assert wall-clock session duration is within budget. */\nexport function evaluateDurationWithinMs(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"duration_within_ms\" }>,\n): AssertionResult {\n const ms = view.usage.durationMs;\n const passed = ms <= assertion.max;\n return {\n passed,\n description: `duration_within_ms(${assertion.max})`,\n details: `took ${ms}ms`,\n };\n}\n\n/** Assert the final stop reason matches one of the allowed values. */\nexport function evaluateFinishedWith(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"finished_with\" }>,\n): AssertionResult {\n const allowed = Array.isArray(assertion.reasons)\n ? assertion.reasons\n : [assertion.reasons];\n const actual = view.finalStopReason;\n const passed = actual !== null && allowed.includes(actual);\n return {\n passed,\n description: `finished_with(${allowed.join(\"|\")})`,\n details: `actual: ${actual ?? \"(none)\"}`,\n };\n}\n\n// response text\n\n/** Assert `finalResponse` contains the given substring. */\nexport function evaluateResponseContains(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"response_contains\" }>,\n): AssertionResult {\n const passed = view.finalResponse.includes(assertion.text);\n return {\n passed,\n description: `response_contains(${JSON.stringify(assertion.text)})`,\n details: passed ? \"text found\" : \"text not in response\",\n };\n}\n\n/** Assert `finalResponse` does not contain the given substring. */\nexport function evaluateResponseNotContains(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"response_not_contains\" }>,\n): AssertionResult {\n const passed = !view.finalResponse.includes(assertion.text);\n return {\n passed,\n description: `response_not_contains(${JSON.stringify(assertion.text)})`,\n details: passed ? \"text absent\" : \"forbidden text found\",\n };\n}\n\n/** Assert `finalResponse` matches a regular expression. */\nexport function evaluateResponseMatches(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"response_matches\" }>,\n): AssertionResult {\n // Construction may throw on a malformed regex; surface that as a failure\n // rather than crashing the whole eval run.\n let passed: boolean;\n let details: string;\n try {\n const re = new RegExp(assertion.pattern, assertion.flags);\n passed = re.test(view.finalResponse);\n details = passed ? \"pattern matched\" : \"pattern did not match\";\n } catch (err) {\n passed = false;\n details = `invalid regex: ${err instanceof Error ? err.message : String(err)}`;\n }\n return {\n passed,\n description: `response_matches(/${assertion.pattern}/${assertion.flags ?? \"\"})`,\n details,\n };\n}\n\n// escape hatch\n\n/**\n * Run an arbitrary user-supplied predicate against the view.\n *\n * Only available from programmatic test definition (the YAML loader cannot\n * produce functions). Catches thrown errors and reports them as failures so\n * one bad predicate doesn't take down a whole eval run.\n */\nexport function evaluatePredicate(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"predicate\" }>,\n): AssertionResult {\n let passed = false;\n let details: string;\n try {\n passed = assertion.fn(view);\n details = passed ? \"predicate returned true\" : \"predicate returned false\";\n } catch (err) {\n details = `predicate threw: ${err instanceof Error ? err.message : String(err)}`;\n }\n return {\n passed,\n description: assertion.description ?? \"predicate(...)\",\n details,\n };\n}\n","/**\n * Compound assertion evaluators: `any_of`, `all_of`, `not`.\n *\n * These recurse into the main evaluator. To avoid a circular import between\n * this file and `evaluator.ts`, the dispatcher is passed in as a function\n * parameter rather than imported directly. The evaluator binds itself when\n * dispatching to these.\n */\n\nimport type { Assertion, AssertionResult } from \"../types/assertions\";\nimport type { TrajectoryView } from \"../types/trajectory\";\n\n/**\n * Signature of the top-level dispatcher. Passed into compound evaluators so\n * they can recursively evaluate child assertions without a circular import.\n */\nexport type Evaluator = (\n view: TrajectoryView,\n assertion: Assertion,\n) => AssertionResult;\n\n/** Evaluate `all_of`: every child assertion must pass. */\nexport function evaluateAllOf(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"all_of\" }>,\n evaluate: Evaluator,\n): AssertionResult {\n const children = assertion.assertions.map((a) => evaluate(view, a));\n const passed = children.every((c) => c.passed);\n const failedCount = children.filter((c) => !c.passed).length;\n\n return {\n passed,\n description: `all_of (${children.length} child${children.length === 1 ? \"\" : \"ren\"})`,\n details: passed\n ? \"all passed\"\n : `${failedCount} of ${children.length} failed`,\n children,\n };\n}\n\n/** Evaluate `any_of`: at least one child assertion must pass. */\nexport function evaluateAnyOf(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"any_of\" }>,\n evaluate: Evaluator,\n): AssertionResult {\n const children = assertion.assertions.map((a) => evaluate(view, a));\n const passedCount = children.filter((c) => c.passed).length;\n const passed = passedCount > 0;\n\n return {\n passed,\n description: `any_of (${children.length} child${children.length === 1 ? \"\" : \"ren\"})`,\n details: passed ? `${passedCount} passed` : \"all failed\",\n children,\n };\n}\n\n/** Evaluate `not`: invert the inner assertion result. */\nexport function evaluateNot(\n view: TrajectoryView,\n assertion: Extract<Assertion, { type: \"not\" }>,\n evaluate: Evaluator,\n): AssertionResult {\n const child = evaluate(view, assertion.assertion);\n return {\n passed: !child.passed,\n description: `not(${child.description})`,\n details: child.passed\n ? \"inner passed (so outer fails)\"\n : \"inner failed (so outer passes)\",\n children: [child],\n };\n}\n","/**\n * Top-level assertion evaluator.\n *\n * Dispatches on the discriminant of the `Assertion` tagged union, delegating\n * to the per-kind evaluators in the sibling modules. This file deliberately\n * contains no logic of its own — keep it boring so adding a new assertion\n * type is just (a) extend the union in `types/assertions.ts`, (b) add an\n * evaluator function in the appropriate sibling, (c) add one case here.\n */\n\nimport type { Assertion, AssertionResult } from \"../types/assertions\";\nimport type { TrajectoryView } from \"../types/trajectory\";\n\nimport {\n evaluateCalled,\n evaluateCalledAllOf,\n evaluateCalledAnyOf,\n evaluateCalledBefore,\n evaluateCalledWith,\n evaluateNotCalled,\n evaluateSequence,\n} from \"./tool-calls\";\n\nimport {\n evaluateCostWithinUsd,\n evaluateDurationWithinMs,\n evaluateFinishedWith,\n evaluateIterationsWithin,\n evaluatePredicate,\n evaluateRespondedWithoutToolCalls,\n evaluateResponseContains,\n evaluateResponseMatches,\n evaluateResponseNotContains,\n} from \"./behavior\";\n\nimport { evaluateAllOf, evaluateAnyOf, evaluateNot } from \"./compound\";\n\n/**\n * Evaluate one assertion against a trajectory view.\n *\n * The switch is exhaustive — TypeScript's `never` check at the end will\n * flag any new variant added to the `Assertion` union that hasn't been\n * wired up here.\n */\nexport function evaluate(\n view: TrajectoryView,\n assertion: Assertion,\n): AssertionResult {\n switch (assertion.type) {\n // tool-call presence and ordering\n case \"called\":\n return evaluateCalled(view, assertion);\n case \"not_called\":\n return evaluateNotCalled(view, assertion);\n case \"called_any_of\":\n return evaluateCalledAnyOf(view, assertion);\n case \"called_all_of\":\n return evaluateCalledAllOf(view, assertion);\n case \"called_before\":\n return evaluateCalledBefore(view, assertion);\n case \"sequence\":\n return evaluateSequence(view, assertion);\n\n // tool-call arguments\n case \"called_with\":\n return evaluateCalledWith(view, assertion);\n\n // behavior\n case \"responded_without_tool_calls\":\n return evaluateRespondedWithoutToolCalls(view, assertion);\n case \"iterations_within\":\n return evaluateIterationsWithin(view, assertion);\n case \"cost_within_usd\":\n return evaluateCostWithinUsd(view, assertion);\n case \"duration_within_ms\":\n return evaluateDurationWithinMs(view, assertion);\n case \"finished_with\":\n return evaluateFinishedWith(view, assertion);\n\n // response text\n case \"response_contains\":\n return evaluateResponseContains(view, assertion);\n case \"response_not_contains\":\n return evaluateResponseNotContains(view, assertion);\n case \"response_matches\":\n return evaluateResponseMatches(view, assertion);\n\n // compound — pass the dispatcher in so they can recurse without\n // creating a circular import\n case \"all_of\":\n return evaluateAllOf(view, assertion, evaluate);\n case \"any_of\":\n return evaluateAnyOf(view, assertion, evaluate);\n case \"not\":\n return evaluateNot(view, assertion, evaluate);\n\n // escape hatch\n case \"predicate\":\n return evaluatePredicate(view, assertion);\n\n default: {\n // Exhaustiveness guard. If a new assertion variant is added to the\n // union and not wired into the switch above, TypeScript will fail\n // here at compile time. Don't remove this case.\n const _exhaustive: never = assertion;\n throw new Error(`unknown assertion: ${JSON.stringify(_exhaustive)}`);\n }\n }\n}\n\n/**\n * Evaluate a list of assertions independently. Used at the test-case level\n * where each top-level assertion is reported separately (and thresholded\n * separately, in the runner layer).\n */\nexport function evaluateAll(\n view: TrajectoryView,\n assertions: Assertion[],\n): AssertionResult[] {\n return assertions.map((a) => evaluate(view, a));\n}\n","/**\n * Default harness adapter registry.\n *\n * New adapters register here so the CLI and runner can resolve `adapter`\n * names from YAML without hard-coding imports at every call site.\n *\n * ## Adding a new harness adapter\n *\n * 1. **Create an adapter module** under `src/adapters/<id>/` implementing\n * {@link HarnessAdapter} from `./types`. Set `id` to match the YAML\n * `adapter` field (e.g. `\"codex\"`).\n * 2. **Nest suite config** under a camelCase key in {@link SuiteConfig}\n * (e.g. `codex: { ... }`) so each harness keeps its own options.\n * 3. **Register at startup** via {@link registerAdapter} — either in this\n * module for built-in adapters or from plugin/bootstrap code for\n * runtime extensions.\n * 4. **Reference in suite YAML** with `adapter: <id>` and the nested config\n * block; the runner calls `getAdapter(id).run(resolvedConfig)`.\n *\n * Built-in adapters are registered when this module loads. Only `claude-code`\n * ships today; future harnesses (Codex, Gemini CLI, Antigravity CLI) follow\n * the same pattern in separate tracks.\n */\n\nimport type { HarnessAdapter } from \"./types\";\nimport { claudeCodeAdapter } from \"./claude-code/index\";\n\nconst ADAPTERS: Record<string, HarnessAdapter> = {};\n\nfunction registerBuiltIn(id: string, adapter: HarnessAdapter): void {\n ADAPTERS[id] = adapter;\n}\n\nregisterBuiltIn(\"claude-code\", claudeCodeAdapter);\n\n/**\n * Register a harness adapter by id.\n *\n * Duplicate ids throw — registration is explicit so accidental overrides\n * surface immediately during startup or test setup.\n */\nexport function registerAdapter(id: string, adapter: HarnessAdapter): void {\n if (ADAPTERS[id]) {\n throw new Error(`adapter \"${id}\" is already registered`);\n }\n ADAPTERS[id] = adapter;\n}\n\n/** Return all registered adapter ids (built-in and runtime). */\nexport function listAdapters(): string[] {\n return Object.keys(ADAPTERS);\n}\n\n/** Resolve an adapter by id. Throws if unknown. */\nexport function getAdapter(id: string): HarnessAdapter {\n const adapter = ADAPTERS[id];\n if (!adapter) {\n throw new Error(\n `unknown adapter \"${id}\". Available: ${listAdapters().join(\", \")}`,\n );\n }\n return adapter;\n}\n\n/** Default adapter when YAML omits `adapter`. */\nexport const DEFAULT_ADAPTER_ID = \"claude-code\";\n\nexport function getDefaultAdapter(): HarnessAdapter {\n return getAdapter(DEFAULT_ADAPTER_ID);\n}\n","/**\n * Flatten nested suite config into harness-specific adapter config.\n *\n * Suite YAML nests adapter options under keys like `claudeCode`; adapters\n * expect a flat config object. This module merges layers and flattens per\n * adapter id.\n */\n\nimport { DEFAULT_ADAPTER_ID } from \"../adapters/registry\";\nimport type { BaseAdapterConfig } from \"../adapters/types\";\nimport type { ClaudeCodeAdapterConfig } from \"../adapters/claude-code/types\";\nimport type { SuiteConfig } from \"../adapters/types\";\n\n/** Merged config passed to {@link HarnessAdapter.run}. */\nexport type ResolvedRunConfig = BaseAdapterConfig & Record<string, unknown>;\n\n/** Merge generic suite config layers into a flat {@link ClaudeCodeAdapterConfig}. */\nexport function toClaudeCodeConfig(\n layers: SuiteConfig[],\n prompt: string,\n): ClaudeCodeAdapterConfig {\n const merged: Record<string, unknown> = {};\n for (const layer of layers) {\n const { claudeCode, ...generic } = layer;\n Object.assign(merged, generic);\n if (claudeCode && typeof claudeCode === \"object\") {\n Object.assign(merged, claudeCode);\n }\n }\n merged.prompt = prompt;\n return merged as unknown as ClaudeCodeAdapterConfig;\n}\n\n/**\n * Resolve merged suite layers into the flat config shape expected by the\n * selected harness adapter.\n */\nexport function resolveRunConfig(\n adapterId: string,\n layers: SuiteConfig[],\n prompt: string,\n): ResolvedRunConfig {\n if (adapterId === DEFAULT_ADAPTER_ID || adapterId === \"claude-code\") {\n return toClaudeCodeConfig(layers, prompt) as ResolvedRunConfig;\n }\n\n const merged: Record<string, unknown> = {};\n for (const layer of layers) {\n Object.assign(merged, layer);\n }\n merged.prompt = prompt;\n return merged as ResolvedRunConfig;\n}\n","/**\n * Case-level runner — config merge, single-repetition execution, and cell aggregation.\n *\n * The suite runner (`suite.ts`) fans out work; this module owns the per-rep\n * lifecycle: merge config layers, invoke the adapter, evaluate assertions, and\n * compute thresholded pass rates for one matrix cell.\n */\n\nimport type { AdapterDiagnostics, AdapterResult, BaseAdapterConfig } from \"../adapters/types\";\nimport { getDefaultAdapter } from \"../adapters/registry\";\nimport { resolveRunConfig } from \"../config/resolve-config\";\nimport { evaluateAll } from \"../assertions/evaluator\";\nimport type {\n AssertionStat,\n CellReport,\n MatrixCell,\n RepetitionError,\n RepetitionResult,\n TestCase,\n TestSuite,\n} from \"./types\";\n\n/** Default repetition count when `case.repetitions` is omitted. */\nexport const DEFAULT_REPETITIONS = 5;\n\n/** Default assertion pass-rate threshold when `threshold` is omitted. */\nexport const DEFAULT_THRESHOLD = 1.0;\n\n/** Injectable adapter run function (used by tests to stub harness I/O). */\nexport type AdapterRunFn = (\n config: BaseAdapterConfig & Record<string, unknown>,\n) => Promise<AdapterResult>;\n\n/**\n * Build the effective adapter config for one (suite, case, cell).\n *\n * Merge order (later wins): defaultConfig < case.config < cell.config.\n */\nexport function mergeConfig(\n suite: TestSuite,\n testCase: TestCase,\n cell: MatrixCell,\n): BaseAdapterConfig & Record<string, unknown> {\n const adapterId = suite.adapter ?? getDefaultAdapter().id;\n const layers = [\n suite.defaultConfig ?? {},\n testCase.config ?? {},\n cell.config,\n ];\n return resolveRunConfig(adapterId, layers, testCase.prompt);\n}\n\n/** Effective repetition count for a case (`case.repetitions` or default). */\nexport function getRepetitions(testCase: TestCase): number {\n return testCase.repetitions ?? DEFAULT_REPETITIONS;\n}\n\n/**\n * Run one repetition: invoke the adapter, evaluate assertions, capture errors.\n *\n * Adapter failures are returned as {@link RepetitionResult.error} rather than\n * thrown so the suite runner can continue other reps and report adapter error counts.\n */\nexport async function runRepetition(\n testCase: TestCase,\n _cell: MatrixCell,\n config: BaseAdapterConfig & Record<string, unknown>,\n repetitionIndex: number,\n run: AdapterRunFn,\n signal?: AbortSignal,\n): Promise<RepetitionResult> {\n const startTs = Date.now();\n\n try {\n const adapterResult = await run({\n ...config,\n signal: signal ?? config.signal,\n });\n\n const assertionResults = evaluateAll(\n adapterResult.view,\n testCase.assertions.map((t) => t.assertion),\n );\n\n return {\n repetitionIndex,\n adapterResult,\n error: null,\n assertionResults,\n durationMs: Date.now() - startTs,\n };\n } catch (err) {\n return {\n repetitionIndex,\n adapterResult: null,\n error: extractError(err),\n assertionResults: [],\n durationMs: Date.now() - startTs,\n };\n }\n}\n\n/**\n * Normalize thrown values into a {@link RepetitionError}.\n *\n * Preserves {@link AdapterDiagnostics} when the thrown value is an\n * {@link AdapterError} or carries a `diagnostics` property.\n */\nfunction extractError(err: unknown): RepetitionError {\n const message = err instanceof Error ? err.message : String(err);\n\n let diagnostics: Partial<AdapterDiagnostics> = {};\n if (err !== null && typeof err === \"object\" && \"diagnostics\" in err) {\n const d = (err as { diagnostics: unknown }).diagnostics;\n if (d !== null && typeof d === \"object\") {\n diagnostics = d as Partial<AdapterDiagnostics>;\n }\n }\n\n return { message, diagnostics };\n}\n\n/**\n * Roll up repetition results into a {@link CellReport}.\n *\n * Adapter errors reduce `evaluatedCount` but do not fail the cell by\n * themselves — only assertion threshold misses mark a cell as failed.\n */\nexport function aggregateCell(\n testCase: TestCase,\n cell: MatrixCell,\n repetitions: RepetitionResult[],\n): CellReport {\n const adapterErrors = repetitions.filter((r) => r.error !== null).length;\n const evaluatedReps = repetitions.filter((r) => r.error === null);\n\n const assertionStats: AssertionStat[] = testCase.assertions.map(\n (thresholded, i) => {\n const threshold = thresholded.threshold ?? DEFAULT_THRESHOLD;\n const passedCount = evaluatedReps.filter(\n (r) => r.assertionResults[i]?.passed,\n ).length;\n const evaluatedCount = evaluatedReps.length;\n const passRate = evaluatedCount === 0 ? 0 : passedCount / evaluatedCount;\n\n const description =\n evaluatedReps[0]?.assertionResults[i]?.description ??\n `(${thresholded.assertion.type})`;\n\n return {\n description,\n threshold,\n passedCount,\n evaluatedCount,\n passRate,\n meetsThreshold: evaluatedCount > 0 && passRate >= threshold,\n };\n },\n );\n\n const passed = assertionStats.every((s) => s.meetsThreshold);\n\n return {\n caseId: testCase.id,\n category: testCase.category,\n notes: testCase.notes,\n prompt: testCase.prompt,\n expectations: testCase.expectations,\n reference_trajectory: testCase.reference_trajectory,\n human_ratings: testCase.human_ratings,\n cell,\n repetitions,\n assertionStats,\n adapterErrors,\n passed,\n };\n}\n","/**\n * Promise-based concurrency limiter.\n *\n * Functionally equivalent to the `p-limit` package, inlined to avoid an\n * external dependency for ~20 lines of code.\n *\n * Usage:\n *\n * const limit = createLimit(4);\n * const results = await Promise.all(tasks.map(t => limit(() => run(t))));\n *\n * The limiter is unbounded in queue depth — it doesn't push back on the\n * caller. If you need bounded enqueue, wrap it.\n */\n\n/** A function that runs an async task under the concurrency limit. */\nexport type LimitedRunner = <T>(fn: () => Promise<T>) => Promise<T>;\n\nexport function createLimit(max: number): LimitedRunner {\n if (!Number.isInteger(max) || max < 1) {\n throw new Error(`createLimit: max must be a positive integer, got ${max}`);\n }\n\n let running = 0;\n /**\n * FIFO list of resolvers belonging to tasks waiting for a slot. When a\n * running task finishes, the next resolver is invoked to wake one waiter.\n */\n const waiters: (() => void)[] = [];\n\n return async <T>(fn: () => Promise<T>): Promise<T> => {\n // Wait for a slot. The loop guards a race where another waiter could\n // grab the slot between our `await` resolving and our increment — in\n // single-threaded JS this is theoretical, but `while` is the right shape.\n while (running >= max) {\n await new Promise<void>((resolve) => waiters.push(resolve));\n }\n running++;\n\n try {\n return await fn();\n } finally {\n running--;\n // Wake exactly one waiter per finished task. Shifting from the front\n // gives FIFO behaviour — earlier callers get slots first.\n const next = waiters.shift();\n if (next) next();\n }\n };\n}\n","/**\n * Suite-level runner — fans out (case × cell × repetition) tasks with concurrency control.\n *\n * Tasks run under a {@link createLimit} pool; results are bucketed by case and\n * cell label, sorted by repetition index, then aggregated into a\n * {@link SuiteReport}.\n */\n\nimport { getAdapter, getDefaultAdapter } from \"../adapters/registry\";\nimport {\n aggregateCell,\n getRepetitions,\n mergeConfig,\n runRepetition,\n type AdapterRunFn,\n} from \"./case\";\nimport { createLimit } from \"./limit\";\nimport type {\n CellReport,\n MatrixCell,\n RepetitionResult,\n RunSuiteOptions,\n SuiteReport,\n TestCase,\n TestSuite,\n} from \"./types\";\n\nconst DEFAULT_MAX_CONCURRENT = 4;\n\n/** One unit of concurrent work: a single repetition for a (case, cell) pair. */\ninterface Task {\n testCase: TestCase;\n cell: MatrixCell;\n repetitionIndex: number;\n}\n\n/**\n * Execute an entire test suite and return an aggregated report.\n *\n * @throws When `suite.matrix` or `suite.cases` is empty.\n */\nexport async function runSuite(\n suite: TestSuite,\n options: RunSuiteOptions = {},\n): Promise<SuiteReport> {\n if (suite.matrix.length === 0) {\n throw new Error(\"runSuite: suite.matrix must contain at least one cell\");\n }\n if (suite.cases.length === 0) {\n throw new Error(\"runSuite: suite.cases must contain at least one case\");\n }\n\n const adapter =\n options.adapter ?? getAdapter(suite.adapter ?? getDefaultAdapter().id);\n\n const run: AdapterRunFn = (config) => adapter.run(config);\n\n const maxConcurrent = options.maxConcurrent ?? DEFAULT_MAX_CONCURRENT;\n const limit = createLimit(maxConcurrent);\n const onProgress = options.onProgress;\n\n const startTs = Date.now();\n const startedAt = new Date(startTs).toISOString();\n\n const tasks: Task[] = [];\n for (const testCase of suite.cases) {\n const reps = getRepetitions(testCase);\n for (const cell of suite.matrix) {\n for (let i = 0; i < reps; i++) {\n tasks.push({ testCase, cell, repetitionIndex: i });\n }\n }\n }\n\n onProgress?.({ kind: \"suite-start\", totalReps: tasks.length });\n\n const buckets = new Map<string, RepetitionResult[]>();\n // Stable key for grouping reps belonging to the same (case, cell).\n const bucketKey = (caseId: string, cellLabel: string) =>\n `${caseId}::${cellLabel}`;\n\n for (const testCase of suite.cases) {\n for (const cell of suite.matrix) {\n buckets.set(bucketKey(testCase.id, cell.label), []);\n }\n }\n\n await Promise.all(\n tasks.map((task) =>\n limit(async () => {\n if (options.signal?.aborted) return;\n\n onProgress?.({\n kind: \"rep-start\",\n caseId: task.testCase.id,\n cellLabel: task.cell.label,\n repIndex: task.repetitionIndex,\n });\n\n const config = mergeConfig(suite, task.testCase, task.cell);\n const result = await runRepetition(\n task.testCase,\n task.cell,\n config,\n task.repetitionIndex,\n run,\n options.signal,\n );\n\n buckets.get(bucketKey(task.testCase.id, task.cell.label))!.push(result);\n\n onProgress?.({\n kind: \"rep-complete\",\n caseId: task.testCase.id,\n cellLabel: task.cell.label,\n repIndex: task.repetitionIndex,\n ok: result.error === null,\n durationMs: result.durationMs,\n toolCallCount: result.adapterResult?.view.toolCalls.length,\n assertionResults: result.assertionResults,\n errorMessage: result.error?.message,\n });\n }),\n ),\n );\n\n const cells: CellReport[] = [];\n for (const testCase of suite.cases) {\n for (const cell of suite.matrix) {\n const reps = buckets.get(bucketKey(testCase.id, cell.label)) ?? [];\n reps.sort((a, b) => a.repetitionIndex - b.repetitionIndex);\n\n const cellReport = aggregateCell(testCase, cell, reps);\n cells.push(cellReport);\n\n onProgress?.({ kind: \"cell-complete\", report: cellReport });\n }\n }\n\n const report: SuiteReport = {\n startedAt,\n durationMs: Date.now() - startTs,\n cells,\n };\n\n onProgress?.({ kind: \"suite-complete\", report });\n\n return report;\n}\n"],"mappings":";;;;;;;;;;AAqBA,SAAgB,YAAY,UAAkB,SAA+B;CAC3E,MAAM,IAAI,cAAc,OAAO;CAC/B,IAAI,CAAC,EAAE,SAAS,GAAG,GAAG,OAAO,aAAa;CAC1C,OAAO,YAAY,CAAC,CAAC,CAAC,KAAK,QAAQ;AACrC;;AAGA,SAAgB,cAAc,SAA8B;CAC1D,OAAO,OAAO,YAAY,WAAW,UAAU,QAAQ;AACzD;;AAGA,SAAgB,gBAAgB,SAA8B;CAC5D,OAAO,cAAc,OAAO;AAC9B;;;;;AAMA,SAAS,YAAY,MAAsB;CACzC,MAAM,UAAU,KACb,QAAQ,sBAAsB,MAAM,CAAC,CACrC,QAAQ,OAAO,IAAI;CACtB,OAAO,IAAI,OAAO,IAAI,QAAQ,EAAE;AAClC;;;AClBA,MAAM,2BAAW,IAAI,IAAI;CACvB;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;AACF,CAAC;AACD,MAAM,+BAAe,IAAI,IAAI;CAAC;CAAU;CAAU;AAAK,CAAC;;;;;;;;AASxD,SAAgB,QAAQ,OAAgB,WAA6B;CAGnE,IAAI,CAAC,cAAc,SAAS,GAC1B,OAAO,WAAW,OAAO,SAAS;CAGpC,MAAM,MAAM;CACZ,MAAM,OAAO,OAAO,KAAK,GAAG;CAG5B,IAAI,KAAK,WAAW,GAAG;EACrB,MAAM,MAAM,KAAK;EAEjB,IAAI,aAAa,IAAI,GAAG,GACtB,QAAQ,KAAR;GACE,KAAK,UACH,OAAQ,IAAI,OAAuB,MAAM,QAAQ,QAAQ,OAAO,GAAG,CAAC;GACtE,KAAK,UACH,OAAQ,IAAI,OAAuB,OAAO,QACxC,QAAQ,OAAO,GAAG,CACpB;GACF,KAAK,OACH,OAAO,CAAC,QAAQ,OAAO,IAAI,GAAG;EAClC;EAGF,IAAI,SAAS,IAAI,GAAG,GAClB,OAAO,YAAY,OAAO,KAAK,IAAI,IAAI;CAI3C;CAIA,IAAI,CAAC,cAAc,KAAK,GAAG,OAAO;CAClC,MAAM,WAAW;CAEjB,KAAK,MAAM,CAAC,OAAO,YAAY,OAAO,QAAQ,GAAG,GAC/C,IAAI,CAAC,QAAQ,SAAS,QAAQ,OAAO,GAAG,OAAO;CAEjD,OAAO;AACT;;AAGA,SAAS,YAAY,OAAgB,IAAY,QAA0B;CACzE,QAAQ,IAAR;EACE,KAAK,UACH,OAAO,WAAW,OAAO,MAAM;EACjC,KAAK,YACH,OAAO,OAAO,UAAU,YAAY,MAAM,SAAS,MAAgB;EACrE,KAAK,gBACH,OAAO,OAAO,UAAU,YAAY,CAAC,MAAM,SAAS,MAAgB;EACtE,KAAK;GACH,IAAI,OAAO,UAAU,YAAY,OAAO,WAAW,UACjD,OAAO;GAET,IAAI;IACF,OAAO,IAAI,OAAO,MAAM,CAAC,CAAC,KAAK,KAAK;GACtC,QAAQ;IACN,OAAO;GACT;EACF,KAAK,OACH,OAAO,OAAO,UAAU,YAAY,SAAU;EAChD,KAAK,OACH,OAAO,OAAO,UAAU,YAAY,SAAU;EAChD,KAAK,MACH,OAAO,OAAO,UAAU,YAAY,QAAS;EAC/C,KAAK,MACH,OAAO,OAAO,UAAU,YAAY,QAAS;EAC/C,KAAK,UACH,OAAQ,OAAqB,MAAM,MAAM,WAAW,OAAO,CAAC,CAAC;EAC/D,SACE,MAAM,IAAI,MAAM,0BAA0B,IAAI;CAClD;AACF;;AAGA,SAAS,cAAc,GAA0C;CAC/D,OAAO,OAAO,MAAM,YAAY,MAAM,QAAQ,CAAC,MAAM,QAAQ,CAAC;AAChE;;;;;AAMA,SAAS,WAAW,GAAY,GAAqB;CACnD,IAAI,MAAM,GAAG,OAAO;CACpB,IAAI,OAAO,MAAM,OAAO,GAAG,OAAO;CAClC,IAAI,MAAM,QAAQ,MAAM,MAAM,OAAO;CACrC,IAAI,OAAO,MAAM,UAAU,OAAO;CAElC,IAAI,MAAM,QAAQ,CAAC,MAAM,MAAM,QAAQ,CAAC,GAAG,OAAO;CAClD,IAAI,MAAM,QAAQ,CAAC,KAAK,MAAM,QAAQ,CAAC,GAAG;EACxC,IAAI,EAAE,WAAW,EAAE,QAAQ,OAAO;EAClC,OAAO,EAAE,OAAO,GAAG,MAAM,WAAW,GAAG,EAAE,EAAE,CAAC;CAC9C;CAEA,MAAM,OAAO;CACb,MAAM,OAAO;CACb,MAAM,QAAQ,OAAO,KAAK,IAAI;CAC9B,MAAM,QAAQ,OAAO,KAAK,IAAI;CAC9B,IAAI,MAAM,WAAW,MAAM,QAAQ,OAAO;CAC1C,OAAO,MAAM,OAAO,MAAM,WAAW,KAAK,IAAI,KAAK,EAAE,CAAC;AACxD;;;;AClIA,SAAgB,eACd,MACA,WACiB;CACjB,MAAM,WAAW,KAAK,UAAU,QAAQ,MACtC,YAAY,EAAE,MAAM,UAAU,IAAI,CACpC;CAEA,MAAM,SADQ,iBAAiB,UAAU,KACtB,CAAC,CAAC,SAAS,MAAM;CAEpC,OAAO;EACL;EACA,aAAa,UAAU,gBAAgB,UAAU,IAAI,EAAE,IAAI,oBAAoB,UAAU,KAAK,EAAE;EAChG,SAAS,SACL,SAAS,SAAS,OAAO,qBACzB,SAAS,SAAS,OAAO,qBAAqB,oBAAoB,UAAU,KAAK;EACrF,SAAS;CACX;AACF;;AAGA,SAAgB,kBACd,MACA,WACiB;CACjB,MAAM,WAAW,KAAK,UAAU,QAAQ,MACtC,YAAY,EAAE,MAAM,UAAU,IAAI,CACpC;CACA,MAAM,SAAS,SAAS,WAAW;CAEnC,OAAO;EACL;EACA,aAAa,cAAc,gBAAgB,UAAU,IAAI,EAAE;EAC3D,SAAS,SACL,sBACA,SAAS,SAAS,OAAO;EAC7B,SAAS;CACX;AACF;;AAGA,SAAgB,oBACd,MACA,WACiB;CACjB,MAAM,aAAyB,CAAC;CAChC,KAAK,MAAM,WAAW,UAAU,OAC9B,WAAW,KACT,GAAG,KAAK,UAAU,QAAQ,MAAM,YAAY,EAAE,MAAM,OAAO,CAAC,CAC9D;CAEF,MAAM,SAAS,WAAW,SAAS;CACnC,OAAO;EACL;EACA,aAAa,iBAAiB,UAAU,MAAM,IAAI,eAAe,CAAC,CAAC,KAAK,IAAI,EAAE;EAC9E,SAAS,SACL,GAAG,WAAW,OAAO,qBACrB;EACJ,SAAS;CACX;AACF;;AAGA,SAAgB,oBACd,MACA,WACiB;CACjB,MAAM,aAAa,UAAU,MAAM,KAAK,OAAO;EAC7C,SAAS;EACT,SAAS,KAAK,UAAU,QAAQ,MAAM,YAAY,EAAE,MAAM,CAAC,CAAC;CAC9D,EAAE;CACF,MAAM,UAAU,WAAW,QAAQ,MAAM,EAAE,QAAQ,WAAW,CAAC;CAC/D,MAAM,SAAS,QAAQ,WAAW;CAElC,OAAO;EACL;EACA,aAAa,iBAAiB,UAAU,MAAM,IAAI,eAAe,CAAC,CAAC,KAAK,IAAI,EAAE;EAC9E,SAAS,SACL,yBACA,YAAY,QAAQ,KAAK,MAAM,gBAAgB,EAAE,OAAO,CAAC,CAAC,CAAC,KAAK,IAAI;EACxE,SAAS,WAAW,SAAS,MAAM,EAAE,OAAO;CAC9C;AACF;;AAKA,SAAgB,qBACd,MACA,WACiB;CACjB,MAAM,SAAS,KAAK,UAAU,QAAQ,MACpC,YAAY,EAAE,MAAM,UAAU,KAAK,CACrC;CACA,MAAM,QAAQ,KAAK,UAAU,QAAQ,MACnC,YAAY,EAAE,MAAM,UAAU,IAAI,CACpC;CACA,MAAM,OAAO,iBAAiB,gBAAgB,UAAU,KAAK,EAAE,KAAK,gBAAgB,UAAU,IAAI,EAAE;CAEpG,IAAI,OAAO,WAAW,GACpB,OAAO;EACL,QAAQ;EACR,aAAa;EACb,SAAS;CACX;CAEF,IAAI,MAAM,WAAW,GACnB,OAAO;EACL,QAAQ;EACR,aAAa;EACb,SAAS;CACX;CAIF,MAAM,gBAAgB,KAAK,IAAI,GAAG,OAAO,KAAK,MAAM,EAAE,SAAS,CAAC;CAChE,MAAM,eAAe,KAAK,IAAI,GAAG,MAAM,KAAK,MAAM,EAAE,SAAS,CAAC;CAC9D,MAAM,SAAS,gBAAgB;CAE/B,OAAO;EACL;EACA,aAAa;EACb,SAAS,SACL,gBAAgB,cAAc,gBAAgB,iBAC9C,gBAAgB,cAAc,gBAAgB,aAAa;EAC/D,SAAS,CAAC,GAAG,QAAQ,GAAG,KAAK;CAC/B;AACF;;;;;;AAOA,SAAgB,iBACd,MACA,WACiB;CACjB,MAAM,EAAE,OAAO,SAAS,UAAU;CAClC,MAAM,OAAO,aAAa,MAAM,IAAI,eAAe,CAAC,CAAC,KAAK,KAAK,EAAE,GAAG,SAAS,aAAa,GAAG;CAE7F,IAAI,MAAM,WAAW,GACnB,OAAO;EACL,QAAQ;EACR,aAAa;EACb,SAAS;CACX;CAGF,IAAI,QAAQ;EAGV,IAAI,KAAK,UAAU,SAAS,MAAM,QAChC,OAAO;GACL,QAAQ;GACR,aAAa;GACb,SAAS;EACX;EAEF,KACE,IAAI,QAAQ,GACZ,SAAS,KAAK,UAAU,SAAS,MAAM,QACvC,SACA;GACA,IAAI,KAAK;GACT,KAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAChC,IAAI,CAAC,YAAY,KAAK,UAAU,QAAQ,EAAE,CAAC,MAAM,MAAM,EAAE,GAAG;IAC1D,KAAK;IACL;GACF;GAEF,IAAI,IACF,OAAO;IACL,QAAQ;IACR,aAAa;IACb,SAAS,wBAAwB,MAAM,IAAI,QAAQ,MAAM,SAAS;IAClE,SAAS,KAAK,UAAU,MAAM,OAAO,QAAQ,MAAM,MAAM;GAC3D;EAEJ;EACA,OAAO;GAAE,QAAQ;GAAO,aAAa;GAAM,SAAS;EAAsB;CAC5E;CAIA,IAAI,MAAM;CACV,MAAM,UAAsB,CAAC;CAC7B,KAAK,MAAM,QAAQ,KAAK,WACtB,IAAI,MAAM,MAAM,UAAU,YAAY,KAAK,MAAM,MAAM,IAAI,GAAG;EAC5D,QAAQ,KAAK,IAAI;EACjB;CACF;CAEF,MAAM,SAAS,QAAQ,MAAM;CAC7B,OAAO;EACL;EACA,aAAa;EACb,SAAS,SAAS,qBAAqB,WAAW,IAAI,GAAG,MAAM;EAC/D,SAAS;CACX;AACF;;AAKA,SAAgB,mBACd,MACA,WACiB;CACjB,MAAM,aAAa,KAAK,UAAU,QAAQ,MACxC,YAAY,EAAE,MAAM,UAAU,IAAI,CACpC;CACA,MAAM,WAAW,WAAW,QAAQ,MAClCA,QAAiB,EAAE,MAAM,UAAU,IAAI,CACzC;CACA,MAAM,SAAS,SAAS,SAAS;CAEjC,IAAI;CACJ,IAAI,QACF,UAAU,GAAG,SAAS,OAAO;MACxB,IAAI,WAAW,WAAW,GAC/B,UAAU,eAAe,gBAAgB,UAAU,IAAI,EAAE;MAEzD,UAAU,GAAG,WAAW,OAAO;CAGjC,OAAO;EACL;EACA,aAAa,eAAe,gBAAgB,UAAU,IAAI,EAAE;EAC5D;EACA,SAAS;CACX;AACF;;;;;;;;;;;;;ACrOA,SAAgB,kCACd,MACA,YACiB;CACjB,MAAM,SAAS,KAAK,UAAU,WAAW,KAAK,KAAK,cAAc,SAAS;CAC1E,OAAO;EACL;EACA,aAAa;EACb,SAAS,SACL,wCACA,KAAK,UAAU,SAAS,IACtB,GAAG,KAAK,UAAU,OAAO,sBACzB;CACR;AACF;;AAGA,SAAgB,yBACd,MACA,WACiB;CACjB,MAAM,IAAI,KAAK,MAAM;CAErB,OAAO;EACL,QAFa,KAAK,UAAU;EAG5B,aAAa,qBAAqB,UAAU,IAAI;EAChD,SAAS,QAAQ,EAAE;CACrB;AACF;;AAGA,SAAgB,sBACd,MACA,WACiB;CACjB,MAAM,OAAO,KAAK,MAAM;CAExB,OAAO;EACL,QAFa,QAAQ,UAAU;EAG/B,aAAa,mBAAmB,UAAU,IAAI,QAAQ,CAAC,EAAE;EACzD,SAAS,SAAS,KAAK,QAAQ,CAAC;CAClC;AACF;;AAGA,SAAgB,yBACd,MACA,WACiB;CACjB,MAAM,KAAK,KAAK,MAAM;CAEtB,OAAO;EACL,QAFa,MAAM,UAAU;EAG7B,aAAa,sBAAsB,UAAU,IAAI;EACjD,SAAS,QAAQ,GAAG;CACtB;AACF;;AAGA,SAAgB,qBACd,MACA,WACiB;CACjB,MAAM,UAAU,MAAM,QAAQ,UAAU,OAAO,IAC3C,UAAU,UACV,CAAC,UAAU,OAAO;CACtB,MAAM,SAAS,KAAK;CAEpB,OAAO;EACL,QAFa,WAAW,QAAQ,QAAQ,SAAS,MAAM;EAGvD,aAAa,iBAAiB,QAAQ,KAAK,GAAG,EAAE;EAChD,SAAS,WAAW,UAAU;CAChC;AACF;;AAKA,SAAgB,yBACd,MACA,WACiB;CACjB,MAAM,SAAS,KAAK,cAAc,SAAS,UAAU,IAAI;CACzD,OAAO;EACL;EACA,aAAa,qBAAqB,KAAK,UAAU,UAAU,IAAI,EAAE;EACjE,SAAS,SAAS,eAAe;CACnC;AACF;;AAGA,SAAgB,4BACd,MACA,WACiB;CACjB,MAAM,SAAS,CAAC,KAAK,cAAc,SAAS,UAAU,IAAI;CAC1D,OAAO;EACL;EACA,aAAa,yBAAyB,KAAK,UAAU,UAAU,IAAI,EAAE;EACrE,SAAS,SAAS,gBAAgB;CACpC;AACF;;AAGA,SAAgB,wBACd,MACA,WACiB;CAGjB,IAAI;CACJ,IAAI;CACJ,IAAI;EAEF,SAAS,IADM,OAAO,UAAU,SAAS,UAAU,KACzC,CAAC,CAAC,KAAK,KAAK,aAAa;EACnC,UAAU,SAAS,oBAAoB;CACzC,SAAS,KAAK;EACZ,SAAS;EACT,UAAU,kBAAkB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;CAC7E;CACA,OAAO;EACL;EACA,aAAa,qBAAqB,UAAU,QAAQ,GAAG,UAAU,SAAS,GAAG;EAC7E;CACF;AACF;;;;;;;;AAWA,SAAgB,kBACd,MACA,WACiB;CACjB,IAAI,SAAS;CACb,IAAI;CACJ,IAAI;EACF,SAAS,UAAU,GAAG,IAAI;EAC1B,UAAU,SAAS,4BAA4B;CACjD,SAAS,KAAK;EACZ,UAAU,oBAAoB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;CAC/E;CACA,OAAO;EACL;EACA,aAAa,UAAU,eAAe;EACtC;CACF;AACF;;;;AC7JA,SAAgB,cACd,MACA,WACA,UACiB;CACjB,MAAM,WAAW,UAAU,WAAW,KAAK,MAAM,SAAS,MAAM,CAAC,CAAC;CAClE,MAAM,SAAS,SAAS,OAAO,MAAM,EAAE,MAAM;CAC7C,MAAM,cAAc,SAAS,QAAQ,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC;CAEtD,OAAO;EACL;EACA,aAAa,WAAW,SAAS,OAAO,QAAQ,SAAS,WAAW,IAAI,KAAK,MAAM;EACnF,SAAS,SACL,eACA,GAAG,YAAY,MAAM,SAAS,OAAO;EACzC;CACF;AACF;;AAGA,SAAgB,cACd,MACA,WACA,UACiB;CACjB,MAAM,WAAW,UAAU,WAAW,KAAK,MAAM,SAAS,MAAM,CAAC,CAAC;CAClE,MAAM,cAAc,SAAS,QAAQ,MAAM,EAAE,MAAM,CAAC,CAAC;CACrD,MAAM,SAAS,cAAc;CAE7B,OAAO;EACL;EACA,aAAa,WAAW,SAAS,OAAO,QAAQ,SAAS,WAAW,IAAI,KAAK,MAAM;EACnF,SAAS,SAAS,GAAG,YAAY,WAAW;EAC5C;CACF;AACF;;AAGA,SAAgB,YACd,MACA,WACA,UACiB;CACjB,MAAM,QAAQ,SAAS,MAAM,UAAU,SAAS;CAChD,OAAO;EACL,QAAQ,CAAC,MAAM;EACf,aAAa,OAAO,MAAM,YAAY;EACtC,SAAS,MAAM,SACX,kCACA;EACJ,UAAU,CAAC,KAAK;CAClB;AACF;;;;;;;;;;AC9BA,SAAgB,SACd,MACA,WACiB;CACjB,QAAQ,UAAU,MAAlB;EAEE,KAAK,UACH,OAAO,eAAe,MAAM,SAAS;EACvC,KAAK,cACH,OAAO,kBAAkB,MAAM,SAAS;EAC1C,KAAK,iBACH,OAAO,oBAAoB,MAAM,SAAS;EAC5C,KAAK,iBACH,OAAO,oBAAoB,MAAM,SAAS;EAC5C,KAAK,iBACH,OAAO,qBAAqB,MAAM,SAAS;EAC7C,KAAK,YACH,OAAO,iBAAiB,MAAM,SAAS;EAGzC,KAAK,eACH,OAAO,mBAAmB,MAAM,SAAS;EAG3C,KAAK,gCACH,OAAO,kCAAkC,MAAM,SAAS;EAC1D,KAAK,qBACH,OAAO,yBAAyB,MAAM,SAAS;EACjD,KAAK,mBACH,OAAO,sBAAsB,MAAM,SAAS;EAC9C,KAAK,sBACH,OAAO,yBAAyB,MAAM,SAAS;EACjD,KAAK,iBACH,OAAO,qBAAqB,MAAM,SAAS;EAG7C,KAAK,qBACH,OAAO,yBAAyB,MAAM,SAAS;EACjD,KAAK,yBACH,OAAO,4BAA4B,MAAM,SAAS;EACpD,KAAK,oBACH,OAAO,wBAAwB,MAAM,SAAS;EAIhD,KAAK,UACH,OAAO,cAAc,MAAM,WAAW,QAAQ;EAChD,KAAK,UACH,OAAO,cAAc,MAAM,WAAW,QAAQ;EAChD,KAAK,OACH,OAAO,YAAY,MAAM,WAAW,QAAQ;EAG9C,KAAK,aACH,OAAO,kBAAkB,MAAM,SAAS;EAE1C,SAKE,MAAM,IAAI,MAAM,sBAAsB,KAAK,UAAUC,SAAW,GAAG;CAEvE;AACF;;;;;;AAOA,SAAgB,YACd,MACA,YACmB;CACnB,OAAO,WAAW,KAAK,MAAM,SAAS,MAAM,CAAC,CAAC;AAChD;;;AC7FA,MAAM,WAA2C,CAAC;AAElD,SAAS,gBAAgB,IAAY,SAA+B;CAClE,SAAS,MAAM;AACjB;AAEA,gBAAgB,eAAe,iBAAiB;;;;;;;AAQhD,SAAgB,gBAAgB,IAAY,SAA+B;CACzE,IAAI,SAAS,KACX,MAAM,IAAI,MAAM,YAAY,GAAG,wBAAwB;CAEzD,SAAS,MAAM;AACjB;;AAGA,SAAgB,eAAyB;CACvC,OAAO,OAAO,KAAK,QAAQ;AAC7B;;AAGA,SAAgB,WAAW,IAA4B;CACrD,MAAM,UAAU,SAAS;CACzB,IAAI,CAAC,SACH,MAAM,IAAI,MACR,oBAAoB,GAAG,gBAAgB,aAAa,CAAC,CAAC,KAAK,IAAI,GACjE;CAEF,OAAO;AACT;;AAGA,MAAa,qBAAqB;AAElC,SAAgB,oBAAoC;CAClD,OAAO,WAAW,kBAAkB;AACtC;;;;;;;;;;;ACpDA,SAAgB,mBACd,QACA,QACyB;CACzB,MAAM,SAAkC,CAAC;CACzC,KAAK,MAAM,SAAS,QAAQ;EAC1B,MAAM,EAAE,YAAY,GAAG,YAAY;EACnC,OAAO,OAAO,QAAQ,OAAO;EAC7B,IAAI,cAAc,OAAO,eAAe,UACtC,OAAO,OAAO,QAAQ,UAAU;CAEpC;CACA,OAAO,SAAS;CAChB,OAAO;AACT;;;;;AAMA,SAAgB,iBACd,WACA,QACA,QACmB;CACnB,IAAI,cAAA,iBAAoC,cAAc,eACpD,OAAO,mBAAmB,QAAQ,MAAM;CAG1C,MAAM,SAAkC,CAAC;CACzC,KAAK,MAAM,SAAS,QAClB,OAAO,OAAO,QAAQ,KAAK;CAE7B,OAAO,SAAS;CAChB,OAAO;AACT;;;;AC7BA,MAAa,sBAAsB;;AAGnC,MAAa,oBAAoB;;;;;;AAYjC,SAAgB,YACd,OACA,UACA,MAC6C;CAO7C,OAAO,iBANW,MAAM,WAAW,kBAAkB,CAAC,CAAC,IAMpB;EAJjC,MAAM,iBAAiB,CAAC;EACxB,SAAS,UAAU,CAAC;EACpB,KAAK;CAEiC,GAAG,SAAS,MAAM;AAC5D;;AAGA,SAAgB,eAAe,UAA4B;CACzD,OAAO,SAAS,eAAA;AAClB;;;;;;;AAQA,eAAsB,cACpB,UACA,OACA,QACA,iBACA,KACA,QAC2B;CAC3B,MAAM,UAAU,KAAK,IAAI;CAEzB,IAAI;EACF,MAAM,gBAAgB,MAAM,IAAI;GAC9B,GAAG;GACH,QAAQ,UAAU,OAAO;EAC3B,CAAC;EAOD,OAAO;GACL;GACA;GACA,OAAO;GACP,kBATuB,YACvB,cAAc,MACd,SAAS,WAAW,KAAK,MAAM,EAAE,SAAS,CAO3B;GACf,YAAY,KAAK,IAAI,IAAI;EAC3B;CACF,SAAS,KAAK;EACZ,OAAO;GACL;GACA,eAAe;GACf,OAAO,aAAa,GAAG;GACvB,kBAAkB,CAAC;GACnB,YAAY,KAAK,IAAI,IAAI;EAC3B;CACF;AACF;;;;;;;AAQA,SAAS,aAAa,KAA+B;CACnD,MAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;CAE/D,IAAI,cAA2C,CAAC;CAChD,IAAI,QAAQ,QAAQ,OAAO,QAAQ,YAAY,iBAAiB,KAAK;EACnE,MAAM,IAAK,IAAiC;EAC5C,IAAI,MAAM,QAAQ,OAAO,MAAM,UAC7B,cAAc;CAElB;CAEA,OAAO;EAAE;EAAS;CAAY;AAChC;;;;;;;AAQA,SAAgB,cACd,UACA,MACA,aACY;CACZ,MAAM,gBAAgB,YAAY,QAAQ,MAAM,EAAE,UAAU,IAAI,CAAC,CAAC;CAClE,MAAM,gBAAgB,YAAY,QAAQ,MAAM,EAAE,UAAU,IAAI;CAEhE,MAAM,iBAAkC,SAAS,WAAW,KACzD,aAAa,MAAM;EAClB,MAAM,YAAY,YAAY,aAAA;EAC9B,MAAM,cAAc,cAAc,QAC/B,MAAM,EAAE,iBAAiB,EAAE,EAAE,MAChC,CAAC,CAAC;EACF,MAAM,iBAAiB,cAAc;EACrC,MAAM,WAAW,mBAAmB,IAAI,IAAI,cAAc;EAM1D,OAAO;GACL,aAJA,cAAc,EAAE,EAAE,iBAAiB,EAAE,EAAE,eACvC,IAAI,YAAY,UAAU,KAAK;GAI/B;GACA;GACA;GACA;GACA,gBAAgB,iBAAiB,KAAK,YAAY;EACpD;CACF,CACF;CAEA,MAAM,SAAS,eAAe,OAAO,MAAM,EAAE,cAAc;CAE3D,OAAO;EACL,QAAQ,SAAS;EACjB,UAAU,SAAS;EACnB,OAAO,SAAS;EAChB,QAAQ,SAAS;EACjB,cAAc,SAAS;EACvB,sBAAsB,SAAS;EAC/B,eAAe,SAAS;EACxB;EACA;EACA;EACA;EACA;CACF;AACF;;;AC9JA,SAAgB,YAAY,KAA4B;CACtD,IAAI,CAAC,OAAO,UAAU,GAAG,KAAK,MAAM,GAClC,MAAM,IAAI,MAAM,oDAAoD,KAAK;CAG3E,IAAI,UAAU;;;;;CAKd,MAAM,UAA0B,CAAC;CAEjC,OAAO,OAAU,OAAqC;EAIpD,OAAO,WAAW,KAChB,MAAM,IAAI,SAAe,YAAY,QAAQ,KAAK,OAAO,CAAC;EAE5D;EAEA,IAAI;GACF,OAAO,MAAM,GAAG;EAClB,UAAU;GACR;GAGA,MAAM,OAAO,QAAQ,MAAM;GAC3B,IAAI,MAAM,KAAK;EACjB;CACF;AACF;;;;;;;;;;ACtBA,MAAM,yBAAyB;;;;;;AAc/B,eAAsB,SACpB,OACA,UAA2B,CAAC,GACN;CACtB,IAAI,MAAM,OAAO,WAAW,GAC1B,MAAM,IAAI,MAAM,uDAAuD;CAEzE,IAAI,MAAM,MAAM,WAAW,GACzB,MAAM,IAAI,MAAM,sDAAsD;CAGxE,MAAM,UACJ,QAAQ,WAAW,WAAW,MAAM,WAAW,kBAAkB,CAAC,CAAC,EAAE;CAEvE,MAAM,OAAqB,WAAW,QAAQ,IAAI,MAAM;CAGxD,MAAM,QAAQ,YADQ,QAAQ,iBAAiB,sBACR;CACvC,MAAM,aAAa,QAAQ;CAE3B,MAAM,UAAU,KAAK,IAAI;CACzB,MAAM,YAAY,IAAI,KAAK,OAAO,CAAC,CAAC,YAAY;CAEhD,MAAM,QAAgB,CAAC;CACvB,KAAK,MAAM,YAAY,MAAM,OAAO;EAClC,MAAM,OAAO,eAAe,QAAQ;EACpC,KAAK,MAAM,QAAQ,MAAM,QACvB,KAAK,IAAI,IAAI,GAAG,IAAI,MAAM,KACxB,MAAM,KAAK;GAAE;GAAU;GAAM,iBAAiB;EAAE,CAAC;CAGvD;CAEA,aAAa;EAAE,MAAM;EAAe,WAAW,MAAM;CAAO,CAAC;CAE7D,MAAM,0BAAU,IAAI,IAAgC;CAEpD,MAAM,aAAa,QAAgB,cACjC,GAAG,OAAO,IAAI;CAEhB,KAAK,MAAM,YAAY,MAAM,OAC3B,KAAK,MAAM,QAAQ,MAAM,QACvB,QAAQ,IAAI,UAAU,SAAS,IAAI,KAAK,KAAK,GAAG,CAAC,CAAC;CAItD,MAAM,QAAQ,IACZ,MAAM,KAAK,SACT,MAAM,YAAY;EAChB,IAAI,QAAQ,QAAQ,SAAS;EAE7B,aAAa;GACX,MAAM;GACN,QAAQ,KAAK,SAAS;GACtB,WAAW,KAAK,KAAK;GACrB,UAAU,KAAK;EACjB,CAAC;EAED,MAAM,SAAS,YAAY,OAAO,KAAK,UAAU,KAAK,IAAI;EAC1D,MAAM,SAAS,MAAM,cACnB,KAAK,UACL,KAAK,MACL,QACA,KAAK,iBACL,KACA,QAAQ,MACV;EAEA,QAAQ,IAAI,UAAU,KAAK,SAAS,IAAI,KAAK,KAAK,KAAK,CAAC,CAAC,CAAE,KAAK,MAAM;EAEtE,aAAa;GACX,MAAM;GACN,QAAQ,KAAK,SAAS;GACtB,WAAW,KAAK,KAAK;GACrB,UAAU,KAAK;GACf,IAAI,OAAO,UAAU;GACrB,YAAY,OAAO;GACnB,eAAe,OAAO,eAAe,KAAK,UAAU;GACpD,kBAAkB,OAAO;GACzB,cAAc,OAAO,OAAO;EAC9B,CAAC;CACH,CAAC,CACH,CACF;CAEA,MAAM,QAAsB,CAAC;CAC7B,KAAK,MAAM,YAAY,MAAM,OAC3B,KAAK,MAAM,QAAQ,MAAM,QAAQ;EAC/B,MAAM,OAAO,QAAQ,IAAI,UAAU,SAAS,IAAI,KAAK,KAAK,CAAC,KAAK,CAAC;EACjE,KAAK,MAAM,GAAG,MAAM,EAAE,kBAAkB,EAAE,eAAe;EAEzD,MAAM,aAAa,cAAc,UAAU,MAAM,IAAI;EACrD,MAAM,KAAK,UAAU;EAErB,aAAa;GAAE,MAAM;GAAiB,QAAQ;EAAW,CAAC;CAC5D;CAGF,MAAM,SAAsB;EAC1B;EACA,YAAY,KAAK,IAAI,IAAI;EACzB;CACF;CAEA,aAAa;EAAE,MAAM;EAAkB;CAAO,CAAC;CAE/C,OAAO;AACT"}
@@ -177,7 +177,134 @@ interface AssertionResult {
177
177
  children?: AssertionResult[];
178
178
  }
179
179
  //#endregion
180
+ //#region src/types/eval-interchange.d.ts
181
+ /**
182
+ * Vertex AI EvaluationService protojson wire types for eval interchange.
183
+ *
184
+ * These types mirror the JSON shape produced by protobuf's protojson encoding
185
+ * for Vertex trajectory and evaluation instance messages. Field names use
186
+ * camelCase (protojson default) rather than harness-eval's snake_case YAML.
187
+ *
188
+ * The interchange layer (`src/eval-interchange/`) converts harness
189
+ * {@link TrajectoryView} and suite reference trajectories into these wire
190
+ * objects so envelopes can be fed to Vertex EvaluateInstances without a second
191
+ * transformation step.
192
+ *
193
+ * @see https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/evaluation
194
+ */
195
+ /** How suite reference tool names are normalized before protojson export. */
196
+ type ReferenceToolNameMode = "harness" | "bare";
197
+ /**
198
+ * Suite YAML reference trajectory block.
199
+ *
200
+ * When `tool_name_mode` is `"bare"`, MCP-style names are stripped to the suffix
201
+ * after the last `__` on both predicted and reference trajectories before
202
+ * metrics and Vertex instance export.
203
+ */
204
+ interface ReferenceTrajectoryConfig {
205
+ tool_name_mode?: ReferenceToolNameMode;
206
+ steps: Array<{
207
+ tool_name: string;
208
+ tool_input: unknown;
209
+ }>;
210
+ }
211
+ /** One tool call in Vertex Trajectory protojson wire format. */
212
+ interface ProtojsonToolCall {
213
+ toolName: string;
214
+ /** JSON-serialized tool arguments string (not a parsed object). */
215
+ toolInput: string;
216
+ }
217
+ /** Ordered tool-call sequence in Vertex Trajectory wire format. */
218
+ interface ProtojsonTrajectory {
219
+ toolCalls: ProtojsonToolCall[];
220
+ }
221
+ /** Text wrapper for EvaluationInstance prompt/response/reference fields. */
222
+ interface InstanceData {
223
+ text?: string;
224
+ }
225
+ /**
226
+ * Vertex EvaluationInstance protojson wire object.
227
+ *
228
+ * Used for prompt/response grading and as a lightweight row in trajectory
229
+ * projection JSONL. `agentEvalData` is omitted in v1.
230
+ */
231
+ interface EvaluationInstanceJson {
232
+ prompt?: InstanceData;
233
+ response?: InstanceData;
234
+ reference?: InstanceData;
235
+ }
236
+ /**
237
+ * Vertex Trajectory*Instance messages keyed by metric name.
238
+ *
239
+ * Each key maps to the instance payload expected by EvaluateInstances for
240
+ * that metric. Pair metrics share {@link TrajectoryPairInstanceJson}; single
241
+ * tool use uses {@link TrajectorySingleToolUseInstanceJson}.
242
+ */
243
+ interface TrajectoryInstancesJson {
244
+ exactMatch?: TrajectoryPairInstanceJson;
245
+ inOrderMatch?: TrajectoryPairInstanceJson;
246
+ anyOrderMatch?: TrajectoryPairInstanceJson;
247
+ precision?: TrajectoryPairInstanceJson;
248
+ recall?: TrajectoryPairInstanceJson;
249
+ singleToolUse?: TrajectorySingleToolUseInstanceJson;
250
+ }
251
+ /** Shared shape for trajectory match, precision, and recall instances. */
252
+ interface TrajectoryPairInstanceJson {
253
+ predictedTrajectory: ProtojsonTrajectory;
254
+ referenceTrajectory: ProtojsonTrajectory;
255
+ }
256
+ /** Vertex TrajectorySingleToolUseInstance — predicted trajectory only. */
257
+ interface TrajectorySingleToolUseInstanceJson {
258
+ predictedTrajectory: ProtojsonTrajectory;
259
+ }
260
+ /**
261
+ * Harness-precomputed trajectory metric scores in camelCase.
262
+ *
263
+ * Values mirror {@link computeTrajectoryMetrics} output but use Vertex-style
264
+ * field names for interchange with external dashboards.
265
+ */
266
+ interface HarnessMetrics {
267
+ trajectoryExactMatch: number;
268
+ trajectoryInOrderMatch: number;
269
+ trajectoryAnyOrderMatch: number;
270
+ trajectoryPrecision: number;
271
+ trajectoryRecall: number;
272
+ trajectorySingleToolUse: number;
273
+ }
274
+ /** Keys of {@link TrajectoryInstancesJson} that carry instance payloads. */
275
+ type TrajectoryInstanceMetricKey = keyof TrajectoryInstancesJson;
276
+ /**
277
+ * One JSONL row for Vertex EvaluateInstances batch upload.
278
+ *
279
+ * `messageType` is the protobuf message name (e.g. `TrajectoryExactMatchInstance`).
280
+ */
281
+ interface InstancesJsonlRow {
282
+ messageType: string;
283
+ caseId: string;
284
+ repetitionIndex: number;
285
+ instance: TrajectoryPairInstanceJson | TrajectorySingleToolUseInstanceJson | EvaluationInstanceJson;
286
+ }
287
+ /**
288
+ * Flattened eval row for trajectory projection JSONL.
289
+ *
290
+ * One row per repetition — suitable for BigQuery or custom analytics pipelines
291
+ * without nesting under envelope cells.
292
+ */
293
+ interface EvalDatasetRow {
294
+ caseId: string;
295
+ repetitionIndex: number;
296
+ prompt?: string;
297
+ response?: string;
298
+ evaluationInstance?: EvaluationInstanceJson;
299
+ /** Session latency in seconds (Vertex convention). */
300
+ latencySeconds: number;
301
+ /** 1 when the harness run failed, 0 on success. */
302
+ failure: 0 | 1;
303
+ humanRatings?: Record<string, number>;
304
+ }
305
+ //#endregion
180
306
  //#region src/runner/types.d.ts
307
+ /** One eval prompt with assertions and optional grading metadata. */
181
308
  interface TestCase {
182
309
  id: string;
183
310
  prompt: string;
@@ -187,20 +314,23 @@ interface TestCase {
187
314
  /** Natural-language outcome checks for LLM grading (see `harness-eval grade`). */
188
315
  expectations?: string[];
189
316
  /** Reference tool-call trajectory for metric computation. */
190
- reference_trajectory?: Array<{
191
- tool_name: string;
192
- tool_input: unknown;
193
- }>;
317
+ reference_trajectory?: ReferenceTrajectoryConfig;
194
318
  /** Human ratings keyed by metric name for judge calibration. */
195
319
  human_ratings?: Record<string, number>;
196
320
  repetitions?: number;
197
321
  config?: SuiteConfig;
198
322
  }
323
+ /**
324
+ * One point in the configuration matrix — a named config overlay applied to
325
+ * every case in the suite.
326
+ */
199
327
  interface MatrixCell {
200
328
  label: string;
201
329
  config: SuiteConfig;
330
+ /** Optional axis labels for reporting (e.g. `{ model: "sonnet" }`). */
202
331
  axes?: Record<string, string>;
203
332
  }
333
+ /** Loaded suite: cases crossed with a config matrix. */
204
334
  interface TestSuite {
205
335
  /** Harness adapter id. Default: `claude-code`. */
206
336
  adapter?: string;
@@ -208,6 +338,7 @@ interface TestSuite {
208
338
  matrix: MatrixCell[];
209
339
  defaultConfig?: SuiteConfig;
210
340
  }
341
+ /** Options passed to {@link runSuite}. */
211
342
  interface RunSuiteOptions {
212
343
  /** Maximum concurrent harness processes across the entire suite. Default 4. */
213
344
  maxConcurrent?: number;
@@ -216,7 +347,12 @@ interface RunSuiteOptions {
216
347
  onProgress?: ProgressCallback;
217
348
  signal?: AbortSignal;
218
349
  }
350
+ /** Callback invoked as repetitions and cells complete during a suite run. */
219
351
  type ProgressCallback = (event: ProgressEvent) => void;
352
+ /**
353
+ * Progress events emitted by the suite runner. Consumed by CLI progress
354
+ * handlers and programmatic callers that want live feedback.
355
+ */
220
356
  type ProgressEvent = {
221
357
  kind: "suite-start";
222
358
  totalReps: number;
@@ -242,6 +378,7 @@ type ProgressEvent = {
242
378
  kind: "suite-complete";
243
379
  report: SuiteReport;
244
380
  };
381
+ /** Outcome of one harness invocation (one repetition). */
245
382
  interface RepetitionResult {
246
383
  repetitionIndex: number;
247
384
  adapterResult: AdapterResult | null;
@@ -249,10 +386,15 @@ interface RepetitionResult {
249
386
  assertionResults: AssertionResult[];
250
387
  durationMs: number;
251
388
  }
389
+ /** Adapter failure for a single repetition (process crash, timeout, etc.). */
252
390
  interface RepetitionError {
253
391
  message: string;
254
392
  diagnostics: Partial<AdapterDiagnostics>;
255
393
  }
394
+ /**
395
+ * Aggregated results for one (case, matrix cell) pair across all repetitions.
396
+ * Copied onto the report for downstream grading and envelope export.
397
+ */
256
398
  interface CellReport {
257
399
  caseId: string;
258
400
  category?: string;
@@ -262,10 +404,7 @@ interface CellReport {
262
404
  /** Outcome expectations for LLM grading. */
263
405
  expectations?: string[];
264
406
  /** Reference tool-call trajectory for metric computation. */
265
- reference_trajectory?: Array<{
266
- tool_name: string;
267
- tool_input: unknown;
268
- }>;
407
+ reference_trajectory?: ReferenceTrajectoryConfig;
269
408
  /** Human ratings keyed by metric name for judge calibration. */
270
409
  human_ratings?: Record<string, number>;
271
410
  cell: MatrixCell;
@@ -274,6 +413,7 @@ interface CellReport {
274
413
  adapterErrors: number;
275
414
  passed: boolean;
276
415
  }
416
+ /** Pass-rate rollup for one thresholded assertion within a cell. */
277
417
  interface AssertionStat {
278
418
  description: string;
279
419
  threshold: number;
@@ -282,11 +422,12 @@ interface AssertionStat {
282
422
  passRate: number;
283
423
  meetsThreshold: boolean;
284
424
  }
425
+ /** Full suite run report — one {@link CellReport} per (case, cell). */
285
426
  interface SuiteReport {
286
427
  startedAt: string;
287
428
  durationMs: number;
288
429
  cells: CellReport[];
289
430
  }
290
431
  //#endregion
291
- export { ObjectPredicate as _, ProgressEvent as a, ToolPattern as b, RunSuiteOptions as c, TestSuite as d, Assertion as f, LeafPredicate as g, CompoundPredicate as h, ProgressCallback as i, SuiteReport as l, Cardinality as m, CellReport as n, RepetitionError as o, AssertionResult as p, MatrixCell as r, RepetitionResult as s, AssertionStat as t, TestCase as u, Predicate as v, ThresholdedAssertion as y };
292
- //# sourceMappingURL=types-BQol062t.d.ts.map
432
+ export { ObjectPredicate as A, TrajectoryPairInstanceJson as C, Cardinality as D, AssertionResult as E, ThresholdedAssertion as M, ToolPattern as N, CompoundPredicate as O, TrajectoryInstancesJson as S, Assertion as T, ProtojsonToolCall as _, ProgressEvent as a, ReferenceTrajectoryConfig as b, RunSuiteOptions as c, TestSuite as d, EvalDatasetRow as f, InstancesJsonlRow as g, InstanceData as h, ProgressCallback as i, Predicate as j, LeafPredicate as k, SuiteReport as l, HarnessMetrics as m, CellReport as n, RepetitionError as o, EvaluationInstanceJson as p, MatrixCell as r, RepetitionResult as s, AssertionStat as t, TestCase as u, ProtojsonTrajectory as v, TrajectorySingleToolUseInstanceJson as w, TrajectoryInstanceMetricKey as x, ReferenceToolNameMode as y };
433
+ //# sourceMappingURL=types-CD3TwOtZ.d.ts.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@alis-build/harness-eval",
3
- "version": "0.1.0",
3
+ "version": "0.1.2",
4
4
  "description": "Harness-level eval framework for measuring AI coding agent tool-selection behavior",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -45,6 +45,7 @@
45
45
  "scripts": {
46
46
  "generate-schemas": "tsx src/schemas/generate.ts",
47
47
  "build": "pnpm run generate-schemas && tsdown",
48
+ "postbuild": "node scripts/link-local-bin.mjs",
48
49
  "prepack": "pnpm run build",
49
50
  "prepublishOnly": "pnpm run build",
50
51
  "watch": "tsdown --watch",
@@ -61,6 +62,7 @@
61
62
  "zod": "^4.4.3"
62
63
  },
63
64
  "devDependencies": {
65
+ "@google-cloud/aiplatform": "^6.8.1",
64
66
  "@types/node": "^22.12.0",
65
67
  "tsdown": "^0.22.3",
66
68
  "tsx": "^4.22.4",
@@ -71,4 +73,4 @@
71
73
  "access": "public"
72
74
  },
73
75
  "packageManager": "pnpm@11.3.0"
74
- }
76
+ }
@@ -0,0 +1,196 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json#InstancesJsonlRow",
4
+ "title": "InstancesJsonlRow",
5
+ "description": "Type-tagged JSONL row for Vertex EvaluateInstances batching.",
6
+ "type": "object",
7
+ "properties": {
8
+ "messageType": {
9
+ "$ref": "#/$defs/__schema0"
10
+ },
11
+ "caseId": {
12
+ "$ref": "#/$defs/__schema1"
13
+ },
14
+ "repetitionIndex": {
15
+ "$ref": "#/$defs/__schema2"
16
+ },
17
+ "instance": {
18
+ "$ref": "#/$defs/__schema3"
19
+ }
20
+ },
21
+ "required": [
22
+ "messageType",
23
+ "caseId",
24
+ "repetitionIndex",
25
+ "instance"
26
+ ],
27
+ "additionalProperties": false,
28
+ "$defs": {
29
+ "__schema0": {
30
+ "type": "string",
31
+ "description": "Vertex protobuf message type name."
32
+ },
33
+ "__schema1": {
34
+ "type": "string",
35
+ "description": "Test case id."
36
+ },
37
+ "__schema2": {
38
+ "type": "integer",
39
+ "minimum": -9007199254740991,
40
+ "maximum": 9007199254740991,
41
+ "description": "Repetition index."
42
+ },
43
+ "__schema3": {
44
+ "anyOf": [
45
+ {
46
+ "$ref": "#/$defs/TrajectoryPairInstanceJson"
47
+ },
48
+ {
49
+ "$ref": "#/$defs/TrajectorySingleToolUseInstanceJson"
50
+ },
51
+ {
52
+ "$ref": "#/$defs/EvaluationInstanceJson"
53
+ }
54
+ ],
55
+ "description": "Protojson instance payload."
56
+ },
57
+ "TrajectoryPairInstanceJson": {
58
+ "type": "object",
59
+ "properties": {
60
+ "predictedTrajectory": {
61
+ "$ref": "#/$defs/__schema4"
62
+ },
63
+ "referenceTrajectory": {
64
+ "$ref": "#/$defs/__schema8"
65
+ }
66
+ },
67
+ "required": [
68
+ "predictedTrajectory",
69
+ "referenceTrajectory"
70
+ ],
71
+ "additionalProperties": false,
72
+ "title": "TrajectoryPairInstanceJson",
73
+ "description": "Shared shape for Trajectory*Match/Precision/Recall instances."
74
+ },
75
+ "__schema4": {
76
+ "description": "Predicted tool-call trajectory.",
77
+ "$ref": "#/$defs/ProtojsonTrajectory"
78
+ },
79
+ "__schema5": {
80
+ "type": "array",
81
+ "items": {
82
+ "$ref": "#/$defs/ProtojsonToolCall"
83
+ },
84
+ "description": "Ordered tool calls in the trajectory."
85
+ },
86
+ "ProtojsonToolCall": {
87
+ "type": "object",
88
+ "properties": {
89
+ "toolName": {
90
+ "$ref": "#/$defs/__schema6"
91
+ },
92
+ "toolInput": {
93
+ "$ref": "#/$defs/__schema7"
94
+ }
95
+ },
96
+ "required": [
97
+ "toolName",
98
+ "toolInput"
99
+ ],
100
+ "additionalProperties": false,
101
+ "title": "ProtojsonToolCall",
102
+ "description": "Tool call in Vertex EvaluationService wire format."
103
+ },
104
+ "__schema6": {
105
+ "type": "string",
106
+ "description": "Tool name as emitted by the agent."
107
+ },
108
+ "__schema7": {
109
+ "type": "string",
110
+ "description": "JSON-serialized tool arguments (Vertex wire format)."
111
+ },
112
+ "ProtojsonTrajectory": {
113
+ "type": "object",
114
+ "properties": {
115
+ "toolCalls": {
116
+ "$ref": "#/$defs/__schema5"
117
+ }
118
+ },
119
+ "required": [
120
+ "toolCalls"
121
+ ],
122
+ "additionalProperties": false,
123
+ "title": "ProtojsonTrajectory",
124
+ "description": "Vertex Trajectory message wire format."
125
+ },
126
+ "__schema8": {
127
+ "description": "Reference tool-call trajectory.",
128
+ "$ref": "#/$defs/ProtojsonTrajectory"
129
+ },
130
+ "TrajectorySingleToolUseInstanceJson": {
131
+ "type": "object",
132
+ "properties": {
133
+ "predictedTrajectory": {
134
+ "$ref": "#/$defs/__schema9"
135
+ }
136
+ },
137
+ "required": [
138
+ "predictedTrajectory"
139
+ ],
140
+ "additionalProperties": false,
141
+ "title": "TrajectorySingleToolUseInstanceJson",
142
+ "description": "Vertex TrajectorySingleToolUseInstance wire format."
143
+ },
144
+ "__schema9": {
145
+ "description": "Predicted tool-call trajectory.",
146
+ "$ref": "#/$defs/ProtojsonTrajectory"
147
+ },
148
+ "EvaluationInstanceJson": {
149
+ "type": "object",
150
+ "properties": {
151
+ "prompt": {
152
+ "$ref": "#/$defs/__schema10"
153
+ },
154
+ "response": {
155
+ "$ref": "#/$defs/__schema13"
156
+ },
157
+ "reference": {
158
+ "$ref": "#/$defs/__schema14"
159
+ }
160
+ },
161
+ "additionalProperties": false,
162
+ "title": "EvaluationInstanceJson",
163
+ "description": "Vertex EvaluationInstance wire format (agentEvalData omitted in v1)."
164
+ },
165
+ "__schema10": {
166
+ "description": "Eval prompt.",
167
+ "$ref": "#/$defs/InstanceData"
168
+ },
169
+ "InstanceData": {
170
+ "type": "object",
171
+ "properties": {
172
+ "text": {
173
+ "$ref": "#/$defs/__schema11"
174
+ }
175
+ },
176
+ "additionalProperties": false,
177
+ "title": "InstanceData",
178
+ "description": "EvaluationInstance prompt/response/reference text wrapper."
179
+ },
180
+ "__schema11": {
181
+ "description": "Plain text instance data.",
182
+ "$ref": "#/$defs/__schema12"
183
+ },
184
+ "__schema12": {
185
+ "type": "string"
186
+ },
187
+ "__schema13": {
188
+ "description": "Final agent response.",
189
+ "$ref": "#/$defs/InstanceData"
190
+ },
191
+ "__schema14": {
192
+ "description": "Reference answer text.",
193
+ "$ref": "#/$defs/InstanceData"
194
+ }
195
+ }
196
+ }