@inbrowser/agent 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/LICENSE +21 -0
  2. package/dist/diagnostics/index.d.ts +5 -0
  3. package/dist/diagnostics/index.d.ts.map +1 -0
  4. package/dist/diagnostics/index.js +3 -0
  5. package/dist/diagnostics/index.js.map +1 -0
  6. package/dist/diagnostics/timing.d.ts +48 -0
  7. package/dist/diagnostics/timing.d.ts.map +1 -0
  8. package/dist/diagnostics/timing.js +85 -0
  9. package/dist/diagnostics/timing.js.map +1 -0
  10. package/dist/diagnostics/truthfulness.d.ts +36 -0
  11. package/dist/diagnostics/truthfulness.d.ts.map +1 -0
  12. package/dist/diagnostics/truthfulness.js +180 -0
  13. package/dist/diagnostics/truthfulness.js.map +1 -0
  14. package/dist/dispatch-memoization.d.ts +84 -0
  15. package/dist/dispatch-memoization.d.ts.map +1 -0
  16. package/dist/dispatch-memoization.js +197 -0
  17. package/dist/dispatch-memoization.js.map +1 -0
  18. package/dist/eval/comparison-report.d.ts +164 -0
  19. package/dist/eval/comparison-report.d.ts.map +1 -0
  20. package/dist/eval/comparison-report.js +316 -0
  21. package/dist/eval/comparison-report.js.map +1 -0
  22. package/dist/eval/fixture.d.ts +74 -0
  23. package/dist/eval/fixture.d.ts.map +1 -0
  24. package/dist/eval/fixture.js +217 -0
  25. package/dist/eval/fixture.js.map +1 -0
  26. package/dist/eval/index.d.ts +13 -0
  27. package/dist/eval/index.d.ts.map +1 -0
  28. package/dist/eval/index.js +7 -0
  29. package/dist/eval/index.js.map +1 -0
  30. package/dist/eval/load-node.d.ts +16 -0
  31. package/dist/eval/load-node.d.ts.map +1 -0
  32. package/dist/eval/load-node.js +58 -0
  33. package/dist/eval/load-node.js.map +1 -0
  34. package/dist/eval/metric-collector.d.ts +209 -0
  35. package/dist/eval/metric-collector.d.ts.map +1 -0
  36. package/dist/eval/metric-collector.js +293 -0
  37. package/dist/eval/metric-collector.js.map +1 -0
  38. package/dist/eval/run-record.d.ts +76 -0
  39. package/dist/eval/run-record.d.ts.map +1 -0
  40. package/dist/eval/run-record.js +32 -0
  41. package/dist/eval/run-record.js.map +1 -0
  42. package/dist/eval/runner.d.ts +140 -0
  43. package/dist/eval/runner.d.ts.map +1 -0
  44. package/dist/eval/runner.js +310 -0
  45. package/dist/eval/runner.js.map +1 -0
  46. package/dist/eval/spec-framework.d.ts +113 -0
  47. package/dist/eval/spec-framework.d.ts.map +1 -0
  48. package/dist/eval/spec-framework.js +100 -0
  49. package/dist/eval/spec-framework.js.map +1 -0
  50. package/dist/eval/spec-helpers.d.ts +245 -0
  51. package/dist/eval/spec-helpers.d.ts.map +1 -0
  52. package/dist/eval/spec-helpers.js +605 -0
  53. package/dist/eval/spec-helpers.js.map +1 -0
  54. package/dist/index.d.ts +24 -3
  55. package/dist/index.d.ts.map +1 -1
  56. package/dist/index.js +11 -1
  57. package/dist/index.js.map +1 -1
  58. package/dist/node.d.ts +1 -0
  59. package/dist/node.d.ts.map +1 -1
  60. package/dist/node.js +1 -0
  61. package/dist/node.js.map +1 -1
  62. package/dist/planner-executor.d.ts +132 -0
  63. package/dist/planner-executor.d.ts.map +1 -0
  64. package/dist/planner-executor.js +274 -0
  65. package/dist/planner-executor.js.map +1 -0
  66. package/dist/skill-catalog.d.ts +81 -0
  67. package/dist/skill-catalog.d.ts.map +1 -0
  68. package/dist/skill-catalog.js +388 -0
  69. package/dist/skill-catalog.js.map +1 -0
  70. package/dist/skill-router.d.ts +95 -0
  71. package/dist/skill-router.d.ts.map +1 -0
  72. package/dist/skill-router.js +130 -0
  73. package/dist/skill-router.js.map +1 -0
  74. package/dist/strategy.d.ts +20 -1
  75. package/dist/strategy.d.ts.map +1 -1
  76. package/dist/strategy.js +333 -13
  77. package/dist/strategy.js.map +1 -1
  78. package/dist/tools.d.ts +15 -1
  79. package/dist/tools.d.ts.map +1 -1
  80. package/dist/tools.js +18 -0
  81. package/dist/tools.js.map +1 -1
  82. package/dist/types/strategy.d.ts +48 -0
  83. package/dist/types/strategy.d.ts.map +1 -1
  84. package/dist/types/tools.d.ts +18 -0
  85. package/dist/types/tools.d.ts.map +1 -1
  86. package/dist/types/trace.d.ts +59 -9
  87. package/dist/types/trace.d.ts.map +1 -1
  88. package/dist/types/trace.js +5 -3
  89. package/dist/types/trace.js.map +1 -1
  90. package/package.json +1 -1
@@ -0,0 +1,197 @@
1
+ /**
2
+ * Content-addressed memoization layer over `createDispatch`.
3
+ *
4
+ * `createMemoizedDispatch(registry, options?)` returns a `ToolDispatch`-shaped
5
+ * object that caches `ToolResult`s keyed on `(toolName, argsHash,
6
+ * workspaceHash, runtimeHash)`. The cache is consulted only for handlers
7
+ * tagged `pure` (see `isPure` in `./tools.ts`). Non-pure handlers bypass
8
+ * the cache entirely and always execute. Errors from the underlying
9
+ * dispatch propagate; they are NOT cached, since they may be transient.
10
+ *
11
+ * The returned object is structurally a `ToolDispatch` — strategies and
12
+ * downstream code that already accept `ToolDispatch` use it transparently.
13
+ * The one addition is `stats()`, which returns the running counters for
14
+ * hits / misses / bypassed calls. The cache lives for the lifetime of
15
+ * one `MemoizedDispatch` instance; there is no global state.
16
+ *
17
+ * Design notes:
18
+ *
19
+ * - Hashing uses FNV-1a 32-bit over a stable-stringified JSON
20
+ * representation. The cache is for short-running test loops; a
21
+ * cryptographic hash is overkill. Collisions are rare at our cache sizes;
22
+ * a missed hit costs only a recomputation, but a collision returns another input's cached result.
23
+ * - Argument keys are sorted at every level via `stableStringify` so two
24
+ * structurally-equal arg objects produce the same key regardless of
25
+ * property insertion order.
26
+ * - Workspace hash covers `presetId`, `rules`, `code`, and `appSource`.
27
+ * `stitch` is excluded per the brief — pure tools don't read from it.
28
+ * - Runtime hash is included only when `keyComponents` contains `'runtime'`.
29
+ * `keyComponents` defaults to `['workspace']`; making runtime opt-in keeps the
30
+ * default key small for the dominant pure-tool population.
31
+ * - No eviction in v1. Eval runs are bounded; one instance per harness
32
+ * trial keeps cache growth bounded too.
33
+ */
34
+ import { createDispatch, isPure } from './tools.js';
35
/**
 * Build a memoizing dispatcher on top of `createDispatch(registry)`.
 *
 * Every call is routed one of two ways:
 *
 * - The registry has no handler for `call.name`, or the handler is not
 *   tagged pure (per `isPure`): `bypassed` is incremented and the call is
 *   forwarded to the wrapped dispatch with no cache interaction.
 * - The handler is pure: the cache is consulted under the key produced by
 *   `buildCacheKey`. A stored result is returned as the very same object
 *   (`hits`); otherwise the wrapped dispatch runs (`misses`) and its result
 *   is stored only when `result.ok` is truthy.
 *
 * The cache and the counters are private to the returned object and nothing
 * is ever evicted. If the wrapped dispatch rejects, the rejection propagates
 * and neither the cache nor `misses` is updated. (The original author notes
 * that `createDispatch` turns handler exceptions into `{ ok: false, ... }`
 * results, so a rejection is not expected; that is not verifiable from this
 * module.)
 *
 * @param registry Tool registry; used here through `has(name)` and `list()`.
 * @param options Optional settings. `keyComponents` selects which context
 *   parts (`'workspace'`, `'runtime'`) contribute to the cache key and
 *   defaults to `['workspace']`.
 * @returns An object exposing `execute(call, ctx)` and `stats()`; `stats()`
 *   returns a copy of the `{ hits, misses, bypassed }` counters.
 */
export function createMemoizedDispatch(registry, options) {
    const components = options?.keyComponents ?? ['workspace'];
    const useRuntime = components.includes('runtime');
    const useWorkspace = components.includes('workspace');
    const inner = createDispatch(registry);
    const store = new Map();
    const tally = { hits: 0, misses: 0, bypassed: 0 };

    const execute = async (call, ctx) => {
        const handler = findHandler(registry, call.name);
        const cacheable = Boolean(handler) && isPure(handler);
        if (!cacheable) {
            // Unknown or non-pure tool: the cache layer stays out of the way
            // and the wrapped dispatch produces the result (or error) itself.
            tally.bypassed += 1;
            return inner.execute(call, ctx);
        }

        const key = buildCacheKey(call, ctx, useWorkspace, useRuntime);
        const remembered = store.get(key);
        if (remembered === undefined) {
            // Miss: run the tool, then remember the result only if it
            // succeeded. Failed results may be transient, so they are
            // recomputed on the next identical call.
            const fresh = await inner.execute(call, ctx);
            tally.misses += 1;
            if (fresh.ok) {
                store.set(key, fresh);
            }
            return fresh;
        }

        tally.hits += 1;
        return remembered;
    };

    // Hand out a snapshot so callers cannot mutate the live counters.
    const stats = () => Object.assign({}, tally);

    return { execute, stats };
}
84
/**
 * Compose the cache key `name|argsHash|workspaceHash|runtimeHash` for a call.
 *
 * Each hash is the FNV-1a digest of the stable-stringified value. The
 * workspace and runtime segments are the empty string when their flag is
 * off, so the key always has the same number of `|`-separated segments.
 * Only the 32-bit digests are stored in the key, so two different inputs
 * whose digests collide map to the same key.
 *
 * @param call Tool call; `call.name` and `call.args` are read.
 * @param ctx Dispatch context; `ctx.workspace` / `ctx.runtime` are read only
 *   when the matching flag is set.
 * @param includeWorkspace Whether the workspace projection is part of the key.
 * @param includeRuntime Whether the runtime projection is part of the key.
 * @returns The composed key string.
 */
function buildCacheKey(call, ctx, includeWorkspace, includeRuntime) {
    const digest = (value) => hashFnv1a32(stableStringify(value));

    const argsPart = digest(call.args);

    let workspacePart = '';
    if (includeWorkspace) {
        workspacePart = digest(workspaceShape(ctx.workspace));
    }

    let runtimePart = '';
    if (includeRuntime) {
        runtimePart = digest(runtimeShape(ctx.runtime));
    }

    return `${call.name}|${argsPart}|${workspacePart}|${runtimePart}`;
}
95
/**
 * Reduce a workspace to the fields that take part in the cache key:
 * `presetId`, `rules`, `code` and `appSource`. Anything else on the
 * workspace (notably `stitch`) is dropped, so workspaces differing only in
 * those other fields produce the same projection.
 *
 * @param ws Workspace object, or a falsy value when there is none.
 * @returns The four-field projection, or `{ _present: false }` when `ws` is
 *   falsy.
 */
function workspaceShape(ws) {
    if (ws) {
        const { presetId, rules, code, appSource } = ws;
        return { presetId, rules, code, appSource };
    }
    // Marker object so "no workspace" still hashes to a well-defined value.
    return { _present: false };
}
112
/**
 * Reduce a runtime to the fields that take part in the cache key:
 * `terminal`, `runSummary`, `deploy`, `parseError`, `uiErrors` and
 * `sandboxVersion`. Other properties are dropped.
 *
 * @param rt Runtime object, or a falsy value when there is none.
 * @returns The six-field projection, or `{ _present: false }` when `rt` is
 *   falsy.
 */
function runtimeShape(rt) {
    if (rt) {
        const { terminal, runSummary, deploy, parseError, uiErrors, sandboxVersion } = rt;
        return { terminal, runSummary, deploy, parseError, uiErrors, sandboxVersion };
    }
    // Marker object so "no runtime" still hashes to a well-defined value.
    return { _present: false };
}
128
/**
 * Stable serialization for cache keys: like `JSON.stringify`, but object
 * keys are emitted in sorted order at every nesting level, so two
 * structurally-equal objects built in different insertion orders produce
 * the same string. Arrays keep their order (they are positional).
 *
 * Rules, in the order they are applied:
 * - An object (or array) exposing a `toJSON` method is replaced by the
 *   value `toJSON()` returns, as `JSON.stringify` does. Without this every
 *   `Date` would serialize as `{}` and distinct dates would share a key.
 * - `null` -> `null`; non-finite numbers (`NaN`, `±Infinity`) -> `null`.
 * - `undefined` at the top level or inside an array -> the bare token
 *   `undefined` (not valid JSON; the output is a key, not a document).
 * - Object properties whose value is `undefined` are omitted.
 * - Functions, symbols and bigints are NOT omitted: they serialize as the
 *   JSON-quoted form of `String(value)`.
 *
 * @param value Any value.
 * @returns The deterministic string form of `value`.
 */
export function stableStringify(value) {
    return serialize(value);
}

/** Resolve `toJSON` (if any) on `value`, then serialize the outcome. */
function serialize(value) {
    if (value !== null && typeof value === 'object' && typeof value.toJSON === 'function') {
        // Like JSON.stringify, `toJSON` is not re-applied to its own result;
        // only nested children are resolved again.
        return serializeResolved(value.toJSON());
    }
    return serializeResolved(value);
}

/** Serialize a value whose own `toJSON` has already been resolved. */
function serializeResolved(value) {
    if (value === null)
        return 'null';
    if (value === undefined)
        return 'undefined';
    if (typeof value === 'number') {
        return Number.isFinite(value) ? JSON.stringify(value) : 'null';
    }
    if (typeof value === 'string' || typeof value === 'boolean') {
        return JSON.stringify(value);
    }
    if (Array.isArray(value)) {
        return `[${value.map((item) => serialize(item)).join(',')}]`;
    }
    if (typeof value === 'object') {
        const parts = [];
        // Sorting the keys is what makes the output insertion-order independent.
        for (const k of Object.keys(value).sort()) {
            const v = value[k];
            if (v === undefined)
                continue;
            parts.push(`${JSON.stringify(k)}:${serialize(v)}`);
        }
        return `{${parts.join(',')}}`;
    }
    // Functions, symbols, bigints — fall back to a stable string form.
    // Pure tool args should never contain these; the fallback is defensive.
    return JSON.stringify(String(value));
}
171
/**
 * FNV-1a 32-bit hash of a string, returned as 8 lowercase hex digits.
 *
 * The input is consumed one UTF-16 code unit at a time (`charCodeAt`), and
 * each unit is XORed into the hash whole. For ASCII input this matches
 * byte-oriented FNV-1a; for code units above 0xFF the digest differs from
 * implementations that hash UTF-8 bytes. That is fine for an in-process
 * cache key but matters if digests are ever compared across systems.
 *
 * @param input String to hash.
 * @returns Zero-padded, 8-character lowercase hex digest.
 */
export function hashFnv1a32(input) {
    const FNV_OFFSET_BASIS = 0x811c9dc5;
    const FNV_PRIME = 0x01000193;
    let hash = FNV_OFFSET_BASIS;
    for (let i = 0; i < input.length; i += 1) {
        // Math.imul performs the 32-bit wrapping multiply exactly, with no
        // floating-point precision loss and no hand-expanded shifts.
        hash = Math.imul(hash ^ input.charCodeAt(i), FNV_PRIME);
    }
    // `>>> 0` reinterprets the signed 32-bit result as unsigned before formatting.
    return (hash >>> 0).toString(16).padStart(8, '0');
}
187
+ /**
188
+ * Lookup a handler by name without exposing a `get` method on the
189
+ * registry interface. Mirrors the helper in `tools.ts`; duplicated here
190
+ * to keep the memoization module independent of internal helpers.
191
+ */
192
+ function findHandler(registry, name) {
193
+ if (!registry.has(name))
194
+ return undefined;
195
+ return registry.list().find((h) => h.name === name);
196
+ }
197
+ //# sourceMappingURL=dispatch-memoization.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dispatch-memoization.js","sourceRoot":"","sources":["../src/dispatch-memoization.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AAsCpD;;;;;;GAMG;AACH,MAAM,UAAU,sBAAsB,CACpC,QAAsB,EACtB,OAAqB;IAErB,MAAM,aAAa,GAAuB,OAAO,EAAE,aAAa,IAAI,CAAC,WAAW,CAAC,CAAC;IAClF,MAAM,cAAc,GAAG,aAAa,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;IACzD,MAAM,gBAAgB,GAAG,aAAa,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;IAE7D,MAAM,UAAU,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;IAC5C,MAAM,KAAK,GAAG,IAAI,GAAG,EAAsB,CAAC;IAC5C,MAAM,QAAQ,GAAc,EAAE,IAAI,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC;IAEhE,OAAO;QACL,KAAK,CAAC,OAAO,CAAC,IAAc,EAAE,GAAgB;YAC5C,MAAM,OAAO,GAAG,WAAW,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;YACjD,gEAAgE;YAChE,2DAA2D;YAC3D,2DAA2D;YAC3D,iBAAiB;YACjB,IAAI,CAAC,OAAO,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;gBACjC,QAAQ,CAAC,QAAQ,IAAI,CAAC,CAAC;gBACvB,OAAO,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YACvC,CAAC;YAED,MAAM,GAAG,GAAG,aAAa,CAAC,IAAI,EAAE,GAAG,EAAE,gBAAgB,EAAE,cAAc,CAAC,CAAC;YACvE,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC9B,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;gBACzB,QAAQ,CAAC,IAAI,IAAI,CAAC,CAAC;gBACnB,OAAO,MAAM,CAAC;YAChB,CAAC;YAED,gEAAgE;YAChE,0DAA0D;YAC1D,2DAA2D;YAC3D,8DAA8D;YAC9D,yDAAyD;YACzD,yDAAyD;YACzD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YACnD,QAAQ,CAAC,MAAM,IAAI,CAAC,CAAC;YACrB,IAAI,MAAM,CAAC,EAAE,EAAE,CAAC;gBACd,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;YACzB,CAAC;YACD,OAAO,MAAM,CAAC;QAChB,CAAC;QACD,KAAK;YACH,OAAO,EAAE,GAAG,QAAQ,EAAE,CAAC;QACzB,CAAC;KACF,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,SAAS,aAAa,CACpB,IAAc,EACd,GAAgB,EAChB,gBAAyB,EACzB,cAAuB;IAEvB,MAAM,OAAO,GAAG,WAAW,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACxD,MAAM,KAAK,GAAG,gBAAgB,CAAC,CAAC,CAAC,WAAW,CAAC,eAAe,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAClG,MAAM,KAAK,GAAG,cAAc,CAAC,CAAC,CAAC,WAAW,CAAC,eAAe,CAAC,YAAY,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE
,CAAC;IAC5F,OAAO,GAAG,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,IAAI,KAAK,EAAE,CAAC;AACrD,CAAC;AAED;;;;;;GAMG;AACH,SAAS,cAAc,CAAC,EAAc;IACpC,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC;IACpC,OAAO;QACL,QAAQ,EAAE,EAAE,CAAC,QAAQ;QACrB,KAAK,EAAE,EAAE,CAAC,KAAK;QACf,IAAI,EAAE,EAAE,CAAC,IAAI;QACb,SAAS,EAAE,EAAE,CAAC,SAAS;KACxB,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,SAAS,YAAY,CAAC,EAAiB;IACrC,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC;IACpC,OAAO;QACL,QAAQ,EAAE,EAAE,CAAC,QAAQ;QACrB,UAAU,EAAE,EAAE,CAAC,UAAU;QACzB,MAAM,EAAE,EAAE,CAAC,MAAM;QACjB,UAAU,EAAE,EAAE,CAAC,UAAU;QACzB,QAAQ,EAAE,EAAE,CAAC,QAAQ;QACrB,cAAc,EAAE,EAAE,CAAC,cAAc;KAClC,CAAC;AACJ,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,eAAe,CAAC,KAAc;IAC5C,OAAO,SAAS,CAAC,KAAK,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,SAAS,CAAC,KAAc;IAC/B,IAAI,KAAK,KAAK,IAAI;QAAE,OAAO,MAAM,CAAC;IAClC,IAAI,KAAK,KAAK,SAAS;QAAE,OAAO,WAAW,CAAC;IAC5C,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,OAAO,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IACjE,CAAC;IACD,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,OAAO,KAAK,KAAK,SAAS,EAAE,CAAC;QAC5D,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IACD,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;IAC/C,CAAC;IACD,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,GAAG,GAAG,KAAgC,CAAC;QAC7C,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACrC,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;YACrB,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;YACjB,IAAI,CAAC,KAAK,SAAS;gBAAE,SAAS;YAC9B,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,OAAO,IAAI,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;IAChC,CAAC;IACD,mEAAmE;IACnE,wEAAwE;IACxE,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;AACvC,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,WAAW,CAAC,KAAa;IACvC,IAAI,IAAI,GAAG,UAAU,CAAC,CAAC,4BAA4B;IACnD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,G
AAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;QACzC,IAAI,IAAI,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC5B,sEAAsE;QACtE,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IAC/F,CAAC;IACD,OAAO,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AAC5C,CAAC;AAED;;;;GAIG;AACH,SAAS,WAAW,CAAC,QAAsB,EAAE,IAAY;IACvD,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;AACtD,CAAC"}
@@ -0,0 +1,164 @@
1
+ /**
2
+ * `compareMetrics` — the eval harness's A/B comparison report.
3
+ *
4
+ * Consumes two named `MetricsTable[]`s (typically a "baseline" and a
5
+ * "variant"), pairs them by `fixtureId`, and for each per-metric
6
+ * column emits a row containing the inputs (mean +/- spread for both
7
+ * sides), the delta (`variantMean - baselineMean`), the threshold
8
+ * (`max(baselineSpread, variantSpread)`), and a label.
9
+ *
10
+ * The label uses the implementation plan's no-effect rule:
11
+ *
12
+ * - If `Math.abs(delta) < threshold`, label `no-effect`. This is the
13
+ * central discipline: noisy trial-to-trial variance must not be
14
+ * reported as a winner.
15
+ * - Otherwise consult the metric's polarity (see `POLARITY` below):
16
+ * higher-is-better -> sign of `delta` picks the winner;
17
+ * lower-is-better -> sign of `delta` picks the loser;
18
+ * neutral -> emit `changed` (no winner, just a flag).
19
+ *
20
+ * Polarity is a static map declared in this module. Adding a new
21
+ * metric to the collector forces an entry here because the map is
22
+ * keyed by a string-literal union derived from `MetricsTable` fields.
23
+ *
24
+ * Two renderers ship with the comparator:
25
+ *
26
+ * - `renderMarkdown(report)` — a human-readable markdown table for
27
+ * piping to stdout from a CLI consumer.
28
+ * - `renderJson(report)` — `JSON.stringify(report, null, 2)`. Plain
29
+ * data; round-trips through `JSON.parse(JSON.stringify(report))`
30
+ * without loss.
31
+ *
32
+ * No statistical sophistication beyond the no-effect rule. Confidence
33
+ * intervals, cross-fixture aggregation, and multi-variant comparisons
34
+ * are deliberately deferred to follow-up branches.
35
+ *
36
+ * Browser-safe — no Node imports.
37
+ *
38
+ * Note on naming: metric names match the collector exactly. Do not
39
+ * rename. The CLI, the report, and any later UI all key on these
40
+ * identifiers, and renaming in one layer without the others silently
41
+ * breaks consumers.
42
+ */
43
+ import type { AggregateStat, MetricsTable } from './metric-collector.js';
44
+ /**
45
+ * The set of metric names a comparison row can target. Each name is
46
+ * the dotted path from `AggregatedMetrics` down to an `AggregateStat`.
47
+ * `toolCallCount` is a nested object on the aggregate; the three
48
+ * children each get their own polarity entry.
49
+ *
50
+ * Keep this in lockstep with `AggregatedMetrics` in
51
+ * `./metric-collector.ts`. The compiler enforces exhaustiveness on
52
+ * `POLARITY` below.
53
+ */
54
+ export type ComparisonMetricName = 'taskSuccessRate' | 'wallClockMs' | 'promptTokens' | 'completionTokens' | 'toolCallCount.total' | 'toolCallCount.reads' | 'toolCallCount.mutations' | 'turnCount' | 'peakContextWindowBytes' | 'truthfulnessViolationRate' | 'dispatchVsLlmRatio';
55
+ /** Direction in which "more" is better, worse, or neither. */
56
+ export type Polarity = 'higher-is-better' | 'lower-is-better' | 'neutral';
57
+ /**
58
+ * Static polarity table. Choices are documented per row; the
59
+ * neutral defaults are the metrics the implementation plan
60
+ * explicitly says are context-dependent.
61
+ *
62
+ * Adding a new metric to the collector requires adding an entry
63
+ * here. The `Record<ComparisonMetricName, Polarity>` shape keeps the
64
+ * compiler honest if `ComparisonMetricName` is extended.
65
+ */
66
+ export declare const POLARITY: Record<ComparisonMetricName, Polarity>;
67
+ /** Label assigned to a per-metric row after applying the no-effect rule. */
68
+ export type ComparisonLabel = 'no-effect' | 'winner-baseline' | 'winner-variant' | 'changed';
69
+ /**
70
+ * A single per-fixture, per-metric row.
71
+ *
72
+ * `delta` is `variantMean - baselineMean`. Both means are passed
73
+ * through verbatim from the input aggregates and may be `undefined`
74
+ * (the collector returns `undefined` for metrics it could not
75
+ * compute). When either mean is `undefined`, the row's `label` is
76
+ * `no-effect` and `delta` / `threshold` are `undefined` — there is
77
+ * nothing to compare.
78
+ */
79
+ export interface ComparisonRow {
80
+ fixtureId: string;
81
+ metric: ComparisonMetricName;
82
+ polarity: Polarity;
83
+ baseline: AggregateStat;
84
+ variant: AggregateStat;
85
+ /** `variantMean - baselineMean`. `undefined` when either mean is `undefined`. */
86
+ delta: number | undefined;
87
+ /** `max(baselineSpread, variantSpread)`. `undefined` when either spread is `undefined`. */
88
+ threshold: number | undefined;
89
+ label: ComparisonLabel;
90
+ }
91
+ /** Per-fixture grouping. Carries the comparison rows plus a coverage status. */
92
+ export interface ComparisonFixture {
93
+ fixtureId: string;
94
+ /**
95
+ * `both` when the fixture appears in both inputs;
96
+ * `baseline-only` / `variant-only` when one side is missing the
97
+ * fixture. The missing-side cases carry no per-metric rows.
98
+ */
99
+ status: 'both' | 'baseline-only' | 'variant-only';
100
+ rows: ComparisonRow[];
101
+ }
102
+ /** The full report. Suitable for both renderers. */
103
+ export interface ComparisonReport {
104
+ /** Label for the left-hand side. Defaults to `'baseline'`. */
105
+ baselineName: string;
106
+ /** Label for the right-hand side. Defaults to `'variant'`. */
107
+ variantName: string;
108
+ /** One entry per fixture across both inputs, in baseline-first union order. */
109
+ fixtures: ComparisonFixture[];
110
+ }
111
+ /** Input to `compareMetrics`. */
112
+ export interface CompareMetricsInput {
113
+ baseline: readonly MetricsTable[];
114
+ variant: readonly MetricsTable[];
115
+ /** Optional label for the baseline column. Defaults to `'baseline'`. */
116
+ baselineName?: string;
117
+ /** Optional label for the variant column. Defaults to `'variant'`. */
118
+ variantName?: string;
119
+ /**
120
+ * Override entries in the static `POLARITY` map for this report.
121
+ * Useful when a specific experiment has a directional read on a
122
+ * normally-neutral metric (e.g. parallel dispatch expects
123
+ * `toolCallCount.reads` to stay flat — which is still neutral —
124
+ * but a memoization experiment might prefer lower `wallClockMs`
125
+ * exclusively without changing other directions; in practice the
126
+ * defaults are correct and this is rarely needed).
127
+ */
128
+ polarityOverrides?: Partial<Record<ComparisonMetricName, Polarity>>;
129
+ }
130
+ /**
131
+ * Build a `ComparisonReport` from two named metric tables.
132
+ *
133
+ * Fixtures present in both sides are paired; fixtures unique to one
134
+ * side surface with an explicit `status` and no rows. Within each
135
+ * paired fixture, every entry in `POLARITY` produces one row.
136
+ *
137
+ * Never throws on missing data — `undefined` means / spreads carry
138
+ * through to `undefined` deltas / thresholds and a `no-effect` label.
139
+ */
140
+ export declare function compareMetrics(input: CompareMetricsInput): ComparisonReport;
141
+ /**
142
+ * Render a `ComparisonReport` as plain JSON.
143
+ *
144
+ * Implementation is `JSON.stringify(report, null, 2)`. The report is
145
+ * pure data: no functions. `JSON.stringify` drops object properties whose
146
+ * value is `undefined` and writes non-finite numbers as `null`, so after a
147
+ * round-trip such fields are missing or `null`; the consumer treats both the same way.
148
+ */
149
+ export declare function renderJson(report: ComparisonReport): string;
150
+ /**
151
+ * Render a `ComparisonReport` as a markdown document.
152
+ *
153
+ * One section per fixture. Each section has a header with the
154
+ * fixture id and its coverage status, then a table with seven
155
+ * columns: metric, baseline (mean +/- spread), variant (mean +/-
156
+ * spread), delta, threshold, polarity, label.
157
+ *
158
+ * Numeric formatting:
159
+ * - mean / spread use up to four significant digits;
160
+ * - `undefined` renders as `-`;
161
+ * - delta and threshold use the same formatter as mean.
162
+ */
163
+ export declare function renderMarkdown(report: ComparisonReport): string;
164
+ //# sourceMappingURL=comparison-report.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"comparison-report.d.ts","sourceRoot":"","sources":["../../src/eval/comparison-report.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AAIzE;;;;;;;;;GASG;AACH,MAAM,MAAM,oBAAoB,GAC5B,iBAAiB,GACjB,aAAa,GACb,cAAc,GACd,kBAAkB,GAClB,qBAAqB,GACrB,qBAAqB,GACrB,yBAAyB,GACzB,WAAW,GACX,wBAAwB,GACxB,2BAA2B,GAC3B,oBAAoB,CAAC;AAEzB,8DAA8D;AAC9D,MAAM,MAAM,QAAQ,GAAG,kBAAkB,GAAG,iBAAiB,GAAG,SAAS,CAAC;AAE1E;;;;;;;;GAQG;AACH,eAAO,MAAM,QAAQ,EAAE,MAAM,CAAC,oBAAoB,EAAE,QAAQ,CAqB3D,CAAC;AAIF,4EAA4E;AAC5E,MAAM,MAAM,eAAe,GAAG,WAAW,GAAG,iBAAiB,GAAG,gBAAgB,GAAG,SAAS,CAAC;AAE7F;;;;;;;;;GASG;AACH,MAAM,WAAW,aAAa;IAC5B,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,oBAAoB,CAAC;IAC7B,QAAQ,EAAE,QAAQ,CAAC;IACnB,QAAQ,EAAE,aAAa,CAAC;IACxB,OAAO,EAAE,aAAa,CAAC;IACvB,iFAAiF;IACjF,KAAK,EAAE,MAAM,GAAG,SAAS,CAAC;IAC1B,2FAA2F;IAC3F,SAAS,EAAE,MAAM,GAAG,SAAS,CAAC;IAC9B,KAAK,EAAE,eAAe,CAAC;CACxB;AAED,gFAAgF;AAChF,MAAM,WAAW,iBAAiB;IAChC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;OAIG;IACH,MAAM,EAAE,MAAM,GAAG,eAAe,GAAG,cAAc,CAAC;IAClD,IAAI,EAAE,aAAa,EAAE,CAAC;CACvB;AAED,oDAAoD;AACpD,MAAM,WAAW,gBAAgB;IAC/B,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,8DAA8D;IAC9D,WAAW,EAAE,MAAM,CAAC;IACpB,+EAA+E;IAC/E,QAAQ,EAAE,iBAAiB,EAAE,CAAC;CAC/B;AAID,iCAAiC;AACjC,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,SAAS,YAAY,EAAE,CAAC;IAClC,OAAO,EAAE,SAAS,YAAY,EAAE,CAAC;IACjC,wEAAwE;IACxE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,sEAAsE;IACtE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;;;;;OAQG;IACH,iBAAiB,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,oBAAoB,EAAE,QAAQ,CAAC,CAAC,CAAC;CACrE;AAED;;;;;;;;;GASG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,mBAAmB,GAAG,gBAAgB,CA8C3E;AAgHD;;;;;;;GAOG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,MAAM,CAE3D;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,gBAAgB,GAAG,MAAM,CA6C/D"}