@inbrowser/agent 0.0.0-placeholder → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267) hide show
  1. package/AGENTS.md +270 -0
  2. package/LICENSE +21 -0
  3. package/README.md +117 -2
  4. package/bin/agent.ts +10 -0
  5. package/dist/cli/commands/describe.d.ts +14 -0
  6. package/dist/cli/commands/describe.d.ts.map +1 -0
  7. package/dist/cli/commands/describe.js +179 -0
  8. package/dist/cli/commands/describe.js.map +1 -0
  9. package/dist/cli/commands/events.d.ts +21 -0
  10. package/dist/cli/commands/events.d.ts.map +1 -0
  11. package/dist/cli/commands/events.js +59 -0
  12. package/dist/cli/commands/events.js.map +1 -0
  13. package/dist/cli/commands/fleet.d.ts +15 -0
  14. package/dist/cli/commands/fleet.d.ts.map +1 -0
  15. package/dist/cli/commands/fleet.js +149 -0
  16. package/dist/cli/commands/fleet.js.map +1 -0
  17. package/dist/cli/commands/help.d.ts +15 -0
  18. package/dist/cli/commands/help.d.ts.map +1 -0
  19. package/dist/cli/commands/help.js +93 -0
  20. package/dist/cli/commands/help.js.map +1 -0
  21. package/dist/cli/commands/migrate.d.ts +27 -0
  22. package/dist/cli/commands/migrate.d.ts.map +1 -0
  23. package/dist/cli/commands/migrate.js +109 -0
  24. package/dist/cli/commands/migrate.js.map +1 -0
  25. package/dist/cli/commands/run.d.ts +38 -0
  26. package/dist/cli/commands/run.d.ts.map +1 -0
  27. package/dist/cli/commands/run.js +535 -0
  28. package/dist/cli/commands/run.js.map +1 -0
  29. package/dist/cli/commands/schema.d.ts +8 -0
  30. package/dist/cli/commands/schema.d.ts.map +1 -0
  31. package/dist/cli/commands/schema.js +12 -0
  32. package/dist/cli/commands/schema.js.map +1 -0
  33. package/dist/cli/commands/serve.d.ts +39 -0
  34. package/dist/cli/commands/serve.d.ts.map +1 -0
  35. package/dist/cli/commands/serve.js +65 -0
  36. package/dist/cli/commands/serve.js.map +1 -0
  37. package/dist/cli/commands/undo.d.ts +36 -0
  38. package/dist/cli/commands/undo.d.ts.map +1 -0
  39. package/dist/cli/commands/undo.js +132 -0
  40. package/dist/cli/commands/undo.js.map +1 -0
  41. package/dist/cli/fixtures.d.ts +17 -0
  42. package/dist/cli/fixtures.d.ts.map +1 -0
  43. package/dist/cli/fixtures.js +107 -0
  44. package/dist/cli/fixtures.js.map +1 -0
  45. package/dist/cli/hardening.d.ts +39 -0
  46. package/dist/cli/hardening.d.ts.map +1 -0
  47. package/dist/cli/hardening.js +68 -0
  48. package/dist/cli/hardening.js.map +1 -0
  49. package/dist/cli/index.d.ts +28 -0
  50. package/dist/cli/index.d.ts.map +1 -0
  51. package/dist/cli/index.js +19 -0
  52. package/dist/cli/index.js.map +1 -0
  53. package/dist/cli/llm/openrouter.d.ts +33 -0
  54. package/dist/cli/llm/openrouter.d.ts.map +1 -0
  55. package/dist/cli/llm/openrouter.js +285 -0
  56. package/dist/cli/llm/openrouter.js.map +1 -0
  57. package/dist/cli/main.d.ts +32 -0
  58. package/dist/cli/main.d.ts.map +1 -0
  59. package/dist/cli/main.js +106 -0
  60. package/dist/cli/main.js.map +1 -0
  61. package/dist/cli/output.d.ts +36 -0
  62. package/dist/cli/output.d.ts.map +1 -0
  63. package/dist/cli/output.js +95 -0
  64. package/dist/cli/output.js.map +1 -0
  65. package/dist/cli/parse.d.ts +26 -0
  66. package/dist/cli/parse.d.ts.map +1 -0
  67. package/dist/cli/parse.js +160 -0
  68. package/dist/cli/parse.js.map +1 -0
  69. package/dist/cli/session-log.d.ts +34 -0
  70. package/dist/cli/session-log.d.ts.map +1 -0
  71. package/dist/cli/session-log.js +52 -0
  72. package/dist/cli/session-log.js.map +1 -0
  73. package/dist/cli/spec.d.ts +62 -0
  74. package/dist/cli/spec.d.ts.map +1 -0
  75. package/dist/cli/spec.js +510 -0
  76. package/dist/cli/spec.js.map +1 -0
  77. package/dist/cli/ui/RunView.d.ts +134 -0
  78. package/dist/cli/ui/RunView.d.ts.map +1 -0
  79. package/dist/cli/ui/RunView.js +341 -0
  80. package/dist/cli/ui/RunView.js.map +1 -0
  81. package/dist/diagnostics/index.d.ts +5 -0
  82. package/dist/diagnostics/index.d.ts.map +1 -0
  83. package/dist/diagnostics/index.js +3 -0
  84. package/dist/diagnostics/index.js.map +1 -0
  85. package/dist/diagnostics/timing.d.ts +48 -0
  86. package/dist/diagnostics/timing.d.ts.map +1 -0
  87. package/dist/diagnostics/timing.js +85 -0
  88. package/dist/diagnostics/timing.js.map +1 -0
  89. package/dist/diagnostics/truthfulness.d.ts +36 -0
  90. package/dist/diagnostics/truthfulness.d.ts.map +1 -0
  91. package/dist/diagnostics/truthfulness.js +180 -0
  92. package/dist/diagnostics/truthfulness.js.map +1 -0
  93. package/dist/dispatch-memoization.d.ts +84 -0
  94. package/dist/dispatch-memoization.d.ts.map +1 -0
  95. package/dist/dispatch-memoization.js +197 -0
  96. package/dist/dispatch-memoization.js.map +1 -0
  97. package/dist/eval/comparison-report.d.ts +164 -0
  98. package/dist/eval/comparison-report.d.ts.map +1 -0
  99. package/dist/eval/comparison-report.js +316 -0
  100. package/dist/eval/comparison-report.js.map +1 -0
  101. package/dist/eval/fixture.d.ts +74 -0
  102. package/dist/eval/fixture.d.ts.map +1 -0
  103. package/dist/eval/fixture.js +217 -0
  104. package/dist/eval/fixture.js.map +1 -0
  105. package/dist/eval/index.d.ts +13 -0
  106. package/dist/eval/index.d.ts.map +1 -0
  107. package/dist/eval/index.js +7 -0
  108. package/dist/eval/index.js.map +1 -0
  109. package/dist/eval/load-node.d.ts +16 -0
  110. package/dist/eval/load-node.d.ts.map +1 -0
  111. package/dist/eval/load-node.js +58 -0
  112. package/dist/eval/load-node.js.map +1 -0
  113. package/dist/eval/metric-collector.d.ts +209 -0
  114. package/dist/eval/metric-collector.d.ts.map +1 -0
  115. package/dist/eval/metric-collector.js +293 -0
  116. package/dist/eval/metric-collector.js.map +1 -0
  117. package/dist/eval/run-record.d.ts +76 -0
  118. package/dist/eval/run-record.d.ts.map +1 -0
  119. package/dist/eval/run-record.js +32 -0
  120. package/dist/eval/run-record.js.map +1 -0
  121. package/dist/eval/runner.d.ts +140 -0
  122. package/dist/eval/runner.d.ts.map +1 -0
  123. package/dist/eval/runner.js +310 -0
  124. package/dist/eval/runner.js.map +1 -0
  125. package/dist/eval/spec-framework.d.ts +113 -0
  126. package/dist/eval/spec-framework.d.ts.map +1 -0
  127. package/dist/eval/spec-framework.js +100 -0
  128. package/dist/eval/spec-framework.js.map +1 -0
  129. package/dist/eval/spec-helpers.d.ts +245 -0
  130. package/dist/eval/spec-helpers.d.ts.map +1 -0
  131. package/dist/eval/spec-helpers.js +605 -0
  132. package/dist/eval/spec-helpers.js.map +1 -0
  133. package/dist/events/codec.d.ts +79 -0
  134. package/dist/events/codec.d.ts.map +1 -0
  135. package/dist/events/codec.js +142 -0
  136. package/dist/events/codec.js.map +1 -0
  137. package/dist/events/log-core.d.ts +76 -0
  138. package/dist/events/log-core.d.ts.map +1 -0
  139. package/dist/events/log-core.js +73 -0
  140. package/dist/events/log-core.js.map +1 -0
  141. package/dist/events/log.d.ts +60 -0
  142. package/dist/events/log.d.ts.map +1 -0
  143. package/dist/events/log.js +193 -0
  144. package/dist/events/log.js.map +1 -0
  145. package/dist/events/replay.d.ts +106 -0
  146. package/dist/events/replay.d.ts.map +1 -0
  147. package/dist/events/replay.js +137 -0
  148. package/dist/events/replay.js.map +1 -0
  149. package/dist/events/wrap.d.ts +100 -0
  150. package/dist/events/wrap.d.ts.map +1 -0
  151. package/dist/events/wrap.js +141 -0
  152. package/dist/events/wrap.js.map +1 -0
  153. package/dist/index.d.ts +73 -0
  154. package/dist/index.d.ts.map +1 -0
  155. package/dist/index.js +47 -0
  156. package/dist/index.js.map +1 -0
  157. package/dist/llm-adapter.d.ts +96 -0
  158. package/dist/llm-adapter.d.ts.map +1 -0
  159. package/dist/llm-adapter.js +132 -0
  160. package/dist/llm-adapter.js.map +1 -0
  161. package/dist/mcp/serve.d.ts +70 -0
  162. package/dist/mcp/serve.d.ts.map +1 -0
  163. package/dist/mcp/serve.js +154 -0
  164. package/dist/mcp/serve.js.map +1 -0
  165. package/dist/metrics/runs.d.ts +58 -0
  166. package/dist/metrics/runs.d.ts.map +1 -0
  167. package/dist/metrics/runs.js +99 -0
  168. package/dist/metrics/runs.js.map +1 -0
  169. package/dist/metrics.d.ts +38 -0
  170. package/dist/metrics.d.ts.map +1 -0
  171. package/dist/metrics.js +123 -0
  172. package/dist/metrics.js.map +1 -0
  173. package/dist/node.d.ts +23 -0
  174. package/dist/node.d.ts.map +1 -0
  175. package/dist/node.js +23 -0
  176. package/dist/node.js.map +1 -0
  177. package/dist/planner-executor.d.ts +132 -0
  178. package/dist/planner-executor.d.ts.map +1 -0
  179. package/dist/planner-executor.js +274 -0
  180. package/dist/planner-executor.js.map +1 -0
  181. package/dist/session.d.ts +10 -0
  182. package/dist/session.d.ts.map +1 -0
  183. package/dist/session.js +179 -0
  184. package/dist/session.js.map +1 -0
  185. package/dist/skill-catalog.d.ts +81 -0
  186. package/dist/skill-catalog.d.ts.map +1 -0
  187. package/dist/skill-catalog.js +388 -0
  188. package/dist/skill-catalog.js.map +1 -0
  189. package/dist/skill-router.d.ts +95 -0
  190. package/dist/skill-router.d.ts.map +1 -0
  191. package/dist/skill-router.js +130 -0
  192. package/dist/skill-router.js.map +1 -0
  193. package/dist/storage.d.ts +14 -0
  194. package/dist/storage.d.ts.map +1 -0
  195. package/dist/storage.js +58 -0
  196. package/dist/storage.js.map +1 -0
  197. package/dist/strategy.d.ts +45 -0
  198. package/dist/strategy.d.ts.map +1 -0
  199. package/dist/strategy.js +520 -0
  200. package/dist/strategy.js.map +1 -0
  201. package/dist/tools.d.ts +40 -0
  202. package/dist/tools.d.ts.map +1 -0
  203. package/dist/tools.js +147 -0
  204. package/dist/tools.js.map +1 -0
  205. package/dist/types/agent.d.ts +94 -0
  206. package/dist/types/agent.d.ts.map +1 -0
  207. package/dist/types/agent.js +17 -0
  208. package/dist/types/agent.js.map +1 -0
  209. package/dist/types/capabilities.d.ts +17 -0
  210. package/dist/types/capabilities.d.ts.map +1 -0
  211. package/dist/types/capabilities.js +13 -0
  212. package/dist/types/capabilities.js.map +1 -0
  213. package/dist/types/chat.d.ts +74 -0
  214. package/dist/types/chat.d.ts.map +1 -0
  215. package/dist/types/chat.js +10 -0
  216. package/dist/types/chat.js.map +1 -0
  217. package/dist/types/events.d.ts +115 -0
  218. package/dist/types/events.d.ts.map +1 -0
  219. package/dist/types/events.js +30 -0
  220. package/dist/types/events.js.map +1 -0
  221. package/dist/types/llm.d.ts +89 -0
  222. package/dist/types/llm.d.ts.map +1 -0
  223. package/dist/types/llm.js +12 -0
  224. package/dist/types/llm.js.map +1 -0
  225. package/dist/types/metrics.d.ts +34 -0
  226. package/dist/types/metrics.d.ts.map +1 -0
  227. package/dist/types/metrics.js +10 -0
  228. package/dist/types/metrics.js.map +1 -0
  229. package/dist/types/observer.d.ts +41 -0
  230. package/dist/types/observer.d.ts.map +1 -0
  231. package/dist/types/observer.js +41 -0
  232. package/dist/types/observer.js.map +1 -0
  233. package/dist/types/project-context.d.ts +18 -0
  234. package/dist/types/project-context.d.ts.map +1 -0
  235. package/dist/types/project-context.js +11 -0
  236. package/dist/types/project-context.js.map +1 -0
  237. package/dist/types/runtime.d.ts +71 -0
  238. package/dist/types/runtime.d.ts.map +1 -0
  239. package/dist/types/runtime.js +21 -0
  240. package/dist/types/runtime.js.map +1 -0
  241. package/dist/types/session.d.ts +103 -0
  242. package/dist/types/session.d.ts.map +1 -0
  243. package/dist/types/session.js +11 -0
  244. package/dist/types/session.js.map +1 -0
  245. package/dist/types/storage.d.ts +20 -0
  246. package/dist/types/storage.d.ts.map +1 -0
  247. package/dist/types/storage.js +41 -0
  248. package/dist/types/storage.js.map +1 -0
  249. package/dist/types/strategy.d.ts +124 -0
  250. package/dist/types/strategy.d.ts.map +1 -0
  251. package/dist/types/strategy.js +10 -0
  252. package/dist/types/strategy.js.map +1 -0
  253. package/dist/types/tools.d.ts +154 -0
  254. package/dist/types/tools.d.ts.map +1 -0
  255. package/dist/types/tools.js +11 -0
  256. package/dist/types/tools.js.map +1 -0
  257. package/dist/types/trace.d.ts +175 -0
  258. package/dist/types/trace.d.ts.map +1 -0
  259. package/dist/types/trace.js +26 -0
  260. package/dist/types/trace.js.map +1 -0
  261. package/dist/types/workspace.d.ts +29 -0
  262. package/dist/types/workspace.d.ts.map +1 -0
  263. package/dist/types/workspace.js +18 -0
  264. package/dist/types/workspace.js.map +1 -0
  265. package/package.json +45 -14
  266. package/skills/agent-cli.md +218 -0
  267. package/index.js +0 -2
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Success-spec framework for the eval harness.
3
+ *
4
+ * A success specification (a "spec") decides whether one captured agent
5
+ * run satisfied its fixture's success criterion. The framework provides
6
+ * three things:
7
+ *
8
+ * 1. A `SpecRegistry` plus a `createSpecRegistry()` factory.
9
+ * Registration is explicit — no side-effect registration on import.
10
+ * Callers build a registry, call `registerStarterSpecs()` if they
11
+ * want the common helpers (see `spec-helpers.ts`), and register
12
+ * their own specs on top.
13
+ *
14
+ * 2. An `evaluateSpec(registry, reference, snapshot)` evaluator that
15
+ * looks the spec up by name, awaits it (specs may be async), and
16
+ * returns a structured `SpecResult`. A thrown error inside the
17
+ * spec body is caught and converted into `{ ok: false, error }`,
18
+ * so a misbehaving spec never crashes the harness.
19
+ *
20
+ * 3. The `RunSnapshot` interface — the input every spec consumes.
21
+ * Intentionally narrower than the runner's eventual `RunRecord`,
22
+ * so this branch develops independently of `eval/harness-runner`.
23
+ * The runner's `RunRecord` is designed to be a structural superset:
24
+ * a runner caller passes its `record` straight to `evaluateSpec`.
25
+ *
26
+ * Specs read four slices of state — the final workspace (rules / code
27
+ * / app source), the final runtime (the most recent run summary, any
28
+ * uiErrors, terminal output, etc.), the full assistant text, and the
29
+ * trace. Anything outside those four fields is a sign the snapshot
30
+ * shape needs widening; do that here, not in spec bodies.
31
+ *
32
+ * Spec names follow the same `family/spec-name` kebab-case form that
33
+ * `validateFixture` enforces on `SuccessSpecReference.name`. Registration
34
+ * validates the name at registration time so typos surface immediately.
35
+ */
36
+ import type { RuntimeState } from '../types/runtime.js';
37
+ import type { TraceEvent } from '../types/trace.js';
38
+ import type { Workspace } from '../types/workspace.js';
39
+ import type { SuccessSpecReference } from './fixture.js';
40
+ /**
41
+ * Input shape every spec consumes. Intentionally narrower than the
42
+ * runner's eventual `RunRecord` so this branch develops independently
43
+ * of `eval/harness-runner`. The runner's record is designed to be a
44
+ * structural superset; a runner caller can pass its record straight to
45
+ * `evaluateSpec`.
46
+ */
47
+ export interface RunSnapshot {
48
+ /** Workspace state at the end of the run. */
49
+ finalWorkspace: Workspace;
50
+ /** Runtime state at the end of the run (run summary, uiErrors, ...). */
51
+ finalRuntime: RuntimeState;
52
+ /** Concatenated assistant text across the run's iterations. */
53
+ assistantText: string;
54
+ /** All trace events emitted during the run, in emission order. */
55
+ trace: readonly TraceEvent[];
56
+ }
57
+ /**
58
+ * Structured result returned by `evaluateSpec` and by every spec body.
59
+ * `ok` is the pass/fail bit. `detail` is optional structured context a
60
+ * report can surface (matched tokens, missing tokens, the offending
61
+ * trace event id, ...). `error` carries the failure reason when the
62
+ * spec did not run cleanly — registration miss, args validation failure,
63
+ * spec body threw, etc.
64
+ */
65
+ export interface SpecResult {
66
+ ok: boolean;
67
+ detail?: Record<string, unknown>;
68
+ error?: string;
69
+ }
70
+ /**
71
+ * Spec function signature. May be sync or async; the evaluator awaits
72
+ * the return either way. `args` is whatever the fixture supplied in
73
+ * `SuccessSpecReference.args` — typed as `unknown` because every spec
74
+ * declares (and validates) its own arg shape.
75
+ */
76
+ export type SpecFn = (snapshot: RunSnapshot, args: unknown) => SpecResult | Promise<SpecResult>;
77
+ /**
78
+ * In-memory registry. Backed by a `Map`, exposed as a small object so
79
+ * callers do not depend on Map identity.
80
+ */
81
+ export interface SpecRegistry {
82
+ /**
83
+ * Register a spec by name. Throws if the name does not match the
84
+ * required `family/spec-name` kebab-case form, or if the name is
85
+ * already registered. Throws-on-conflict is intentional: silent
86
+ * overwrites mask real bugs and the harness only registers specs at
87
+ * startup, so a throw is observable.
88
+ */
89
+ register(name: string, fn: SpecFn): void;
90
+ /** Returns the registered spec function, or undefined. */
91
+ get(name: string): SpecFn | undefined;
92
+ /** True iff `name` is registered. */
93
+ has(name: string): boolean;
94
+ /** All registered names, in registration order. */
95
+ names(): string[];
96
+ }
97
+ /**
98
+ * Create a fresh, empty spec registry. Callers register specs on it
99
+ * explicitly — `registerStarterSpecs()` is the common starting point
100
+ * for fixtures that reuse the helpers; bespoke specs are registered the
101
+ * same way.
102
+ */
103
+ export declare function createSpecRegistry(): SpecRegistry;
104
+ /**
105
+ * Resolve a `SuccessSpecReference` against a registry and evaluate it
106
+ * over a `RunSnapshot`. Returns a `SpecResult`. Never throws — an
107
+ * unregistered name, a thrown spec body, or a returned non-result is
108
+ * surfaced as `{ ok: false, error }`.
109
+ *
110
+ * Async specs are awaited. A valid result from a sync spec is returned unchanged.
111
+ */
112
+ export declare function evaluateSpec(registry: SpecRegistry, reference: SuccessSpecReference, snapshot: RunSnapshot): Promise<SpecResult>;
113
+ //# sourceMappingURL=spec-framework.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"spec-framework.d.ts","sourceRoot":"","sources":["../../src/eval/spec-framework.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAC;AAEzD;;;;;;GAMG;AACH,MAAM,WAAW,WAAW;IAC1B,6CAA6C;IAC7C,cAAc,EAAE,SAAS,CAAC;IAC1B,wEAAwE;IACxE,YAAY,EAAE,YAAY,CAAC;IAC3B,+DAA+D;IAC/D,aAAa,EAAE,MAAM,CAAC;IACtB,kEAAkE;IAClE,KAAK,EAAE,SAAS,UAAU,EAAE,CAAC;CAC9B;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,OAAO,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;GAKG;AACH,MAAM,MAAM,MAAM,GAAG,CAAC,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,KAAK,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAEhG;;;GAGG;AACH,MAAM,WAAW,YAAY;IAC3B;;;;;;OAMG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,IAAI,CAAC;IACzC,0DAA0D;IAC1D,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;IACtC,qCAAqC;IACrC,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC;IAC3B,mDAAmD;IACnD,KAAK,IAAI,MAAM,EAAE,CAAC;CACnB;AAID;;;;;GAKG;AACH,wBAAgB,kBAAkB,IAAI,YAAY,CAwBjD;AAED;;;;;;;GAOG;AACH,wBAAsB,YAAY,CAChC,QAAQ,EAAE,YAAY,EACtB,SAAS,EAAE,oBAAoB,EAC/B,QAAQ,EAAE,WAAW,GACpB,OAAO,CAAC,UAAU,CAAC,CAkBrB"}
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Success-spec framework for the eval harness.
3
+ *
4
+ * A success specification (a "spec") decides whether one captured agent
5
+ * run satisfied its fixture's success criterion. The framework provides
6
+ * three things:
7
+ *
8
+ * 1. A `SpecRegistry` plus a `createSpecRegistry()` factory.
9
+ * Registration is explicit — no side-effect registration on import.
10
+ * Callers build a registry, call `registerStarterSpecs()` if they
11
+ * want the common helpers (see `spec-helpers.ts`), and register
12
+ * their own specs on top.
13
+ *
14
+ * 2. An `evaluateSpec(registry, reference, snapshot)` evaluator that
15
+ * looks the spec up by name, awaits it (specs may be async), and
16
+ * returns a structured `SpecResult`. A thrown error inside the
17
+ * spec body is caught and converted into `{ ok: false, error }`,
18
+ * so a misbehaving spec never crashes the harness.
19
+ *
20
+ * 3. The `RunSnapshot` interface — the input every spec consumes.
21
+ * Intentionally narrower than the runner's eventual `RunRecord`,
22
+ * so this branch develops independently of `eval/harness-runner`.
23
+ * The runner's `RunRecord` is designed to be a structural superset:
24
+ * a runner caller passes its `record` straight to `evaluateSpec`.
25
+ *
26
+ * Specs read four slices of state — the final workspace (rules / code
27
+ * / app source), the final runtime (the most recent run summary, any
28
+ * uiErrors, terminal output, etc.), the full assistant text, and the
29
+ * trace. Anything outside those four fields is a sign the snapshot
30
+ * shape needs widening; do that here, not in spec bodies.
31
+ *
32
+ * Spec names follow the same `family/spec-name` kebab-case form that
33
+ * `validateFixture` enforces on `SuccessSpecReference.name`. Registration
34
+ * validates the name at registration time so typos surface immediately.
35
+ */
36
+ const SPEC_NAME_PATTERN = /^[a-z][a-z0-9-]*\/[a-z][a-z0-9-]*$/; // exactly two segments joined by a single '/'; each starts with a lowercase letter and continues with lowercase letters, digits or hyphens (trailing or doubled hyphens are not rejected)
37
+ /**
38
+ * Create a fresh, empty spec registry. Callers register specs on it
39
+ * explicitly — `registerStarterSpecs()` is the common starting point
40
+ * for fixtures that reuse the helpers; bespoke specs are registered the
41
+ * same way.
42
+ */
43
+ export function createSpecRegistry() {
44
+ const specs = new Map(); // spec name -> spec function; held in this closure, so each registry is independent
45
+ return {
46
+ register(name, fn) {
47
+ if (typeof name !== 'string' || !SPEC_NAME_PATTERN.test(name)) { // reject non-strings and names that fail SPEC_NAME_PATTERN
48
+ throw new Error(`spec name must match \`family/spec-name\` kebab-case, got: ${JSON.stringify(name)}`);
49
+ }
50
+ if (specs.has(name)) { // duplicates are an error, never a silent overwrite
51
+ throw new Error(`spec already registered: ${name}`);
52
+ }
53
+ specs.set(name, fn); // NOTE(review): fn is stored without a typeof check; a non-function would only surface when the spec is evaluated
54
+ },
55
+ get(name) {
56
+ return specs.get(name); // undefined when the name was never registered
57
+ },
58
+ has(name) {
59
+ return specs.has(name);
60
+ },
61
+ names() {
62
+ return Array.from(specs.keys()); // fresh array per call; Map keys iterate in insertion order, i.e. registration order
63
+ },
64
+ };
65
+ }
66
+ /**
67
+ * Resolve a `SuccessSpecReference` against a registry and evaluate it
68
+ * over a `RunSnapshot`. Returns a `SpecResult`. Never throws — an
69
+ * unregistered name, a thrown spec body, or a returned non-result is
70
+ * surfaced as `{ ok: false, error }`.
71
+ *
72
+ * Async specs are awaited. A valid result from a sync spec is returned unchanged.
73
+ */
74
+ export async function evaluateSpec(registry, reference, snapshot) {
75
+ const fn = registry.get(reference.name); // NOTE(review): this lookup runs outside the try below, so a throwing registry.get or a null/undefined reference rejects the returned promise instead of producing { ok: false, error }
76
+ if (!fn) { // unregistered name is reported as a failed result, not an exception
77
+ return { ok: false, error: `spec not registered: ${reference.name}` };
78
+ }
79
+ try {
80
+ const result = await fn(snapshot, reference.args); // await handles sync and async specs alike; a sync throw or a rejection both land in the catch
81
+ if (!isSpecResult(result)) { // guard against specs that return something without a boolean ok
82
+ return {
83
+ ok: false,
84
+ error: `spec "${reference.name}" returned a non-SpecResult value`,
85
+ };
86
+ }
87
+ return result; // valid results are passed through as returned by the spec
88
+ }
89
+ catch (err) {
90
+ const message = err instanceof Error ? err.message : String(err); // non-Error throwables are stringified
91
+ return { ok: false, error: `spec "${reference.name}" threw: ${message}` };
92
+ }
93
+ }
94
+ function isSpecResult(value) { // minimal shape check used by evaluateSpec on whatever a spec returned
95
+ if (value === null || typeof value !== 'object') // typeof null is 'object', hence the explicit null test
96
+ return false;
97
+ const obj = value;
98
+ return typeof obj.ok === 'boolean'; // only ok is checked; detail and error are not inspected
99
+ }
100
+ //# sourceMappingURL=spec-framework.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"spec-framework.js","sourceRoot":"","sources":["../../src/eval/spec-framework.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAoEH,MAAM,iBAAiB,GAAG,oCAAoC,CAAC;AAE/D;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB;IAChC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,OAAO;QACL,QAAQ,CAAC,IAAI,EAAE,EAAE;YACf,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC9D,MAAM,IAAI,KAAK,CACb,8DAA8D,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,CACrF,CAAC;YACJ,CAAC;YACD,IAAI,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBACpB,MAAM,IAAI,KAAK,CAAC,4BAA4B,IAAI,EAAE,CAAC,CAAC;YACtD,CAAC;YACD,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtB,CAAC;QACD,GAAG,CAAC,IAAI;YACN,OAAO,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACzB,CAAC;QACD,GAAG,CAAC,IAAI;YACN,OAAO,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACzB,CAAC;QACD,KAAK;YACH,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;QAClC,CAAC;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAsB,EACtB,SAA+B,EAC/B,QAAqB;IAErB,MAAM,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;IACxC,IAAI,CAAC,EAAE,EAAE,CAAC;QACR,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,wBAAwB,SAAS,CAAC,IAAI,EAAE,EAAE,CAAC;IACxE,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC;QAClD,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,OAAO;gBACL,EAAE,EAAE,KAAK;gBACT,KAAK,EAAE,SAAS,SAAS,CAAC,IAAI,mCAAmC;aAClE,CAAC;QACJ,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QACjE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,SAAS,SAAS,CAAC,IAAI,YAAY,OAAO,EAAE,EAAE,CAAC;IAC5E,CAAC;AACH,CAAC;AAED,SAAS,YAAY,CAAC,KAAc;IAClC,IAAI,KAAK,KAAK,IAAI,IAAI,OAAO,KAAK,KAAK,QAAQ;QAAE,OAAO,KAAK,CAAC;IAC9D,MAAM,GAAG,GAAG,KAAgC,CAAC;IAC7C,OAAO,OAAO,GAAG,CAAC,EAAE,KAAK,SAAS,CAAC;AACrC,CAAC"}
@@ -0,0 +1,245 @@
1
+ /**
2
+ * Starter library of common success specs.
3
+ *
4
+ * These helpers are intentionally crude. Golden-task authors compose
5
+ * them through `SuccessSpecReference.args`; custom specs are written
6
+ * only when crudeness is not enough.
7
+ *
8
+ * Registration is explicit. Callers create a registry and call
9
+ * `registerStarterSpecs(registry)` to get all of the starter helpers under their
10
+ * documented names. No side-effect registration on import — that way
11
+ * a host can choose to register a subset, swap in stricter variants,
12
+ * or shadow a helper without monkey-patching this module.
13
+ *
14
+ * Each helper has a `family/spec-name` kebab-case identifier that
15
+ * matches the form `validateFixture` enforces on
16
+ * `SuccessSpecReference.name`. The constants below are exported so
17
+ * fixture authors and reviewers have a single place to import the
18
+ * canonical strings from.
19
+ *
20
+ * Argument shape and behavior are documented per-spec via JSDoc.
21
+ * The framework treats `args` as `unknown`; each spec validates its
22
+ * own shape and surfaces a clear `error` if the args are malformed,
23
+ * rather than throwing. (The evaluator catches throws too, but a
24
+ * structured error message is friendlier to read.)
25
+ */
26
+ import type { SpecFn, SpecRegistry } from './spec-framework.js';
27
+ export declare const SPEC_REPORT_MENTIONS_AT_LEAST_ONE_OF = "report-mentions/at-least-one-of";
28
+ export declare const SPEC_REPORT_MENTIONS_ALL_OF = "report-mentions/all-of";
29
+ export declare const SPEC_TRACE_CONTAINS_TOOL_CALL_BY_NAME = "trace-contains-tool-call/by-name";
30
+ export declare const SPEC_FINAL_RULES_INCLUDES_LITERAL = "final-rules-includes/literal";
31
+ export declare const SPEC_FINAL_RULES_EXCLUDES_LITERAL = "final-rules-excludes/literal";
32
+ export declare const SPEC_FINAL_RUNTIME_RUN_SUMMARY_OK = "final-runtime/run-summary-ok";
33
+ export declare const SPEC_GAME_RULES_SIMULATOR_ACCEPTS_POSITIVE_AND_REJECTS_CHEAT = "game-rules/simulator-accepts-positive-and-rejects-cheat";
34
+ export declare const SPEC_PYRIC_AGENTS_LINT_CLEAN_AND_RULE_REJECTS_CHEAT = "pyric-agents/lint-clean-and-rule-rejects-cheat";
35
+ /**
36
+ * Args: `{ tokens: string[]; caseSensitive?: boolean }`.
37
+ *
38
+ * Passes when `assistantText` contains at least one of `tokens`.
39
+ * Empty / non-array `tokens` is a malformed-args error.
40
+ *
41
+ * Example fixture reference:
42
+ *
43
+ * {
44
+ * "name": "report-mentions/at-least-one-of",
45
+ * "args": { "tokens": ["open-write", "missing auth check"] }
46
+ * }
47
+ */
48
+ export declare const reportMentionsAtLeastOneOf: SpecFn;
49
+ /**
50
+ * Args: `{ tokens: string[]; caseSensitive?: boolean }`.
51
+ *
52
+ * Passes when `assistantText` contains every entry in `tokens`.
53
+ * Empty / non-array `tokens` is a malformed-args error.
54
+ *
55
+ * Example fixture reference:
56
+ *
57
+ * {
58
+ * "name": "report-mentions/all-of",
59
+ * "args": { "tokens": ["users", "open-write", "fix"] }
60
+ * }
61
+ */
62
+ export declare const reportMentionsAllOf: SpecFn;
63
+ /**
64
+ * Args: `{ tool: string; minCount?: number }`.
65
+ *
66
+ * Passes when the trace contains at least `minCount` (default 1)
67
+ * `tool_call` records emitted by the response trace for a tool named
68
+ * `tool`. Tool calls are read off `llm_response` trace events, which
69
+ * is where the agent loop records what the model asked for in each
70
+ * iteration.
71
+ *
72
+ * Example fixture reference:
73
+ *
74
+ * {
75
+ * "name": "trace-contains-tool-call/by-name",
76
+ * "args": { "tool": "rulesSimulator", "minCount": 1 }
77
+ * }
78
+ */
79
+ export declare const traceContainsToolCallByName: SpecFn;
80
+ /**
81
+ * Args: `{ literal: string; caseSensitive?: boolean }`.
82
+ *
83
+ * Passes when `finalWorkspace.rules` contains `literal` as a substring.
84
+ * `literal` must be a non-empty string.
85
+ *
86
+ * Example fixture reference:
87
+ *
88
+ * {
89
+ * "name": "final-rules-includes/literal",
90
+ * "args": { "literal": "request.auth != null" }
91
+ * }
92
+ */
93
+ export declare const finalRulesIncludesLiteral: SpecFn;
94
+ /**
95
+ * Args: `{ literal: string; caseSensitive?: boolean }`.
96
+ *
97
+ * Passes when `finalWorkspace.rules` does NOT contain `literal` as a
98
+ * substring. Useful for asserting that a planted antipattern has been
99
+ * removed.
100
+ *
101
+ * Example fixture reference:
102
+ *
103
+ * {
104
+ * "name": "final-rules-excludes/literal",
105
+ * "args": { "literal": "allow write: if true" }
106
+ * }
107
+ */
108
+ export declare const finalRulesExcludesLiteral: SpecFn;
109
+ /**
110
+ * Args: none (`undefined` or `{}`).
111
+ *
112
+ * Passes when `finalRuntime.runSummary` exists and `runSummary.ok` is
113
+ * true — i.e. the most recent `runCode` invocation succeeded. Fails
114
+ * when there is no run summary at all (the spec only makes sense for
115
+ * fixtures whose skill is expected to run code).
116
+ *
117
+ * Example fixture reference:
118
+ *
119
+ * { "name": "final-runtime/run-summary-ok" }
120
+ */
121
+ export declare const finalRuntimeRunSummaryOk: SpecFn;
122
+ /**
123
+ * Args: `{ database?: 'firestore' | 'rtdb'; positive: PositiveArgs; cheat: CheatArgs }`.
124
+ *
125
+ * Both-direction check: the generated rules should accept a defined
126
+ * positive move AND reject a defined cheating attempt. The fixture
127
+ * supplies one side under `positive` and the other under `cheat`.
128
+ *
129
+ * V1 approximation: token-presence over `finalWorkspace.rules`. A
130
+ * future iteration would replace this with an actual Firestore /
131
+ * Realtime Database security-rules simulator call, exercising the
132
+ * `positive` move (expecting `allow`) and the `cheat` move
133
+ * (expecting `deny`). Wiring that simulator is out of scope for v1
134
+ * because it requires either the Firebase rules-emulator process
135
+ * (Node-only, slow to start, off-limits in a browser-safe surface)
136
+ * or a bundled WASM rules interpreter — neither of which the eval
137
+ * harness has today.
138
+ *
139
+ * The approximation supports two arg-shapes so it can serve both
140
+ * the brief's documented shape and the simulator-style shape the
141
+ * generative fixtures already use on disk:
142
+ *
143
+ * 1. Token shape (explicit, preferred for future-authored fixtures):
144
+ * positive: { description?: string; requiredTokens: string[] }
145
+ * cheat: { description?: string; rejectionTokens: string[] }
146
+ * Each token list is checked as case-sensitive substrings on
147
+ * `finalWorkspace.rules`.
148
+ *
149
+ * 2. Simulator shape (used by the existing fixtures):
150
+ * positive: { auth, path, op, data, expect: 'allow' }
151
+ * cheat: { auth, path, op, data, expect: 'deny' }
152
+ * Tokens are derived from the simulator-side `data` and `path`
153
+ * values — each string-valued leaf and the path segment are
154
+ * required to appear in the rules text. The expectation field
155
+ * (`allow` / `deny`) is recorded in the detail payload but is
156
+ * not used by the token check; the actual accept/reject
157
+ * decision is what the future simulator iteration would
158
+ * validate.
159
+ *
160
+ * Passes iff every derived positive token AND every derived cheat
161
+ * token appears in `finalWorkspace.rules`. Otherwise returns
162
+ * `{ ok: false, detail: { missingPositive, missingCheat } }`.
163
+ *
164
+ * Example fixture reference (simulator shape):
165
+ *
166
+ * {
167
+ * "name": "game-rules/simulator-accepts-positive-and-rejects-cheat",
168
+ * "args": {
169
+ * "database": "rtdb",
170
+ * "positive": { "auth": { "uid": "uidA" }, "path": "/games/g1",
171
+ * "op": "update", "data": { ... }, "expect": "allow" },
172
+ * "cheat": { "auth": { "uid": "uidB" }, "path": "/games/g1",
173
+ * "op": "update", "data": { ... }, "expect": "deny" }
174
+ * }
175
+ * }
176
+ */
177
+ export declare const gameRulesSimulatorAcceptsPositiveAndRejectsCheat: SpecFn;
178
+ /**
179
+ * Args: `{ lintToolName?: string; cheat?: CheatArgs; cheatAttempt?: CheatArgs }`.
180
+ *
181
+ * Two-part check: (1) the agent successfully called the pyric lint
182
+ * tool during the run, AND (2) the resulting rules text contains the
183
+ * tokens that should be present if the cheating attempt is
184
+ * structurally rejected by the rules.
185
+ *
186
+ * V1 approximation:
187
+ * - Step 1 walks `snapshot.trace` for any `llm_response` event
188
+ * containing a tool call whose name matches `lintToolName`
189
+ * (default `lint_firestore_rules`). If absent, returns
190
+ * `{ ok: false, detail: { reason: 'lint-not-called' } }`. The
191
+ * pyric lint tool's success/failure is observable in the
192
+ * assistant's reasoning and in the tool result, but capturing
193
+ * the precise `tool_result` event shape across providers is
194
+ * fragile — checking that the tool was *called* is the right
195
+ * v1 signal. A future iteration would also verify the lint
196
+ * tool's result was `ok: true` at the trace level.
197
+ * - Step 2 checks `finalWorkspace.rules` includes every token in
198
+ * the cheat's `rejectionTokens`. The cheat may be supplied
199
+ * under either `cheat` (the brief's name) or `cheatAttempt`
200
+ * (the name the existing fixture uses). When the cheat is
201
+ * simulator-shaped (no `rejectionTokens`, only `data` + `path`),
202
+ * tokens are derived from those leaves the same way as
203
+ * `gameRulesSimulatorAcceptsPositiveAndRejectsCheat`.
204
+ *
205
+ * Returns `{ ok: true }` on success, or
206
+ * `{ ok: false, detail: { reason: 'rejection-tokens-missing', missing } }`
207
+ * when step 2 fails (a step-1 failure returns the `lint-not-called` detail instead).
208
+ *
209
+ * Example fixture reference:
210
+ *
211
+ * {
212
+ * "name": "pyric-agents/lint-clean-and-rule-rejects-cheat",
213
+ * "args": {
214
+ * "lintToolName": "lint_firestore_rules",
215
+ * "cheatAttempt": { "path": "/orders/orderA", "op": "create",
216
+ * "data": { ... }, "expect": "deny" }
217
+ * }
218
+ * }
219
+ */
220
+ export declare const pyricAgentsLintCleanAndRuleRejectsCheat: SpecFn;
221
+ /**
222
+ * Register every starter spec on a registry. Idempotency is not a
223
+ * design goal — calling this twice on the same registry throws (the
224
+ * registry rejects duplicate registrations on purpose). Callers that
225
+ * want a subset should call `registry.register()` themselves.
226
+ */
227
+ export declare function registerStarterSpecs(registry: SpecRegistry): void;
228
+ /**
229
+ * Register every custom (post-starter) spec on a registry. Sibling
230
+ * to `registerStarterSpecs`. Splitting the two keeps the meaning of
231
+ * "starter" stable as new custom specs are added.
232
+ */
233
+ export declare function registerCustomSpecs(registry: SpecRegistry): void;
234
+ /**
235
+ * Umbrella that registers every spec the library ships — both the
236
+ * starter library and the custom helpers. Equivalent to calling
237
+ * `registerStarterSpecs(registry)` followed by
238
+ * `registerCustomSpecs(registry)`.
239
+ */
240
+ export declare function registerAllSpecs(registry: SpecRegistry): void;
241
+ /** Stable list of starter spec names, in registration order. */
242
+ export declare const STARTER_SPEC_NAMES: readonly string[];
243
+ /** Stable list of custom spec names, in registration order. */
244
+ export declare const CUSTOM_SPEC_NAMES: readonly string[];
245
+ //# sourceMappingURL=spec-helpers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"spec-helpers.d.ts","sourceRoot":"","sources":["../../src/eval/spec-helpers.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAe,MAAM,EAAE,YAAY,EAAc,MAAM,qBAAqB,CAAC;AAEzF,eAAO,MAAM,oCAAoC,oCAAoC,CAAC;AACtF,eAAO,MAAM,2BAA2B,2BAA2B,CAAC;AACpE,eAAO,MAAM,qCAAqC,qCAAqC,CAAC;AACxF,eAAO,MAAM,iCAAiC,iCAAiC,CAAC;AAChF,eAAO,MAAM,iCAAiC,iCAAiC,CAAC;AAChF,eAAO,MAAM,iCAAiC,iCAAiC,CAAC;AAEhF,eAAO,MAAM,4DAA4D,4DACd,CAAC;AAC5D,eAAO,MAAM,mDAAmD,mDACd,CAAC;AAEnD;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,0BAA0B,EAAE,MAiBxC,CAAC;AAEF;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,mBAAmB,EAAE,MAiBjC,CAAC;AAEF;;;;;;;;;;;;;;;GAeG;AACH,eAAO,MAAM,2BAA2B,EAAE,MAgBzC,CAAC;AAEF;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,yBAAyB,EAAE,MAEvC,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,yBAAyB,EAAE,MAEvC,CAAC;AAEF;;;;;;;;;;;GAWG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAkBtC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsDG;AACH,eAAO,MAAM,gDAAgD,EAAE,MAoB9D,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AACH,eAAO,MAAM,uCAAuC,EAAE,MAqDrD,CAAC;AAEF;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,QAAQ,EAAE,YAAY,GAAG,IAAI,CAOjE;AAED;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,YAAY,GAAG,IAAI,CAShE;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,YAAY,GAAG,IAAI,CAG7D;AAED,gEAAgE;AAChE,eAAO,MAAM,kBAAkB,EAAE,SAAS,MAAM,EAO/C,CAAC;AAEF,+DAA+D;AAC/D,eAAO,MAAM,iBAAiB,EAAE,SAAS,MAAM,EAG9C,CAAC"}