@inbrowser/agent 0.0.0-placeholder → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +270 -0
- package/LICENSE +21 -0
- package/README.md +117 -2
- package/bin/agent.ts +10 -0
- package/dist/cli/commands/describe.d.ts +14 -0
- package/dist/cli/commands/describe.d.ts.map +1 -0
- package/dist/cli/commands/describe.js +179 -0
- package/dist/cli/commands/describe.js.map +1 -0
- package/dist/cli/commands/events.d.ts +21 -0
- package/dist/cli/commands/events.d.ts.map +1 -0
- package/dist/cli/commands/events.js +59 -0
- package/dist/cli/commands/events.js.map +1 -0
- package/dist/cli/commands/fleet.d.ts +15 -0
- package/dist/cli/commands/fleet.d.ts.map +1 -0
- package/dist/cli/commands/fleet.js +149 -0
- package/dist/cli/commands/fleet.js.map +1 -0
- package/dist/cli/commands/help.d.ts +15 -0
- package/dist/cli/commands/help.d.ts.map +1 -0
- package/dist/cli/commands/help.js +93 -0
- package/dist/cli/commands/help.js.map +1 -0
- package/dist/cli/commands/migrate.d.ts +27 -0
- package/dist/cli/commands/migrate.d.ts.map +1 -0
- package/dist/cli/commands/migrate.js +109 -0
- package/dist/cli/commands/migrate.js.map +1 -0
- package/dist/cli/commands/run.d.ts +38 -0
- package/dist/cli/commands/run.d.ts.map +1 -0
- package/dist/cli/commands/run.js +535 -0
- package/dist/cli/commands/run.js.map +1 -0
- package/dist/cli/commands/schema.d.ts +8 -0
- package/dist/cli/commands/schema.d.ts.map +1 -0
- package/dist/cli/commands/schema.js +12 -0
- package/dist/cli/commands/schema.js.map +1 -0
- package/dist/cli/commands/serve.d.ts +39 -0
- package/dist/cli/commands/serve.d.ts.map +1 -0
- package/dist/cli/commands/serve.js +65 -0
- package/dist/cli/commands/serve.js.map +1 -0
- package/dist/cli/commands/undo.d.ts +36 -0
- package/dist/cli/commands/undo.d.ts.map +1 -0
- package/dist/cli/commands/undo.js +132 -0
- package/dist/cli/commands/undo.js.map +1 -0
- package/dist/cli/fixtures.d.ts +17 -0
- package/dist/cli/fixtures.d.ts.map +1 -0
- package/dist/cli/fixtures.js +107 -0
- package/dist/cli/fixtures.js.map +1 -0
- package/dist/cli/hardening.d.ts +39 -0
- package/dist/cli/hardening.d.ts.map +1 -0
- package/dist/cli/hardening.js +68 -0
- package/dist/cli/hardening.js.map +1 -0
- package/dist/cli/index.d.ts +28 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +19 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/llm/openrouter.d.ts +33 -0
- package/dist/cli/llm/openrouter.d.ts.map +1 -0
- package/dist/cli/llm/openrouter.js +285 -0
- package/dist/cli/llm/openrouter.js.map +1 -0
- package/dist/cli/main.d.ts +32 -0
- package/dist/cli/main.d.ts.map +1 -0
- package/dist/cli/main.js +106 -0
- package/dist/cli/main.js.map +1 -0
- package/dist/cli/output.d.ts +36 -0
- package/dist/cli/output.d.ts.map +1 -0
- package/dist/cli/output.js +95 -0
- package/dist/cli/output.js.map +1 -0
- package/dist/cli/parse.d.ts +26 -0
- package/dist/cli/parse.d.ts.map +1 -0
- package/dist/cli/parse.js +160 -0
- package/dist/cli/parse.js.map +1 -0
- package/dist/cli/session-log.d.ts +34 -0
- package/dist/cli/session-log.d.ts.map +1 -0
- package/dist/cli/session-log.js +52 -0
- package/dist/cli/session-log.js.map +1 -0
- package/dist/cli/spec.d.ts +62 -0
- package/dist/cli/spec.d.ts.map +1 -0
- package/dist/cli/spec.js +510 -0
- package/dist/cli/spec.js.map +1 -0
- package/dist/cli/ui/RunView.d.ts +134 -0
- package/dist/cli/ui/RunView.d.ts.map +1 -0
- package/dist/cli/ui/RunView.js +341 -0
- package/dist/cli/ui/RunView.js.map +1 -0
- package/dist/diagnostics/index.d.ts +5 -0
- package/dist/diagnostics/index.d.ts.map +1 -0
- package/dist/diagnostics/index.js +3 -0
- package/dist/diagnostics/index.js.map +1 -0
- package/dist/diagnostics/timing.d.ts +48 -0
- package/dist/diagnostics/timing.d.ts.map +1 -0
- package/dist/diagnostics/timing.js +85 -0
- package/dist/diagnostics/timing.js.map +1 -0
- package/dist/diagnostics/truthfulness.d.ts +36 -0
- package/dist/diagnostics/truthfulness.d.ts.map +1 -0
- package/dist/diagnostics/truthfulness.js +180 -0
- package/dist/diagnostics/truthfulness.js.map +1 -0
- package/dist/dispatch-memoization.d.ts +84 -0
- package/dist/dispatch-memoization.d.ts.map +1 -0
- package/dist/dispatch-memoization.js +197 -0
- package/dist/dispatch-memoization.js.map +1 -0
- package/dist/eval/comparison-report.d.ts +164 -0
- package/dist/eval/comparison-report.d.ts.map +1 -0
- package/dist/eval/comparison-report.js +316 -0
- package/dist/eval/comparison-report.js.map +1 -0
- package/dist/eval/fixture.d.ts +74 -0
- package/dist/eval/fixture.d.ts.map +1 -0
- package/dist/eval/fixture.js +217 -0
- package/dist/eval/fixture.js.map +1 -0
- package/dist/eval/index.d.ts +13 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +7 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/load-node.d.ts +16 -0
- package/dist/eval/load-node.d.ts.map +1 -0
- package/dist/eval/load-node.js +58 -0
- package/dist/eval/load-node.js.map +1 -0
- package/dist/eval/metric-collector.d.ts +209 -0
- package/dist/eval/metric-collector.d.ts.map +1 -0
- package/dist/eval/metric-collector.js +293 -0
- package/dist/eval/metric-collector.js.map +1 -0
- package/dist/eval/run-record.d.ts +76 -0
- package/dist/eval/run-record.d.ts.map +1 -0
- package/dist/eval/run-record.js +32 -0
- package/dist/eval/run-record.js.map +1 -0
- package/dist/eval/runner.d.ts +140 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +310 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/eval/spec-framework.d.ts +113 -0
- package/dist/eval/spec-framework.d.ts.map +1 -0
- package/dist/eval/spec-framework.js +100 -0
- package/dist/eval/spec-framework.js.map +1 -0
- package/dist/eval/spec-helpers.d.ts +245 -0
- package/dist/eval/spec-helpers.d.ts.map +1 -0
- package/dist/eval/spec-helpers.js +605 -0
- package/dist/eval/spec-helpers.js.map +1 -0
- package/dist/events/codec.d.ts +79 -0
- package/dist/events/codec.d.ts.map +1 -0
- package/dist/events/codec.js +142 -0
- package/dist/events/codec.js.map +1 -0
- package/dist/events/log-core.d.ts +76 -0
- package/dist/events/log-core.d.ts.map +1 -0
- package/dist/events/log-core.js +73 -0
- package/dist/events/log-core.js.map +1 -0
- package/dist/events/log.d.ts +60 -0
- package/dist/events/log.d.ts.map +1 -0
- package/dist/events/log.js +193 -0
- package/dist/events/log.js.map +1 -0
- package/dist/events/replay.d.ts +106 -0
- package/dist/events/replay.d.ts.map +1 -0
- package/dist/events/replay.js +137 -0
- package/dist/events/replay.js.map +1 -0
- package/dist/events/wrap.d.ts +100 -0
- package/dist/events/wrap.d.ts.map +1 -0
- package/dist/events/wrap.js +141 -0
- package/dist/events/wrap.js.map +1 -0
- package/dist/index.d.ts +73 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +47 -0
- package/dist/index.js.map +1 -0
- package/dist/llm-adapter.d.ts +96 -0
- package/dist/llm-adapter.d.ts.map +1 -0
- package/dist/llm-adapter.js +132 -0
- package/dist/llm-adapter.js.map +1 -0
- package/dist/mcp/serve.d.ts +70 -0
- package/dist/mcp/serve.d.ts.map +1 -0
- package/dist/mcp/serve.js +154 -0
- package/dist/mcp/serve.js.map +1 -0
- package/dist/metrics/runs.d.ts +58 -0
- package/dist/metrics/runs.d.ts.map +1 -0
- package/dist/metrics/runs.js +99 -0
- package/dist/metrics/runs.js.map +1 -0
- package/dist/metrics.d.ts +38 -0
- package/dist/metrics.d.ts.map +1 -0
- package/dist/metrics.js +123 -0
- package/dist/metrics.js.map +1 -0
- package/dist/node.d.ts +23 -0
- package/dist/node.d.ts.map +1 -0
- package/dist/node.js +23 -0
- package/dist/node.js.map +1 -0
- package/dist/planner-executor.d.ts +132 -0
- package/dist/planner-executor.d.ts.map +1 -0
- package/dist/planner-executor.js +274 -0
- package/dist/planner-executor.js.map +1 -0
- package/dist/session.d.ts +10 -0
- package/dist/session.d.ts.map +1 -0
- package/dist/session.js +179 -0
- package/dist/session.js.map +1 -0
- package/dist/skill-catalog.d.ts +81 -0
- package/dist/skill-catalog.d.ts.map +1 -0
- package/dist/skill-catalog.js +388 -0
- package/dist/skill-catalog.js.map +1 -0
- package/dist/skill-router.d.ts +95 -0
- package/dist/skill-router.d.ts.map +1 -0
- package/dist/skill-router.js +130 -0
- package/dist/skill-router.js.map +1 -0
- package/dist/storage.d.ts +14 -0
- package/dist/storage.d.ts.map +1 -0
- package/dist/storage.js +58 -0
- package/dist/storage.js.map +1 -0
- package/dist/strategy.d.ts +45 -0
- package/dist/strategy.d.ts.map +1 -0
- package/dist/strategy.js +520 -0
- package/dist/strategy.js.map +1 -0
- package/dist/tools.d.ts +40 -0
- package/dist/tools.d.ts.map +1 -0
- package/dist/tools.js +147 -0
- package/dist/tools.js.map +1 -0
- package/dist/types/agent.d.ts +94 -0
- package/dist/types/agent.d.ts.map +1 -0
- package/dist/types/agent.js +17 -0
- package/dist/types/agent.js.map +1 -0
- package/dist/types/capabilities.d.ts +17 -0
- package/dist/types/capabilities.d.ts.map +1 -0
- package/dist/types/capabilities.js +13 -0
- package/dist/types/capabilities.js.map +1 -0
- package/dist/types/chat.d.ts +74 -0
- package/dist/types/chat.d.ts.map +1 -0
- package/dist/types/chat.js +10 -0
- package/dist/types/chat.js.map +1 -0
- package/dist/types/events.d.ts +115 -0
- package/dist/types/events.d.ts.map +1 -0
- package/dist/types/events.js +30 -0
- package/dist/types/events.js.map +1 -0
- package/dist/types/llm.d.ts +89 -0
- package/dist/types/llm.d.ts.map +1 -0
- package/dist/types/llm.js +12 -0
- package/dist/types/llm.js.map +1 -0
- package/dist/types/metrics.d.ts +34 -0
- package/dist/types/metrics.d.ts.map +1 -0
- package/dist/types/metrics.js +10 -0
- package/dist/types/metrics.js.map +1 -0
- package/dist/types/observer.d.ts +41 -0
- package/dist/types/observer.d.ts.map +1 -0
- package/dist/types/observer.js +41 -0
- package/dist/types/observer.js.map +1 -0
- package/dist/types/project-context.d.ts +18 -0
- package/dist/types/project-context.d.ts.map +1 -0
- package/dist/types/project-context.js +11 -0
- package/dist/types/project-context.js.map +1 -0
- package/dist/types/runtime.d.ts +71 -0
- package/dist/types/runtime.d.ts.map +1 -0
- package/dist/types/runtime.js +21 -0
- package/dist/types/runtime.js.map +1 -0
- package/dist/types/session.d.ts +103 -0
- package/dist/types/session.d.ts.map +1 -0
- package/dist/types/session.js +11 -0
- package/dist/types/session.js.map +1 -0
- package/dist/types/storage.d.ts +20 -0
- package/dist/types/storage.d.ts.map +1 -0
- package/dist/types/storage.js +41 -0
- package/dist/types/storage.js.map +1 -0
- package/dist/types/strategy.d.ts +124 -0
- package/dist/types/strategy.d.ts.map +1 -0
- package/dist/types/strategy.js +10 -0
- package/dist/types/strategy.js.map +1 -0
- package/dist/types/tools.d.ts +154 -0
- package/dist/types/tools.d.ts.map +1 -0
- package/dist/types/tools.js +11 -0
- package/dist/types/tools.js.map +1 -0
- package/dist/types/trace.d.ts +175 -0
- package/dist/types/trace.d.ts.map +1 -0
- package/dist/types/trace.js +26 -0
- package/dist/types/trace.js.map +1 -0
- package/dist/types/workspace.d.ts +29 -0
- package/dist/types/workspace.d.ts.map +1 -0
- package/dist/types/workspace.js +18 -0
- package/dist/types/workspace.js.map +1 -0
- package/package.json +45 -14
- package/skills/agent-cli.md +218 -0
- package/index.js +0 -2
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `RunRecord` — the eval harness's per-trial capture envelope.
|
|
3
|
+
*
|
|
4
|
+
* One `RunRecord` is produced for each `runFixture` call. It is the
|
|
5
|
+
* stable contract every downstream phase-one branch consumes:
|
|
6
|
+
*
|
|
7
|
+
* - `eval/metric-collector` derives the eight metrics from a record.
|
|
8
|
+
* - `eval/comparison-report` aggregates records across trials and
|
|
9
|
+
* compares two sets.
|
|
10
|
+
* - `eval/success-spec-framework` uses `finalWorkspace`, `trace`,
|
|
11
|
+
* and `assistantText` as inputs to its spec functions.
|
|
12
|
+
*
|
|
13
|
+
* The shape is intentionally narrow and frozen-in-place: everything
|
|
14
|
+
* the metric collector needs already lives elsewhere (the trace
|
|
15
|
+
* carries usage, tool calls, and turn boundaries; the final
|
|
16
|
+
* workspace + runtime describe the agent-visible end state). The
|
|
17
|
+
* runner does not aggregate, does not derive metrics, and does not
|
|
18
|
+
* decide pass/fail. Those are downstream concerns by design.
|
|
19
|
+
*
|
|
20
|
+
* Browser-safe — no Node imports. The runner that produces these is
|
|
21
|
+
* also browser-safe in v1; a future on-disk persistence helper would
|
|
22
|
+
* live behind `@inbrowser/agent/node`.
|
|
23
|
+
*
|
|
24
|
+
* Note on naming: `@inbrowser/agent` also exports an unrelated
|
|
25
|
+
* `RunRecord` from `metrics/runs.ts` (a per-MCP-tool-call NDJSON
|
|
26
|
+
* record). To avoid breaking that public surface, the eval-side type
|
|
27
|
+
* is re-exported from the package root as `EvalRunRecord`. Within the
|
|
28
|
+
* eval barrel and inside this branch, the natural `RunRecord` name is
|
|
29
|
+
* used.
|
|
30
|
+
*/
|
|
31
|
+
import type { RuntimeState } from '../types/runtime.js';
|
|
32
|
+
import type { TraceEvent } from '../types/trace.js';
|
|
33
|
+
import type { Workspace } from '../types/workspace.js';
|
|
34
|
+
import type { TaskFixture } from './fixture.js';
|
|
35
|
+
export interface RunRecord {
|
|
36
|
+
/** The input fixture, echoed verbatim so a record is
|
|
37
|
+
* self-describing and a downstream reader does not have to
|
|
38
|
+
* cross-reference an external fixture id to know what was run. */
|
|
39
|
+
fixture: TaskFixture;
|
|
40
|
+
/** Zero-indexed trial number for this fixture. When a batch driver
|
|
41
|
+
* runs N trials per fixture, the i-th trial carries `trial: i`. `runFixture` records 0 when its input omits `trial`. */
|
|
42
|
+
trial: number;
|
|
43
|
+
/** Every trace event emitted by the strategy during this run, in
|
|
44
|
+
* emission order. Includes `llm_request`, `llm_response`, and
|
|
45
|
+
* `turn_dispatch_complete` events. Empty when no events were
|
|
46
|
+
* emitted before an early abort. */
|
|
47
|
+
trace: TraceEvent[];
|
|
48
|
+
/** The `Workspace` value at the end of the run. Equal to the
|
|
49
|
+
* fixture-seeded workspace when no tool produced a workspace
|
|
50
|
+
* patch. Frozen by the session, so safe to share by reference. */
|
|
51
|
+
finalWorkspace: Workspace;
|
|
52
|
+
/** The `RuntimeState` at the end of the run. Same freezing
|
|
53
|
+
* guarantees as `finalWorkspace`. */
|
|
54
|
+
finalRuntime: RuntimeState;
|
|
55
|
+
/** Concatenated assistant text across every turn in this run. A
|
|
56
|
+
* convenience for spec evaluators that match against assistant
|
|
57
|
+
* output — they do not have to re-walk the trace. Empty (`''`) when the runner fails to create the session. */
|
|
58
|
+
assistantText: string;
|
|
59
|
+
/** Wall-clock time at run start, in epoch ms (the runner reads `Date.now()`). Captured
|
|
60
|
+
* before it submits the prompt. */
|
|
61
|
+
startedAt: number;
|
|
62
|
+
/** Wall-clock ms at run completion (or termination). Captured by
|
|
63
|
+
* the runner immediately after the session event stream drains or
|
|
64
|
+
* the run is aborted. Presumably the same `Date.now()` clock as `startedAt` — confirm in the runner. */
|
|
65
|
+
completedAt: number;
|
|
66
|
+
/** Optional seed echoed from `runFixture`'s input. Threaded for
|
|
67
|
+
* traceability only — v1 does not enforce determinism on
|
|
68
|
+
* strategies or LLM clients. */
|
|
69
|
+
seed?: number;
|
|
70
|
+
/** `null` on a clean finish. A string when the run terminated for
|
|
71
|
+
* any non-success reason: external abort, max wall-clock
|
|
72
|
+
* exceeded, an unexpected exception thrown by the session, or a
|
|
73
|
+
* session-emitted `error` event. */
|
|
74
|
+
error: string | null;
|
|
75
|
+
}
|
|
76
|
+
//# sourceMappingURL=run-record.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run-record.d.ts","sourceRoot":"","sources":["../../src/eval/run-record.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAEhD,MAAM,WAAW,SAAS;IACxB;;uEAEmE;IACnE,OAAO,EAAE,WAAW,CAAC;IACrB;wEACoE;IACpE,KAAK,EAAE,MAAM,CAAC;IACd;;;yCAGqC;IACrC,KAAK,EAAE,UAAU,EAAE,CAAC;IACpB;;uEAEmE;IACnE,cAAc,EAAE,SAAS,CAAC;IAC1B;0CACsC;IACtC,YAAY,EAAE,YAAY,CAAC;IAC3B;;0DAEsD;IACtD,aAAa,EAAE,MAAM,CAAC;IACtB;wCACoC;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB;;8BAE0B;IAC1B,WAAW,EAAE,MAAM,CAAC;IACpB;;qCAEiC;IACjC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;yCAGqC;IACrC,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `RunRecord` — the eval harness's per-trial capture envelope.
|
|
3
|
+
*
|
|
4
|
+
* One `RunRecord` is produced for each `runFixture` call. It is the
|
|
5
|
+
* stable contract every downstream phase-one branch consumes:
|
|
6
|
+
*
|
|
7
|
+
* - `eval/metric-collector` derives the eight metrics from a record.
|
|
8
|
+
* - `eval/comparison-report` aggregates records across trials and
|
|
9
|
+
* compares two sets.
|
|
10
|
+
* - `eval/success-spec-framework` uses `finalWorkspace`, `trace`,
|
|
11
|
+
* and `assistantText` as inputs to its spec functions.
|
|
12
|
+
*
|
|
13
|
+
* The shape is intentionally narrow and frozen-in-place: everything
|
|
14
|
+
* the metric collector needs already lives elsewhere (the trace
|
|
15
|
+
* carries usage, tool calls, and turn boundaries; the final
|
|
16
|
+
* workspace + runtime describe the agent-visible end state). The
|
|
17
|
+
* runner does not aggregate, does not derive metrics, and does not
|
|
18
|
+
* decide pass/fail. Those are downstream concerns by design.
|
|
19
|
+
*
|
|
20
|
+
* Browser-safe — no Node imports. The runner that produces these is
|
|
21
|
+
* also browser-safe in v1; a future on-disk persistence helper would
|
|
22
|
+
* live behind `@inbrowser/agent/node`.
|
|
23
|
+
*
|
|
24
|
+
* Note on naming: `@inbrowser/agent` also exports an unrelated
|
|
25
|
+
* `RunRecord` from `metrics/runs.ts` (a per-MCP-tool-call NDJSON
|
|
26
|
+
* record). To avoid breaking that public surface, the eval-side type
|
|
27
|
+
* is re-exported from the package root as `EvalRunRecord`. Within the
|
|
28
|
+
* eval barrel and inside this branch, the natural `RunRecord` name is
|
|
29
|
+
* used.
|
|
30
|
+
*/
|
|
31
|
+
export {}; // Type-only module: this file carries no runtime exports, only the ES-module marker.
|
|
32
|
+
//# sourceMappingURL=run-record.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run-record.js","sourceRoot":"","sources":["../../src/eval/run-record.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG"}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `runFixture` + `runFixtures` — the eval harness's session driver.
|
|
3
|
+
*
|
|
4
|
+
* Given a `TaskFixture`, an `LlmClient`, and a tool surface, the
|
|
5
|
+
* runner:
|
|
6
|
+
*
|
|
7
|
+
* 1. Builds the seeded `Workspace` from `EMPTY_WORKSPACE +
|
|
8
|
+
* fixture.initialState`.
|
|
9
|
+
* 2. Threads that workspace through the system prompt builder and
|
|
10
|
+
* the per-call `ToolContext` so the agent sees the seeded
|
|
11
|
+
* state.
|
|
12
|
+
* 3. Plugs a buffering `Tracer` into the session so every
|
|
13
|
+
* `TraceEvent` (request, response, dispatch-complete) lands in
|
|
14
|
+
* the captured record.
|
|
15
|
+
* 4. Submits the fixture's prompt and drains the session event
|
|
16
|
+
* stream, shadow-tracking workspace + runtime patches as they
|
|
17
|
+
* flow back through `tool_finished` events.
|
|
18
|
+
* 5. Returns a `RunRecord` with the trace, the final workspace +
|
|
19
|
+
* runtime, the concatenated assistant text, timing, and a
|
|
20
|
+
* `error` slot that is `null` on a clean finish or a string on
|
|
21
|
+
* any non-success outcome.
|
|
22
|
+
*
|
|
23
|
+
* Errors never throw out of `runFixture`. An aborted run, a
|
|
24
|
+
* wall-clock cap exceeded, a session-emitted `error` event, or an
|
|
25
|
+
* unexpected exception all resolve to a `RunRecord` with `error` set
|
|
26
|
+
* and a partial trace.
|
|
27
|
+
*
|
|
28
|
+
* The runner is browser-safe — no `node:*` imports. Persistence
|
|
29
|
+
* helpers (if anyone wants them) belong in
|
|
30
|
+
* `runner-persistence-node.ts` and route through `src/node.ts`. That
|
|
31
|
+
* file does not exist in v1 by design; downstream consumers can
|
|
32
|
+
* persist in-memory records however they like.
|
|
33
|
+
*/
|
|
34
|
+
import type { LlmClient } from '../types/llm.js';
|
|
35
|
+
import type { RuntimeState } from '../types/runtime.js';
|
|
36
|
+
import type { AgentStrategy } from '../types/strategy.js';
|
|
37
|
+
import type { ToolDispatch, ToolHandler } from '../types/tools.js';
|
|
38
|
+
import { type Workspace } from '../types/workspace.js';
|
|
39
|
+
import { type TaskFixture } from './fixture.js';
|
|
40
|
+
import type { RunRecord } from './run-record.js';
|
|
41
|
+
/**
|
|
42
|
+
* Input contract for `runFixture`. The caller supplies the fixture,
|
|
43
|
+
* the LLM client, and the tool surface; the runner owns everything
|
|
44
|
+
* else.
|
|
45
|
+
*/
|
|
46
|
+
export interface RunFixtureInput {
|
|
47
|
+
/** The task fixture to run; its `initialState` seeds the workspace and the fixture is echoed onto `RunRecord.fixture`. */ fixture: TaskFixture;
|
|
48
|
+
/** The chat client the strategy will drive. Tests pass a canned
|
|
49
|
+
* stub; production code wires the real provider adapter. */
|
|
50
|
+
llm: LlmClient;
|
|
51
|
+
/** Tool dispatcher the session uses to execute tool calls. Tests
|
|
52
|
+
* can pass `createDispatch(createToolRegistry())` for the no-tool
|
|
53
|
+
* path. */
|
|
54
|
+
tools: ToolDispatch;
|
|
55
|
+
/** Tool handlers the LLM should see this run. Empty when the
|
|
56
|
+
* fixture is exercised against a no-tool baseline. */
|
|
57
|
+
toolList: ToolHandler[];
|
|
58
|
+
/** Optional strategy override. Defaults to a fresh
|
|
59
|
+
* `createReactLoopStrategy()`. */
|
|
60
|
+
strategy?: AgentStrategy;
|
|
61
|
+
/** Zero-indexed trial number. Defaults to 0. The batch driver
|
|
62
|
+
* passes the trial it is currently on. */
|
|
63
|
+
trial?: number;
|
|
64
|
+
/** Optional external abort signal. Wired into the session's
|
|
65
|
+
* `submit()` call. The runner passes it to `linkSignals` together with its own abort controller's signal and hands the result to tools via the tool context. */
|
|
66
|
+
signal?: AbortSignal;
|
|
67
|
+
/** Optional wall-clock cap in ms. When the run exceeds this, the
|
|
68
|
+
* runner aborts the session and resolves with `error` set. Absent
|
|
69
|
+
* (or anything other than a positive number) means no cap. */
|
|
70
|
+
maxWallClockMs?: number;
|
|
71
|
+
/** Optional seed echoed back through the `RunRecord` for
|
|
72
|
+
* traceability. Not consumed by the runner; strategies + LLM
|
|
73
|
+
* clients decide whether to honor it. */
|
|
74
|
+
seed?: number;
|
|
75
|
+
/** Optional system-prompt builder. Defaults to the canonical
|
|
76
|
+
* fixture-aware prompt — see `defaultSystemPromptBuilder` for the
|
|
77
|
+
* shape. The runner invokes it with its fixture-seeded shadow workspace and runtime, not the session's own values. */
|
|
78
|
+
systemPromptBuilder?: (workspace: Workspace, runtime: RuntimeState) => string;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Drive a single `TaskFixture` end-to-end and resolve with a
|
|
82
|
+
* `RunRecord` describing the captured run. Never throws.
* Failures are meant to surface through `RunRecord.error` rather than a rejection.
* NOTE(review): in the compiled runner, workspace seeding, signal linking, and
* default-strategy creation run before its `try` block, so an exception from
* any of those steps would reject the returned promise — confirm this is intended.
*
* @param input - fixture, LLM client, tool surface, and optional run knobs.
* @returns a promise for the captured `RunRecord`.
|
|
83
|
+
*/
|
|
84
|
+
export declare function runFixture(input: RunFixtureInput): Promise<RunRecord>;
|
|
85
|
+
/**
|
|
86
|
+
* Options for `runFixtures`. Sequential by default; `parallelism` is
|
|
87
|
+
* accepted for forward-compatibility but v1 honors only `1`. Records
|
|
88
|
+
* are returned in input order regardless.
|
|
89
|
+
*/
|
|
90
|
+
export interface RunFixturesOptions {
|
|
91
|
+
/** Number of trials per fixture. Defaults to 1. Trials are numbered from 0. */
|
|
92
|
+
trials?: number;
|
|
93
|
+
/** Forward-compatibility hint. v1 always runs sequentially. A
|
|
94
|
+
* future implementation may honor values > 1; downstream branches
|
|
95
|
+
* should pass `1` (or omit) until then. */
|
|
96
|
+
parallelism?: number;
|
|
97
|
+
/** Optional external abort signal. Forwarded to every `runFixture`
|
|
98
|
+
* call; an abort short-circuits the remaining trials. */
|
|
99
|
+
signal?: AbortSignal;
|
|
100
|
+
/** Optional wall-clock cap per trial, in ms. Forwarded as-is; `runFixture` only arms the cap for positive numbers. */
|
|
101
|
+
maxWallClockMs?: number;
|
|
102
|
+
/** Optional seed factory. Called once per (fixture, trial). The
|
|
103
|
+
* return value is echoed onto the produced `RunRecord.seed`. */
|
|
104
|
+
seed?: (fixture: TaskFixture, trial: number) => number;
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Static dependencies that every fixture in the batch shares — the
|
|
108
|
+
* LLM client, the tool dispatch, the tool list, and (optionally) a
|
|
109
|
+
* strategy factory. Kept separate from `RunFixturesOptions` so the
|
|
110
|
+
* caller can vary trial-count knobs without re-supplying the heavy
|
|
111
|
+
* dependencies.
|
|
112
|
+
*/
|
|
113
|
+
export interface RunFixturesDeps {
|
|
114
|
+
/** LLM client shared by every fixture in the batch (presumably forwarded as `RunFixtureInput.llm` — confirm in `runFixtures`). */ llm: LlmClient;
|
|
115
|
+
/** Tool dispatch shared by every fixture in the batch. */ tools: ToolDispatch;
|
|
116
|
+
/** Tool list shared by every fixture in the batch. */ toolList: ToolHandler[];
|
|
117
|
+
/** Optional strategy factory — called once per trial so each trial
|
|
118
|
+
* gets a fresh strategy. Defaults to `createReactLoopStrategy()`. */
|
|
119
|
+
strategy?: () => AgentStrategy;
|
|
120
|
+
/** Optional system-prompt builder forwarded to every trial. */
|
|
121
|
+
systemPromptBuilder?: (workspace: Workspace, runtime: RuntimeState) => string;
|
|
122
|
+
}
|
|
123
|
+
/**
|
|
124
|
+
* Drive every fixture in `fixtures` for `options.trials` trials each (default 1).
|
|
125
|
+
* Returns records in `(fixture, trial)` order — the i-th fixture's
|
|
126
|
+
* trials come before the (i+1)-th fixture's trials, and within each
|
|
127
|
+
* fixture trial 0 comes before trial 1, etc.
|
|
128
|
+
*
|
|
129
|
+
* Sequential by design; see `RunFixturesOptions.parallelism`. The
|
|
130
|
+
* eval harness's job is reproducible measurement, not throughput.
*
* @param fixtures - fixtures to run, in the order their records should appear.
* @param deps - dependencies shared by the whole batch (LLM client, tool dispatch, tool list, optional strategy factory and prompt builder).
* @param options - optional trial count, abort signal, per-trial wall-clock cap, and seed factory.
* @returns a promise for the produced `RunRecord`s in `(fixture, trial)` order.
|
|
131
|
+
*/
|
|
132
|
+
export declare function runFixtures(fixtures: readonly TaskFixture[], deps: RunFixturesDeps, options?: RunFixturesOptions): Promise<RunRecord[]>;
|
|
133
|
+
/**
|
|
134
|
+
* Default system prompt builder used when the caller does not supply
|
|
135
|
+
* one (`runFixture` falls back to it when `systemPromptBuilder` is absent). Intentionally minimal — production callers should pass their
|
|
136
|
+
* own. The eval harness uses this only so a test can run end-to-end
|
|
137
|
+
* without wiring a real prompt.
*
* @param workspace - workspace handed over by the caller; `runFixture` passes its fixture-seeded shadow workspace.
* @param runtime - runtime state handed over by the caller; `runFixture` passes its shadow runtime.
* @returns the system prompt string.
|
|
138
|
+
*/
|
|
139
|
+
export declare function defaultSystemPromptBuilder(workspace: Workspace, runtime: RuntimeState): string;
|
|
140
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAKH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AAGxD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAC1D,OAAO,KAAK,EAAe,YAAY,EAAE,WAAW,EAAc,MAAM,mBAAmB,CAAC;AAE5F,OAAO,EAAmB,KAAK,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACxE,OAAO,EAAE,KAAK,WAAW,EAA2B,MAAM,cAAc,CAAC;AACzE,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAEjD;;;;GAIG;AACH,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,WAAW,CAAC;IACrB;iEAC6D;IAC7D,GAAG,EAAE,SAAS,CAAC;IACf;;gBAEY;IACZ,KAAK,EAAE,YAAY,CAAC;IACpB;2DACuD;IACvD,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB;uCACmC;IACnC,QAAQ,CAAC,EAAE,aAAa,CAAC;IACzB;+CAC2C;IAC3C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;2BACuB;IACvB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB;;wBAEoB;IACpB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;8CAE0C;IAC1C,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;iBAEa;IACb,mBAAmB,CAAC,EAAE,CAAC,SAAS,EAAE,SAAS,EAAE,OAAO,EAAE,YAAY,KAAK,MAAM,CAAC;CAC/E;AAED;;;GAGG;AACH,wBAAsB,UAAU,CAAC,KAAK,EAAE,eAAe,GAAG,OAAO,CAAC,SAAS,CAAC,CA0I3E;AAED;;;;GAIG;AACH,MAAM,WAAW,kBAAkB;IACjC,mDAAmD;IACnD,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB;;gDAE4C;IAC5C,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;8DAC0D;IAC1D,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,0DAA0D;IAC1D,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;qEACiE;IACjE,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,MAAM,KAAK,MAAM,CAAC;CACxD;AAED;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,SAAS,CAAC;IACf,KAAK,EAAE,YAAY,CAAC;IACpB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB;0EACsE;IACtE,QAAQ,CAAC,EAAE,MAAM,aAAa,CAAC;IAC/B,+DAA+D;IAC/D,mBAAmB,CAAC,EAAE,CAAC,SAAS,EAAE,SAAS,EAAE,OAAO,EAAE,YAAY,KAAK,MAAM,CAAC;CAC/E;AAED;;;;;;;;GAQG;AACH,wBAAsB,WAAW,CAC/B,QAAQ,EAAE,SAAS,WAAW,EAAE,EAChC,IAAI,EAAE,eAAe,EACrB,OAAO,GAAE,kBAAuB,GAC/B,OAAO,CAAC,SAAS,EAAE,CAAC,CAwCtB;AAED;;;;;GAKG;AACH,wBAAgB,0BAA0B,CAAC,SAAS,EAAE,SAAS,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,CAO9F"}
|
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `runFixture` + `runFixtures` — the eval harness's session driver.
|
|
3
|
+
*
|
|
4
|
+
* Given a `TaskFixture`, an `LlmClient`, and a tool surface, the
|
|
5
|
+
* runner:
|
|
6
|
+
*
|
|
7
|
+
* 1. Builds the seeded `Workspace` from `EMPTY_WORKSPACE +
|
|
8
|
+
* fixture.initialState`.
|
|
9
|
+
* 2. Threads that workspace through the system prompt builder and
|
|
10
|
+
* the per-call `ToolContext` so the agent sees the seeded
|
|
11
|
+
* state.
|
|
12
|
+
* 3. Plugs a buffering `Tracer` into the session so every
|
|
13
|
+
* `TraceEvent` (request, response, dispatch-complete) lands in
|
|
14
|
+
* the captured record.
|
|
15
|
+
* 4. Submits the fixture's prompt and drains the session event
|
|
16
|
+
* stream, shadow-tracking workspace + runtime patches as they
|
|
17
|
+
* flow back through `tool_finished` events.
|
|
18
|
+
* 5. Returns a `RunRecord` with the trace, the final workspace +
|
|
19
|
+
* runtime, the concatenated assistant text, timing, and a
|
|
20
|
+
* `error` slot that is `null` on a clean finish or a string on
|
|
21
|
+
* any non-success outcome.
|
|
22
|
+
*
|
|
23
|
+
* Errors never throw out of `runFixture`. An aborted run, a
|
|
24
|
+
* wall-clock cap exceeded, a session-emitted `error` event, or an
|
|
25
|
+
* unexpected exception all resolve to a `RunRecord` with `error` set
|
|
26
|
+
* and a partial trace.
|
|
27
|
+
*
|
|
28
|
+
* The runner is browser-safe — no `node:*` imports. Persistence
|
|
29
|
+
* helpers (if anyone wants them) belong in
|
|
30
|
+
* `runner-persistence-node.ts` and route through `src/node.ts`. That
|
|
31
|
+
* file does not exist in v1 by design; downstream consumers can
|
|
32
|
+
* persist in-memory records however they like.
|
|
33
|
+
*/
|
|
34
|
+
import { createMetricsCollector } from '../metrics.js';
|
|
35
|
+
import { createAgentSession } from '../session.js';
|
|
36
|
+
import { createReactLoopStrategy } from '../strategy.js';
|
|
37
|
+
import { EMPTY_RUNTIME } from '../types/runtime.js';
|
|
38
|
+
import { EMPTY_WORKSPACE } from '../types/workspace.js';
|
|
39
|
+
import { applyWorkspaceOverrides } from './fixture.js';
|
|
40
|
+
/**
 * Execute one `TaskFixture` against a freshly created agent session and
 * resolve with a `RunRecord` describing the captured run.
 *
 * Failures while creating the session or while consuming its event stream
 * are reported through the record's `error` field (`null` when nothing went
 * wrong) instead of being thrown.
 *
 * @param input Run inputs: `fixture`, `llm`, `tools`, `toolList`, and the
 *   optional `trial`, `signal`, `maxWallClockMs`, `seed`, `strategy` and
 *   `systemPromptBuilder`.
 * @returns A promise for the run record.
 */
export async function runFixture(input) {
    const trial = input.trial ?? 0;
    // Every traced event is collected here and handed back on the record.
    const trace = [];
    const tracer = {
        emit: (event) => {
            trace.push(event);
        },
    };
    // The session is not given a seeded workspace at construction time, so
    // the seeded state is kept in these shadow variables. They feed the
    // system prompt and the tool context, and are updated from the patches
    // carried by `tool_finished` events.
    let liveWorkspace = applyWorkspaceOverrides(EMPTY_WORKSPACE, input.fixture.initialState);
    let liveRuntime = EMPTY_RUNTIME;
    // Our own controller is linked with the caller's signal so the
    // wall-clock cap can abort the run.
    const capController = new AbortController();
    const runSignal = linkSignals(input.signal, capController.signal);
    // Optional wall-clock cap; the timer is cleared on every exit path below.
    let capTimer;
    let capHit = false;
    if (typeof input.maxWallClockMs === 'number' && input.maxWallClockMs > 0) {
        capTimer = setTimeout(() => {
            capHit = true;
            capController.abort();
        }, input.maxWallClockMs);
    }
    const stopCapTimer = () => {
        if (capTimer)
            clearTimeout(capTimer);
    };
    const startedAt = Date.now();
    // Single place that shapes the returned record, reading the shadow
    // state as it stands at the moment of the call.
    const toRecord = (assistantText, error) => ({
        fixture: input.fixture,
        trial,
        trace,
        finalWorkspace: liveWorkspace,
        finalRuntime: liveRuntime,
        assistantText,
        startedAt,
        completedAt: Date.now(),
        ...(input.seed !== undefined ? { seed: input.seed } : {}),
        error,
    });
    let text = '';
    let failure = null;
    const strategy = input.strategy ?? createReactLoopStrategy();
    const buildPrompt = input.systemPromptBuilder ?? defaultSystemPromptBuilder;
    // Evaluated per call, so tools always receive the current shadow
    // workspace and runtime together with the linked signal.
    const toolContext = () => ({
        workspace: liveWorkspace,
        runtime: liveRuntime,
        signal: runSignal,
    });
    let session;
    try {
        session = createAgentSession({
            strategy,
            llm: input.llm,
            tools: input.tools,
            toolList: input.toolList,
            toolContext,
            // Whatever workspace/runtime the session passes in is ignored;
            // the prompt is always built from the shadow state.
            systemPromptBuilder: (_ws, _rt) => buildPrompt(liveWorkspace, liveRuntime),
            metrics: createMetricsCollector(),
            history: [],
            tracer,
        });
    }
    catch (e) {
        stopCapTimer();
        return toRecord('', errorMessage(e));
    }
    try {
        for await (const ev of session.submit(input.fixture.prompt, runSignal)) {
            switch (ev.kind) {
                case 'text':
                    text += ev.chunk;
                    break;
                case 'tool_finished': {
                    const outcome = ev.result;
                    if (outcome.workspacePatch) {
                        liveWorkspace = mergeWorkspace(liveWorkspace, outcome.workspacePatch);
                    }
                    if (outcome.runtimePatch) {
                        liveRuntime = mergeRuntime(liveRuntime, outcome.runtimePatch);
                    }
                    break;
                }
                case 'error':
                    // Only the first error event is kept as the run's error.
                    if (failure === null)
                        failure = ev.message;
                    break;
                default:
                    break;
            }
        }
    }
    catch (e) {
        failure = errorMessage(e);
    }
    finally {
        stopCapTimer();
    }
    if (capHit && (failure === null || /abort/i.test(failure))) {
        // The wall-clock cap overrides a missing or abort-looking error so
        // the caller can tell why the run ended.
        failure = `runFixture: exceeded maxWallClockMs (${input.maxWallClockMs}ms)`;
    }
    else if (failure === null &&
        (runSignal.aborted || input.signal?.aborted === true) &&
        !sawCompletion(trace)) {
        // Aborted, no error recorded, and no `llm_response` in the trace:
        // record the abort so consumers need not probe the signal.
        failure = 'runFixture: aborted';
    }
    return toRecord(text, failure);
}
|
|
177
|
+
/**
 * Run every fixture in `fixtures`, `options.trials` times each (default 1),
 * strictly one run after another.
 *
 * Records come back in `(fixture, trial)` order: all trials of the i-th
 * fixture precede those of the (i+1)-th, and within a fixture trial 0
 * precedes trial 1, and so on. Once `options.signal` is aborted, each
 * remaining trial yields a placeholder record instead of being run.
 *
 * @param fixtures Iterable of fixtures to drive.
 * @param deps Shared dependencies: `llm`, `tools`, `toolList`, and the
 *   optional `strategy` factory and `systemPromptBuilder`.
 * @param options Optional `trials`, `signal`, `maxWallClockMs` and a
 *   `seed(fixture, trial)` function.
 * @returns A promise for the array of run records.
 */
export async function runFixtures(fixtures, deps, options = {}) {
    const trialCount = options.trials ?? 1;
    const records = [];
    // Placeholder for a trial that never started: empty trace, the
    // fixture's seeded workspace, and an error naming the abort.
    const skippedRecord = (fixture, trial) => ({
        fixture,
        trial,
        trace: [],
        finalWorkspace: applyWorkspaceOverrides(EMPTY_WORKSPACE, fixture.initialState),
        finalRuntime: EMPTY_RUNTIME,
        assistantText: '',
        startedAt: Date.now(),
        completedAt: Date.now(),
        error: 'runFixtures: aborted before trial start',
    });
    for (const fixture of fixtures) {
        for (let trial = 0; trial < trialCount; trial += 1) {
            if (options.signal?.aborted) {
                records.push(skippedRecord(fixture, trial));
            }
            else {
                const seed = options.seed ? options.seed(fixture, trial) : undefined;
                // The strategy factory is invoked once per trial.
                const strategy = deps.strategy ? deps.strategy() : undefined;
                // Optional fields are only set when a value was supplied.
                const runInput = {
                    fixture,
                    llm: deps.llm,
                    tools: deps.tools,
                    toolList: deps.toolList,
                    ...(strategy ? { strategy } : {}),
                    trial,
                    ...(options.signal ? { signal: options.signal } : {}),
                    ...(options.maxWallClockMs !== undefined ? { maxWallClockMs: options.maxWallClockMs } : {}),
                    ...(seed !== undefined ? { seed } : {}),
                    ...(deps.systemPromptBuilder ? { systemPromptBuilder: deps.systemPromptBuilder } : {}),
                };
                records.push(await runFixture(runInput));
            }
        }
    }
    return records;
}
|
|
227
|
+
/**
 * Fallback system prompt builder, used when the caller supplies none.
 * Deliberately minimal: a fixed opening line followed by one section for
 * each of `workspace.rules`, `workspace.code`, `workspace.appSource` and
 * `runtime.runSummary` that is truthy, joined by blank lines.
 *
 * @param workspace Workspace whose `rules`, `code` and `appSource` are read.
 * @param runtime Runtime whose `runSummary` is read and JSON-serialised.
 * @returns The assembled prompt text.
 */
export function defaultSystemPromptBuilder(workspace, runtime) {
    const sections = [
        'You are an evaluation harness agent.',
        workspace.rules ? `Rules:\n${workspace.rules}` : null,
        workspace.code ? `Code:\n${workspace.code}` : null,
        workspace.appSource ? `App:\n${workspace.appSource}` : null,
        runtime.runSummary ? `Last run: ${JSON.stringify(runtime.runSummary)}` : null,
    ];
    // Absent sections were marked with null above; drop them before joining.
    return sections.filter((section) => section !== null).join('\n\n');
}
|
|
245
|
+
/**
 * Overlay `patch` on `base` and return a new frozen workspace.
 *
 * Top-level keys from `patch` win. The nested `stitch` object is merged
 * key-by-key (patch over base) and frozen separately, so a patch that only
 * touches part of `stitch` keeps the remaining base entries.
 */
function mergeWorkspace(base, patch) {
    const stitch = Object.freeze({ ...base.stitch, ...(patch.stitch ?? {}) });
    return Object.freeze({ ...base, ...patch, stitch });
}
|
|
258
|
+
/**
 * Overlay `patch` on `base` and return a new frozen runtime.
 *
 * Top-level keys from `patch` win. `terminal` and `uiErrors` are taken from
 * the patch when it provides them, otherwise from the base; either way they
 * are copied into fresh frozen arrays.
 */
function mergeRuntime(base, patch) {
    const terminal = Object.freeze([...(patch.terminal ?? base.terminal)]);
    const uiErrors = Object.freeze([...(patch.uiErrors ?? base.uiErrors)]);
    return Object.freeze({ ...base, ...patch, terminal, uiErrors });
}
|
|
270
|
+
/**
 * Turn any thrown value into a string suitable for a record's `error` field.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged.
 * - Anything else is JSON-serialised when that produces a string.
 * - Otherwise the value is coerced with `String`, and as a last resort
 *   described via `Object.prototype.toString`.
 *
 * @param e The caught value; may be anything, including `undefined`.
 * @returns Always a string; this function does not throw.
 */
function errorMessage(e) {
    if (e instanceof Error)
        return e.message;
    if (typeof e === 'string')
        return e;
    try {
        // JSON.stringify returns undefined (not a string) for undefined,
        // functions and symbols, so its result has to be checked.
        const serialized = JSON.stringify(e);
        if (typeof serialized === 'string')
            return serialized;
    }
    catch {
        // Cycles and BigInt make JSON.stringify throw; use coercion below.
    }
    try {
        return String(e);
    }
    catch {
        // String() throws for objects that cannot be converted to a
        // primitive, e.g. null-prototype objects.
        return Object.prototype.toString.call(e);
    }
}
|
|
282
|
+
/**
 * Combine an optional external abort signal with the internal one into a
 * single signal that aborts as soon as either input aborts.
 *
 * The abort `reason` of whichever input fired is carried over to the
 * returned signal. When no external signal is given, the internal signal is
 * returned as-is.
 *
 * `AbortSignal.any` is used when the platform provides it, so nothing stays
 * registered on a long-lived external signal that is reused across runs.
 * The manual fallback detaches both listeners as soon as one of them fires;
 * if neither input ever aborts, its listeners remain attached.
 *
 * @param external Caller-supplied signal, or `undefined`/`null`.
 * @param internal Signal owned by the runner.
 * @returns The combined signal.
 */
function linkSignals(external, internal) {
    if (!external)
        return internal;
    if (typeof AbortSignal !== 'undefined' && typeof AbortSignal.any === 'function') {
        return AbortSignal.any([external, internal]);
    }
    const controller = new AbortController();
    const inputs = [external, internal];
    // An input that is already aborted settles the result immediately.
    const alreadyAborted = inputs.find((signal) => signal.aborted);
    if (alreadyAborted) {
        controller.abort(alreadyAborted.reason);
        return controller.signal;
    }
    const registrations = [];
    const detachAll = () => {
        for (const [signal, listener] of registrations) {
            signal.removeEventListener('abort', listener);
        }
    };
    for (const signal of inputs) {
        const listener = () => {
            // Remove the sibling listener so it does not linger on the
            // other input after the combined signal has fired.
            detachAll();
            controller.abort(signal.reason);
        };
        registrations.push([signal, listener]);
        signal.addEventListener('abort', listener, { once: true });
    }
    return controller.signal;
}
|
|
300
|
+
/**
 * Report whether the trace contains at least one `llm_response` event.
 *
 * The caller uses this to tell a run that got as far as a model response
 * apart from one that was aborted before producing any.
 *
 * @param trace Array of traced events, each with a `kind` field.
 * @returns `true` when some event has kind `llm_response`.
 */
function sawCompletion(trace) {
    return trace.some((entry) => entry.kind === 'llm_response');
}
|
|
310
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAEH,OAAO,EAAE,sBAAsB,EAAE,MAAM,eAAe,CAAC;AACvD,OAAO,EAAE,kBAAkB,EAAE,MAAM,eAAe,CAAC;AACnD,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAGzD,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAKpD,OAAO,EAAE,eAAe,EAAkB,MAAM,uBAAuB,CAAC;AACxE,OAAO,EAAoB,uBAAuB,EAAE,MAAM,cAAc,CAAC;AA2CzE;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,KAAsB;IACrD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,IAAI,CAAC,CAAC;IAC/B,MAAM,KAAK,GAAiB,EAAE,CAAC;IAC/B,MAAM,MAAM,GAAW;QACrB,IAAI,CAAC,KAAK;YACR,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACpB,CAAC;KACF,CAAC;IAEF,kEAAkE;IAClE,mEAAmE;IACnE,iEAAiE;IACjE,gEAAgE;IAChE,wCAAwC;IACxC,IAAI,eAAe,GAAc,uBAAuB,CACtD,eAAe,EACf,KAAK,CAAC,OAAO,CAAC,YAAY,CAC3B,CAAC;IACF,IAAI,aAAa,GAAiB,aAAa,CAAC;IAEhD,gEAAgE;IAChE,uCAAuC;IACvC,MAAM,aAAa,GAAG,IAAI,eAAe,EAAE,CAAC;IAC5C,MAAM,YAAY,GAAG,WAAW,CAAC,KAAK,CAAC,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,CAAC;IAErE,gEAAgE;IAChE,0CAA0C;IAC1C,IAAI,cAAyD,CAAC;IAC9D,IAAI,iBAAiB,GAAG,KAAK,CAAC;IAC9B,IAAI,OAAO,KAAK,CAAC,cAAc,KAAK,QAAQ,IAAI,KAAK,CAAC,cAAc,GAAG,CAAC,EAAE,CAAC;QACzE,cAAc,GAAG,UAAU,CAAC,GAAG,EAAE;YAC/B,iBAAiB,GAAG,IAAI,CAAC;YACzB,aAAa,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC,EAAE,KAAK,CAAC,cAAc,CAAC,CAAC;IAC3B,CAAC;IAED,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC7B,IAAI,aAAa,GAAG,EAAE,CAAC;IACvB,IAAI,KAAK,GAAkB,IAAI,CAAC;IAEhC,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,IAAI,uBAAuB,EAAE,CAAC;IAC7D,MAAM,aAAa,GAAG,KAAK,CAAC,mBAAmB,IAAI,0BAA0B,CAAC;IAE9E,kEAAkE;IAClE,2DAA2D;IAC3D,iEAAiE;IACjE,YAAY;IACZ,MAAM,WAAW,GAAG,GAAgB,EAAE,CAAC,CAAC;QACtC,SAAS,EAAE,eAAe;QAC1B,OAAO,EAAE,aAAa;QACtB,MAAM,EAAE,YAAY;KACrB,CAAC,CAAC;IAEH,IAAI,OAA8C,CAAC;IACnD,IAAI,CAAC;QACH,OAAO,GAAG,kBAAkB,CAAC;YAC3B,QAAQ;YACR,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,QAAQ,EAAE,KAAK,CAAC,QAAQ;YACxB,WAAW;YACX,gEAAgE;YAChE,4DAA4D;YAC5D,4DAA4D;YAC5D,2BAA2B;YAC3B,mBAAmB,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,eAAe,EAAE,aAAa,CAAC;YAChF,OAAO,EAAE,s
BAAsB,EAAE;YACjC,OAAO,EAAE,EAAE;YACX,MAAM;SACP,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,IAAI,cAAc;YAAE,YAAY,CAAC,cAAc,CAAC,CAAC;QACjD,OAAO;YACL,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,KAAK;YACL,KAAK;YACL,cAAc,EAAE,eAAe;YAC/B,YAAY,EAAE,aAAa;YAC3B,aAAa,EAAE,EAAE;YACjB,SAAS;YACT,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE;YACvB,GAAG,CAAC,KAAK,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACzD,KAAK,EAAE,YAAY,CAAC,CAAC,CAAC;SACvB,CAAC;IACJ,CAAC;IAED,IAAI,CAAC;QACH,MAAM,MAAM,GAAgC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAC/F,IAAI,KAAK,EAAE,MAAM,EAAE,IAAI,MAAM,EAAE,CAAC;YAC9B,IAAI,EAAE,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;gBACvB,aAAa,IAAI,EAAE,CAAC,KAAK,CAAC;YAC5B,CAAC;iBAAM,IAAI,EAAE,CAAC,IAAI,KAAK,eAAe,EAAE,CAAC;gBACvC,MAAM,MAAM,GAAG,EAAE,CAAC,MAAoB,CAAC;gBACvC,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;oBAC1B,eAAe,GAAG,cAAc,CAAC,eAAe,EAAE,MAAM,CAAC,cAAc,CAAC,CAAC;gBAC3E,CAAC;gBACD,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;oBACxB,aAAa,GAAG,YAAY,CAAC,aAAa,EAAE,MAAM,CAAC,YAAY,CAAC,CAAC;gBACnE,CAAC;YACH,CAAC;iBAAM,IAAI,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;gBAC/B,4DAA4D;gBAC5D,mDAAmD;gBACnD,IAAI,KAAK,KAAK,IAAI;oBAAE,KAAK,GAAG,EAAE,CAAC,OAAO,CAAC;YACzC,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;IAC1B,CAAC;YAAS,CAAC;QACT,IAAI,cAAc;YAAE,YAAY,CAAC,cAAc,CAAC,CAAC;IACnD,CAAC;IAED,mEAAmE;IACnE,4CAA4C;IAC5C,IAAI,iBAAiB,IAAI,CAAC,KAAK,KAAK,IAAI,IAAI,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC;QAClE,KAAK,GAAG,wCAAwC,KAAK,CAAC,cAAc,KAAK,CAAC;IAC5E,CAAC;SAAM,IACL,KAAK,KAAK,IAAI;QACd,CAAC,YAAY,CAAC,OAAO,IAAI,KAAK,CAAC,MAAM,EAAE,OAAO,KAAK,IAAI,CAAC;QACxD,CAAC,aAAa,CAAC,KAAK,CAAC,EACrB,CAAC;QACD,+DAA+D;QAC/D,+DAA+D;QAC/D,+BAA+B;QAC/B,KAAK,GAAG,qBAAqB,CAAC;IAChC,CAAC;IAED,OAAO;QACL,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,KAAK;QACL,KAAK;QACL,cAAc,EAAE,eAAe;QAC/B,YAAY,EAAE,aAAa;QAC3B,aAAa;QACb,SAAS;QACT,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE;QACvB,GAAG,CAAC,KAAK,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,CAAC
,CAAC,CAAC,EAAE,CAAC;QACzD,KAAK;KACN,CAAC;AACJ,CAAC;AA0CD;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgC,EAChC,IAAqB,EACrB,UAA8B,EAAE;IAEhC,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC;IACnC,MAAM,GAAG,GAAgB,EAAE,CAAC;IAC5B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,MAAM,EAAE,KAAK,EAAE,EAAE,CAAC;YAC5C,IAAI,OAAO,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC;gBAC5B,2DAA2D;gBAC3D,oDAAoD;gBACpD,qBAAqB;gBACrB,GAAG,CAAC,IAAI,CAAC;oBACP,OAAO;oBACP,KAAK;oBACL,KAAK,EAAE,EAAE;oBACT,cAAc,EAAE,uBAAuB,CAAC,eAAe,EAAE,OAAO,CAAC,YAAY,CAAC;oBAC9E,YAAY,EAAE,aAAa;oBAC3B,aAAa,EAAE,EAAE;oBACjB,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;oBACrB,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE;oBACvB,KAAK,EAAE,yCAAyC;iBACjD,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YACD,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YACrE,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;YAC7D,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC;gBAC9B,OAAO;gBACP,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACjC,KAAK;gBACL,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACrD,GAAG,CAAC,OAAO,CAAC,cAAc,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,cAAc,EAAE,OAAO,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC3F,GAAG,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACvC,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC,EAAE,mBAAmB,EAAE,IAAI,CAAC,mBAAmB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aACvF,CAAC,CAAC;YACH,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,0BAA0B,CAAC,SAAoB,EAAE,OAAqB;IACpF,MAAM,KAAK,GAAa,CAAC,sCAAsC,CAAC,CAAC;IACjE,IAAI,SAAS,CAAC,KAAK;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,SAAS,CAAC,KAAK,EAAE,CAAC,CAAC;IAC9D,IAAI,SAAS,CAAC,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC;IAC3D,IAAI,SAAS,CA
AC,SAAS;QAAE,KAAK,CAAC,IAAI,CAAC,SAAS,SAAS,CAAC,SAAS,EAAE,CAAC,CAAC;IACpE,IAAI,OAAO,CAAC,UAAU;QAAE,KAAK,CAAC,IAAI,CAAC,aAAa,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC;IACtF,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,cAAc,CAAC,IAAe,EAAE,KAAyB;IAChE,iEAAiE;IACjE,iEAAiE;IACjE,yCAAyC;IACzC,OAAO,MAAM,CAAC,MAAM,CAAC;QACnB,GAAG,IAAI;QACP,GAAG,KAAK;QACR,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC;YACpB,GAAG,IAAI,CAAC,MAAM;YACd,GAAG,CAAC,KAAK,CAAC,MAAM,IAAI,EAAE,CAAC;SACxB,CAAC;KACH,CAAc,CAAC;AAClB,CAAC;AAED,SAAS,YAAY,CAAC,IAAkB,EAAE,KAA4B;IACpE,OAAO,MAAM,CAAC,MAAM,CAAC;QACnB,GAAG,IAAI;QACP,GAAG,KAAK;QACR,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC;YACtB,GAAG,CAAC,KAAK,CAAC,QAAQ,IAAI,IAAI,CAAC,QAAQ,CAAC;SACrC,CAAwC;QACzC,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC;YACtB,GAAG,CAAC,KAAK,CAAC,QAAQ,IAAI,IAAI,CAAC,QAAQ,CAAC;SACrC,CAAwC;KAC1C,CAAiB,CAAC;AACrB,CAAC;AAED,SAAS,YAAY,CAAC,CAAU;IAC9B,IAAI,CAAC,YAAY,KAAK;QAAE,OAAO,CAAC,CAAC,OAAO,CAAC;IACzC,IAAI,OAAO,CAAC,KAAK,QAAQ;QAAE,OAAO,CAAC,CAAC;IACpC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,MAAM,CAAC,CAAC,CAAC,CAAC;IACnB,CAAC;AACH,CAAC;AAED,qEAAqE;AACrE,SAAS,WAAW,CAAC,QAAiC,EAAE,QAAqB;IAC3E,IAAI,CAAC,QAAQ;QAAE,OAAO,QAAQ,CAAC;IAC/B,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;QACrB,MAAM,CAAC,GAAG,IAAI,eAAe,EAAE,CAAC;QAChC,CAAC,CAAC,KAAK,EAAE,CAAC;QACV,OAAO,CAAC,CAAC,MAAM,CAAC;IAClB,CAAC;IACD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;QACrB,UAAU,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO,UAAU,CAAC,MAAM,CAAC;IAC3B,CAAC;IACD,QAAQ,CAAC,gBAAgB,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;IAC7E,QAAQ,CAAC,gBAAgB,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;IAC7E,OAAO,UAAU,CAAC,MAAM,CAAC;AAC3B,CAAC;AAED,SAAS,aAAa,CAAC,KAAmB;IACxC,iEAAiE;IACjE,+DAA+D;IAC/D,sDAAsD;IACtD,KAAK,MAAM,EAAE,IAAI,KAAK,EAAE,CAAC;QACvB,IAAI,EAAE,CAAC,IAAI,KAAK,cAAc;YAAE,OAAO,IAAI,CAAC;IAC9C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
|