@inbrowser/agent 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +2 -2
- package/dist/cli/commands/run.js.map +1 -1
- package/dist/cli/fixtures.d.ts +2 -2
- package/dist/cli/fixtures.d.ts.map +1 -1
- package/dist/cli/fixtures.js +7 -16
- package/dist/cli/fixtures.js.map +1 -1
- package/dist/cli/llm/openrouter.d.ts +4 -4
- package/dist/cli/llm/openrouter.d.ts.map +1 -1
- package/dist/cli/llm/openrouter.js +20 -31
- package/dist/cli/llm/openrouter.js.map +1 -1
- package/dist/diagnostics/index.d.ts +5 -0
- package/dist/diagnostics/index.d.ts.map +1 -0
- package/dist/diagnostics/index.js +3 -0
- package/dist/diagnostics/index.js.map +1 -0
- package/dist/diagnostics/timing.d.ts +48 -0
- package/dist/diagnostics/timing.d.ts.map +1 -0
- package/dist/diagnostics/timing.js +85 -0
- package/dist/diagnostics/timing.js.map +1 -0
- package/dist/diagnostics/truthfulness.d.ts +36 -0
- package/dist/diagnostics/truthfulness.d.ts.map +1 -0
- package/dist/diagnostics/truthfulness.js +180 -0
- package/dist/diagnostics/truthfulness.js.map +1 -0
- package/dist/dispatch-memoization.d.ts +84 -0
- package/dist/dispatch-memoization.d.ts.map +1 -0
- package/dist/dispatch-memoization.js +197 -0
- package/dist/dispatch-memoization.js.map +1 -0
- package/dist/eval/comparison-report.d.ts +164 -0
- package/dist/eval/comparison-report.d.ts.map +1 -0
- package/dist/eval/comparison-report.js +316 -0
- package/dist/eval/comparison-report.js.map +1 -0
- package/dist/eval/fixture.d.ts +74 -0
- package/dist/eval/fixture.d.ts.map +1 -0
- package/dist/eval/fixture.js +217 -0
- package/dist/eval/fixture.js.map +1 -0
- package/dist/eval/index.d.ts +13 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +7 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/load-node.d.ts +16 -0
- package/dist/eval/load-node.d.ts.map +1 -0
- package/dist/eval/load-node.js +58 -0
- package/dist/eval/load-node.js.map +1 -0
- package/dist/eval/metric-collector.d.ts +209 -0
- package/dist/eval/metric-collector.d.ts.map +1 -0
- package/dist/eval/metric-collector.js +293 -0
- package/dist/eval/metric-collector.js.map +1 -0
- package/dist/eval/run-record.d.ts +76 -0
- package/dist/eval/run-record.d.ts.map +1 -0
- package/dist/eval/run-record.js +32 -0
- package/dist/eval/run-record.js.map +1 -0
- package/dist/eval/runner.d.ts +140 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +310 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/eval/spec-framework.d.ts +113 -0
- package/dist/eval/spec-framework.d.ts.map +1 -0
- package/dist/eval/spec-framework.js +100 -0
- package/dist/eval/spec-framework.js.map +1 -0
- package/dist/eval/spec-helpers.d.ts +245 -0
- package/dist/eval/spec-helpers.d.ts.map +1 -0
- package/dist/eval/spec-helpers.js +605 -0
- package/dist/eval/spec-helpers.js.map +1 -0
- package/dist/index.d.ts +32 -8
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -3
- package/dist/index.js.map +1 -1
- package/dist/llm-adapter.d.ts +30 -34
- package/dist/llm-adapter.d.ts.map +1 -1
- package/dist/llm-adapter.js +61 -51
- package/dist/llm-adapter.js.map +1 -1
- package/dist/mcp/connect.d.ts +68 -0
- package/dist/mcp/connect.d.ts.map +1 -0
- package/dist/mcp/connect.js +111 -0
- package/dist/mcp/connect.js.map +1 -0
- package/dist/metrics.js +4 -4
- package/dist/metrics.js.map +1 -1
- package/dist/node.d.ts +3 -0
- package/dist/node.d.ts.map +1 -1
- package/dist/node.js +2 -0
- package/dist/node.js.map +1 -1
- package/dist/planner-executor.d.ts +132 -0
- package/dist/planner-executor.d.ts.map +1 -0
- package/dist/planner-executor.js +274 -0
- package/dist/planner-executor.js.map +1 -0
- package/dist/retrieval.d.ts +74 -0
- package/dist/retrieval.d.ts.map +1 -0
- package/dist/retrieval.js +287 -0
- package/dist/retrieval.js.map +1 -0
- package/dist/session.d.ts.map +1 -1
- package/dist/session.js +8 -2
- package/dist/session.js.map +1 -1
- package/dist/skill-catalog.d.ts +81 -0
- package/dist/skill-catalog.d.ts.map +1 -0
- package/dist/skill-catalog.js +388 -0
- package/dist/skill-catalog.js.map +1 -0
- package/dist/skill-router.d.ts +95 -0
- package/dist/skill-router.d.ts.map +1 -0
- package/dist/skill-router.js +130 -0
- package/dist/skill-router.js.map +1 -0
- package/dist/strategy.d.ts +22 -2
- package/dist/strategy.d.ts.map +1 -1
- package/dist/strategy.js +358 -28
- package/dist/strategy.js.map +1 -1
- package/dist/tools.d.ts +15 -1
- package/dist/tools.d.ts.map +1 -1
- package/dist/tools.js +18 -0
- package/dist/tools.js.map +1 -1
- package/dist/types/agent.d.ts +2 -3
- package/dist/types/agent.d.ts.map +1 -1
- package/dist/types/agent.js +1 -1
- package/dist/types/chat.d.ts +0 -15
- package/dist/types/chat.d.ts.map +1 -1
- package/dist/types/llm.d.ts +11 -64
- package/dist/types/llm.d.ts.map +1 -1
- package/dist/types/llm.js +7 -8
- package/dist/types/llm.js.map +1 -1
- package/dist/types/metrics.d.ts +2 -2
- package/dist/types/metrics.d.ts.map +1 -1
- package/dist/types/session.d.ts +2 -2
- package/dist/types/session.d.ts.map +1 -1
- package/dist/types/strategy.d.ts +60 -3
- package/dist/types/strategy.d.ts.map +1 -1
- package/dist/types/tools.d.ts +18 -0
- package/dist/types/tools.d.ts.map +1 -1
- package/dist/types/trace.d.ts +67 -15
- package/dist/types/trace.d.ts.map +1 -1
- package/dist/types/trace.js +5 -3
- package/dist/types/trace.js.map +1 -1
- package/package.json +3 -2
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Success-spec framework for the eval harness.
|
|
3
|
+
*
|
|
4
|
+
* A success specification (a "spec") decides whether one captured agent
|
|
5
|
+
* run satisfied its fixture's success criterion. The framework provides
|
|
6
|
+
* three things:
|
|
7
|
+
*
|
|
8
|
+
* 1. A `SpecRegistry` plus a `createSpecRegistry()` factory.
|
|
9
|
+
* Registration is explicit — no side-effect registration on import.
|
|
10
|
+
* Callers build a registry, call `registerStarterSpecs()` if they
|
|
11
|
+
* want the common helpers (see `spec-helpers.ts`), and register
|
|
12
|
+
* their own specs on top.
|
|
13
|
+
*
|
|
14
|
+
* 2. An `evaluateSpec(registry, reference, snapshot)` evaluator that
|
|
15
|
+
* looks the spec up by name, awaits it (specs may be async), and
|
|
16
|
+
* returns a structured `SpecResult`. A thrown error inside the
|
|
17
|
+
* spec body is caught and converted into `{ ok: false, error }`,
|
|
18
|
+
* so a misbehaving spec never crashes the harness.
|
|
19
|
+
*
|
|
20
|
+
* 3. The `RunSnapshot` interface — the input every spec consumes.
|
|
21
|
+
* Intentionally narrower than the runner's eventual `RunRecord`,
|
|
22
|
+
* so this branch develops independently of `eval/harness-runner`.
|
|
23
|
+
* The runner's `RunRecord` is designed to be a structural superset:
|
|
24
|
+
* a runner caller passes its `record` straight to `evaluateSpec`.
|
|
25
|
+
*
|
|
26
|
+
* Specs read four slices of state — the final workspace (rules / code
|
|
27
|
+
* / app source), the final runtime (the most recent run summary, any
|
|
28
|
+
* uiErrors, terminal output, etc.), the full assistant text, and the
|
|
29
|
+
* trace. Anything outside those four fields is a sign the snapshot
|
|
30
|
+
* shape needs widening; do that here, not in spec bodies.
|
|
31
|
+
*
|
|
32
|
+
* Spec names follow the same `family/spec-name` kebab-case form that
|
|
33
|
+
* `validateFixture` enforces on `SuccessSpecReference.name`. Registration
|
|
34
|
+
* validates the name at registration time so typos surface immediately.
|
|
35
|
+
*/
|
|
36
|
+
import type { RuntimeState } from '../types/runtime.js';
|
|
37
|
+
import type { TraceEvent } from '../types/trace.js';
|
|
38
|
+
import type { Workspace } from '../types/workspace.js';
|
|
39
|
+
import type { SuccessSpecReference } from './fixture.js';
|
|
40
|
+
/**
|
|
41
|
+
* Input shape every spec consumes. Intentionally narrower than the
|
|
42
|
+
* runner's eventual `RunRecord` so this branch develops independently
|
|
43
|
+
* of `eval/harness-runner`. The runner's record is designed to be a
|
|
44
|
+
* structural superset; a runner caller can pass its record straight to
|
|
45
|
+
* `evaluateSpec`.
|
|
46
|
+
*/
|
|
47
|
+
export interface RunSnapshot {
|
|
48
|
+
/** Workspace state at the end of the run. */
|
|
49
|
+
finalWorkspace: Workspace;
|
|
50
|
+
/** Runtime state at the end of the run (run summary, uiErrors, ...). */
|
|
51
|
+
finalRuntime: RuntimeState;
|
|
52
|
+
/** Concatenated assistant text across the run's iterations. */
|
|
53
|
+
assistantText: string;
|
|
54
|
+
/** All trace events emitted during the run, in emission order. */
|
|
55
|
+
trace: readonly TraceEvent[];
|
|
56
|
+
}
|
|
57
|
+
/**
|
|
58
|
+
* Structured result returned by `evaluateSpec` and by every spec body.
|
|
59
|
+
* `ok` is the pass/fail bit. `detail` is optional structured context a
|
|
60
|
+
* report can surface (matched tokens, missing tokens, the offending
|
|
61
|
+
* trace event id, ...). `error` carries the failure reason when the
|
|
62
|
+
* spec did not run cleanly — registration miss, args validation failure,
|
|
63
|
+
* spec body threw, etc.
|
|
64
|
+
*/
|
|
65
|
+
export interface SpecResult {
|
|
66
|
+
ok: boolean;
|
|
67
|
+
detail?: Record<string, unknown>;
|
|
68
|
+
error?: string;
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* Spec function signature. May be sync or async; the evaluator awaits
|
|
72
|
+
* the return either way. `args` is whatever the fixture supplied in
|
|
73
|
+
* `SuccessSpecReference.args` — typed as `unknown` because every spec
|
|
74
|
+
* declares (and validates) its own arg shape.
|
|
75
|
+
*/
|
|
76
|
+
export type SpecFn = (snapshot: RunSnapshot, args: unknown) => SpecResult | Promise<SpecResult>;
|
|
77
|
+
/**
|
|
78
|
+
* In-memory registry. Backed by a `Map`, exposed as a small object so
|
|
79
|
+
* callers do not depend on Map identity.
|
|
80
|
+
*/
|
|
81
|
+
export interface SpecRegistry {
|
|
82
|
+
/**
|
|
83
|
+
* Register a spec by name. Throws if the name does not match the
|
|
84
|
+
* required `family/spec-name` kebab-case form, or if the name is
|
|
85
|
+
* already registered. Throws-on-conflict is intentional: silent
|
|
86
|
+
* overwrites mask real bugs and the harness only registers specs at
|
|
87
|
+
* startup, so a throw is observable.
|
|
88
|
+
*/
|
|
89
|
+
register(name: string, fn: SpecFn): void;
|
|
90
|
+
/** Returns the registered spec function, or undefined. */
|
|
91
|
+
get(name: string): SpecFn | undefined;
|
|
92
|
+
/** True iff `name` is registered. */
|
|
93
|
+
has(name: string): boolean;
|
|
94
|
+
/** All registered names, in registration order. */
|
|
95
|
+
names(): string[];
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Create a fresh, empty spec registry. Callers register specs on it
|
|
99
|
+
* explicitly — `registerStarterSpecs()` is the common starting point
|
|
100
|
+
* for fixtures that reuse the helpers; bespoke specs are registered the
|
|
101
|
+
* same way.
|
|
102
|
+
*/
|
|
103
|
+
export declare function createSpecRegistry(): SpecRegistry;
|
|
104
|
+
/**
|
|
105
|
+
* Resolve a `SuccessSpecReference` against a registry and evaluate it
|
|
106
|
+
* over a `RunSnapshot`. Returns a `SpecResult`. Never throws — an
|
|
107
|
+
* unregistered name, a thrown spec body, or a returned non-result is
|
|
108
|
+
* surfaced as `{ ok: false, error }`.
|
|
109
|
+
*
|
|
110
|
+
* Async specs are awaited. Sync specs are returned unchanged.
|
|
111
|
+
*/
|
|
112
|
+
export declare function evaluateSpec(registry: SpecRegistry, reference: SuccessSpecReference, snapshot: RunSnapshot): Promise<SpecResult>;
|
|
113
|
+
//# sourceMappingURL=spec-framework.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"spec-framework.d.ts","sourceRoot":"","sources":["../../src/eval/spec-framework.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAC;AAEzD;;;;;;GAMG;AACH,MAAM,WAAW,WAAW;IAC1B,6CAA6C;IAC7C,cAAc,EAAE,SAAS,CAAC;IAC1B,wEAAwE;IACxE,YAAY,EAAE,YAAY,CAAC;IAC3B,+DAA+D;IAC/D,aAAa,EAAE,MAAM,CAAC;IACtB,kEAAkE;IAClE,KAAK,EAAE,SAAS,UAAU,EAAE,CAAC;CAC9B;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,OAAO,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;GAKG;AACH,MAAM,MAAM,MAAM,GAAG,CAAC,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,KAAK,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAEhG;;;GAGG;AACH,MAAM,WAAW,YAAY;IAC3B;;;;;;OAMG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,IAAI,CAAC;IACzC,0DAA0D;IAC1D,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;IACtC,qCAAqC;IACrC,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC;IAC3B,mDAAmD;IACnD,KAAK,IAAI,MAAM,EAAE,CAAC;CACnB;AAID;;;;;GAKG;AACH,wBAAgB,kBAAkB,IAAI,YAAY,CAwBjD;AAED;;;;;;;GAOG;AACH,wBAAsB,YAAY,CAChC,QAAQ,EAAE,YAAY,EACtB,SAAS,EAAE,oBAAoB,EAC/B,QAAQ,EAAE,WAAW,GACpB,OAAO,CAAC,UAAU,CAAC,CAkBrB"}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Success-spec framework for the eval harness.
|
|
3
|
+
*
|
|
4
|
+
* A success specification (a "spec") decides whether one captured agent
|
|
5
|
+
* run satisfied its fixture's success criterion. The framework provides
|
|
6
|
+
* three things:
|
|
7
|
+
*
|
|
8
|
+
* 1. A `SpecRegistry` plus a `createSpecRegistry()` factory.
|
|
9
|
+
* Registration is explicit — no side-effect registration on import.
|
|
10
|
+
* Callers build a registry, call `registerStarterSpecs()` if they
|
|
11
|
+
* want the common helpers (see `spec-helpers.ts`), and register
|
|
12
|
+
* their own specs on top.
|
|
13
|
+
*
|
|
14
|
+
* 2. An `evaluateSpec(registry, reference, snapshot)` evaluator that
|
|
15
|
+
* looks the spec up by name, awaits it (specs may be async), and
|
|
16
|
+
* returns a structured `SpecResult`. A thrown error inside the
|
|
17
|
+
* spec body is caught and converted into `{ ok: false, error }`,
|
|
18
|
+
* so a misbehaving spec never crashes the harness.
|
|
19
|
+
*
|
|
20
|
+
* 3. The `RunSnapshot` interface — the input every spec consumes.
|
|
21
|
+
* Intentionally narrower than the runner's eventual `RunRecord`,
|
|
22
|
+
* so this branch develops independently of `eval/harness-runner`.
|
|
23
|
+
* The runner's `RunRecord` is designed to be a structural superset:
|
|
24
|
+
* a runner caller passes its `record` straight to `evaluateSpec`.
|
|
25
|
+
*
|
|
26
|
+
* Specs read four slices of state — the final workspace (rules / code
|
|
27
|
+
* / app source), the final runtime (the most recent run summary, any
|
|
28
|
+
* uiErrors, terminal output, etc.), the full assistant text, and the
|
|
29
|
+
* trace. Anything outside those four fields is a sign the snapshot
|
|
30
|
+
* shape needs widening; do that here, not in spec bodies.
|
|
31
|
+
*
|
|
32
|
+
* Spec names follow the same `family/spec-name` kebab-case form that
|
|
33
|
+
* `validateFixture` enforces on `SuccessSpecReference.name`. Registration
|
|
34
|
+
* validates the name at registration time so typos surface immediately.
|
|
35
|
+
*/
|
|
36
|
+
const SPEC_NAME_PATTERN = /^[a-z][a-z0-9-]*\/[a-z][a-z0-9-]*$/;
|
|
37
|
+
/**
|
|
38
|
+
* Create a fresh, empty spec registry. Callers register specs on it
|
|
39
|
+
* explicitly — `registerStarterSpecs()` is the common starting point
|
|
40
|
+
* for fixtures that reuse the helpers; bespoke specs are registered the
|
|
41
|
+
* same way.
|
|
42
|
+
*/
|
|
43
|
+
export function createSpecRegistry() {
|
|
44
|
+
const specs = new Map();
|
|
45
|
+
return {
|
|
46
|
+
register(name, fn) {
|
|
47
|
+
if (typeof name !== 'string' || !SPEC_NAME_PATTERN.test(name)) {
|
|
48
|
+
throw new Error(`spec name must match \`family/spec-name\` kebab-case, got: ${JSON.stringify(name)}`);
|
|
49
|
+
}
|
|
50
|
+
if (specs.has(name)) {
|
|
51
|
+
throw new Error(`spec already registered: ${name}`);
|
|
52
|
+
}
|
|
53
|
+
specs.set(name, fn);
|
|
54
|
+
},
|
|
55
|
+
get(name) {
|
|
56
|
+
return specs.get(name);
|
|
57
|
+
},
|
|
58
|
+
has(name) {
|
|
59
|
+
return specs.has(name);
|
|
60
|
+
},
|
|
61
|
+
names() {
|
|
62
|
+
return Array.from(specs.keys());
|
|
63
|
+
},
|
|
64
|
+
};
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Resolve a `SuccessSpecReference` against a registry and evaluate it
|
|
68
|
+
* over a `RunSnapshot`. Returns a `SpecResult`. Never throws — an
|
|
69
|
+
* unregistered name, a thrown spec body, or a returned non-result is
|
|
70
|
+
* surfaced as `{ ok: false, error }`.
|
|
71
|
+
*
|
|
72
|
+
* Async specs are awaited. Sync specs are returned unchanged.
|
|
73
|
+
*/
|
|
74
|
+
export async function evaluateSpec(registry, reference, snapshot) {
|
|
75
|
+
const fn = registry.get(reference.name);
|
|
76
|
+
if (!fn) {
|
|
77
|
+
return { ok: false, error: `spec not registered: ${reference.name}` };
|
|
78
|
+
}
|
|
79
|
+
try {
|
|
80
|
+
const result = await fn(snapshot, reference.args);
|
|
81
|
+
if (!isSpecResult(result)) {
|
|
82
|
+
return {
|
|
83
|
+
ok: false,
|
|
84
|
+
error: `spec "${reference.name}" returned a non-SpecResult value`,
|
|
85
|
+
};
|
|
86
|
+
}
|
|
87
|
+
return result;
|
|
88
|
+
}
|
|
89
|
+
catch (err) {
|
|
90
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
91
|
+
return { ok: false, error: `spec "${reference.name}" threw: ${message}` };
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
function isSpecResult(value) {
|
|
95
|
+
if (value === null || typeof value !== 'object')
|
|
96
|
+
return false;
|
|
97
|
+
const obj = value;
|
|
98
|
+
return typeof obj.ok === 'boolean';
|
|
99
|
+
}
|
|
100
|
+
//# sourceMappingURL=spec-framework.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"spec-framework.js","sourceRoot":"","sources":["../../src/eval/spec-framework.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAoEH,MAAM,iBAAiB,GAAG,oCAAoC,CAAC;AAE/D;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB;IAChC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,OAAO;QACL,QAAQ,CAAC,IAAI,EAAE,EAAE;YACf,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC9D,MAAM,IAAI,KAAK,CACb,8DAA8D,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,CACrF,CAAC;YACJ,CAAC;YACD,IAAI,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBACpB,MAAM,IAAI,KAAK,CAAC,4BAA4B,IAAI,EAAE,CAAC,CAAC;YACtD,CAAC;YACD,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtB,CAAC;QACD,GAAG,CAAC,IAAI;YACN,OAAO,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACzB,CAAC;QACD,GAAG,CAAC,IAAI;YACN,OAAO,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACzB,CAAC;QACD,KAAK;YACH,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;QAClC,CAAC;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAsB,EACtB,SAA+B,EAC/B,QAAqB;IAErB,MAAM,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;IACxC,IAAI,CAAC,EAAE,EAAE,CAAC;QACR,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,wBAAwB,SAAS,CAAC,IAAI,EAAE,EAAE,CAAC;IACxE,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC;QAClD,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,OAAO;gBACL,EAAE,EAAE,KAAK;gBACT,KAAK,EAAE,SAAS,SAAS,CAAC,IAAI,mCAAmC;aAClE,CAAC;QACJ,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QACjE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,SAAS,SAAS,CAAC,IAAI,YAAY,OAAO,EAAE,EAAE,CAAC;IAC5E,CAAC;AACH,CAAC;AAED,SAAS,YAAY,CAAC,KAAc;IAClC,IAAI,KAAK,KAAK,IAAI,IAAI,OAAO,KAAK,KAAK,QAAQ;QAAE,OAAO,KAAK,CAAC;IAC9D,MAAM,GAAG,GAAG,KAAgC,CAAC;IAC7C,OAAO,OAAO,GAAG,CAAC,EAAE,KAAK,SAAS,CAAC;AACrC,CAAC"}
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Starter library of common success specs.
|
|
3
|
+
*
|
|
4
|
+
* These helpers are intentionally crude. Golden-task authors compose
|
|
5
|
+
* them through `SuccessSpecReference.args`; custom specs are written
|
|
6
|
+
* only when crudeness is not enough.
|
|
7
|
+
*
|
|
8
|
+
* Registration is explicit. Callers create a registry and call
|
|
9
|
+
* `registerStarterSpecs(registry)` to get all six helpers under their
|
|
10
|
+
* documented names. No side-effect registration on import — that way
|
|
11
|
+
* a host can choose to register a subset, swap in stricter variants,
|
|
12
|
+
* or shadow a helper without monkey-patching this module.
|
|
13
|
+
*
|
|
14
|
+
* Each helper has a `family/spec-name` kebab-case identifier that
|
|
15
|
+
* matches the form `validateFixture` enforces on
|
|
16
|
+
* `SuccessSpecReference.name`. The constants below are exported so
|
|
17
|
+
* fixture authors and reviewers have a single place to import the
|
|
18
|
+
* canonical strings from.
|
|
19
|
+
*
|
|
20
|
+
* Argument shape and behavior are documented per-spec via JSDoc.
|
|
21
|
+
* The framework treats `args` as `unknown`; each spec validates its
|
|
22
|
+
* own shape and surfaces a clear `error` if the args are malformed,
|
|
23
|
+
* rather than throwing. (The evaluator catches throws too, but a
|
|
24
|
+
* structured error message is friendlier to read.)
|
|
25
|
+
*/
|
|
26
|
+
import type { SpecFn, SpecRegistry } from './spec-framework.js';
|
|
27
|
+
export declare const SPEC_REPORT_MENTIONS_AT_LEAST_ONE_OF = "report-mentions/at-least-one-of";
|
|
28
|
+
export declare const SPEC_REPORT_MENTIONS_ALL_OF = "report-mentions/all-of";
|
|
29
|
+
export declare const SPEC_TRACE_CONTAINS_TOOL_CALL_BY_NAME = "trace-contains-tool-call/by-name";
|
|
30
|
+
export declare const SPEC_FINAL_RULES_INCLUDES_LITERAL = "final-rules-includes/literal";
|
|
31
|
+
export declare const SPEC_FINAL_RULES_EXCLUDES_LITERAL = "final-rules-excludes/literal";
|
|
32
|
+
export declare const SPEC_FINAL_RUNTIME_RUN_SUMMARY_OK = "final-runtime/run-summary-ok";
|
|
33
|
+
export declare const SPEC_GAME_RULES_SIMULATOR_ACCEPTS_POSITIVE_AND_REJECTS_CHEAT = "game-rules/simulator-accepts-positive-and-rejects-cheat";
|
|
34
|
+
export declare const SPEC_PYRIC_AGENTS_LINT_CLEAN_AND_RULE_REJECTS_CHEAT = "pyric-agents/lint-clean-and-rule-rejects-cheat";
|
|
35
|
+
/**
|
|
36
|
+
* Args: `{ tokens: string[]; caseSensitive?: boolean }`.
|
|
37
|
+
*
|
|
38
|
+
* Passes when `assistantText` contains at least one of `tokens`.
|
|
39
|
+
* Empty / non-array `tokens` is a malformed-args error.
|
|
40
|
+
*
|
|
41
|
+
* Example fixture reference:
|
|
42
|
+
*
|
|
43
|
+
* {
|
|
44
|
+
* "name": "report-mentions/at-least-one-of",
|
|
45
|
+
* "args": { "tokens": ["open-write", "missing auth check"] }
|
|
46
|
+
* }
|
|
47
|
+
*/
|
|
48
|
+
export declare const reportMentionsAtLeastOneOf: SpecFn;
|
|
49
|
+
/**
|
|
50
|
+
* Args: `{ tokens: string[]; caseSensitive?: boolean }`.
|
|
51
|
+
*
|
|
52
|
+
* Passes when `assistantText` contains every entry in `tokens`.
|
|
53
|
+
* Empty / non-array `tokens` is a malformed-args error.
|
|
54
|
+
*
|
|
55
|
+
* Example fixture reference:
|
|
56
|
+
*
|
|
57
|
+
* {
|
|
58
|
+
* "name": "report-mentions/all-of",
|
|
59
|
+
* "args": { "tokens": ["users", "open-write", "fix"] }
|
|
60
|
+
* }
|
|
61
|
+
*/
|
|
62
|
+
export declare const reportMentionsAllOf: SpecFn;
|
|
63
|
+
/**
|
|
64
|
+
* Args: `{ tool: string; minCount?: number }`.
|
|
65
|
+
*
|
|
66
|
+
* Passes when the trace contains at least `minCount` (default 1)
|
|
67
|
+
* `tool_call` records emitted by the response trace for a tool named
|
|
68
|
+
* `tool`. Tool calls are read off `llm_response` trace events, which
|
|
69
|
+
* is where the agent loop records what the model asked for in each
|
|
70
|
+
* iteration.
|
|
71
|
+
*
|
|
72
|
+
* Example fixture reference:
|
|
73
|
+
*
|
|
74
|
+
* {
|
|
75
|
+
* "name": "trace-contains-tool-call/by-name",
|
|
76
|
+
* "args": { "tool": "rulesSimulator", "minCount": 1 }
|
|
77
|
+
* }
|
|
78
|
+
*/
|
|
79
|
+
export declare const traceContainsToolCallByName: SpecFn;
|
|
80
|
+
/**
|
|
81
|
+
* Args: `{ literal: string; caseSensitive?: boolean }`.
|
|
82
|
+
*
|
|
83
|
+
* Passes when `finalWorkspace.rules` contains `literal` as a substring.
|
|
84
|
+
* `literal` must be a non-empty string.
|
|
85
|
+
*
|
|
86
|
+
* Example fixture reference:
|
|
87
|
+
*
|
|
88
|
+
* {
|
|
89
|
+
* "name": "final-rules-includes/literal",
|
|
90
|
+
* "args": { "literal": "request.auth != null" }
|
|
91
|
+
* }
|
|
92
|
+
*/
|
|
93
|
+
export declare const finalRulesIncludesLiteral: SpecFn;
|
|
94
|
+
/**
|
|
95
|
+
* Args: `{ literal: string; caseSensitive?: boolean }`.
|
|
96
|
+
*
|
|
97
|
+
* Passes when `finalWorkspace.rules` does NOT contain `literal` as a
|
|
98
|
+
* substring. Useful for asserting that a planted antipattern has been
|
|
99
|
+
* removed.
|
|
100
|
+
*
|
|
101
|
+
* Example fixture reference:
|
|
102
|
+
*
|
|
103
|
+
* {
|
|
104
|
+
* "name": "final-rules-excludes/literal",
|
|
105
|
+
* "args": { "literal": "allow write: if true" }
|
|
106
|
+
* }
|
|
107
|
+
*/
|
|
108
|
+
export declare const finalRulesExcludesLiteral: SpecFn;
|
|
109
|
+
/**
|
|
110
|
+
* Args: none (`undefined` or `{}`).
|
|
111
|
+
*
|
|
112
|
+
* Passes when `finalRuntime.runSummary` exists and `runSummary.ok` is
|
|
113
|
+
* true — i.e. the most recent `runCode` invocation succeeded. Fails
|
|
114
|
+
* when there is no run summary at all (the spec only makes sense for
|
|
115
|
+
* fixtures whose skill is expected to run code).
|
|
116
|
+
*
|
|
117
|
+
* Example fixture reference:
|
|
118
|
+
*
|
|
119
|
+
* { "name": "final-runtime/run-summary-ok" }
|
|
120
|
+
*/
|
|
121
|
+
export declare const finalRuntimeRunSummaryOk: SpecFn;
|
|
122
|
+
/**
|
|
123
|
+
* Args: `{ database?: 'firestore' | 'rtdb'; positive: PositiveArgs; cheat: CheatArgs }`.
|
|
124
|
+
*
|
|
125
|
+
* Both-direction check: the generated rules should accept a defined
|
|
126
|
+
* positive move AND reject a defined cheating attempt. The fixture
|
|
127
|
+
* supplies one side under `positive` and the other under `cheat`.
|
|
128
|
+
*
|
|
129
|
+
* V1 approximation: token-presence over `finalWorkspace.rules`. A
|
|
130
|
+
* future iteration would replace this with an actual Firestore /
|
|
131
|
+
* Realtime Database security-rules simulator call, exercising the
|
|
132
|
+
* `positive` move (expecting `allow`) and the `cheat` move
|
|
133
|
+
* (expecting `deny`). Wiring that simulator is out of scope for v1
|
|
134
|
+
* because it requires either the Firebase rules-emulator process
|
|
135
|
+
* (Node-only, slow to start, off-limits in a browser-safe surface)
|
|
136
|
+
* or a bundled WASM rules interpreter — neither of which the eval
|
|
137
|
+
* harness has today.
|
|
138
|
+
*
|
|
139
|
+
* The approximation supports two arg-shapes so it can serve both
|
|
140
|
+
* the brief's documented shape and the simulator-style shape the
|
|
141
|
+
* generative fixtures already use on disk:
|
|
142
|
+
*
|
|
143
|
+
* 1. Token shape (explicit, preferred for future-authored fixtures):
|
|
144
|
+
* positive: { description?: string; requiredTokens: string[] }
|
|
145
|
+
* cheat: { description?: string; rejectionTokens: string[] }
|
|
146
|
+
* Each token list is checked as case-sensitive substrings on
|
|
147
|
+
* `finalWorkspace.rules`.
|
|
148
|
+
*
|
|
149
|
+
* 2. Simulator shape (used by the existing fixtures):
|
|
150
|
+
* positive: { auth, path, op, data, expect: 'allow' }
|
|
151
|
+
* cheat: { auth, path, op, data, expect: 'deny' }
|
|
152
|
+
* Tokens are derived from the simulator-side `data` and `path`
|
|
153
|
+
* values — each string-valued leaf and the path segment are
|
|
154
|
+
* required to appear in the rules text. The expectation field
|
|
155
|
+
* (`allow` / `deny`) is recorded in the detail payload but is
|
|
156
|
+
* not used by the token check; the actual accept/reject
|
|
157
|
+
* decision is what the future simulator iteration would
|
|
158
|
+
* validate.
|
|
159
|
+
*
|
|
160
|
+
* Passes iff every derived positive token AND every derived cheat
|
|
161
|
+
* token appears in `finalWorkspace.rules`. Otherwise returns
|
|
162
|
+
* `{ ok: false, detail: { missingPositive, missingCheat } }`.
|
|
163
|
+
*
|
|
164
|
+
* Example fixture reference (simulator shape):
|
|
165
|
+
*
|
|
166
|
+
* {
|
|
167
|
+
* "name": "game-rules/simulator-accepts-positive-and-rejects-cheat",
|
|
168
|
+
* "args": {
|
|
169
|
+
* "database": "rtdb",
|
|
170
|
+
* "positive": { "auth": { "uid": "uidA" }, "path": "/games/g1",
|
|
171
|
+
* "op": "update", "data": { ... }, "expect": "allow" },
|
|
172
|
+
* "cheat": { "auth": { "uid": "uidB" }, "path": "/games/g1",
|
|
173
|
+
* "op": "update", "data": { ... }, "expect": "deny" }
|
|
174
|
+
* }
|
|
175
|
+
* }
|
|
176
|
+
*/
|
|
177
|
+
export declare const gameRulesSimulatorAcceptsPositiveAndRejectsCheat: SpecFn;
|
|
178
|
+
/**
|
|
179
|
+
* Args: `{ lintToolName?: string; cheat?: CheatArgs; cheatAttempt?: CheatArgs }`.
|
|
180
|
+
*
|
|
181
|
+
* Two-part check: (1) the agent called the pyric lint
|
|
182
|
+
* tool during the run, AND (2) the resulting rules text contains the
|
|
183
|
+
* tokens that should be present if the cheating attempt is
|
|
184
|
+
* structurally rejected by the rules.
|
|
185
|
+
*
|
|
186
|
+
* V1 approximation:
|
|
187
|
+
* - Step 1 walks `snapshot.trace` for any `llm_response` event
|
|
188
|
+
* containing a tool call whose name matches `lintToolName`
|
|
189
|
+
* (default `lint_firestore_rules`). If absent, returns
|
|
190
|
+
* `{ ok: false, detail: { reason: 'lint-not-called' } }`. The
|
|
191
|
+
* pyric lint tool's success/failure is observable in the
|
|
192
|
+
* assistant's reasoning and in the tool result, but capturing
|
|
193
|
+
* the precise `tool_result` event shape across providers is
|
|
194
|
+
* fragile — checking that the tool was *called* is the right
|
|
195
|
+
* v1 signal. A future iteration would also verify the lint
|
|
196
|
+
* tool's result was `ok: true` at the trace level.
|
|
197
|
+
* - Step 2 checks `finalWorkspace.rules` includes every token in
|
|
198
|
+
* the cheat's `rejectionTokens`. The cheat may be supplied
|
|
199
|
+
* under either `cheat` (the brief's name) or `cheatAttempt`
|
|
200
|
+
* (the name the existing fixture uses). When the cheat is
|
|
201
|
+
* simulator-shaped (no `rejectionTokens`, only `data` + `path`),
|
|
202
|
+
* tokens are derived from those leaves the same way as
|
|
203
|
+
* `gameRulesSimulatorAcceptsPositiveAndRejectsCheat`.
|
|
204
|
+
*
|
|
205
|
+
* Returns `{ ok: true }` on success, or
|
|
206
|
+
* `{ ok: false, detail: { reason: 'rejection-tokens-missing', missing } }`
|
|
207
|
+
* on the second-step failure.
|
|
208
|
+
*
|
|
209
|
+
* Example fixture reference:
|
|
210
|
+
*
|
|
211
|
+
* {
|
|
212
|
+
* "name": "pyric-agents/lint-clean-and-rule-rejects-cheat",
|
|
213
|
+
* "args": {
|
|
214
|
+
* "lintToolName": "lint_firestore_rules",
|
|
215
|
+
* "cheatAttempt": { "path": "/orders/orderA", "op": "create",
|
|
216
|
+
* "data": { ... }, "expect": "deny" }
|
|
217
|
+
* }
|
|
218
|
+
* }
|
|
219
|
+
*/
|
|
220
|
+
export declare const pyricAgentsLintCleanAndRuleRejectsCheat: SpecFn;
|
|
221
|
+
/**
|
|
222
|
+
* Register every starter spec on a registry. Idempotency is not a
|
|
223
|
+
* design goal — calling this twice on the same registry throws (the
|
|
224
|
+
* registry rejects duplicate registrations on purpose). Callers that
|
|
225
|
+
* want a subset should call `registry.register()` themselves.
|
|
226
|
+
*/
|
|
227
|
+
export declare function registerStarterSpecs(registry: SpecRegistry): void;
|
|
228
|
+
/**
|
|
229
|
+
* Register every custom (post-starter) spec on a registry. Sibling
|
|
230
|
+
* to `registerStarterSpecs`. Splitting the two keeps the meaning of
|
|
231
|
+
* "starter" stable as new custom specs are added.
|
|
232
|
+
*/
|
|
233
|
+
export declare function registerCustomSpecs(registry: SpecRegistry): void;
|
|
234
|
+
/**
|
|
235
|
+
* Umbrella that registers every spec the library ships — both the
|
|
236
|
+
* starter library and the custom helpers. Equivalent to calling
|
|
237
|
+
* `registerStarterSpecs(registry)` followed by
|
|
238
|
+
* `registerCustomSpecs(registry)`.
|
|
239
|
+
*/
|
|
240
|
+
export declare function registerAllSpecs(registry: SpecRegistry): void;
|
|
241
|
+
/** Stable list of starter spec names, in registration order. */
|
|
242
|
+
export declare const STARTER_SPEC_NAMES: readonly string[];
|
|
243
|
+
/** Stable list of custom spec names, in registration order. */
|
|
244
|
+
export declare const CUSTOM_SPEC_NAMES: readonly string[];
|
|
245
|
+
//# sourceMappingURL=spec-helpers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"spec-helpers.d.ts","sourceRoot":"","sources":["../../src/eval/spec-helpers.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAe,MAAM,EAAE,YAAY,EAAc,MAAM,qBAAqB,CAAC;AAEzF,eAAO,MAAM,oCAAoC,oCAAoC,CAAC;AACtF,eAAO,MAAM,2BAA2B,2BAA2B,CAAC;AACpE,eAAO,MAAM,qCAAqC,qCAAqC,CAAC;AACxF,eAAO,MAAM,iCAAiC,iCAAiC,CAAC;AAChF,eAAO,MAAM,iCAAiC,iCAAiC,CAAC;AAChF,eAAO,MAAM,iCAAiC,iCAAiC,CAAC;AAEhF,eAAO,MAAM,4DAA4D,4DACd,CAAC;AAC5D,eAAO,MAAM,mDAAmD,mDACd,CAAC;AAEnD;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,0BAA0B,EAAE,MAiBxC,CAAC;AAEF;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,mBAAmB,EAAE,MAiBjC,CAAC;AAEF;;;;;;;;;;;;;;;GAeG;AACH,eAAO,MAAM,2BAA2B,EAAE,MAgBzC,CAAC;AAEF;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,yBAAyB,EAAE,MAEvC,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,yBAAyB,EAAE,MAEvC,CAAC;AAEF;;;;;;;;;;;GAWG;AACH,eAAO,MAAM,wBAAwB,EAAE,MAkBtC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsDG;AACH,eAAO,MAAM,gDAAgD,EAAE,MAoB9D,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AACH,eAAO,MAAM,uCAAuC,EAAE,MAqDrD,CAAC;AAEF;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,QAAQ,EAAE,YAAY,GAAG,IAAI,CAOjE;AAED;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,YAAY,GAAG,IAAI,CAShE;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,YAAY,GAAG,IAAI,CAG7D;AAED,gEAAgE;AAChE,eAAO,MAAM,kBAAkB,EAAE,SAAS,MAAM,EAO/C,CAAC;AAEF,+DAA+D;AAC/D,eAAO,MAAM,iBAAiB,EAAE,SAAS,MAAM,EAG9C,CAAC"}
|