@agent-relay/evals 8.8.1 → 8.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js.map +1 -1
- package/dist/scenarios/onboarding.d.ts +20 -0
- package/dist/scenarios/onboarding.d.ts.map +1 -0
- package/dist/scenarios/onboarding.js +66 -0
- package/dist/scenarios/onboarding.js.map +1 -0
- package/dist/scoring/mount.d.ts +84 -0
- package/dist/scoring/mount.d.ts.map +1 -0
- package/dist/scoring/mount.js +106 -0
- package/dist/scoring/mount.js.map +1 -0
- package/dist/types.d.ts +44 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +7 -2
package/dist/index.d.ts
CHANGED
|
@@ -10,6 +10,6 @@
|
|
|
10
10
|
* on. The full migration into this package is tracked in
|
|
11
11
|
* specs/agent-relay-evals-package.md.
|
|
12
12
|
*/
|
|
13
|
-
export type { EvalScenario, ScenarioContext, ScenarioResult, EvalTier, MetricSet, EvalReport, MatrixReport, AgentInfo, TranscriptEntry, Phantom, } from './types.js';
|
|
13
|
+
export type { EvalScenario, ScenarioContext, ScenarioResult, EvalTier, MetricSet, EvalReport, MatrixReport, AgentInfo, TranscriptEntry, Phantom, MountScenario, MountScenarioResult, MountCellMetrics, } from './types.js';
|
|
14
14
|
export { SCHEMA_VERSION } from './types.js';
|
|
15
15
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,YAAY,EACV,YAAY,EACZ,eAAe,EACf,cAAc,EACd,QAAQ,EACR,SAAS,EACT,UAAU,EACV,YAAY,EACZ,SAAS,EACT,eAAe,EACf,OAAO,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,YAAY,EACV,YAAY,EACZ,eAAe,EACf,cAAc,EACd,QAAQ,EACR,SAAS,EACT,UAAU,EACV,YAAY,EACZ,SAAS,EACT,eAAe,EACf,OAAO,EAEP,aAAa,EACb,mBAAmB,EACnB,gBAAgB,GACjB,MAAM,YAAY,CAAC;AAEpB,OAAO,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC"}
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAmBH,OAAO,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Onboarding variants for spawn/release reliability evals.
|
|
3
|
+
*
|
|
4
|
+
* Ordered from lightest to heaviest. The eval runner tests each variant and
|
|
5
|
+
* the goal is to find the MINIMUM text that achieves 10/10 reliability, so
|
|
6
|
+
* variants at the top of the list are preferred if they prove sufficient.
|
|
7
|
+
*
|
|
8
|
+
* bare → zero spawn/release guidance (measures the baseline failure rate)
|
|
9
|
+
* one-liner → single sentence naming both tools (minimum viable hint)
|
|
10
|
+
* brief → tool names + parameters + when-to-use (compact but complete)
|
|
11
|
+
* skill → full reference with examples (maximum clarity, maximum tokens)
|
|
12
|
+
*/
|
|
13
|
+
export type OnboardingVariant = 'bare' | 'one-liner' | 'brief' | 'skill';
|
|
14
|
+
export declare const ONBOARDING_VARIANTS: readonly ["bare", "one-liner", "brief", "skill"];
|
|
15
|
+
/**
|
|
16
|
+
* Return the onboarding text suffix for a given variant.
|
|
17
|
+
* Appended to the scenario-specific role description.
|
|
18
|
+
*/
|
|
19
|
+
export declare function onboardingText(variant: OnboardingVariant): string;
|
|
20
|
+
//# sourceMappingURL=onboarding.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"onboarding.d.ts","sourceRoot":"","sources":["../../src/scenarios/onboarding.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,MAAM,MAAM,iBAAiB,GAAG,MAAM,GAAG,WAAW,GAAG,OAAO,GAAG,OAAO,CAAC;AAEzE,eAAO,MAAM,mBAAmB,kDAKiB,CAAC;AAElD;;;GAGG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,iBAAiB,GAAG,MAAM,CA8CjE"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Onboarding variants for spawn/release reliability evals.
|
|
3
|
+
*
|
|
4
|
+
* Ordered from lightest to heaviest. The eval runner tests each variant and
|
|
5
|
+
* the goal is to find the MINIMUM text that achieves 10/10 reliability, so
|
|
6
|
+
* variants at the top of the list are preferred if they prove sufficient.
|
|
7
|
+
*
|
|
8
|
+
* bare → zero spawn/release guidance (measures the baseline failure rate)
|
|
9
|
+
* one-liner → single sentence naming both tools (minimum viable hint)
|
|
10
|
+
* brief → tool names + parameters + when-to-use (compact but complete)
|
|
11
|
+
* skill → full reference with examples (maximum clarity, maximum tokens)
|
|
12
|
+
*/
|
|
13
|
+
export const ONBOARDING_VARIANTS = [
|
|
14
|
+
'bare',
|
|
15
|
+
'one-liner',
|
|
16
|
+
'brief',
|
|
17
|
+
'skill',
|
|
18
|
+
];
|
|
19
|
+
/**
|
|
20
|
+
* Return the onboarding text suffix for a given variant.
|
|
21
|
+
* Appended to the scenario-specific role description.
|
|
22
|
+
*/
|
|
23
|
+
export function onboardingText(variant) {
|
|
24
|
+
switch (variant) {
|
|
25
|
+
case 'bare':
|
|
26
|
+
return '';
|
|
27
|
+
case 'one-liner':
|
|
28
|
+
return '\n\nCall mcp__agent-relay__add_agent to spawn a worker agent for a task, and mcp__agent-relay__remove_agent to release workers when they are done.';
|
|
29
|
+
case 'brief':
|
|
30
|
+
return `
|
|
31
|
+
|
|
32
|
+
## Agent management
|
|
33
|
+
- Spawn a relay worker: mcp__agent-relay__add_agent({ name, cli: "claude", task })
|
|
34
|
+
name = unique identifier, task = full instructions for the worker.
|
|
35
|
+
- Release a relay worker: mcp__agent-relay__remove_agent({ name })
|
|
36
|
+
When the task says to delegate or assign work, call add_agent. Release with remove_agent when the worker reports done.`;
|
|
37
|
+
case 'skill':
|
|
38
|
+
return `
|
|
39
|
+
|
|
40
|
+
## Managing Worker Agents
|
|
41
|
+
|
|
42
|
+
### Spawn a relay worker
|
|
43
|
+
To delegate work, call:
|
|
44
|
+
mcp__agent-relay__add_agent({ name: "WorkerName", cli: "claude", task: "detailed instructions" })
|
|
45
|
+
|
|
46
|
+
Required fields: name (unique string), cli ("claude"), task (full instructions for the worker).
|
|
47
|
+
The relay worker will DM you "ACK: <understanding>" when it starts and "DONE: <result>" when complete.
|
|
48
|
+
|
|
49
|
+
**Important**: When your task asks you to "assign to a worker", "delegate to an agent", or "spawn a relay worker",
|
|
50
|
+
this means calling mcp__agent-relay__add_agent — never your built-in Task capability.
|
|
51
|
+
|
|
52
|
+
### Release a relay worker
|
|
53
|
+
As soon as a relay worker reports done, call:
|
|
54
|
+
mcp__agent-relay__remove_agent({ name: "WorkerName" })
|
|
55
|
+
|
|
56
|
+
Always release relay workers when done — unreleased agents waste resources.
|
|
57
|
+
|
|
58
|
+
### When to spawn vs do the work yourself
|
|
59
|
+
If the task explicitly asks you to delegate or assign work to a worker, always spawn — do not do it yourself.
|
|
60
|
+
Spawn for anything large, parallel, or that needs specialised focus.
|
|
61
|
+
Only handle it yourself when the task is trivial AND you were not asked to delegate.`;
|
|
62
|
+
default:
|
|
63
|
+
throw new Error(`Unknown OnboardingVariant: ${variant}`);
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
//# sourceMappingURL=onboarding.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"onboarding.js","sourceRoot":"","sources":["../../src/scenarios/onboarding.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAIH,MAAM,CAAC,MAAM,mBAAmB,GAAG;IACjC,MAAM;IACN,WAAW;IACX,OAAO;IACP,OAAO;CACwC,CAAC;AAElD;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,OAA0B;IACvD,QAAQ,OAAO,EAAE,CAAC;QAChB,KAAK,MAAM;YACT,OAAO,EAAE,CAAC;QAEZ,KAAK,WAAW;YACd,OAAO,oJAAoJ,CAAC;QAE9J,KAAK,OAAO;YACV,OAAO;;;;;;uHAM0G,CAAC;QAEpH,KAAK,OAAO;YACV,OAAO;;;;;;;;;;;;;;;;;;;;;;;qFAuBwE,CAAC;QAElF;YACE,MAAM,IAAI,KAAK,CAAC,8BAA8B,OAAiB,EAAE,CAAC,CAAC;IACvE,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Filesystem scoring utilities for agents that use a relayfile writeback mount.
|
|
3
|
+
*
|
|
4
|
+
* The writeback pattern: an agent writes a JSON file to a path under the
|
|
5
|
+
* .integrations/ mount (e.g. .integrations/slack/channels/<id>/messages/msg.json)
|
|
6
|
+
* and the relayfile writeback consumer picks it up and dispatches to the provider.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* const snapshot = snapshotMount(fixtureDir)
|
|
10
|
+
* // ... spawn agent, wait for exit ...
|
|
11
|
+
* const newFiles = newMountFiles(fixtureDir, snapshot)
|
|
12
|
+
* const score = scoreMountRun({ mountDir: fixtureDir, newFiles, expectedPathPrefix, events })
|
|
13
|
+
*/
|
|
14
|
+
import type { BrokerEvent } from '@agent-relay/harness-driver';
|
|
15
|
+
/** Per-dimension scoring for one mount writeback eval run. */
|
|
16
|
+
export interface MountScore {
|
|
17
|
+
/** Agent created at least one file under the mount root. */
|
|
18
|
+
wroteSomething: boolean;
|
|
19
|
+
/** File(s) landed under the expected path prefix (correct provider + resource). */
|
|
20
|
+
correctPath: boolean;
|
|
21
|
+
/** All files at the expected path are syntactically valid JSON. */
|
|
22
|
+
jsonValid: boolean;
|
|
23
|
+
/** Agent wrote under discovery/ — schema-only, must never happen. */
|
|
24
|
+
discoveryViolation: boolean;
|
|
25
|
+
/** Agent sent at least one relay_inbound message instead of/in addition to writing a file. */
|
|
26
|
+
usedRelayMessaging: boolean;
|
|
27
|
+
/** Agent exited cleanly rather than timing out. */
|
|
28
|
+
cleanExit: boolean;
|
|
29
|
+
/** Overall pass: correctPath && jsonValid && !discoveryViolation. */
|
|
30
|
+
pass: boolean;
|
|
31
|
+
/** All files created by the agent, relative to mountDir. */
|
|
32
|
+
filesWritten: string[];
|
|
33
|
+
/** Subset of filesWritten that match expectedPathPrefix. */
|
|
34
|
+
filesAtCorrectPath: string[];
|
|
35
|
+
}
|
|
36
|
+
/** Options for scoreMountRun(). */
|
|
37
|
+
export interface ScoreMountRunOptions {
|
|
38
|
+
/** Root of the .integrations/ mount directory (absolute path). */
|
|
39
|
+
mountDir: string;
|
|
40
|
+
/** Files created by the agent since spawning (from newMountFiles). */
|
|
41
|
+
newFiles: string[];
|
|
42
|
+
/**
|
|
43
|
+
* Expected path prefix the writeback file must be under (relative to mountDir).
|
|
44
|
+
* Example: ".integrations/slack/channels/C12345__general/messages/"
|
|
45
|
+
*/
|
|
46
|
+
expectedPathPrefix: string;
|
|
47
|
+
/** Broker events captured during the run. */
|
|
48
|
+
events: BrokerEvent[];
|
|
49
|
+
/** Whether the agent exited without timing out. Defaults to true. */
|
|
50
|
+
cleanExit?: boolean;
|
|
51
|
+
}
|
|
52
|
+
/** Summary stats across repeated runs of one scenario×variant cell. */
|
|
53
|
+
export interface MountCellStats {
|
|
54
|
+
runs: number;
|
|
55
|
+
passed: number;
|
|
56
|
+
passRate: number;
|
|
57
|
+
wroteSomethingRate: number;
|
|
58
|
+
correctPathRate: number;
|
|
59
|
+
discoveryViolationRate: number;
|
|
60
|
+
usedRelayMessagingRate: number;
|
|
61
|
+
}
|
|
62
|
+
/**
|
|
63
|
+
* Snapshot all file paths currently under mountDir.
|
|
64
|
+
* Call before spawning the agent; diff with newMountFiles after.
|
|
65
|
+
*/
|
|
66
|
+
export declare function snapshotMount(mountDir: string): Set<string>;
|
|
67
|
+
/**
|
|
68
|
+
* Return absolute paths of files under mountDir that weren't in the snapshot.
|
|
69
|
+
* These are the files the agent created during the run.
|
|
70
|
+
*/
|
|
71
|
+
export declare function newMountFiles(mountDir: string, snapshot: Set<string>): string[];
|
|
72
|
+
/**
|
|
73
|
+
* Score an agent run against a mount writeback scenario.
|
|
74
|
+
*
|
|
75
|
+
* Pass = agent wrote to the correct path, the file is valid JSON, and it did
|
|
76
|
+
* NOT write under discovery/.
|
|
77
|
+
*/
|
|
78
|
+
export declare function scoreMountRun(opts: ScoreMountRunOptions): MountScore;
|
|
79
|
+
/**
|
|
80
|
+
* Roll up repeated MountScore results into pass-rate statistics for one cell
|
|
81
|
+
* (scenario × variant) in a report matrix.
|
|
82
|
+
*/
|
|
83
|
+
export declare function mountCellStats(scores: MountScore[]): MountCellStats;
|
|
84
|
+
//# sourceMappingURL=mount.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mount.d.ts","sourceRoot":"","sources":["../../src/scoring/mount.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAKH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,6BAA6B,CAAC;AAI/D,8DAA8D;AAC9D,MAAM,WAAW,UAAU;IACzB,4DAA4D;IAC5D,cAAc,EAAE,OAAO,CAAC;IACxB,mFAAmF;IACnF,WAAW,EAAE,OAAO,CAAC;IACrB,mEAAmE;IACnE,SAAS,EAAE,OAAO,CAAC;IACnB,qEAAqE;IACrE,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8FAA8F;IAC9F,kBAAkB,EAAE,OAAO,CAAC;IAC5B,mDAAmD;IACnD,SAAS,EAAE,OAAO,CAAC;IACnB,qEAAqE;IACrE,IAAI,EAAE,OAAO,CAAC;IACd,4DAA4D;IAC5D,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,4DAA4D;IAC5D,kBAAkB,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,mCAAmC;AACnC,MAAM,WAAW,oBAAoB;IACnC,kEAAkE;IAClE,QAAQ,EAAE,MAAM,CAAC;IACjB,sEAAsE;IACtE,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB;;;OAGG;IACH,kBAAkB,EAAE,MAAM,CAAC;IAC3B,6CAA6C;IAC7C,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,qEAAqE;IACrE,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB;AAED,uEAAuE;AACvE,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,EAAE,MAAM,CAAC;IACxB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,sBAAsB,EAAE,MAAM,CAAC;CAChC;AAYD;;;GAGG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,CAI3D;AAED;;;GAGG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,MAAM,EAAE,CAI/E;AAID;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,oBAAoB,GAAG,UAAU,CA+BpE;AAID;;;GAGG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,UAAU,EAAE,GAAG,cAAc,CAsBnE"}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Filesystem scoring utilities for agents that use a relayfile writeback mount.
|
|
3
|
+
*
|
|
4
|
+
* The writeback pattern: an agent writes a JSON file to a path under the
|
|
5
|
+
* .integrations/ mount (e.g. .integrations/slack/channels/<id>/messages/msg.json)
|
|
6
|
+
* and the relayfile writeback consumer picks it up and dispatches to the provider.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* const snapshot = snapshotMount(fixtureDir)
|
|
10
|
+
* // ... spawn agent, wait for exit ...
|
|
11
|
+
* const newFiles = newMountFiles(fixtureDir, snapshot)
|
|
12
|
+
* const score = scoreMountRun({ mountDir: fixtureDir, newFiles, expectedPathPrefix, events })
|
|
13
|
+
*/
|
|
14
|
+
import { existsSync, readdirSync, readFileSync } from 'node:fs';
|
|
15
|
+
import { join, relative } from 'node:path';
|
|
16
|
+
// ── Mount file tracking ───────────────────────────────────────────────────────
|
|
17
|
+
function walkDir(dir, out) {
|
|
18
|
+
if (!existsSync(dir))
|
|
19
|
+
return;
|
|
20
|
+
for (const entry of readdirSync(dir, { withFileTypes: true })) {
|
|
21
|
+
const full = join(dir, entry.name);
|
|
22
|
+
entry.isDirectory() ? walkDir(full, out) : out.add(full);
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Snapshot all file paths currently under mountDir.
|
|
27
|
+
* Call before spawning the agent; diff with newMountFiles after.
|
|
28
|
+
*/
|
|
29
|
+
export function snapshotMount(mountDir) {
|
|
30
|
+
const existing = new Set();
|
|
31
|
+
walkDir(mountDir, existing);
|
|
32
|
+
return existing;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Return absolute paths of files under mountDir that weren't in the snapshot.
|
|
36
|
+
* These are the files the agent created during the run.
|
|
37
|
+
*/
|
|
38
|
+
export function newMountFiles(mountDir, snapshot) {
|
|
39
|
+
const current = new Set();
|
|
40
|
+
walkDir(mountDir, current);
|
|
41
|
+
return [...current].filter((f) => !snapshot.has(f));
|
|
42
|
+
}
|
|
43
|
+
// ── Scoring ───────────────────────────────────────────────────────────────────
|
|
44
|
+
/**
|
|
45
|
+
* Score an agent run against a mount writeback scenario.
|
|
46
|
+
*
|
|
47
|
+
* Pass = agent wrote to the correct path, the file is valid JSON, and it did
|
|
48
|
+
* NOT write under discovery/.
|
|
49
|
+
*/
|
|
50
|
+
export function scoreMountRun(opts) {
|
|
51
|
+
const { mountDir, newFiles, expectedPathPrefix, events, cleanExit = true } = opts;
|
|
52
|
+
const relFiles = newFiles.map((f) => relative(mountDir, f));
|
|
53
|
+
const inExpectedPath = relFiles.filter((p) => p.startsWith(expectedPathPrefix));
|
|
54
|
+
const inDiscovery = relFiles.filter((p) => p.includes('/discovery/') || p.startsWith('discovery/'));
|
|
55
|
+
const jsonValid = inExpectedPath.length > 0 &&
|
|
56
|
+
inExpectedPath.every((p) => {
|
|
57
|
+
try {
|
|
58
|
+
JSON.parse(readFileSync(join(mountDir, p), 'utf8'));
|
|
59
|
+
return true;
|
|
60
|
+
}
|
|
61
|
+
catch {
|
|
62
|
+
return false;
|
|
63
|
+
}
|
|
64
|
+
});
|
|
65
|
+
const usedRelayMessaging = events.some((e) => e.kind === 'relay_inbound');
|
|
66
|
+
return {
|
|
67
|
+
wroteSomething: relFiles.length > 0,
|
|
68
|
+
correctPath: inExpectedPath.length > 0,
|
|
69
|
+
jsonValid,
|
|
70
|
+
discoveryViolation: inDiscovery.length > 0,
|
|
71
|
+
usedRelayMessaging,
|
|
72
|
+
cleanExit,
|
|
73
|
+
pass: inExpectedPath.length > 0 && jsonValid && inDiscovery.length === 0,
|
|
74
|
+
filesWritten: relFiles,
|
|
75
|
+
filesAtCorrectPath: inExpectedPath,
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
// ── Aggregate stats ───────────────────────────────────────────────────────────
|
|
79
|
+
/**
|
|
80
|
+
* Roll up repeated MountScore results into pass-rate statistics for one cell
|
|
81
|
+
* (scenario × variant) in a report matrix.
|
|
82
|
+
*/
|
|
83
|
+
export function mountCellStats(scores) {
|
|
84
|
+
const n = scores.length;
|
|
85
|
+
if (n === 0)
|
|
86
|
+
return {
|
|
87
|
+
runs: 0,
|
|
88
|
+
passed: 0,
|
|
89
|
+
passRate: 0,
|
|
90
|
+
wroteSomethingRate: 0,
|
|
91
|
+
correctPathRate: 0,
|
|
92
|
+
discoveryViolationRate: 0,
|
|
93
|
+
usedRelayMessagingRate: 0,
|
|
94
|
+
};
|
|
95
|
+
const rate = (pred) => scores.filter(pred).length / n;
|
|
96
|
+
return {
|
|
97
|
+
runs: n,
|
|
98
|
+
passed: scores.filter((s) => s.pass).length,
|
|
99
|
+
passRate: rate((s) => s.pass),
|
|
100
|
+
wroteSomethingRate: rate((s) => s.wroteSomething),
|
|
101
|
+
correctPathRate: rate((s) => s.correctPath),
|
|
102
|
+
discoveryViolationRate: rate((s) => s.discoveryViolation),
|
|
103
|
+
usedRelayMessagingRate: rate((s) => s.usedRelayMessaging),
|
|
104
|
+
};
|
|
105
|
+
}
|
|
106
|
+
//# sourceMappingURL=mount.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mount.js","sourceRoot":"","sources":["../../src/scoring/mount.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AAChE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAwD3C,iFAAiF;AAEjF,SAAS,OAAO,CAAC,GAAW,EAAE,GAAgB;IAC5C,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;QAAE,OAAO;IAC7B,KAAK,MAAM,KAAK,IAAI,WAAW,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QAC9D,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;QACnC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC3D,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,aAAa,CAAC,QAAgB;IAC5C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IACnC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC5B,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,aAAa,CAAC,QAAgB,EAAE,QAAqB;IACnE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,OAAO,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAC3B,OAAO,CAAC,GAAG,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;AACtD,CAAC;AAED,iFAAiF;AAEjF;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,IAA0B;IACtD,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC;IAElF,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC;IAC5D,MAAM,cAAc,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,kBAAkB,CAAC,CAAC,CAAC;IAChF,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC,CAAC;IAEpG,MAAM,SAAS,GACb,cAAc,CAAC,MAAM,GAAG,CAAC;QACzB,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;YACzB,IAAI,CAAC;gBACH,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;gBACpD,OAAO,IAAI,CAAC;YACd,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;IAEL,MAAM,kBAAkB,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,eAAe,CAAC,CAAC;IAE1E,OAAO;QACL,cAAc,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC;QACnC,WAAW,EAAE,cAAc,CAAC,MAAM,GAAG,CAAC;QACtC,SAAS;QACT,kBAAkB,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC;QAC1C,kBAAkB;QAClB,SAAS;QACT,IAAI,EAAE,cAAc,CAAC,MAAM,GAAG,CAAC,IAAI,SAAS,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;QACxE,YAAY,EAAE,QAAQ;QACtB,kBAAkB,EAAE,cAAc;KACnC,CAAC;AACJ,CAAC;AAED,iFAAiF;AAEjF;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,MAAoB;IACjD,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACxB,IAAI,CAAC,KAAK,CAAC;QACT,OAAO;YACL,IAAI,EAAE,CAAC;YACP,MAAM,EAAE,CAAC;YACT,QAAQ,EAAE,CAAC;YACX,kBAAkB,EAAE,CAAC;YACrB,eAAe,EAAE,CAAC;YAClB,sBAAsB,EAAE,CAAC;YACzB,sBAAsB,EAAE,CAAC;SAC1B,CAAC;IACJ,MAAM,IAAI,GAAG,CAAC,IAAgC,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;IAClF,OAAO;QACL,IAAI,EAAE,CAAC;QACP,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM;QAC3C,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAC7B,kBAAkB,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,CAAC;QACjD,eAAe,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC;QAC3C,sBAAsB,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,kBAAkB,CAAC;QACzD,sBAAsB,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,kBAAkB,CAAC;KAC1D,CAAC;AACJ,CAAC"}
|
package/dist/types.d.ts
CHANGED
|
@@ -158,4 +158,48 @@ export interface MatrixReport {
|
|
|
158
158
|
harnesses: Record<string, MetricSet>;
|
|
159
159
|
}
|
|
160
160
|
export declare const SCHEMA_VERSION = 1;
|
|
161
|
+
/**
|
|
162
|
+
* A scenario that tests whether an agent correctly writes a writeback file
|
|
163
|
+
* to the relayfile .integrations/ mount instead of (or alongside) relay messaging.
|
|
164
|
+
*/
|
|
165
|
+
export interface MountScenario {
|
|
166
|
+
id: string;
|
|
167
|
+
title: string;
|
|
168
|
+
/**
|
|
169
|
+
* Path prefix (relative to the fixture/mount root) the agent must write
|
|
170
|
+
* under to pass. Example: ".integrations/slack/channels/C12345__general/messages/"
|
|
171
|
+
*/
|
|
172
|
+
expectedPathPrefix: string;
|
|
173
|
+
/**
|
|
174
|
+
* Task text to inject into the agent at spawn.
|
|
175
|
+
* The eval runner may prepend a variant-specific prefix before this.
|
|
176
|
+
*/
|
|
177
|
+
task: string;
|
|
178
|
+
}
|
|
179
|
+
/** One scored run of a MountScenario. */
|
|
180
|
+
export interface MountScenarioResult {
|
|
181
|
+
scenarioId: string;
|
|
182
|
+
scenarioTitle: string;
|
|
183
|
+
/** Onboarding variant under test (e.g. "bare", "claude-md", "slim-inject", "full-inject"). */
|
|
184
|
+
variant: string;
|
|
185
|
+
/** 1-based repetition index within this scenario×variant cell. */
|
|
186
|
+
run: number;
|
|
187
|
+
pass: boolean;
|
|
188
|
+
wroteSomething: boolean;
|
|
189
|
+
correctPath: boolean;
|
|
190
|
+
jsonValid: boolean;
|
|
191
|
+
discoveryViolation: boolean;
|
|
192
|
+
usedRelayMessaging: boolean;
|
|
193
|
+
filesWritten: string[];
|
|
194
|
+
durationMs: number;
|
|
195
|
+
error?: string;
|
|
196
|
+
}
|
|
197
|
+
/** Aggregated metrics for one scenario×variant cell across repeated runs. */
|
|
198
|
+
export interface MountCellMetrics {
|
|
199
|
+
scenarioId: string;
|
|
200
|
+
variant: string;
|
|
201
|
+
runs: number;
|
|
202
|
+
passed: number;
|
|
203
|
+
passRate: number;
|
|
204
|
+
}
|
|
161
205
|
//# sourceMappingURL=types.d.ts.map
|
package/dist/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AACH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAElD,wDAAwD;AACxD,MAAM,WAAW,eAAe;IAC9B,4FAA4F;IAC5F,OAAO,EAAE,aAAa,CAAC;IACvB,uEAAuE;IACvE,GAAG,EAAE,MAAM,CAAC;IACZ,2FAA2F;IAC3F,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,mEAAmE;IACnE,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB;IACpB,KAAK,EAAE,CAAC,EAAE,EAAE,MAAM,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CACtC;AAED;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,2CAA2C;IAC3C,KAAK,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,IAAI,EAAE,MAAM,CAAC;IACb,kEAAkE;IAClE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,yDAAyD;IACzD,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,4EAA4E;AAC5E,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,GAAG,EAAE,MAAM,CAAC;IACZ,gDAAgD;IAChD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,oFAAoF;AACpF,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;IACb,+EAA+E;IAC/E,SAAS,EAAE,OAAO,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,0EAA0E;AAC1E,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,2CAA2C;IAC3C,IAAI,EAAE,OAAO,CAAC;IACd,yDAAyD;IACzD,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,sEAAsE;IACtE,UAAU,EAAE,eAAe,EAAE,CAAC;IAC9B,qEAAqE;IACrE,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,QAAQ,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,QAAQ,EAAE,OAAO,EAAE,CAAC;IACpB,oEAAoE;IACpE,YAAY,EAAE,MAAM,CAAC;IACrB,oEAAoE;IACpE,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,kFAAkF;IAClF,mBAAmB,EAAE,MAAM,CAAC;IAC5B,gEAAgE;IAChE,UAAU,EAAE,OAAO,CAAC;IACpB,0CAA0C;IAC1C,MAAM,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC;QAChB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,kEAAkE;IAClE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qEAAqE;IACrE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,0DAA0D;IAC1D,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iGAAiG;IACjG,sBAAsB,CAAC,EAAE,OAAO,CAAC;IACjC,iEAAiE;IACjE,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;GAOG;AACH,MAAM,MAAM,QAAQ,GAAG,OAAO,GAAG,WAAW,CAAC;AAE7C,2DAA2D;AAC3D,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,QAAQ,CAAC;IACf,iEAAiE;IACjE,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,4CAA4C;IAC5C,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,gFAAgF;IAChF,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B;;;OAGG;IACH,GAAG,EAAE,CAAC,GAAG,EAAE,eAAe,KAAK,OAAO,CAAC,cAAc,CAAC,CAAC;CACxD;AAED,+DAA+D;AAC/D,MAAM,WAAW,SAAS;IACxB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,eAAe,EAAE,MAAM,CAAC;IACxB,cAAc,EAAE,MAAM,CAAC;IACvB,2EAA2E;IAC3E,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,8EAA8E;IAC9E,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,yCAAyC;AACzC,MAAM,WAAW,UAAU;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE;QACH,OAAO,EAAE,OAAO,CAAC;QACjB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,OAAO,EAAE,SAAS,CAAC;IACnB,SAAS,EAAE,cAAc,EAAE,CAAC;CAC7B;AAED,2CAA2C;AAC3C,MAAM,WAAW,YAAY;IAC3B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;CACtC;AAED,eAAO,MAAM,cAAc,IAAI,CAAC"}
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AACH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAElD,wDAAwD;AACxD,MAAM,WAAW,eAAe;IAC9B,4FAA4F;IAC5F,OAAO,EAAE,aAAa,CAAC;IACvB,uEAAuE;IACvE,GAAG,EAAE,MAAM,CAAC;IACZ,2FAA2F;IAC3F,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,mEAAmE;IACnE,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB;IACpB,KAAK,EAAE,CAAC,EAAE,EAAE,MAAM,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CACtC;AAED;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,2CAA2C;IAC3C,KAAK,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,IAAI,EAAE,MAAM,CAAC;IACb,kEAAkE;IAClE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,yDAAyD;IACzD,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,4EAA4E;AAC5E,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,GAAG,EAAE,MAAM,CAAC;IACZ,gDAAgD;IAChD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,oFAAoF;AACpF,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;IACb,+EAA+E;IAC/E,SAAS,EAAE,OAAO,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,0EAA0E;AAC1E,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,2CAA2C;IAC3C,IAAI,EAAE,OAAO,CAAC;IACd,yDAAyD;IACzD,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,sEAAsE;IACtE,UAAU,EAAE,eAAe,EAAE,CAAC;IAC9B,qEAAqE;IACrE,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,QAAQ,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,QAAQ,EAAE,OAAO,EAAE,CAAC;IACpB,oEAAoE;IACpE,YAAY,EAAE,MAAM,CAAC;IACrB,oEAAoE;IACpE,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,kFAAkF;IAClF,mBAAmB,EAAE,MAAM,CAAC;IAC5B,gEAAgE;IAChE,UAAU,EAAE,OAAO,CAAC;IACpB,0CAA0C;IAC1C,MAAM,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC;QAChB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,kEAAkE;IAClE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qEAAqE;IACrE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,0DAA0D;IAC1D,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iGAAiG;IACjG,sBAAsB,CAAC,EAAE,OAAO,CAAC;IACjC,iEAAiE;IACjE,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;GAOG;AACH,MAAM,MAAM,QAAQ,GAAG,OAAO,GAAG,WAAW,CAAC;AAE7C,2DAA2D;AAC3D,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,QAAQ,CAAC;IACf,iEAAiE;IACjE,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,4CAA4C;IAC5C,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,gFAAgF;IAChF,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B;;;OAGG;IACH,GAAG,EAAE,CAAC,GAAG,EAAE,eAAe,KAAK,OAAO,CAAC,cAAc,CAAC,CAAC;CACxD;AAED,+DAA+D;AAC/D,MAAM,WAAW,SAAS;IACxB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,eAAe,EAAE,MAAM,CAAC;IACxB,cAAc,EAAE,MAAM,CAAC;IACvB,2EAA2E;IAC3E,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,8EAA8E;IAC9E,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,yCAAyC;AACzC,MAAM,WAAW,UAAU;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE;QACH,OAAO,EAAE,OAAO,CAAC;QACjB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,OAAO,EAAE,SAAS,CAAC;IACnB,SAAS,EAAE,cAAc,EAAE,CAAC;CAC7B;AAED,2CAA2C;AAC3C,MAAM,WAAW,YAAY;IAC3B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;CACtC;AAED,eAAO,MAAM,cAAc,IAAI,CAAC;AAOhC;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC5B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,kBAAkB,EAAE,MAAM,CAAC;IAC3B;;;OAGG;IACH,IAAI,EAAE,MAAM,CAAC;CACd;AAED,yCAAyC;AACzC,MAAM,WAAW,mBAAmB;IAClC,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,8FAA8F;IAC9F,OAAO,EAAE,MAAM,CAAC;IAChB,kEAAkE;IAClE,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,OAAO,CAAC;IACd,cAAc,EAAE,OAAO,CAAC;IACxB,WAAW,EAAE,OAAO,CAAC;IACrB,SAAS,EAAE,OAAO,CAAC;IACnB,kBAAkB,EAAE,OAAO,CAAC;IAC5B,kBAAkB,EAAE,OAAO,CAAC;IAC5B,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,6EAA6E;AAC7E,MAAM,WAAW,gBAAgB;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;CAClB"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agent-relay/evals",
|
|
3
|
-
"version": "8.8.
|
|
3
|
+
"version": "8.8.2",
|
|
4
4
|
"description": "Agent Relay eval harness — scenario runner, broker harness, and scoring utilities for testing relay-connected agents",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -46,6 +46,11 @@
|
|
|
46
46
|
"import": "./dist/scenarios/onboarding.js",
|
|
47
47
|
"default": "./dist/scenarios/onboarding.js"
|
|
48
48
|
},
|
|
49
|
+
"./scoring/mount": {
|
|
50
|
+
"types": "./dist/scoring/mount.d.ts",
|
|
51
|
+
"import": "./dist/scoring/mount.js",
|
|
52
|
+
"default": "./dist/scoring/mount.js"
|
|
53
|
+
},
|
|
49
54
|
"./scenarios/core": {
|
|
50
55
|
"types": "./dist/scenarios/index.d.ts",
|
|
51
56
|
"import": "./dist/scenarios/index.js",
|
|
@@ -63,7 +68,7 @@
|
|
|
63
68
|
"check": "tsc -p tsconfig.json --noEmit"
|
|
64
69
|
},
|
|
65
70
|
"dependencies": {
|
|
66
|
-
"@agent-relay/harness-driver": "8.8.
|
|
71
|
+
"@agent-relay/harness-driver": "8.8.2"
|
|
67
72
|
},
|
|
68
73
|
"publishConfig": {
|
|
69
74
|
"access": "public"
|