@agent-relay/evals 8.3.1 → 8.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/harness.d.ts ADDED
@@ -0,0 +1,31 @@
1
+ /**
2
+ * BrokerHarness type surface for @agent-relay/evals.
3
+ *
4
+ * The full BrokerHarness class implementation lives in
5
+ * tests/integration/broker/utils/broker-harness.ts pending migration into
6
+ * this package (see specs/agent-relay-evals-package.md). This module exports
7
+ * the interface that eval scenarios depend on so downstream consumers can
8
+ * type-check against it without importing the test-only implementation.
9
+ */
10
+ import type { BrokerEvent, ListAgent, RuntimeSpawnOptions, SendMessageInput } from '@agent-relay/harness-driver';
11
+ export interface EventWaiter {
12
+ promise: Promise<BrokerEvent>;
13
+ cancel: () => void;
14
+ }
15
+ export interface BrokerHarness {
16
+ spawnAgent(name: string, cli: string, channels: string[], options?: Partial<RuntimeSpawnOptions>): Promise<{
17
+ name: string;
18
+ }>;
19
+ releaseAgent(name: string): Promise<{
20
+ name: string;
21
+ }>;
22
+ sendMessage(input: SendMessageInput): Promise<{
23
+ event_id: string;
24
+ targets: string[];
25
+ }>;
26
+ listAgents(): Promise<ListAgent[]>;
27
+ getEvents(): BrokerEvent[];
28
+ clearEvents(): void;
29
+ waitForEvent(kind: string, timeoutMs?: number, predicate?: (event: BrokerEvent) => boolean): EventWaiter;
30
+ }
31
+ //# sourceMappingURL=harness.d.ts.map
package/dist/harness.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"harness.d.ts","sourceRoot":"","sources":["../src/harness.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AACH,OAAO,KAAK,EACV,WAAW,EACX,SAAS,EACT,mBAAmB,EACnB,gBAAgB,EACjB,MAAM,6BAA6B,CAAC;AAErC,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAC9B,MAAM,EAAE,MAAM,IAAI,CAAC;CACpB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,CACR,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,MAAM,EAAE,EAClB,OAAO,CAAC,EAAE,OAAO,CAAC,mBAAmB,CAAC,GACrC,OAAO,CAAC;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAE7B,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAEtD,WAAW,CAAC,KAAK,EAAE,gBAAgB,GAAG,OAAO,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,EAAE,CAAA;KAAE,CAAC,CAAC;IAEvF,UAAU,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC;IAEnC,SAAS,IAAI,WAAW,EAAE,CAAC;IAE3B,WAAW,IAAI,IAAI,CAAC;IAEpB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,CAAC,KAAK,EAAE,WAAW,KAAK,OAAO,GAAG,WAAW,CAAC;CAC1G"}
package/dist/harness.js ADDED
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=harness.js.map
package/dist/harness.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"harness.js","sourceRoot":"","sources":["../src/harness.ts"],"names":[],"mappings":""}
package/dist/index.d.ts ADDED
@@ -0,0 +1,15 @@
1
+ /**
2
+ * @agent-relay/evals
3
+ *
4
+ * Agent Relay eval harness — scenario runner, broker harness, and scoring
5
+ * utilities for testing relay-connected agents across CLI harnesses.
6
+ *
7
+ * The source of truth for scenario implementations and the runner currently
8
+ * lives in tests/integration/broker/evals/. This package exposes the stable
9
+ * public surface that downstream consumers (pear, agent-assistant, etc.) depend
10
+ * on. The full migration into this package is tracked in
11
+ * specs/agent-relay-evals-package.md.
12
+ */
13
+ export type { EvalScenario, ScenarioContext, ScenarioResult, EvalTier, MetricSet, EvalReport, MatrixReport, AgentInfo, TranscriptEntry, Phantom, } from './types.js';
14
+ export { SCHEMA_VERSION } from './types.js';
15
+ //# sourceMappingURL=index.d.ts.map
package/dist/index.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,YAAY,EACV,YAAY,EACZ,eAAe,EACf,cAAc,EACd,QAAQ,EACR,SAAS,EACT,UAAU,EACV,YAAY,EACZ,SAAS,EACT,eAAe,EACf,OAAO,GACR,MAAM,YAAY,CAAC;AAEpB,OAAO,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,14 @@
1
+ /**
2
+ * @agent-relay/evals
3
+ *
4
+ * Agent Relay eval harness — scenario runner, broker harness, and scoring
5
+ * utilities for testing relay-connected agents across CLI harnesses.
6
+ *
7
+ * The source of truth for scenario implementations and the runner currently
8
+ * lives in tests/integration/broker/evals/. This package exposes the stable
9
+ * public surface that downstream consumers (pear, agent-assistant, etc.) depend
10
+ * on. The full migration into this package is tracked in
11
+ * specs/agent-relay-evals-package.md.
12
+ */
13
+ export { SCHEMA_VERSION } from './types.js';
14
+ //# sourceMappingURL=index.js.map
package/dist/index.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAeH,OAAO,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC"}
package/dist/types.d.ts ADDED
@@ -0,0 +1,161 @@
1
+ /**
2
+ * Type definitions for the agent messaging eval harness.
3
+ *
4
+ * The eval suite spawns real agent CLIs, drives them through coordination
5
+ * scenarios, and scores — purely from broker events — whether agents actually
6
+ * used the messaging tools (MCP/CLI) versus emitting plain-text "phantom"
7
+ * messages.
8
+ */
9
+ import type { BrokerHarness } from './harness.js';
10
+ /** Context handed to each scenario's `run` function. */
11
+ export interface ScenarioContext {
12
+ /** A started broker harness. The scenario owns its agents but not the harness lifecycle. */
13
+ harness: BrokerHarness;
14
+ /** The CLI/harness under test (e.g. "claude", "codex", "opencode"). */
15
+ cli: string;
16
+ /** Optional model override to pass to the agent CLI (e.g. "claude-haiku-4-5-20251001"). */
17
+ model?: string;
18
+ /** Unique suffix for isolating agent/channel names across runs. */
19
+ suffix: string;
20
+ /** Sleep helper. */
21
+ sleep: (ms: number) => Promise<void>;
22
+ }
23
+ /**
24
+ * A single phantom message: forward-looking intent to communicate expressed in
25
+ * plain text that was never backed by an actual `relay_inbound` send.
26
+ */
27
+ export interface Phantom {
28
+ /** The agent that expressed the intent. */
29
+ agent: string;
30
+ /** The matched verb (e.g. "tell", "post", "send"). */
31
+ verb: string;
32
+ /** The parsed target, if the regex captured one (e.g. "Lead"). */
33
+ target?: string;
34
+ /** A short snippet of surrounding text for debugging. */
35
+ snippet: string;
36
+ }
37
+ /** An agent under test in a scenario, with the task prompt it was given. */
38
+ export interface AgentInfo {
39
+ name: string;
40
+ cli: string;
41
+ /** Optional role label (e.g. "relay hop A"). */
42
+ role?: string;
43
+ /** The task prompt the agent was spawned with. */
44
+ prompt: string;
45
+ }
46
+ /** One message in a scenario's conversation, derived from a relay_inbound event. */
47
+ export interface TranscriptEntry {
48
+ from: string;
49
+ target: string;
50
+ body: string;
51
+ /** True if `from` is one of the scenario's agents under test (a real send). */
52
+ fromAgent: boolean;
53
+ threadId?: string;
54
+ }
55
+ /** Raw, scenario-specific signal counts derived from the event stream. */
56
+ export interface ScenarioResult {
57
+ id: string;
58
+ title: string;
59
+ /** Overall pass/fail for this scenario. */
60
+ pass: boolean;
61
+ /** Agents under test and the prompts they were given. */
62
+ agents: AgentInfo[];
63
+ /** The full message transcript (stimulus + agent sends), in order. */
64
+ transcript: TranscriptEntry[];
65
+ /** Number of messages the agent(s) actually sent (relay_inbound). */
66
+ sent: number;
67
+ /** Number of sends the scenario expected. */
68
+ expected: number;
69
+ /** Phantom messages detected (intent without a backing send). */
70
+ phantoms: Phantom[];
71
+ /** Total forward-looking intents detected (satisfied + phantom). */
72
+ totalIntents: number;
73
+ /** Protocol adherence score in [0,1], or null if not applicable. */
74
+ protocolAdherence: number | null;
75
+ /** Replies that targeted the wrong channel / a DM when a channel was expected. */
76
+ wrongChannelReplies: number;
77
+ /** True if no delivery_dropped / acl_denied events occurred. */
78
+ deliveryOk: boolean;
79
+ /** Coarse event counts for the report. */
80
+ events: {
81
+ relayInbound: number;
82
+ dropped: number;
83
+ aclDenied: number;
84
+ };
85
+ /** Lifecycle: number of confirmed add_agent calls in this run. */
86
+ spawnCount?: number;
87
+ /** Lifecycle: number of confirmed remove_agent calls in this run. */
88
+ releaseCount?: number;
89
+ /** Onboarding variant used (lifecycle scenarios only). */
90
+ onboarding?: string;
91
+ /** True when the agent used Claude's native Task tool instead of mcp__agent-relay__add_agent. */
92
+ nativeSubagentDetected?: boolean;
93
+ /** Optional human-readable notes (e.g. partial-chain detail). */
94
+ notes?: string;
95
+ }
96
+ /**
97
+ * Eval tiers:
98
+ * - `smoke`: leading prompts that name the exact tool. A plumbing canary — proves
99
+ * the broker→MCP→agent→scoring path works; not a measure of protocol retention.
100
+ * - `realistic`: natural-language prompts where messaging is incidental to real
101
+ * work and the protocol must come from the injected onboarding (skill + broker
102
+ * hints) — what production agents actually get. This is the real benchmark.
103
+ */
104
+ export type EvalTier = 'smoke' | 'realistic';
105
+ /** A scenario the runner can execute against a harness. */
106
+ export interface EvalScenario {
107
+ id: string;
108
+ title: string;
109
+ tier: EvalTier;
110
+ /** Channels the broker should subscribe to for this scenario. */
111
+ channels: string[];
112
+ /** If set, only run for these harnesses. */
113
+ harnessFilter?: string[];
114
+ /** Overall test timeout in ms. */
115
+ timeoutMs: number;
116
+ /** Onboarding variant (lifecycle scenarios only — used for report grouping). */
117
+ onboardingVariant?: string;
118
+ /**
119
+ * Orchestrate the scenario end-to-end: spawn agents, inject the stimulus,
120
+ * wait for responses, and score the captured events into a ScenarioResult.
121
+ */
122
+ run: (ctx: ScenarioContext) => Promise<ScenarioResult>;
123
+ }
124
+ /** Aggregated metrics for one harness across all scenarios. */
125
+ export interface MetricSet {
126
+ messageSentRate: number;
127
+ phantomRate: number;
128
+ phantomCount: number;
129
+ protocolAdherence: number;
130
+ deliverySuccessRate: number;
131
+ wrongChannelReplies: number;
132
+ scenariosPassed: number;
133
+ scenariosTotal: number;
134
+ /** Lifecycle: fraction of s01/s03 scenarios where add_agent was called. */
135
+ spawnRate?: number;
136
+ /** Lifecycle: fraction of s02/s03 scenarios where remove_agent was called. */
137
+ releaseRate?: number;
138
+ }
139
+ /** A full report for one harness run. */
140
+ export interface EvalReport {
141
+ schemaVersion: number;
142
+ startedAt: string;
143
+ durationMs: number;
144
+ harness: string;
145
+ gitSha: string;
146
+ env: {
147
+ realCli: boolean;
148
+ repeat: number;
149
+ };
150
+ metrics: MetricSet;
151
+ scenarios: ScenarioResult[];
152
+ }
153
+ /** The matrix roll-up across harnesses. */
154
+ export interface MatrixReport {
155
+ schemaVersion: number;
156
+ startedAt: string;
157
+ gitSha: string;
158
+ harnesses: Record<string, MetricSet>;
159
+ }
160
+ export declare const SCHEMA_VERSION = 1;
161
+ //# sourceMappingURL=types.d.ts.map
package/dist/types.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AACH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAElD,wDAAwD;AACxD,MAAM,WAAW,eAAe;IAC9B,4FAA4F;IAC5F,OAAO,EAAE,aAAa,CAAC;IACvB,uEAAuE;IACvE,GAAG,EAAE,MAAM,CAAC;IACZ,2FAA2F;IAC3F,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,mEAAmE;IACnE,MAAM,EAAE,MAAM,CAAC;IACf,oBAAoB;IACpB,KAAK,EAAE,CAAC,EAAE,EAAE,MAAM,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;CACtC;AAED;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,2CAA2C;IAC3C,KAAK,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,IAAI,EAAE,MAAM,CAAC;IACb,kEAAkE;IAClE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,yDAAyD;IACzD,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,4EAA4E;AAC5E,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,GAAG,EAAE,MAAM,CAAC;IACZ,gDAAgD;IAChD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,kDAAkD;IAClD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,oFAAoF;AACpF,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;IACb,+EAA+E;IAC/E,SAAS,EAAE,OAAO,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,0EAA0E;AAC1E,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,2CAA2C;IAC3C,IAAI,EAAE,OAAO,CAAC;IACd,yDAAyD;IACzD,MAAM,EAAE,SAAS,EAAE,CAAC;IACpB,sEAAsE;IACtE,UAAU,EAAE,eAAe,EAAE,CAAC;IAC9B,qEAAqE;IACrE,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,QAAQ,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,QAAQ,EAAE,OAAO,EAAE,CAAC;IACpB,oEAAoE;IACpE,YAAY,EAAE,MAAM,CAAC;IACrB,oEAAoE;IACpE,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,kFAAkF;IAClF,mBAAmB,EAAE,MAAM,CAAC;IAC5B,gEAAgE;IAChE,UAAU,EAAE,OAAO,CAAC;IACpB,0CAA0C;IAC1C,MAAM,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC;QAChB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,kEAAkE;IAClE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qEAAqE;IACrE,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,0DAA0D;IAC1D,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iGAAiG;IACjG,sBAAsB,CAAC,EAAE,OAAO,CAAC;IACjC,iEAAiE;IACjE,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;GAOG;AACH,MAAM,MAAM,QAAQ,GAAG,OAAO,GAAG,WAAW,CAAC;AAE7C,2DAA2D;AAC3D,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,Q
AAQ,CAAC;IACf,iEAAiE;IACjE,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,4CAA4C;IAC5C,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,gFAAgF;IAChF,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B;;;OAGG;IACH,GAAG,EAAE,CAAC,GAAG,EAAE,eAAe,KAAK,OAAO,CAAC,cAAc,CAAC,CAAC;CACxD;AAED,+DAA+D;AAC/D,MAAM,WAAW,SAAS;IACxB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,eAAe,EAAE,MAAM,CAAC;IACxB,cAAc,EAAE,MAAM,CAAC;IACvB,2EAA2E;IAC3E,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,8EAA8E;IAC9E,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,yCAAyC;AACzC,MAAM,WAAW,UAAU;IACzB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE;QACH,OAAO,EAAE,OAAO,CAAC;QACjB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,OAAO,EAAE,SAAS,CAAC;IACnB,SAAS,EAAE,cAAc,EAAE,CAAC;CAC7B;AAED,2CAA2C;AAC3C,MAAM,WAAW,YAAY;IAC3B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;CACtC;AAED,eAAO,MAAM,cAAc,IAAI,CAAC"}
package/dist/types.js ADDED
@@ -0,0 +1,2 @@
1
+ export const SCHEMA_VERSION = 1;
2
+ //# sourceMappingURL=types.js.map
package/dist/types.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AA0KA,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agent-relay/evals",
3
- "version": "8.3.1",
3
+ "version": "8.8.0",
4
4
  "description": "Agent Relay eval harness — scenario runner, broker harness, and scoring utilities for testing relay-connected agents",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -63,7 +63,7 @@
63
63
  "check": "tsc -p tsconfig.json --noEmit"
64
64
  },
65
65
  "dependencies": {
66
- "@agent-relay/harness-driver": "8.3.1"
66
+ "@agent-relay/harness-driver": "8.8.0"
67
67
  },
68
68
  "publishConfig": {
69
69
  "access": "public"