@alis-build/harness-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,292 @@
1
+ import { a as HarnessAdapter, d as ToolCall, f as TrajectoryView, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics } from "./types-B9H4IZtA.js";
2
+
3
+ //#region src/types/assertions.d.ts
4
+ /**
5
+ * A tool name pattern: either a literal tool name or a glob using `*` wildcards.
6
+ *
7
+ * The object form (`{ pattern: "..." }`) exists only for YAML disambiguation —
8
+ * in YAML, a bare string is the default. Internally the two forms are equivalent.
9
+ *
10
+ * @example
11
+ * "mcp__api__search_skills" // literal match
12
+ * "mcp__api__*" // any tool in the mcp__api namespace
13
+ * "mcp__*" // any MCP tool
14
+ * "*" // any tool at all
15
+ */
16
+ type ToolPattern = string | {
17
+ pattern: string; // literal name or `*` glob, same syntax as the bare-string form
18
+ };
19
+ /**
20
+ * Cardinality spec for `called` assertions (their optional `times` field).
21
+ *
22
+ * Format: `"<op> <n>"` with `op` ∈ {`==`, `!=`, `>=`, `<=`, `>`, `<`}.
23
+ * Default (when `times` is omitted): `">= 1"`.
24
+ *
25
+ * Parsed lazily by `src/assertions/cardinality.ts`.
26
+ */
27
+ type Cardinality = string; // plain string alias: the "<op> <n>" format is not checked by the compiler
28
+ /**
29
+ * Argument-matching predicate (the `args` field of a `called_with` assertion).
30
+ *
31
+ * The predicate language is recursive. Three flavours:
32
+ * - Leaf: `{ equals: "x" }`, `{ contains: "foo" }`, etc.
33
+ * - Compound: `{ all_of: [...] }`, `{ any_of: [...] }`, `{ not: ... }`.
34
+ * - Object: `{ field1: <predicate>, field2: <predicate>, ... }` — descend
35
+ * into object fields. Each field's value is a Predicate or a scalar shortcut (string, number, boolean, null).
36
+ *
37
+ * Disambiguation: a single-key object whose key matches a known leaf or
38
+ * compound operator is treated as a leaf/compound predicate. Otherwise it
39
+ * is treated as an object predicate (field name = key).
40
+ *
41
+ * Known limitation: if your tool's arg schema has a field literally named
42
+ * `equals`, `contains`, etc., you must wrap it: `{ equals: { equals: "x" } }`.
43
+ * In practice this never happens for MCP tools.
44
+ */
45
+ type Predicate = LeafPredicate | CompoundPredicate | ObjectPredicate;
46
+ type LeafPredicate = { // leaf operators: every union member carries exactly one operator key
47
+ equals: unknown; // operand may be any value
48
+ } | {
49
+ contains: string;
50
+ } | {
51
+ not_contains: string;
52
+ } | {
53
+ regex: string; // pattern supplied as a string; NOTE(review): regex dialect/flags are not visible here — confirm
54
+ } | {
55
+ gte: number; // gte / lte / gt / lt all take a numeric bound
56
+ } | {
57
+ lte: number;
58
+ } | {
59
+ gt: number;
60
+ } | {
61
+ lt: number;
62
+ } | {
63
+ one_of: unknown[]; // presumably matches when the value is any listed candidate — confirm against the evaluator
64
+ };
65
+ type CompoundPredicate = { // logical combinators; operands are full Predicates, so compounds can nest
66
+ any_of: Predicate[];
67
+ } | {
68
+ all_of: Predicate[];
69
+ } | {
70
+ not: Predicate; // wraps a single predicate rather than a list
71
+ };
72
+ /** Object-shaped predicate keyed by field name. Field values may be sub-predicates or scalar shortcuts (string, number, boolean, null). */
73
+ type ObjectPredicate = {
74
+ [field: string]: Predicate | string | number | boolean | null; // NOTE(review): how a scalar shortcut is compared is not visible here — confirm
75
+ };
76
+ /**
77
+ * The full assertion language, a union discriminated on `type`. Each variant is evaluated by a corresponding
78
+ * function in the `src/assertions/*.ts` modules.
79
+ *
80
+ * Grouped by concern for readability (variants are declared in this order):
81
+ * 1. Tool-call presence and ordering (`called` through `sequence`)
82
+ * 2. Tool-call argument matching (`called_with`)
83
+ * 3. Behavior (efficiency, finishing, blind-answering) (`responded_without_tool_calls` through `finished_with`)
84
+ * 4. Response text (`response_contains`, `response_not_contains`, `response_matches`)
85
+ * 5. Compound (logical operators) (`all_of`, `any_of`, `not`)
86
+ * 6. Escape hatch (arbitrary TypeScript predicate) (`predicate`)
87
+ */
88
+ type Assertion = {
89
+ type: "called";
90
+ tool: ToolPattern;
91
+ times?: Cardinality; // optional; the Cardinality docs give ">= 1" as the default when omitted
92
+ } | {
93
+ type: "not_called";
94
+ tool: ToolPattern;
95
+ } | {
96
+ type: "called_any_of";
97
+ tools: ToolPattern[];
98
+ } | {
99
+ type: "called_all_of";
100
+ tools: ToolPattern[];
101
+ } | {
102
+ type: "called_before";
103
+ first: ToolPattern;
104
+ then: ToolPattern;
105
+ } | {
106
+ type: "sequence";
107
+ tools: ToolPattern[];
108
+ strict?: boolean; // NOTE(review): meaning and default of `strict` are not visible here — confirm against the evaluator
109
+ } | {
110
+ type: "called_with";
111
+ tool: ToolPattern;
112
+ args: Predicate; // recursive argument predicate (leaf, compound or object form)
113
+ } | {
114
+ type: "responded_without_tool_calls";
115
+ } | {
116
+ type: "iterations_within";
117
+ max: number;
118
+ } | {
119
+ type: "cost_within_usd";
120
+ max: number;
121
+ } | {
122
+ type: "duration_within_ms";
123
+ max: number;
124
+ } | {
125
+ type: "finished_with";
126
+ reasons: string | string[]; // accepts a single reason or a list of reasons
127
+ } | {
128
+ type: "response_contains";
129
+ text: string;
130
+ } | {
131
+ type: "response_not_contains";
132
+ text: string;
133
+ } | {
134
+ type: "response_matches";
135
+ pattern: string;
136
+ flags?: string; // presumably regular-expression flags applied to `pattern` — confirm
137
+ } | {
138
+ type: "all_of";
139
+ assertions: Assertion[]; // nested assertions; the language is recursive
140
+ } | {
141
+ type: "any_of";
142
+ assertions: Assertion[];
143
+ } | {
144
+ type: "not";
145
+ assertion: Assertion; // singular: wraps exactly one assertion
146
+ } | {
147
+ type: "predicate";
148
+ fn: (view: TrajectoryView) => boolean; // user-supplied function over the trajectory view
149
+ description?: string;
150
+ };
151
+ /** An assertion paired with the pass-rate threshold it must meet across repetitions. */
152
+ interface ThresholdedAssertion {
153
+ assertion: Assertion; // the check whose pass rate is measured
154
+ /**
155
+ * Minimum pass rate across repetitions for this assertion to be considered
156
+ * passing. Range 0..1. Default 1.0 (strict — every rep must pass).
157
+ */
158
+ threshold?: number;
159
+ }
160
+ /**
161
+ * Result of evaluating a single assertion.
162
+ *
163
+ * `children` is populated for compound assertions (`all_of`/`any_of`/`not`) so the
164
+ * reporter can render a tree showing which leaf caused a failure. `matches`
165
+ * carries the tool calls that satisfied (or could have satisfied) the
166
+ * assertion — useful for diagnostic output.
167
+ */
168
+ interface AssertionResult {
169
+ passed: boolean;
170
+ /** Short human-readable name, e.g. `"called(mcp__api__search_skills, >= 1)"`. */
171
+ description: string;
172
+ /** Diagnostic detail. Always populated; explains the pass/fail. */
173
+ details: string;
174
+ /** Tool calls that satisfied the assertion (omitted when irrelevant). */
175
+ matches?: ToolCall[];
176
+ /** Sub-results for compound assertions. */
177
+ children?: AssertionResult[];
178
+ }
179
+ //#endregion
180
+ //#region src/runner/types.d.ts
181
+ interface TestCase { // one eval case: a prompt plus the assertions checked against the resulting run
182
+ id: string; // presumably the value surfaced as `caseId` in progress events and cell reports — confirm
183
+ prompt: string;
184
+ category?: string;
185
+ notes?: string;
186
+ assertions: ThresholdedAssertion[]; // each assertion carries its own optional pass-rate threshold
187
+ /** Natural-language outcome checks for LLM grading (see `harness-eval grade`). */
188
+ expectations?: string[];
189
+ /** Reference tool-call trajectory for metric computation. */
190
+ reference_trajectory?: Array<{
191
+ tool_name: string;
192
+ tool_input: unknown;
193
+ }>;
194
+ /** Human ratings keyed by metric name for judge calibration. */
195
+ human_ratings?: Record<string, number>;
196
+ repetitions?: number; // NOTE(review): default when omitted is not visible here — confirm with the runner
197
+ config?: SuiteConfig; // NOTE(review): precedence relative to suite/cell config is not visible here — confirm
198
+ }
199
+ interface MatrixCell { // one labelled configuration in a suite's matrix
200
+ label: string; // presumably the value surfaced as `cellLabel` in progress events — confirm
201
+ config: SuiteConfig;
202
+ axes?: Record<string, string>; // NOTE(review): looks like axis-name to axis-value pairs for this cell — confirm
203
+ }
204
+ interface TestSuite { // a set of cases together with the matrix of configurations
205
+ /** Harness adapter id. Default: `claude-code`. */
206
+ adapter?: string;
207
+ cases: TestCase[];
208
+ matrix: MatrixCell[]; // required, unlike `defaultConfig`
209
+ defaultConfig?: SuiteConfig;
210
+ }
211
+ interface RunSuiteOptions { // all fields are optional
212
+ /** Maximum concurrent harness processes across the entire suite. Default 4. */
213
+ maxConcurrent?: number;
214
+ /** Harness adapter to run. Defaults to the registry default (`claude-code`). */
215
+ adapter?: HarnessAdapter; // an adapter object, whereas `TestSuite.adapter` is a string id
216
+ onProgress?: ProgressCallback; // receives `ProgressEvent` values
217
+ signal?: AbortSignal; // presumably used to cancel an in-flight run — confirm how the runner reacts to abort
218
+ }
219
+ type ProgressCallback = (event: ProgressEvent) => void; // progress observer; typed as returning void, so no result is expected
220
+ type ProgressEvent = { // union discriminated on `kind`
221
+ kind: "suite-start";
222
+ totalReps: number;
223
+ } | {
224
+ kind: "rep-start";
225
+ caseId: string;
226
+ cellLabel: string;
227
+ repIndex: number; // NOTE(review): zero- vs one-based is not visible here — confirm
228
+ } | {
229
+ kind: "rep-complete";
230
+ caseId: string;
231
+ cellLabel: string;
232
+ repIndex: number;
233
+ ok: boolean;
234
+ durationMs: number;
235
+ toolCallCount?: number;
236
+ assertionResults?: AssertionResult[];
237
+ errorMessage?: string; // presumably set only when `ok` is false — confirm
238
+ } | {
239
+ kind: "cell-complete";
240
+ report: CellReport; // carries the full report for the finished cell
241
+ } | {
242
+ kind: "suite-complete";
243
+ report: SuiteReport; // carries the full report for the finished suite
244
+ };
245
+ interface RepetitionResult { // outcome of a single repetition
246
+ repetitionIndex: number;
247
+ adapterResult: AdapterResult | null; // nullable; presumably null when the adapter failed — confirm
248
+ error: RepetitionError | null; // nullable; presumably null when the repetition ran without error — confirm
249
+ assertionResults: AssertionResult[];
250
+ durationMs: number;
251
+ }
252
+ interface RepetitionError { // error payload stored in `RepetitionResult.error`
253
+ message: string;
254
+ diagnostics: Partial<AdapterDiagnostics>; // Partial: any subset of the diagnostics fields may be present
255
+ }
256
+ interface CellReport { // report for one test case run under one matrix cell
257
+ caseId: string;
258
+ category?: string;
259
+ notes?: string;
260
+ /** Eval prompt (copied for grading without re-loading the suite). */
261
+ prompt?: string;
262
+ /** Outcome expectations for LLM grading. */
263
+ expectations?: string[];
264
+ /** Reference tool-call trajectory for metric computation. */
265
+ reference_trajectory?: Array<{
266
+ tool_name: string;
267
+ tool_input: unknown;
268
+ }>;
269
+ /** Human ratings keyed by metric name for judge calibration. */
270
+ human_ratings?: Record<string, number>;
271
+ cell: MatrixCell;
272
+ repetitions: RepetitionResult[]; // per-repetition results (an array here, while `TestCase.repetitions` is a count)
273
+ assertionStats: AssertionStat[];
274
+ adapterErrors: number; // presumably the number of repetitions that ended in an adapter error — confirm
275
+ passed: boolean; // NOTE(review): exact pass criterion (e.g. every stat meets its threshold) is not visible here — confirm
276
+ }
277
+ interface AssertionStat { // aggregate statistics for one assertion
278
+ description: string;
279
+ threshold: number; // required here, whereas `ThresholdedAssertion.threshold` is optional
280
+ passedCount: number;
281
+ evaluatedCount: number;
282
+ passRate: number; // presumably passedCount / evaluatedCount — confirm, including the evaluatedCount === 0 case
283
+ meetsThreshold: boolean; // presumably passRate >= threshold — confirm
284
+ }
285
+ interface SuiteReport { // top-level result of a suite run
286
+ startedAt: string; // NOTE(review): timestamp format is not visible here (likely ISO 8601) — confirm
287
+ durationMs: number;
288
+ cells: CellReport[];
289
+ }
290
+ //#endregion
291
+ export { ObjectPredicate as _, ProgressEvent as a, ToolPattern as b, RunSuiteOptions as c, TestSuite as d, Assertion as f, LeafPredicate as g, CompoundPredicate as h, ProgressCallback as i, SuiteReport as l, Cardinality as m, CellReport as n, RepetitionError as o, AssertionResult as p, MatrixCell as r, RepetitionResult as s, AssertionStat as t, TestCase as u, Predicate as v, ThresholdedAssertion as y };
292
+ //# sourceMappingURL=types-BQol062t.d.ts.map
package/package.json ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "name": "@alis-build/harness-eval",
3
+ "version": "0.1.0",
4
+ "description": "Harness-level eval framework for measuring AI coding agent tool-selection behavior",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "types": "./dist/index.d.ts",
8
+ "author": "www.alisx.com",
9
+ "license": "Apache-2.0",
10
+ "engines": {
11
+ "node": ">=22.12.0"
12
+ },
13
+ "repository": {
14
+ "type": "git",
15
+ "url": "git+https://github.com/alis-build/harness-eval-ts.git"
16
+ },
17
+ "homepage": "https://github.com/alis-build/harness-eval-ts#readme",
18
+ "bugs": {
19
+ "url": "https://github.com/alis-build/harness-eval-ts/issues"
20
+ },
21
+ "exports": {
22
+ ".": {
23
+ "types": "./dist/index.d.ts",
24
+ "import": "./dist/index.js"
25
+ },
26
+ "./adapters/claude-code": {
27
+ "types": "./dist/adapters/claude-code/index.d.ts",
28
+ "import": "./dist/adapters/claude-code/index.js"
29
+ },
30
+ "./runner": {
31
+ "types": "./dist/runner/suite.d.ts",
32
+ "import": "./dist/runner/suite.js"
33
+ },
34
+ "./config": {
35
+ "types": "./dist/config/loader.d.ts",
36
+ "import": "./dist/config/loader.js"
37
+ }
38
+ },
39
+ "files": [
40
+ "dist",
41
+ "schemas",
42
+ "README.md",
43
+ "LICENSE"
44
+ ],
45
+ "scripts": {
46
+ "generate-schemas": "tsx src/schemas/generate.ts",
47
+ "build": "pnpm run generate-schemas && tsdown",
48
+ "prepack": "pnpm run build",
49
+ "prepublishOnly": "pnpm run build",
50
+ "watch": "tsdown --watch",
51
+ "clean": "rm -rf dist",
52
+ "test": "vitest run",
53
+ "test:watch": "vitest",
54
+ "typecheck": "tsc --noEmit"
55
+ },
56
+ "bin": {
57
+ "harness-eval": "./dist/cli/bin.js"
58
+ },
59
+ "dependencies": {
60
+ "yaml": "^2.6.0",
61
+ "zod": "^4.4.3"
62
+ },
63
+ "devDependencies": {
64
+ "@types/node": "^22.12.0",
65
+ "tsdown": "^0.22.3",
66
+ "tsx": "^4.22.4",
67
+ "typescript": "^5.6.0",
68
+ "vitest": "^2.1.0"
69
+ },
70
+ "publishConfig": {
71
+ "access": "public"
72
+ },
73
+ "packageManager": "pnpm@11.3.0"
74
+ }
@@ -0,0 +1,322 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json#AgentTrace",
4
+ "title": "AgentTrace",
5
+ "description": "Full multi-turn agent execution trace.",
6
+ "type": "object",
7
+ "properties": {
8
+ "agents": {
9
+ "$ref": "#/$defs/__schema0"
10
+ },
11
+ "turns": {
12
+ "$ref": "#/$defs/__schema13"
13
+ }
14
+ },
15
+ "required": [
16
+ "agents",
17
+ "turns"
18
+ ],
19
+ "additionalProperties": false,
20
+ "$defs": {
21
+ "__schema0": {
22
+ "type": "object",
23
+ "propertyNames": {
24
+ "$ref": "#/$defs/__schema1"
25
+ },
26
+ "additionalProperties": {
27
+ "$ref": "#/$defs/AgentConfig"
28
+ },
29
+ "description": "Agent configurations keyed by agent id."
30
+ },
31
+ "__schema1": {
32
+ "type": "string"
33
+ },
34
+ "AgentConfig": {
35
+ "type": "object",
36
+ "properties": {
37
+ "agent_id": {
38
+ "$ref": "#/$defs/__schema2"
39
+ },
40
+ "agent_type": {
41
+ "$ref": "#/$defs/__schema3"
42
+ },
43
+ "description": {
44
+ "$ref": "#/$defs/__schema5"
45
+ },
46
+ "instruction": {
47
+ "$ref": "#/$defs/__schema7"
48
+ },
49
+ "tools": {
50
+ "$ref": "#/$defs/__schema9"
51
+ },
52
+ "sub_agents": {
53
+ "$ref": "#/$defs/__schema11"
54
+ }
55
+ },
56
+ "required": [
57
+ "agent_id"
58
+ ],
59
+ "additionalProperties": false,
60
+ "title": "AgentConfig",
61
+ "description": "Static configuration for one agent in a trace."
62
+ },
63
+ "__schema2": {
64
+ "type": "string",
65
+ "description": "Stable agent identifier."
66
+ },
67
+ "__schema3": {
68
+ "description": "Agent type or role.",
69
+ "$ref": "#/$defs/__schema4"
70
+ },
71
+ "__schema4": {
72
+ "type": "string"
73
+ },
74
+ "__schema5": {
75
+ "description": "Human-readable agent description.",
76
+ "$ref": "#/$defs/__schema6"
77
+ },
78
+ "__schema6": {
79
+ "type": "string"
80
+ },
81
+ "__schema7": {
82
+ "description": "System instruction for the agent.",
83
+ "$ref": "#/$defs/__schema8"
84
+ },
85
+ "__schema8": {
86
+ "type": "string"
87
+ },
88
+ "__schema9": {
89
+ "description": "Tools available to this agent.",
90
+ "$ref": "#/$defs/__schema10"
91
+ },
92
+ "__schema10": {
93
+ "type": "array",
94
+ "items": {
95
+ "type": "object",
96
+ "properties": {
97
+ "name": {
98
+ "type": "string",
99
+ "description": "Tool name."
100
+ }
101
+ },
102
+ "required": [
103
+ "name"
104
+ ],
105
+ "additionalProperties": false
106
+ }
107
+ },
108
+ "__schema11": {
109
+ "description": "Sub-agent identifiers when using multi-agent setups.",
110
+ "$ref": "#/$defs/__schema12"
111
+ },
112
+ "__schema12": {
113
+ "type": "array",
114
+ "items": {
115
+ "type": "string"
116
+ }
117
+ },
118
+ "__schema13": {
119
+ "type": "array",
120
+ "items": {
121
+ "$ref": "#/$defs/ConversationTurn"
122
+ },
123
+ "description": "Chronological conversation turns."
124
+ },
125
+ "ConversationTurn": {
126
+ "type": "object",
127
+ "properties": {
128
+ "turn_index": {
129
+ "$ref": "#/$defs/__schema14"
130
+ },
131
+ "turn_id": {
132
+ "$ref": "#/$defs/__schema15"
133
+ },
134
+ "events": {
135
+ "$ref": "#/$defs/__schema17"
136
+ }
137
+ },
138
+ "required": [
139
+ "turn_index",
140
+ "events"
141
+ ],
142
+ "additionalProperties": false,
143
+ "title": "ConversationTurn",
144
+ "description": "One turn in a multi-turn agent conversation."
145
+ },
146
+ "__schema14": {
147
+ "type": "integer",
148
+ "minimum": -9007199254740991,
149
+ "maximum": 9007199254740991,
150
+ "description": "Zero-based turn index."
151
+ },
152
+ "__schema15": {
153
+ "description": "Optional stable turn identifier.",
154
+ "$ref": "#/$defs/__schema16"
155
+ },
156
+ "__schema16": {
157
+ "type": "string"
158
+ },
159
+ "__schema17": {
160
+ "type": "array",
161
+ "items": {
162
+ "$ref": "#/$defs/AgentEvent"
163
+ },
164
+ "description": "Events in chronological order."
165
+ },
166
+ "AgentEvent": {
167
+ "type": "object",
168
+ "properties": {
169
+ "author": {
170
+ "$ref": "#/$defs/__schema18"
171
+ },
172
+ "content": {
173
+ "$ref": "#/$defs/__schema19"
174
+ },
175
+ "event_time": {
176
+ "$ref": "#/$defs/__schema27"
177
+ },
178
+ "state_delta": {
179
+ "$ref": "#/$defs/__schema29"
180
+ },
181
+ "active_tools": {
182
+ "$ref": "#/$defs/__schema31"
183
+ }
184
+ },
185
+ "required": [
186
+ "author",
187
+ "content"
188
+ ],
189
+ "additionalProperties": false,
190
+ "title": "AgentEvent",
191
+ "description": "One event in a multi-turn agent trace."
192
+ },
193
+ "__schema18": {
194
+ "type": "string",
195
+ "description": "Agent id or user identifier for this event."
196
+ },
197
+ "__schema19": {
198
+ "type": "object",
199
+ "properties": {
200
+ "parts": {
201
+ "$ref": "#/$defs/__schema20"
202
+ }
203
+ },
204
+ "required": [
205
+ "parts"
206
+ ],
207
+ "additionalProperties": false,
208
+ "description": "Structured event content."
209
+ },
210
+ "__schema20": {
211
+ "type": "array",
212
+ "items": {
213
+ "$ref": "#/$defs/ContentPart"
214
+ },
215
+ "description": "Content parts for this event."
216
+ },
217
+ "ContentPart": {
218
+ "type": "object",
219
+ "properties": {
220
+ "text": {
221
+ "$ref": "#/$defs/__schema21"
222
+ },
223
+ "function_call": {
224
+ "$ref": "#/$defs/__schema23"
225
+ },
226
+ "function_response": {
227
+ "$ref": "#/$defs/__schema25"
228
+ }
229
+ },
230
+ "additionalProperties": false,
231
+ "title": "ContentPart",
232
+ "description": "One part of agent event content (text, function_call, or function_response)."
233
+ },
234
+ "__schema21": {
235
+ "description": "Plain text content.",
236
+ "$ref": "#/$defs/__schema22"
237
+ },
238
+ "__schema22": {
239
+ "type": "string"
240
+ },
241
+ "__schema23": {
242
+ "description": "Function call emitted by the agent.",
243
+ "$ref": "#/$defs/__schema24"
244
+ },
245
+ "__schema24": {
246
+ "type": "object",
247
+ "properties": {
248
+ "name": {
249
+ "type": "string",
250
+ "description": "Function or tool name."
251
+ },
252
+ "args": {
253
+ "description": "Function arguments."
254
+ }
255
+ },
256
+ "required": [
257
+ "name",
258
+ "args"
259
+ ],
260
+ "additionalProperties": false
261
+ },
262
+ "__schema25": {
263
+ "description": "Function response from tool execution.",
264
+ "$ref": "#/$defs/__schema26"
265
+ },
266
+ "__schema26": {
267
+ "type": "object",
268
+ "properties": {
269
+ "name": {
270
+ "type": "string",
271
+ "description": "Function or tool name."
272
+ },
273
+ "response": {
274
+ "description": "Function result payload."
275
+ }
276
+ },
277
+ "required": [
278
+ "name",
279
+ "response"
280
+ ],
281
+ "additionalProperties": false
282
+ },
283
+ "__schema27": {
284
+ "description": "ISO 8601 timestamp when the event occurred.",
285
+ "$ref": "#/$defs/__schema28"
286
+ },
287
+ "__schema28": {
288
+ "type": "string"
289
+ },
290
+ "__schema29": {
291
+ "description": "Session state changes associated with this event.",
292
+ "$ref": "#/$defs/__schema30"
293
+ },
294
+ "__schema30": {
295
+ "type": "object",
296
+ "propertyNames": {
297
+ "type": "string"
298
+ },
299
+ "additionalProperties": {}
300
+ },
301
+ "__schema31": {
302
+ "description": "Tools available to the agent at event time.",
303
+ "$ref": "#/$defs/__schema32"
304
+ },
305
+ "__schema32": {
306
+ "type": "array",
307
+ "items": {
308
+ "type": "object",
309
+ "properties": {
310
+ "name": {
311
+ "type": "string",
312
+ "description": "Tool name."
313
+ }
314
+ },
315
+ "required": [
316
+ "name"
317
+ ],
318
+ "additionalProperties": false
319
+ }
320
+ }
321
+ }
322
+ }