@alis-build/harness-eval 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -4
- package/dist/adapters/claude-code/index.d.ts +1 -1
- package/dist/adapters/claude-code/index.js +1 -1
- package/dist/{claude-code-ycT0JQZF.js → claude-code-DZ4Vkgp6.js} +35 -6
- package/dist/{claude-code-ycT0JQZF.js.map → claude-code-DZ4Vkgp6.js.map} +1 -1
- package/dist/cli/bin.js +109 -12
- package/dist/cli/bin.js.map +1 -1
- package/dist/config/loader.d.ts +1 -1
- package/dist/config/loader.js +1 -1
- package/dist/{index-6Z17eKZx.d.ts → index-V22PrR0p.d.ts} +2 -1
- package/dist/index.d.ts +270 -152
- package/dist/index.js +124 -5
- package/dist/index.js.map +1 -0
- package/dist/{loader-DTvoVfN0.d.ts → loader-C9yQHUPC.d.ts} +19 -2
- package/dist/{loader-BCnFJ8rm.js → loader-DcI0KfRX.js} +291 -4
- package/dist/loader-DcI0KfRX.js.map +1 -0
- package/dist/{build-DsVJ_UeU.js → projections-BcX7w-f6.js} +486 -243
- package/dist/projections-BcX7w-f6.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BoOvK_lq.d.ts → suite-DPJMIEbu.d.ts} +7 -2
- package/dist/{suite-chj0j22j.js → suite-Dlzl-HI0.js} +58 -4
- package/dist/suite-Dlzl-HI0.js.map +1 -0
- package/dist/{types-BQol062t.d.ts → types-CD3TwOtZ.d.ts} +151 -10
- package/package.json +4 -2
- package/schemas/eval-interchange-instances.schema.json +196 -0
- package/schemas/eval-interchange.schema.json +65 -52
- package/schemas/eval-run-envelope.schema.json +182 -425
- package/dist/build-DsVJ_UeU.js.map +0 -1
- package/dist/loader-BCnFJ8rm.js.map +0 -1
- package/dist/suite-chj0j22j.js.map +0 -1
- package/schemas/eval-interchange-agent-trace.schema.json +0 -322
- package/schemas/eval-interchange-proto-instance.schema.json +0 -106
|
@@ -1,322 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
-
"$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json#AgentTrace",
|
|
4
|
-
"title": "AgentTrace",
|
|
5
|
-
"description": "Full multi-turn agent execution trace.",
|
|
6
|
-
"type": "object",
|
|
7
|
-
"properties": {
|
|
8
|
-
"agents": {
|
|
9
|
-
"$ref": "#/$defs/__schema0"
|
|
10
|
-
},
|
|
11
|
-
"turns": {
|
|
12
|
-
"$ref": "#/$defs/__schema13"
|
|
13
|
-
}
|
|
14
|
-
},
|
|
15
|
-
"required": [
|
|
16
|
-
"agents",
|
|
17
|
-
"turns"
|
|
18
|
-
],
|
|
19
|
-
"additionalProperties": false,
|
|
20
|
-
"$defs": {
|
|
21
|
-
"__schema0": {
|
|
22
|
-
"type": "object",
|
|
23
|
-
"propertyNames": {
|
|
24
|
-
"$ref": "#/$defs/__schema1"
|
|
25
|
-
},
|
|
26
|
-
"additionalProperties": {
|
|
27
|
-
"$ref": "#/$defs/AgentConfig"
|
|
28
|
-
},
|
|
29
|
-
"description": "Agent configurations keyed by agent id."
|
|
30
|
-
},
|
|
31
|
-
"__schema1": {
|
|
32
|
-
"type": "string"
|
|
33
|
-
},
|
|
34
|
-
"AgentConfig": {
|
|
35
|
-
"type": "object",
|
|
36
|
-
"properties": {
|
|
37
|
-
"agent_id": {
|
|
38
|
-
"$ref": "#/$defs/__schema2"
|
|
39
|
-
},
|
|
40
|
-
"agent_type": {
|
|
41
|
-
"$ref": "#/$defs/__schema3"
|
|
42
|
-
},
|
|
43
|
-
"description": {
|
|
44
|
-
"$ref": "#/$defs/__schema5"
|
|
45
|
-
},
|
|
46
|
-
"instruction": {
|
|
47
|
-
"$ref": "#/$defs/__schema7"
|
|
48
|
-
},
|
|
49
|
-
"tools": {
|
|
50
|
-
"$ref": "#/$defs/__schema9"
|
|
51
|
-
},
|
|
52
|
-
"sub_agents": {
|
|
53
|
-
"$ref": "#/$defs/__schema11"
|
|
54
|
-
}
|
|
55
|
-
},
|
|
56
|
-
"required": [
|
|
57
|
-
"agent_id"
|
|
58
|
-
],
|
|
59
|
-
"additionalProperties": false,
|
|
60
|
-
"title": "AgentConfig",
|
|
61
|
-
"description": "Static configuration for one agent in a trace."
|
|
62
|
-
},
|
|
63
|
-
"__schema2": {
|
|
64
|
-
"type": "string",
|
|
65
|
-
"description": "Stable agent identifier."
|
|
66
|
-
},
|
|
67
|
-
"__schema3": {
|
|
68
|
-
"description": "Agent type or role.",
|
|
69
|
-
"$ref": "#/$defs/__schema4"
|
|
70
|
-
},
|
|
71
|
-
"__schema4": {
|
|
72
|
-
"type": "string"
|
|
73
|
-
},
|
|
74
|
-
"__schema5": {
|
|
75
|
-
"description": "Human-readable agent description.",
|
|
76
|
-
"$ref": "#/$defs/__schema6"
|
|
77
|
-
},
|
|
78
|
-
"__schema6": {
|
|
79
|
-
"type": "string"
|
|
80
|
-
},
|
|
81
|
-
"__schema7": {
|
|
82
|
-
"description": "System instruction for the agent.",
|
|
83
|
-
"$ref": "#/$defs/__schema8"
|
|
84
|
-
},
|
|
85
|
-
"__schema8": {
|
|
86
|
-
"type": "string"
|
|
87
|
-
},
|
|
88
|
-
"__schema9": {
|
|
89
|
-
"description": "Tools available to this agent.",
|
|
90
|
-
"$ref": "#/$defs/__schema10"
|
|
91
|
-
},
|
|
92
|
-
"__schema10": {
|
|
93
|
-
"type": "array",
|
|
94
|
-
"items": {
|
|
95
|
-
"type": "object",
|
|
96
|
-
"properties": {
|
|
97
|
-
"name": {
|
|
98
|
-
"type": "string",
|
|
99
|
-
"description": "Tool name."
|
|
100
|
-
}
|
|
101
|
-
},
|
|
102
|
-
"required": [
|
|
103
|
-
"name"
|
|
104
|
-
],
|
|
105
|
-
"additionalProperties": false
|
|
106
|
-
}
|
|
107
|
-
},
|
|
108
|
-
"__schema11": {
|
|
109
|
-
"description": "Sub-agent identifiers when using multi-agent setups.",
|
|
110
|
-
"$ref": "#/$defs/__schema12"
|
|
111
|
-
},
|
|
112
|
-
"__schema12": {
|
|
113
|
-
"type": "array",
|
|
114
|
-
"items": {
|
|
115
|
-
"type": "string"
|
|
116
|
-
}
|
|
117
|
-
},
|
|
118
|
-
"__schema13": {
|
|
119
|
-
"type": "array",
|
|
120
|
-
"items": {
|
|
121
|
-
"$ref": "#/$defs/ConversationTurn"
|
|
122
|
-
},
|
|
123
|
-
"description": "Chronological conversation turns."
|
|
124
|
-
},
|
|
125
|
-
"ConversationTurn": {
|
|
126
|
-
"type": "object",
|
|
127
|
-
"properties": {
|
|
128
|
-
"turn_index": {
|
|
129
|
-
"$ref": "#/$defs/__schema14"
|
|
130
|
-
},
|
|
131
|
-
"turn_id": {
|
|
132
|
-
"$ref": "#/$defs/__schema15"
|
|
133
|
-
},
|
|
134
|
-
"events": {
|
|
135
|
-
"$ref": "#/$defs/__schema17"
|
|
136
|
-
}
|
|
137
|
-
},
|
|
138
|
-
"required": [
|
|
139
|
-
"turn_index",
|
|
140
|
-
"events"
|
|
141
|
-
],
|
|
142
|
-
"additionalProperties": false,
|
|
143
|
-
"title": "ConversationTurn",
|
|
144
|
-
"description": "One turn in a multi-turn agent conversation."
|
|
145
|
-
},
|
|
146
|
-
"__schema14": {
|
|
147
|
-
"type": "integer",
|
|
148
|
-
"minimum": -9007199254740991,
|
|
149
|
-
"maximum": 9007199254740991,
|
|
150
|
-
"description": "Zero-based turn index."
|
|
151
|
-
},
|
|
152
|
-
"__schema15": {
|
|
153
|
-
"description": "Optional stable turn identifier.",
|
|
154
|
-
"$ref": "#/$defs/__schema16"
|
|
155
|
-
},
|
|
156
|
-
"__schema16": {
|
|
157
|
-
"type": "string"
|
|
158
|
-
},
|
|
159
|
-
"__schema17": {
|
|
160
|
-
"type": "array",
|
|
161
|
-
"items": {
|
|
162
|
-
"$ref": "#/$defs/AgentEvent"
|
|
163
|
-
},
|
|
164
|
-
"description": "Events in chronological order."
|
|
165
|
-
},
|
|
166
|
-
"AgentEvent": {
|
|
167
|
-
"type": "object",
|
|
168
|
-
"properties": {
|
|
169
|
-
"author": {
|
|
170
|
-
"$ref": "#/$defs/__schema18"
|
|
171
|
-
},
|
|
172
|
-
"content": {
|
|
173
|
-
"$ref": "#/$defs/__schema19"
|
|
174
|
-
},
|
|
175
|
-
"event_time": {
|
|
176
|
-
"$ref": "#/$defs/__schema27"
|
|
177
|
-
},
|
|
178
|
-
"state_delta": {
|
|
179
|
-
"$ref": "#/$defs/__schema29"
|
|
180
|
-
},
|
|
181
|
-
"active_tools": {
|
|
182
|
-
"$ref": "#/$defs/__schema31"
|
|
183
|
-
}
|
|
184
|
-
},
|
|
185
|
-
"required": [
|
|
186
|
-
"author",
|
|
187
|
-
"content"
|
|
188
|
-
],
|
|
189
|
-
"additionalProperties": false,
|
|
190
|
-
"title": "AgentEvent",
|
|
191
|
-
"description": "One event in a multi-turn agent trace."
|
|
192
|
-
},
|
|
193
|
-
"__schema18": {
|
|
194
|
-
"type": "string",
|
|
195
|
-
"description": "Agent id or user identifier for this event."
|
|
196
|
-
},
|
|
197
|
-
"__schema19": {
|
|
198
|
-
"type": "object",
|
|
199
|
-
"properties": {
|
|
200
|
-
"parts": {
|
|
201
|
-
"$ref": "#/$defs/__schema20"
|
|
202
|
-
}
|
|
203
|
-
},
|
|
204
|
-
"required": [
|
|
205
|
-
"parts"
|
|
206
|
-
],
|
|
207
|
-
"additionalProperties": false,
|
|
208
|
-
"description": "Structured event content."
|
|
209
|
-
},
|
|
210
|
-
"__schema20": {
|
|
211
|
-
"type": "array",
|
|
212
|
-
"items": {
|
|
213
|
-
"$ref": "#/$defs/ContentPart"
|
|
214
|
-
},
|
|
215
|
-
"description": "Content parts for this event."
|
|
216
|
-
},
|
|
217
|
-
"ContentPart": {
|
|
218
|
-
"type": "object",
|
|
219
|
-
"properties": {
|
|
220
|
-
"text": {
|
|
221
|
-
"$ref": "#/$defs/__schema21"
|
|
222
|
-
},
|
|
223
|
-
"function_call": {
|
|
224
|
-
"$ref": "#/$defs/__schema23"
|
|
225
|
-
},
|
|
226
|
-
"function_response": {
|
|
227
|
-
"$ref": "#/$defs/__schema25"
|
|
228
|
-
}
|
|
229
|
-
},
|
|
230
|
-
"additionalProperties": false,
|
|
231
|
-
"title": "ContentPart",
|
|
232
|
-
"description": "One part of agent event content (text, function_call, or function_response)."
|
|
233
|
-
},
|
|
234
|
-
"__schema21": {
|
|
235
|
-
"description": "Plain text content.",
|
|
236
|
-
"$ref": "#/$defs/__schema22"
|
|
237
|
-
},
|
|
238
|
-
"__schema22": {
|
|
239
|
-
"type": "string"
|
|
240
|
-
},
|
|
241
|
-
"__schema23": {
|
|
242
|
-
"description": "Function call emitted by the agent.",
|
|
243
|
-
"$ref": "#/$defs/__schema24"
|
|
244
|
-
},
|
|
245
|
-
"__schema24": {
|
|
246
|
-
"type": "object",
|
|
247
|
-
"properties": {
|
|
248
|
-
"name": {
|
|
249
|
-
"type": "string",
|
|
250
|
-
"description": "Function or tool name."
|
|
251
|
-
},
|
|
252
|
-
"args": {
|
|
253
|
-
"description": "Function arguments."
|
|
254
|
-
}
|
|
255
|
-
},
|
|
256
|
-
"required": [
|
|
257
|
-
"name",
|
|
258
|
-
"args"
|
|
259
|
-
],
|
|
260
|
-
"additionalProperties": false
|
|
261
|
-
},
|
|
262
|
-
"__schema25": {
|
|
263
|
-
"description": "Function response from tool execution.",
|
|
264
|
-
"$ref": "#/$defs/__schema26"
|
|
265
|
-
},
|
|
266
|
-
"__schema26": {
|
|
267
|
-
"type": "object",
|
|
268
|
-
"properties": {
|
|
269
|
-
"name": {
|
|
270
|
-
"type": "string",
|
|
271
|
-
"description": "Function or tool name."
|
|
272
|
-
},
|
|
273
|
-
"response": {
|
|
274
|
-
"description": "Function result payload."
|
|
275
|
-
}
|
|
276
|
-
},
|
|
277
|
-
"required": [
|
|
278
|
-
"name",
|
|
279
|
-
"response"
|
|
280
|
-
],
|
|
281
|
-
"additionalProperties": false
|
|
282
|
-
},
|
|
283
|
-
"__schema27": {
|
|
284
|
-
"description": "ISO 8601 timestamp when the event occurred.",
|
|
285
|
-
"$ref": "#/$defs/__schema28"
|
|
286
|
-
},
|
|
287
|
-
"__schema28": {
|
|
288
|
-
"type": "string"
|
|
289
|
-
},
|
|
290
|
-
"__schema29": {
|
|
291
|
-
"description": "Session state changes associated with this event.",
|
|
292
|
-
"$ref": "#/$defs/__schema30"
|
|
293
|
-
},
|
|
294
|
-
"__schema30": {
|
|
295
|
-
"type": "object",
|
|
296
|
-
"propertyNames": {
|
|
297
|
-
"type": "string"
|
|
298
|
-
},
|
|
299
|
-
"additionalProperties": {}
|
|
300
|
-
},
|
|
301
|
-
"__schema31": {
|
|
302
|
-
"description": "Tools available to the agent at event time.",
|
|
303
|
-
"$ref": "#/$defs/__schema32"
|
|
304
|
-
},
|
|
305
|
-
"__schema32": {
|
|
306
|
-
"type": "array",
|
|
307
|
-
"items": {
|
|
308
|
-
"type": "object",
|
|
309
|
-
"properties": {
|
|
310
|
-
"name": {
|
|
311
|
-
"type": "string",
|
|
312
|
-
"description": "Tool name."
|
|
313
|
-
}
|
|
314
|
-
},
|
|
315
|
-
"required": [
|
|
316
|
-
"name"
|
|
317
|
-
],
|
|
318
|
-
"additionalProperties": false
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
}
|
|
@@ -1,106 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
-
"$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-interchange.schema.json#ProtoTrajectoryInstance",
|
|
4
|
-
"title": "ProtoTrajectoryInstance",
|
|
5
|
-
"description": "Proto-compatible evaluation instance with JSON-string tool_input.",
|
|
6
|
-
"type": "object",
|
|
7
|
-
"properties": {
|
|
8
|
-
"predicted_trajectory": {
|
|
9
|
-
"$ref": "#/$defs/__schema0"
|
|
10
|
-
},
|
|
11
|
-
"reference_trajectory": {
|
|
12
|
-
"$ref": "#/$defs/__schema4"
|
|
13
|
-
},
|
|
14
|
-
"prompt": {
|
|
15
|
-
"$ref": "#/$defs/__schema5"
|
|
16
|
-
},
|
|
17
|
-
"response": {
|
|
18
|
-
"$ref": "#/$defs/__schema7"
|
|
19
|
-
},
|
|
20
|
-
"reference": {
|
|
21
|
-
"$ref": "#/$defs/__schema9"
|
|
22
|
-
}
|
|
23
|
-
},
|
|
24
|
-
"required": [
|
|
25
|
-
"predicted_trajectory"
|
|
26
|
-
],
|
|
27
|
-
"additionalProperties": false,
|
|
28
|
-
"$defs": {
|
|
29
|
-
"__schema0": {
|
|
30
|
-
"description": "Predicted trajectory in wire format.",
|
|
31
|
-
"$ref": "#/$defs/InterchangeTrajectory"
|
|
32
|
-
},
|
|
33
|
-
"__schema1": {
|
|
34
|
-
"type": "array",
|
|
35
|
-
"items": {
|
|
36
|
-
"$ref": "#/$defs/InterchangeToolCall"
|
|
37
|
-
},
|
|
38
|
-
"description": "Ordered tool calls in the trajectory."
|
|
39
|
-
},
|
|
40
|
-
"InterchangeToolCall": {
|
|
41
|
-
"type": "object",
|
|
42
|
-
"properties": {
|
|
43
|
-
"tool_name": {
|
|
44
|
-
"$ref": "#/$defs/__schema2"
|
|
45
|
-
},
|
|
46
|
-
"tool_input": {
|
|
47
|
-
"$ref": "#/$defs/__schema3"
|
|
48
|
-
}
|
|
49
|
-
},
|
|
50
|
-
"required": [
|
|
51
|
-
"tool_name",
|
|
52
|
-
"tool_input"
|
|
53
|
-
],
|
|
54
|
-
"additionalProperties": false,
|
|
55
|
-
"title": "InterchangeToolCall",
|
|
56
|
-
"description": "Tool call in interchange wire format."
|
|
57
|
-
},
|
|
58
|
-
"__schema2": {
|
|
59
|
-
"type": "string",
|
|
60
|
-
"description": "Tool name as emitted by the agent."
|
|
61
|
-
},
|
|
62
|
-
"__schema3": {
|
|
63
|
-
"type": "string",
|
|
64
|
-
"description": "JSON-serialized tool arguments (wire format)."
|
|
65
|
-
},
|
|
66
|
-
"InterchangeTrajectory": {
|
|
67
|
-
"type": "object",
|
|
68
|
-
"properties": {
|
|
69
|
-
"tool_calls": {
|
|
70
|
-
"$ref": "#/$defs/__schema1"
|
|
71
|
-
}
|
|
72
|
-
},
|
|
73
|
-
"required": [
|
|
74
|
-
"tool_calls"
|
|
75
|
-
],
|
|
76
|
-
"additionalProperties": false,
|
|
77
|
-
"title": "InterchangeTrajectory",
|
|
78
|
-
"description": "Ordered sequence of tool calls."
|
|
79
|
-
},
|
|
80
|
-
"__schema4": {
|
|
81
|
-
"description": "Reference trajectory in wire format.",
|
|
82
|
-
"$ref": "#/$defs/InterchangeTrajectory"
|
|
83
|
-
},
|
|
84
|
-
"__schema5": {
|
|
85
|
-
"description": "Eval prompt.",
|
|
86
|
-
"$ref": "#/$defs/__schema6"
|
|
87
|
-
},
|
|
88
|
-
"__schema6": {
|
|
89
|
-
"type": "string"
|
|
90
|
-
},
|
|
91
|
-
"__schema7": {
|
|
92
|
-
"description": "Final response.",
|
|
93
|
-
"$ref": "#/$defs/__schema8"
|
|
94
|
-
},
|
|
95
|
-
"__schema8": {
|
|
96
|
-
"type": "string"
|
|
97
|
-
},
|
|
98
|
-
"__schema9": {
|
|
99
|
-
"description": "Reference answer text.",
|
|
100
|
-
"$ref": "#/$defs/__schema10"
|
|
101
|
-
},
|
|
102
|
-
"__schema10": {
|
|
103
|
-
"type": "string"
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
}
|