@tangle-network/agent-eval 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +212 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
- package/dist/chunk-EDUKQ5AM.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-JLZQWFV3.js +618 -0
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +2018 -3003
- package/dist/index.js +7443 -9102
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +345 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-BNgMdqPF.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +369 -25
- package/dist/wire/index.js +22 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
package/dist/wire/index.d.ts
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
|
+
import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-DfFdrraJ.js';
|
|
2
|
+
import { T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
1
3
|
import { z } from 'zod';
|
|
2
4
|
import { OpenAPIObject } from 'openapi3-ts/oas31';
|
|
3
5
|
import * as hono_types from 'hono/types';
|
|
4
6
|
import { ServerType } from '@hono/node-server';
|
|
5
7
|
import { Hono } from 'hono';
|
|
8
|
+
import '../control-runtime-BuJHoLg0.js';
|
|
9
|
+
import '../emitter-DP_cSSiw.js';
|
|
10
|
+
import '../dataset-CiK_3LDr.js';
|
|
11
|
+
import '../errors-BZ9sTdz7.js';
|
|
6
12
|
|
|
7
13
|
declare const RubricDimensionSchema: z.ZodObject<{
|
|
8
14
|
id: z.ZodString;
|
|
@@ -105,6 +111,287 @@ declare const HealthResponseSchema: z.ZodObject<{
|
|
|
105
111
|
status: z.ZodLiteral<"ok">;
|
|
106
112
|
uptimeSec: z.ZodNumber;
|
|
107
113
|
}, z.core.$strip>;
|
|
114
|
+
/**
|
|
115
|
+
* Minimal `TraceEvent` shape that the production runtime emits.
|
|
116
|
+
* Matches `trace/schema.ts` `TraceEvent` but is duplicated here as a
|
|
117
|
+
* wire schema so non-TypeScript clients can validate without depending
|
|
118
|
+
* on internal types.
|
|
119
|
+
*/
|
|
120
|
+
declare const TraceEventSchema: z.ZodObject<{
|
|
121
|
+
eventId: z.ZodString;
|
|
122
|
+
runId: z.ZodString;
|
|
123
|
+
spanId: z.ZodOptional<z.ZodString>;
|
|
124
|
+
kind: z.ZodEnum<{
|
|
125
|
+
policy_violation: "policy_violation";
|
|
126
|
+
custom: "custom";
|
|
127
|
+
error: "error";
|
|
128
|
+
log: "log";
|
|
129
|
+
budget_decrement: "budget_decrement";
|
|
130
|
+
budget_breach: "budget_breach";
|
|
131
|
+
state_mutation: "state_mutation";
|
|
132
|
+
redaction_applied: "redaction_applied";
|
|
133
|
+
}>;
|
|
134
|
+
timestamp: z.ZodNumber;
|
|
135
|
+
payload: z.ZodRecord<z.ZodString, z.ZodUnknown>;
|
|
136
|
+
}, z.core.$strip>;
|
|
137
|
+
declare const TracesIngestRequestSchema: z.ZodObject<{
|
|
138
|
+
events: z.ZodArray<z.ZodObject<{
|
|
139
|
+
eventId: z.ZodString;
|
|
140
|
+
runId: z.ZodString;
|
|
141
|
+
spanId: z.ZodOptional<z.ZodString>;
|
|
142
|
+
kind: z.ZodEnum<{
|
|
143
|
+
policy_violation: "policy_violation";
|
|
144
|
+
custom: "custom";
|
|
145
|
+
error: "error";
|
|
146
|
+
log: "log";
|
|
147
|
+
budget_decrement: "budget_decrement";
|
|
148
|
+
budget_breach: "budget_breach";
|
|
149
|
+
state_mutation: "state_mutation";
|
|
150
|
+
redaction_applied: "redaction_applied";
|
|
151
|
+
}>;
|
|
152
|
+
timestamp: z.ZodNumber;
|
|
153
|
+
payload: z.ZodRecord<z.ZodString, z.ZodUnknown>;
|
|
154
|
+
}, z.core.$strip>>;
|
|
155
|
+
}, z.core.$strip>;
|
|
156
|
+
declare const TracesIngestResponseSchema: z.ZodObject<{
|
|
157
|
+
accepted: z.ZodNumber;
|
|
158
|
+
rejected: z.ZodNumber;
|
|
159
|
+
errors: z.ZodDefault<z.ZodArray<z.ZodObject<{
|
|
160
|
+
eventId: z.ZodString;
|
|
161
|
+
message: z.ZodString;
|
|
162
|
+
}, z.core.$strip>>>;
|
|
163
|
+
}, z.core.$strip>;
|
|
164
|
+
declare const FeedbackLabelSchema: z.ZodObject<{
|
|
165
|
+
id: z.ZodOptional<z.ZodString>;
|
|
166
|
+
source: z.ZodEnum<{
|
|
167
|
+
judge: "judge";
|
|
168
|
+
system: "system";
|
|
169
|
+
user: "user";
|
|
170
|
+
policy: "policy";
|
|
171
|
+
environment: "environment";
|
|
172
|
+
metric: "metric";
|
|
173
|
+
}>;
|
|
174
|
+
kind: z.ZodEnum<{
|
|
175
|
+
approve: "approve";
|
|
176
|
+
reject: "reject";
|
|
177
|
+
select: "select";
|
|
178
|
+
edit: "edit";
|
|
179
|
+
rank: "rank";
|
|
180
|
+
rate: "rate";
|
|
181
|
+
comment: "comment";
|
|
182
|
+
metric_outcome: "metric_outcome";
|
|
183
|
+
policy_block: "policy_block";
|
|
184
|
+
revision_request: "revision_request";
|
|
185
|
+
}>;
|
|
186
|
+
value: z.ZodUnknown;
|
|
187
|
+
reason: z.ZodOptional<z.ZodString>;
|
|
188
|
+
severity: z.ZodOptional<z.ZodEnum<{
|
|
189
|
+
error: "error";
|
|
190
|
+
info: "info";
|
|
191
|
+
warning: "warning";
|
|
192
|
+
critical: "critical";
|
|
193
|
+
}>>;
|
|
194
|
+
createdAt: z.ZodString;
|
|
195
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
196
|
+
}, z.core.$strip>;
|
|
197
|
+
declare const FeedbackAttemptSchema: z.ZodObject<{
|
|
198
|
+
id: z.ZodString;
|
|
199
|
+
stepIndex: z.ZodNumber;
|
|
200
|
+
artifactType: z.ZodEnum<{
|
|
201
|
+
action: "action";
|
|
202
|
+
decision: "decision";
|
|
203
|
+
text: "text";
|
|
204
|
+
code: "code";
|
|
205
|
+
plan: "plan";
|
|
206
|
+
research: "research";
|
|
207
|
+
ui: "ui";
|
|
208
|
+
data: "data";
|
|
209
|
+
other: "other";
|
|
210
|
+
}>;
|
|
211
|
+
artifact: z.ZodUnknown;
|
|
212
|
+
options: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
|
|
213
|
+
proposedAction: z.ZodOptional<z.ZodObject<{
|
|
214
|
+
type: z.ZodString;
|
|
215
|
+
risk: z.ZodOptional<z.ZodEnum<{
|
|
216
|
+
medium: "medium";
|
|
217
|
+
low: "low";
|
|
218
|
+
high: "high";
|
|
219
|
+
}>>;
|
|
220
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
221
|
+
externalSideEffect: z.ZodOptional<z.ZodBoolean>;
|
|
222
|
+
requiresApproval: z.ZodOptional<z.ZodBoolean>;
|
|
223
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
224
|
+
}, z.core.$strip>>;
|
|
225
|
+
feedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
226
|
+
id: z.ZodOptional<z.ZodString>;
|
|
227
|
+
source: z.ZodEnum<{
|
|
228
|
+
judge: "judge";
|
|
229
|
+
system: "system";
|
|
230
|
+
user: "user";
|
|
231
|
+
policy: "policy";
|
|
232
|
+
environment: "environment";
|
|
233
|
+
metric: "metric";
|
|
234
|
+
}>;
|
|
235
|
+
kind: z.ZodEnum<{
|
|
236
|
+
approve: "approve";
|
|
237
|
+
reject: "reject";
|
|
238
|
+
select: "select";
|
|
239
|
+
edit: "edit";
|
|
240
|
+
rank: "rank";
|
|
241
|
+
rate: "rate";
|
|
242
|
+
comment: "comment";
|
|
243
|
+
metric_outcome: "metric_outcome";
|
|
244
|
+
policy_block: "policy_block";
|
|
245
|
+
revision_request: "revision_request";
|
|
246
|
+
}>;
|
|
247
|
+
value: z.ZodUnknown;
|
|
248
|
+
reason: z.ZodOptional<z.ZodString>;
|
|
249
|
+
severity: z.ZodOptional<z.ZodEnum<{
|
|
250
|
+
error: "error";
|
|
251
|
+
info: "info";
|
|
252
|
+
warning: "warning";
|
|
253
|
+
critical: "critical";
|
|
254
|
+
}>>;
|
|
255
|
+
createdAt: z.ZodString;
|
|
256
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
257
|
+
}, z.core.$strip>>>;
|
|
258
|
+
createdAt: z.ZodString;
|
|
259
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
260
|
+
}, z.core.$strip>;
|
|
261
|
+
declare const FeedbackTrajectorySchema: z.ZodObject<{
|
|
262
|
+
id: z.ZodString;
|
|
263
|
+
projectId: z.ZodOptional<z.ZodString>;
|
|
264
|
+
scenarioId: z.ZodOptional<z.ZodString>;
|
|
265
|
+
task: z.ZodObject<{
|
|
266
|
+
intent: z.ZodString;
|
|
267
|
+
context: z.ZodOptional<z.ZodUnknown>;
|
|
268
|
+
}, z.core.$strip>;
|
|
269
|
+
attempts: z.ZodDefault<z.ZodArray<z.ZodObject<{
|
|
270
|
+
id: z.ZodString;
|
|
271
|
+
stepIndex: z.ZodNumber;
|
|
272
|
+
artifactType: z.ZodEnum<{
|
|
273
|
+
action: "action";
|
|
274
|
+
decision: "decision";
|
|
275
|
+
text: "text";
|
|
276
|
+
code: "code";
|
|
277
|
+
plan: "plan";
|
|
278
|
+
research: "research";
|
|
279
|
+
ui: "ui";
|
|
280
|
+
data: "data";
|
|
281
|
+
other: "other";
|
|
282
|
+
}>;
|
|
283
|
+
artifact: z.ZodUnknown;
|
|
284
|
+
options: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
|
|
285
|
+
proposedAction: z.ZodOptional<z.ZodObject<{
|
|
286
|
+
type: z.ZodString;
|
|
287
|
+
risk: z.ZodOptional<z.ZodEnum<{
|
|
288
|
+
medium: "medium";
|
|
289
|
+
low: "low";
|
|
290
|
+
high: "high";
|
|
291
|
+
}>>;
|
|
292
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
293
|
+
externalSideEffect: z.ZodOptional<z.ZodBoolean>;
|
|
294
|
+
requiresApproval: z.ZodOptional<z.ZodBoolean>;
|
|
295
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
296
|
+
}, z.core.$strip>>;
|
|
297
|
+
feedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
298
|
+
id: z.ZodOptional<z.ZodString>;
|
|
299
|
+
source: z.ZodEnum<{
|
|
300
|
+
judge: "judge";
|
|
301
|
+
system: "system";
|
|
302
|
+
user: "user";
|
|
303
|
+
policy: "policy";
|
|
304
|
+
environment: "environment";
|
|
305
|
+
metric: "metric";
|
|
306
|
+
}>;
|
|
307
|
+
kind: z.ZodEnum<{
|
|
308
|
+
approve: "approve";
|
|
309
|
+
reject: "reject";
|
|
310
|
+
select: "select";
|
|
311
|
+
edit: "edit";
|
|
312
|
+
rank: "rank";
|
|
313
|
+
rate: "rate";
|
|
314
|
+
comment: "comment";
|
|
315
|
+
metric_outcome: "metric_outcome";
|
|
316
|
+
policy_block: "policy_block";
|
|
317
|
+
revision_request: "revision_request";
|
|
318
|
+
}>;
|
|
319
|
+
value: z.ZodUnknown;
|
|
320
|
+
reason: z.ZodOptional<z.ZodString>;
|
|
321
|
+
severity: z.ZodOptional<z.ZodEnum<{
|
|
322
|
+
error: "error";
|
|
323
|
+
info: "info";
|
|
324
|
+
warning: "warning";
|
|
325
|
+
critical: "critical";
|
|
326
|
+
}>>;
|
|
327
|
+
createdAt: z.ZodString;
|
|
328
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
329
|
+
}, z.core.$strip>>>;
|
|
330
|
+
createdAt: z.ZodString;
|
|
331
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
332
|
+
}, z.core.$strip>>>;
|
|
333
|
+
labels: z.ZodDefault<z.ZodArray<z.ZodObject<{
|
|
334
|
+
id: z.ZodOptional<z.ZodString>;
|
|
335
|
+
source: z.ZodEnum<{
|
|
336
|
+
judge: "judge";
|
|
337
|
+
system: "system";
|
|
338
|
+
user: "user";
|
|
339
|
+
policy: "policy";
|
|
340
|
+
environment: "environment";
|
|
341
|
+
metric: "metric";
|
|
342
|
+
}>;
|
|
343
|
+
kind: z.ZodEnum<{
|
|
344
|
+
approve: "approve";
|
|
345
|
+
reject: "reject";
|
|
346
|
+
select: "select";
|
|
347
|
+
edit: "edit";
|
|
348
|
+
rank: "rank";
|
|
349
|
+
rate: "rate";
|
|
350
|
+
comment: "comment";
|
|
351
|
+
metric_outcome: "metric_outcome";
|
|
352
|
+
policy_block: "policy_block";
|
|
353
|
+
revision_request: "revision_request";
|
|
354
|
+
}>;
|
|
355
|
+
value: z.ZodUnknown;
|
|
356
|
+
reason: z.ZodOptional<z.ZodString>;
|
|
357
|
+
severity: z.ZodOptional<z.ZodEnum<{
|
|
358
|
+
error: "error";
|
|
359
|
+
info: "info";
|
|
360
|
+
warning: "warning";
|
|
361
|
+
critical: "critical";
|
|
362
|
+
}>>;
|
|
363
|
+
createdAt: z.ZodString;
|
|
364
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
365
|
+
}, z.core.$strip>>>;
|
|
366
|
+
outcome: z.ZodOptional<z.ZodObject<{
|
|
367
|
+
success: z.ZodOptional<z.ZodBoolean>;
|
|
368
|
+
score: z.ZodOptional<z.ZodNumber>;
|
|
369
|
+
metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
370
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
371
|
+
detail: z.ZodOptional<z.ZodString>;
|
|
372
|
+
observedAt: z.ZodOptional<z.ZodString>;
|
|
373
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
374
|
+
}, z.core.$strip>>;
|
|
375
|
+
split: z.ZodOptional<z.ZodEnum<{
|
|
376
|
+
train: "train";
|
|
377
|
+
dev: "dev";
|
|
378
|
+
test: "test";
|
|
379
|
+
holdout: "holdout";
|
|
380
|
+
}>>;
|
|
381
|
+
tags: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
382
|
+
createdAt: z.ZodString;
|
|
383
|
+
updatedAt: z.ZodOptional<z.ZodString>;
|
|
384
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
385
|
+
}, z.core.$strip>;
|
|
386
|
+
declare const FeedbackIngestResponseSchema: z.ZodObject<{
|
|
387
|
+
id: z.ZodString;
|
|
388
|
+
persisted: z.ZodBoolean;
|
|
389
|
+
}, z.core.$strip>;
|
|
390
|
+
type TraceEvent = z.infer<typeof TraceEventSchema>;
|
|
391
|
+
type TracesIngestRequest = z.infer<typeof TracesIngestRequestSchema>;
|
|
392
|
+
type TracesIngestResponse = z.infer<typeof TracesIngestResponseSchema>;
|
|
393
|
+
type FeedbackTrajectory = z.infer<typeof FeedbackTrajectorySchema>;
|
|
394
|
+
type FeedbackIngestResponse = z.infer<typeof FeedbackIngestResponseSchema>;
|
|
108
395
|
declare const ErrorResponseSchema: z.ZodObject<{
|
|
109
396
|
error: z.ZodObject<{
|
|
110
397
|
code: z.ZodString;
|
|
@@ -132,6 +419,18 @@ declare const WIRE_VERSION = "1.0.0";
|
|
|
132
419
|
*/
|
|
133
420
|
declare function hashRubric(rubric: Rubric): string;
|
|
134
421
|
|
|
422
|
+
/**
|
|
423
|
+
* Pure handler functions — the "business logic" behind every wire-protocol
|
|
424
|
+
* method. The HTTP server (`server.ts`) and the stdio RPC (`rpc.ts`) both
|
|
425
|
+
* call these. Tests call these directly without spinning a server.
|
|
426
|
+
*
|
|
427
|
+
* Each handler:
|
|
428
|
+
* - Takes a parsed request (already Zod-validated by the transport).
|
|
429
|
+
* - Returns a result that matches the response schema.
|
|
430
|
+
* - Throws `WireError` for caller-fixable errors (404, 400, 422).
|
|
431
|
+
* - Lets unexpected errors bubble — the transport maps them to 500.
|
|
432
|
+
*/
|
|
433
|
+
|
|
135
434
|
/** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */
|
|
136
435
|
declare class WireError extends Error {
|
|
137
436
|
readonly code: string;
|
|
@@ -142,6 +441,57 @@ declare class WireError extends Error {
|
|
|
142
441
|
declare function handleJudge(req: JudgeRequest): Promise<JudgeResult>;
|
|
143
442
|
declare function handleListRubrics(): ListRubricsResponse;
|
|
144
443
|
declare function handleVersion(): VersionResponse;
|
|
444
|
+
/**
|
|
445
|
+
* Pluggable stores the wire layer routes ingestion writes into. Both
|
|
446
|
+
* are optional — when omitted, the corresponding endpoint returns 503.
|
|
447
|
+
*
|
|
448
|
+
* Production deployments wire a `FileSystemTraceStore` and
|
|
449
|
+
* `FileSystemFeedbackTrajectoryStore` here. Tests substitute in-memory
|
|
450
|
+
* stores.
|
|
451
|
+
*/
|
|
452
|
+
interface IngestionStores {
|
|
453
|
+
traceStore?: TraceStore;
|
|
454
|
+
feedbackStore?: FeedbackTrajectoryStore;
|
|
455
|
+
}
|
|
456
|
+
/**
|
|
457
|
+
* `POST /v1/traces/ingest` — accept a batch of `TraceEvent`s from the
|
|
458
|
+
* production runtime. Best-effort: each event is appended independently;
|
|
459
|
+
* one bad event does not poison the batch.
|
|
460
|
+
*
|
|
461
|
+
* Idempotency: the underlying store is append-only; consumers retrying
|
|
462
|
+
* the same payload will get duplicate events. Consumers should
|
|
463
|
+
* de-duplicate by `eventId` downstream — production traces frequently
|
|
464
|
+
* land via at-least-once buses (Kafka, SQS) where dedup is unavoidable.
|
|
465
|
+
*/
|
|
466
|
+
declare function handleTracesIngest(req: TracesIngestRequest, stores: IngestionStores): Promise<TracesIngestResponse>;
|
|
467
|
+
/**
|
|
468
|
+
* `POST /v1/feedback` — accept a single `FeedbackTrajectory` from the
|
|
469
|
+
* production runtime. Idempotent on `id`: re-posting the same trajectory
|
|
470
|
+
* replaces the prior record.
|
|
471
|
+
*/
|
|
472
|
+
declare function handleFeedbackIngest(req: FeedbackTrajectory, stores: IngestionStores): Promise<FeedbackIngestResponse>;
|
|
473
|
+
|
|
474
|
+
declare function buildOpenApi(packageVersion: string): OpenAPIObject;
|
|
475
|
+
|
|
476
|
+
interface RpcRequest {
|
|
477
|
+
method: 'judge' | 'listRubrics' | 'version';
|
|
478
|
+
params?: unknown;
|
|
479
|
+
}
|
|
480
|
+
interface RpcSuccess {
|
|
481
|
+
result: unknown;
|
|
482
|
+
}
|
|
483
|
+
interface RpcError {
|
|
484
|
+
error: {
|
|
485
|
+
code: string;
|
|
486
|
+
message: string;
|
|
487
|
+
details?: unknown;
|
|
488
|
+
};
|
|
489
|
+
}
|
|
490
|
+
declare function dispatchRpc(req: RpcRequest): Promise<RpcSuccess | RpcError>;
|
|
491
|
+
/** Read one JSON request from stdin, write one JSON response to stdout. */
|
|
492
|
+
declare function runRpcOnce(method?: string): Promise<number>;
|
|
493
|
+
/** Read JSONL requests from stdin, write JSONL responses to stdout. */
|
|
494
|
+
declare function runRpcBatch(method?: string): Promise<number>;
|
|
145
495
|
|
|
146
496
|
/**
|
|
147
497
|
* Built-in rubrics shipped with agent-eval.
|
|
@@ -177,10 +527,24 @@ declare function listBuiltinRubrics(): {
|
|
|
177
527
|
rubricVersion: string;
|
|
178
528
|
}[];
|
|
179
529
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
530
|
+
interface CreateAppOptions {
|
|
531
|
+
/** Stores wired to the ingestion endpoints. */
|
|
532
|
+
stores?: IngestionStores;
|
|
533
|
+
/**
|
|
534
|
+
* Bearer-token auth. When provided, every endpoint EXCEPT `/healthz`
|
|
535
|
+
* and `/v1/version` requires `Authorization: Bearer <token>`. The
|
|
536
|
+
* token may be a static string OR a function for time-bounded /
|
|
537
|
+
* rotating tokens.
|
|
538
|
+
*
|
|
539
|
+
* Recommended for any server that accepts ingestion writes from the
|
|
540
|
+
* public internet. Read-only deployments may omit it.
|
|
541
|
+
*/
|
|
542
|
+
auth?: {
|
|
543
|
+
bearer: string | ((token: string) => boolean | Promise<boolean>);
|
|
544
|
+
};
|
|
545
|
+
}
|
|
546
|
+
declare function createApp(opts?: CreateAppOptions): Hono<hono_types.BlankEnv, hono_types.BlankSchema, "/">;
|
|
547
|
+
interface ServeOptions extends CreateAppOptions {
|
|
184
548
|
/** Default 5005. */
|
|
185
549
|
port?: number;
|
|
186
550
|
/** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */
|
|
@@ -188,24 +552,4 @@ interface ServeOptions {
|
|
|
188
552
|
}
|
|
189
553
|
declare function startServer(opts?: ServeOptions): ServerType;
|
|
190
554
|
|
|
191
|
-
interface RpcRequest {
|
|
192
|
-
method: 'judge' | 'listRubrics' | 'version';
|
|
193
|
-
params?: unknown;
|
|
194
|
-
}
|
|
195
|
-
interface RpcSuccess {
|
|
196
|
-
result: unknown;
|
|
197
|
-
}
|
|
198
|
-
interface RpcError {
|
|
199
|
-
error: {
|
|
200
|
-
code: string;
|
|
201
|
-
message: string;
|
|
202
|
-
details?: unknown;
|
|
203
|
-
};
|
|
204
|
-
}
|
|
205
|
-
declare function dispatchRpc(req: RpcRequest): Promise<RpcSuccess | RpcError>;
|
|
206
|
-
/** Read one JSON request from stdin, write one JSON response to stdout. */
|
|
207
|
-
declare function runRpcOnce(method?: string): Promise<number>;
|
|
208
|
-
/** Read JSONL requests from stdin, write JSONL responses to stdout. */
|
|
209
|
-
declare function runRpcBatch(method?: string): Promise<number>;
|
|
210
|
-
|
|
211
|
-
export { BUILTIN_RUBRICS, type ErrorResponse, ErrorResponseSchema, type FailureMode, FailureModeSchema, HealthResponseSchema, type JudgeRequest, JudgeRequestSchema, type JudgeResult, JudgeResultSchema, type ListRubricsResponse, ListRubricsResponseSchema, type Rubric, type RubricDimension, RubricDimensionSchema, type RubricInfo, RubricInfoSchema, RubricSchema, type ServeOptions, type VersionResponse, VersionResponseSchema, WIRE_VERSION, WireError, buildOpenApi, createApp, dispatchRpc, getBuiltinRubric, handleJudge, handleListRubrics, handleVersion, hashRubric, listBuiltinRubrics, runRpcBatch, runRpcOnce, startServer };
|
|
555
|
+
export { BUILTIN_RUBRICS, type ErrorResponse, ErrorResponseSchema, type FailureMode, FailureModeSchema, FeedbackAttemptSchema, type FeedbackIngestResponse, FeedbackIngestResponseSchema, FeedbackLabelSchema, type FeedbackTrajectory, FeedbackTrajectorySchema, HealthResponseSchema, type IngestionStores, type JudgeRequest, JudgeRequestSchema, type JudgeResult, JudgeResultSchema, type ListRubricsResponse, ListRubricsResponseSchema, type Rubric, type RubricDimension, RubricDimensionSchema, type RubricInfo, RubricInfoSchema, RubricSchema, type ServeOptions, type TraceEvent, TraceEventSchema, type TracesIngestRequest, TracesIngestRequestSchema, type TracesIngestResponse, TracesIngestResponseSchema, type VersionResponse, VersionResponseSchema, WIRE_VERSION, WireError, buildOpenApi, createApp, dispatchRpc, getBuiltinRubric, handleFeedbackIngest, handleJudge, handleListRubrics, handleTracesIngest, handleVersion, hashRubric, listBuiltinRubrics, runRpcBatch, runRpcOnce, startServer };
|
package/dist/wire/index.js
CHANGED
|
@@ -2,6 +2,10 @@ import {
|
|
|
2
2
|
BUILTIN_RUBRICS,
|
|
3
3
|
ErrorResponseSchema,
|
|
4
4
|
FailureModeSchema,
|
|
5
|
+
FeedbackAttemptSchema,
|
|
6
|
+
FeedbackIngestResponseSchema,
|
|
7
|
+
FeedbackLabelSchema,
|
|
8
|
+
FeedbackTrajectorySchema,
|
|
5
9
|
HealthResponseSchema,
|
|
6
10
|
JudgeRequestSchema,
|
|
7
11
|
JudgeResultSchema,
|
|
@@ -9,6 +13,9 @@ import {
|
|
|
9
13
|
RubricDimensionSchema,
|
|
10
14
|
RubricInfoSchema,
|
|
11
15
|
RubricSchema,
|
|
16
|
+
TraceEventSchema,
|
|
17
|
+
TracesIngestRequestSchema,
|
|
18
|
+
TracesIngestResponseSchema,
|
|
12
19
|
VersionResponseSchema,
|
|
13
20
|
WIRE_VERSION,
|
|
14
21
|
WireError,
|
|
@@ -16,22 +23,29 @@ import {
|
|
|
16
23
|
createApp,
|
|
17
24
|
dispatchRpc,
|
|
18
25
|
getBuiltinRubric,
|
|
26
|
+
handleFeedbackIngest,
|
|
19
27
|
handleJudge,
|
|
20
28
|
handleListRubrics,
|
|
29
|
+
handleTracesIngest,
|
|
21
30
|
handleVersion,
|
|
22
31
|
hashRubric,
|
|
23
32
|
listBuiltinRubrics,
|
|
24
33
|
runRpcBatch,
|
|
25
34
|
runRpcOnce,
|
|
26
35
|
startServer
|
|
27
|
-
} from "../chunk-
|
|
28
|
-
import "../chunk-
|
|
29
|
-
import "../chunk-
|
|
36
|
+
} from "../chunk-5LBB5B3Z.js";
|
|
37
|
+
import "../chunk-4S4BM3QQ.js";
|
|
38
|
+
import "../chunk-PC4UYEBM.js";
|
|
39
|
+
import "../chunk-NG236HPC.js";
|
|
30
40
|
import "../chunk-PZ5AY32C.js";
|
|
31
41
|
export {
|
|
32
42
|
BUILTIN_RUBRICS,
|
|
33
43
|
ErrorResponseSchema,
|
|
34
44
|
FailureModeSchema,
|
|
45
|
+
FeedbackAttemptSchema,
|
|
46
|
+
FeedbackIngestResponseSchema,
|
|
47
|
+
FeedbackLabelSchema,
|
|
48
|
+
FeedbackTrajectorySchema,
|
|
35
49
|
HealthResponseSchema,
|
|
36
50
|
JudgeRequestSchema,
|
|
37
51
|
JudgeResultSchema,
|
|
@@ -39,6 +53,9 @@ export {
|
|
|
39
53
|
RubricDimensionSchema,
|
|
40
54
|
RubricInfoSchema,
|
|
41
55
|
RubricSchema,
|
|
56
|
+
TraceEventSchema,
|
|
57
|
+
TracesIngestRequestSchema,
|
|
58
|
+
TracesIngestResponseSchema,
|
|
42
59
|
VersionResponseSchema,
|
|
43
60
|
WIRE_VERSION,
|
|
44
61
|
WireError,
|
|
@@ -46,8 +63,10 @@ export {
|
|
|
46
63
|
createApp,
|
|
47
64
|
dispatchRpc,
|
|
48
65
|
getBuiltinRubric,
|
|
66
|
+
handleFeedbackIngest,
|
|
49
67
|
handleJudge,
|
|
50
68
|
handleListRubrics,
|
|
69
|
+
handleTracesIngest,
|
|
51
70
|
handleVersion,
|
|
52
71
|
hashRubric,
|
|
53
72
|
listBuiltinRubrics,
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.23.1",
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.25.0",
|
|
4
|
+
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
7
7
|
"type": "git",
|
|
@@ -64,6 +64,36 @@
|
|
|
64
64
|
"import": "./dist/benchmarks/index.js",
|
|
65
65
|
"default": "./dist/benchmarks/index.js"
|
|
66
66
|
},
|
|
67
|
+
"./pipelines": {
|
|
68
|
+
"types": "./dist/pipelines/index.d.ts",
|
|
69
|
+
"import": "./dist/pipelines/index.js",
|
|
70
|
+
"default": "./dist/pipelines/index.js"
|
|
71
|
+
},
|
|
72
|
+
"./meta-eval": {
|
|
73
|
+
"types": "./dist/meta-eval/index.d.ts",
|
|
74
|
+
"import": "./dist/meta-eval/index.js",
|
|
75
|
+
"default": "./dist/meta-eval/index.js"
|
|
76
|
+
},
|
|
77
|
+
"./prm": {
|
|
78
|
+
"types": "./dist/prm/index.d.ts",
|
|
79
|
+
"import": "./dist/prm/index.js",
|
|
80
|
+
"default": "./dist/prm/index.js"
|
|
81
|
+
},
|
|
82
|
+
"./builder-eval": {
|
|
83
|
+
"types": "./dist/builder-eval/index.d.ts",
|
|
84
|
+
"import": "./dist/builder-eval/index.js",
|
|
85
|
+
"default": "./dist/builder-eval/index.js"
|
|
86
|
+
},
|
|
87
|
+
"./governance": {
|
|
88
|
+
"types": "./dist/governance/index.d.ts",
|
|
89
|
+
"import": "./dist/governance/index.js",
|
|
90
|
+
"default": "./dist/governance/index.js"
|
|
91
|
+
},
|
|
92
|
+
"./knowledge": {
|
|
93
|
+
"types": "./dist/knowledge/index.d.ts",
|
|
94
|
+
"import": "./dist/knowledge/index.js",
|
|
95
|
+
"default": "./dist/knowledge/index.js"
|
|
96
|
+
},
|
|
67
97
|
"./openapi.json": {
|
|
68
98
|
"default": "./dist/openapi.json"
|
|
69
99
|
}
|
|
@@ -79,15 +109,6 @@
|
|
|
79
109
|
"publishConfig": {
|
|
80
110
|
"access": "public"
|
|
81
111
|
},
|
|
82
|
-
"scripts": {
|
|
83
|
-
"build": "tsup && pnpm openapi",
|
|
84
|
-
"dev": "tsup --watch",
|
|
85
|
-
"prepare": "pnpm build",
|
|
86
|
-
"test": "vitest run",
|
|
87
|
-
"test:watch": "vitest",
|
|
88
|
-
"typecheck": "tsc --noEmit",
|
|
89
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
90
|
-
},
|
|
91
112
|
"dependencies": {
|
|
92
113
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
93
114
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -97,20 +118,25 @@
|
|
|
97
118
|
"zod": "^4.3.6"
|
|
98
119
|
},
|
|
99
120
|
"devDependencies": {
|
|
121
|
+
"@biomejs/biome": "^2.4.15",
|
|
100
122
|
"@types/node": "^25.6.0",
|
|
101
123
|
"openapi3-ts": "^4.5.0",
|
|
102
124
|
"tsup": "^8.0.0",
|
|
103
125
|
"typescript": "^5.7.0",
|
|
104
126
|
"vitest": "^3.0.0"
|
|
105
127
|
},
|
|
106
|
-
"pnpm": {
|
|
107
|
-
"overrides": {
|
|
108
|
-
"postcss@<8.5.10": "^8.5.10"
|
|
109
|
-
}
|
|
110
|
-
},
|
|
111
128
|
"engines": {
|
|
112
129
|
"node": ">=20"
|
|
113
130
|
},
|
|
114
131
|
"license": "MIT",
|
|
115
|
-
"
|
|
116
|
-
|
|
132
|
+
"scripts": {
|
|
133
|
+
"build": "tsup && pnpm openapi",
|
|
134
|
+
"dev": "tsup --watch",
|
|
135
|
+
"test": "vitest run",
|
|
136
|
+
"test:watch": "vitest",
|
|
137
|
+
"typecheck": "tsc --noEmit",
|
|
138
|
+
"lint": "biome check src",
|
|
139
|
+
"format": "biome format --write src",
|
|
140
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
141
|
+
}
|
|
142
|
+
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset via an external\n * grader command.\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. 
parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. 
*/\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n 
const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. 
Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). 
*/\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt 
optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n 
},\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAA
U;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}
|