@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -1,8 +1,14 @@
1
+ import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-DfFdrraJ.js';
2
+ import { T as TraceStore } from '../store-Db2Bv8Cf.js';
1
3
  import { z } from 'zod';
2
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
3
5
  import * as hono_types from 'hono/types';
4
6
  import { ServerType } from '@hono/node-server';
5
7
  import { Hono } from 'hono';
8
+ import '../control-runtime-BuJHoLg0.js';
9
+ import '../emitter-DP_cSSiw.js';
10
+ import '../dataset-CiK_3LDr.js';
11
+ import '../errors-BZ9sTdz7.js';
6
12
 
7
13
  declare const RubricDimensionSchema: z.ZodObject<{
8
14
  id: z.ZodString;
@@ -105,6 +111,287 @@ declare const HealthResponseSchema: z.ZodObject<{
105
111
  status: z.ZodLiteral<"ok">;
106
112
  uptimeSec: z.ZodNumber;
107
113
  }, z.core.$strip>;
114
+ /**
115
+ * Minimal `TraceEvent` shape that the production runtime emits.
116
+ * Matches `trace/schema.ts` `TraceEvent` but is duplicated here as a
117
+ * wire schema so non-TypeScript clients can validate without depending
118
+ * on internal types.
119
+ */
120
+ declare const TraceEventSchema: z.ZodObject<{
121
+ eventId: z.ZodString;
122
+ runId: z.ZodString;
123
+ spanId: z.ZodOptional<z.ZodString>;
124
+ kind: z.ZodEnum<{
125
+ policy_violation: "policy_violation";
126
+ custom: "custom";
127
+ error: "error";
128
+ log: "log";
129
+ budget_decrement: "budget_decrement";
130
+ budget_breach: "budget_breach";
131
+ state_mutation: "state_mutation";
132
+ redaction_applied: "redaction_applied";
133
+ }>;
134
+ timestamp: z.ZodNumber;
135
+ payload: z.ZodRecord<z.ZodString, z.ZodUnknown>;
136
+ }, z.core.$strip>;
137
+ declare const TracesIngestRequestSchema: z.ZodObject<{
138
+ events: z.ZodArray<z.ZodObject<{
139
+ eventId: z.ZodString;
140
+ runId: z.ZodString;
141
+ spanId: z.ZodOptional<z.ZodString>;
142
+ kind: z.ZodEnum<{
143
+ policy_violation: "policy_violation";
144
+ custom: "custom";
145
+ error: "error";
146
+ log: "log";
147
+ budget_decrement: "budget_decrement";
148
+ budget_breach: "budget_breach";
149
+ state_mutation: "state_mutation";
150
+ redaction_applied: "redaction_applied";
151
+ }>;
152
+ timestamp: z.ZodNumber;
153
+ payload: z.ZodRecord<z.ZodString, z.ZodUnknown>;
154
+ }, z.core.$strip>>;
155
+ }, z.core.$strip>;
156
+ declare const TracesIngestResponseSchema: z.ZodObject<{
157
+ accepted: z.ZodNumber;
158
+ rejected: z.ZodNumber;
159
+ errors: z.ZodDefault<z.ZodArray<z.ZodObject<{
160
+ eventId: z.ZodString;
161
+ message: z.ZodString;
162
+ }, z.core.$strip>>>;
163
+ }, z.core.$strip>;
164
+ declare const FeedbackLabelSchema: z.ZodObject<{
165
+ id: z.ZodOptional<z.ZodString>;
166
+ source: z.ZodEnum<{
167
+ judge: "judge";
168
+ system: "system";
169
+ user: "user";
170
+ policy: "policy";
171
+ environment: "environment";
172
+ metric: "metric";
173
+ }>;
174
+ kind: z.ZodEnum<{
175
+ approve: "approve";
176
+ reject: "reject";
177
+ select: "select";
178
+ edit: "edit";
179
+ rank: "rank";
180
+ rate: "rate";
181
+ comment: "comment";
182
+ metric_outcome: "metric_outcome";
183
+ policy_block: "policy_block";
184
+ revision_request: "revision_request";
185
+ }>;
186
+ value: z.ZodUnknown;
187
+ reason: z.ZodOptional<z.ZodString>;
188
+ severity: z.ZodOptional<z.ZodEnum<{
189
+ error: "error";
190
+ info: "info";
191
+ warning: "warning";
192
+ critical: "critical";
193
+ }>>;
194
+ createdAt: z.ZodString;
195
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
196
+ }, z.core.$strip>;
197
+ declare const FeedbackAttemptSchema: z.ZodObject<{
198
+ id: z.ZodString;
199
+ stepIndex: z.ZodNumber;
200
+ artifactType: z.ZodEnum<{
201
+ action: "action";
202
+ decision: "decision";
203
+ text: "text";
204
+ code: "code";
205
+ plan: "plan";
206
+ research: "research";
207
+ ui: "ui";
208
+ data: "data";
209
+ other: "other";
210
+ }>;
211
+ artifact: z.ZodUnknown;
212
+ options: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
213
+ proposedAction: z.ZodOptional<z.ZodObject<{
214
+ type: z.ZodString;
215
+ risk: z.ZodOptional<z.ZodEnum<{
216
+ medium: "medium";
217
+ low: "low";
218
+ high: "high";
219
+ }>>;
220
+ costUsd: z.ZodOptional<z.ZodNumber>;
221
+ externalSideEffect: z.ZodOptional<z.ZodBoolean>;
222
+ requiresApproval: z.ZodOptional<z.ZodBoolean>;
223
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
224
+ }, z.core.$strip>>;
225
+ feedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
226
+ id: z.ZodOptional<z.ZodString>;
227
+ source: z.ZodEnum<{
228
+ judge: "judge";
229
+ system: "system";
230
+ user: "user";
231
+ policy: "policy";
232
+ environment: "environment";
233
+ metric: "metric";
234
+ }>;
235
+ kind: z.ZodEnum<{
236
+ approve: "approve";
237
+ reject: "reject";
238
+ select: "select";
239
+ edit: "edit";
240
+ rank: "rank";
241
+ rate: "rate";
242
+ comment: "comment";
243
+ metric_outcome: "metric_outcome";
244
+ policy_block: "policy_block";
245
+ revision_request: "revision_request";
246
+ }>;
247
+ value: z.ZodUnknown;
248
+ reason: z.ZodOptional<z.ZodString>;
249
+ severity: z.ZodOptional<z.ZodEnum<{
250
+ error: "error";
251
+ info: "info";
252
+ warning: "warning";
253
+ critical: "critical";
254
+ }>>;
255
+ createdAt: z.ZodString;
256
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
257
+ }, z.core.$strip>>>;
258
+ createdAt: z.ZodString;
259
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
260
+ }, z.core.$strip>;
261
+ declare const FeedbackTrajectorySchema: z.ZodObject<{
262
+ id: z.ZodString;
263
+ projectId: z.ZodOptional<z.ZodString>;
264
+ scenarioId: z.ZodOptional<z.ZodString>;
265
+ task: z.ZodObject<{
266
+ intent: z.ZodString;
267
+ context: z.ZodOptional<z.ZodUnknown>;
268
+ }, z.core.$strip>;
269
+ attempts: z.ZodDefault<z.ZodArray<z.ZodObject<{
270
+ id: z.ZodString;
271
+ stepIndex: z.ZodNumber;
272
+ artifactType: z.ZodEnum<{
273
+ action: "action";
274
+ decision: "decision";
275
+ text: "text";
276
+ code: "code";
277
+ plan: "plan";
278
+ research: "research";
279
+ ui: "ui";
280
+ data: "data";
281
+ other: "other";
282
+ }>;
283
+ artifact: z.ZodUnknown;
284
+ options: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
285
+ proposedAction: z.ZodOptional<z.ZodObject<{
286
+ type: z.ZodString;
287
+ risk: z.ZodOptional<z.ZodEnum<{
288
+ medium: "medium";
289
+ low: "low";
290
+ high: "high";
291
+ }>>;
292
+ costUsd: z.ZodOptional<z.ZodNumber>;
293
+ externalSideEffect: z.ZodOptional<z.ZodBoolean>;
294
+ requiresApproval: z.ZodOptional<z.ZodBoolean>;
295
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
296
+ }, z.core.$strip>>;
297
+ feedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
298
+ id: z.ZodOptional<z.ZodString>;
299
+ source: z.ZodEnum<{
300
+ judge: "judge";
301
+ system: "system";
302
+ user: "user";
303
+ policy: "policy";
304
+ environment: "environment";
305
+ metric: "metric";
306
+ }>;
307
+ kind: z.ZodEnum<{
308
+ approve: "approve";
309
+ reject: "reject";
310
+ select: "select";
311
+ edit: "edit";
312
+ rank: "rank";
313
+ rate: "rate";
314
+ comment: "comment";
315
+ metric_outcome: "metric_outcome";
316
+ policy_block: "policy_block";
317
+ revision_request: "revision_request";
318
+ }>;
319
+ value: z.ZodUnknown;
320
+ reason: z.ZodOptional<z.ZodString>;
321
+ severity: z.ZodOptional<z.ZodEnum<{
322
+ error: "error";
323
+ info: "info";
324
+ warning: "warning";
325
+ critical: "critical";
326
+ }>>;
327
+ createdAt: z.ZodString;
328
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
329
+ }, z.core.$strip>>>;
330
+ createdAt: z.ZodString;
331
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
332
+ }, z.core.$strip>>>;
333
+ labels: z.ZodDefault<z.ZodArray<z.ZodObject<{
334
+ id: z.ZodOptional<z.ZodString>;
335
+ source: z.ZodEnum<{
336
+ judge: "judge";
337
+ system: "system";
338
+ user: "user";
339
+ policy: "policy";
340
+ environment: "environment";
341
+ metric: "metric";
342
+ }>;
343
+ kind: z.ZodEnum<{
344
+ approve: "approve";
345
+ reject: "reject";
346
+ select: "select";
347
+ edit: "edit";
348
+ rank: "rank";
349
+ rate: "rate";
350
+ comment: "comment";
351
+ metric_outcome: "metric_outcome";
352
+ policy_block: "policy_block";
353
+ revision_request: "revision_request";
354
+ }>;
355
+ value: z.ZodUnknown;
356
+ reason: z.ZodOptional<z.ZodString>;
357
+ severity: z.ZodOptional<z.ZodEnum<{
358
+ error: "error";
359
+ info: "info";
360
+ warning: "warning";
361
+ critical: "critical";
362
+ }>>;
363
+ createdAt: z.ZodString;
364
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
365
+ }, z.core.$strip>>>;
366
+ outcome: z.ZodOptional<z.ZodObject<{
367
+ success: z.ZodOptional<z.ZodBoolean>;
368
+ score: z.ZodOptional<z.ZodNumber>;
369
+ metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
370
+ costUsd: z.ZodOptional<z.ZodNumber>;
371
+ detail: z.ZodOptional<z.ZodString>;
372
+ observedAt: z.ZodOptional<z.ZodString>;
373
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
374
+ }, z.core.$strip>>;
375
+ split: z.ZodOptional<z.ZodEnum<{
376
+ train: "train";
377
+ dev: "dev";
378
+ test: "test";
379
+ holdout: "holdout";
380
+ }>>;
381
+ tags: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
382
+ createdAt: z.ZodString;
383
+ updatedAt: z.ZodOptional<z.ZodString>;
384
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
385
+ }, z.core.$strip>;
386
+ declare const FeedbackIngestResponseSchema: z.ZodObject<{
387
+ id: z.ZodString;
388
+ persisted: z.ZodBoolean;
389
+ }, z.core.$strip>;
390
+ type TraceEvent = z.infer<typeof TraceEventSchema>;
391
+ type TracesIngestRequest = z.infer<typeof TracesIngestRequestSchema>;
392
+ type TracesIngestResponse = z.infer<typeof TracesIngestResponseSchema>;
393
+ type FeedbackTrajectory = z.infer<typeof FeedbackTrajectorySchema>;
394
+ type FeedbackIngestResponse = z.infer<typeof FeedbackIngestResponseSchema>;
108
395
  declare const ErrorResponseSchema: z.ZodObject<{
109
396
  error: z.ZodObject<{
110
397
  code: z.ZodString;
@@ -132,6 +419,18 @@ declare const WIRE_VERSION = "1.0.0";
132
419
  */
133
420
  declare function hashRubric(rubric: Rubric): string;
134
421
 
422
+ /**
423
+ * Pure handler functions — the "business logic" behind every wire-protocol
424
+ * method. The HTTP server (`server.ts`) and the stdio RPC (`rpc.ts`) both
425
+ * call these. Tests call these directly without spinning a server.
426
+ *
427
+ * Each handler:
428
+ * - Takes a parsed request (already Zod-validated by the transport).
429
+ * - Returns a result that matches the response schema.
430
+ * - Throws `WireError` for caller-fixable errors (404, 400, 422).
431
+ * - Lets unexpected errors bubble — the transport maps them to 500.
432
+ */
433
+
135
434
  /** Caller-fixable error. The transport renders this to 4xx + ErrorResponse. */
136
435
  declare class WireError extends Error {
137
436
  readonly code: string;
@@ -142,6 +441,57 @@ declare class WireError extends Error {
142
441
  declare function handleJudge(req: JudgeRequest): Promise<JudgeResult>;
143
442
  declare function handleListRubrics(): ListRubricsResponse;
144
443
  declare function handleVersion(): VersionResponse;
444
+ /**
445
+ * Pluggable stores the wire layer routes ingestion writes into. Both
446
+ * are optional — when omitted, the corresponding endpoint returns 503.
447
+ *
448
+ * Production deployments wire a `FileSystemTraceStore` and
449
+ * `FileSystemFeedbackTrajectoryStore` here. Tests substitute in-memory
450
+ * stores.
451
+ */
452
+ interface IngestionStores {
453
+ traceStore?: TraceStore;
454
+ feedbackStore?: FeedbackTrajectoryStore;
455
+ }
456
+ /**
457
+ * `POST /v1/traces/ingest` — accept a batch of `TraceEvent`s from the
458
+ * production runtime. Best-effort: each event is appended independently;
459
+ * one bad event does not poison the batch.
460
+ *
461
+ * Idempotency: the underlying store is append-only; consumers retrying
462
+ * the same payload will get duplicate events. Consumers should
463
+ * de-duplicate by `eventId` downstream — production traces frequently
464
+ * land via at-least-once buses (Kafka, SQS) where dedup is unavoidable.
465
+ */
466
+ declare function handleTracesIngest(req: TracesIngestRequest, stores: IngestionStores): Promise<TracesIngestResponse>;
467
+ /**
468
+ * `POST /v1/feedback` — accept a single `FeedbackTrajectory` from the
469
+ * production runtime. Idempotent on `id`: re-posting the same trajectory
470
+ * replaces the prior record.
471
+ */
472
+ declare function handleFeedbackIngest(req: FeedbackTrajectory, stores: IngestionStores): Promise<FeedbackIngestResponse>;
473
+
474
+ declare function buildOpenApi(packageVersion: string): OpenAPIObject;
475
+
476
+ interface RpcRequest {
477
+ method: 'judge' | 'listRubrics' | 'version';
478
+ params?: unknown;
479
+ }
480
+ interface RpcSuccess {
481
+ result: unknown;
482
+ }
483
+ interface RpcError {
484
+ error: {
485
+ code: string;
486
+ message: string;
487
+ details?: unknown;
488
+ };
489
+ }
490
+ declare function dispatchRpc(req: RpcRequest): Promise<RpcSuccess | RpcError>;
491
+ /** Read one JSON request from stdin, write one JSON response to stdout. */
492
+ declare function runRpcOnce(method?: string): Promise<number>;
493
+ /** Read JSONL requests from stdin, write JSONL responses to stdout. */
494
+ declare function runRpcBatch(method?: string): Promise<number>;
145
495
 
146
496
  /**
147
497
  * Built-in rubrics shipped with agent-eval.
@@ -177,10 +527,24 @@ declare function listBuiltinRubrics(): {
177
527
  rubricVersion: string;
178
528
  }[];
179
529
 
180
- declare function buildOpenApi(packageVersion: string): OpenAPIObject;
181
-
182
- declare function createApp(): Hono<hono_types.BlankEnv, hono_types.BlankSchema, "/">;
183
- interface ServeOptions {
530
+ interface CreateAppOptions {
531
+ /** Stores wired to the ingestion endpoints. */
532
+ stores?: IngestionStores;
533
+ /**
534
+ * Bearer-token auth. When provided, every endpoint EXCEPT `/healthz`
535
+ * and `/v1/version` requires `Authorization: Bearer <token>`. The
536
+ * token may be a static string OR a function for time-bounded /
537
+ * rotating tokens.
538
+ *
539
+ * Recommended for any server that accepts ingestion writes from the
540
+ * public internet. Read-only deployments may omit it.
541
+ */
542
+ auth?: {
543
+ bearer: string | ((token: string) => boolean | Promise<boolean>);
544
+ };
545
+ }
546
+ declare function createApp(opts?: CreateAppOptions): Hono<hono_types.BlankEnv, hono_types.BlankSchema, "/">;
547
+ interface ServeOptions extends CreateAppOptions {
184
548
  /** Default 5005. */
185
549
  port?: number;
186
550
  /** Default '127.0.0.1'. Set to '0.0.0.0' to listen on all interfaces. */
@@ -188,24 +552,4 @@ interface ServeOptions {
188
552
  }
189
553
  declare function startServer(opts?: ServeOptions): ServerType;
190
554
 
191
- interface RpcRequest {
192
- method: 'judge' | 'listRubrics' | 'version';
193
- params?: unknown;
194
- }
195
- interface RpcSuccess {
196
- result: unknown;
197
- }
198
- interface RpcError {
199
- error: {
200
- code: string;
201
- message: string;
202
- details?: unknown;
203
- };
204
- }
205
- declare function dispatchRpc(req: RpcRequest): Promise<RpcSuccess | RpcError>;
206
- /** Read one JSON request from stdin, write one JSON response to stdout. */
207
- declare function runRpcOnce(method?: string): Promise<number>;
208
- /** Read JSONL requests from stdin, write JSONL responses to stdout. */
209
- declare function runRpcBatch(method?: string): Promise<number>;
210
-
211
- export { BUILTIN_RUBRICS, type ErrorResponse, ErrorResponseSchema, type FailureMode, FailureModeSchema, HealthResponseSchema, type JudgeRequest, JudgeRequestSchema, type JudgeResult, JudgeResultSchema, type ListRubricsResponse, ListRubricsResponseSchema, type Rubric, type RubricDimension, RubricDimensionSchema, type RubricInfo, RubricInfoSchema, RubricSchema, type ServeOptions, type VersionResponse, VersionResponseSchema, WIRE_VERSION, WireError, buildOpenApi, createApp, dispatchRpc, getBuiltinRubric, handleJudge, handleListRubrics, handleVersion, hashRubric, listBuiltinRubrics, runRpcBatch, runRpcOnce, startServer };
555
+ export { BUILTIN_RUBRICS, type ErrorResponse, ErrorResponseSchema, type FailureMode, FailureModeSchema, FeedbackAttemptSchema, type FeedbackIngestResponse, FeedbackIngestResponseSchema, FeedbackLabelSchema, type FeedbackTrajectory, FeedbackTrajectorySchema, HealthResponseSchema, type IngestionStores, type JudgeRequest, JudgeRequestSchema, type JudgeResult, JudgeResultSchema, type ListRubricsResponse, ListRubricsResponseSchema, type Rubric, type RubricDimension, RubricDimensionSchema, type RubricInfo, RubricInfoSchema, RubricSchema, type ServeOptions, type TraceEvent, TraceEventSchema, type TracesIngestRequest, TracesIngestRequestSchema, type TracesIngestResponse, TracesIngestResponseSchema, type VersionResponse, VersionResponseSchema, WIRE_VERSION, WireError, buildOpenApi, createApp, dispatchRpc, getBuiltinRubric, handleFeedbackIngest, handleJudge, handleListRubrics, handleTracesIngest, handleVersion, hashRubric, listBuiltinRubrics, runRpcBatch, runRpcOnce, startServer };
@@ -2,6 +2,10 @@ import {
2
2
  BUILTIN_RUBRICS,
3
3
  ErrorResponseSchema,
4
4
  FailureModeSchema,
5
+ FeedbackAttemptSchema,
6
+ FeedbackIngestResponseSchema,
7
+ FeedbackLabelSchema,
8
+ FeedbackTrajectorySchema,
5
9
  HealthResponseSchema,
6
10
  JudgeRequestSchema,
7
11
  JudgeResultSchema,
@@ -9,6 +13,9 @@ import {
9
13
  RubricDimensionSchema,
10
14
  RubricInfoSchema,
11
15
  RubricSchema,
16
+ TraceEventSchema,
17
+ TracesIngestRequestSchema,
18
+ TracesIngestResponseSchema,
12
19
  VersionResponseSchema,
13
20
  WIRE_VERSION,
14
21
  WireError,
@@ -16,22 +23,29 @@ import {
16
23
  createApp,
17
24
  dispatchRpc,
18
25
  getBuiltinRubric,
26
+ handleFeedbackIngest,
19
27
  handleJudge,
20
28
  handleListRubrics,
29
+ handleTracesIngest,
21
30
  handleVersion,
22
31
  hashRubric,
23
32
  listBuiltinRubrics,
24
33
  runRpcBatch,
25
34
  runRpcOnce,
26
35
  startServer
27
- } from "../chunk-6KQG5HAH.js";
28
- import "../chunk-KAO3Q65R.js";
29
- import "../chunk-SQQLHODJ.js";
36
+ } from "../chunk-5LBB5B3Z.js";
37
+ import "../chunk-4S4BM3QQ.js";
38
+ import "../chunk-PC4UYEBM.js";
39
+ import "../chunk-NG236HPC.js";
30
40
  import "../chunk-PZ5AY32C.js";
31
41
  export {
32
42
  BUILTIN_RUBRICS,
33
43
  ErrorResponseSchema,
34
44
  FailureModeSchema,
45
+ FeedbackAttemptSchema,
46
+ FeedbackIngestResponseSchema,
47
+ FeedbackLabelSchema,
48
+ FeedbackTrajectorySchema,
35
49
  HealthResponseSchema,
36
50
  JudgeRequestSchema,
37
51
  JudgeResultSchema,
@@ -39,6 +53,9 @@ export {
39
53
  RubricDimensionSchema,
40
54
  RubricInfoSchema,
41
55
  RubricSchema,
56
+ TraceEventSchema,
57
+ TracesIngestRequestSchema,
58
+ TracesIngestResponseSchema,
42
59
  VersionResponseSchema,
43
60
  WIRE_VERSION,
44
61
  WireError,
@@ -46,8 +63,10 @@ export {
46
63
  createApp,
47
64
  dispatchRpc,
48
65
  getBuiltinRubric,
66
+ handleFeedbackIngest,
49
67
  handleJudge,
50
68
  handleListRubrics,
69
+ handleTracesIngest,
51
70
  handleVersion,
52
71
  hashRubric,
53
72
  listBuiltinRubrics,
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.23.1",
4
- "description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
3
+ "version": "0.25.0",
4
+ "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
7
7
  "type": "git",
@@ -64,6 +64,36 @@
64
64
  "import": "./dist/benchmarks/index.js",
65
65
  "default": "./dist/benchmarks/index.js"
66
66
  },
67
+ "./pipelines": {
68
+ "types": "./dist/pipelines/index.d.ts",
69
+ "import": "./dist/pipelines/index.js",
70
+ "default": "./dist/pipelines/index.js"
71
+ },
72
+ "./meta-eval": {
73
+ "types": "./dist/meta-eval/index.d.ts",
74
+ "import": "./dist/meta-eval/index.js",
75
+ "default": "./dist/meta-eval/index.js"
76
+ },
77
+ "./prm": {
78
+ "types": "./dist/prm/index.d.ts",
79
+ "import": "./dist/prm/index.js",
80
+ "default": "./dist/prm/index.js"
81
+ },
82
+ "./builder-eval": {
83
+ "types": "./dist/builder-eval/index.d.ts",
84
+ "import": "./dist/builder-eval/index.js",
85
+ "default": "./dist/builder-eval/index.js"
86
+ },
87
+ "./governance": {
88
+ "types": "./dist/governance/index.d.ts",
89
+ "import": "./dist/governance/index.js",
90
+ "default": "./dist/governance/index.js"
91
+ },
92
+ "./knowledge": {
93
+ "types": "./dist/knowledge/index.d.ts",
94
+ "import": "./dist/knowledge/index.js",
95
+ "default": "./dist/knowledge/index.js"
96
+ },
67
97
  "./openapi.json": {
68
98
  "default": "./dist/openapi.json"
69
99
  }
@@ -79,15 +109,6 @@
79
109
  "publishConfig": {
80
110
  "access": "public"
81
111
  },
82
- "scripts": {
83
- "build": "tsup && pnpm openapi",
84
- "dev": "tsup --watch",
85
- "prepare": "pnpm build",
86
- "test": "vitest run",
87
- "test:watch": "vitest",
88
- "typecheck": "tsc --noEmit",
89
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
90
- },
91
112
  "dependencies": {
92
113
  "@asteasolutions/zod-to-openapi": "^8.5.0",
93
114
  "@ax-llm/ax": "^19.0.25",
@@ -97,20 +118,25 @@
97
118
  "zod": "^4.3.6"
98
119
  },
99
120
  "devDependencies": {
121
+ "@biomejs/biome": "^2.4.15",
100
122
  "@types/node": "^25.6.0",
101
123
  "openapi3-ts": "^4.5.0",
102
124
  "tsup": "^8.0.0",
103
125
  "typescript": "^5.7.0",
104
126
  "vitest": "^3.0.0"
105
127
  },
106
- "pnpm": {
107
- "overrides": {
108
- "postcss@<8.5.10": "^8.5.10"
109
- }
110
- },
111
128
  "engines": {
112
129
  "node": ">=20"
113
130
  },
114
131
  "license": "MIT",
115
- "packageManager": "pnpm@10.22.0"
116
- }
132
+ "scripts": {
133
+ "build": "tsup && pnpm openapi",
134
+ "dev": "tsup --watch",
135
+ "test": "vitest run",
136
+ "test:watch": "vitest",
137
+ "typecheck": "tsc --noEmit",
138
+ "lint": "biome check src",
139
+ "format": "biome format --write src",
140
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
141
+ }
142
+ }
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset via an external\n * grader command.\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. */\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). */\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}