@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  callLlmJson
3
- } from "./chunk-KAO3Q65R.js";
3
+ } from "./chunk-4S4BM3QQ.js";
4
4
 
5
5
  // src/wire/schemas.ts
6
6
  import { extendZodWithOpenApi } from "@asteasolutions/zod-to-openapi";
@@ -74,9 +74,119 @@ var HealthResponseSchema = z.object({
74
74
  status: z.literal("ok"),
75
75
  uptimeSec: z.number()
76
76
  }).openapi("HealthResponse");
77
+ var TraceEventSchema = z.object({
78
+ eventId: z.string().min(1).describe("Stable id for the event. Use ULID or UUID."),
79
+ runId: z.string().min(1).describe("Run this event belongs to."),
80
+ spanId: z.string().optional().describe("Span that emitted the event, if any."),
81
+ kind: z.enum([
82
+ "log",
83
+ "error",
84
+ "budget_decrement",
85
+ "budget_breach",
86
+ "state_mutation",
87
+ "policy_violation",
88
+ "redaction_applied",
89
+ "custom"
90
+ ]).describe("Coarse event category \u2014 matches the TraceSchema v1 EventKind enum."),
91
+ timestamp: z.number().int().nonnegative().describe("Unix millis. Must be monotonically non-decreasing within a span."),
92
+ payload: z.record(z.string(), z.unknown()).describe("Free-form payload \u2014 the runtime owns the shape.")
93
+ }).openapi("TraceEvent");
94
+ var TracesIngestRequestSchema = z.object({
95
+ events: z.array(TraceEventSchema).min(1).max(1e4).describe("Batch of events. Max 10k per call \u2014 bigger streams should be chunked.")
96
+ }).openapi("TracesIngestRequest");
97
+ var TracesIngestResponseSchema = z.object({
98
+ accepted: z.number().int().nonnegative().describe("Number of events persisted."),
99
+ rejected: z.number().int().nonnegative().describe("Number of events the store refused \u2014 see `errors[]` for reasons."),
100
+ errors: z.array(
101
+ z.object({
102
+ eventId: z.string().describe("Event id this error applies to."),
103
+ message: z.string().describe("Why the event was rejected.")
104
+ })
105
+ ).default([])
106
+ }).openapi("TracesIngestResponse");
107
+ var FeedbackLabelSchema = z.object({
108
+ id: z.string().optional(),
109
+ source: z.enum(["user", "judge", "environment", "metric", "policy", "system"]),
110
+ kind: z.enum([
111
+ "approve",
112
+ "reject",
113
+ "select",
114
+ "edit",
115
+ "rank",
116
+ "rate",
117
+ "comment",
118
+ "metric_outcome",
119
+ "policy_block",
120
+ "revision_request"
121
+ ]),
122
+ value: z.unknown(),
123
+ reason: z.string().optional(),
124
+ severity: z.enum(["info", "warning", "error", "critical"]).optional(),
125
+ createdAt: z.string().describe("ISO-8601 UTC."),
126
+ metadata: z.record(z.string(), z.unknown()).optional()
127
+ }).openapi("FeedbackLabel");
128
+ var FeedbackAttemptSchema = z.object({
129
+ id: z.string().min(1),
130
+ stepIndex: z.number().int().nonnegative(),
131
+ artifactType: z.enum([
132
+ "text",
133
+ "code",
134
+ "plan",
135
+ "research",
136
+ "action",
137
+ "ui",
138
+ "decision",
139
+ "data",
140
+ "other"
141
+ ]),
142
+ artifact: z.unknown(),
143
+ options: z.array(z.unknown()).optional(),
144
+ proposedAction: z.object({
145
+ type: z.string(),
146
+ risk: z.enum(["low", "medium", "high"]).optional(),
147
+ costUsd: z.number().optional(),
148
+ externalSideEffect: z.boolean().optional(),
149
+ requiresApproval: z.boolean().optional(),
150
+ metadata: z.record(z.string(), z.unknown()).optional()
151
+ }).optional(),
152
+ feedback: z.array(FeedbackLabelSchema).optional(),
153
+ createdAt: z.string(),
154
+ metadata: z.record(z.string(), z.unknown()).optional()
155
+ }).openapi("FeedbackAttempt");
156
+ var FeedbackTrajectorySchema = z.object({
157
+ id: z.string().min(1).describe("Stable id; idempotency key for the trajectory."),
158
+ projectId: z.string().optional(),
159
+ scenarioId: z.string().optional(),
160
+ task: z.object({
161
+ intent: z.string().min(1),
162
+ context: z.unknown().optional()
163
+ }),
164
+ attempts: z.array(FeedbackAttemptSchema).default([]),
165
+ labels: z.array(FeedbackLabelSchema).default([]),
166
+ outcome: z.object({
167
+ success: z.boolean().optional(),
168
+ score: z.number().optional(),
169
+ metrics: z.record(z.string(), z.number()).optional(),
170
+ costUsd: z.number().optional(),
171
+ detail: z.string().optional(),
172
+ observedAt: z.string().optional(),
173
+ metadata: z.record(z.string(), z.unknown()).optional()
174
+ }).optional(),
175
+ split: z.enum(["train", "dev", "test", "holdout"]).optional(),
176
+ tags: z.record(z.string(), z.string()).optional(),
177
+ createdAt: z.string().describe("ISO-8601 UTC."),
178
+ updatedAt: z.string().optional(),
179
+ metadata: z.record(z.string(), z.unknown()).optional()
180
+ }).openapi("FeedbackTrajectory");
181
+ var FeedbackIngestResponseSchema = z.object({
182
+ id: z.string().describe("Trajectory id that was persisted."),
183
+ persisted: z.boolean().describe("True when the trajectory was saved (idempotent on id).")
184
+ }).openapi("FeedbackIngestResponse");
77
185
  var ErrorResponseSchema = z.object({
78
186
  error: z.object({
79
- code: z.string().describe('Machine-readable code: "validation_error", "rubric_not_found", "judge_error".'),
187
+ code: z.string().describe(
188
+ 'Machine-readable code: "validation_error", "rubric_not_found", "judge_error".'
189
+ ),
80
190
  message: z.string().describe("Human-readable message."),
81
191
  details: z.unknown().optional().describe("Optional structured detail.")
82
192
  }).describe("Errors are always wrapped in this shape across all endpoints.")
@@ -247,7 +357,12 @@ function validateJudgeOutput(value, rubric) {
247
357
  for (const dim of rubric.dimensions) {
248
358
  const score = dimensionRecord[dim.id];
249
359
  if (typeof score !== "number" || !Number.isFinite(score) || score < dim.min || score > dim.max) {
250
- throw new WireError("judge_error", `Judge returned invalid score for dimension "${dim.id}".`, 500, value);
360
+ throw new WireError(
361
+ "judge_error",
362
+ `Judge returned invalid score for dimension "${dim.id}".`,
363
+ 500,
364
+ value
365
+ );
251
366
  }
252
367
  dimensions[dim.id] = score;
253
368
  }
@@ -268,7 +383,12 @@ function validateIdArray(raw, allowed, field, original) {
268
383
  const out = [];
269
384
  for (const item of raw) {
270
385
  if (typeof item !== "string" || !allowed.has(item)) {
271
- throw new WireError("judge_error", `Judge returned unknown ${field} id "${String(item)}".`, 500, original);
386
+ throw new WireError(
387
+ "judge_error",
388
+ `Judge returned unknown ${field} id "${String(item)}".`,
389
+ 500,
390
+ original
391
+ );
272
392
  }
273
393
  out.push(item);
274
394
  }
@@ -366,12 +486,46 @@ function handleVersion() {
366
486
  package: "@tangle-network/agent-eval",
367
487
  version: readPackageVersion(),
368
488
  wireVersion: WIRE_VERSION,
369
- apiSurface: ["judge", "listRubrics", "version"]
489
+ apiSurface: ["judge", "listRubrics", "version", "feedback.ingest", "traces.ingest"]
370
490
  };
371
491
  }
492
+ async function handleTracesIngest(req, stores) {
493
+ if (!stores.traceStore) {
494
+ throw new WireError(
495
+ "service_unavailable",
496
+ "No trace store configured on this server. Pass `traceStore` to `createApp`.",
497
+ 503
498
+ );
499
+ }
500
+ const errors = [];
501
+ let accepted = 0;
502
+ for (const event of req.events) {
503
+ try {
504
+ await stores.traceStore.appendEvent(event);
505
+ accepted++;
506
+ } catch (err) {
507
+ errors.push({
508
+ eventId: event.eventId,
509
+ message: err instanceof Error ? err.message : String(err)
510
+ });
511
+ }
512
+ }
513
+ return { accepted, rejected: errors.length, errors };
514
+ }
515
+ async function handleFeedbackIngest(req, stores) {
516
+ if (!stores.feedbackStore) {
517
+ throw new WireError(
518
+ "service_unavailable",
519
+ "No feedback store configured on this server. Pass `feedbackStore` to `createApp`.",
520
+ 503
521
+ );
522
+ }
523
+ await stores.feedbackStore.save(req);
524
+ return { id: req.id, persisted: true };
525
+ }
372
526
 
373
527
  // src/wire/openapi.ts
374
- import { OpenApiGeneratorV31, OpenAPIRegistry } from "@asteasolutions/zod-to-openapi";
528
+ import { OpenAPIRegistry, OpenApiGeneratorV31 } from "@asteasolutions/zod-to-openapi";
375
529
  function buildOpenApi(packageVersion) {
376
530
  const registry = new OpenAPIRegistry();
377
531
  registry.register("JudgeRequest", JudgeRequestSchema);
@@ -380,6 +534,10 @@ function buildOpenApi(packageVersion) {
380
534
  registry.register("VersionResponse", VersionResponseSchema);
381
535
  registry.register("HealthResponse", HealthResponseSchema);
382
536
  registry.register("ErrorResponse", ErrorResponseSchema);
537
+ registry.register("TracesIngestRequest", TracesIngestRequestSchema);
538
+ registry.register("TracesIngestResponse", TracesIngestResponseSchema);
539
+ registry.register("FeedbackTrajectory", FeedbackTrajectorySchema);
540
+ registry.register("FeedbackIngestResponse", FeedbackIngestResponseSchema);
383
541
  registry.registerPath({
384
542
  method: "post",
385
543
  path: "/v1/judge",
@@ -446,6 +604,69 @@ function buildOpenApi(packageVersion) {
446
604
  }
447
605
  }
448
606
  });
607
+ registry.registerPath({
608
+ method: "post",
609
+ path: "/v1/traces/ingest",
610
+ summary: "Ingest a batch of production TraceEvents",
611
+ description: "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
612
+ request: {
613
+ body: {
614
+ content: {
615
+ "application/json": { schema: TracesIngestRequestSchema },
616
+ "application/x-ndjson": { schema: TracesIngestRequestSchema }
617
+ }
618
+ }
619
+ },
620
+ responses: {
621
+ 200: {
622
+ description: "Ingestion summary",
623
+ content: { "application/json": { schema: TracesIngestResponseSchema } }
624
+ },
625
+ 400: {
626
+ description: "Validation error",
627
+ content: { "application/json": { schema: ErrorResponseSchema } }
628
+ },
629
+ 401: {
630
+ description: "Unauthorized (when bearer auth is configured)",
631
+ content: { "application/json": { schema: ErrorResponseSchema } }
632
+ },
633
+ 503: {
634
+ description: "No trace store configured",
635
+ content: { "application/json": { schema: ErrorResponseSchema } }
636
+ }
637
+ }
638
+ });
639
+ registry.registerPath({
640
+ method: "post",
641
+ path: "/v1/feedback",
642
+ summary: "Ingest a FeedbackTrajectory from production",
643
+ description: "Persist a single FeedbackTrajectory. Idempotent on trajectory.id \u2014 re-posting replaces the prior record. Used by production runtimes to forward user \u{1F44D}/\u{1F44E}/edits into the eval substrate.",
644
+ request: {
645
+ body: {
646
+ content: {
647
+ "application/json": { schema: FeedbackTrajectorySchema }
648
+ }
649
+ }
650
+ },
651
+ responses: {
652
+ 200: {
653
+ description: "Persisted",
654
+ content: { "application/json": { schema: FeedbackIngestResponseSchema } }
655
+ },
656
+ 400: {
657
+ description: "Validation error",
658
+ content: { "application/json": { schema: ErrorResponseSchema } }
659
+ },
660
+ 401: {
661
+ description: "Unauthorized (when bearer auth is configured)",
662
+ content: { "application/json": { schema: ErrorResponseSchema } }
663
+ },
664
+ 503: {
665
+ description: "No feedback store configured",
666
+ content: { "application/json": { schema: ErrorResponseSchema } }
667
+ }
668
+ }
669
+ });
449
670
  const generator = new OpenApiGeneratorV31(registry.definitions);
450
671
  const doc = generator.generateDocument({
451
672
  openapi: "3.1.0",
@@ -494,62 +715,6 @@ Wire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/res
494
715
  return doc;
495
716
  }
496
717
 
497
- // src/wire/server.ts
498
- import { serve } from "@hono/node-server";
499
- import { Hono } from "hono";
500
- import { cors } from "hono/cors";
501
- var STARTED_AT = Date.now();
502
- function createApp() {
503
- const app = new Hono();
504
- app.use("*", cors());
505
- app.onError((err, c) => {
506
- if (err instanceof WireError) {
507
- return c.json(
508
- { error: { code: err.code, message: err.message, details: err.details } },
509
- err.status
510
- );
511
- }
512
- console.error("[agent-eval] unhandled error:", err);
513
- return c.json(
514
- { error: { code: "internal_error", message: "Internal server error." } },
515
- 500
516
- );
517
- });
518
- app.get(
519
- "/healthz",
520
- (c) => c.json({ status: "ok", uptimeSec: (Date.now() - STARTED_AT) / 1e3 })
521
- );
522
- app.get("/v1/version", (c) => c.json(handleVersion()));
523
- app.get("/v1/rubrics", (c) => c.json(handleListRubrics()));
524
- app.post("/v1/judge", async (c) => {
525
- const raw = await c.req.json().catch(() => null);
526
- if (raw == null) {
527
- throw new WireError("validation_error", "Request body must be JSON.", 400);
528
- }
529
- const parsed = JudgeRequestSchema.safeParse(raw);
530
- if (!parsed.success) {
531
- throw new WireError(
532
- "validation_error",
533
- "Request did not match JudgeRequest schema.",
534
- 400,
535
- parsed.error.issues
536
- );
537
- }
538
- const result = await handleJudge(parsed.data);
539
- return c.json(result);
540
- });
541
- app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
542
- return app;
543
- }
544
- function startServer(opts = {}) {
545
- const app = createApp();
546
- const port = opts.port ?? 5005;
547
- const host = opts.host ?? "127.0.0.1";
548
- return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
549
- console.log(`[agent-eval] serving on http://${address}:${actualPort}`);
550
- });
551
- }
552
-
553
718
  // src/wire/rpc.ts
554
719
  async function dispatchRpc(req) {
555
720
  try {
@@ -602,17 +767,19 @@ async function runRpcOnce(method) {
602
767
  req = method ? { method, params: body } : body;
603
768
  } catch (err) {
604
769
  process.stdout.write(
605
- JSON.stringify({
770
+ `${JSON.stringify({
606
771
  error: {
607
772
  code: "parse_error",
608
773
  message: `stdin was not valid JSON: ${err instanceof Error ? err.message : String(err)}`
609
774
  }
610
- }) + "\n"
775
+ })}
776
+ `
611
777
  );
612
778
  return 1;
613
779
  }
614
780
  const out = await dispatchRpc(req);
615
- process.stdout.write(JSON.stringify(out) + "\n");
781
+ process.stdout.write(`${JSON.stringify(out)}
782
+ `);
616
783
  return "error" in out ? 1 : 0;
617
784
  }
618
785
  async function runRpcBatch(method) {
@@ -626,23 +793,151 @@ async function runRpcBatch(method) {
626
793
  req = method ? { method, params: body } : body;
627
794
  } catch (err) {
628
795
  process.stdout.write(
629
- JSON.stringify({
796
+ `${JSON.stringify({
630
797
  error: {
631
798
  code: "parse_error",
632
799
  message: `line was not valid JSON: ${err instanceof Error ? err.message : String(err)}`
633
800
  }
634
- }) + "\n"
801
+ })}
802
+ `
635
803
  );
636
804
  exitCode = 1;
637
805
  continue;
638
806
  }
639
807
  const out = await dispatchRpc(req);
640
- process.stdout.write(JSON.stringify(out) + "\n");
808
+ process.stdout.write(`${JSON.stringify(out)}
809
+ `);
641
810
  if ("error" in out) exitCode = 1;
642
811
  }
643
812
  return exitCode;
644
813
  }
645
814
 
815
+ // src/wire/server.ts
816
+ import { serve } from "@hono/node-server";
817
+ import { Hono } from "hono";
818
+ import { cors } from "hono/cors";
819
+ var STARTED_AT = Date.now();
820
+ var AUTH_EXEMPT_PATHS = /* @__PURE__ */ new Set(["/healthz", "/v1/version", "/openapi.json"]);
821
+ function createApp(opts = {}) {
822
+ const app = new Hono();
823
+ app.use("*", cors());
824
+ if (opts.auth) {
825
+ const verify = opts.auth.bearer;
826
+ app.use("*", async (c, next) => {
827
+ const path = new URL(c.req.url).pathname;
828
+ if (AUTH_EXEMPT_PATHS.has(path)) return next();
829
+ const raw = c.req.header("authorization") ?? "";
830
+ const match = raw.match(/^Bearer\s+(.+)$/i);
831
+ if (!match) {
832
+ throw new WireError("unauthorized", "Missing or malformed Authorization header.", 401);
833
+ }
834
+ const token = match[1];
835
+ const ok = typeof verify === "string" ? token === verify : await verify(token);
836
+ if (!ok) {
837
+ throw new WireError("unauthorized", "Invalid bearer token.", 401);
838
+ }
839
+ return next();
840
+ });
841
+ }
842
+ app.onError((err, c) => {
843
+ if (err instanceof WireError) {
844
+ const status = err.status;
845
+ return c.json(
846
+ { error: { code: err.code, message: err.message, details: err.details } },
847
+ status
848
+ );
849
+ }
850
+ console.error("[agent-eval] unhandled error:", err);
851
+ return c.json({ error: { code: "internal_error", message: "Internal server error." } }, 500);
852
+ });
853
+ app.get(
854
+ "/healthz",
855
+ (c) => c.json({ status: "ok", uptimeSec: (Date.now() - STARTED_AT) / 1e3 })
856
+ );
857
+ app.get("/v1/version", (c) => c.json(handleVersion()));
858
+ app.get("/v1/rubrics", (c) => c.json(handleListRubrics()));
859
+ app.post("/v1/judge", async (c) => {
860
+ const raw = await c.req.json().catch(() => null);
861
+ if (raw == null) {
862
+ throw new WireError("validation_error", "Request body must be JSON.", 400);
863
+ }
864
+ const parsed = JudgeRequestSchema.safeParse(raw);
865
+ if (!parsed.success) {
866
+ throw new WireError(
867
+ "validation_error",
868
+ "Request did not match JudgeRequest schema.",
869
+ 400,
870
+ parsed.error.issues
871
+ );
872
+ }
873
+ const result = await handleJudge(parsed.data);
874
+ return c.json(result);
875
+ });
876
+ app.post("/v1/traces/ingest", async (c) => {
877
+ const contentType = c.req.header("content-type") ?? "";
878
+ let payload;
879
+ if (contentType.includes("application/x-ndjson")) {
880
+ const text = await c.req.text();
881
+ const events = text.split("\n").map((line) => line.trim()).filter((line) => line.length > 0).map((line) => {
882
+ try {
883
+ return JSON.parse(line);
884
+ } catch {
885
+ throw new WireError(
886
+ "validation_error",
887
+ "NDJSON line did not parse as JSON.",
888
+ 400,
889
+ line.slice(0, 200)
890
+ );
891
+ }
892
+ });
893
+ payload = { events };
894
+ } else {
895
+ payload = await c.req.json().catch(() => null);
896
+ }
897
+ if (payload == null) {
898
+ throw new WireError("validation_error", "Request body must be JSON or NDJSON.", 400);
899
+ }
900
+ const parsed = TracesIngestRequestSchema.safeParse(payload);
901
+ if (!parsed.success) {
902
+ throw new WireError(
903
+ "validation_error",
904
+ "Request did not match TracesIngestRequest schema.",
905
+ 400,
906
+ parsed.error.issues
907
+ );
908
+ }
909
+ const result = await handleTracesIngest(parsed.data, opts.stores ?? {});
910
+ return c.json(result);
911
+ });
912
+ app.post("/v1/feedback", async (c) => {
913
+ const raw = await c.req.json().catch(() => null);
914
+ if (raw == null) {
915
+ throw new WireError("validation_error", "Request body must be JSON.", 400);
916
+ }
917
+ const parsed = FeedbackTrajectorySchema.safeParse(raw);
918
+ if (!parsed.success) {
919
+ throw new WireError(
920
+ "validation_error",
921
+ "Request did not match FeedbackTrajectory schema.",
922
+ 400,
923
+ parsed.error.issues
924
+ );
925
+ }
926
+ const result = await handleFeedbackIngest(parsed.data, opts.stores ?? {});
927
+ return c.json(result);
928
+ });
929
+ app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
930
+ return app;
931
+ }
932
+ function startServer(opts = {}) {
933
+ const app = createApp(opts);
934
+ const port = opts.port ?? 5005;
935
+ const host = opts.host ?? "127.0.0.1";
936
+ return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
937
+ console.log(`[agent-eval] serving on http://${address}:${actualPort}`);
938
+ });
939
+ }
940
+
646
941
  export {
647
942
  RubricDimensionSchema,
648
943
  FailureModeSchema,
@@ -653,6 +948,13 @@ export {
653
948
  ListRubricsResponseSchema,
654
949
  VersionResponseSchema,
655
950
  HealthResponseSchema,
951
+ TraceEventSchema,
952
+ TracesIngestRequestSchema,
953
+ TracesIngestResponseSchema,
954
+ FeedbackLabelSchema,
955
+ FeedbackAttemptSchema,
956
+ FeedbackTrajectorySchema,
957
+ FeedbackIngestResponseSchema,
656
958
  ErrorResponseSchema,
657
959
  WIRE_VERSION,
658
960
  hashRubric,
@@ -663,11 +965,13 @@ export {
663
965
  handleJudge,
664
966
  handleListRubrics,
665
967
  handleVersion,
968
+ handleTracesIngest,
969
+ handleFeedbackIngest,
666
970
  buildOpenApi,
667
- createApp,
668
- startServer,
669
971
  dispatchRpc,
670
972
  runRpcOnce,
671
- runRpcBatch
973
+ runRpcBatch,
974
+ createApp,
975
+ startServer
672
976
  };
673
- //# sourceMappingURL=chunk-6KQG5HAH.js.map
977
+ //# sourceMappingURL=chunk-5LBB5B3Z.js.map