@tangle-network/agent-eval 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +212 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
- package/dist/chunk-EDUKQ5AM.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-JLZQWFV3.js +618 -0
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +2018 -3003
- package/dist/index.js +7443 -9102
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +345 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-BNgMdqPF.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +369 -25
- package/dist/wire/index.js +22 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
callLlmJson
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-4S4BM3QQ.js";
|
|
4
4
|
|
|
5
5
|
// src/wire/schemas.ts
|
|
6
6
|
import { extendZodWithOpenApi } from "@asteasolutions/zod-to-openapi";
|
|
@@ -74,9 +74,119 @@ var HealthResponseSchema = z.object({
|
|
|
74
74
|
status: z.literal("ok"),
|
|
75
75
|
uptimeSec: z.number()
|
|
76
76
|
}).openapi("HealthResponse");
|
|
77
|
+
var TraceEventSchema = z.object({
|
|
78
|
+
eventId: z.string().min(1).describe("Stable id for the event. Use ULID or UUID."),
|
|
79
|
+
runId: z.string().min(1).describe("Run this event belongs to."),
|
|
80
|
+
spanId: z.string().optional().describe("Span that emitted the event, if any."),
|
|
81
|
+
kind: z.enum([
|
|
82
|
+
"log",
|
|
83
|
+
"error",
|
|
84
|
+
"budget_decrement",
|
|
85
|
+
"budget_breach",
|
|
86
|
+
"state_mutation",
|
|
87
|
+
"policy_violation",
|
|
88
|
+
"redaction_applied",
|
|
89
|
+
"custom"
|
|
90
|
+
]).describe("Coarse event category \u2014 matches the TraceSchema v1 EventKind enum."),
|
|
91
|
+
timestamp: z.number().int().nonnegative().describe("Unix millis. Must be monotonically non-decreasing within a span."),
|
|
92
|
+
payload: z.record(z.string(), z.unknown()).describe("Free-form payload \u2014 the runtime owns the shape.")
|
|
93
|
+
}).openapi("TraceEvent");
|
|
94
|
+
var TracesIngestRequestSchema = z.object({
|
|
95
|
+
events: z.array(TraceEventSchema).min(1).max(1e4).describe("Batch of events. Max 10k per call \u2014 bigger streams should be chunked.")
|
|
96
|
+
}).openapi("TracesIngestRequest");
|
|
97
|
+
var TracesIngestResponseSchema = z.object({
|
|
98
|
+
accepted: z.number().int().nonnegative().describe("Number of events persisted."),
|
|
99
|
+
rejected: z.number().int().nonnegative().describe("Number of events the store refused \u2014 see `errors[]` for reasons."),
|
|
100
|
+
errors: z.array(
|
|
101
|
+
z.object({
|
|
102
|
+
eventId: z.string().describe("Event id this error applies to."),
|
|
103
|
+
message: z.string().describe("Why the event was rejected.")
|
|
104
|
+
})
|
|
105
|
+
).default([])
|
|
106
|
+
}).openapi("TracesIngestResponse");
|
|
107
|
+
var FeedbackLabelSchema = z.object({
|
|
108
|
+
id: z.string().optional(),
|
|
109
|
+
source: z.enum(["user", "judge", "environment", "metric", "policy", "system"]),
|
|
110
|
+
kind: z.enum([
|
|
111
|
+
"approve",
|
|
112
|
+
"reject",
|
|
113
|
+
"select",
|
|
114
|
+
"edit",
|
|
115
|
+
"rank",
|
|
116
|
+
"rate",
|
|
117
|
+
"comment",
|
|
118
|
+
"metric_outcome",
|
|
119
|
+
"policy_block",
|
|
120
|
+
"revision_request"
|
|
121
|
+
]),
|
|
122
|
+
value: z.unknown(),
|
|
123
|
+
reason: z.string().optional(),
|
|
124
|
+
severity: z.enum(["info", "warning", "error", "critical"]).optional(),
|
|
125
|
+
createdAt: z.string().describe("ISO-8601 UTC."),
|
|
126
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
127
|
+
}).openapi("FeedbackLabel");
|
|
128
|
+
var FeedbackAttemptSchema = z.object({
|
|
129
|
+
id: z.string().min(1),
|
|
130
|
+
stepIndex: z.number().int().nonnegative(),
|
|
131
|
+
artifactType: z.enum([
|
|
132
|
+
"text",
|
|
133
|
+
"code",
|
|
134
|
+
"plan",
|
|
135
|
+
"research",
|
|
136
|
+
"action",
|
|
137
|
+
"ui",
|
|
138
|
+
"decision",
|
|
139
|
+
"data",
|
|
140
|
+
"other"
|
|
141
|
+
]),
|
|
142
|
+
artifact: z.unknown(),
|
|
143
|
+
options: z.array(z.unknown()).optional(),
|
|
144
|
+
proposedAction: z.object({
|
|
145
|
+
type: z.string(),
|
|
146
|
+
risk: z.enum(["low", "medium", "high"]).optional(),
|
|
147
|
+
costUsd: z.number().optional(),
|
|
148
|
+
externalSideEffect: z.boolean().optional(),
|
|
149
|
+
requiresApproval: z.boolean().optional(),
|
|
150
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
151
|
+
}).optional(),
|
|
152
|
+
feedback: z.array(FeedbackLabelSchema).optional(),
|
|
153
|
+
createdAt: z.string(),
|
|
154
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
155
|
+
}).openapi("FeedbackAttempt");
|
|
156
|
+
var FeedbackTrajectorySchema = z.object({
|
|
157
|
+
id: z.string().min(1).describe("Stable id; idempotency key for the trajectory."),
|
|
158
|
+
projectId: z.string().optional(),
|
|
159
|
+
scenarioId: z.string().optional(),
|
|
160
|
+
task: z.object({
|
|
161
|
+
intent: z.string().min(1),
|
|
162
|
+
context: z.unknown().optional()
|
|
163
|
+
}),
|
|
164
|
+
attempts: z.array(FeedbackAttemptSchema).default([]),
|
|
165
|
+
labels: z.array(FeedbackLabelSchema).default([]),
|
|
166
|
+
outcome: z.object({
|
|
167
|
+
success: z.boolean().optional(),
|
|
168
|
+
score: z.number().optional(),
|
|
169
|
+
metrics: z.record(z.string(), z.number()).optional(),
|
|
170
|
+
costUsd: z.number().optional(),
|
|
171
|
+
detail: z.string().optional(),
|
|
172
|
+
observedAt: z.string().optional(),
|
|
173
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
174
|
+
}).optional(),
|
|
175
|
+
split: z.enum(["train", "dev", "test", "holdout"]).optional(),
|
|
176
|
+
tags: z.record(z.string(), z.string()).optional(),
|
|
177
|
+
createdAt: z.string().describe("ISO-8601 UTC."),
|
|
178
|
+
updatedAt: z.string().optional(),
|
|
179
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
180
|
+
}).openapi("FeedbackTrajectory");
|
|
181
|
+
var FeedbackIngestResponseSchema = z.object({
|
|
182
|
+
id: z.string().describe("Trajectory id that was persisted."),
|
|
183
|
+
persisted: z.boolean().describe("True when the trajectory was saved (idempotent on id).")
|
|
184
|
+
}).openapi("FeedbackIngestResponse");
|
|
77
185
|
var ErrorResponseSchema = z.object({
|
|
78
186
|
error: z.object({
|
|
79
|
-
code: z.string().describe(
|
|
187
|
+
code: z.string().describe(
|
|
188
|
+
'Machine-readable code: "validation_error", "rubric_not_found", "judge_error".'
|
|
189
|
+
),
|
|
80
190
|
message: z.string().describe("Human-readable message."),
|
|
81
191
|
details: z.unknown().optional().describe("Optional structured detail.")
|
|
82
192
|
}).describe("Errors are always wrapped in this shape across all endpoints.")
|
|
@@ -247,7 +357,12 @@ function validateJudgeOutput(value, rubric) {
|
|
|
247
357
|
for (const dim of rubric.dimensions) {
|
|
248
358
|
const score = dimensionRecord[dim.id];
|
|
249
359
|
if (typeof score !== "number" || !Number.isFinite(score) || score < dim.min || score > dim.max) {
|
|
250
|
-
throw new WireError(
|
|
360
|
+
throw new WireError(
|
|
361
|
+
"judge_error",
|
|
362
|
+
`Judge returned invalid score for dimension "${dim.id}".`,
|
|
363
|
+
500,
|
|
364
|
+
value
|
|
365
|
+
);
|
|
251
366
|
}
|
|
252
367
|
dimensions[dim.id] = score;
|
|
253
368
|
}
|
|
@@ -268,7 +383,12 @@ function validateIdArray(raw, allowed, field, original) {
|
|
|
268
383
|
const out = [];
|
|
269
384
|
for (const item of raw) {
|
|
270
385
|
if (typeof item !== "string" || !allowed.has(item)) {
|
|
271
|
-
throw new WireError(
|
|
386
|
+
throw new WireError(
|
|
387
|
+
"judge_error",
|
|
388
|
+
`Judge returned unknown ${field} id "${String(item)}".`,
|
|
389
|
+
500,
|
|
390
|
+
original
|
|
391
|
+
);
|
|
272
392
|
}
|
|
273
393
|
out.push(item);
|
|
274
394
|
}
|
|
@@ -366,12 +486,46 @@ function handleVersion() {
|
|
|
366
486
|
package: "@tangle-network/agent-eval",
|
|
367
487
|
version: readPackageVersion(),
|
|
368
488
|
wireVersion: WIRE_VERSION,
|
|
369
|
-
apiSurface: ["judge", "listRubrics", "version"]
|
|
489
|
+
apiSurface: ["judge", "listRubrics", "version", "feedback.ingest", "traces.ingest"]
|
|
370
490
|
};
|
|
371
491
|
}
|
|
492
|
+
async function handleTracesIngest(req, stores) {
|
|
493
|
+
if (!stores.traceStore) {
|
|
494
|
+
throw new WireError(
|
|
495
|
+
"service_unavailable",
|
|
496
|
+
"No trace store configured on this server. Pass `traceStore` to `createApp`.",
|
|
497
|
+
503
|
|
498
|
+
);
|
|
499
|
+
}
|
|
500
|
+
const errors = [];
|
|
501
|
+
let accepted = 0;
|
|
502
|
+
for (const event of req.events) {
|
|
503
|
+
try {
|
|
504
|
+
await stores.traceStore.appendEvent(event);
|
|
505
|
+
accepted++;
|
|
506
|
+
} catch (err) {
|
|
507
|
+
errors.push({
|
|
508
|
+
eventId: event.eventId,
|
|
509
|
+
message: err instanceof Error ? err.message : String(err)
|
|
510
|
+
});
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
return { accepted, rejected: errors.length, errors };
|
|
514
|
+
}
|
|
515
|
+
async function handleFeedbackIngest(req, stores) {
|
|
516
|
+
if (!stores.feedbackStore) {
|
|
517
|
+
throw new WireError(
|
|
518
|
+
"service_unavailable",
|
|
519
|
+
"No feedback store configured on this server. Pass `feedbackStore` to `createApp`.",
|
|
520
|
+
503
|
|
521
|
+
);
|
|
522
|
+
}
|
|
523
|
+
await stores.feedbackStore.save(req);
|
|
524
|
+
return { id: req.id, persisted: true };
|
|
525
|
+
}
|
|
372
526
|
|
|
373
527
|
// src/wire/openapi.ts
|
|
374
|
-
import {
|
|
528
|
+
import { OpenAPIRegistry, OpenApiGeneratorV31 } from "@asteasolutions/zod-to-openapi";
|
|
375
529
|
function buildOpenApi(packageVersion) {
|
|
376
530
|
const registry = new OpenAPIRegistry();
|
|
377
531
|
registry.register("JudgeRequest", JudgeRequestSchema);
|
|
@@ -380,6 +534,10 @@ function buildOpenApi(packageVersion) {
|
|
|
380
534
|
registry.register("VersionResponse", VersionResponseSchema);
|
|
381
535
|
registry.register("HealthResponse", HealthResponseSchema);
|
|
382
536
|
registry.register("ErrorResponse", ErrorResponseSchema);
|
|
537
|
+
registry.register("TracesIngestRequest", TracesIngestRequestSchema);
|
|
538
|
+
registry.register("TracesIngestResponse", TracesIngestResponseSchema);
|
|
539
|
+
registry.register("FeedbackTrajectory", FeedbackTrajectorySchema);
|
|
540
|
+
registry.register("FeedbackIngestResponse", FeedbackIngestResponseSchema);
|
|
383
541
|
registry.registerPath({
|
|
384
542
|
method: "post",
|
|
385
543
|
path: "/v1/judge",
|
|
@@ -446,6 +604,69 @@ function buildOpenApi(packageVersion) {
|
|
|
446
604
|
}
|
|
447
605
|
}
|
|
448
606
|
});
|
|
607
|
+
registry.registerPath({
|
|
608
|
+
method: "post",
|
|
609
|
+
path: "/v1/traces/ingest",
|
|
610
|
+
summary: "Ingest a batch of production TraceEvents",
|
|
611
|
+
description: "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
|
|
612
|
+
request: {
|
|
613
|
+
body: {
|
|
614
|
+
content: {
|
|
615
|
+
"application/json": { schema: TracesIngestRequestSchema },
|
|
616
|
+
"application/x-ndjson": { schema: TracesIngestRequestSchema }
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
},
|
|
620
|
+
responses: {
|
|
621
|
+
200: {
|
|
622
|
+
description: "Ingestion summary",
|
|
623
|
+
content: { "application/json": { schema: TracesIngestResponseSchema } }
|
|
624
|
+
},
|
|
625
|
+
400: {
|
|
626
|
+
description: "Validation error",
|
|
627
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
628
|
+
},
|
|
629
|
+
401: {
|
|
630
|
+
description: "Unauthorized (when bearer auth is configured)",
|
|
631
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
632
|
+
},
|
|
633
|
+
503: {
|
|
634
|
+
description: "No trace store configured",
|
|
635
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
});
|
|
639
|
+
registry.registerPath({
|
|
640
|
+
method: "post",
|
|
641
|
+
path: "/v1/feedback",
|
|
642
|
+
summary: "Ingest a FeedbackTrajectory from production",
|
|
643
|
+
description: "Persist a single FeedbackTrajectory. Idempotent on trajectory.id \u2014 re-posting replaces the prior record. Used by production runtimes to forward user \u{1F44D}/\u{1F44E}/edits into the eval substrate.",
|
|
644
|
+
request: {
|
|
645
|
+
body: {
|
|
646
|
+
content: {
|
|
647
|
+
"application/json": { schema: FeedbackTrajectorySchema }
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
},
|
|
651
|
+
responses: {
|
|
652
|
+
200: {
|
|
653
|
+
description: "Persisted",
|
|
654
|
+
content: { "application/json": { schema: FeedbackIngestResponseSchema } }
|
|
655
|
+
},
|
|
656
|
+
400: {
|
|
657
|
+
description: "Validation error",
|
|
658
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
659
|
+
},
|
|
660
|
+
401: {
|
|
661
|
+
description: "Unauthorized (when bearer auth is configured)",
|
|
662
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
663
|
+
},
|
|
664
|
+
503: {
|
|
665
|
+
description: "No feedback store configured",
|
|
666
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
});
|
|
449
670
|
const generator = new OpenApiGeneratorV31(registry.definitions);
|
|
450
671
|
const doc = generator.generateDocument({
|
|
451
672
|
openapi: "3.1.0",
|
|
@@ -494,62 +715,6 @@ Wire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/res
|
|
|
494
715
|
return doc;
|
|
495
716
|
}
|
|
496
717
|
|
|
497
|
-
// src/wire/server.ts
|
|
498
|
-
import { serve } from "@hono/node-server";
|
|
499
|
-
import { Hono } from "hono";
|
|
500
|
-
import { cors } from "hono/cors";
|
|
501
|
-
var STARTED_AT = Date.now();
|
|
502
|
-
function createApp() {
|
|
503
|
-
const app = new Hono();
|
|
504
|
-
app.use("*", cors());
|
|
505
|
-
app.onError((err, c) => {
|
|
506
|
-
if (err instanceof WireError) {
|
|
507
|
-
return c.json(
|
|
508
|
-
{ error: { code: err.code, message: err.message, details: err.details } },
|
|
509
|
-
err.status
|
|
510
|
-
);
|
|
511
|
-
}
|
|
512
|
-
console.error("[agent-eval] unhandled error:", err);
|
|
513
|
-
return c.json(
|
|
514
|
-
{ error: { code: "internal_error", message: "Internal server error." } },
|
|
515
|
-
500
|
|
516
|
-
);
|
|
517
|
-
});
|
|
518
|
-
app.get(
|
|
519
|
-
"/healthz",
|
|
520
|
-
(c) => c.json({ status: "ok", uptimeSec: (Date.now() - STARTED_AT) / 1e3 })
|
|
521
|
-
);
|
|
522
|
-
app.get("/v1/version", (c) => c.json(handleVersion()));
|
|
523
|
-
app.get("/v1/rubrics", (c) => c.json(handleListRubrics()));
|
|
524
|
-
app.post("/v1/judge", async (c) => {
|
|
525
|
-
const raw = await c.req.json().catch(() => null);
|
|
526
|
-
if (raw == null) {
|
|
527
|
-
throw new WireError("validation_error", "Request body must be JSON.", 400);
|
|
528
|
-
}
|
|
529
|
-
const parsed = JudgeRequestSchema.safeParse(raw);
|
|
530
|
-
if (!parsed.success) {
|
|
531
|
-
throw new WireError(
|
|
532
|
-
"validation_error",
|
|
533
|
-
"Request did not match JudgeRequest schema.",
|
|
534
|
-
400,
|
|
535
|
-
parsed.error.issues
|
|
536
|
-
);
|
|
537
|
-
}
|
|
538
|
-
const result = await handleJudge(parsed.data);
|
|
539
|
-
return c.json(result);
|
|
540
|
-
});
|
|
541
|
-
app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
|
|
542
|
-
return app;
|
|
543
|
-
}
|
|
544
|
-
function startServer(opts = {}) {
|
|
545
|
-
const app = createApp();
|
|
546
|
-
const port = opts.port ?? 5005;
|
|
547
|
-
const host = opts.host ?? "127.0.0.1";
|
|
548
|
-
return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
|
|
549
|
-
console.log(`[agent-eval] serving on http://${address}:${actualPort}`);
|
|
550
|
-
});
|
|
551
|
-
}
|
|
552
|
-
|
|
553
718
|
// src/wire/rpc.ts
|
|
554
719
|
async function dispatchRpc(req) {
|
|
555
720
|
try {
|
|
@@ -602,17 +767,19 @@ async function runRpcOnce(method) {
|
|
|
602
767
|
req = method ? { method, params: body } : body;
|
|
603
768
|
} catch (err) {
|
|
604
769
|
process.stdout.write(
|
|
605
|
-
JSON.stringify({
|
|
770
|
+
`${JSON.stringify({
|
|
606
771
|
error: {
|
|
607
772
|
code: "parse_error",
|
|
608
773
|
message: `stdin was not valid JSON: ${err instanceof Error ? err.message : String(err)}`
|
|
609
774
|
}
|
|
610
|
-
})
|
|
775
|
+
})}
|
|
776
|
+
`
|
|
611
777
|
);
|
|
612
778
|
return 1;
|
|
613
779
|
}
|
|
614
780
|
const out = await dispatchRpc(req);
|
|
615
|
-
process.stdout.write(JSON.stringify(out)
|
|
781
|
+
process.stdout.write(`${JSON.stringify(out)}
|
|
782
|
+
`);
|
|
616
783
|
return "error" in out ? 1 : 0;
|
|
617
784
|
}
|
|
618
785
|
async function runRpcBatch(method) {
|
|
@@ -626,23 +793,151 @@ async function runRpcBatch(method) {
|
|
|
626
793
|
req = method ? { method, params: body } : body;
|
|
627
794
|
} catch (err) {
|
|
628
795
|
process.stdout.write(
|
|
629
|
-
JSON.stringify({
|
|
796
|
+
`${JSON.stringify({
|
|
630
797
|
error: {
|
|
631
798
|
code: "parse_error",
|
|
632
799
|
message: `line was not valid JSON: ${err instanceof Error ? err.message : String(err)}`
|
|
633
800
|
}
|
|
634
|
-
})
|
|
801
|
+
})}
|
|
802
|
+
`
|
|
635
803
|
);
|
|
636
804
|
exitCode = 1;
|
|
637
805
|
continue;
|
|
638
806
|
}
|
|
639
807
|
const out = await dispatchRpc(req);
|
|
640
|
-
process.stdout.write(JSON.stringify(out)
|
|
808
|
+
process.stdout.write(`${JSON.stringify(out)}
|
|
809
|
+
`);
|
|
641
810
|
if ("error" in out) exitCode = 1;
|
|
642
811
|
}
|
|
643
812
|
return exitCode;
|
|
644
813
|
}
|
|
645
814
|
|
|
815
|
+
// src/wire/server.ts
|
|
816
|
+
import { serve } from "@hono/node-server";
|
|
817
|
+
import { Hono } from "hono";
|
|
818
|
+
import { cors } from "hono/cors";
|
|
819
|
+
var STARTED_AT = Date.now();
|
|
820
|
+
var AUTH_EXEMPT_PATHS = /* @__PURE__ */ new Set(["/healthz", "/v1/version", "/openapi.json"]);
|
|
821
|
+
function createApp(opts = {}) {
|
|
822
|
+
const app = new Hono();
|
|
823
|
+
app.use("*", cors());
|
|
824
|
+
if (opts.auth) {
|
|
825
|
+
const verify = opts.auth.bearer;
|
|
826
|
+
app.use("*", async (c, next) => {
|
|
827
|
+
const path = new URL(c.req.url).pathname;
|
|
828
|
+
if (AUTH_EXEMPT_PATHS.has(path)) return next();
|
|
829
|
+
const raw = c.req.header("authorization") ?? "";
|
|
830
|
+
const match = raw.match(/^Bearer\s+(.+)$/i);
|
|
831
|
+
if (!match) {
|
|
832
|
+
throw new WireError("unauthorized", "Missing or malformed Authorization header.", 401);
|
|
833
|
+
}
|
|
834
|
+
const token = match[1];
|
|
835
|
+
const ok = typeof verify === "string" ? token === verify : await verify(token);
|
|
836
|
+
if (!ok) {
|
|
837
|
+
throw new WireError("unauthorized", "Invalid bearer token.", 401);
|
|
838
|
+
}
|
|
839
|
+
return next();
|
|
840
|
+
});
|
|
841
|
+
}
|
|
842
|
+
app.onError((err, c) => {
|
|
843
|
+
if (err instanceof WireError) {
|
|
844
|
+
const status = err.status;
|
|
845
|
+
return c.json(
|
|
846
|
+
{ error: { code: err.code, message: err.message, details: err.details } },
|
|
847
|
+
status
|
|
848
|
+
);
|
|
849
|
+
}
|
|
850
|
+
console.error("[agent-eval] unhandled error:", err);
|
|
851
|
+
return c.json({ error: { code: "internal_error", message: "Internal server error." } }, 500);
|
|
852
|
+
});
|
|
853
|
+
app.get(
|
|
854
|
+
"/healthz",
|
|
855
|
+
(c) => c.json({ status: "ok", uptimeSec: (Date.now() - STARTED_AT) / 1e3 })
|
|
856
|
+
);
|
|
857
|
+
app.get("/v1/version", (c) => c.json(handleVersion()));
|
|
858
|
+
app.get("/v1/rubrics", (c) => c.json(handleListRubrics()));
|
|
859
|
+
app.post("/v1/judge", async (c) => {
|
|
860
|
+
const raw = await c.req.json().catch(() => null);
|
|
861
|
+
if (raw == null) {
|
|
862
|
+
throw new WireError("validation_error", "Request body must be JSON.", 400);
|
|
863
|
+
}
|
|
864
|
+
const parsed = JudgeRequestSchema.safeParse(raw);
|
|
865
|
+
if (!parsed.success) {
|
|
866
|
+
throw new WireError(
|
|
867
|
+
"validation_error",
|
|
868
|
+
"Request did not match JudgeRequest schema.",
|
|
869
|
+
400,
|
|
870
|
+
parsed.error.issues
|
|
871
|
+
);
|
|
872
|
+
}
|
|
873
|
+
const result = await handleJudge(parsed.data);
|
|
874
|
+
return c.json(result);
|
|
875
|
+
});
|
|
876
|
+
app.post("/v1/traces/ingest", async (c) => {
|
|
877
|
+
const contentType = c.req.header("content-type") ?? "";
|
|
878
|
+
let payload;
|
|
879
|
+
if (contentType.includes("application/x-ndjson")) {
|
|
880
|
+
const text = await c.req.text();
|
|
881
|
+
const events = text.split("\n").map((line) => line.trim()).filter((line) => line.length > 0).map((line) => {
|
|
882
|
+
try {
|
|
883
|
+
return JSON.parse(line);
|
|
884
|
+
} catch {
|
|
885
|
+
throw new WireError(
|
|
886
|
+
"validation_error",
|
|
887
|
+
"NDJSON line did not parse as JSON.",
|
|
888
|
+
400,
|
|
889
|
+
line.slice(0, 200)
|
|
890
|
+
);
|
|
891
|
+
}
|
|
892
|
+
});
|
|
893
|
+
payload = { events };
|
|
894
|
+
} else {
|
|
895
|
+
payload = await c.req.json().catch(() => null);
|
|
896
|
+
}
|
|
897
|
+
if (payload == null) {
|
|
898
|
+
throw new WireError("validation_error", "Request body must be JSON or NDJSON.", 400);
|
|
899
|
+
}
|
|
900
|
+
const parsed = TracesIngestRequestSchema.safeParse(payload);
|
|
901
|
+
if (!parsed.success) {
|
|
902
|
+
throw new WireError(
|
|
903
|
+
"validation_error",
|
|
904
|
+
"Request did not match TracesIngestRequest schema.",
|
|
905
|
+
400,
|
|
906
|
+
parsed.error.issues
|
|
907
|
+
);
|
|
908
|
+
}
|
|
909
|
+
const result = await handleTracesIngest(parsed.data, opts.stores ?? {});
|
|
910
|
+
return c.json(result);
|
|
911
|
+
});
|
|
912
|
+
app.post("/v1/feedback", async (c) => {
|
|
913
|
+
const raw = await c.req.json().catch(() => null);
|
|
914
|
+
if (raw == null) {
|
|
915
|
+
throw new WireError("validation_error", "Request body must be JSON.", 400);
|
|
916
|
+
}
|
|
917
|
+
const parsed = FeedbackTrajectorySchema.safeParse(raw);
|
|
918
|
+
if (!parsed.success) {
|
|
919
|
+
throw new WireError(
|
|
920
|
+
"validation_error",
|
|
921
|
+
"Request did not match FeedbackTrajectory schema.",
|
|
922
|
+
400,
|
|
923
|
+
parsed.error.issues
|
|
924
|
+
);
|
|
925
|
+
}
|
|
926
|
+
const result = await handleFeedbackIngest(parsed.data, opts.stores ?? {});
|
|
927
|
+
return c.json(result);
|
|
928
|
+
});
|
|
929
|
+
app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
|
|
930
|
+
return app;
|
|
931
|
+
}
|
|
932
|
+
function startServer(opts = {}) {
|
|
933
|
+
const app = createApp(opts);
|
|
934
|
+
const port = opts.port ?? 5005;
|
|
935
|
+
const host = opts.host ?? "127.0.0.1";
|
|
936
|
+
return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
|
|
937
|
+
console.log(`[agent-eval] serving on http://${address}:${actualPort}`);
|
|
938
|
+
});
|
|
939
|
+
}
|
|
940
|
+
|
|
646
941
|
export {
|
|
647
942
|
RubricDimensionSchema,
|
|
648
943
|
FailureModeSchema,
|
|
@@ -653,6 +948,13 @@ export {
|
|
|
653
948
|
ListRubricsResponseSchema,
|
|
654
949
|
VersionResponseSchema,
|
|
655
950
|
HealthResponseSchema,
|
|
951
|
+
TraceEventSchema,
|
|
952
|
+
TracesIngestRequestSchema,
|
|
953
|
+
TracesIngestResponseSchema,
|
|
954
|
+
FeedbackLabelSchema,
|
|
955
|
+
FeedbackAttemptSchema,
|
|
956
|
+
FeedbackTrajectorySchema,
|
|
957
|
+
FeedbackIngestResponseSchema,
|
|
656
958
|
ErrorResponseSchema,
|
|
657
959
|
WIRE_VERSION,
|
|
658
960
|
hashRubric,
|
|
@@ -663,11 +965,13 @@ export {
|
|
|
663
965
|
handleJudge,
|
|
664
966
|
handleListRubrics,
|
|
665
967
|
handleVersion,
|
|
968
|
+
handleTracesIngest,
|
|
969
|
+
handleFeedbackIngest,
|
|
666
970
|
buildOpenApi,
|
|
667
|
-
createApp,
|
|
668
|
-
startServer,
|
|
669
971
|
dispatchRpc,
|
|
670
972
|
runRpcOnce,
|
|
671
|
-
runRpcBatch
|
|
973
|
+
runRpcBatch,
|
|
974
|
+
createApp,
|
|
975
|
+
startServer
|
|
672
976
|
};
|
|
673
|
-
//# sourceMappingURL=chunk-
|
|
977
|
+
//# sourceMappingURL=chunk-5LBB5B3Z.js.map
|