@tangle-network/agent-eval 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/README.md +71 -0
- package/dist/{chunk-SY6WAAAD.js → chunk-5LBB5B3Z.js} +296 -5
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-VRJVTXRV.js → chunk-EDUKQ5AM.js} +85 -85
- package/dist/{chunk-VRJVTXRV.js.map → chunk-EDUKQ5AM.js.map} +1 -1
- package/dist/{chunk-OHEPNJQN.js → chunk-JLZQWFV3.js} +65 -1
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.d.ts +311 -11
- package/dist/index.js +695 -2
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +1 -1
- package/dist/pipelines/index.js +3 -67
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{release-report-TDPn1cxq.d.ts → release-report-BNgMdqPF.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-CUOiGcGv.d.ts → researcher-BPT8x_NT.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/{summary-report-BXGs_9V0.d.ts → summary-report-C7VPYEj2.d.ts} +1 -1
- package/dist/wire/index.d.ts +347 -3
- package/dist/wire/index.js +19 -1
- package/package.json +1 -1
- package/dist/chunk-OHEPNJQN.js.map +0 -1
- package/dist/chunk-SY6WAAAD.js.map +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,70 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.25.0 — ProductionLoop primitive: close the eval → prod → eval cycle
|
|
4
|
+
|
|
5
|
+
This release ships the **orchestration layer** that turns the existing
|
|
6
|
+
eval substrate into a continuously-improving production system. Static
|
|
7
|
+
prompts decay; today's regulation flips tomorrow. The pieces to close
|
|
8
|
+
the loop were already in the package (`runMultiShotOptimization`,
|
|
9
|
+
`failureClusterView`, `evaluateReleaseConfidence`, `extractPreferences`,
|
|
10
|
+
`FeedbackTrajectoryStore`, `TraceStore`); this release adds the one
|
|
11
|
+
clean primitive that wires them together end-to-end.
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
|
|
15
|
+
- **`runProductionLoop({ ... })`** (`src/production-loop.ts`,
|
|
16
|
+
`@experimental`) — one call = one cycle. Ingests production traces
|
|
17
|
+
and feedback, clusters failures, runs evolve against the worst
|
|
18
|
+
cluster, gates with `HeldOutGate` + `evaluateReleaseConfidence`
|
|
19
|
+
(fail-closed), and — when wired with an `AutoPrClient` — opens a PR
|
|
20
|
+
with the improved prompt. Idempotent + replayable: same `runId`
|
|
21
|
+
yields the same plan. Cron / GitHub Actions are the consumer's job;
|
|
22
|
+
the primitive doesn't own scheduling.
|
|
23
|
+
|
|
24
|
+
- **`proposeAutomatedPullRequest(client, input)`** + two transports
|
|
25
|
+
(`src/auto-pr.ts`, `@experimental`):
|
|
26
|
+
- `httpGithubClient({ token, ... })` — direct REST against
|
|
27
|
+
`api.github.com`, no extra deps. Idempotent on branch name:
|
|
28
|
+
existing open PRs are returned, not duplicated.
|
|
29
|
+
- `ghCliClient({ ... })` — shells out to `gh` for environments
|
|
30
|
+
where developer auth state is already configured.
|
|
31
|
+
Both validate inputs (no `..` paths, no whitespace branches, no
|
|
32
|
+
duplicate file changes) and surface `ValidationError` / `ConfigError`
|
|
33
|
+
from the typed taxonomy.
|
|
34
|
+
|
|
35
|
+
- **`POST /v1/feedback` + `POST /v1/traces/ingest`** wire endpoints
|
|
36
|
+
(`src/wire/`). Both Zod-validated, both append to the configured
|
|
37
|
+
store (`FeedbackTrajectoryStore` / `TraceStore`). 503 when no store
|
|
38
|
+
is wired (fail loud, not silent). Traces ingest accepts both
|
|
39
|
+
`application/json` (`{events:[...]}`) and `application/x-ndjson` for
|
|
40
|
+
streaming production runtimes. Schemas (`TraceEvent`,
|
|
41
|
+
`FeedbackTrajectory`, `TracesIngestRequest/Response`,
|
|
42
|
+
`FeedbackIngestResponse`) added to `openapi.json` for cross-language
|
|
43
|
+
clients.
|
|
44
|
+
|
|
45
|
+
- **Optional bearer-token auth** on the wire server, configured via
|
|
46
|
+
`createApp({ auth: { bearer: '...' } })` or as a verifier function
|
|
47
|
+
for rotating tokens. `/healthz`, `/v1/version`, and `/openapi.json` remain unprotected
|
|
48
|
+
(regression: never lock monitoring out of the runtime).
|
|
49
|
+
|
|
50
|
+
- **`examples/production-loop/`** — synthetic end-to-end demo wiring
|
|
51
|
+
the loop against in-memory trace + feedback stores and a fake
|
|
52
|
+
auto-PR client. Shows the failure-cluster trigger, the evolve round,
|
|
53
|
+
the gate verdict, and the PR-shaped output without requiring
|
|
54
|
+
credentials or a live model.
|
|
55
|
+
|
|
56
|
+
### Changed
|
|
57
|
+
|
|
58
|
+
- **Wire server** (`createApp(opts)`) now accepts optional
|
|
59
|
+
`IngestionStores` (`{ traceStore?, feedbackStore? }`) and `auth`.
|
|
60
|
+
Existing zero-arg callers continue to work — judge / rubrics /
|
|
61
|
+
version / healthz are unchanged.
|
|
62
|
+
|
|
63
|
+
### Status tags
|
|
64
|
+
|
|
65
|
+
- Every new export is `@experimental` initially. Pin the patch version
|
|
66
|
+
if you depend on it. All other 0.24.0 stability tags are preserved.
|
|
67
|
+
|
|
3
68
|
## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
|
|
4
69
|
|
|
5
70
|
This release is **DX + correctness**. No production behavior moved; consumer
|
package/README.md
CHANGED
|
@@ -88,6 +88,75 @@ await product.storeEvalResult(task.id, result)
|
|
|
88
88
|
Same loop shape in production, replay, benchmark, and optimization. Swap the
|
|
89
89
|
dependencies behind `observe()` and `act()`, never the eval contract.
|
|
90
90
|
|
|
91
|
+
## Production loop — close the eval → prod → eval cycle (0.25.0)
|
|
92
|
+
|
|
93
|
+
Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk
|
|
94
|
+
becomes today's incident. The production agents that win are the ones that
|
|
95
|
+
**continuously re-train against live failure modes**.
|
|
96
|
+
|
|
97
|
+
`runProductionLoop` is the orchestration layer that wires the existing eval
|
|
98
|
+
substrate into a self-improvement cron:
|
|
99
|
+
|
|
100
|
+
```ts
|
|
101
|
+
import {
|
|
102
|
+
runProductionLoop,
|
|
103
|
+
httpGithubClient,
|
|
104
|
+
FileSystemFeedbackTrajectoryStore,
|
|
105
|
+
} from '@tangle-network/agent-eval'
|
|
106
|
+
import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces'
|
|
107
|
+
|
|
108
|
+
const result = await runProductionLoop({
|
|
109
|
+
runId: `weekly-${new Date().toISOString().slice(0, 10)}`,
|
|
110
|
+
target: 'tax-agent',
|
|
111
|
+
|
|
112
|
+
// 1. Where production traces + feedback land. Wire the HTTP ingestion
|
|
113
|
+
// endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your
|
|
114
|
+
// runtime; the same store reads them here.
|
|
115
|
+
traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }),
|
|
116
|
+
feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }),
|
|
117
|
+
|
|
118
|
+
// 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus.
|
|
119
|
+
cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
|
|
120
|
+
|
|
121
|
+
// 3. Evolve: seed = current prompt, gate against holdout scenarios.
|
|
122
|
+
evolve: {
|
|
123
|
+
baselinePrompt: currentSystemPrompt,
|
|
124
|
+
holdoutScenarios: productionShapeScenarios,
|
|
125
|
+
runner, // your agent driver
|
|
126
|
+
scorer, // calibrated judge or rubric
|
|
127
|
+
mutator, // GEPA-style or addendum-style mutator
|
|
128
|
+
gate: {
|
|
129
|
+
baselineKey: 'baseline',
|
|
130
|
+
minProductiveRuns: 5,
|
|
131
|
+
pairedDeltaThreshold: 0.03, // require Nσ improvement on holdout
|
|
132
|
+
overfitGapThreshold: 0.10,
|
|
133
|
+
},
|
|
134
|
+
},
|
|
135
|
+
|
|
136
|
+
// 4. Ship: when the gate passes, open a PR with the new prompt.
|
|
137
|
+
ship: {
|
|
138
|
+
client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }),
|
|
139
|
+
repo: { owner: 'tangle-network', name: 'tax-agent' },
|
|
140
|
+
branchPrefix: 'eval/auto-improve',
|
|
141
|
+
promptFilePath: 'prompts/tax-agent-system.txt',
|
|
142
|
+
reviewers: ['drew'],
|
|
143
|
+
},
|
|
144
|
+
|
|
145
|
+
cron: { cadence: 'weekly' }, // surface-only; consumer schedules
|
|
146
|
+
})
|
|
147
|
+
|
|
148
|
+
console.log(result.decision) // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ...
|
|
149
|
+
console.log(result.pullRequest?.prUrl) // populated when a PR was opened
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The primitive runs **one cycle**. Schedule it with a `schedule` (cron) trigger, plus `workflow_dispatch` for manual runs, in
|
|
153
|
+
GitHub Actions. It is **idempotent + replayable**: same `runId` → same plan.
|
|
154
|
+
Gate failures are fail-closed — a candidate that beats baseline on search but
|
|
155
|
+
overfits on holdout never lands.
|
|
156
|
+
|
|
157
|
+
Full runnable demo (synthetic traces, no credentials) in
|
|
158
|
+
[`examples/production-loop`](./examples/production-loop/README.md).
|
|
159
|
+
|
|
91
160
|
## Self-improvement loop
|
|
92
161
|
|
|
93
162
|
Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
|
|
@@ -222,6 +291,8 @@ and runtime. See [`examples/`](./examples/).
|
|
|
222
291
|
closed loop — score, reflect, mutate, re-score, repeat.
|
|
223
292
|
- [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
|
|
224
293
|
RunRecord → preferences → trainer (prime-rl) → next campaign.
|
|
294
|
+
- [`examples/production-loop`](./examples/production-loop/README.md):
|
|
295
|
+
ingest prod traces + feedback, cluster failures, evolve, gate, open a PR.
|
|
225
296
|
|
|
226
297
|
## Docs
|
|
227
298
|
|
|
@@ -74,6 +74,114 @@ var HealthResponseSchema = z.object({
|
|
|
74
74
|
status: z.literal("ok"),
|
|
75
75
|
uptimeSec: z.number()
|
|
76
76
|
}).openapi("HealthResponse");
|
|
77
|
+
var TraceEventSchema = z.object({
|
|
78
|
+
eventId: z.string().min(1).describe("Stable id for the event. Use ULID or UUID."),
|
|
79
|
+
runId: z.string().min(1).describe("Run this event belongs to."),
|
|
80
|
+
spanId: z.string().optional().describe("Span that emitted the event, if any."),
|
|
81
|
+
kind: z.enum([
|
|
82
|
+
"log",
|
|
83
|
+
"error",
|
|
84
|
+
"budget_decrement",
|
|
85
|
+
"budget_breach",
|
|
86
|
+
"state_mutation",
|
|
87
|
+
"policy_violation",
|
|
88
|
+
"redaction_applied",
|
|
89
|
+
"custom"
|
|
90
|
+
]).describe("Coarse event category \u2014 matches the TraceSchema v1 EventKind enum."),
|
|
91
|
+
timestamp: z.number().int().nonnegative().describe("Unix millis. Must be monotonically non-decreasing within a span."),
|
|
92
|
+
payload: z.record(z.string(), z.unknown()).describe("Free-form payload \u2014 the runtime owns the shape.")
|
|
93
|
+
}).openapi("TraceEvent");
|
|
94
|
+
var TracesIngestRequestSchema = z.object({
|
|
95
|
+
events: z.array(TraceEventSchema).min(1).max(1e4).describe("Batch of events. Max 10k per call \u2014 bigger streams should be chunked.")
|
|
96
|
+
}).openapi("TracesIngestRequest");
|
|
97
|
+
var TracesIngestResponseSchema = z.object({
|
|
98
|
+
accepted: z.number().int().nonnegative().describe("Number of events persisted."),
|
|
99
|
+
rejected: z.number().int().nonnegative().describe("Number of events the store refused \u2014 see `errors[]` for reasons."),
|
|
100
|
+
errors: z.array(
|
|
101
|
+
z.object({
|
|
102
|
+
eventId: z.string().describe("Event id this error applies to."),
|
|
103
|
+
message: z.string().describe("Why the event was rejected.")
|
|
104
|
+
})
|
|
105
|
+
).default([])
|
|
106
|
+
}).openapi("TracesIngestResponse");
|
|
107
|
+
var FeedbackLabelSchema = z.object({
|
|
108
|
+
id: z.string().optional(),
|
|
109
|
+
source: z.enum(["user", "judge", "environment", "metric", "policy", "system"]),
|
|
110
|
+
kind: z.enum([
|
|
111
|
+
"approve",
|
|
112
|
+
"reject",
|
|
113
|
+
"select",
|
|
114
|
+
"edit",
|
|
115
|
+
"rank",
|
|
116
|
+
"rate",
|
|
117
|
+
"comment",
|
|
118
|
+
"metric_outcome",
|
|
119
|
+
"policy_block",
|
|
120
|
+
"revision_request"
|
|
121
|
+
]),
|
|
122
|
+
value: z.unknown(),
|
|
123
|
+
reason: z.string().optional(),
|
|
124
|
+
severity: z.enum(["info", "warning", "error", "critical"]).optional(),
|
|
125
|
+
createdAt: z.string().describe("ISO-8601 UTC."),
|
|
126
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
127
|
+
}).openapi("FeedbackLabel");
|
|
128
|
+
var FeedbackAttemptSchema = z.object({
|
|
129
|
+
id: z.string().min(1),
|
|
130
|
+
stepIndex: z.number().int().nonnegative(),
|
|
131
|
+
artifactType: z.enum([
|
|
132
|
+
"text",
|
|
133
|
+
"code",
|
|
134
|
+
"plan",
|
|
135
|
+
"research",
|
|
136
|
+
"action",
|
|
137
|
+
"ui",
|
|
138
|
+
"decision",
|
|
139
|
+
"data",
|
|
140
|
+
"other"
|
|
141
|
+
]),
|
|
142
|
+
artifact: z.unknown(),
|
|
143
|
+
options: z.array(z.unknown()).optional(),
|
|
144
|
+
proposedAction: z.object({
|
|
145
|
+
type: z.string(),
|
|
146
|
+
risk: z.enum(["low", "medium", "high"]).optional(),
|
|
147
|
+
costUsd: z.number().optional(),
|
|
148
|
+
externalSideEffect: z.boolean().optional(),
|
|
149
|
+
requiresApproval: z.boolean().optional(),
|
|
150
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
151
|
+
}).optional(),
|
|
152
|
+
feedback: z.array(FeedbackLabelSchema).optional(),
|
|
153
|
+
createdAt: z.string(),
|
|
154
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
155
|
+
}).openapi("FeedbackAttempt");
|
|
156
|
+
var FeedbackTrajectorySchema = z.object({
|
|
157
|
+
id: z.string().min(1).describe("Stable id; idempotency key for the trajectory."),
|
|
158
|
+
projectId: z.string().optional(),
|
|
159
|
+
scenarioId: z.string().optional(),
|
|
160
|
+
task: z.object({
|
|
161
|
+
intent: z.string().min(1),
|
|
162
|
+
context: z.unknown().optional()
|
|
163
|
+
}),
|
|
164
|
+
attempts: z.array(FeedbackAttemptSchema).default([]),
|
|
165
|
+
labels: z.array(FeedbackLabelSchema).default([]),
|
|
166
|
+
outcome: z.object({
|
|
167
|
+
success: z.boolean().optional(),
|
|
168
|
+
score: z.number().optional(),
|
|
169
|
+
metrics: z.record(z.string(), z.number()).optional(),
|
|
170
|
+
costUsd: z.number().optional(),
|
|
171
|
+
detail: z.string().optional(),
|
|
172
|
+
observedAt: z.string().optional(),
|
|
173
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
174
|
+
}).optional(),
|
|
175
|
+
split: z.enum(["train", "dev", "test", "holdout"]).optional(),
|
|
176
|
+
tags: z.record(z.string(), z.string()).optional(),
|
|
177
|
+
createdAt: z.string().describe("ISO-8601 UTC."),
|
|
178
|
+
updatedAt: z.string().optional(),
|
|
179
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
180
|
+
}).openapi("FeedbackTrajectory");
|
|
181
|
+
var FeedbackIngestResponseSchema = z.object({
|
|
182
|
+
id: z.string().describe("Trajectory id that was persisted."),
|
|
183
|
+
persisted: z.boolean().describe("True when the trajectory was saved (idempotent on id).")
|
|
184
|
+
}).openapi("FeedbackIngestResponse");
|
|
77
185
|
var ErrorResponseSchema = z.object({
|
|
78
186
|
error: z.object({
|
|
79
187
|
code: z.string().describe(
|
|
@@ -378,9 +486,43 @@ function handleVersion() {
|
|
|
378
486
|
package: "@tangle-network/agent-eval",
|
|
379
487
|
version: readPackageVersion(),
|
|
380
488
|
wireVersion: WIRE_VERSION,
|
|
381
|
-
apiSurface: ["judge", "listRubrics", "version"]
|
|
489
|
+
apiSurface: ["judge", "listRubrics", "version", "feedback.ingest", "traces.ingest"]
|
|
382
490
|
};
|
|
383
491
|
}
|
|
492
|
+
/**
 * Append a batch of trace events to the configured trace store.
 *
 * Events are appended one at a time, in request order. A failure on one event
 * is recorded in `errors` and does not stop the remainder of the batch.
 *
 * @param {{ events: Array<{ eventId: string }> }} req - Ingest request; the
 *   HTTP route passes the output of `TracesIngestRequestSchema` here.
 * @param {{ traceStore?: { appendEvent: Function } } | null} [stores] -
 *   Ingestion stores. May be omitted; a missing `traceStore` is reported as
 *   a 503 rather than crashing on a property read.
 * @returns {Promise<{ accepted: number, rejected: number, errors: Array<{ eventId: string, message: string }> }>}
 *   Counts of appended and refused events, plus one entry per refused event.
 * @throws {WireError} `service_unavailable` (503) when no trace store is configured.
 */
async function handleTracesIngest(req, stores) {
  // Null-safe read: this handler is exported, so direct callers may omit `stores`.
  const traceStore = stores?.traceStore;
  if (!traceStore) {
    throw new WireError(
      "service_unavailable",
      "No trace store configured on this server. Pass `traceStore` to `createApp`.",
      503
    );
  }
  const errors = [];
  let accepted = 0;
  // Sequential on purpose: events reach the store in the order they were sent.
  for (const event of req.events) {
    try {
      await traceStore.appendEvent(event);
      accepted++;
    } catch (err) {
      errors.push({
        eventId: event.eventId,
        message: err instanceof Error ? err.message : String(err)
      });
    }
  }
  return { accepted, rejected: errors.length, errors };
}
|
|
515
|
+
/**
 * Persist a single feedback trajectory to the configured feedback store.
 *
 * @param {{ id: string }} req - Trajectory to save; the HTTP route passes the
 *   output of `FeedbackTrajectorySchema` here.
 * @param {{ feedbackStore?: { save: Function } } | null} [stores] -
 *   Ingestion stores. May be omitted; a missing `feedbackStore` is reported
 *   as a 503 rather than crashing on a property read.
 * @returns {Promise<{ id: string, persisted: boolean }>} The trajectory id and
 *   `persisted: true` once `save` has resolved.
 * @throws {WireError} `service_unavailable` (503) when no feedback store is configured.
 */
async function handleFeedbackIngest(req, stores) {
  // Null-safe read: this handler is exported, so direct callers may omit `stores`.
  const feedbackStore = stores?.feedbackStore;
  if (!feedbackStore) {
    throw new WireError(
      "service_unavailable",
      "No feedback store configured on this server. Pass `feedbackStore` to `createApp`.",
      503
    );
  }
  // Errors from the store propagate to the caller unchanged.
  await feedbackStore.save(req);
  return { id: req.id, persisted: true };
}
|
|
384
526
|
|
|
385
527
|
// src/wire/openapi.ts
|
|
386
528
|
import { OpenAPIRegistry, OpenApiGeneratorV31 } from "@asteasolutions/zod-to-openapi";
|
|
@@ -392,6 +534,10 @@ function buildOpenApi(packageVersion) {
|
|
|
392
534
|
registry.register("VersionResponse", VersionResponseSchema);
|
|
393
535
|
registry.register("HealthResponse", HealthResponseSchema);
|
|
394
536
|
registry.register("ErrorResponse", ErrorResponseSchema);
|
|
537
|
+
registry.register("TracesIngestRequest", TracesIngestRequestSchema);
|
|
538
|
+
registry.register("TracesIngestResponse", TracesIngestResponseSchema);
|
|
539
|
+
registry.register("FeedbackTrajectory", FeedbackTrajectorySchema);
|
|
540
|
+
registry.register("FeedbackIngestResponse", FeedbackIngestResponseSchema);
|
|
395
541
|
registry.registerPath({
|
|
396
542
|
method: "post",
|
|
397
543
|
path: "/v1/judge",
|
|
@@ -458,6 +604,69 @@ function buildOpenApi(packageVersion) {
|
|
|
458
604
|
}
|
|
459
605
|
}
|
|
460
606
|
});
|
|
607
|
+
registry.registerPath({
|
|
608
|
+
method: "post",
|
|
609
|
+
path: "/v1/traces/ingest",
|
|
610
|
+
summary: "Ingest a batch of production TraceEvents",
|
|
611
|
+
description: "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
|
|
612
|
+
request: {
|
|
613
|
+
body: {
|
|
614
|
+
content: {
|
|
615
|
+
"application/json": { schema: TracesIngestRequestSchema },
|
|
616
|
+
"application/x-ndjson": { schema: TracesIngestRequestSchema }
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
},
|
|
620
|
+
responses: {
|
|
621
|
+
200: {
|
|
622
|
+
description: "Ingestion summary",
|
|
623
|
+
content: { "application/json": { schema: TracesIngestResponseSchema } }
|
|
624
|
+
},
|
|
625
|
+
400: {
|
|
626
|
+
description: "Validation error",
|
|
627
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
628
|
+
},
|
|
629
|
+
401: {
|
|
630
|
+
description: "Unauthorized (when bearer auth is configured)",
|
|
631
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
632
|
+
},
|
|
633
|
+
503: {
|
|
634
|
+
description: "No trace store configured",
|
|
635
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
});
|
|
639
|
+
registry.registerPath({
|
|
640
|
+
method: "post",
|
|
641
|
+
path: "/v1/feedback",
|
|
642
|
+
summary: "Ingest a FeedbackTrajectory from production",
|
|
643
|
+
description: "Persist a single FeedbackTrajectory. Idempotent on trajectory.id \u2014 re-posting replaces the prior record. Used by production runtimes to forward user \u{1F44D}/\u{1F44E}/edits into the eval substrate.",
|
|
644
|
+
request: {
|
|
645
|
+
body: {
|
|
646
|
+
content: {
|
|
647
|
+
"application/json": { schema: FeedbackTrajectorySchema }
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
},
|
|
651
|
+
responses: {
|
|
652
|
+
200: {
|
|
653
|
+
description: "Persisted",
|
|
654
|
+
content: { "application/json": { schema: FeedbackIngestResponseSchema } }
|
|
655
|
+
},
|
|
656
|
+
400: {
|
|
657
|
+
description: "Validation error",
|
|
658
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
659
|
+
},
|
|
660
|
+
401: {
|
|
661
|
+
description: "Unauthorized (when bearer auth is configured)",
|
|
662
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
663
|
+
},
|
|
664
|
+
503: {
|
|
665
|
+
description: "No feedback store configured",
|
|
666
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
});
|
|
461
670
|
const generator = new OpenApiGeneratorV31(registry.definitions);
|
|
462
671
|
const doc = generator.generateDocument({
|
|
463
672
|
openapi: "3.1.0",
|
|
@@ -608,14 +817,34 @@ import { serve } from "@hono/node-server";
|
|
|
608
817
|
import { Hono } from "hono";
|
|
609
818
|
import { cors } from "hono/cors";
|
|
610
819
|
var STARTED_AT = Date.now();
|
|
611
|
-
|
|
820
|
+
var AUTH_EXEMPT_PATHS = /* @__PURE__ */ new Set(["/healthz", "/v1/version", "/openapi.json"]);
|
|
821
|
+
function createApp(opts = {}) {
|
|
612
822
|
const app = new Hono();
|
|
613
823
|
app.use("*", cors());
|
|
824
|
+
if (opts.auth) {
|
|
825
|
+
const verify = opts.auth.bearer;
|
|
826
|
+
app.use("*", async (c, next) => {
|
|
827
|
+
const path = new URL(c.req.url).pathname;
|
|
828
|
+
if (AUTH_EXEMPT_PATHS.has(path)) return next();
|
|
829
|
+
const raw = c.req.header("authorization") ?? "";
|
|
830
|
+
const match = raw.match(/^Bearer\s+(.+)$/i);
|
|
831
|
+
if (!match) {
|
|
832
|
+
throw new WireError("unauthorized", "Missing or malformed Authorization header.", 401);
|
|
833
|
+
}
|
|
834
|
+
const token = match[1];
|
|
835
|
+
const ok = typeof verify === "string" ? token === verify : await verify(token);
|
|
836
|
+
if (!ok) {
|
|
837
|
+
throw new WireError("unauthorized", "Invalid bearer token.", 401);
|
|
838
|
+
}
|
|
839
|
+
return next();
|
|
840
|
+
});
|
|
841
|
+
}
|
|
614
842
|
app.onError((err, c) => {
|
|
615
843
|
if (err instanceof WireError) {
|
|
844
|
+
const status = err.status;
|
|
616
845
|
return c.json(
|
|
617
846
|
{ error: { code: err.code, message: err.message, details: err.details } },
|
|
618
|
-
|
|
847
|
+
status
|
|
619
848
|
);
|
|
620
849
|
}
|
|
621
850
|
console.error("[agent-eval] unhandled error:", err);
|
|
@@ -644,11 +873,64 @@ function createApp() {
|
|
|
644
873
|
const result = await handleJudge(parsed.data);
|
|
645
874
|
return c.json(result);
|
|
646
875
|
});
|
|
876
|
+
app.post("/v1/traces/ingest", async (c) => {
|
|
877
|
+
const contentType = c.req.header("content-type") ?? "";
|
|
878
|
+
let payload;
|
|
879
|
+
if (contentType.includes("application/x-ndjson")) {
|
|
880
|
+
const text = await c.req.text();
|
|
881
|
+
const events = text.split("\n").map((line) => line.trim()).filter((line) => line.length > 0).map((line) => {
|
|
882
|
+
try {
|
|
883
|
+
return JSON.parse(line);
|
|
884
|
+
} catch {
|
|
885
|
+
throw new WireError(
|
|
886
|
+
"validation_error",
|
|
887
|
+
"NDJSON line did not parse as JSON.",
|
|
888
|
+
400,
|
|
889
|
+
line.slice(0, 200)
|
|
890
|
+
);
|
|
891
|
+
}
|
|
892
|
+
});
|
|
893
|
+
payload = { events };
|
|
894
|
+
} else {
|
|
895
|
+
payload = await c.req.json().catch(() => null);
|
|
896
|
+
}
|
|
897
|
+
if (payload == null) {
|
|
898
|
+
throw new WireError("validation_error", "Request body must be JSON or NDJSON.", 400);
|
|
899
|
+
}
|
|
900
|
+
const parsed = TracesIngestRequestSchema.safeParse(payload);
|
|
901
|
+
if (!parsed.success) {
|
|
902
|
+
throw new WireError(
|
|
903
|
+
"validation_error",
|
|
904
|
+
"Request did not match TracesIngestRequest schema.",
|
|
905
|
+
400,
|
|
906
|
+
parsed.error.issues
|
|
907
|
+
);
|
|
908
|
+
}
|
|
909
|
+
const result = await handleTracesIngest(parsed.data, opts.stores ?? {});
|
|
910
|
+
return c.json(result);
|
|
911
|
+
});
|
|
912
|
+
app.post("/v1/feedback", async (c) => {
|
|
913
|
+
const raw = await c.req.json().catch(() => null);
|
|
914
|
+
if (raw == null) {
|
|
915
|
+
throw new WireError("validation_error", "Request body must be JSON.", 400);
|
|
916
|
+
}
|
|
917
|
+
const parsed = FeedbackTrajectorySchema.safeParse(raw);
|
|
918
|
+
if (!parsed.success) {
|
|
919
|
+
throw new WireError(
|
|
920
|
+
"validation_error",
|
|
921
|
+
"Request did not match FeedbackTrajectory schema.",
|
|
922
|
+
400,
|
|
923
|
+
parsed.error.issues
|
|
924
|
+
);
|
|
925
|
+
}
|
|
926
|
+
const result = await handleFeedbackIngest(parsed.data, opts.stores ?? {});
|
|
927
|
+
return c.json(result);
|
|
928
|
+
});
|
|
647
929
|
app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
|
|
648
930
|
return app;
|
|
649
931
|
}
|
|
650
932
|
function startServer(opts = {}) {
|
|
651
|
-
const app = createApp();
|
|
933
|
+
const app = createApp(opts);
|
|
652
934
|
const port = opts.port ?? 5005;
|
|
653
935
|
const host = opts.host ?? "127.0.0.1";
|
|
654
936
|
return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
|
|
@@ -666,6 +948,13 @@ export {
|
|
|
666
948
|
ListRubricsResponseSchema,
|
|
667
949
|
VersionResponseSchema,
|
|
668
950
|
HealthResponseSchema,
|
|
951
|
+
TraceEventSchema,
|
|
952
|
+
TracesIngestRequestSchema,
|
|
953
|
+
TracesIngestResponseSchema,
|
|
954
|
+
FeedbackLabelSchema,
|
|
955
|
+
FeedbackAttemptSchema,
|
|
956
|
+
FeedbackTrajectorySchema,
|
|
957
|
+
FeedbackIngestResponseSchema,
|
|
669
958
|
ErrorResponseSchema,
|
|
670
959
|
WIRE_VERSION,
|
|
671
960
|
hashRubric,
|
|
@@ -676,6 +965,8 @@ export {
|
|
|
676
965
|
handleJudge,
|
|
677
966
|
handleListRubrics,
|
|
678
967
|
handleVersion,
|
|
968
|
+
handleTracesIngest,
|
|
969
|
+
handleFeedbackIngest,
|
|
679
970
|
buildOpenApi,
|
|
680
971
|
dispatchRpc,
|
|
681
972
|
runRpcOnce,
|
|
@@ -683,4 +974,4 @@ export {
|
|
|
683
974
|
createApp,
|
|
684
975
|
startServer
|
|
685
976
|
};
|
|
686
|
-
//# sourceMappingURL=chunk-
|
|
977
|
+
//# sourceMappingURL=chunk-5LBB5B3Z.js.map
|