@tangle-network/agent-eval 0.24.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +138 -0
- package/README.md +72 -0
- package/dist/{chunk-SY6WAAAD.js → chunk-5LBB5B3Z.js} +296 -5
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-OHEPNJQN.js → chunk-JLZQWFV3.js} +65 -1
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/{chunk-VRJVTXRV.js → chunk-WHZMVFUV.js} +85 -85
- package/dist/chunk-WHZMVFUV.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-Oj9fAPPN.d.ts → index-D3iBCjdF.d.ts} +63 -2
- package/dist/index.d.ts +529 -12
- package/dist/index.js +1106 -17
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +1 -1
- package/dist/pipelines/index.js +3 -67
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{release-report-TDPn1cxq.d.ts → release-report-wfUySN5F.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-CUOiGcGv.d.ts → researcher-bGkI7vCl.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/{summary-report-BXGs_9V0.d.ts → summary-report-DZVXOCK_.d.ts} +13 -1
- package/dist/wire/index.d.ts +347 -3
- package/dist/wire/index.js +19 -1
- package/docs/concepts.md +11 -0
- package/package.json +1 -1
- package/dist/chunk-OHEPNJQN.js.map +0 -1
- package/dist/chunk-SY6WAAAD.js.map +0 -1
- package/dist/chunk-VRJVTXRV.js.map +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,143 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.27.0 — 2026-05-17
|
|
4
|
+
|
|
5
|
+
### Substrate reliability — eliminate silent-zero judge corruption
|
|
6
|
+
|
|
7
|
+
Today's tax + gtm evals shipped composites where the judge LLM silently
|
|
8
|
+
aborted (verbose new prompts streamed past the 60s default timeout) and
|
|
9
|
+
the per-trial score collapsed to `0`. The composite formula then weighted
|
|
10
|
+
that zero into the mean, producing a "−27pp tax regression" that was
|
|
11
|
+
actually a measurement-instrument failure, not a prompt regression.
|
|
12
|
+
|
|
13
|
+
This release adds three substrate primitives so consumers can stop
|
|
14
|
+
silent-zeroing their own data:
|
|
15
|
+
|
|
16
|
+
- **`withJudgeRetry(judgeFn, policy)`** — wraps any judge call with retry
|
|
17
|
+
on transient failures (Abort, Timeout, fetch failed, 429/502/503/504),
|
|
18
|
+
optional fallback-model rotation, and a typed outcome (`succeeded`,
|
|
19
|
+
`attempts`, `value`, `error`). Refuses to default to a silent zero.
|
|
20
|
+
- **`aggregateTrialsByMode(trials, { mode })`** — `'exclude-failed'` mode
|
|
21
|
+
drops trials with `judgeSucceeded === false` from the mean so a failed
|
|
22
|
+
judge doesn't corrupt the composite. `'strict-fail'` mode refuses the
|
|
23
|
+
aggregate when any judge failed. `'zero-fill'` preserves the legacy behavior (a failed judge still counts as `0`).
|
|
24
|
+
- **`discoverPersonas(dir, opts)`** — replaces every consumer's hardcoded
|
|
25
|
+
`TRAINING_PERSONA_FILES` constant. New personas on disk are picked up
|
|
26
|
+
automatically; consumers can filter via include/exclude patterns.
|
|
27
|
+
|
|
28
|
+
Additive to `TrialResult`: `judgeSucceeded?`, `judgeAttempts?`, `judgeError?`
|
|
29
|
+
fields. Existing adapters that don't set these continue to work
|
|
30
|
+
unchanged via `'zero-fill'` mode (default for back-compat).
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## 0.26.0 — Continuous-value inter-rater agreement (ICC + weighted κ)
|
|
34
|
+
|
|
35
|
+
The original `calibrateJudge` rounded scores to ints before computing
|
|
36
|
+
Cohen's κ. For fine-grained judges that's lossy — 0.78 vs 0.81 both
|
|
37
|
+
round to "1" and the integer κ pretends they agreed perfectly when they
|
|
38
|
+
actually disagree by 3 percentage points. This release ships principled
|
|
39
|
+
continuous-value agreement metrics so calibration findings become
|
|
40
|
+
quantitative for [0,1]-valued judges.
|
|
41
|
+
|
|
42
|
+
### Added
|
|
43
|
+
|
|
44
|
+
- **`continuousAgreement(scores, opts?)`** (`src/judge-calibration.ts`) —
|
|
45
|
+
inter-rater agreement on continuous scores. Returns:
|
|
46
|
+
- `weightedKappa` — Cohen's κ_w with quadratic (or linear) weights on
|
|
47
|
+
raw scores, no quantisation.
|
|
48
|
+
- `icc` — ICC(2,1), two-way random effects, absolute agreement,
|
|
49
|
+
single rater (Shrout & Fleiss 1979). The principled reliability
|
|
50
|
+
coefficient when judges are a random sample of the judge population.
|
|
51
|
+
- `pearson` / `spearman` — averaged over rater pairs when N ≥ 2 raters.
|
|
52
|
+
- `ci.icc` / `ci.weightedKappa` — bootstrap percentile 95% CIs
|
|
53
|
+
(default `n=1000`, seeded for reproducibility).
|
|
54
|
+
Accepts `scores: number[][]` shaped `[n_items][n_raters]`. Rows with
|
|
55
|
+
non-finite entries are dropped, not coerced.
|
|
56
|
+
|
|
57
|
+
- **`calibrateJudgeContinuous(golden, candidate, opts?)`** — drop-in
|
|
58
|
+
superset of `calibrateJudge`. Preserves every legacy field
|
|
59
|
+
(`n`, `pearson`, `kappa`, `mae`, `worstItems`) and adds
|
|
60
|
+
`weightedKappaContinuous`, `icc`, `spearman`, and `ci`. Use this when
|
|
61
|
+
the judge produces fine-grained [0,1] scores; keep `calibrateJudge`
|
|
62
|
+
for the original integer-quantised report.
|
|
63
|
+
|
|
64
|
+
### Why ICC alongside Pearson and κ
|
|
65
|
+
|
|
66
|
+
ICC(2,1) catches systematic bias that Pearson misses. If judge B scores
|
|
67
|
+
2× judge A, Pearson stays ≈ 1 (linear association is perfect) while ICC
|
|
68
|
+
plummets (absolute agreement is poor). The new tests assert this exact
|
|
69
|
+
failure mode so the regression can't sneak back in.
|
|
70
|
+
|
|
71
|
+
### Unchanged
|
|
72
|
+
|
|
73
|
+
- `calibrateJudge` keeps its original integer-rounded κ semantics for
|
|
74
|
+
backwards compatibility. Nothing else moves.
|
|
75
|
+
|
|
76
|
+
## 0.25.0 — ProductionLoop primitive: close the eval → prod → eval cycle
|
|
77
|
+
|
|
78
|
+
This release ships the **orchestration layer** that turns the existing
|
|
79
|
+
eval substrate into a continuously-improving production system. Static
|
|
80
|
+
prompts decay; today's regulation flips tomorrow. The pieces to close
|
|
81
|
+
the loop were already in the package (`runMultiShotOptimization`,
|
|
82
|
+
`failureClusterView`, `evaluateReleaseConfidence`, `extractPreferences`,
|
|
83
|
+
`FeedbackTrajectoryStore`, `TraceStore`); this release adds the one
|
|
84
|
+
clean primitive that wires them together end-to-end.
|
|
85
|
+
|
|
86
|
+
### Added
|
|
87
|
+
|
|
88
|
+
- **`runProductionLoop({ ... })`** (`src/production-loop.ts`,
|
|
89
|
+
`@experimental`) — one call = one cycle. Ingests production traces
|
|
90
|
+
and feedback, clusters failures, runs evolve against the worst
|
|
91
|
+
cluster, gates with `HeldOutGate` + `evaluateReleaseConfidence`
|
|
92
|
+
(fail-closed), and — when wired with an `AutoPrClient` — opens a PR
|
|
93
|
+
with the improved prompt. Idempotent + replayable: same `runId`
|
|
94
|
+
yields the same plan. Cron / GitHub Actions are the consumer's job;
|
|
95
|
+
the primitive doesn't own scheduling.
|
|
96
|
+
|
|
97
|
+
- **`proposeAutomatedPullRequest(client, input)`** + two transports
|
|
98
|
+
(`src/auto-pr.ts`, `@experimental`):
|
|
99
|
+
- `httpGithubClient({ token, ... })` — direct REST against
|
|
100
|
+
`api.github.com`, no extra deps. Idempotent on branch name:
|
|
101
|
+
existing open PRs are returned, not duplicated.
|
|
102
|
+
- `ghCliClient({ ... })` — shells out to `gh` for environments
|
|
103
|
+
where developer auth state is already configured.
|
|
104
|
+
Both validate inputs (no `..` paths, no whitespace branches, no
|
|
105
|
+
duplicate file changes) and surface `ValidationError` / `ConfigError`
|
|
106
|
+
from the typed taxonomy.
|
|
107
|
+
|
|
108
|
+
- **`POST /v1/feedback` + `POST /v1/traces/ingest`** wire endpoints
|
|
109
|
+
(`src/wire/`). Both Zod-validated, both append to the configured
|
|
110
|
+
store (`FeedbackTrajectoryStore` / `TraceStore`). 503 when no store
|
|
111
|
+
is wired (fail loud, not silent). Traces ingest accepts both
|
|
112
|
+
`application/json` (`{events:[...]}`) and `application/x-ndjson` for
|
|
113
|
+
streaming production runtimes. Schemas (`TraceEvent`,
|
|
114
|
+
`FeedbackTrajectory`, `TracesIngestRequest/Response`,
|
|
115
|
+
`FeedbackIngestResponse`) added to `openapi.json` for cross-language
|
|
116
|
+
clients.
|
|
117
|
+
|
|
118
|
+
- **Optional bearer-token auth** on the wire server, configured via
|
|
119
|
+
`createApp({ auth: { bearer: '...' } })` or as a verifier function
|
|
120
|
+
for rotating tokens. `/healthz`, `/v1/version`, and `/openapi.json` remain unprotected
|
|
121
|
+
(regression: never lock monitoring out of the runtime).
|
|
122
|
+
|
|
123
|
+
- **`examples/production-loop/`** — synthetic end-to-end demo wiring
|
|
124
|
+
the loop against in-memory trace + feedback stores and a fake
|
|
125
|
+
auto-PR client. Shows the failure-cluster trigger, the evolve round,
|
|
126
|
+
the gate verdict, and the PR-shaped output without requiring
|
|
127
|
+
credentials or a live model.
|
|
128
|
+
|
|
129
|
+
### Changed
|
|
130
|
+
|
|
131
|
+
- **Wire server** (`createApp(opts)`) now accepts optional
|
|
132
|
+
`IngestionStores` (`{ traceStore?, feedbackStore? }`) and `auth`.
|
|
133
|
+
Existing zero-arg callers continue to work — judge / rubrics /
|
|
134
|
+
version / healthz are unchanged.
|
|
135
|
+
|
|
136
|
+
### Status tags
|
|
137
|
+
|
|
138
|
+
- Every new export is `@experimental` initially. Pin the patch version
|
|
139
|
+
if you depend on it. All other 0.24.0 stability tags are preserved.
|
|
140
|
+
|
|
3
141
|
## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
|
|
4
142
|
|
|
5
143
|
This release is **DX + correctness**. No production behavior moved; consumer
|
package/README.md
CHANGED
|
@@ -88,6 +88,75 @@ await product.storeEvalResult(task.id, result)
|
|
|
88
88
|
Same loop shape in production, replay, benchmark, and optimization. Swap the
|
|
89
89
|
dependencies behind `observe()` and `act()`, never the eval contract.
|
|
90
90
|
|
|
91
|
+
## Production loop — close the eval → prod → eval cycle (0.25.0)
|
|
92
|
+
|
|
93
|
+
Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk
|
|
94
|
+
becomes today's incident. The production agents that win are the ones that
|
|
95
|
+
**continuously re-train against live failure modes**.
|
|
96
|
+
|
|
97
|
+
`runProductionLoop` is the orchestration layer that wires the existing eval
|
|
98
|
+
substrate into a self-improvement cron:
|
|
99
|
+
|
|
100
|
+
```ts
|
|
101
|
+
import {
|
|
102
|
+
runProductionLoop,
|
|
103
|
+
httpGithubClient,
|
|
104
|
+
FileSystemFeedbackTrajectoryStore,
|
|
105
|
+
} from '@tangle-network/agent-eval'
|
|
106
|
+
import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces'
|
|
107
|
+
|
|
108
|
+
const result = await runProductionLoop({
|
|
109
|
+
runId: `weekly-${new Date().toISOString().slice(0, 10)}`,
|
|
110
|
+
target: 'tax-agent',
|
|
111
|
+
|
|
112
|
+
// 1. Where production traces + feedback land. Wire the HTTP ingestion
|
|
113
|
+
// endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your
|
|
114
|
+
// runtime; the same store reads them here.
|
|
115
|
+
traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }),
|
|
116
|
+
feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }),
|
|
117
|
+
|
|
118
|
+
// 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus.
|
|
119
|
+
cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
|
|
120
|
+
|
|
121
|
+
// 3. Evolve: seed = current prompt, gate against holdout scenarios.
|
|
122
|
+
evolve: {
|
|
123
|
+
baselinePrompt: currentSystemPrompt,
|
|
124
|
+
holdoutScenarios: productionShapeScenarios,
|
|
125
|
+
runner, // your agent driver
|
|
126
|
+
scorer, // calibrated judge or rubric
|
|
127
|
+
mutator, // GEPA-style or addendum-style mutator
|
|
128
|
+
gate: {
|
|
129
|
+
baselineKey: 'baseline',
|
|
130
|
+
minProductiveRuns: 5,
|
|
131
|
+
pairedDeltaThreshold: 0.03, // minimum paired improvement over baseline on holdout
|
|
132
|
+
overfitGapThreshold: 0.10,
|
|
133
|
+
},
|
|
134
|
+
},
|
|
135
|
+
|
|
136
|
+
// 4. Ship: when the gate passes, open a PR with the new prompt.
|
|
137
|
+
ship: {
|
|
138
|
+
client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }),
|
|
139
|
+
repo: { owner: 'tangle-network', name: 'tax-agent' },
|
|
140
|
+
branchPrefix: 'eval/auto-improve',
|
|
141
|
+
promptFilePath: 'prompts/tax-agent-system.txt',
|
|
142
|
+
reviewers: ['drew'],
|
|
143
|
+
},
|
|
144
|
+
|
|
145
|
+
cron: { cadence: 'weekly' }, // surface-only; consumer schedules
|
|
146
|
+
})
|
|
147
|
+
|
|
148
|
+
console.log(result.decision) // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ...
|
|
149
|
+
console.log(result.pullRequest?.prUrl) // populated when a PR was opened
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The primitive runs **one cycle**. Schedule it with `workflow_dispatch` + cron in
|
|
153
|
+
GitHub Actions. It is **idempotent + replayable**: same `runId` → same plan.
|
|
154
|
+
Gate failures are fail-closed — a candidate that beats baseline on search but
|
|
155
|
+
overfits on holdout never lands.
|
|
156
|
+
|
|
157
|
+
Full runnable demo (synthetic traces, no credentials) in
|
|
158
|
+
[`examples/production-loop`](./examples/production-loop/README.md).
|
|
159
|
+
|
|
91
160
|
## Self-improvement loop
|
|
92
161
|
|
|
93
162
|
Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
|
|
@@ -125,6 +194,7 @@ const next = await analyzeOptimizationResult(campaign, { researcher })
|
|
|
125
194
|
| Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` |
|
|
126
195
|
| Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` |
|
|
127
196
|
| Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` |
|
|
197
|
+
| Continuous inter-rater agreement | `calibrateJudgeContinuous`, `continuousAgreement` (κ_w, ICC(2,1), bootstrap CIs) | `/` |
|
|
128
198
|
| (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` |
|
|
129
199
|
| Verifiable reward signal | `extractVerifiableReward` | `/rl` |
|
|
130
200
|
| Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` |
|
|
@@ -222,6 +292,8 @@ and runtime. See [`examples/`](./examples/).
|
|
|
222
292
|
closed loop — score, reflect, mutate, re-score, repeat.
|
|
223
293
|
- [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
|
|
224
294
|
RunRecord → preferences → trainer (prime-rl) → next campaign.
|
|
295
|
+
- [`examples/production-loop`](./examples/production-loop/README.md):
|
|
296
|
+
ingest prod traces + feedback, cluster failures, evolve, gate, open a PR.
|
|
225
297
|
|
|
226
298
|
## Docs
|
|
227
299
|
|
package/dist/{chunk-SY6WAAAD.js → chunk-5LBB5B3Z.js}
CHANGED
|
@@ -74,6 +74,114 @@ var HealthResponseSchema = z.object({
|
|
|
74
74
|
status: z.literal("ok"),
|
|
75
75
|
uptimeSec: z.number()
|
|
76
76
|
}).openapi("HealthResponse");
|
|
77
|
+
var TraceEventSchema = z.object({
|
|
78
|
+
eventId: z.string().min(1).describe("Stable id for the event. Use ULID or UUID."),
|
|
79
|
+
runId: z.string().min(1).describe("Run this event belongs to."),
|
|
80
|
+
spanId: z.string().optional().describe("Span that emitted the event, if any."),
|
|
81
|
+
kind: z.enum([
|
|
82
|
+
"log",
|
|
83
|
+
"error",
|
|
84
|
+
"budget_decrement",
|
|
85
|
+
"budget_breach",
|
|
86
|
+
"state_mutation",
|
|
87
|
+
"policy_violation",
|
|
88
|
+
"redaction_applied",
|
|
89
|
+
"custom"
|
|
90
|
+
]).describe("Coarse event category \u2014 matches the TraceSchema v1 EventKind enum."),
|
|
91
|
+
timestamp: z.number().int().nonnegative().describe("Unix millis. Must be monotonically non-decreasing within a span."),
|
|
92
|
+
payload: z.record(z.string(), z.unknown()).describe("Free-form payload \u2014 the runtime owns the shape.")
|
|
93
|
+
}).openapi("TraceEvent");
|
|
94
|
+
var TracesIngestRequestSchema = z.object({
|
|
95
|
+
events: z.array(TraceEventSchema).min(1).max(1e4).describe("Batch of events. Max 10k per call \u2014 bigger streams should be chunked.")
|
|
96
|
+
}).openapi("TracesIngestRequest");
|
|
97
|
+
var TracesIngestResponseSchema = z.object({
|
|
98
|
+
accepted: z.number().int().nonnegative().describe("Number of events persisted."),
|
|
99
|
+
rejected: z.number().int().nonnegative().describe("Number of events the store refused \u2014 see `errors[]` for reasons."),
|
|
100
|
+
errors: z.array(
|
|
101
|
+
z.object({
|
|
102
|
+
eventId: z.string().describe("Event id this error applies to."),
|
|
103
|
+
message: z.string().describe("Why the event was rejected.")
|
|
104
|
+
})
|
|
105
|
+
).default([])
|
|
106
|
+
}).openapi("TracesIngestResponse");
|
|
107
|
+
var FeedbackLabelSchema = z.object({
|
|
108
|
+
id: z.string().optional(),
|
|
109
|
+
source: z.enum(["user", "judge", "environment", "metric", "policy", "system"]),
|
|
110
|
+
kind: z.enum([
|
|
111
|
+
"approve",
|
|
112
|
+
"reject",
|
|
113
|
+
"select",
|
|
114
|
+
"edit",
|
|
115
|
+
"rank",
|
|
116
|
+
"rate",
|
|
117
|
+
"comment",
|
|
118
|
+
"metric_outcome",
|
|
119
|
+
"policy_block",
|
|
120
|
+
"revision_request"
|
|
121
|
+
]),
|
|
122
|
+
value: z.unknown(),
|
|
123
|
+
reason: z.string().optional(),
|
|
124
|
+
severity: z.enum(["info", "warning", "error", "critical"]).optional(),
|
|
125
|
+
createdAt: z.string().describe("ISO-8601 UTC."),
|
|
126
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
127
|
+
}).openapi("FeedbackLabel");
|
|
128
|
+
var FeedbackAttemptSchema = z.object({
|
|
129
|
+
id: z.string().min(1),
|
|
130
|
+
stepIndex: z.number().int().nonnegative(),
|
|
131
|
+
artifactType: z.enum([
|
|
132
|
+
"text",
|
|
133
|
+
"code",
|
|
134
|
+
"plan",
|
|
135
|
+
"research",
|
|
136
|
+
"action",
|
|
137
|
+
"ui",
|
|
138
|
+
"decision",
|
|
139
|
+
"data",
|
|
140
|
+
"other"
|
|
141
|
+
]),
|
|
142
|
+
artifact: z.unknown(),
|
|
143
|
+
options: z.array(z.unknown()).optional(),
|
|
144
|
+
proposedAction: z.object({
|
|
145
|
+
type: z.string(),
|
|
146
|
+
risk: z.enum(["low", "medium", "high"]).optional(),
|
|
147
|
+
costUsd: z.number().optional(),
|
|
148
|
+
externalSideEffect: z.boolean().optional(),
|
|
149
|
+
requiresApproval: z.boolean().optional(),
|
|
150
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
151
|
+
}).optional(),
|
|
152
|
+
feedback: z.array(FeedbackLabelSchema).optional(),
|
|
153
|
+
createdAt: z.string(),
|
|
154
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
155
|
+
}).openapi("FeedbackAttempt");
|
|
156
|
+
var FeedbackTrajectorySchema = z.object({
|
|
157
|
+
id: z.string().min(1).describe("Stable id; idempotency key for the trajectory."),
|
|
158
|
+
projectId: z.string().optional(),
|
|
159
|
+
scenarioId: z.string().optional(),
|
|
160
|
+
task: z.object({
|
|
161
|
+
intent: z.string().min(1),
|
|
162
|
+
context: z.unknown().optional()
|
|
163
|
+
}),
|
|
164
|
+
attempts: z.array(FeedbackAttemptSchema).default([]),
|
|
165
|
+
labels: z.array(FeedbackLabelSchema).default([]),
|
|
166
|
+
outcome: z.object({
|
|
167
|
+
success: z.boolean().optional(),
|
|
168
|
+
score: z.number().optional(),
|
|
169
|
+
metrics: z.record(z.string(), z.number()).optional(),
|
|
170
|
+
costUsd: z.number().optional(),
|
|
171
|
+
detail: z.string().optional(),
|
|
172
|
+
observedAt: z.string().optional(),
|
|
173
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
174
|
+
}).optional(),
|
|
175
|
+
split: z.enum(["train", "dev", "test", "holdout"]).optional(),
|
|
176
|
+
tags: z.record(z.string(), z.string()).optional(),
|
|
177
|
+
createdAt: z.string().describe("ISO-8601 UTC."),
|
|
178
|
+
updatedAt: z.string().optional(),
|
|
179
|
+
metadata: z.record(z.string(), z.unknown()).optional()
|
|
180
|
+
}).openapi("FeedbackTrajectory");
|
|
181
|
+
var FeedbackIngestResponseSchema = z.object({
|
|
182
|
+
id: z.string().describe("Trajectory id that was persisted."),
|
|
183
|
+
persisted: z.boolean().describe("True when the trajectory was saved (idempotent on id).")
|
|
184
|
+
}).openapi("FeedbackIngestResponse");
|
|
77
185
|
var ErrorResponseSchema = z.object({
|
|
78
186
|
error: z.object({
|
|
79
187
|
code: z.string().describe(
|
|
@@ -378,9 +486,43 @@ function handleVersion() {
|
|
|
378
486
|
package: "@tangle-network/agent-eval",
|
|
379
487
|
version: readPackageVersion(),
|
|
380
488
|
wireVersion: WIRE_VERSION,
|
|
381
|
-
apiSurface: ["judge", "listRubrics", "version"]
|
|
489
|
+
apiSurface: ["judge", "listRubrics", "version", "feedback.ingest", "traces.ingest"]
|
|
382
490
|
};
|
|
383
491
|
}
|
|
492
|
+
async function handleTracesIngest(req, stores) {
|
|
493
|
+
if (!stores.traceStore) {
|
|
494
|
+
throw new WireError(
|
|
495
|
+
"service_unavailable",
|
|
496
|
+
"No trace store configured on this server. Pass `traceStore` to `createApp`.",
|
|
497
|
+
503
|
|
498
|
+
);
|
|
499
|
+
}
|
|
500
|
+
const errors = [];
|
|
501
|
+
let accepted = 0;
|
|
502
|
+
for (const event of req.events) {
|
|
503
|
+
try {
|
|
504
|
+
await stores.traceStore.appendEvent(event);
|
|
505
|
+
accepted++;
|
|
506
|
+
} catch (err) {
|
|
507
|
+
errors.push({
|
|
508
|
+
eventId: event.eventId,
|
|
509
|
+
message: err instanceof Error ? err.message : String(err)
|
|
510
|
+
});
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
return { accepted, rejected: errors.length, errors };
|
|
514
|
+
}
|
|
515
|
+
async function handleFeedbackIngest(req, stores) {
|
|
516
|
+
if (!stores.feedbackStore) {
|
|
517
|
+
throw new WireError(
|
|
518
|
+
"service_unavailable",
|
|
519
|
+
"No feedback store configured on this server. Pass `feedbackStore` to `createApp`.",
|
|
520
|
+
503
|
|
521
|
+
);
|
|
522
|
+
}
|
|
523
|
+
await stores.feedbackStore.save(req);
|
|
524
|
+
return { id: req.id, persisted: true };
|
|
525
|
+
}
|
|
384
526
|
|
|
385
527
|
// src/wire/openapi.ts
|
|
386
528
|
import { OpenAPIRegistry, OpenApiGeneratorV31 } from "@asteasolutions/zod-to-openapi";
|
|
@@ -392,6 +534,10 @@ function buildOpenApi(packageVersion) {
|
|
|
392
534
|
registry.register("VersionResponse", VersionResponseSchema);
|
|
393
535
|
registry.register("HealthResponse", HealthResponseSchema);
|
|
394
536
|
registry.register("ErrorResponse", ErrorResponseSchema);
|
|
537
|
+
registry.register("TracesIngestRequest", TracesIngestRequestSchema);
|
|
538
|
+
registry.register("TracesIngestResponse", TracesIngestResponseSchema);
|
|
539
|
+
registry.register("FeedbackTrajectory", FeedbackTrajectorySchema);
|
|
540
|
+
registry.register("FeedbackIngestResponse", FeedbackIngestResponseSchema);
|
|
395
541
|
registry.registerPath({
|
|
396
542
|
method: "post",
|
|
397
543
|
path: "/v1/judge",
|
|
@@ -458,6 +604,69 @@ function buildOpenApi(packageVersion) {
|
|
|
458
604
|
}
|
|
459
605
|
}
|
|
460
606
|
});
|
|
607
|
+
registry.registerPath({
|
|
608
|
+
method: "post",
|
|
609
|
+
path: "/v1/traces/ingest",
|
|
610
|
+
summary: "Ingest a batch of production TraceEvents",
|
|
611
|
+
description: "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
|
|
612
|
+
request: {
|
|
613
|
+
body: {
|
|
614
|
+
content: {
|
|
615
|
+
"application/json": { schema: TracesIngestRequestSchema },
|
|
616
|
+
"application/x-ndjson": { schema: TracesIngestRequestSchema }
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
},
|
|
620
|
+
responses: {
|
|
621
|
+
200: {
|
|
622
|
+
description: "Ingestion summary",
|
|
623
|
+
content: { "application/json": { schema: TracesIngestResponseSchema } }
|
|
624
|
+
},
|
|
625
|
+
400: {
|
|
626
|
+
description: "Validation error",
|
|
627
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
628
|
+
},
|
|
629
|
+
401: {
|
|
630
|
+
description: "Unauthorized (when bearer auth is configured)",
|
|
631
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
632
|
+
},
|
|
633
|
+
503: {
|
|
634
|
+
description: "No trace store configured",
|
|
635
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
});
|
|
639
|
+
registry.registerPath({
|
|
640
|
+
method: "post",
|
|
641
|
+
path: "/v1/feedback",
|
|
642
|
+
summary: "Ingest a FeedbackTrajectory from production",
|
|
643
|
+
description: "Persist a single FeedbackTrajectory. Idempotent on trajectory.id \u2014 re-posting replaces the prior record. Used by production runtimes to forward user \u{1F44D}/\u{1F44E}/edits into the eval substrate.",
|
|
644
|
+
request: {
|
|
645
|
+
body: {
|
|
646
|
+
content: {
|
|
647
|
+
"application/json": { schema: FeedbackTrajectorySchema }
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
},
|
|
651
|
+
responses: {
|
|
652
|
+
200: {
|
|
653
|
+
description: "Persisted",
|
|
654
|
+
content: { "application/json": { schema: FeedbackIngestResponseSchema } }
|
|
655
|
+
},
|
|
656
|
+
400: {
|
|
657
|
+
description: "Validation error",
|
|
658
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
659
|
+
},
|
|
660
|
+
401: {
|
|
661
|
+
description: "Unauthorized (when bearer auth is configured)",
|
|
662
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
663
|
+
},
|
|
664
|
+
503: {
|
|
665
|
+
description: "No feedback store configured",
|
|
666
|
+
content: { "application/json": { schema: ErrorResponseSchema } }
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
});
|
|
461
670
|
const generator = new OpenApiGeneratorV31(registry.definitions);
|
|
462
671
|
const doc = generator.generateDocument({
|
|
463
672
|
openapi: "3.1.0",
|
|
@@ -608,14 +817,34 @@ import { serve } from "@hono/node-server";
|
|
|
608
817
|
import { Hono } from "hono";
|
|
609
818
|
import { cors } from "hono/cors";
|
|
610
819
|
var STARTED_AT = Date.now();
|
|
611
|
-
|
|
820
|
+
var AUTH_EXEMPT_PATHS = /* @__PURE__ */ new Set(["/healthz", "/v1/version", "/openapi.json"]);
|
|
821
|
+
function createApp(opts = {}) {
|
|
612
822
|
const app = new Hono();
|
|
613
823
|
app.use("*", cors());
|
|
824
|
+
if (opts.auth) {
|
|
825
|
+
const verify = opts.auth.bearer;
|
|
826
|
+
app.use("*", async (c, next) => {
|
|
827
|
+
const path = new URL(c.req.url).pathname;
|
|
828
|
+
if (AUTH_EXEMPT_PATHS.has(path)) return next();
|
|
829
|
+
const raw = c.req.header("authorization") ?? "";
|
|
830
|
+
const match = raw.match(/^Bearer\s+(.+)$/i);
|
|
831
|
+
if (!match) {
|
|
832
|
+
throw new WireError("unauthorized", "Missing or malformed Authorization header.", 401);
|
|
833
|
+
}
|
|
834
|
+
const token = match[1];
|
|
835
|
+
const ok = typeof verify === "string" ? token === verify : await verify(token);
|
|
836
|
+
if (!ok) {
|
|
837
|
+
throw new WireError("unauthorized", "Invalid bearer token.", 401);
|
|
838
|
+
}
|
|
839
|
+
return next();
|
|
840
|
+
});
|
|
841
|
+
}
|
|
614
842
|
app.onError((err, c) => {
|
|
615
843
|
if (err instanceof WireError) {
|
|
844
|
+
const status = err.status;
|
|
616
845
|
return c.json(
|
|
617
846
|
{ error: { code: err.code, message: err.message, details: err.details } },
|
|
618
|
-
|
|
847
|
+
status
|
|
619
848
|
);
|
|
620
849
|
}
|
|
621
850
|
console.error("[agent-eval] unhandled error:", err);
|
|
@@ -644,11 +873,64 @@ function createApp() {
|
|
|
644
873
|
const result = await handleJudge(parsed.data);
|
|
645
874
|
return c.json(result);
|
|
646
875
|
});
|
|
876
|
+
app.post("/v1/traces/ingest", async (c) => {
|
|
877
|
+
const contentType = c.req.header("content-type") ?? "";
|
|
878
|
+
let payload;
|
|
879
|
+
if (contentType.includes("application/x-ndjson")) {
|
|
880
|
+
const text = await c.req.text();
|
|
881
|
+
const events = text.split("\n").map((line) => line.trim()).filter((line) => line.length > 0).map((line) => {
|
|
882
|
+
try {
|
|
883
|
+
return JSON.parse(line);
|
|
884
|
+
} catch {
|
|
885
|
+
throw new WireError(
|
|
886
|
+
"validation_error",
|
|
887
|
+
"NDJSON line did not parse as JSON.",
|
|
888
|
+
400,
|
|
889
|
+
line.slice(0, 200)
|
|
890
|
+
);
|
|
891
|
+
}
|
|
892
|
+
});
|
|
893
|
+
payload = { events };
|
|
894
|
+
} else {
|
|
895
|
+
payload = await c.req.json().catch(() => null);
|
|
896
|
+
}
|
|
897
|
+
if (payload == null) {
|
|
898
|
+
throw new WireError("validation_error", "Request body must be JSON or NDJSON.", 400);
|
|
899
|
+
}
|
|
900
|
+
const parsed = TracesIngestRequestSchema.safeParse(payload);
|
|
901
|
+
if (!parsed.success) {
|
|
902
|
+
throw new WireError(
|
|
903
|
+
"validation_error",
|
|
904
|
+
"Request did not match TracesIngestRequest schema.",
|
|
905
|
+
400,
|
|
906
|
+
parsed.error.issues
|
|
907
|
+
);
|
|
908
|
+
}
|
|
909
|
+
const result = await handleTracesIngest(parsed.data, opts.stores ?? {});
|
|
910
|
+
return c.json(result);
|
|
911
|
+
});
|
|
912
|
+
app.post("/v1/feedback", async (c) => {
|
|
913
|
+
const raw = await c.req.json().catch(() => null);
|
|
914
|
+
if (raw == null) {
|
|
915
|
+
throw new WireError("validation_error", "Request body must be JSON.", 400);
|
|
916
|
+
}
|
|
917
|
+
const parsed = FeedbackTrajectorySchema.safeParse(raw);
|
|
918
|
+
if (!parsed.success) {
|
|
919
|
+
throw new WireError(
|
|
920
|
+
"validation_error",
|
|
921
|
+
"Request did not match FeedbackTrajectory schema.",
|
|
922
|
+
400,
|
|
923
|
+
parsed.error.issues
|
|
924
|
+
);
|
|
925
|
+
}
|
|
926
|
+
const result = await handleFeedbackIngest(parsed.data, opts.stores ?? {});
|
|
927
|
+
return c.json(result);
|
|
928
|
+
});
|
|
647
929
|
app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
|
|
648
930
|
return app;
|
|
649
931
|
}
|
|
650
932
|
function startServer(opts = {}) {
|
|
651
|
-
const app = createApp();
|
|
933
|
+
const app = createApp(opts);
|
|
652
934
|
const port = opts.port ?? 5005;
|
|
653
935
|
const host = opts.host ?? "127.0.0.1";
|
|
654
936
|
return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
|
|
@@ -666,6 +948,13 @@ export {
|
|
|
666
948
|
ListRubricsResponseSchema,
|
|
667
949
|
VersionResponseSchema,
|
|
668
950
|
HealthResponseSchema,
|
|
951
|
+
TraceEventSchema,
|
|
952
|
+
TracesIngestRequestSchema,
|
|
953
|
+
TracesIngestResponseSchema,
|
|
954
|
+
FeedbackLabelSchema,
|
|
955
|
+
FeedbackAttemptSchema,
|
|
956
|
+
FeedbackTrajectorySchema,
|
|
957
|
+
FeedbackIngestResponseSchema,
|
|
669
958
|
ErrorResponseSchema,
|
|
670
959
|
WIRE_VERSION,
|
|
671
960
|
hashRubric,
|
|
@@ -676,6 +965,8 @@ export {
|
|
|
676
965
|
handleJudge,
|
|
677
966
|
handleListRubrics,
|
|
678
967
|
handleVersion,
|
|
968
|
+
handleTracesIngest,
|
|
969
|
+
handleFeedbackIngest,
|
|
679
970
|
buildOpenApi,
|
|
680
971
|
dispatchRpc,
|
|
681
972
|
runRpcOnce,
|
|
@@ -683,4 +974,4 @@ export {
|
|
|
683
974
|
createApp,
|
|
684
975
|
startServer
|
|
685
976
|
};
|
|
686
|
-
//# sourceMappingURL=chunk-SY6WAAAD.js.map
|
|
977
|
+
//# sourceMappingURL=chunk-5LBB5B3Z.js.map
|