@tangle-network/agent-eval 0.20.9 → 0.20.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +302 -0
- package/README.md +86 -8
- package/dist/benchmarks/index.d.ts +1 -1
- package/dist/benchmarks/index.js +1 -1
- package/dist/{chunk-XDGJUIV2.js → chunk-42I2QC2L.js} +1 -1
- package/dist/chunk-42I2QC2L.js.map +1 -0
- package/dist/{chunk-CJJSB6ZQ.js → chunk-LSR4IAYN.js} +90 -11
- package/dist/chunk-LSR4IAYN.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{index-CEWY1rmu.d.ts → index-1PZOtZFr.d.ts} +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +63 -14
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +50 -25
- package/dist/{sink-fetch-C0B8ximv.d.ts → sink-fetch-B1Yg4Til.d.ts} +1 -1
- package/dist/telemetry/file.d.ts +1 -1
- package/dist/telemetry/index.d.ts +2 -2
- package/dist/telemetry/index.js.map +1 -1
- package/dist/wire/index.js +1 -1
- package/docs/product-eval-adoption.md +194 -0
- package/docs/wire-protocol.md +2 -2
- package/package.json +5 -4
- package/dist/chunk-CJJSB6ZQ.js.map +0 -1
- package/dist/chunk-XDGJUIV2.js.map +0 -1
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.20.10 — hardening audit follow-up
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
|
|
7
|
+
- `hashRubric` now recursively sorts nested rubric fields before hashing, so
|
|
8
|
+
dimension, failure-mode, and win changes alter `rubricVersion`.
|
|
9
|
+
- Wire judge handling now validates LLM output before returning it: finite
|
|
10
|
+
dimension scores, rationale, and known failure/win ids are enforced.
|
|
11
|
+
- Control-runtime budgets reject invalid numeric config, and invalid action
|
|
12
|
+
costs are omitted from step telemetry instead of leaking `NaN`/`Infinity`.
|
|
13
|
+
- Knowledge readiness now treats invalid `validUntil` timestamps as stale.
|
|
14
|
+
- Trace-analyst regex search supports leading `(?i)` and stops scanning once
|
|
15
|
+
bounded match output is reached.
|
|
16
|
+
- SWE-Bench Lite example wording now reflects the implemented external-grader
|
|
17
|
+
adapter, with quoted command parsing and timeout coverage.
|
|
18
|
+
|
|
19
|
+
### Changed
|
|
20
|
+
|
|
21
|
+
- Published package contents now include `CHANGELOG.md`.
|
|
22
|
+
- Public docs now use GitHub URLs for repository-only examples and Python
|
|
23
|
+
client source.
|
|
24
|
+
- Publish CI now checks npm, Python package, runtime fallback version, and tag
|
|
25
|
+
version agree before publishing.
|
|
26
|
+
|
|
27
|
+
## 0.20.9 — release hygiene and runtime failure fixes
|
|
28
|
+
|
|
29
|
+
### Fixed
|
|
30
|
+
|
|
31
|
+
- Initial `runAgentControlLoop` observe/validate failures now report the
|
|
32
|
+
actual observe/validate error even when trace start/end emission also fails.
|
|
33
|
+
- Knowledge readiness recommended actions now honor non-blocking gap
|
|
34
|
+
acquisition modes such as `ask_user`, `search_web`, `query_connector`, and
|
|
35
|
+
`inspect_repo`.
|
|
36
|
+
- Npm builds now generate `dist/openapi.json`, and the package exports
|
|
37
|
+
`@tangle-network/agent-eval/openapi.json`.
|
|
38
|
+
- Npm and Python client versions are locked at `0.20.9`.
|
|
39
|
+
|
|
40
|
+
### Added
|
|
41
|
+
|
|
42
|
+
- `CallbackResearcher`, a concrete callback-backed implementation of the
|
|
43
|
+
stable `Researcher` interface for scripts, tests, and small integrations.
|
|
44
|
+
- Public `@tangle-network/agent-eval/benchmarks` subpath for the supported
|
|
45
|
+
routing benchmark surface.
|
|
46
|
+
- Root MIT `LICENSE`.
|
|
47
|
+
|
|
48
|
+
### Changed
|
|
49
|
+
|
|
50
|
+
- Raw TypeScript examples are no longer included in the npm package; they remain
|
|
51
|
+
repository examples to read, copy, and adapt.
|
|
52
|
+
|
|
53
|
+
## 0.20.2 — freshness-aware knowledge readiness
|
|
54
|
+
|
|
55
|
+
### Added
|
|
56
|
+
|
|
57
|
+
- `KnowledgeRequirement.validUntil` and `lastVerifiedAt` for explicit freshness
|
|
58
|
+
contracts.
|
|
59
|
+
- `scoreKnowledgeReadiness({ now })` support for deterministic freshness gates.
|
|
60
|
+
|
|
61
|
+
### Changed
|
|
62
|
+
|
|
63
|
+
- Expired knowledge requirements now score as missing even when confidence and
|
|
64
|
+
evidence are otherwise high.
|
|
65
|
+
|
|
66
|
+
## 0.20.0 — knowledge readiness contracts
|
|
67
|
+
|
|
68
|
+
### Added
|
|
69
|
+
|
|
70
|
+
- First-class knowledge-readiness contracts: `KnowledgeRequirement`,
|
|
71
|
+
`KnowledgeBundle`, `KnowledgeReadinessReport`, `UserQuestion`, and
|
|
72
|
+
`DataAcquisitionPlan`.
|
|
73
|
+
- `scoreKnowledgeReadiness`, `blockingKnowledgeEval`,
|
|
74
|
+
`userQuestionsForKnowledgeGaps`, and `acquisitionPlansForKnowledgeGaps`.
|
|
75
|
+
- Knowledge/data failure classes including `knowledge_readiness_blocked`,
|
|
76
|
+
`missing_credentials`, `bad_retrieval`, `insufficient_evidence`, and
|
|
77
|
+
`contradictory_evidence`.
|
|
78
|
+
- `docs/knowledge-readiness.md`, plus documented knowledge-related ASI
|
|
79
|
+
responsible surfaces for multi-shot optimization.
|
|
80
|
+
|
|
81
|
+
## 0.19.1 — release confidence gate
|
|
82
|
+
|
|
83
|
+
### Added
|
|
84
|
+
|
|
85
|
+
- `evaluateReleaseConfidence`, a conservative release scorecard over corpus
|
|
86
|
+
coverage, search/holdout run evidence, ASI diagnostics, overfit checks, and
|
|
87
|
+
cost/latency budgets.
|
|
88
|
+
- `assertReleaseConfidence`, a throwing variant for CI/release scripts.
|
|
89
|
+
- `releaseTraceEvidenceFromMultiShotTrials`, a helper that projects
|
|
90
|
+
`MultiShotTrialResult` rows into release trace evidence so single-shot and
|
|
91
|
+
variable multi-shot apps use the same release gate.
|
|
92
|
+
|
|
93
|
+
## 0.19.0 — legacy optimizer removal
|
|
94
|
+
|
|
95
|
+
### Removed
|
|
96
|
+
|
|
97
|
+
- Removed the legacy pairwise prompt optimizer surface:
|
|
98
|
+
`PromptOptimizer`, `OptimizationLoop`, and their associated root-exported
|
|
99
|
+
types are gone. The blessed optimization path is now
|
|
100
|
+
`runMultiShotOptimization` for task trajectories and the steering-specific
|
|
101
|
+
optimizers for explicit steering tables.
|
|
102
|
+
- Removed the old `PromptVariant` root export. Public callers should use
|
|
103
|
+
`MultiShotVariant` for multi-shot trajectory optimization or
|
|
104
|
+
`EvolvableVariant` for the lower-level prompt/code evolution core.
|
|
105
|
+
|
|
106
|
+
### Changed
|
|
107
|
+
|
|
108
|
+
- Documentation now points optimization users at `runMultiShotOptimization`
|
|
109
|
+
instead of the removed pairwise prompt optimizer.
|
|
110
|
+
|
|
111
|
+
## 0.18.0 — multi-shot optimization
|
|
112
|
+
|
|
113
|
+
### Added
|
|
114
|
+
|
|
115
|
+
- `runMultiShotOptimization`, the canonical GEPA-style adapter for
|
|
116
|
+
variable-length agent trajectories. It wraps `runPromptEvolution` while
|
|
117
|
+
preserving full multi-shot traces, actionable side information, stable paired
|
|
118
|
+
seeds, score/cost objectives, and optional held-out promotion gating.
|
|
119
|
+
- `trialTraceFromMultiShotTrial`, a bridge from multi-shot trial results into
|
|
120
|
+
reflective mutation prompts.
|
|
121
|
+
- `ActionableSideInfo`, `MultiShotVariant`, `MultiShotTrace`, `MultiShotRun`,
|
|
122
|
+
`MultiShotScore`, `MultiShotTrialResult`, `MultiShotMutateAdapter`, and
|
|
123
|
+
related public types.
|
|
124
|
+
- `docs/multi-shot-optimization.md` and
|
|
125
|
+
`examples/multi-shot-optimization/index.ts`.
|
|
126
|
+
|
|
127
|
+
### Changed
|
|
128
|
+
|
|
129
|
+
- The multi-shot result shape explicitly separates `searchBestVariant` from
|
|
130
|
+
`promotedVariant`. If a holdout gate rejects the search winner, the promoted
|
|
131
|
+
variant is the baseline.
|
|
132
|
+
- `runMultiShotOptimization` validates release-critical configuration up front:
|
|
133
|
+
unique variant/scenario ids, positive integer run counts, population size,
|
|
134
|
+
disjoint search/holdout ids, and a gate baseline key matching the first seed
|
|
135
|
+
variant.
|
|
136
|
+
|
|
137
|
+
## 0.17.2 — agent control runtime
|
|
138
|
+
|
|
139
|
+
### Added
|
|
140
|
+
|
|
141
|
+
- `runAgentControlLoop`, a generic `observe -> validate -> decide -> act`
|
|
142
|
+
runtime for agentic tasks with step, wall-clock, and recorded-cost budgets;
|
|
143
|
+
no-progress and repeated-action stop policies; structured runtime failures;
|
|
144
|
+
objective/subjective eval helpers; and `TraceStore` emission.
|
|
145
|
+
- `runProposeReviewAsControlLoop`, a bridge preset that expresses
|
|
146
|
+
propose/verify/review as a specialization of the generic control runtime.
|
|
147
|
+
- Feedback trajectory helpers for turning control-loop runs and user/judge
|
|
148
|
+
labels into reusable dataset scenarios, optimizer rows, and preference
|
|
149
|
+
memory.
|
|
150
|
+
- `docs/control-runtime.md`, with integration patterns for tax, legal,
|
|
151
|
+
agent-builder, and film-agent products.
|
|
152
|
+
|
|
153
|
+
### Changed
|
|
154
|
+
|
|
155
|
+
- Control runtime trace sink and `onStep` callback failures are now recorded
|
|
156
|
+
as structured runtime errors without aborting an otherwise valid run.
|
|
157
|
+
- `runProposeReviewAsControlLoop` accepts a caller-provided verifier failure
|
|
158
|
+
mapper for domain-specific failure classes.
|
|
159
|
+
|
|
160
|
+
## 0.17.0 — surface cleanup + usage-guidance pitfalls
|
|
161
|
+
|
|
162
|
+
This release tightens the public benchmark surface and lands internal usage guidance that the v0.15 dispatch couldn't write.
|
|
163
|
+
|
|
164
|
+
### Moved
|
|
165
|
+
|
|
166
|
+
- `src/benchmarks/gsm8k/` → `examples/benchmarks/gsm8k/`
|
|
167
|
+
- `src/benchmarks/swebench-lite/` → `examples/benchmarks/swebench-lite/`
|
|
168
|
+
|
|
169
|
+
These are reference implementations of `BenchmarkAdapter`, not core surface. Consumers read them, copy them, adapt them. The novel `routing` benchmark stays in `src/benchmarks/` because it's our own and broadly useful.
|
|
170
|
+
|
|
171
|
+
`src/benchmarks/index.ts` now exports the shared types + the `routing` benchmark only. The previous `gsm8k` and `swebenchLite` namespace exports are gone — import directly from `examples/benchmarks/<name>/index.ts` (or copy the wrapper into your own project).
|
|
172
|
+
|
|
173
|
+
### Added
|
|
174
|
+
|
|
175
|
+
- `examples/benchmarks/README.md` documents how to use, copy, and extend the example wrappers.
|
|
176
|
+
- Internal agent-eval usage guidance gains production-rigor and pitfalls sections covering the v0.16 primitives.
|
|
177
|
+
|
|
178
|
+
### Migration
|
|
179
|
+
|
|
180
|
+
If you imported `gsm8k` or `swebenchLite` from `@tangle-network/agent-eval/benchmarks`:
|
|
181
|
+
|
|
182
|
+
```ts
|
|
183
|
+
// before
|
|
184
|
+
import { gsm8k, swebenchLite } from '@tangle-network/agent-eval/benchmarks'
|
|
185
|
+
|
|
186
|
+
// after — copy the file from examples/benchmarks/<name>/index.ts into your project,
|
|
187
|
+
// or import via relative path from the cloned repo.
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
The `routing` benchmark and the shared `BenchmarkAdapter` types are unchanged.
|
|
191
|
+
|
|
192
|
+
## 0.16.0 — naming cleanup
|
|
193
|
+
|
|
194
|
+
The v0.15 primitives were framed as "paper-grade" but most are production-rigor utilities any team needs. This release renames the three reporting helpers and drops the "paper" framing from the public API. Behavior unchanged.
|
|
195
|
+
|
|
196
|
+
### Renamed
|
|
197
|
+
|
|
198
|
+
- `paperTable` → `summaryTable`
|
|
199
|
+
- `paretoFigure` → `paretoChart`
|
|
200
|
+
- `gainDistributionFigure` → `gainHistogram`
|
|
201
|
+
- `PaperTable` / `PaperTableOptions` / `PaperTableRow` types → `SummaryTable` / `SummaryTableOptions` / `SummaryTableRow`
|
|
202
|
+
- File: `src/paper-report.ts` → `src/summary-report.ts`
|
|
203
|
+
|
|
204
|
+
### Migration
|
|
205
|
+
|
|
206
|
+
Drop-in: search-and-replace the three function names and the file path. Type names follow the same pattern. No behavior change.
|
|
207
|
+
|
|
208
|
+
```ts
|
|
209
|
+
// before
|
|
210
|
+
import { paperTable, paretoFigure, gainDistributionFigure } from '@tangle-network/agent-eval'
|
|
211
|
+
// after
|
|
212
|
+
import { summaryTable, paretoChart, gainHistogram } from '@tangle-network/agent-eval'
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## 0.15.0 — paper-grade primitives
|
|
216
|
+
|
|
217
|
+
Substrate for the "Two Loops, Three Roles" paper on multi-level prompt
|
|
218
|
+
optimization with held-out promotion gates.
|
|
219
|
+
|
|
220
|
+
### Added
|
|
221
|
+
|
|
222
|
+
- **`HeldOutGate`** (`src/promotion-gate.ts`) — first-class held-out
|
|
223
|
+
paired-delta promotion gate. Three checks: minimum productive runs,
|
|
224
|
+
positive lower bound on bootstrap CI of paired holdout median delta,
|
|
225
|
+
bounded overfit-gap relative to baseline. Decisions carry a
|
|
226
|
+
machine-readable `rejectionCode` (`few_runs` | `negative_delta` |
|
|
227
|
+
`overfit_gap`) plus an `evidence` block with every number the gate
|
|
228
|
+
read. Generalizes the inline pattern that lived in
|
|
229
|
+
`redteam/scripts/agent-eval-autoresearch.ts:138–171`.
|
|
230
|
+
- **`RunRecord`** (`src/run-record.ts`) — paper-grade JSON-friendly run
|
|
231
|
+
schema with mandatory fields: `runId`, `experimentId`, `candidateId`,
|
|
232
|
+
`seed`, snapshot-versioned `model`, `promptHash`, `configHash`,
|
|
233
|
+
`commitSha`, `wallMs`, `costUsd`, `tokenUsage`, `outcome`, `splitTag`.
|
|
234
|
+
Runtime validator (`validateRunRecord`, `isRunRecord`,
|
|
235
|
+
`parseRunRecordSafe`, `roundTripRunRecord`) throws on missing fields
|
|
236
|
+
and on bare model aliases without snapshot suffix.
|
|
237
|
+
- **`Researcher`** (`src/researcher.ts`) — stable hook for an
|
|
238
|
+
autonomous-research agent: `inspectFailures` → `proposeChange` →
|
|
239
|
+
`applyChange` → `evaluateChange`. `NoopResearcher` is the
|
|
240
|
+
fail-loud placeholder. Implementations live downstream.
|
|
241
|
+
- **Reference benchmarks** (`src/benchmarks/`) — three adapters that
|
|
242
|
+
share the `BenchmarkAdapter<TItem, TPayload>` shape:
|
|
243
|
+
- `gsm8k`: HF-mirror loader (JSONL via `AGENT_EVAL_GSM8K_PATH`),
|
|
244
|
+
exact-match grading via `parseGsm8kAnswer`.
|
|
245
|
+
- `swebench-lite`: 30-instance subset stub. Loader reads
|
|
246
|
+
`AGENT_EVAL_SWEBENCH_PATH`; grader shells out to
|
|
247
|
+
`AGENT_EVAL_SWEBENCH_GRADER_CMD`. Both fail loud when unset.
|
|
248
|
+
- `routing`: synthetic 16-task router benchmark, ships in the
|
|
249
|
+
package, dependency-free. Format documented in
|
|
250
|
+
`src/benchmarks/routing/README.md`.
|
|
251
|
+
- `deterministicSplit(itemId, seed?)`: stable 60/20/20 split via
|
|
252
|
+
FNV-1a hash. Default seed `agent-eval-v1`.
|
|
253
|
+
- **`summaryTable`, `paretoChart`, `gainHistogram`**
|
|
254
|
+
(`src/summary-report.ts`) — Table 1 + Pareto + gain-distribution specs.
|
|
255
|
+
Returns data structures (markdown table, point lists, histogram bins);
|
|
256
|
+
caller picks the plotting library.
|
|
257
|
+
- **`runCanaries`** (`src/canary.ts`) — three liveness canaries:
|
|
258
|
+
silent judge fallback (consecutive constant-confidence streak),
|
|
259
|
+
judge calibration drift (KS test on confidence distribution), eval-set
|
|
260
|
+
distribution shift (chi-square on category bucket counts).
|
|
261
|
+
- **`pairedBootstrap`, `pairedWilcoxon`, `bhAdjust`**
|
|
262
|
+
(`src/paired-stats.ts`) — paper-style aliases + the missing paired
|
|
263
|
+
bootstrap CI primitive. Deterministic with optional seed.
|
|
264
|
+
|
|
265
|
+
### Notes
|
|
266
|
+
|
|
267
|
+
- No breaking changes. Every existing module is untouched; new types
|
|
268
|
+
are additive.
|
|
269
|
+
- All new public symbols carry JSDoc.
|
|
270
|
+
- 87 new tests across 7 new test files. 571 total tests pass.
|
|
271
|
+
- See the package docs for usage directives and pitfalls.
|
|
272
|
+
|
|
273
|
+
## 0.11.0
|
|
274
|
+
|
|
275
|
+
intent-match + flow-layer + deploy-gate + concept complexity
|
|
276
|
+
weighting.
|
|
277
|
+
|
|
278
|
+
## 0.10.0
|
|
279
|
+
|
|
280
|
+
`LayerResult.diagnostics` + `buildReviewerPrompt` +
|
|
281
|
+
`createDefaultReviewer` + `mergeLayerResults` options.
|
|
282
|
+
|
|
283
|
+
## 0.9.0
|
|
284
|
+
|
|
285
|
+
`CommandRunner` contract + `multiToolchainLayer` + `Finding.detail`.
|
|
286
|
+
|
|
287
|
+
## 0.8.x
|
|
288
|
+
|
|
289
|
+
`probeLlm` + `keyword-coverage-judge`. Honestly-absent primitives
|
|
290
|
+
backfilled — `llm-client`, multi-layer verifier, semantic concept judge,
|
|
291
|
+
extractor utilities.
|
|
292
|
+
|
|
293
|
+
## 0.7.x
|
|
294
|
+
|
|
295
|
+
Extracted muffled-gate scanner; `CostTracker.recordVerdict`. Footgun
|
|
296
|
+
fix: `cwd` belongs in `HarnessConfig`, not the driver constructor.
|
|
297
|
+
|
|
298
|
+
## 0.6.x
|
|
299
|
+
|
|
300
|
+
Tier 1 (meta-eval correlation, PRM, bisector), Tier 2 (counterfactual,
|
|
301
|
+
cross-trace diff, pre-registration), Tier 3 (self-play, causal
|
|
302
|
+
attribution, active learning, RM export), governance templates.
|
package/README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# @tangle-network/agent-eval
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Evaluation infrastructure for agent systems.
|
|
4
4
|
|
|
5
|
-
`agent-eval`
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
`agent-eval` gives agent products a reusable way to record what happened,
|
|
6
|
+
verify outcomes, classify failures, compare variants, optimize prompts or
|
|
7
|
+
policies, and make release decisions from evidence instead of anecdotes.
|
|
8
8
|
|
|
9
9
|
It does not own your product state, credentials, UI, or model routing. Product
|
|
10
10
|
teams keep those boundaries; this package standardizes how runs are recorded,
|
|
@@ -15,7 +15,9 @@ checked, compared, and promoted.
|
|
|
15
15
|
- [When To Use It](#when-to-use-it)
|
|
16
16
|
- [Architecture](#architecture)
|
|
17
17
|
- [Install](#install)
|
|
18
|
+
- [Quick Start](#quick-start)
|
|
18
19
|
- [Core Primitives](#core-primitives)
|
|
20
|
+
- [Adoption Path](#adoption-path)
|
|
19
21
|
- [Examples](#examples)
|
|
20
22
|
- [Documentation](#documentation)
|
|
21
23
|
- [Development](#development)
|
|
@@ -80,6 +82,59 @@ cd clients/python
|
|
|
80
82
|
pip install -e .
|
|
81
83
|
```
|
|
82
84
|
|
|
85
|
+
## Quick Start
|
|
86
|
+
|
|
87
|
+
Wrap the real product loop first. Do not build a toy eval path that users never
|
|
88
|
+
exercise.
|
|
89
|
+
|
|
90
|
+
```ts
|
|
91
|
+
import {
|
|
92
|
+
objectiveEval,
|
|
93
|
+
runAgentControlLoop,
|
|
94
|
+
} from '@tangle-network/agent-eval'
|
|
95
|
+
|
|
96
|
+
const result = await runAgentControlLoop({
|
|
97
|
+
intent: task.prompt,
|
|
98
|
+
budget: { maxSteps: 8, maxWallMs: 180_000, maxCostUsd: 2 },
|
|
99
|
+
|
|
100
|
+
async observe() {
|
|
101
|
+
return productAdapter.readState(task.id)
|
|
102
|
+
},
|
|
103
|
+
|
|
104
|
+
async validate({ state }) {
|
|
105
|
+
return [
|
|
106
|
+
objectiveEval({
|
|
107
|
+
id: 'build-passes',
|
|
108
|
+
passed: state.build.exitCode === 0,
|
|
109
|
+
severity: 'critical',
|
|
110
|
+
metadata: state.build,
|
|
111
|
+
}),
|
|
112
|
+
objectiveEval({
|
|
113
|
+
id: 'preview-serves',
|
|
114
|
+
passed: state.preview.httpStatus === 200,
|
|
115
|
+
severity: 'critical',
|
|
116
|
+
}),
|
|
117
|
+
]
|
|
118
|
+
},
|
|
119
|
+
|
|
120
|
+
async decide({ evals }) {
|
|
121
|
+
return evals.every((evalResult) => evalResult.passed)
|
|
122
|
+
? { type: 'stop', reason: 'all critical checks passed' }
|
|
123
|
+
: { type: 'continue', action: { type: 'repair' }, reason: 'checks failed' }
|
|
124
|
+
},
|
|
125
|
+
|
|
126
|
+
async act(action) {
|
|
127
|
+
return productAdapter.runAgentStep(task.id, action)
|
|
128
|
+
},
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
await productAdapter.storeControlResult(task.id, result)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Once this loop represents production behavior, convert completed runs into
|
|
135
|
+
feedback trajectories, split them into train/dev/test/holdout sets, and run
|
|
136
|
+
multi-shot optimization against the same adapter.
|
|
137
|
+
|
|
83
138
|
## Core Primitives
|
|
84
139
|
|
|
85
140
|
| Primitive | Purpose |
|
|
@@ -98,16 +153,38 @@ pip install -e .
|
|
|
98
153
|
| `summaryTable`, `paretoChart`, `gainHistogram` | Report-ready structured outputs. |
|
|
99
154
|
| `KnowledgeRequirement`, `KnowledgeBundle` | Shared contracts for knowledge readiness. |
|
|
100
155
|
|
|
156
|
+
`NoopResearcher` is a fail-loud sentinel for wiring tests. Production systems
|
|
157
|
+
should implement `Researcher` directly or use `CallbackResearcher`.
|
|
158
|
+
|
|
159
|
+
## Adoption Path
|
|
160
|
+
|
|
161
|
+
1. Choose one real workflow: code generation, browser task, research task,
|
|
162
|
+
workflow builder, voice interaction, or domain agent task.
|
|
163
|
+
2. Write a product adapter that can observe state and execute one agent step.
|
|
164
|
+
3. Add deterministic validators first: build, test, serve, schema, policy,
|
|
165
|
+
permission, retrieval, and deployment checks.
|
|
166
|
+
4. Add LLM judges only for subjective quality that deterministic checks cannot
|
|
167
|
+
measure.
|
|
168
|
+
5. Emit traces and convert successful and failed attempts into
|
|
169
|
+
`FeedbackTrajectory` records.
|
|
170
|
+
6. Build train/dev/test/holdout scenarios from those trajectories.
|
|
171
|
+
7. Run `runMultiShotOptimization()` or prompt/code evolution on train/dev.
|
|
172
|
+
8. Promote only when test/holdout gates and real product telemetry improve.
|
|
173
|
+
|
|
174
|
+
For a complete product integration guide, see
|
|
175
|
+
[Product Eval Adoption](./docs/product-eval-adoption.md).
|
|
176
|
+
|
|
101
177
|
## Examples
|
|
102
178
|
|
|
103
|
-
Runnable examples live in the repository's
|
|
179
|
+
Runnable examples live in the repository's
|
|
180
|
+
[`examples/`](https://github.com/tangle-network/agent-eval/tree/main/examples)
|
|
104
181
|
directory. They are not part of the published npm package.
|
|
105
182
|
|
|
106
|
-
- [`examples/same-sandbox-harness`](
|
|
183
|
+
- [`examples/same-sandbox-harness`](https://github.com/tangle-network/agent-eval/tree/main/examples/same-sandbox-harness) - run
|
|
107
184
|
multiple eval passes against the same workspace.
|
|
108
|
-
- [`examples/multi-shot-optimization`](
|
|
185
|
+
- [`examples/multi-shot-optimization`](https://github.com/tangle-network/agent-eval/tree/main/examples/multi-shot-optimization) -
|
|
109
186
|
optimize full agent trajectories with held-out promotion.
|
|
110
|
-
- [`examples/benchmarks`](
|
|
187
|
+
- [`examples/benchmarks`](https://github.com/tangle-network/agent-eval/tree/main/examples/benchmarks) - benchmark adapter shape and
|
|
111
188
|
reference benchmark wrappers.
|
|
112
189
|
|
|
113
190
|
The examples are intentionally kept outside the README so they can be expanded,
|
|
@@ -117,6 +194,7 @@ tested, and copied without turning this page into a tutorial.
|
|
|
117
194
|
|
|
118
195
|
- [Concepts](./docs/concepts.md)
|
|
119
196
|
- [Feature Guide](./docs/feature-guide.md)
|
|
197
|
+
- [Product Eval Adoption](./docs/product-eval-adoption.md)
|
|
120
198
|
- [Control Runtime](./docs/control-runtime.md)
|
|
121
199
|
- [Knowledge Readiness](./docs/knowledge-readiness.md)
|
|
122
200
|
- [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-CEWY1rmu.js';
|
|
1
|
+
export { B as BENCHMARK_SPLIT_SEED, b as BenchmarkAdapter, c as BenchmarkDatasetItem, d as BenchmarkEvaluation, i as deterministicSplit, l as routing } from '../index-1PZOtZFr.js';
|
package/dist/benchmarks/index.js
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/benchmarks/index.ts","../src/benchmarks/types.ts","../src/benchmarks/routing/index.ts","../src/benchmarks/routing/dataset.ts"],"sourcesContent":["/**\n * Reference benchmark wrappers — entry point.\n *\n * Core surface (exported here):\n * - The `BenchmarkAdapter` contract.\n * - `deterministicSplit` + `BENCHMARK_SPLIT_SEED` for split assignment.\n * - `routing` — synthetic 16-task router benchmark. The only novel\n * benchmark we built; ships in the package.\n *\n * Example wrappers (under `examples/benchmarks/`, NOT in the bundle):\n * - `gsm8k` — exact-match math reasoning (HF mirror, dataset\n * not bundled).\n * - `swebench-lite` — 30-instance SWE-Bench subset via an external\n * grader command.\n *\n * The example wrappers are reference implementations of `BenchmarkAdapter`.\n * Read them, copy them, adapt them. They're intentionally not in the main\n * entry — every team will configure them differently.\n */\n\nexport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from './types'\nexport { deterministicSplit, BENCHMARK_SPLIT_SEED } from './types'\n\nexport * as routing from './routing/index'\n","/**\n * Shared types for the reference benchmark wrappers under\n * `src/benchmarks/`. Each wrapper exports the three functions in\n * `BenchmarkAdapter` plus its own typed `DatasetItem` shape.\n */\n\nimport type { RunSplitTag } from '../run-record'\n\nexport interface BenchmarkDatasetItem<TPayload = unknown> {\n /** Stable dataset-local item id (used for split assignment + paper\n * references). Unique within a benchmark. */\n id: string\n /** Free-form payload. Each benchmark defines its own shape. */\n payload: TPayload\n}\n\nexport interface BenchmarkEvaluation {\n /** [0, 1] score for the response on this item. Exact-match\n * benchmarks use 0/1; partial-credit benchmarks may return\n * fractional values. */\n score: number\n /** Optional bag of raw scoring signals — e.g. 
parsed numeric\n * answer, regex match, judge sub-scores. */\n raw: Record<string, unknown>\n}\n\n/** Common signature implemented by every adapter under `src/benchmarks/*`. */\n// `TPayload` is the per-item payload type; `_TItem` is preserved for\n// downstream type-narrowing extensions (a richer `BenchmarkDatasetItem`\n// subclass that adds e.g. provenance metadata) but is intentionally\n// unused here. `noUnusedLocals` requires the leading underscore.\nexport interface BenchmarkAdapter<_TItem = unknown, TPayload = unknown> {\n /** Load the dataset for the given split. May hit the network on\n * first call but should be cache-friendly. Adapters that don't\n * ship the dataset itself MUST throw a clearly-marked error\n * pointing the caller at the loader script. */\n loadDataset(split: RunSplitTag): Promise<BenchmarkDatasetItem<TPayload>[]>\n /** Score a single response. Pure with respect to the inputs. */\n evaluate(item: BenchmarkDatasetItem<TPayload>, response: string): Promise<BenchmarkEvaluation>\n /** Deterministic split assignment via item id hashing. The\n * fraction of items in each split is implementation-defined but\n * MUST be stable across processes and platforms. */\n assignSplit(itemId: string): RunSplitTag\n}\n\n// ── Deterministic split assignment ───────────────────────────────────\n\n/**\n * 32-bit FNV-1a hash. Stable, allocation-free, deterministic across\n * runtimes. We use it to assign items to splits rather than depending\n * on a polyfilled crypto.subtle path.\n */\nfunction fnv1a32(input: string): number {\n let h = 0x811c9dc5\n for (let i = 0; i < input.length; i++) {\n h ^= input.charCodeAt(i) & 0xff\n h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0\n }\n return h >>> 0\n}\n\n/** Split-assignment seed shared across all benchmarks. Bumping this\n * value reshuffles every split — do NOT do that lightly. 
*/\nexport const BENCHMARK_SPLIT_SEED = 'agent-eval-v1'\n\n/**\n * Assign an item id to one of `'search' | 'dev' | 'holdout'` using a\n * stable 32-bit hash of `${seed}::${id}`. Default proportions:\n *\n * search: 60% (optimization-readable)\n * dev: 20% (held-out for tuning, leak-on-purpose during dev)\n * holdout:20% (paper-grade held-out, gated reads)\n */\nexport function deterministicSplit(\n itemId: string,\n seed: string = BENCHMARK_SPLIT_SEED,\n): RunSplitTag {\n const h = fnv1a32(`${seed}::${itemId}`)\n const pos = h / 0x100000000\n if (pos < 0.6) return 'search'\n if (pos < 0.8) return 'dev'\n return 'holdout'\n}\n","/**\n * Routing benchmark — synthetic, dependency-free, ships in the\n * package. 16 cross-category items in `dataset.ts`. See\n * `routing/README.md` for the format.\n *\n * `evaluate` does case-insensitive exact match against the canonical\n * route plus declared synonyms. The first valid route token in the\n * response wins; everything else is ignored. Wrong answers also\n * report whether they hit a hard negative — useful when triaging\n * \"always picks the popular route\" failure modes.\n */\n\nimport type {\n BenchmarkAdapter,\n BenchmarkDatasetItem,\n BenchmarkEvaluation,\n} from '../types'\nimport { deterministicSplit } from '../types'\nimport type { RunSplitTag } from '../../run-record'\nimport { ROUTING_DATASET, type RoutingItem } from './dataset'\n\nexport type { RoutingItem }\nexport type RoutingPayload = RoutingItem\nexport type RoutingDatasetItem = BenchmarkDatasetItem<RoutingPayload>\n\nclass RoutingAdapter\n implements BenchmarkAdapter<RoutingDatasetItem, RoutingPayload>\n{\n async loadDataset(split: RunSplitTag): Promise<RoutingDatasetItem[]> {\n return ROUTING_DATASET\n .map((item) => ({ id: item.id, payload: item }))\n .filter((it) => assignSplitImpl(it.id) === split)\n }\n\n async evaluate(\n item: RoutingDatasetItem,\n response: string,\n ): Promise<BenchmarkEvaluation> {\n const tokens = extractRouteTokens(response)\n 
const correct = new Set<string>([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()))\n const hardNeg = new Set<string>(item.payload.hardNegatives.map((s) => s.toLowerCase()))\n const firstMatch = tokens.find((t) => correct.has(t.toLowerCase())) ?? null\n const firstHardNeg = tokens.find((t) => hardNeg.has(t.toLowerCase())) ?? null\n const score = firstMatch ? 1 : 0\n return {\n score,\n raw: {\n firstToken: tokens[0] ?? null,\n matchedRoute: firstMatch,\n hitHardNegative: Boolean(firstHardNeg),\n hardNegativeRoute: firstHardNeg,\n category: item.payload.category,\n },\n }\n }\n\n assignSplit(itemId: string): RunSplitTag {\n return assignSplitImpl(itemId)\n }\n}\n\nfunction assignSplitImpl(itemId: string): RunSplitTag {\n return deterministicSplit(`routing::${itemId}`)\n}\n\n/**\n * Pull route-shaped tokens out of a model response. Routes look like\n * `category.action` (`fs.write`, `chat.reply`). Bare alphanumerics\n * are not routes, but `category.action` patterns are robust to most\n * model wrappers (JSON output, prose explanations, code fences).\n */\nexport function extractRouteTokens(response: string): string[] {\n const matches = response.match(/[a-z][a-z0-9_]*\\.[a-z][a-z0-9_]*/gi)\n return matches ?? []\n}\n\nconst adapter = new RoutingAdapter()\n\nexport const loadDataset = adapter.loadDataset.bind(adapter)\nexport const evaluate = adapter.evaluate.bind(adapter)\nexport const assignSplit = adapter.assignSplit.bind(adapter)\nexport { RoutingAdapter, ROUTING_DATASET }\n","/**\n * Synthetic routing dataset. 16 tasks across 4 categories. 
Used as a\n * deterministic, dependency-free benchmark for any router that maps a\n * natural-language request to one of a fixed set of route labels.\n *\n * Format (see `routing/README.md` for prose):\n *\n * {\n * id: stable per-task ID (matches across processes).\n * category: one of the four route labels.\n * prompt: the user-facing request the router must classify.\n * route: the ground-truth route the router should pick.\n * synonyms: other strings that count as a correct answer.\n * hardNegatives:close-but-wrong route labels — used to detect the\n * \"always picks the popular route\" failure mode.\n * }\n *\n * The four categories are intentionally cross-domain (file ops,\n * math, search, conversation) so a router that collapses to one\n * category is easy to spot.\n */\n\nexport interface RoutingItem {\n id: string\n category: 'file' | 'math' | 'search' | 'chat'\n prompt: string\n /** Canonical correct route label. */\n route: string\n /** Alternate route labels that also count as correct. */\n synonyms: string[]\n /** Wrong-but-tempting route labels (for analysis, not grading). 
*/\n hardNegatives: string[]\n}\n\nexport const ROUTING_DATASET: RoutingItem[] = [\n {\n id: 'file_001',\n category: 'file',\n prompt: 'Save the meeting notes to /tmp/notes-2025-04.md as markdown.',\n route: 'fs.write',\n synonyms: ['filesystem.write', 'write_file'],\n hardNegatives: ['fs.read', 'chat.reply'],\n },\n {\n id: 'file_002',\n category: 'file',\n prompt: 'Read the contents of /etc/hosts and summarize the entries.',\n route: 'fs.read',\n synonyms: ['filesystem.read', 'read_file'],\n hardNegatives: ['fs.write', 'search.web'],\n },\n {\n id: 'file_003',\n category: 'file',\n prompt: 'List every Python file under src/ recursively.',\n route: 'fs.list',\n synonyms: ['filesystem.list', 'list_files'],\n hardNegatives: ['fs.read', 'search.code'],\n },\n {\n id: 'file_004',\n category: 'file',\n prompt: 'Delete the cached build at .turbo/cache.',\n route: 'fs.delete',\n synonyms: ['filesystem.delete', 'remove_file'],\n hardNegatives: ['fs.write', 'fs.list'],\n },\n {\n id: 'math_001',\n category: 'math',\n prompt: 'What is the integral of 3x^2 + 2x from 0 to 5?',\n route: 'math.integral',\n synonyms: ['calculator.integral', 'math.solve'],\n hardNegatives: ['math.derivative', 'chat.reply'],\n },\n {\n id: 'math_002',\n category: 'math',\n prompt: 'Compute the derivative of sin(x) * cos(x).',\n route: 'math.derivative',\n synonyms: ['calculator.derivative', 'math.solve'],\n hardNegatives: ['math.integral', 'math.algebra'],\n },\n {\n id: 'math_003',\n category: 'math',\n prompt: 'Solve 2x + 7 = 19 for x.',\n route: 'math.algebra',\n synonyms: ['calculator.algebra', 'math.solve'],\n hardNegatives: ['math.derivative', 'math.integral'],\n },\n {\n id: 'math_004',\n category: 'math',\n prompt: 'What is the prime factorization of 360?',\n route: 'math.numbertheory',\n synonyms: ['calculator.factor', 'math.solve'],\n hardNegatives: ['math.algebra', 'search.web'],\n },\n {\n id: 'search_001',\n category: 'search',\n prompt: 'Find recent papers on agent prompt 
optimization with held-out promotion gates.',\n route: 'search.web',\n synonyms: ['web.search', 'search.papers'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_002',\n category: 'search',\n prompt: 'Search the codebase for every call site of `runProposeReview`.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'search_003',\n category: 'search',\n prompt: 'What is the latest release of the Tangle network on GitHub?',\n route: 'search.web',\n synonyms: ['web.search', 'github.releases'],\n hardNegatives: ['search.code', 'chat.reply'],\n },\n {\n id: 'search_004',\n category: 'search',\n prompt: 'Find all TODO comments in the agent-eval src tree.',\n route: 'search.code',\n synonyms: ['code.search', 'grep'],\n hardNegatives: ['search.web', 'fs.list'],\n },\n {\n id: 'chat_001',\n category: 'chat',\n prompt: 'Hi there, how are you doing today?',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_002',\n category: 'chat',\n prompt: 'Please explain the difference between an LLM and a foundation model.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'qa.answer'],\n hardNegatives: ['search.web', 'math.algebra'],\n },\n {\n id: 'chat_003',\n category: 'chat',\n prompt: 'Tell me a short joke about distributed systems.',\n route: 'chat.reply',\n synonyms: ['conversation.reply'],\n hardNegatives: ['search.web', 'fs.read'],\n },\n {\n id: 'chat_004',\n category: 'chat',\n prompt: 'Acknowledge my last message with a thumbs up.',\n route: 'chat.reply',\n synonyms: ['conversation.reply', 'react'],\n hardNegatives: ['fs.write', 'search.web'],\n 
},\n]\n"],"mappings":";;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACoDA,SAAS,QAAQ,OAAuB;AACtC,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,SAAK,MAAM,WAAW,CAAC,IAAI;AAC3B,QAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,MAAM,KAAK,SAAU;AAAA,EACxE;AACA,SAAO,MAAM;AACf;AAIO,IAAM,uBAAuB;AAU7B,SAAS,mBACd,QACA,OAAe,sBACF;AACb,QAAM,IAAI,QAAQ,GAAG,IAAI,KAAK,MAAM,EAAE;AACtC,QAAM,MAAM,IAAI;AAChB,MAAI,MAAM,IAAK,QAAO;AACtB,MAAI,MAAM,IAAK,QAAO;AACtB,SAAO;AACT;;;AClFA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACkCO,IAAM,kBAAiC;AAAA,EAC5C;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB,YAAY;AAAA,IAC3C,eAAe,CAAC,WAAW,YAAY;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,WAAW;AAAA,IACzC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,mBAAmB,YAAY;AAAA,IAC1C,eAAe,CAAC,WAAW,aAAa;AAAA,EAC1C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,aAAa;AAAA,IAC7C,eAAe,CAAC,YAAY,SAAS;AAAA,EACvC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,uBAAuB,YAAY;AAAA,IAC9C,eAAe,CAAC,mBAAmB,YAAY;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,yBAAyB,YAAY;AAAA,IAChD,eAAe,CAAC,iBAAiB,cAAc;AAAA,EACjD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,YAAY;AAAA,IAC7C,eAAe,CAAC,mBAAmB,eAAe;AAAA,EACpD;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,qBAAqB,YAAY;AAAA,IAC5C,eAAe,CAAC,gBAAgB,YAAY;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,eAAe;AAAA,IACxC,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAA
U;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,cAAc,iBAAiB;AAAA,IAC1C,eAAe,CAAC,eAAe,YAAY;AAAA,EAC7C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,eAAe,MAAM;AAAA,IAChC,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,WAAW;AAAA,IAC5C,eAAe,CAAC,cAAc,cAAc;AAAA,EAC9C;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,oBAAoB;AAAA,IAC/B,eAAe,CAAC,cAAc,SAAS;AAAA,EACzC;AAAA,EACA;AAAA,IACE,IAAI;AAAA,IACJ,UAAU;AAAA,IACV,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,UAAU,CAAC,sBAAsB,OAAO;AAAA,IACxC,eAAe,CAAC,YAAY,YAAY;AAAA,EAC1C;AACF;;;AD1IA,IAAM,iBAAN,MAEA;AAAA,EACE,MAAM,YAAY,OAAmD;AACnE,WAAO,gBACJ,IAAI,CAAC,UAAU,EAAE,IAAI,KAAK,IAAI,SAAS,KAAK,EAAE,EAC9C,OAAO,CAAC,OAAO,gBAAgB,GAAG,EAAE,MAAM,KAAK;AAAA,EACpD;AAAA,EAEA,MAAM,SACJ,MACA,UAC8B;AAC9B,UAAM,SAAS,mBAAmB,QAAQ;AAC1C,UAAM,UAAU,IAAI,IAAY,CAAC,KAAK,QAAQ,OAAO,GAAG,KAAK,QAAQ,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AAC1G,UAAM,UAAU,IAAI,IAAY,KAAK,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;AACtF,UAAM,aAAa,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACvE,UAAM,eAAe,OAAO,KAAK,CAAC,MAAM,QAAQ,IAAI,EAAE,YAAY,CAAC,CAAC,KAAK;AACzE,UAAM,QAAQ,aAAa,IAAI;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,KAAK;AAAA,QACH,YAAY,OAAO,CAAC,KAAK;AAAA,QACzB,cAAc;AAAA,QACd,iBAAiB,QAAQ,YAAY;AAAA,QACrC,mBAAmB;AAAA,QACnB,UAAU,KAAK,QAAQ;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AAAA,EAEA,YAAY,QAA6B;AACvC,WAAO,gBAAgB,MAAM;AAAA,EAC/B;AACF;AAEA,SAAS,gBAAgB,QAA6B;AACpD,SAAO,mBAAmB,YAAY,MAAM,EAAE;AAChD;AAQO,SAAS,mBAAmB,UAA4B;AAC7D,QAAM,UAAU,SAAS,MAAM,oCAAoC;AACnE,SAAO,WAAW,CAAC;AACrB;AAEA,IAAM,UAAU,IAAI,eAAe;AAE5B,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;AACpD,IAAM,WAAW,QAAQ,SAAS,KAAK,OAAO;AAC9C,IAAM,cAAc,QAAQ,YAAY,KAAK,OAAO;","names":[]}
|
|
@@ -83,13 +83,21 @@ var ErrorResponseSchema = z.object({
|
|
|
83
83
|
}).openapi("ErrorResponse");
|
|
84
84
|
var WIRE_VERSION = "1.0.0";
|
|
85
85
|
function hashRubric(rubric) {
|
|
86
|
-
const stable =
|
|
86
|
+
const stable = stableStringify(rubric);
|
|
87
87
|
let h = 5381;
|
|
88
88
|
for (let i = 0; i < stable.length; i++) {
|
|
89
89
|
h = h * 33 ^ stable.charCodeAt(i);
|
|
90
90
|
}
|
|
91
91
|
return `${rubric.name}@${(h >>> 0).toString(16).padStart(8, "0")}`;
|
|
92
92
|
}
|
|
93
|
+
function stableStringify(value) {
|
|
94
|
+
if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(",")}]`;
|
|
95
|
+
if (value && typeof value === "object") {
|
|
96
|
+
const entries = Object.entries(value).sort(([a], [b]) => a.localeCompare(b)).map(([key, item]) => `${JSON.stringify(key)}:${stableStringify(item)}`);
|
|
97
|
+
return `{${entries.join(",")}}`;
|
|
98
|
+
}
|
|
99
|
+
return JSON.stringify(value);
|
|
100
|
+
}
|
|
93
101
|
|
|
94
102
|
// src/wire/rubrics.ts
|
|
95
103
|
var ANTI_SLOP = {
|
|
@@ -225,6 +233,47 @@ function judgeOutputSchema(rubric) {
|
|
|
225
233
|
}
|
|
226
234
|
};
|
|
227
235
|
}
|
|
236
|
+
function validateJudgeOutput(value, rubric) {
|
|
237
|
+
if (!value || typeof value !== "object") {
|
|
238
|
+
throw new WireError("judge_error", "Judge returned malformed output.", 500, value);
|
|
239
|
+
}
|
|
240
|
+
const raw = value;
|
|
241
|
+
const rawDimensions = raw.dimensions;
|
|
242
|
+
if (!rawDimensions || typeof rawDimensions !== "object" || Array.isArray(rawDimensions)) {
|
|
243
|
+
throw new WireError("judge_error", "Judge returned malformed dimensions.", 500, value);
|
|
244
|
+
}
|
|
245
|
+
const dimensions = {};
|
|
246
|
+
const dimensionRecord = rawDimensions;
|
|
247
|
+
for (const dim of rubric.dimensions) {
|
|
248
|
+
const score = dimensionRecord[dim.id];
|
|
249
|
+
if (typeof score !== "number" || !Number.isFinite(score) || score < dim.min || score > dim.max) {
|
|
250
|
+
throw new WireError("judge_error", `Judge returned invalid score for dimension "${dim.id}".`, 500, value);
|
|
251
|
+
}
|
|
252
|
+
dimensions[dim.id] = score;
|
|
253
|
+
}
|
|
254
|
+
const allowedFailures = new Set(rubric.failureModes.map((mode) => mode.id));
|
|
255
|
+
const allowedWins = new Set(rubric.wins.map((win) => win.id));
|
|
256
|
+
const failureModes = validateIdArray(raw.failureModes, allowedFailures, "failureModes", value);
|
|
257
|
+
const wins = validateIdArray(raw.wins, allowedWins, "wins", value);
|
|
258
|
+
if (typeof raw.rationale !== "string" || raw.rationale.trim().length === 0) {
|
|
259
|
+
throw new WireError("judge_error", "Judge returned missing rationale.", 500, value);
|
|
260
|
+
}
|
|
261
|
+
return { dimensions, failureModes, wins, rationale: raw.rationale };
|
|
262
|
+
}
|
|
263
|
+
function validateIdArray(raw, allowed, field, original) {
|
|
264
|
+
if (raw === void 0) return [];
|
|
265
|
+
if (!Array.isArray(raw)) {
|
|
266
|
+
throw new WireError("judge_error", `Judge returned non-array ${field}.`, 500, original);
|
|
267
|
+
}
|
|
268
|
+
const out = [];
|
|
269
|
+
for (const item of raw) {
|
|
270
|
+
if (typeof item !== "string" || !allowed.has(item)) {
|
|
271
|
+
throw new WireError("judge_error", `Judge returned unknown ${field} id "${String(item)}".`, 500, original);
|
|
272
|
+
}
|
|
273
|
+
out.push(item);
|
|
274
|
+
}
|
|
275
|
+
return out;
|
|
276
|
+
}
|
|
228
277
|
function compositeScore(dimensions, rubric) {
|
|
229
278
|
let weighted = 0;
|
|
230
279
|
let totalWeight = 0;
|
|
@@ -273,17 +322,15 @@ async function handleJudge(req) {
|
|
|
273
322
|
temperature: 0,
|
|
274
323
|
timeoutMs: 6e4
|
|
275
324
|
});
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
}
|
|
279
|
-
const composite = compositeScore(value.dimensions, rubric);
|
|
325
|
+
const output = validateJudgeOutput(value, rubric);
|
|
326
|
+
const composite = compositeScore(output.dimensions, rubric);
|
|
280
327
|
const durationMs = Date.now() - startedAt;
|
|
281
328
|
return {
|
|
282
329
|
composite,
|
|
283
|
-
dimensions:
|
|
284
|
-
failureModes:
|
|
285
|
-
wins:
|
|
286
|
-
rationale:
|
|
330
|
+
dimensions: output.dimensions,
|
|
331
|
+
failureModes: output.failureModes ?? [],
|
|
332
|
+
wins: output.wins ?? [],
|
|
333
|
+
rationale: output.rationale,
|
|
287
334
|
rubricVersion: hashRubric(rubric),
|
|
288
335
|
model: result.model,
|
|
289
336
|
durationMs
|
|
@@ -400,7 +447,7 @@ function buildOpenApi(packageVersion) {
|
|
|
400
447
|
}
|
|
401
448
|
});
|
|
402
449
|
const generator = new OpenApiGeneratorV31(registry.definitions);
|
|
403
|
-
|
|
450
|
+
const doc = generator.generateDocument({
|
|
404
451
|
openapi: "3.1.0",
|
|
405
452
|
info: {
|
|
406
453
|
title: "@tangle-network/agent-eval \u2014 wire protocol",
|
|
@@ -413,6 +460,38 @@ Wire-protocol version: ${WIRE_VERSION}. Bumps on breaking changes to request/res
|
|
|
413
460
|
},
|
|
414
461
|
servers: [{ url: "http://localhost:5005", description: "Local agent-eval serve" }]
|
|
415
462
|
});
|
|
463
|
+
const rubricRef = { $ref: "#/components/schemas/Rubric" };
|
|
464
|
+
const commonJudgeFields = {
|
|
465
|
+
content: { type: "string", minLength: 1 },
|
|
466
|
+
context: { type: "object", additionalProperties: true },
|
|
467
|
+
model: { type: "string" }
|
|
468
|
+
};
|
|
469
|
+
doc.components ??= {};
|
|
470
|
+
doc.components.schemas ??= {};
|
|
471
|
+
doc.components.schemas.JudgeRequest = {
|
|
472
|
+
oneOf: [
|
|
473
|
+
{
|
|
474
|
+
type: "object",
|
|
475
|
+
additionalProperties: false,
|
|
476
|
+
required: ["rubricName", "content"],
|
|
477
|
+
properties: {
|
|
478
|
+
rubricName: { type: "string", minLength: 1 },
|
|
479
|
+
...commonJudgeFields
|
|
480
|
+
}
|
|
481
|
+
},
|
|
482
|
+
{
|
|
483
|
+
type: "object",
|
|
484
|
+
additionalProperties: false,
|
|
485
|
+
required: ["rubric", "content"],
|
|
486
|
+
properties: {
|
|
487
|
+
rubric: rubricRef,
|
|
488
|
+
...commonJudgeFields
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
],
|
|
492
|
+
description: "Judge request. Provide exactly one of rubricName or rubric."
|
|
493
|
+
};
|
|
494
|
+
return doc;
|
|
416
495
|
}
|
|
417
496
|
|
|
418
497
|
// src/wire/server.ts
|
|
@@ -591,4 +670,4 @@ export {
|
|
|
591
670
|
runRpcOnce,
|
|
592
671
|
runRpcBatch
|
|
593
672
|
};
|
|
594
|
-
//# sourceMappingURL=chunk-
|
|
673
|
+
//# sourceMappingURL=chunk-LSR4IAYN.js.map
|