@tangle-network/agent-eval 0.71.0 → 0.72.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +63 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +485 -9
- package/dist/campaign/index.js +618 -30
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
- package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +14 -8
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +339 -2627
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-6QZUCFKM.js.map +0 -1
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,69 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
|
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
|
|
7
|
+
## [0.72.3] — 2026-06-01 — workflow trace hardening and driver backtests
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
|
|
11
|
+
- **Canonical workflow branch events in `/workflow`.** Runtime traces now project branch start/end/failure counts into workflow summaries, RunRecords, and feedback trajectories so fanout topology failures are measurable instead of hidden in raw trace blobs.
|
|
12
|
+
- **`workflowPhaseGraph` in `/workflow`.** Builds phase nodes and branch edges from workflow trace events with per-phase calls, branch failures, cost, and token counters. Product adopters can consume this instead of maintaining local graph mirrors.
|
|
13
|
+
- **Stricter workflow event schema validation.** Workflow traces now reject unknown event kinds, malformed typed payloads, non-monotonic timestamps, missing `workflow.started`, multiple terminal events, and events after terminal completion.
|
|
14
|
+
- **Driver comparison substrate proof.** `compareDrivers` now carries analyst findings through the canonical campaign path and includes GSM8K/AppWorld driver backtest examples.
|
|
15
|
+
|
|
16
|
+
### Fixed
|
|
17
|
+
|
|
18
|
+
- **Publish skew guard.** PyPI publishing depends on successful npm publishing, and the npm publish job now checks registry authentication and `@tangle-network` package access before building or attempting a publish.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## [0.72.2] — 2026-06-01 — workflow driver promotion gates
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
|
|
26
|
+
- **`decideWorkflowDriverPromotion` in `/workflow`.** Compares a dynamic workflow driver against the reviewer-loop baseline using paired heldout `RunRecord`s keyed by `scenarioId::seed`, then fails closed on missing pairs, too few pairs, insufficient lift, or candidate cost ceilings.
|
|
27
|
+
- **Explicit workflow comparison axis.** `expectedScenarioIds` defines the promotion gate's comparison set so unrelated scenarios cannot skew the lift or confidence interval.
|
|
28
|
+
|
|
29
|
+
### Fixed
|
|
30
|
+
|
|
31
|
+
- **No seed-only workflow pairing.** Promotion records without `scenarioId` are rejected instead of being paired by seed alone.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## [0.72.1] — 2026-06-01 — workflow execution summaries for dynamic drivers
|
|
36
|
+
|
|
37
|
+
### Added
|
|
38
|
+
|
|
39
|
+
- **`summarizeWorkflowExecution` in `/workflow`.** Builds the canonical rich projection from a workflow trace: event-kind counts, phase order, agent and loop delegate summaries, verifier/analyst/reviewer checkpoint outputs, cost, tokens, and failure status.
|
|
40
|
+
- **Checkpoint output extraction.** Verifier, analyst, and reviewer traces preserve the returned output through `trace.checkpointOutput`, with `trace.output` accepted for compatibility.
|
|
41
|
+
|
|
42
|
+
### Fixed
|
|
43
|
+
|
|
44
|
+
- **npm/PyPI version lock.** The Python RPC package version is bumped back into lockstep with the npm package so the publish workflow can release both artifacts from one tag.
|
|
45
|
+
|
|
46
|
+
## [0.72.0] — 2026-05-31 — cost axis prices unpriced-at-source models (every run carries a real, labeled cost)
|
|
47
|
+
|
|
48
|
+
A live tax-agent full-loop run (real sandbox, `deepseek-v4-pro`, real tokens) exposed the second root of the cost-ledger split: the sandbox reported `totalCostUsd: 0` despite `17537` input / `622` output tokens — not a stub, not a mis-wired ledger, but a model the **source** can't rate. The cost / Pareto / `tokens_per_dollar` axes blanked even though the substrate's pricing table prices `deepseek` correctly; the table was simply never consulted on the matrix cost projection. A $0 cost on a run that burned real tokens reads as "free," which is the more misleading state.
|
|
49
|
+
|
|
50
|
+
### Fixed
|
|
51
|
+
|
|
52
|
+
- **`runProfileMatrix` prices measured tokens when the source reports $0.** Cost precedence is now explicit: **source-billed > token-estimated > none**. When `cell.costUsd === 0` and real output tokens flowed and the model is priced (`isModelPriced`), `buildRunRecord` sets the cost from `estimateCost(in, out, model)` (real published rate × real tokens) and stamps `raw.cost_estimated = 1`. A billed cost is never overridden; a model the table also can't rate stays $0 (no fabrication). The estimate flows into `record.costUsd`, so `byProfile.totalCostUsd`, `integrity.totalCostUsd`, and `tokens_per_dollar` / `cost_per_quality` all populate.
|
|
53
|
+
- **Every cost surface in the matrix result agrees.** The embedded `campaigns[id].aggregates.totalCostUsd` is reconciled to the priced total instead of runCampaign's raw `ctx.cost` ledger (which only sees the source's $0). No more two-`totalCostUsd`-that-disagree in one result.
|
|
54
|
+
- **Honest integrity diagnosis.** `summarizeBackendIntegrity`'s uncosted-records message now names **both** roots — mis-wired ledger OR unpriced-at-source model — and points at `estimateCost` for the latter, instead of asserting the ledger is broken.
|
|
55
|
+
|
|
56
|
+
Live proof: the same tax case that recorded `$0` now records **`$0.0059453`** (`17537 × 0.0003/1k + 622 × 0.0011/1k`, exact), `cost_estimated: 1`, `uncostedRecords: 0`, verdict `real`. Generalizes to every consumer of `runProfileMatrix`. New regression tests: priced-when-source-zero, billed-takes-precedence, truly-unpriced-stays-$0, campaign-aggregate-reconciled. Full suite (1663) green.
|
|
57
|
+
|
|
58
|
+
## [0.71.0] — 2026-05-31 — corpus-by-default + multi-dimensional capture (datasets as eval exhaust)
|
|
59
|
+
|
|
60
|
+
Every matrix run now emits a multi-dimensional, dataset-able record with no side-channel — the groundwork for "datasets gathered for free by running evals."
|
|
61
|
+
|
|
62
|
+
### Added
|
|
63
|
+
|
|
64
|
+
- **Multi-dim guardrail projection in `buildRunRecord`.** Each `RunRecord.outcome.raw` carries `cost_usd`, `tokens_input` / `tokens_output` (+ `tokens_cached` when present), `latency_ms`, and the guarded ratios `tokens_per_dollar` / `cost_per_quality`. RAW-ONLY — the composite stays the judge objective (anti-Goodhart); these are tracked + dashboarded + carried into datasets, never optimized.
|
|
65
|
+
- **Corpus-by-default via `corpusText`.** An optional `corpusText(artifact, scenario) => {prompt, completion}` stamps the trajectory text onto each record (the `CorpusRecord` shape), so a run is dataset-able with no side-channel. Fail-soft: a throwing extractor omits the text and keeps the graded record.
|
|
66
|
+
- **`appendToCorpus` / `readCorpus` / `buildDatasetFromCorpus`** (`src/rl/corpus.ts`) — append-only JSONL corpus (deduped by `runId`), with score/split filtering into a train/holdout dataset.
|
|
67
|
+
|
|
68
|
+
`buildRunRecord` is generic over `<TScenario, TArtifact>`; a `scenarioById` map threads each scenario into the projection.
|
|
69
|
+
|
|
7
70
|
## [0.70.0] — 2026-05-31 — error-grounded reflection (the driver targets real failures, not blind rewrites)
|
|
8
71
|
|
|
9
72
|
Adversarial verification on TWO domains (legal + tax, two worker models) found the same root cause: the gepaDriver's candidates **regressed** the baseline, so the gate correctly held — but nothing improved. The driver was reflecting on per-scenario *scores* only; the judge's `notes` (the "why it failed") were computed but **dropped** before the reflection. So it proposed generic rewrites a capable model already knows, which distract rather than help.
|
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-CnmZ2bkP.js';
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-Bba0vl1V.js';
|
|
2
2
|
import '../run-record-BgTFzO2r.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
|
package/dist/adapters/langchain.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-CnmZ2bkP.js';
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-Bba0vl1V.js';
|
|
2
2
|
import '../run-record-BgTFzO2r.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
package/dist/adapters/otel.d.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import '../types-CnmZ2bkP.js';
|
|
1
|
+
import { TraceSpanEvent, HostedClient } from '../hosted/index.js';
|
|
2
|
+
import '../types-Bba0vl1V.js';
|
|
3
3
|
import '../run-record-BgTFzO2r.js';
|
|
4
4
|
import '../errors-Dwqw-T_m.js';
|
|
5
5
|
import '../schema-m0gsnbt3.js';
|
|
6
|
+
import '../insight-report-Df3lxYXM.js';
|
|
6
7
|
import '../summary-report-ByiOUrHj.js';
|
|
7
8
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
8
9
|
import '../store-CKUAgsJz.js';
|
|
package/dist/agent-profile-DYRboYWu.d.ts
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
|
|
2
|
+
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
3
|
+
import { TCloud } from '@tangle-network/tcloud';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Backend-integrity guard: distinguish "agent failed" from "eval ran against
|
|
7
|
+
* a stub / unconfigured backend." Without this guard a canonical eval can
|
|
8
|
+
* silently report `0/N passed` and look like an agent-quality problem when
|
|
9
|
+
* the LLM was never actually called — the failure mode we just hit running
|
|
10
|
+
* the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
|
|
11
|
+
* char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
|
|
12
|
+
*
|
|
13
|
+
* The shape:
|
|
14
|
+
*
|
|
15
|
+
* const report = summarizeBackendIntegrity(records)
|
|
16
|
+
* assertRealBackend(records) // throws BackendIntegrityError if 100% stub
|
|
17
|
+
*
|
|
18
|
+
* A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
|
|
19
|
+
* (`costUsd` alone is unreliable — some backends successfully call LLMs but
|
|
20
|
+
* don't propagate pricing, producing real tokens with $0 cost.)
|
|
21
|
+
*
|
|
22
|
+
* Verdicts:
|
|
23
|
+
* - `real` — at least one record has nonzero token usage
|
|
24
|
+
* - `stub` — every record is stub-mode (eval ran blind)
|
|
25
|
+
* - `mixed` — some records real, some stub (partial backend failure;
|
|
26
|
+
* often the 429-cascade or auth-half-failed case)
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
interface BackendIntegrityReport {
|
|
30
|
+
/** Total records inspected. */
|
|
31
|
+
totalRecords: number;
|
|
32
|
+
/** Records with input=0 AND output=0 (a stub fingerprint). */
|
|
33
|
+
stubRecords: number;
|
|
34
|
+
/** Records with nonzero token usage (real LLM activity). */
|
|
35
|
+
realRecords: number;
|
|
36
|
+
/** Records where output>0 but costUsd=0 (real LLM; cost ledger mis-wired or model unpriced at source). */
|
|
37
|
+
uncostedRecords: number;
|
|
38
|
+
/** Sum of input tokens across all records. */
|
|
39
|
+
totalInputTokens: number;
|
|
40
|
+
/** Sum of output tokens across all records. */
|
|
41
|
+
totalOutputTokens: number;
|
|
42
|
+
/** Sum of costUsd across all records. */
|
|
43
|
+
totalCostUsd: number;
|
|
44
|
+
/** Worst-case integrity verdict. */
|
|
45
|
+
verdict: 'real' | 'mixed' | 'stub';
|
|
46
|
+
/** Human-readable diagnosis suitable for terminal output. */
|
|
47
|
+
diagnosis: string;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Error thrown when an integrity assertion fails. Caller can pattern-match
|
|
51
|
+
* by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
|
|
52
|
+
* errors.
|
|
53
|
+
*/
|
|
54
|
+
declare class BackendIntegrityError extends AgentEvalError {
|
|
55
|
+
readonly report: BackendIntegrityReport;
|
|
56
|
+
constructor(message: string, report: BackendIntegrityReport);
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* Inspect a batch of RunRecords and return an integrity report. Pure
|
|
60
|
+
* function — no I/O, no logging. The caller decides what to do with the
|
|
61
|
+
* verdict (print warning, throw, gate CI, etc.).
|
|
62
|
+
*/
|
|
63
|
+
declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
|
|
64
|
+
/**
|
|
65
|
+
* Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
|
|
66
|
+
* shows zero LLM activity. Strict callers can pass `{ allowMixed: false }`
|
|
67
|
+
* to also reject mixed verdicts (recommended for CI gates).
|
|
68
|
+
*
|
|
69
|
+
* Real backends pass through silently.
|
|
70
|
+
*/
|
|
71
|
+
declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
|
|
72
|
+
allowMixed?: boolean;
|
|
73
|
+
}): BackendIntegrityReport;
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Artifact validators.
|
|
77
|
+
*
|
|
78
|
+
* Generic "score a produced artifact" primitive. Tax uses it for PDF form
|
|
79
|
+
* correctness, research for sourced briefs, browser for task assertions, coding
|
|
80
|
+
* for social posts. One interface, many validators; all plug into
|
|
81
|
+
* `BenchmarkRunner` the same way.
|
|
82
|
+
*
|
|
83
|
+
* A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
|
|
84
|
+
* plus a `ValidationContext` (scenario id, the turns that produced it) and
|
|
85
|
+
* returns a `ValidationResult` with pass/fail + 0..1 score + structured
|
|
86
|
+
* issues.
|
|
87
|
+
*/
|
|
88
|
+
interface Artifact {
|
|
89
|
+
/** Logical kind — validators type-guard on this */
|
|
90
|
+
kind: 'file' | 'json' | 'text' | 'binary' | string;
|
|
91
|
+
/** Filesystem-style path, optional */
|
|
92
|
+
path?: string;
|
|
93
|
+
/** String content for text/json/file kinds */
|
|
94
|
+
content?: string;
|
|
95
|
+
/** Binary content (if kind === 'binary') */
|
|
96
|
+
bytes?: Uint8Array;
|
|
97
|
+
/** Caller-supplied metadata (mimeType, sha256, size, etc.) */
|
|
98
|
+
metadata?: Record<string, unknown>;
|
|
99
|
+
}
|
|
100
|
+
interface ValidationContext {
|
|
101
|
+
scenarioId: string;
|
|
102
|
+
turnIndex?: number;
|
|
103
|
+
/** Prior artifacts for multi-artifact scenarios */
|
|
104
|
+
priorArtifacts?: Artifact[];
|
|
105
|
+
/** Free-form hints the validator uses for domain-specific checks */
|
|
106
|
+
hints?: Record<string, unknown>;
|
|
107
|
+
}
|
|
108
|
+
interface ValidationIssue {
|
|
109
|
+
severity: 'error' | 'warning' | 'info';
|
|
110
|
+
message: string;
|
|
111
|
+
/** Optional path into the artifact (e.g. JSON path or byte offset) */
|
|
112
|
+
locus?: string;
|
|
113
|
+
}
|
|
114
|
+
interface ValidationResult {
|
|
115
|
+
pass: boolean;
|
|
116
|
+
/** 0–1 normalized score. Validators should be monotonic in pass-ness. */
|
|
117
|
+
score: number;
|
|
118
|
+
issues: ValidationIssue[];
|
|
119
|
+
/** Diagnostic payload for reporters */
|
|
120
|
+
evidence?: Record<string, unknown>;
|
|
121
|
+
}
|
|
122
|
+
interface ArtifactValidator {
|
|
123
|
+
/** Stable identifier for the validator; appears in reports. */
|
|
124
|
+
name: string;
|
|
125
|
+
/** Optional description for human-facing reports. */
|
|
126
|
+
description?: string;
|
|
127
|
+
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
128
|
+
validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Run every validator on the same artifact; aggregate pass as AND, score as
|
|
132
|
+
* (weighted) mean, issues concatenated. Weights default to 1 each.
|
|
133
|
+
*/
|
|
134
|
+
declare function composeValidators(validators: ArtifactValidator[], options?: {
|
|
135
|
+
name?: string;
|
|
136
|
+
weights?: number[];
|
|
137
|
+
}): ArtifactValidator;
|
|
138
|
+
/** Pass if the artifact body matches a provided regex. */
|
|
139
|
+
declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
|
|
140
|
+
/** Pass if JSON parses and every required key is present. */
|
|
141
|
+
declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
|
|
142
|
+
/** Pass if min ≤ byte length ≤ max. */
|
|
143
|
+
declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
|
|
144
|
+
/** Pass if the artifact contains every required substring (case-insensitive by default). */
|
|
145
|
+
declare function containsAll(name: string, required: string[], options?: {
|
|
146
|
+
caseSensitive?: boolean;
|
|
147
|
+
}): ArtifactValidator;
|
|
148
|
+
|
|
149
|
+
/**
|
|
150
|
+
* Completion verifier — the task-completion oracle.
|
|
151
|
+
*
|
|
152
|
+
* Answers the only eval question that is not a proxy: did the agent actually
|
|
153
|
+
* COMPLETE the task — produce every required deliverable, persisted and
|
|
154
|
+
* correct — rather than describe what should be done. A fluent transcript
|
|
155
|
+
* that never produces the artifact scores zero here.
|
|
156
|
+
*
|
|
157
|
+
* Per requirement, a two-stage check:
|
|
158
|
+
* 1. Structural — a produced item (vault artifact / approved proposal /
|
|
159
|
+
* tool call) of the right kind is matched against the requirement and
|
|
160
|
+
* carries non-empty content. Deterministic; no LLM.
|
|
161
|
+
* 2. Correctness — only if structurally present AND the matched item
|
|
162
|
+
* carries content, one targeted check decides whether that item
|
|
163
|
+
* actually fulfils the requirement. A hallucinated artifact fails here;
|
|
164
|
+
* an absent one already failed stage 1.
|
|
165
|
+
*
|
|
166
|
+
* `completionRate` is satisfied / total. Quality dimensions are meaningless
|
|
167
|
+
* on an incomplete task — callers gate on `fullyComplete` / `completionRate`
|
|
168
|
+
* before scoring quality.
|
|
169
|
+
*/
|
|
170
|
+
|
|
171
|
+
/** What kind of produced state can satisfy a requirement structurally. */
|
|
172
|
+
type SatisfiedBy = 'artifact' | 'proposal' | 'tool-call' | 'any';
|
|
173
|
+
interface CompletionRequirement {
|
|
174
|
+
/** Stable id from the task gold (e.g. a persona's `expected_requirements[].req_id`). */
|
|
175
|
+
reqId: string;
|
|
176
|
+
/** Human-readable description of the required deliverable. */
|
|
177
|
+
title: string;
|
|
178
|
+
/** Optional kind/category hint, matched against a produced item's kind. */
|
|
179
|
+
category?: string;
|
|
180
|
+
/** What produced state satisfies this requirement. Defaults to 'any'. */
|
|
181
|
+
satisfiedBy?: SatisfiedBy;
|
|
182
|
+
}
|
|
183
|
+
interface TaskGold {
|
|
184
|
+
taskId: string;
|
|
185
|
+
requirements: CompletionRequirement[];
|
|
186
|
+
}
|
|
187
|
+
interface ProducedProposal {
|
|
188
|
+
id: string;
|
|
189
|
+
title: string;
|
|
190
|
+
status: 'pending' | 'approved' | 'rejected';
|
|
191
|
+
/** Optional persisted body — when present, enables a correctness check. */
|
|
192
|
+
content?: string;
|
|
193
|
+
}
|
|
194
|
+
/** Everything observable about what a run actually produced. */
|
|
195
|
+
interface ProducedState {
|
|
196
|
+
/** Persisted vault artifacts. Reuses the shared `Artifact` shape. */
|
|
197
|
+
artifacts: Artifact[];
|
|
198
|
+
/** Proposals / filings the agent created. */
|
|
199
|
+
proposals: ProducedProposal[];
|
|
200
|
+
/** Names of tools the agent invoked. */
|
|
201
|
+
toolCalls: string[];
|
|
202
|
+
}
|
|
203
|
+
interface RequirementCheck {
|
|
204
|
+
reqId: string;
|
|
205
|
+
title: string;
|
|
206
|
+
/** A produced item of the right kind matched the requirement, non-empty. */
|
|
207
|
+
structurallyPresent: boolean;
|
|
208
|
+
/**
|
|
209
|
+
* Whether the matched item actually fulfils the requirement. `null` when
|
|
210
|
+
* not structurally present, or when the matched item carries no content
|
|
211
|
+
* to assess.
|
|
212
|
+
*/
|
|
213
|
+
correct: boolean | null;
|
|
214
|
+
/** structurallyPresent && correct !== false. */
|
|
215
|
+
satisfied: boolean;
|
|
216
|
+
/** Human-readable evidence for the verdict. */
|
|
217
|
+
evidence: string[];
|
|
218
|
+
}
|
|
219
|
+
interface CompletionVerdict {
|
|
220
|
+
taskId: string;
|
|
221
|
+
requirements: RequirementCheck[];
|
|
222
|
+
/** satisfied / total requirements. */
|
|
223
|
+
completionRate: number;
|
|
224
|
+
/** Every requirement satisfied. */
|
|
225
|
+
fullyComplete: boolean;
|
|
226
|
+
}
|
|
227
|
+
/**
|
|
228
|
+
* Decides whether a produced item's content actually fulfils a requirement.
|
|
229
|
+
* Injected so the structural verifier stays pure and unit-testable; the
|
|
230
|
+
* production implementation is `createLlmCorrectnessChecker`.
|
|
231
|
+
*/
|
|
232
|
+
type CorrectnessChecker = (requirement: CompletionRequirement, content: string) => Promise<{
|
|
233
|
+
correct: boolean;
|
|
234
|
+
reason: string;
|
|
235
|
+
}>;
|
|
236
|
+
/**
|
|
237
|
+
* Verify whether a run completed the task. `checkCorrectness` is injected —
|
|
238
|
+
* `createLlmCorrectnessChecker` for production, a deterministic stub in tests.
|
|
239
|
+
*
|
|
240
|
+
* Throws on a gold spec with no requirements: an eval task that requires
|
|
241
|
+
* nothing is a misconfiguration, not a vacuously-complete task.
|
|
242
|
+
*/
|
|
243
|
+
declare function verifyCompletion(gold: TaskGold, state: ProducedState, checkCorrectness: CorrectnessChecker): Promise<CompletionVerdict>;
|
|
244
|
+
interface LlmCorrectnessCheckerOpts {
|
|
245
|
+
model?: string;
|
|
246
|
+
/** Max chars of artifact content sent to the checker. */
|
|
247
|
+
maxContentChars?: number;
|
|
248
|
+
}
|
|
249
|
+
/** Parse the correctness checker's model response. Fails loud on a bad shape. */
|
|
250
|
+
declare function parseCorrectnessResponse(raw: string): {
|
|
251
|
+
correct: boolean;
|
|
252
|
+
reason: string;
|
|
253
|
+
};
|
|
254
|
+
/**
|
|
255
|
+
* Production `CorrectnessChecker` — one LLM call per matched artifact,
|
|
256
|
+
* deterministic (temperature 0), structured JSON out. Judges fulfilment
|
|
257
|
+
* only: a plan, a gesture, or a description of what should be done does not
|
|
258
|
+
* fulfil a requirement — the artifact must BE the deliverable.
|
|
259
|
+
*/
|
|
260
|
+
declare function createLlmCorrectnessChecker(tc: TCloud, opts?: LlmCorrectnessCheckerOpts): CorrectnessChecker;
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* Produced-state extraction — normalize a run's runtime event stream into the
|
|
264
|
+
* typed `ProducedState` the completion oracle consumes.
|
|
265
|
+
*
|
|
266
|
+
* `ProducedState` answers "what did the agent actually produce" — vault
|
|
267
|
+
* artifacts, proposals, tool calls. The runtime emits these as a stream of
|
|
268
|
+
* events; this module is the single normalization point from that stream to
|
|
269
|
+
* the shape `verifyCompletion` expects.
|
|
270
|
+
*
|
|
271
|
+
* Input is structurally typed (`RuntimeEventLike`) so this module does not
|
|
272
|
+
* depend on agent-runtime — agent-runtime's `RuntimeStreamEvent` satisfies it
|
|
273
|
+
* structurally. The `content` on `ArtifactEventLike` and the whole
|
|
274
|
+
* `proposal_created` variant are the runtime-side enrichments this contract
|
|
275
|
+
* requires; the runtime emits them, this module consumes them.
|
|
276
|
+
*/
|
|
277
|
+
|
|
278
|
+
/** A tool the agent invoked. */
|
|
279
|
+
interface ToolCallEventLike {
|
|
280
|
+
type: 'tool_call';
|
|
281
|
+
toolName: string;
|
|
282
|
+
}
|
|
283
|
+
/**
|
|
284
|
+
* An artifact the agent produced. `content` is the enriched field — the
|
|
285
|
+
* runtime's base `artifact` event carries only metadata; the completion
|
|
286
|
+
* oracle needs the body to verify the deliverable, so the runtime emits it.
|
|
287
|
+
*/
|
|
288
|
+
interface ArtifactEventLike {
|
|
289
|
+
type: 'artifact';
|
|
290
|
+
artifactId: string;
|
|
291
|
+
name?: string;
|
|
292
|
+
mimeType?: string;
|
|
293
|
+
uri?: string;
|
|
294
|
+
content?: string;
|
|
295
|
+
}
|
|
296
|
+
/** A proposal / filing the agent created. */
|
|
297
|
+
interface ProposalEventLike {
|
|
298
|
+
type: 'proposal_created';
|
|
299
|
+
proposalId: string;
|
|
300
|
+
title: string;
|
|
301
|
+
status?: 'pending' | 'approved' | 'rejected';
|
|
302
|
+
}
|
|
303
|
+
/**
|
|
304
|
+
* The subset of runtime stream events `extractProducedState` consumes.
|
|
305
|
+
* agent-runtime's full `RuntimeStreamEvent` union satisfies this structurally;
|
|
306
|
+
* the `{ type: string }` catch-all keeps the input permissive so callers can
|
|
307
|
+
* pass the whole unfiltered telemetry stream — unrecognized events are skipped.
|
|
308
|
+
*/
|
|
309
|
+
type RuntimeEventLike = ToolCallEventLike | ArtifactEventLike | ProposalEventLike | {
|
|
310
|
+
type: string;
|
|
311
|
+
};
|
|
312
|
+
/**
|
|
313
|
+
* Normalize a run's runtime event stream into `ProducedState`.
|
|
314
|
+
*
|
|
315
|
+
* Pure and total — unrecognized event types are skipped. `toolCalls` is
|
|
316
|
+
* deduplicated by name in first-seen order (completion cares about a tool's
|
|
317
|
+
* presence, not its call count). An artifact with neither a name nor a uri
|
|
318
|
+
* still yields an entry keyed by its `artifactId` so it is never silently
|
|
319
|
+
* dropped; an artifact with no `content` yields empty content, which the
|
|
320
|
+
* completion oracle's structural check then rejects on its own.
|
|
321
|
+
*/
|
|
322
|
+
declare function extractProducedState(events: readonly RuntimeEventLike[]): ProducedState;
|
|
323
|
+
|
|
324
|
+
/**
|
|
325
|
+
* @stable
|
|
326
|
+
*
|
|
327
|
+
* AgentProfile — the eval harness's unit of variation.
|
|
328
|
+
*
|
|
329
|
+
* A profile pins everything that changes agent behaviour for a benchmark
|
|
330
|
+
* cell: the model, the active skills, the prompt version, the available
|
|
331
|
+
* tools. Vary the profile — swap a model, add a skill — and re-run the suite
|
|
332
|
+
* to benchmark the change. The scorecard keys a cell on
|
|
333
|
+
* `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
|
|
334
|
+
* inside the profile, and two profiles with the same model but different
|
|
335
|
+
* skills are different cells.
|
|
336
|
+
*
|
|
337
|
+
* `agentProfileHash` is the profile's behaviour identity. Two profiles that
|
|
338
|
+
* produce the same agent behaviour share a hash (and a scorecard cell);
|
|
339
|
+
* reordering `skills` or `tools` does not change it; the human-facing `id`
|
|
340
|
+
* label does not affect it.
|
|
341
|
+
*/
|
|
342
|
+
interface AgentProfile {
|
|
343
|
+
/** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
|
|
344
|
+
id: string;
|
|
345
|
+
/** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
|
|
346
|
+
model: string;
|
|
347
|
+
/** Skill ids/versions active in this profile — the primary behaviour lever. */
|
|
348
|
+
skills?: string[];
|
|
349
|
+
/** Prompt version identifier. */
|
|
350
|
+
promptVersion?: string;
|
|
351
|
+
/** Tool ids available to the agent. */
|
|
352
|
+
tools?: string[];
|
|
353
|
+
/** Any other behaviour-bearing knobs that should fingerprint into the hash. */
|
|
354
|
+
metadata?: Record<string, string | number | boolean>;
|
|
355
|
+
}
|
|
356
|
+
/**
|
|
357
|
+
* Deterministic behaviour identity of a profile — a sha256 over the
|
|
358
|
+
* behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
|
|
359
|
+
* `id` label is excluded. Throws on a profile with no `model` — an unkeyable
|
|
360
|
+
* profile must fail loud rather than collapse into a blank-model cell.
|
|
361
|
+
*/
|
|
362
|
+
declare function agentProfileHash(profile: AgentProfile): string;
|
|
363
|
+
|
|
364
|
+
export { type AgentProfile as A, type BackendIntegrityReport as B, type CompletionRequirement as C, type LlmCorrectnessCheckerOpts as L, type ProducedState as P, type RuntimeEventLike as R, type SatisfiedBy as S, type TaskGold as T, type ValidationContext as V, type CompletionVerdict as a, type CorrectnessChecker as b, type Artifact as c, type ArtifactEventLike as d, type ArtifactValidator as e, BackendIntegrityError as f, type ProducedProposal as g, type ProposalEventLike as h, type RequirementCheck as i, type ToolCallEventLike as j, type ValidationIssue as k, type ValidationResult as l, agentProfileHash as m, assertRealBackend as n, byteLengthRange as o, composeValidators as p, containsAll as q, createLlmCorrectnessChecker as r, extractProducedState as s, jsonHasKeys as t, parseCorrectnessResponse as u, regexMatch as v, summarizeBackendIntegrity as w, verifyCompletion as x };
|