@tangle-network/agent-eval 0.72.0 → 0.72.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +485 -9
- package/dist/campaign/index.js +597 -22
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
- package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +13 -7
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +353 -2496
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-SL55X4VN.js +0 -186
- package/dist/chunk-SL55X4VN.js.map +0 -1
- package/dist/chunk-UD6EF73X.js.map +0 -1
- package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.72.0",
|
|
3
|
+
"version": "0.72.3",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -19,6 +19,11 @@
|
|
|
19
19
|
"import": "./dist/index.js",
|
|
20
20
|
"default": "./dist/index.js"
|
|
21
21
|
},
|
|
22
|
+
"./analyst": {
|
|
23
|
+
"types": "./dist/analyst/index.d.ts",
|
|
24
|
+
"import": "./dist/analyst/index.js",
|
|
25
|
+
"default": "./dist/analyst/index.js"
|
|
26
|
+
},
|
|
22
27
|
"./control": {
|
|
23
28
|
"types": "./dist/control.d.ts",
|
|
24
29
|
"import": "./dist/control.js",
|
|
@@ -104,6 +109,11 @@
|
|
|
104
109
|
"import": "./dist/campaign/index.js",
|
|
105
110
|
"default": "./dist/campaign/index.js"
|
|
106
111
|
},
|
|
112
|
+
"./workflow": {
|
|
113
|
+
"types": "./dist/workflow/index.d.ts",
|
|
114
|
+
"import": "./dist/workflow/index.js",
|
|
115
|
+
"default": "./dist/workflow/index.js"
|
|
116
|
+
},
|
|
107
117
|
"./contract": {
|
|
108
118
|
"types": "./dist/contract/index.d.ts",
|
|
109
119
|
"import": "./dist/contract/index.js",
|
|
@@ -144,6 +154,19 @@
|
|
|
144
154
|
"publishConfig": {
|
|
145
155
|
"access": "public"
|
|
146
156
|
},
|
|
157
|
+
"scripts": {
|
|
158
|
+
"build": "tsup && pnpm openapi",
|
|
159
|
+
"dev": "tsup --watch",
|
|
160
|
+
"prepare": "husky",
|
|
161
|
+
"prepublishOnly": "pnpm build",
|
|
162
|
+
"test": "vitest run",
|
|
163
|
+
"test:watch": "vitest",
|
|
164
|
+
"typecheck": "tsc --noEmit",
|
|
165
|
+
"lint": "biome check src",
|
|
166
|
+
"format": "biome format --write src",
|
|
167
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json",
|
|
168
|
+
"verify:package": "node scripts/verify-package-exports.mjs"
|
|
169
|
+
},
|
|
147
170
|
"dependencies": {
|
|
148
171
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
149
172
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -171,6 +194,16 @@
|
|
|
171
194
|
"typescript": "^5.7.0",
|
|
172
195
|
"vitest": "^3.0.0"
|
|
173
196
|
},
|
|
197
|
+
"pnpm": {
|
|
198
|
+
"minimumReleaseAge": 4320,
|
|
199
|
+
"minimumReleaseAgeExclude": [
|
|
200
|
+
"@tangle-network/sandbox"
|
|
201
|
+
],
|
|
202
|
+
"overrides": {
|
|
203
|
+
"postcss@<8.5.10": "^8.5.10",
|
|
204
|
+
"ws@>=8.0.0 <8.20.1": "^8.20.1"
|
|
205
|
+
}
|
|
206
|
+
},
|
|
174
207
|
"engines": {
|
|
175
208
|
"node": ">=20"
|
|
176
209
|
},
|
|
@@ -183,14 +216,5 @@
|
|
|
183
216
|
]
|
|
184
217
|
},
|
|
185
218
|
"license": "MIT",
|
|
186
|
-
"scripts": {
|
|
187
|
-
|
|
188
|
-
"dev": "tsup --watch",
|
|
189
|
-
"test": "vitest run",
|
|
190
|
-
"test:watch": "vitest",
|
|
191
|
-
"typecheck": "tsc --noEmit",
|
|
192
|
-
"lint": "biome check src",
|
|
193
|
-
"format": "biome format --write src",
|
|
194
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
195
|
-
}
|
|
196
|
-
}
|
|
219
|
+
"packageManager": "pnpm@10.22.0"
|
|
220
|
+
}
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
|
|
2
|
-
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* Backend-integrity guard: distinguish "agent failed" from "eval ran against
|
|
6
|
-
* a stub / unconfigured backend." Without this guard a canonical eval can
|
|
7
|
-
* silently report `0/N passed` and look like an agent-quality problem when
|
|
8
|
-
* the LLM was never actually called — the failure mode we just hit running
|
|
9
|
-
* the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
|
|
10
|
-
* char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
|
|
11
|
-
*
|
|
12
|
-
* The shape:
|
|
13
|
-
*
|
|
14
|
-
* const report = summarizeBackendIntegrity(records)
|
|
15
|
-
* assertRealBackend(records) // throws BackendIntegrityError if 100% stub
|
|
16
|
-
*
|
|
17
|
-
* A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
|
|
18
|
-
* (`costUsd` alone is unreliable — some backends successfully call LLMs but
|
|
19
|
-
* don't propagate pricing, producing real tokens with $0 cost.)
|
|
20
|
-
*
|
|
21
|
-
* Verdicts:
|
|
22
|
-
* - `real` — at least one record has nonzero token usage
|
|
23
|
-
* - `stub` — every record is stub-mode (eval ran blind)
|
|
24
|
-
* - `mixed` — some records real, some stub (partial backend failure;
|
|
25
|
-
* often the 429-cascade or auth-half-failed case)
|
|
26
|
-
*/
|
|
27
|
-
|
|
28
|
-
interface BackendIntegrityReport {
|
|
29
|
-
/** Total records inspected. */
|
|
30
|
-
totalRecords: number;
|
|
31
|
-
/** Records with input=0 AND output=0 (a stub fingerprint). */
|
|
32
|
-
stubRecords: number;
|
|
33
|
-
/** Records with nonzero token usage (real LLM activity). */
|
|
34
|
-
realRecords: number;
|
|
35
|
-
/** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */
|
|
36
|
-
uncostedRecords: number;
|
|
37
|
-
/** Sum of input tokens across all records. */
|
|
38
|
-
totalInputTokens: number;
|
|
39
|
-
/** Sum of output tokens across all records. */
|
|
40
|
-
totalOutputTokens: number;
|
|
41
|
-
/** Sum of costUsd across all records. */
|
|
42
|
-
totalCostUsd: number;
|
|
43
|
-
/** Worst-case integrity verdict. */
|
|
44
|
-
verdict: 'real' | 'mixed' | 'stub';
|
|
45
|
-
/** Human-readable diagnosis suitable for terminal output. */
|
|
46
|
-
diagnosis: string;
|
|
47
|
-
}
|
|
48
|
-
/**
|
|
49
|
-
* Error thrown when an integrity assertion fails. Caller can pattern-match
|
|
50
|
-
* by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
|
|
51
|
-
* errors.
|
|
52
|
-
*/
|
|
53
|
-
declare class BackendIntegrityError extends AgentEvalError {
|
|
54
|
-
readonly report: BackendIntegrityReport;
|
|
55
|
-
constructor(message: string, report: BackendIntegrityReport);
|
|
56
|
-
}
|
|
57
|
-
/**
|
|
58
|
-
* Inspect a batch of RunRecords and return an integrity report. Pure
|
|
59
|
-
* function — no I/O, no logging. The caller decides what to do with the
|
|
60
|
-
* verdict (print warning, throw, gate CI, etc.).
|
|
61
|
-
*/
|
|
62
|
-
declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
|
|
63
|
-
/**
|
|
64
|
-
* Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
|
|
65
|
-
* shows zero LLM activity. Non-strict callers can pass `{ allowMixed: false }`
|
|
66
|
-
* to also reject mixed verdicts (recommended for CI gates).
|
|
67
|
-
*
|
|
68
|
-
* Real backends pass through silently.
|
|
69
|
-
*/
|
|
70
|
-
declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
|
|
71
|
-
allowMixed?: boolean;
|
|
72
|
-
}): BackendIntegrityReport;
|
|
73
|
-
|
|
74
|
-
/**
|
|
75
|
-
* @stable
|
|
76
|
-
*
|
|
77
|
-
* AgentProfile — the eval harness's unit of variation.
|
|
78
|
-
*
|
|
79
|
-
* A profile pins everything that changes agent behaviour for a benchmark
|
|
80
|
-
* cell: the model, the active skills, the prompt version, the available
|
|
81
|
-
* tools. Vary the profile — swap a model, add a skill — and re-run the suite
|
|
82
|
-
* to benchmark the change. The scorecard keys a cell on
|
|
83
|
-
* `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
|
|
84
|
-
* inside the profile, and two profiles with the same model but different
|
|
85
|
-
* skills are different cells.
|
|
86
|
-
*
|
|
87
|
-
* `agentProfileHash` is the profile's behaviour identity. Two profiles that
|
|
88
|
-
* produce the same agent behaviour share a hash (and a scorecard cell);
|
|
89
|
-
* reordering `skills` or `tools` does not change it; the human-facing `id`
|
|
90
|
-
* label does not affect it.
|
|
91
|
-
*/
|
|
92
|
-
interface AgentProfile {
|
|
93
|
-
/** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
|
|
94
|
-
id: string;
|
|
95
|
-
/** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
|
|
96
|
-
model: string;
|
|
97
|
-
/** Skill ids/versions active in this profile — the primary behaviour lever. */
|
|
98
|
-
skills?: string[];
|
|
99
|
-
/** Prompt version identifier. */
|
|
100
|
-
promptVersion?: string;
|
|
101
|
-
/** Tool ids available to the agent. */
|
|
102
|
-
tools?: string[];
|
|
103
|
-
/** Any other behaviour-bearing knobs that should fingerprint into the hash. */
|
|
104
|
-
metadata?: Record<string, string | number | boolean>;
|
|
105
|
-
}
|
|
106
|
-
/**
|
|
107
|
-
* Deterministic behaviour identity of a profile — a sha256 over the
|
|
108
|
-
* behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
|
|
109
|
-
* `id` label is excluded. Throws on a profile with no `model` — an unkeyable
|
|
110
|
-
* profile must fail loud rather than collapse into a blank-model cell.
|
|
111
|
-
*/
|
|
112
|
-
declare function agentProfileHash(profile: AgentProfile): string;
|
|
113
|
-
|
|
114
|
-
export { type AgentProfile as A, type BackendIntegrityReport as B, BackendIntegrityError as a, agentProfileHash as b, assertRealBackend as c, summarizeBackendIntegrity as s };
|