@tangle-network/agent-eval 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -0
- package/README.md +141 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OHEPNJQN.js +554 -0
- package/dist/chunk-OHEPNJQN.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
- package/dist/chunk-SY6WAAAD.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
- package/dist/chunk-VRJVTXRV.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +1866 -3151
- package/dist/index.js +5457 -7809
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +409 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-TDPn1cxq.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +22 -22
- package/dist/wire/index.js +4 -3
- package/package.json +35 -2
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-4W4NCYM2.js +0 -1945
- package/dist/chunk-4W4NCYM2.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
2
|
+
import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* SandboxHarness — executes a scenario in an isolated environment and
|
|
6
|
+
* emits a rich SandboxSpan into the trace.
|
|
7
|
+
*
|
|
8
|
+
* Two built-in drivers:
|
|
9
|
+
* - `SubprocessSandboxDriver` — spawn in a local cwd with env vars.
|
|
10
|
+
* Fast, no dependencies, fine for unit tests and most CI gates.
|
|
11
|
+
* - `DockerSandboxDriver` — lifted from tangle-router's sandbox path;
|
|
12
|
+
* shells out to `docker run`. Stronger isolation, slower startup.
|
|
13
|
+
*
|
|
14
|
+
* Consumers implement `SandboxDriver` for custom backends (Firecracker,
|
|
15
|
+
* Cloudflare sandbox product, etc.). The harness doesn't care which.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
 * Per-run harness settings: the optional setup/run/test commands plus the
 * execution options. Accepted by `SandboxHarness.run` and by every
 * `SandboxDriver.exec` call, and embedded in `TestGradedScenario.harness`.
 */
interface HarnessConfig {
|
|
19
|
+
/** Setup command (e.g. "pnpm install"). Non-zero exit fails the run. */
|
|
20
|
+
setupCommand?: string;
|
|
21
|
+
/** Run command (e.g. "pnpm build"). */
|
|
22
|
+
runCommand?: string;
|
|
23
|
+
/** Test command (e.g. "pnpm test --run"). Drives the test count + pass count. */
|
|
24
|
+
testCommand?: string;
|
|
25
|
+
/** Absolute cwd for the subprocess driver. Ignored by docker driver. */
|
|
26
|
+
cwd?: string;
|
|
27
|
+
/** Max wall-clock per phase in ms. Default 10 minutes. */
|
|
28
|
+
timeoutMs?: number;
|
|
29
|
+
/** Image for the docker driver. */
|
|
30
|
+
image?: string;
|
|
31
|
+
/** Extra env vars (validated; shell-escaped). */
|
|
32
|
+
env?: Record<string, string>;
|
|
33
|
+
/** Parser for the test output — maps stdout/stderr/exit code → `{ testsTotal, testsPassed }` (or `undefined`). */
|
|
34
|
+
testParser?: TestOutputParser;
|
|
35
|
+
}
|
|
36
|
+
/** Strategy object that extracts test counts from a test command's output. */
interface TestOutputParser {
|
|
37
|
+
/** Identifier for this parser. */
id: string;
|
|
38
|
+
/**
 * Derives test counts from a finished command's stdout, stderr and exit code.
 * Returns `undefined` instead of counts in some cases.
 * NOTE(review): presumably when the output is not in this parser's format — confirm against the implementation.
 */
parse(stdout: string, stderr: string, exitCode: number): {
|
|
39
|
+
testsTotal: number;
|
|
40
|
+
testsPassed: number;
|
|
41
|
+
} | undefined;
|
|
42
|
+
}
|
|
43
|
+
/** Result of executing a single harness phase, as resolved by `SandboxDriver.exec`. */
interface SandboxResult {
|
|
44
|
+
/** Which of the three harness phases produced this result. */
phase: 'setup' | 'run' | 'test';
|
|
45
|
+
exitCode: number;
|
|
46
|
+
stdout: string;
|
|
47
|
+
stderr: string;
|
|
48
|
+
/** NOTE(review): presumably wall-clock duration of the phase in milliseconds — confirm. */
wallMs: number;
|
|
49
|
+
/** NOTE(review): presumably only set for the test phase when a parser produced counts — confirm. */
testsTotal?: number;
|
|
50
|
+
testsPassed?: number;
|
|
51
|
+
}
|
|
52
|
+
/**
 * Execution backend contract. Consumers implement this for custom backends;
 * `SubprocessSandboxDriver` and `DockerSandboxDriver` below both implement it.
 */
interface SandboxDriver {
|
|
53
|
+
/** Identifier for this driver. */
id: string;
|
|
54
|
+
/** Executes `command` for the given phase under `config` and resolves with that phase's `SandboxResult`. */
exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
|
|
55
|
+
}
|
|
56
|
+
/** `TestOutputParser` for Vitest's default summary line: "Tests X passed | Y failed". */
|
|
57
|
+
declare const vitestTestParser: TestOutputParser;
|
|
58
|
+
/** `TestOutputParser` for Pytest's default output: "collected N items" + " X passed, Y failed". */
|
|
59
|
+
declare const pytestTestParser: TestOutputParser;
|
|
60
|
+
/** `TestOutputParser` for Jest's summary line: "Tests: X passed, Y total" (and optional failed). */
|
|
61
|
+
declare const jestTestParser: TestOutputParser;
|
|
62
|
+
/** Composite parser — combines several `TestOutputParser`s into one that tries them in order. NOTE(review): presumably the first defined result wins — confirm. */
|
|
63
|
+
declare function composeParsers(...parsers: TestOutputParser[]): TestOutputParser;
|
|
64
|
+
/**
 * Constructor options for `SubprocessSandboxDriver`: driver-level defaults
 * that apply to every `exec` call unless the per-call `HarnessConfig`
 * overrides them.
 */
interface SubprocessSandboxDriverOptions {
|
|
65
|
+
/**
|
|
66
|
+
* Default cwd for all `exec` calls. Used when the per-call `HarnessConfig`
|
|
67
|
+
* does not set its own `cwd`. Lets callers bind the driver to a working
|
|
68
|
+
* directory once instead of spreading cwd into every harness config —
|
|
69
|
+
* useful when the harness config is constructed far from the call site
|
|
70
|
+
* (e.g. starter-foundry's promoter passes a static HarnessConfig per
|
|
71
|
+
* family taxonomy but needs a per-run composed-scaffold cwd).
|
|
72
|
+
*/
|
|
73
|
+
cwd?: string;
|
|
74
|
+
/**
|
|
75
|
+
* Default env merged into every `exec` call's env (per-call `HarnessConfig.env`
|
|
76
|
+
* still wins on key collision). Same ergonomic rationale as `cwd` above.
|
|
77
|
+
*/
|
|
78
|
+
env?: Record<string, string>;
|
|
79
|
+
}
|
|
80
|
+
/**
 * `SandboxDriver` described in the module header as spawning in a local cwd
 * with env vars. Optional driver-level defaults are supplied through
 * `SubprocessSandboxDriverOptions`.
 */
declare class SubprocessSandboxDriver implements SandboxDriver {
|
|
81
|
+
id: string;
|
|
82
|
+
/** NOTE(review): presumably holds `options.cwd` — only the declaration is visible here, confirm. */
private defaultCwd?;
|
|
83
|
+
/** NOTE(review): presumably holds `options.env` — confirm. */
private defaultEnv?;
|
|
84
|
+
/** @param options Optional driver-level defaults (cwd, env). */
constructor(options?: SubprocessSandboxDriverOptions);
|
|
85
|
+
/** Executes `command` for the given phase and resolves with its `SandboxResult`. */
exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
|
|
86
|
+
}
|
|
87
|
+
/**
 * `SandboxDriver` described in the module header as shelling out to
 * `docker run`. `HarnessConfig.image` is documented as the image for this
 * driver, and `HarnessConfig.cwd` as ignored by it.
 */
declare class DockerSandboxDriver implements SandboxDriver {
|
|
88
|
+
id: string;
|
|
89
|
+
/** Executes `command` for the given phase and resolves with its `SandboxResult`. */
exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
|
|
90
|
+
}
|
|
91
|
+
/** Aggregate outcome resolved by `SandboxHarness.run`: an overall verdict plus one optional `SandboxResult` per phase. */
interface SandboxHarnessResult {
|
|
92
|
+
passed: boolean;
|
|
93
|
+
/** NOTE(review): the per-phase results are optional — presumably absent when that phase's command was not configured or not reached; confirm. */
setup?: SandboxResult;
|
|
94
|
+
run?: SandboxResult;
|
|
95
|
+
test?: SandboxResult;
|
|
96
|
+
/** NOTE(review): presumably the summed wall-clock time of the executed phases in milliseconds — confirm. */
totalWallMs: number;
|
|
97
|
+
/** Final score — 0 when no tests; otherwise testsPassed/testsTotal. */
|
|
98
|
+
score: number;
|
|
99
|
+
}
|
|
100
|
+
/**
 * Runs a `HarnessConfig` through a `SandboxDriver` and resolves with a
 * `SandboxHarnessResult`. The module header states that a SandboxSpan is
 * emitted into the trace, which is what the `TraceEmitter` argument is for.
 */
declare class SandboxHarness {
|
|
101
|
+
private driver;
|
|
102
|
+
/** @param driver Optional backend. NOTE(review): the default used when omitted is not visible in this declaration — confirm. */
constructor(driver?: SandboxDriver);
|
|
103
|
+
/** Executes the configured phases and resolves with the aggregate result. */
run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* TestGradedScenario — a scenario whose score comes from a test suite.
|
|
108
|
+
*
|
|
109
|
+
* This is the SWE-bench pattern generalized. The scenario ships:
|
|
110
|
+
* - fixture data (setup instructions)
|
|
111
|
+
* - a test command the harness runs
|
|
112
|
+
* - optional assertion overrides
|
|
113
|
+
*
|
|
114
|
+
* The runner emits a run, delegates to SandboxHarness, records the
|
|
115
|
+
* outcome, and returns a structured verdict. Consumers bind their own
|
|
116
|
+
* agent execution to this contract.
|
|
117
|
+
*/
|
|
118
|
+
|
|
119
|
+
/** A scenario scored by a test suite: an id, the `HarnessConfig` to execute, and optional threshold, provenance and tags. */
interface TestGradedScenario {
|
|
120
|
+
id: string;
|
|
121
|
+
description?: string;
|
|
122
|
+
/** Commands and execution options handed to the sandbox harness. */
harness: HarnessConfig;
|
|
123
|
+
/** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
|
|
124
|
+
passThreshold?: number;
|
|
125
|
+
/** Provenance for dataset tracking. */
|
|
126
|
+
datasetVersion?: string;
|
|
127
|
+
/** Free-form tags (difficulty, category, etc.). */
|
|
128
|
+
tags?: Record<string, string>;
|
|
129
|
+
}
|
|
130
|
+
/** Optional settings for `runTestGradedScenario`. */
interface TestGradedRunOptions {
|
|
131
|
+
variantId?: string;
|
|
132
|
+
/** Sandbox backend to use. NOTE(review): the default when omitted is not visible in this declaration — confirm. */
driver?: SandboxDriver;
|
|
133
|
+
/** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed, envFingerprint). */
|
|
134
|
+
provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
135
|
+
}
|
|
136
|
+
/** Structured verdict resolved by `runTestGradedScenario`. */
interface TestGradedRunResult {
|
|
137
|
+
runId: string;
|
|
138
|
+
/** The scenario this result belongs to. */
scenario: TestGradedScenario;
|
|
139
|
+
/** Full harness outcome, including per-phase results. */
harness: SandboxHarnessResult;
|
|
140
|
+
/** NOTE(review): presumably `score` compared against the scenario's `passThreshold` — confirm. */
pass: boolean;
|
|
141
|
+
score: number;
|
|
142
|
+
/** NOTE(review): presumably only set when the run did not pass — confirm. */
failureClass?: FailureClass;
|
|
143
|
+
}
|
|
144
|
+
/**
 * Runs one test-graded scenario and resolves with its structured verdict.
 * Per the section comment for TestGradedScenario, the runner emits a run,
 * delegates to SandboxHarness and records the outcome.
 *
 * @param scenario Scenario to execute.
 * @param store Trace store passed in by the caller. NOTE(review): presumably where the run and its outcome are recorded — confirm.
 * @param options Optional variant id, driver and provenance metadata.
 */
declare function runTestGradedScenario(scenario: TestGradedScenario, store: TraceStore, options?: TestGradedRunOptions): Promise<TestGradedRunResult>;
|
|
145
|
+
|
|
146
|
+
export { DockerSandboxDriver as D, type HarnessConfig as H, type SandboxDriver as S, type TestGradedScenario as T, type SandboxHarnessResult as a, type TestGradedRunResult as b, SandboxHarness as c, type SandboxResult as d, SubprocessSandboxDriver as e, type SubprocessSandboxDriverOptions as f, type TestGradedRunOptions as g, type TestOutputParser as h, composeParsers as i, jestTestParser as j, pytestTestParser as p, runTestGradedScenario as r, vitestTestParser as v };
|