@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,146 @@
1
+ import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
2
+ import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
3
+
4
+ /**
5
+ * SandboxHarness — executes a scenario in an isolated environment and
6
+ * emits a rich SandboxSpan into the trace.
7
+ *
8
+ * Two built-in drivers:
9
+ * - `SubprocessSandboxDriver` — spawn in a local cwd with env vars.
10
+ * Fast, no dependencies, fine for unit tests and most CI gates.
11
+ * - `DockerSandboxDriver` — lifted from tangle-router's sandbox path;
12
+ * shells out to `docker run`. Stronger isolation, slower startup.
13
+ *
14
+ * Consumers implement `SandboxDriver` for custom backends (Firecracker,
15
+ * Cloudflare sandbox product, etc.). The harness doesn't care which.
16
+ */
17
+
18
+ interface HarnessConfig {
19
+ /** Setup command (e.g. "pnpm install"). Non-zero exit fails the run. */
20
+ setupCommand?: string;
21
+ /** Run command (e.g. "pnpm build"). */
22
+ runCommand?: string;
23
+ /** Test command (e.g. "pnpm test --run"). Drives the test count + pass count. */
24
+ testCommand?: string;
25
+ /** Absolute cwd for the subprocess driver. Ignored by docker driver. */
26
+ cwd?: string;
27
+ /** Max wall-clock per phase in ms. Default 10 minutes. */
28
+ timeoutMs?: number;
29
+ /** Image for the docker driver. */
30
+ image?: string;
31
+ /** Extra env vars (validated; shell-escaped). */
32
+ env?: Record<string, string>;
33
+ /** Parser for the test output — maps stdout/stderr/exit code → total and passed test counts. */
34
+ testParser?: TestOutputParser;
35
+ }
36
+ interface TestOutputParser {
37
+ id: string;
38
+ parse(stdout: string, stderr: string, exitCode: number): {
39
+ testsTotal: number;
40
+ testsPassed: number;
41
+ } | undefined;
42
+ }
43
+ interface SandboxResult {
44
+ phase: 'setup' | 'run' | 'test';
45
+ exitCode: number;
46
+ stdout: string;
47
+ stderr: string;
48
+ wallMs: number;
49
+ testsTotal?: number;
50
+ testsPassed?: number;
51
+ }
52
+ interface SandboxDriver {
53
+ id: string;
54
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
55
+ }
56
+ /** Vitest default summary line: "Tests X passed | Y failed". */
57
+ declare const vitestTestParser: TestOutputParser;
58
+ /** Pytest default: "collected N items" + " X passed, Y failed". */
59
+ declare const pytestTestParser: TestOutputParser;
60
+ /** Jest: "Tests: X passed, Y total" (and optional failed). */
61
+ declare const jestTestParser: TestOutputParser;
62
+ /** Composite parser — tries a list of parsers in order. */
63
+ declare function composeParsers(...parsers: TestOutputParser[]): TestOutputParser;
64
+ interface SubprocessSandboxDriverOptions {
65
+ /**
66
+ * Default cwd for all `exec` calls. Used when the per-call `HarnessConfig`
67
+ * does not set its own `cwd`. Lets callers bind the driver to a working
68
+ * directory once instead of spreading cwd into every harness config —
69
+ * useful when the harness config is constructed far from the call site
70
+ * (e.g. starter-foundry's promoter passes a static HarnessConfig per
71
+ * family taxonomy but needs a per-run composed-scaffold cwd).
72
+ */
73
+ cwd?: string;
74
+ /**
75
+ * Default env merged into every `exec` call's env (per-call `HarnessConfig.env`
76
+ * still wins on key collision). Same ergonomic rationale as `cwd` above.
77
+ */
78
+ env?: Record<string, string>;
79
+ }
80
+ declare class SubprocessSandboxDriver implements SandboxDriver {
81
+ id: string;
82
+ private defaultCwd?;
83
+ private defaultEnv?;
84
+ constructor(options?: SubprocessSandboxDriverOptions);
85
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
86
+ }
87
+ declare class DockerSandboxDriver implements SandboxDriver {
88
+ id: string;
89
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
90
+ }
91
+ interface SandboxHarnessResult {
92
+ passed: boolean;
93
+ setup?: SandboxResult;
94
+ run?: SandboxResult;
95
+ test?: SandboxResult;
96
+ totalWallMs: number;
97
+ /** Final score — 0 when no tests; otherwise testsPassed/testsTotal. */
98
+ score: number;
99
+ }
100
+ declare class SandboxHarness {
101
+ private driver;
102
+ constructor(driver?: SandboxDriver);
103
+ run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
104
+ }
105
+
106
+ /**
107
+ * TestGradedScenario — a scenario whose score comes from a test suite.
108
+ *
109
+ * This is the SWE-bench pattern generalized. The scenario ships:
110
+ * - fixture data (setup instructions)
111
+ * - a test command the harness runs
112
+ * - optional assertion overrides
113
+ *
114
+ * The runner emits a run, delegates to SandboxHarness, records the
115
+ * outcome, and returns a structured verdict. Consumers bind their own
116
+ * agent execution to this contract.
117
+ */
118
+
119
+ interface TestGradedScenario {
120
+ id: string;
121
+ description?: string;
122
+ harness: HarnessConfig;
123
+ /** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
124
+ passThreshold?: number;
125
+ /** Provenance for dataset tracking. */
126
+ datasetVersion?: string;
127
+ /** Free-form tags (difficulty, category, etc.). */
128
+ tags?: Record<string, string>;
129
+ }
130
+ interface TestGradedRunOptions {
131
+ variantId?: string;
132
+ driver?: SandboxDriver;
133
+ /** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed, envFingerprint). */
134
+ provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
135
+ }
136
+ interface TestGradedRunResult {
137
+ runId: string;
138
+ scenario: TestGradedScenario;
139
+ harness: SandboxHarnessResult;
140
+ pass: boolean;
141
+ score: number;
142
+ failureClass?: FailureClass;
143
+ }
144
+ declare function runTestGradedScenario(scenario: TestGradedScenario, store: TraceStore, options?: TestGradedRunOptions): Promise<TestGradedRunResult>;
145
+
146
+ export { DockerSandboxDriver as D, type HarnessConfig as H, type SandboxDriver as S, type TestGradedScenario as T, type SandboxHarnessResult as a, type TestGradedRunResult as b, SandboxHarness as c, type SandboxResult as d, SubprocessSandboxDriver as e, type SubprocessSandboxDriverOptions as f, type TestGradedRunOptions as g, type TestOutputParser as h, composeParsers as i, jestTestParser as j, pytestTestParser as p, runTestGradedScenario as r, vitestTestParser as v };