@tangle-network/agent-eval 0.37.0 → 0.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/campaign/index.d.ts +695 -0
  2. package/dist/campaign/index.js +741 -0
  3. package/dist/campaign/index.js.map +1 -0
  4. package/dist/chunk-5U2DOJU4.js +565 -0
  5. package/dist/chunk-5U2DOJU4.js.map +1 -0
  6. package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
  7. package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
  8. package/dist/chunk-BWZEGTES.js.map +1 -0
  9. package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
  10. package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
  11. package/dist/chunk-GGE4NNQT.js +65 -0
  12. package/dist/chunk-GGE4NNQT.js.map +1 -0
  13. package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
  14. package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
  15. package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
  16. package/dist/chunk-MAOZCN36.js.map +1 -0
  17. package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
  18. package/dist/chunk-QWV226SL.js +276 -0
  19. package/dist/chunk-QWV226SL.js.map +1 -0
  20. package/dist/chunk-TMXPFWC7.js +305 -0
  21. package/dist/chunk-TMXPFWC7.js.map +1 -0
  22. package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
  23. package/dist/chunk-WP7SY7AI.js.map +1 -0
  24. package/dist/chunk-YV7J7X5N.js +313 -0
  25. package/dist/chunk-YV7J7X5N.js.map +1 -0
  26. package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
  27. package/dist/control.d.ts +3 -3
  28. package/dist/control.js +2 -2
  29. package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
  30. package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
  31. package/dist/governance/index.d.ts +133 -5
  32. package/dist/index.d.ts +35 -34
  33. package/dist/index.js +97 -630
  34. package/dist/index.js.map +1 -1
  35. package/dist/matrix/index.d.ts +2 -109
  36. package/dist/matrix/index.js +5 -270
  37. package/dist/matrix/index.js.map +1 -1
  38. package/dist/multishot/index.d.ts +276 -0
  39. package/dist/multishot/index.js +516 -0
  40. package/dist/multishot/index.js.map +1 -0
  41. package/dist/openapi.json +1 -1
  42. package/dist/optimization.d.ts +2 -2
  43. package/dist/optimization.js +5 -5
  44. package/dist/pipelines/index.js +2 -2
  45. package/dist/red-team-30II1T4o.d.ts +63 -0
  46. package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
  47. package/dist/reporting.d.ts +2 -2
  48. package/dist/reporting.js +3 -3
  49. package/dist/rl.js +15 -315
  50. package/dist/rl.js.map +1 -1
  51. package/dist/run-campaign-JYJXYHHL.js +10 -0
  52. package/dist/run-campaign-JYJXYHHL.js.map +1 -0
  53. package/dist/traces.js +7 -5
  54. package/dist/types-DHqkLwEU.d.ts +110 -0
  55. package/dist/wire/index.d.ts +2 -2
  56. package/docs/design/loop-taxonomy.md +233 -0
  57. package/package.json +38 -24
  58. package/dist/chunk-KHZRNY3F.js.map +0 -1
  59. package/dist/chunk-L5UNCDAJ.js.map +0 -1
  60. package/dist/chunk-TSPOEDM3.js.map +0 -1
  61. package/dist/index-CN2agEaO.d.ts +0 -191
  62. /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
  63. /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
  64. /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
  65. /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
  66. /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
  67. /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
@@ -0,0 +1,10 @@
1
+ import {
2
+ runCampaign
3
+ } from "./chunk-TMXPFWC7.js";
4
+ import "./chunk-WP7SY7AI.js";
5
+ import "./chunk-QYJT52YW.js";
6
+ import "./chunk-PZ5AY32C.js";
7
+ export {
8
+ runCampaign
9
+ };
10
+ //# sourceMappingURL=run-campaign-JYJXYHHL.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
package/dist/traces.js CHANGED
@@ -1,11 +1,9 @@
1
1
  import {
2
- DEFAULT_REDACTION_RULES,
3
2
  DEFAULT_TRACE_ANALYST_BUDGETS,
4
3
  FileSystemTraceStore,
5
4
  InMemoryTraceStore,
6
5
  OTEL_AGENT_EVAL_SCOPE,
7
6
  OtlpFileTraceStore,
8
- REDACTION_VERSION,
9
7
  ReplayCache,
10
8
  ReplayCacheMissError,
11
9
  SpanNotFoundError,
@@ -30,13 +28,17 @@ import {
30
28
  iterateRawCalls,
31
29
  otelRunCompleteHook,
32
30
  planTraceInsightQuestions,
33
- redactString,
34
- redactValue,
35
31
  scoreTraceInsightReadiness,
36
32
  tokenizeDomainWords,
37
33
  traceAnalystFunctionGroup,
38
34
  traceAnalystOnRunComplete
39
- } from "./chunk-L5UNCDAJ.js";
35
+ } from "./chunk-MAOZCN36.js";
36
+ import {
37
+ DEFAULT_REDACTION_RULES,
38
+ REDACTION_VERSION,
39
+ redactString,
40
+ redactValue
41
+ } from "./chunk-GGE4NNQT.js";
40
42
  import {
41
43
  aggregateLlm,
42
44
  argHash,
@@ -0,0 +1,110 @@
1
+ import { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
2
+
3
+ /**
4
+ * @experimental
5
+ *
6
+ * N-axis cartesian matrix over substrate types — types module.
7
+ *
8
+ * The matrix is a runner + aggregator. It iterates the cartesian product of
9
+ * caller-provided axes (any value type — `AgentProfile` from sandbox, `Driver`
10
+ * / `Validator` from agent-runtime, rubric records, thinking levels, anything)
11
+ * and aggregates per-axis pass/score/cost summaries. Substrate types are
12
+ * imported at the boundary by JSDoc only; the matrix never wraps them.
13
+ */
14
+
15
+ /** One axis = one dimension to iterate. `V` is the value type — pass any
16
+ * substrate type (AgentProfile, Driver, Validator, rubric record). */
17
+ interface MatrixAxis<V> {
18
+ /** Axis name. Becomes the key in `MatrixResult.byAxis`. */
19
+ name: string;
20
+ /** Stable id per value. Used as the bucket key in aggregation. */
21
+ values: Array<{
22
+ id: string;
23
+ value: V;
24
+ }>;
25
+ /** Optional bucket label override. Receives the same `(value, id)` the
26
+ * runner stored on the cell; default label is `id`. */
27
+ label?: (value: V, id: string) => string;
28
+ }
29
+ /** A cell carries one picked value from each axis, keyed by axis name. */
30
+ interface MatrixCell {
31
+ axes: Record<string, {
32
+ id: string;
33
+ value: unknown;
34
+ }>;
35
+ /** 0-based replicate index within the same axis combination. */
36
+ rep: number;
37
+ /** Stable sort key — preserves cartesian order across concurrent execution. */
38
+ ordinal: number;
39
+ }
40
+ interface CellResult<Output> {
41
+ output: Output;
42
+ verdict: DefaultVerdict;
43
+ costUsd: number;
44
+ durationMs: number;
45
+ runId?: string;
46
+ /** Populated when `runCell` threw. The cell contributes 0 to passRate AND
47
+ * meanScore regardless of `verdict`. */
48
+ error?: {
49
+ message: string;
50
+ kind: string;
51
+ };
52
+ }
53
+ interface AxisSummary {
54
+ axisName: string;
55
+ axisValue: string;
56
+ cells: number;
57
+ passRate: number;
58
+ meanScore: number;
59
+ p50Score: number;
60
+ p90Score: number;
61
+ totalCostUsd: number;
62
+ meanDurationMs: number;
63
+ }
64
+ interface MatrixResult<Output> {
65
+ cells: Array<{
66
+ cell: MatrixCell;
67
+ runs: CellResult<Output>[];
68
+ }>;
69
+ /** `byAxis[axisName][axisValueId] = summary`. Populated only for axes
70
+ * named in `aggregateBy` (default = every axis in `axes`). */
71
+ byAxis: Record<string, Record<string, AxisSummary>>;
72
+ summary: {
73
+ totalCells: number;
74
+ runsExecuted: number;
75
+ /** Cells removed by `filter` plus cells unscheduled after the cost
76
+ * ceiling or abort signal tripped. */
77
+ cellsSkipped: number;
78
+ overallPassRate: number;
79
+ overallMeanScore: number;
80
+ totalCostUsd: number;
81
+ durationMs: number;
82
+ };
83
+ /** Stable id-like string generated at the end of the run. */
84
+ matrixId: string;
85
+ }
86
+ interface RunAgentMatrixOptions<Output> {
87
+ axes: MatrixAxis<unknown>[];
88
+ /** User-supplied cell executor. May throw; the matrix captures throws as
89
+ * `CellResult.error` and continues. */
90
+ runCell: (cell: MatrixCell) => Promise<CellResult<Output>>;
91
+ /** Replicates per cell. Default 1. */
92
+ reps?: number;
93
+ /** Prune cells from the cartesian BEFORE rep expansion. */
94
+ filter?: (cell: Omit<MatrixCell, 'rep' | 'ordinal'>) => boolean;
95
+ /** Axes to aggregate into `byAxis`. Default: every axis in `axes`. */
96
+ aggregateBy?: string[];
97
+ /** Max concurrent in-flight `runCell` invocations. Default 4. */
98
+ maxConcurrency?: number;
99
+ /** Cumulative-cost abort threshold (USD). When the running sum of
100
+ * `result.costUsd` crosses this value, no new cells are scheduled.
101
+ * In-flight cells finish. Default `Infinity`. */
102
+ costCeiling?: number;
103
+ /** Fires once per executed cell, after its promise settles. */
104
+ onCellComplete?: (cell: MatrixCell, result: CellResult<Output>) => void;
105
+ /** External cancellation. Aborts in-flight cells via a forwarded signal
106
+ * and suppresses scheduling of new ones. */
107
+ signal?: AbortSignal;
108
+ }
109
+
110
+ export type { AxisSummary as A, CellResult as C, MatrixResult as M, RunAgentMatrixOptions as R, MatrixAxis as a, MatrixCell as b };
@@ -1,4 +1,4 @@
1
- import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-iATEAHmc.js';
1
+ import { n as FeedbackTrajectoryStore } from '../feedback-trajectory-Dvy-bt7x.js';
2
2
  import { T as TraceStore } from '../store-Db2Bv8Cf.js';
3
3
  import { z } from 'zod';
4
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
@@ -7,7 +7,7 @@ import { ServerType } from '@hono/node-server';
7
7
  import { Hono } from 'hono';
8
8
  import '../control-runtime-BZ_lVLYW.js';
9
9
  import '../emitter-DP_cSSiw.js';
10
- import '../dataset-ueRVTUoY.js';
10
+ import '../dataset-BlwAtYYf.js';
11
11
  import '../errors-mje_cKOs.js';
12
12
 
13
13
  declare const RubricDimensionSchema: z.ZodObject<{
@@ -0,0 +1,233 @@
1
+ # Loop taxonomy: driver, worker, measurement, and the improvement loop
2
+
3
+ This is the canonical vocabulary for the Tangle agent stack. It exists because
4
+ the same word ("loop", "shot", "worker") was being used at three different
5
+ layers, and the layers were getting conflated. Every role below has exactly
6
+ one meaning. Use these words and nothing else.
7
+
8
+ Cross-links: [`three-package-architecture.md`](../three-package-architecture.md)
9
+ (who owns what), [`concepts.md`](../concepts.md) (eval mental model),
10
+ [`multi-shot-optimization.md`](../multi-shot-optimization.md) (GEPA),
11
+ [`auto-research-loop-end-to-end.md`](../auto-research-loop-end-to-end.md)
12
+ (analyst / autoresearch).
13
+
14
+ ## The three roles
15
+
16
+ | Role | Definition | Lives at |
17
+ |---|---|---|
18
+ | **Driver** | The thing that *decides what happens next*. Plans, then decides whether to continue. | Both layers (see below) |
19
+ | **Worker** | An agent harness instance (Claude Code, Codex, OpenCode, …) running inside a sandbox. Does the actual work; responds in chat. | Inner layer only |
20
+ | **Sandbox** | A multi-harness VM. Hosts **1..N workers**, which can share a workspace. Not an agent — the substrate an agent runs in. | Inner layer only |
21
+ | **Measurement** | Runs the worker over a set of scenarios and judges the outputs into a scorecard with confidence intervals. This is `runCampaign`. | Outer layer |
22
+
23
+ Two facts that trip people up:
24
+
25
+ 1. **A sandbox is not a worker.** One sandbox can hold ten workers — a driver
26
+ can coordinate Claude Code (CC) + Codex + OpenCode siblings sharing one workspace, or a
27
+ fleet spread across machines. `runLoop`'s placement encodes exactly this:
28
+ `{ sibling, sandboxId }` = co-located workers; `{ fleet, fleetId,
29
+ machineId, sandboxId }` = workers across machines.
30
+
31
+ 2. **"Driver" exists at two layers and means the same *kind* of thing
32
+ (a decider) at each, but the things it decides differ:**
33
+ - **Conversation driver** (inner): decides the next *turn* — a persona/user
34
+ simulating chat, or a planner fanning work to workers.
35
+ - **Improvement driver** (outer): decides the next *surface* — what system
36
+ prompt / tool config / code the workers should run.
37
+
38
+ ## The nesting
39
+
40
+ There are two loops. The outer one improves the thing the inner one runs.
41
+
42
+ ```
43
+ runImprovementLoop OUTER loop — improve the agent over time
44
+
45
+ ├─ DRIVER = ImprovementDriver proposes a candidate SURFACE
46
+ │ (evolutionary mutator | (the worker's system prompt / tools / config)
47
+ │ reflective analyst) — NOT a conversation turn
48
+
49
+ └─ for each candidate surface:
50
+
51
+ runCampaign a MEASUREMENT — scores ONE surface
52
+
53
+ └─ for each scenario × rep:
54
+
55
+ dispatch(scenario) THE SEAM — topology-opaque, returns an artifact
56
+
57
+ └─ runLoop / runMultishot INNER loop — one conversation
58
+ ├─ DRIVER = persona / user / planner chats with ↓
59
+ └─ WORKERS = 1..N agent harnesses in 1..M sandboxes
60
+
61
+ → transcript / artifact
62
+ judge(artifact) → score
63
+ → scorecard + CIs
64
+ gate(winner vs baseline) → PR
65
+ ```
66
+
67
+ ### `dispatch` is the topology-opaque seam
68
+
69
+ `dispatch(scenario) → artifact` is the boundary between the measurement layer
70
+ and the execution layer. The measurement does **not** know or care how the
71
+ artifact was produced. Behind the seam can be:
72
+
73
+ - one LLM call,
74
+ - one worker (CC) in one sandbox,
75
+ - a conversation driver coordinating 10 workers (CC + Codex + OpenCode)
76
+ sharing a workspace in one sandbox,
77
+ - a fleet across machines.
78
+
79
+ All of it is invisible to `runCampaign`. This is why the substrate has no
80
+ opinion about execution topology: the topology lives inside `dispatch`.
81
+
82
+ ### Corrected statements (things that were said backwards)
83
+
84
+ - The worker is the agent in the sandbox. The driver talks to it. ✓
85
+ - `runCampaign` is a **measurement**, not a worker. It *runs the worker* (via
86
+ `dispatch`); the worker does not "run the eval".
87
+ - The outer improvement loop has **no single worker** — its driver proposes a
88
+ *surface*, and each surface is scored by a *measurement* that drives the
89
+ inner workers.
90
+
91
+ ## The dataset flywheel — why every loop run matters
92
+
93
+ **Every loop run, regardless of why it ran, feeds the same dataset.** This is
94
+ the through-line that ties measurement and improvement together.
95
+
96
+ When `runCampaign` runs with a `labeledStore`, each cell captures
97
+ `(scenario, artifact, judgeScore, source)` into the `LabeledScenarioStore`.
98
+ The `source` discriminates *why* the run happened — but the captured tuple is
99
+ identical in shape:
100
+
101
+ | `captureSource` | The run that produced it |
102
+ |---|---|
103
+ | `'eval-run'` | a plain evaluation campaign |
104
+ | `'production-trace'` | a real user conversation in production |
105
+ | `'red-team'` | an adversarial probe |
106
+ | `'synthetic'` | a generated scenario |
107
+ | `'manual'` | a human-curated example |
108
+
109
+ That captured corpus **is the GEPA training set.** A basic eval run, a
110
+ production conversation, and an autoresearch loop all deposit the same
111
+ `(input, output, reward)` tuples. The optimization driver later samples from
112
+ that corpus to evolve the surface. So:
113
+
114
+ > Running *any* loop — even one whose purpose is not optimization — builds the
115
+ > dataset that optimization needs. The flywheel turns whether or not you are
116
+ > currently optimizing.
117
+
118
+ This is enforced, not aspirational: `runImprovementLoop` **refuses**
119
+ `tracing: 'off'` whenever a driver is wired, precisely because a loop that
120
+ doesn't feed the dataset is a loop that breaks the flywheel.
121
+
122
+ Temporal-split discipline (train vs holdout, `capturedBefore`) and
123
+ default-off-for-training of `production-trace` are enforced at the
124
+ `LabeledScenarioStore.sample()` boundary so the flywheel cannot contaminate
125
+ the holdout it is judged against. See `src/campaign/labeled-store/`.
126
+
127
+ ## One improvement loop, pluggable drivers
128
+
129
+ The improvement loop is **driver-agnostic**. `runOptimization` (the loop body)
130
+ and `runImprovementLoop` (the gated-promotion shell) call
131
+ `driver.propose(...)` → measure → `driver.decide(...)`. They do not know which
132
+ strategy is driving. Two strategies conform to the same `ImprovementDriver`
133
+ interface:
134
+
135
+ ```ts
136
+ interface ImprovementDriver<TFindings = unknown> {
137
+ kind: string
138
+ propose(args: {
139
+ currentSurface: MutableSurface
140
+ history: GenerationRecord[] // what's been tried + scored
141
+ findings: TFindings[] // external signal (e.g. analyst output)
142
+ populationSize: number
143
+ generation: number
144
+ signal: AbortSignal
145
+ }): Promise<MutableSurface[]>
146
+ decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }
147
+ }
148
+ ```
149
+
150
+ | Driver | Strategy | How it proposes | Where it lives |
151
+ |---|---|---|---|
152
+ | `evolutionaryDriver` | Evolutionary (GEPA / AxGEPA) | Mutates the current best surface into N candidates, blind to history beyond the current best. Optimizes against the dataset's rewards. | **agent-eval** (pure: dataset → surface, no sandbox) |
153
+ | `analystDriver` *(planned)* | Reflective | Reads trace findings + generation history, reasons about *why* candidates failed, proposes targeted edits. | **agent-runtime** (runs sandboxes to do research) — implements agent-eval's `ImprovementDriver` |
154
+
155
+ This resolves the prior duplication where `runImprovementLoop` (evolutionary,
156
+ agent-eval) and `runAnalystLoop` (reflective, agent-runtime) were two parallel
157
+ loops doing "propose change → measure → gate → PR". There is **one loop**;
158
+ the analyst becomes a driver of it. The dependency direction permits this
159
+ cleanly: agent-eval is the leaf and owns the `ImprovementDriver` contract;
160
+ agent-runtime imports agent-eval and implements the contract.
161
+
162
+ ## What "the surface" is — improvement tiers
163
+
164
+ `MutableSurface` is the thing a driver changes. It has tiers, least → most
165
+ invasive. The `string` form of `MutableSurface` models tiers 1–2; tiers 3–4
166
+ are addressed under "Resolved design decisions" below.
167
+
168
+ | Tier | Surface | Driver that changes it | Blast radius |
169
+ |---|---|---|---|
170
+ | 1 | System prompt / prompt-signature addendum | `evolutionaryDriver` (GEPA), `analystDriver` | prompt only |
171
+ | 2 | Tool config / tool signatures | `analystDriver` | which tools, their schemas |
172
+ | 3 | Knowledge (wiki / knowledge graph) | agent-knowledge's knowledge adapter | what the agent *knows* |
173
+ | 4 | Code / scaffolding | autoresearch (reads codebase + traces) → worktree / PR | the implementation itself |
174
+
175
+ The key distinction Drew drew:
176
+
177
+ - **Analyst** updates the *signatures* — the prompt and tool surface (tiers
178
+ 1–2). Cheap, reversible, measured directly against the dataset.
179
+ - **Autoresearch** updates the *code* (tier 4). It reads the repository plus
180
+ the trace findings, opens a worktree, and proposes implementation changes —
181
+ measured by re-running the inner loop against the changed code.
182
+
183
+ Both are `ImprovementDriver`s in the abstract (propose a change → measure →
184
+ gate → PR). They differ only in *what* they edit and *how invasive* it is. And
185
+ both consume the **same dataset** the flywheel builds.
186
+
187
+ ## Resolved design decisions
188
+
189
+ 1. **`MutableSurface` widens to span all tiers.** `MutableSurface = string |
190
+ CodeSurface`. The `string` form is tiers 1–2 (prompt / serialized tool
191
+ config); `CodeSurface = { kind: 'code'; worktreeRef; baseRef?; summary? }`
192
+ is tier 4 (an implementation change behind a worktree ref). One loop spans
193
+ prompt *and* code improvement. `surfaceHash` hashes a string by content and
194
+ a code surface by its `(worktreeRef, baseRef)` identity (the content lives
195
+ in git). **Shipped in agent-eval 0.40.1.** The consumer's
196
+ `dispatchWithSurface` is responsible for checking out a code surface's
197
+ worktree before running the worker.
198
+
199
+ 2. **`runAnalystLoop` (agent-runtime): analyst becomes a driver; knowledge
200
+ stays separate.** Extract an `analystDriver` (implements agent-eval's
201
+ `ImprovementDriver`) for the surface-proposal part, and feed it into
202
+ `runImprovementLoop`'s gate + PR machinery. `runAnalystLoop`'s other
203
+ responsibilities — the findings ledger and knowledge-graph updates, which
204
+ are *not* surface optimization — stay where they are. **Phase 3
205
+ (agent-runtime); the `ImprovementDriver` contract it implements is already
206
+ shipped in agent-eval 0.40.1.**
207
+
208
+ 3. **`runLoop` + `runMultishot` converge into one parameterized
209
+ `runConversationLoop`** with a pluggable backend (`sandbox | router`). The
210
+ two are the same shape (driver ↔ workers, iterate) differing only in
211
+ backend and intent; unify them. **Phase 3+ (cross-repo); needs its own
212
+ design pass — introduces a backend abstraction and couples the two repos'
213
+ inner loops, so it lands after the `ImprovementDriver` model is proven in
214
+ product use.**
215
+
216
+ ## Vocabulary quick reference
217
+
218
+ - **shot** — one conversational turn (driver says X, worker responds Y). Used
219
+ in `runMultishot`. Never used to mean a whole eval run.
220
+ - **runMultishot** — many shots in one conversation; persona-driver ↔ one
221
+ router-agent. agent-eval.
222
+ - **runLoop** — driver ↔ workers in sandboxes; topology-agnostic execution.
223
+ agent-runtime.
224
+ - **runCampaign** — a measurement: a surface scored over N scenarios × M reps.
225
+ agent-eval. (A "campaign" = a coordinated batch of measurements.)
226
+ - **runOptimization** — the improvement loop body: driver proposes surfaces,
227
+ each measured by a campaign, top-K promoted per generation. agent-eval.
228
+ - **runImprovementLoop** — `runOptimization` + holdout re-score + release gate
229
+ + optional PR. agent-eval.
230
+ - **runAnalystLoop** — reflective autoresearch: findings + knowledge updates +
231
+ improvement proposals. agent-runtime.
232
+ - **ImprovementDriver** — the pluggable strategy that proposes surfaces;
233
+ `evolutionaryDriver` and (planned) `analystDriver` conform.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.37.0",
3
+ "version": "0.40.1",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -99,6 +99,16 @@
99
99
  "import": "./dist/matrix/index.js",
100
100
  "default": "./dist/matrix/index.js"
101
101
  },
102
+ "./multishot": {
103
+ "types": "./dist/multishot/index.d.ts",
104
+ "import": "./dist/multishot/index.js",
105
+ "default": "./dist/multishot/index.js"
106
+ },
107
+ "./campaign": {
108
+ "types": "./dist/campaign/index.d.ts",
109
+ "import": "./dist/campaign/index.js",
110
+ "default": "./dist/campaign/index.js"
111
+ },
102
112
  "./openapi.json": {
103
113
  "default": "./dist/openapi.json"
104
114
  }
@@ -114,17 +124,6 @@
114
124
  "publishConfig": {
115
125
  "access": "public"
116
126
  },
117
- "scripts": {
118
- "build": "tsup && pnpm openapi",
119
- "dev": "tsup --watch",
120
- "prepare": "pnpm build",
121
- "test": "vitest run",
122
- "test:watch": "vitest",
123
- "typecheck": "tsc --noEmit",
124
- "lint": "biome check src",
125
- "format": "biome format --write src",
126
- "openapi": "node dist/cli.js openapi --out dist/openapi.json"
127
- },
128
127
  "dependencies": {
129
128
  "@asteasolutions/zod-to-openapi": "^8.5.0",
130
129
  "@ax-llm/ax": "^19.0.25",
@@ -138,30 +137,45 @@
138
137
  "@tangle-network/sandbox": "^0.2.1"
139
138
  },
140
139
  "peerDependenciesMeta": {
141
- "@tangle-network/agent-runtime": { "optional": true },
142
- "@tangle-network/sandbox": { "optional": true }
140
+ "@tangle-network/agent-runtime": {
141
+ "optional": true
142
+ },
143
+ "@tangle-network/sandbox": {
144
+ "optional": true
145
+ }
143
146
  },
144
147
  "devDependencies": {
145
148
  "@biomejs/biome": "^2.4.15",
146
149
  "@tangle-network/agent-runtime": "^0.21.0",
147
150
  "@tangle-network/sandbox": "^0.2.1",
148
151
  "@types/node": "^25.6.0",
152
+ "husky": "^9.1.7",
153
+ "lint-staged": "^17.0.5",
149
154
  "openapi3-ts": "^4.5.0",
150
155
  "tsup": "^8.0.0",
151
156
  "typescript": "^5.7.0",
152
157
  "vitest": "^3.0.0"
153
158
  },
154
- "pnpm": {
155
- "minimumReleaseAge": 4320,
156
- "minimumReleaseAgeExclude": ["@tangle-network/sandbox", "@tangle-network/agent-runtime"],
157
- "overrides": {
158
- "postcss@<8.5.10": "^8.5.10",
159
- "ws@>=8.0.0 <8.20.1": "^8.20.1"
160
- }
161
- },
162
159
  "engines": {
163
160
  "node": ">=20"
164
161
  },
162
+ "lint-staged": {
163
+ "src/**/*.{ts,tsx}": [
164
+ "biome check --write --no-errors-on-unmatched"
165
+ ],
166
+ "tests/**/*.{ts,tsx}": [
167
+ "biome check --write --no-errors-on-unmatched"
168
+ ]
169
+ },
165
170
  "license": "MIT",
166
- "packageManager": "pnpm@10.22.0"
167
- }
171
+ "scripts": {
172
+ "build": "tsup && pnpm openapi",
173
+ "dev": "tsup --watch",
174
+ "test": "vitest run",
175
+ "test:watch": "vitest",
176
+ "typecheck": "tsc --noEmit",
177
+ "lint": "biome check src",
178
+ "format": "biome format --write src",
179
+ "openapi": "node dist/cli.js openapi --out dist/openapi.json"
180
+ }
181
+ }