@tangle-network/agent-eval 0.44.1 → 0.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +138 -0
- package/dist/adapters/http.js +203 -0
- package/dist/adapters/http.js.map +1 -0
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js.map +1 -1
- package/dist/campaign/index.d.ts +3 -3
- package/dist/campaign/index.js +2 -2
- package/dist/{chunk-H5BGRSN4.js → chunk-HRKOCLQA.js} +3 -3
- package/dist/{chunk-RXK7FXLV.js → chunk-J3EIOI3O.js} +7 -2
- package/dist/chunk-J3EIOI3O.js.map +1 -0
- package/dist/contract/index.d.ts +199 -2
- package/dist/contract/index.js +128 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/rl.d.ts +1 -1
- package/dist/{run-campaign-GNDO66B4.js → run-campaign-6UEVBPP3.js} +2 -2
- package/dist/{run-improvement-loop-CbilHQAb.d.ts → run-improvement-loop-Bfam3MT1.d.ts} +18 -2
- package/dist/{types-DToGONFA.d.ts → types-8u72Gc76.d.ts} +9 -1
- package/docs/adapters-observability.md +121 -0
- package/docs/design/external-agent-wedge.md +2 -2
- package/docs/distributed-driver.md +173 -0
- package/docs/phase-b-pairing-kit.md +188 -0
- package/docs/phase-b-runbook.md +176 -0
- package/docs/quickstart-external.md +43 -4
- package/package.json +6 -1
- package/dist/chunk-RXK7FXLV.js.map +0 -1
- /package/dist/{chunk-H5BGRSN4.js.map → chunk-HRKOCLQA.js.map} +0 -0
- /package/dist/{run-campaign-GNDO66B4.js.map → run-campaign-6UEVBPP3.js.map} +0 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Distributed driver — driver-on-A, workers-on-B (and C, D, E…)
|
|
2
|
+
|
|
3
|
+
The driver (running `runCampaign` / `runImprovementLoop` / `gepaDriver`)
|
|
4
|
+
and the worker (running your actual agent) **do not have to live in the
|
|
5
|
+
same process, machine, region, or cloud.** `Dispatch` is just a
|
|
6
|
+
function: scenario in, artifact out. Whatever returns the artifact is
|
|
7
|
+
the worker — local, remote, sandboxed, or fanned out across a fleet.
|
|
8
|
+
|
|
9
|
+
## Why you'd want this
|
|
10
|
+
|
|
11
|
+
| Pattern | Reason |
|
|
12
|
+
|---|---|
|
|
13
|
+
| **Driver on your VPC, workers on our sandbox fleet** | Driver holds secrets, training data, prompt corpus; workers stay stateless and scale horizontally |
|
|
14
|
+
| **Multi-region campaigns** | Each cell runs in the region closest to its target API (latency, compliance, data residency) |
|
|
15
|
+
| **Driver-as-a-service** | Long-running optimization process; reused across many short-lived worker invocations |
|
|
16
|
+
| **Heterogeneous workers** | One cell on a CPU container, another on a GPU box, another against a third-party API — same Dispatch shape, different placement |
|
|
17
|
+
| **Budget-isolated workers** | Worker boxes get scoped, time-bounded credentials; driver never holds production keys |
|
|
18
|
+
|
|
19
|
+
## New pieces in 0.45.0
|
|
20
|
+
|
|
21
|
+
| Where | What |
|
|
22
|
+
|---|---|
|
|
23
|
+
| **`DispatchContext.placement?: string`** | Opaque placement key the substrate forwards to the Dispatch. |
|
|
24
|
+
| **`RunCampaignOptions.cellPlacement?(input) → string \| undefined`** | Strategy function the substrate calls per cell to compute the placement key. |
|
|
25
|
+
| **`@tangle-network/agent-eval/adapters/http`** | `httpDispatch` (client) + `runDispatchServer` (server) — wire shape for HTTP-based remote workers. |
|
|
26
|
+
|
|
27
|
+
Both ends of the wire are in the same package; no peer dep, no separate
|
|
28
|
+
install. The substrate doesn't strategy-pick; you provide the
|
|
29
|
+
`cellPlacement` function, the substrate forwards its result, the
|
|
30
|
+
Dispatch reads it. Clean seam, no policy baked in.
|
|
31
|
+
|
|
32
|
+
## The three reference topologies
|
|
33
|
+
|
|
34
|
+
### 1. In-process (the default — what you already have)
|
|
35
|
+
|
|
36
|
+
```ts
|
|
37
|
+
await runCampaign({
|
|
38
|
+
scenarios,
|
|
39
|
+
dispatch, // runs in-process
|
|
40
|
+
judges: [judge],
|
|
41
|
+
storage,
|
|
42
|
+
runDir,
|
|
43
|
+
})
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`ctx.placement` is `undefined`; nothing changes for existing consumers.
|
|
47
|
+
This shipped in 0.40.
|
|
48
|
+
|
|
49
|
+
### 2. Single remote worker
|
|
50
|
+
|
|
51
|
+
Driver-on-A talks to one worker-on-B over HTTP.
|
|
52
|
+
|
|
53
|
+
**Driver side (machine A):**
|
|
54
|
+
|
|
55
|
+
```ts
|
|
56
|
+
import { httpDispatch } from '@tangle-network/agent-eval/adapters/http'
|
|
57
|
+
|
|
58
|
+
const dispatch = httpDispatch<MyScenario, MyArtifact>({
|
|
59
|
+
url: 'https://worker.your-infra.com/dispatch',
|
|
60
|
+
auth: process.env.WORKER_TOKEN,
|
|
61
|
+
timeoutMs: 5 * 60 * 1000,
|
|
62
|
+
retries: 2,
|
|
63
|
+
})
|
|
64
|
+
|
|
65
|
+
await runImprovementLoop({ scenarios, baselineSurface, dispatchWithSurface: (surface, s, ctx) =>
|
|
66
|
+
dispatch(s, { ...ctx, /* pass the surface through your own protocol */ }),
|
|
67
|
+
/* ... */ })
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Worker side (machine B):**
|
|
71
|
+
|
|
72
|
+
```ts
|
|
73
|
+
import { runDispatchServer } from '@tangle-network/agent-eval/adapters/http'
|
|
74
|
+
|
|
75
|
+
const handle = await runDispatchServer<MyScenario, MyArtifact>({
|
|
76
|
+
dispatch: async (scenario, ctx) => {
|
|
77
|
+
// your agent — call OpenAI, LangChain, your sandbox, anything.
|
|
78
|
+
const artifact = await runMyAgent(scenario, ctx.signal)
|
|
79
|
+
return artifact
|
|
80
|
+
},
|
|
81
|
+
port: 8080,
|
|
82
|
+
auth: process.env.WORKER_TOKEN, // required; `false` only for closed networks
|
|
83
|
+
})
|
|
84
|
+
console.log(`worker listening on ${handle.port}`)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Cancellation, retries on 5xx / 408 / 429, bounded timeouts, optional
|
|
88
|
+
custom auth headers, optional `fetchImpl` override — all there.
|
|
89
|
+
|
|
90
|
+
### 3. Multi-region fan-out
|
|
91
|
+
|
|
92
|
+
Driver picks a region per cell; the same `httpDispatch` routes to
|
|
93
|
+
different worker URLs based on placement.
|
|
94
|
+
|
|
95
|
+
```ts
|
|
96
|
+
import { httpDispatch } from '@tangle-network/agent-eval/adapters/http'
|
|
97
|
+
|
|
98
|
+
const REGION_URLS: Record<string, string> = {
|
|
99
|
+
'us-east': 'https://worker-use1.your-infra.com/dispatch',
|
|
100
|
+
'eu-west': 'https://worker-euw1.your-infra.com/dispatch',
|
|
101
|
+
'ap-south': 'https://worker-aps1.your-infra.com/dispatch',
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
const dispatch = httpDispatch<MyScenario, MyArtifact>({
|
|
105
|
+
resolveUrl: ({ placement }) => REGION_URLS[placement ?? 'us-east'],
|
|
106
|
+
auth: process.env.WORKER_TOKEN,
|
|
107
|
+
})
|
|
108
|
+
|
|
109
|
+
await runCampaign({
|
|
110
|
+
scenarios,
|
|
111
|
+
dispatch,
|
|
112
|
+
judges: [judge],
|
|
113
|
+
storage,
|
|
114
|
+
runDir,
|
|
115
|
+
cellPlacement: ({ scenario }) => {
|
|
116
|
+
if (scenario.tags?.includes('eu')) return 'eu-west'
|
|
117
|
+
if (scenario.tags?.includes('ap')) return 'ap-south'
|
|
118
|
+
return 'us-east'
|
|
119
|
+
},
|
|
120
|
+
maxConcurrency: 8, // 8 cells fan across regions in parallel
|
|
121
|
+
})
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
`cellPlacement` is a pure function the substrate calls per cell — no
|
|
125
|
+
state. Use whatever signal you want (tags, hash of scenario id,
|
|
126
|
+
round-robin, region-affinity from a previous run, scheduling table).
|
|
127
|
+
|
|
128
|
+
## What's preserved across the wire
|
|
129
|
+
|
|
130
|
+
| Concern | How |
|
|
131
|
+
|---|---|
|
|
132
|
+
| **Cancellation** | Driver's `AbortSignal` forwards into the HTTP request; server translates `AbortError` → `499` so client doesn't retry. |
|
|
133
|
+
| **Timeouts** | Per-call `timeoutMs` on the client; server can layer its own. |
|
|
134
|
+
| **Retries** | Idempotent retries on 5xx / 408 / 429 with exponential backoff + jitter. Driver-aborts never retry. |
|
|
135
|
+
| **Auth** | Bearer token on `Authorization`; pluggable via `auth: string \| () => string \| Promise<string>` for rotation/refresh. |
|
|
136
|
+
| **Payload size** | Server enforces `maxBodyBytes` (default 10 MB). |
|
|
137
|
+
| **Traces** | Both ends emit OTel — if both point at the same OTLP collector, you get a unified trace per cell. See `docs/adapters-observability.md`. |
|
|
138
|
+
| **Cost** | Worker's `ctx.cost.observe(usd, source)` is local to the worker process. Roll up server-side and attach to your worker-side telemetry; we don't (yet) forward cost back to the driver. Tracked as follow-up. |
|
|
139
|
+
|
|
140
|
+
## Running the reference example
|
|
141
|
+
|
|
142
|
+
See `examples/distributed-driver/`:
|
|
143
|
+
|
|
144
|
+
```sh
|
|
145
|
+
# Terminal 1 — worker
|
|
146
|
+
pnpm tsx examples/distributed-driver/worker.ts
|
|
147
|
+
|
|
148
|
+
# Terminal 2 — driver
|
|
149
|
+
WORKER_URL=http://localhost:8080/dispatch \
|
|
150
|
+
WORKER_TOKEN=dev-token \
|
|
151
|
+
pnpm tsx examples/distributed-driver/driver.ts
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Two processes, one local TCP loopback, full self-improvement loop end
|
|
155
|
+
to end. Scaling out means pointing `WORKER_URL` at a non-loopback hostname
|
|
156
|
+
and using `cellPlacement` to fan across many of them.
|
|
157
|
+
|
|
158
|
+
## Known gaps + follow-ups
|
|
159
|
+
|
|
160
|
+
- **Cost roll-up across the wire** — worker-side `ctx.cost` observations
|
|
161
|
+
stay on the worker. We need to forward them in the response body so
|
|
162
|
+
`defaultProductionGate`'s `budgetUsd` ceiling reflects total spend, not just
|
|
163
|
+
driver-side spend. Tracked as a 0.45.x follow-up.
|
|
164
|
+
- **Per-cell artifact streaming** — when the worker writes intermediate
|
|
165
|
+
artifacts via `ctx.artifacts.write`, those land on the worker's
|
|
166
|
+
storage. For multi-worker campaigns you'll want a shared object store
|
|
167
|
+
(S3/GCS) reachable from both sides; today consumers wire that as a
|
|
168
|
+
`CampaignStorage` impl. A reference S3-backed storage is on the
|
|
169
|
+
roadmap.
|
|
170
|
+
- **gRPC / NATS / Temporal transports** — the wire is HTTP today by
|
|
171
|
+
default because everything speaks HTTP. Other transports can ship as
|
|
172
|
+
additional adapters; the `Dispatch` interface itself is
|
|
173
|
+
transport-agnostic.
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# Phase-B partner pairing kit
|
|
2
|
+
|
|
3
|
+
Everything we hand a design partner — the pitch, the discovery doc,
|
|
4
|
+
the judge worksheet, the 4-hour pairing agenda, the success criteria.
|
|
5
|
+
|
|
6
|
+
> This file is **partner-facing**. The internal driving runbook is in
|
|
7
|
+
> [`phase-b-runbook.md`](./phase-b-runbook.md).
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## The pitch (one-pager)
|
|
12
|
+
|
|
13
|
+
You have a working agent. You don't have evals. You don't have a
|
|
14
|
+
self-improvement loop. You don't know which prompt change actually
|
|
15
|
+
made the agent better last week.
|
|
16
|
+
|
|
17
|
+
We have all of that on a shelf — same engine our six internal product
|
|
18
|
+
agents use in production. It's open source, free at the LAND tier, and
|
|
19
|
+
sandbox-free if you don't want our sandbox.
|
|
20
|
+
|
|
21
|
+
**The Phase-B offer:** in one 4-hour pairing, we wrap your agent
|
|
22
|
+
behind our `Dispatch`, author your domain-specific judge with you,
|
|
23
|
+
and run one real campaign + improvement loop on **your actual use
|
|
24
|
+
case**. You walk away with:
|
|
25
|
+
|
|
26
|
+
- A reproducible eval harness against scenarios you control.
|
|
27
|
+
- A judge that scores your outputs on dimensions you defined.
|
|
28
|
+
- One measurable lift on your real product, with a held-out gate.
|
|
29
|
+
- Trace artifacts you own (locally on disk; nothing leaves your
|
|
30
|
+
network unless you point at our hosted tier).
|
|
31
|
+
|
|
32
|
+
What we get: design-partner evidence the substrate works on a foreign
|
|
33
|
+
agent we did not build. That validates the wedge for us. Nothing else
|
|
34
|
+
changes hands.
|
|
35
|
+
|
|
36
|
+
**Cost to you:** 4 hours of pairing + your LLM bill for the campaign
|
|
37
|
+
run (typically $5-$50 depending on model + scenario count). No
|
|
38
|
+
commitment, no contract, no exclusivity. We don't take your code, your
|
|
39
|
+
data, or your secrets.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Discovery questions (15 min, before the pairing)
|
|
44
|
+
|
|
45
|
+
Send these to the partner ahead of the pairing so they walk in with
|
|
46
|
+
their answers.
|
|
47
|
+
|
|
48
|
+
### About the agent
|
|
49
|
+
|
|
50
|
+
1. What does your agent **do** — one paragraph, end-user perspective?
|
|
51
|
+
2. What's the **input** it accepts and the **output** it produces?
|
|
52
|
+
(Schemas help; English is fine.)
|
|
53
|
+
3. What framework / stack? (LangChain / Mastra / OpenAI Agents SDK /
|
|
54
|
+
bespoke / something else.)
|
|
55
|
+
4. Where does it run? (Local node / serverless / your sandbox /
|
|
56
|
+
browser / mobile / other.)
|
|
57
|
+
5. What model(s) does it use today? Any model-routing layer
|
|
58
|
+
(OpenRouter, Portkey, your own)?
|
|
59
|
+
|
|
60
|
+
### About quality
|
|
61
|
+
|
|
62
|
+
6. How do you currently know your agent is good? (Eyeballing /
|
|
63
|
+
user feedback / metrics / nothing yet — all fine answers.)
|
|
64
|
+
7. What does a **bad** output look like for you? Give 2-3 concrete
|
|
65
|
+
examples. Be specific.
|
|
66
|
+
8. What does a **good** output look like? Same.
|
|
67
|
+
9. Are there outputs that are *technically correct but feel wrong*?
|
|
68
|
+
What's the signal?
|
|
69
|
+
10. How would a senior person on your team **score** an output, if
|
|
70
|
+
they had to give it a 1-10? Walk us through the rubric they'd
|
|
71
|
+
use, even informally.
|
|
72
|
+
|
|
73
|
+
### About the loop
|
|
74
|
+
|
|
75
|
+
11. If we could improve one thing about the agent in 4 hours, what
|
|
76
|
+
would move the needle the most for you?
|
|
77
|
+
12. Are there *prompt* changes you've wanted to try but haven't had
|
|
78
|
+
the loop to validate?
|
|
79
|
+
13. Anything you've explicitly tried that **didn't** work? (Saves us
|
|
80
|
+
suggesting it.)
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Judge-design worksheet (45 min, during the pairing)
|
|
85
|
+
|
|
86
|
+
The judge is the most under-discussed piece of an eval system. Most
|
|
87
|
+
projects fail at the judge, not the agent.
|
|
88
|
+
|
|
89
|
+
We start with a **strawman** — the 6 dimensions in our canonical
|
|
90
|
+
marketing-quality judge:
|
|
91
|
+
|
|
92
|
+
| Dim | What it measures |
|
|
93
|
+
|---|---|
|
|
94
|
+
| hook_strength | Opens with concrete user outcome, not category |
|
|
95
|
+
| voice_match | Reads human-written; no AI slop |
|
|
96
|
+
| cta_clarity | Next step unambiguous for the audience |
|
|
97
|
+
| factual_grounding | Only claims things the brief supports |
|
|
98
|
+
| surface_fit | Length + register correct for medium |
|
|
99
|
+
| audience_specificity | Vocabulary the audience actually responds to |
|
|
100
|
+
|
|
101
|
+
**Your job in this 45 min:** rip this apart. We expect:
|
|
102
|
+
|
|
103
|
+
- **2-3 of these are wrong for you.** Replace them.
|
|
104
|
+
- **2-3 dimensions are missing.** Add them. (E.g., "tone matches our
|
|
105
|
+
brand book" or "safety-critical claim has a citation" or "answer is
|
|
106
|
+
decisive — no hedging when the user wants a recommendation".)
|
|
107
|
+
- **Weights are wrong.** For your use case some dims matter 5x more.
|
|
108
|
+
|
|
109
|
+
The deliverable: a judge with 4-8 dimensions, each scored 0.0 - 1.0,
|
|
110
|
+
each unambiguous enough that two independent humans would score the
|
|
111
|
+
same artifact within 0.1.
|
|
112
|
+
|
|
113
|
+
If a dimension is squishy, throw it out. A noisy judge poisons the
|
|
114
|
+
loop.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## The 4-hour pairing agenda
|
|
119
|
+
|
|
120
|
+
### Hour 1 — Discovery + Dispatch wiring
|
|
121
|
+
|
|
122
|
+
| Time | What | Deliverable |
|
|
123
|
+
|---|---|---|
|
|
124
|
+
| 0:00 - 0:15 | Review discovery answers, align on scope | Shared doc with goals + constraints |
|
|
125
|
+
| 0:15 - 0:45 | Wire `Dispatch` around their agent — typically 1 function | Working `Dispatch<TScenario, TArtifact>` |
|
|
126
|
+
| 0:45 - 1:00 | Run 1-2 scenarios through `Dispatch` manually; see real artifacts | Confirmed wire shape |
|
|
127
|
+
|
|
128
|
+
### Hour 2 — Judge calibration
|
|
129
|
+
|
|
130
|
+
| Time | What | Deliverable |
|
|
131
|
+
|---|---|---|
|
|
132
|
+
| 1:00 - 1:45 | Walk through the strawman judge; redesign dimensions with the partner | Final `JudgeConfig` for their domain |
|
|
133
|
+
| 1:45 - 2:00 | Calibrate judge against the 2 manual outputs from Hour 1 | Confirmed judge gives same scores a human would |
|
|
134
|
+
|
|
135
|
+
### Hour 3 — First campaign + tuning
|
|
136
|
+
|
|
137
|
+
| Time | What | Deliverable |
|
|
138
|
+
|---|---|---|
|
|
139
|
+
| 2:00 - 2:30 | Define 8-15 scenarios with the partner (or use ours as a template) | Scenario set with train + holdout split |
|
|
140
|
+
| 2:30 - 3:00 | Run `runEval` for baseline; review per-scenario scores | Baseline score + identified failure modes |
|
|
141
|
+
|
|
142
|
+
### Hour 4 — Improvement loop + go/no-go
|
|
143
|
+
|
|
144
|
+
| Time | What | Deliverable |
|
|
145
|
+
|---|---|---|
|
|
146
|
+
| 3:00 - 3:30 | Configure `runImprovementLoop` with `gepaDriver` (3 generations, population 2) + `defaultProductionGate` | Improvement run completes |
|
|
147
|
+
| 3:30 - 3:50 | Walk the partner through the gate decision + lift per scenario | Report artifact |
|
|
148
|
+
| 3:50 - 4:00 | Capture: was the lift real? Would they ship the winner? Will they keep using the lib? | **Go/no-go signal for Phase D** |
|
|
149
|
+
|
|
150
|
+
If we're tracking ahead at any hour, use the slack to deepen — add a
|
|
151
|
+
red-team battery, swap the judge model, run more generations. If we're
|
|
152
|
+
behind, cut the scenario set to 6 and ship.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Success criteria — what counts as Phase B passed
|
|
157
|
+
|
|
158
|
+
For us to greenlight Phase D (hosted orchestrator + metered billing),
|
|
159
|
+
we need ALL of the following for a hard pass (3 of 4 is a soft pass):
|
|
160
|
+
|
|
161
|
+
1. **Real lift.** Held-out winner score > baseline by ≥ 0.05 composite
|
|
162
|
+
points (or the partner's chosen threshold). Not just train; held-out.
|
|
163
|
+
2. **Partner-validated lift.** The partner reads the winner output on
|
|
164
|
+
3+ held-out scenarios and confirms it's actually better.
|
|
165
|
+
3. **Integration time ≤ 1 day.** Discovery + wiring + judge took ≤ 4
|
|
166
|
+
hours for the pairing; partner could reach the same point solo in
|
|
167
|
+
≤ 1 day from the quickstart doc.
|
|
168
|
+
4. **Public commitment.** Partner agrees to a public reference (case
|
|
169
|
+
study / quote / logo) OR commits to running the LAND tier in their
|
|
170
|
+
own product within 2 weeks.
|
|
171
|
+
|
|
172
|
+
3-of-4 = soft pass (revisit Phase D scope but proceed). 4-of-4 = hard
|
|
173
|
+
pass (build Phase D). ≤ 2 = fail (back to substrate iteration).
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## What we don't ask for
|
|
178
|
+
|
|
179
|
+
- Your code. Wire `Dispatch` around your existing API; we never see the
|
|
180
|
+
source.
|
|
181
|
+
- Your customer data. Use synthetic scenarios or anonymized real ones —
|
|
182
|
+
whichever you prefer.
|
|
183
|
+
- Your model keys. You bring your own; if you want, route through Tangle
|
|
184
|
+
Router and we never see the prompts either.
|
|
185
|
+
- Exclusivity, commitment, or contract. Walk away whenever.
|
|
186
|
+
|
|
187
|
+
The point is to learn if the substrate works for someone we didn't
|
|
188
|
+
build it for. That's it.
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# Phase-B runbook (internal)
|
|
2
|
+
|
|
3
|
+
How we drive a design-partner pairing. Goes alongside
|
|
4
|
+
[`phase-b-pairing-kit.md`](./phase-b-pairing-kit.md) (the partner-facing
|
|
5
|
+
materials) — this file is for us.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Before the pairing
|
|
10
|
+
|
|
11
|
+
- **24-48h prior:** send discovery questions from
|
|
12
|
+
[`phase-b-pairing-kit.md`](./phase-b-pairing-kit.md). Don't run the
|
|
13
|
+
pairing without answers in hand. The pairing fails when we discover
|
|
14
|
+
the partner's quality bar live; we don't have time to interview AND
|
|
15
|
+
build in 4 hours.
|
|
16
|
+
- **48h prior:** run the canonical demo (`pnpm tsx
|
|
17
|
+
examples/marketing-agent-canonical/index.ts`) end-to-end against the
|
|
18
|
+
partner's preferred model. Confirms the substrate + their LLM tier
|
|
19
|
+
compose. If it errors, fix the substrate before the pairing.
|
|
20
|
+
- **24h prior:** mirror the partner's stack locally. If they're on
|
|
21
|
+
Cloudflare Workers, run a Worker. On LangChain, install `@langchain/*`.
|
|
22
|
+
Don't debug their tooling on the call.
|
|
23
|
+
- **1h prior:** open the pairing kit, the agent-eval repo, the partner's
|
|
24
|
+
agent code/endpoint, and a shared doc; have a screenshare ready.
|
|
25
|
+
|
|
26
|
+
## During the pairing
|
|
27
|
+
|
|
28
|
+
### Driving principles
|
|
29
|
+
|
|
30
|
+
- **Talk less, ship more.** The partner is paying with their time and
|
|
31
|
+
attention; every minute we talk we aren't shipping their lift.
|
|
32
|
+
- **They write the judge.** We start with our strawman so they have
|
|
33
|
+
something to react to, but the judge that ends up running is theirs.
|
|
34
|
+
This is the most-discussed seam — they should own it.
|
|
35
|
+
- **No invented features.** Don't promise capabilities that don't exist
|
|
36
|
+
("we have a hosted ingest for this") unless they actually exist.
|
|
37
|
+
Phase B is honesty's purest test.
|
|
38
|
+
- **Capture verbatim.** Write down their exact words on what's broken /
|
|
39
|
+
what would change their mind. The wedge-gate evidence is qualitative
|
|
40
|
+
too.
|
|
41
|
+
|
|
42
|
+
### When to escalate to Drew
|
|
43
|
+
|
|
44
|
+
- Partner wants something Phase D would have (hosted dashboard,
|
|
45
|
+
multi-tenant, billing). **Escalate same day** — this is the GTM signal we're
|
|
46
|
+
hunting for; Drew should hear it directly.
|
|
47
|
+
- Partner is the wrong fit (technical or business) and the pairing
|
|
48
|
+
would burn both sides' time. **Pause the pairing**, debrief with Drew,
|
|
49
|
+
reschedule with a better-fit partner.
|
|
50
|
+
- Substrate breaks in a way that requires a published bump. **Pause
|
|
51
|
+
the pairing**, ship the fix in a focused PR, resume.
|
|
52
|
+
|
|
53
|
+
### What to capture for the wedge gate
|
|
54
|
+
|
|
55
|
+
Per [`docs/design/external-agent-wedge.md`](./design/external-agent-wedge.md),
|
|
56
|
+
the gate decision hinges on Phase B evidence. We capture:
|
|
57
|
+
|
|
58
|
+
1. **Quantitative lift** — held-out winner composite vs baseline, per
|
|
59
|
+
scenario + overall. Auto-generated in the report artifact by the
|
|
60
|
+
canonical demo (`.phase-b-runs/<ts>/phase-b-report.md`).
|
|
61
|
+
2. **Qualitative partner-validation** — partner read 3+ winner outputs
|
|
62
|
+
and confirmed they're better. Capture as a 1-paragraph quote.
|
|
63
|
+
3. **Integration friction** — minutes spent on each pairing phase. Were
|
|
64
|
+
any > 2x estimated? What broke?
|
|
65
|
+
4. **Judge-design surprise** — which dimensions the partner added or
|
|
66
|
+
killed vs our strawman. Strong signal about what the substrate's
|
|
67
|
+
default judge templates are missing for adjacent domains.
|
|
68
|
+
5. **Soft commitments** — would they reference us? Would they
|
|
69
|
+
self-serve from the quickstart doc? Would they pay for hosted?
|
|
70
|
+
|
|
71
|
+
Capture into a single `phase-b-debrief.md` per partner. We don't
|
|
72
|
+
publish these; they feed the next substrate iteration + the wedge
|
|
73
|
+
go/no-go.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Failure modes — what we do NOT do
|
|
78
|
+
|
|
79
|
+
### "We'll just optimize on the train set"
|
|
80
|
+
|
|
81
|
+
Hard no. The held-out gate is the entire point. A win that doesn't
|
|
82
|
+
generalize is worse than no win — it's evidence that the substrate
|
|
83
|
+
overfits, which is the failure mode the wedge tier rewards.
|
|
84
|
+
|
|
85
|
+
If the holdout lift is < threshold but train looks great:
|
|
86
|
+
|
|
87
|
+
1. Show the partner the gap. Explain what overfitting means here.
|
|
88
|
+
2. Try raising `maxGenerations` to 5 (gives gepa more search budget).
|
|
89
|
+
3. Try widening `populationSize` to 3 (more diverse mutations per gen).
|
|
90
|
+
4. If still no lift on holdout: **report the result honestly**. A
|
|
91
|
+
negative finding is real evidence for us too — tells us this surface
|
|
92
|
+
isn't amenable to prompt-only mutation, and the partner needs Phase
|
|
93
|
+
C (code-tier optimization) or a different approach.
|
|
94
|
+
|
|
95
|
+
### "The judge is too noisy"
|
|
96
|
+
|
|
97
|
+
A judge whose scores on the same artifact differ by more than 0.1 between two runs is broken.
|
|
98
|
+
Fixes, in order:
|
|
99
|
+
|
|
100
|
+
1. Lower temperature to 0.0 (the canonical judge uses 0.2, which is
|
|
101
|
+
already low).
|
|
102
|
+
2. Use a stronger model than the agent (the default is the same model;
|
|
103
|
+
bump the judge to GPT-5.5 / Claude Opus).
|
|
104
|
+
3. Add anchors to each dimension ("0.0 = X, 0.5 = Y, 1.0 = Z").
|
|
105
|
+
4. If still noisy: collapse to fewer, simpler dimensions. 3 unambiguous
|
|
106
|
+
dimensions beat 6 squishy ones.
|
|
107
|
+
|
|
108
|
+
### "We can't decide what the partner's judge should be"
|
|
109
|
+
|
|
110
|
+
Then we don't have Phase B. The judge IS the partner's quality bar.
|
|
111
|
+
If they can't articulate it in 45 minutes of pairing, we're in the
|
|
112
|
+
wrong pairing — they need to do the interview-themselves work first.
|
|
113
|
+
|
|
114
|
+
**Pause the pairing, send the discovery doc again, regroup in a week.**
|
|
115
|
+
|
|
116
|
+
### "Their agent is slow / expensive"
|
|
117
|
+
|
|
118
|
+
Set `maxConcurrency: 1` and reduce scenarios to 6. Cost scales linearly;
|
|
119
|
+
time scales as `(scenarios × reps × generations × population) /
|
|
120
|
+
concurrency`. Tune until the loop completes in ≤ 30 min.
|
|
121
|
+
|
|
122
|
+
If the per-call cost is > $1, talk to Drew before the pairing — we
|
|
123
|
+
might want to subsidize the partner's first run.
|
|
124
|
+
|
|
125
|
+
### "They want to share their secrets through Tangle Router"
|
|
126
|
+
|
|
127
|
+
Fine — `OPENAI_BASE_URL=https://router.tangle.tools/v1` works. Make
|
|
128
|
+
sure they understand: every call routes through us; the prompts and
|
|
129
|
+
responses are visible to whatever observability we have on the router.
|
|
130
|
+
If they want zero data leaving their network, point at their own
|
|
131
|
+
endpoint, not Tangle Router.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## After the pairing
|
|
136
|
+
|
|
137
|
+
### Same day
|
|
138
|
+
|
|
139
|
+
- Save the `phase-b-report.md` artifact + the partner's debrief notes
|
|
140
|
+
to `~/company/design-partners/<partner>/<date>/`.
|
|
141
|
+
- Send the partner a thank-you with the winner artifact + the
|
|
142
|
+
next-steps doc. Whether or not we proceed to Phase D, leave them with
|
|
143
|
+
something concrete they can ship in their product.
|
|
144
|
+
- Slack Drew the verdict against the [success criteria](./phase-b-pairing-kit.md#success-criteria--what-counts-as-phase-b-passed).
|
|
145
|
+
|
|
146
|
+
### Within a week
|
|
147
|
+
|
|
148
|
+
- If Phase B passed: open the Phase D RFC. Reuse the partner-validated
|
|
149
|
+
judge dimensions + scenarios as the spec for what the hosted tier
|
|
150
|
+
needs to support out of the box.
|
|
151
|
+
- If Phase B failed: substrate iteration ticket(s). Specific gaps the
|
|
152
|
+
pairing surfaced (judge dim defaults, doc clarity, missing helper).
|
|
153
|
+
- Either way: update the wedge doc (`docs/design/external-agent-wedge.md`)
|
|
154
|
+
with the partner name redacted + the qualitative signal.
|
|
155
|
+
|
|
156
|
+
### Within a month (regardless of go/no-go)
|
|
157
|
+
|
|
158
|
+
- Follow up with the partner. If they're still using the lib, capture a
|
|
159
|
+
metric. If they stopped, find out why. Both data points feed product.
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## The canonical demo as a forcing function
|
|
164
|
+
|
|
165
|
+
`examples/marketing-agent-canonical/` is the demo we open the pairing
|
|
166
|
+
with. It does three things at once:
|
|
167
|
+
|
|
168
|
+
1. **Proves the substrate works** — they see a real lift on a
|
|
169
|
+
real-feeling agent before we touch their code.
|
|
170
|
+
2. **Sets the bar for the judge conversation** — they react to concrete
|
|
171
|
+
dimensions, not abstract questions.
|
|
172
|
+
3. **Trains us** — running the canonical demo before the pairing
|
|
173
|
+
surfaces substrate bugs on the partner's preferred model BEFORE the
|
|
174
|
+
partner is watching. We hit those bugs first.
|
|
175
|
+
|
|
176
|
+
Run the canonical demo before every Phase-B pairing. It's not optional.
|
|
@@ -13,12 +13,51 @@ Tangle sandbox, no Tangle account, and no hosted infrastructure.
|
|
|
13
13
|
## Install
|
|
14
14
|
|
|
15
15
|
```sh
|
|
16
|
-
npm i @tangle-network/agent-eval@^0.
|
|
16
|
+
npm i @tangle-network/agent-eval@^0.46.0
|
|
17
17
|
```
|
|
18
18
|
|
|
19
|
-
The package's `@tangle-network/sandbox` peer is `optional
|
|
20
|
-
|
|
21
|
-
|
|
19
|
+
The package's `@tangle-network/sandbox` peer is `optional`. Foreign
|
|
20
|
+
consumers install agent-eval and run the full LAND tier without our
|
|
21
|
+
sandbox or its dependencies.
|
|
22
|
+
|
|
23
|
+
## The one-shot happy path
|
|
24
|
+
|
|
25
|
+
If you don't want to learn the substrate, the entire LAND tier reduces
|
|
26
|
+
to one function call:
|
|
27
|
+
|
|
28
|
+
```ts
|
|
29
|
+
import { selfImprove } from '@tangle-network/agent-eval/contract'
|
|
30
|
+
|
|
31
|
+
const result = await selfImprove({
|
|
32
|
+
agent: (surface, scenario, ctx) =>
|
|
33
|
+
runYourAgent({ systemPrompt: surface as string, scenario, signal: ctx.signal }),
|
|
34
|
+
scenarios,
|
|
35
|
+
judge,
|
|
36
|
+
baselineSurface: 'You are a senior copywriter…',
|
|
37
|
+
budget: { dollars: 10, generations: 3 },
|
|
38
|
+
})
|
|
39
|
+
|
|
40
|
+
console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)
|
|
41
|
+
if (result.gateDecision === 'ship') {
|
|
42
|
+
// result.winner.surface is the optimized prompt
|
|
43
|
+
}
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
That's the LAND happy path. Smart defaults pick: in-memory storage,
|
|
47
|
+
`gepaDriver` with copywriting-flavored mutation primitives,
|
|
48
|
+
`defaultProductionGate` with `deltaThreshold: 0.05`, 25% deterministic
|
|
49
|
+
train/holdout split.
|
|
50
|
+
|
|
51
|
+
Every escape hatch the substrate exposes is reachable from
|
|
52
|
+
`selfImprove` — custom `driver`, custom `gate`, distributed-driver
|
|
53
|
+
`cellPlacement`, `onProgress` streaming callback, `autoOnPromote: 'pr'`
|
|
54
|
+
to open a GitHub PR with the winner. See the type signatures in
|
|
55
|
+
[`src/contract/self-improve.ts`](../src/contract/self-improve.ts) for
|
|
56
|
+
the full surface.
|
|
57
|
+
|
|
58
|
+
The sections below are the lower-level path — useful when you want
|
|
59
|
+
fine-grained control over each piece. Read those next if `selfImprove`
|
|
60
|
+
isn't enough.
|
|
22
61
|
|
|
23
62
|
## Five types, four functions
|
|
24
63
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.46.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -114,6 +114,11 @@
|
|
|
114
114
|
"import": "./dist/adapters/langchain.js",
|
|
115
115
|
"default": "./dist/adapters/langchain.js"
|
|
116
116
|
},
|
|
117
|
+
"./adapters/http": {
|
|
118
|
+
"types": "./dist/adapters/http.d.ts",
|
|
119
|
+
"import": "./dist/adapters/http.js",
|
|
120
|
+
"default": "./dist/adapters/http.js"
|
|
121
|
+
},
|
|
117
122
|
"./openapi.json": {
|
|
118
123
|
"default": "./dist/openapi.json"
|
|
119
124
|
}
|