@tangle-network/agent-eval 0.46.0 → 0.48.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/traceai.d.ts +109 -0
- package/dist/adapters/traceai.js +106 -0
- package/dist/adapters/traceai.js.map +1 -0
- package/dist/campaign/index.d.ts +2 -2
- package/dist/campaign/index.js +1 -1
- package/dist/chunk-OYI6RZJK.js +80 -0
- package/dist/chunk-OYI6RZJK.js.map +1 -0
- package/dist/{chunk-HRKOCLQA.js → chunk-XAP6DJZE.js} +1 -1
- package/dist/chunk-XAP6DJZE.js.map +1 -0
- package/dist/contract/index.d.ts +21 -3
- package/dist/contract/index.js +83 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/hosted/index.d.ts +192 -0
- package/dist/hosted/index.js +10 -0
- package/dist/hosted/index.js.map +1 -0
- package/dist/index.d.ts +6 -5
- package/dist/index.js +30 -3
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -2
- package/dist/multishot/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/{release-report-BtpgWRI0.d.ts → release-report-DBB8lB1P.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-CoJMs2Iz.d.ts → researcher-CHMO56K0.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/rl.js +3 -1
- package/dist/rl.js.map +1 -1
- package/dist/{run-improvement-loop-Bfam3MT1.d.ts → run-improvement-loop-B-L8GgpW.d.ts} +1 -1
- package/dist/{sequential-DdV5ShjT.d.ts → sequential-CbFH___X.d.ts} +23 -1
- package/dist/{types-DHqkLwEU.d.ts → types-CqPax19X.d.ts} +1 -1
- package/dist/verdict-CeEgtjyI.d.ts +32 -0
- package/docs/adapters-observability.md +15 -0
- package/docs/design/phase-d-rfc.md +125 -0
- package/docs/design/substrate-gaps-2026-05-27.md +118 -0
- package/docs/hosted-ingest-spec.md +204 -0
- package/package.json +22 -31
- package/dist/chunk-HRKOCLQA.js.map +0 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, g as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-8u72Gc76.js';
|
|
2
2
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
|
-
import { RunRecord } from '@tangle-network/agent-runtime';
|
|
4
3
|
import { R as RedTeamCase } from './red-team-30II1T4o.js';
|
|
4
|
+
import { R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* @experimental
|
|
@@ -35,7 +35,7 @@ import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
|
|
|
35
35
|
* specific promotion path (still useful for replay-style evals).
|
|
36
36
|
*/
|
|
37
37
|
|
|
38
|
-
type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
|
|
38
|
+
type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap' | 'cost_ceiling';
|
|
39
39
|
interface HeldOutGateConfig {
|
|
40
40
|
/** Minimum number of paired (candidate, baseline) holdout observations
|
|
41
41
|
* required before the gate will even consider promoting. Default 3. */
|
|
@@ -57,6 +57,19 @@ interface HeldOutGateConfig {
|
|
|
57
57
|
/** Optional deterministic seed for the bootstrap. Default undefined
|
|
58
58
|
* (Math.random). */
|
|
59
59
|
seed?: number;
|
|
60
|
+
/**
|
|
61
|
+
* Hard ceiling on the candidate's median per-task USD cost. When the
|
|
62
|
+
* candidate clears the quality gates (paired-delta + overfit-gap) but
|
|
63
|
+
* its median cost exceeds this number, the gate rejects with
|
|
64
|
+
* `cost_ceiling`. Default `undefined` = no cost ceiling, behaving
|
|
65
|
+
* exactly like the pre-cost gate.
|
|
66
|
+
*
|
|
67
|
+
* This exists because "we ship the better prompt" is only an honest
|
|
68
|
+
* pitch when the better prompt also fits a customer-stated budget.
|
|
69
|
+
* Cost is read from `RunRecord.costUsd` (already mandatory on every
|
|
70
|
+
* run) so no new schema is required.
|
|
71
|
+
*/
|
|
72
|
+
costPerTaskCeiling?: number;
|
|
60
73
|
}
|
|
61
74
|
interface GateEvidence {
|
|
62
75
|
/** Number of paired (candidate, baseline) holdout observations used. */
|
|
@@ -78,6 +91,14 @@ interface GateEvidence {
|
|
|
78
91
|
overfitGap: number;
|
|
79
92
|
/** Baseline (search − holdout) gap. */
|
|
80
93
|
baselineOverfitGap: number;
|
|
94
|
+
/** Median per-task USD cost across the candidate's runs. Recorded
|
|
95
|
+
* even when no `costPerTaskCeiling` is configured so downstream
|
|
96
|
+
* dashboards (intelligence.tangle.tools) can render \$/task per
|
|
97
|
+
* generation regardless of gating policy. */
|
|
98
|
+
medianCandidateCost: number;
|
|
99
|
+
/** Median per-task USD cost across the baseline runs, for
|
|
100
|
+
* symmetric reporting. */
|
|
101
|
+
medianBaselineCost: number;
|
|
81
102
|
}
|
|
82
103
|
interface GateDecision {
|
|
83
104
|
/** Final promote/no-promote verdict. */
|
|
@@ -106,6 +127,7 @@ declare class HeldOutGate {
|
|
|
106
127
|
private readonly confidence;
|
|
107
128
|
private readonly resamples;
|
|
108
129
|
private readonly seed?;
|
|
130
|
+
private readonly costPerTaskCeiling?;
|
|
109
131
|
constructor(config: HeldOutGateConfig);
|
|
110
132
|
/** Decide whether `candidate` should replace `baseline`. Pairing
|
|
111
133
|
* is by (experimentId, seed) — identical experiment + seed pairs
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validator-output verdict — substrate primitive for "did this output pass,
|
|
3
|
+
* and how well?"
|
|
4
|
+
*
|
|
5
|
+
* Used by:
|
|
6
|
+
* - `@tangle-network/agent-eval/matrix` — verdict per cell in the cartesian.
|
|
7
|
+
* - `@tangle-network/agent-runtime` — Validator<Output, Verdict = DefaultVerdict>.
|
|
8
|
+
* Runtime keeps `Validator` because it's coupled to runtime-shaped
|
|
9
|
+
* `ValidationCtx` (iteration, signal, traceEmitter); the verdict TYPE
|
|
10
|
+
* itself is a substrate concept and lives here.
|
|
11
|
+
*
|
|
12
|
+
* Repo layering: agent-eval is the substrate (no upward deps). Both
|
|
13
|
+
* agent-runtime and agent-knowledge consume this type FROM agent-eval —
|
|
14
|
+
* never the other way around. See CLAUDE.md "Repo layering" for the rule.
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* Minimal verdict shape — `valid` + `score` are required; `scores` +
|
|
18
|
+
* `notes` are optional surface. Validators that need richer shapes
|
|
19
|
+
* parameterise `Validator<Output, MyVerdict>` with their own type.
|
|
20
|
+
*/
|
|
21
|
+
interface DefaultVerdict {
|
|
22
|
+
/** Whether the output meets the validator's pass criteria. */
|
|
23
|
+
valid: boolean;
|
|
24
|
+
/** Aggregate score in [0, 1]. Drivers use this for winner selection. */
|
|
25
|
+
score: number;
|
|
26
|
+
/** Per-dimension scores. Free-form; weighted into `score` by the validator. */
|
|
27
|
+
scores?: Record<string, number>;
|
|
28
|
+
/** Human-readable rationale; surfaces in trace + final-result `winner.verdict`. */
|
|
29
|
+
notes?: string;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export type { DefaultVerdict as D };
|
|
@@ -35,6 +35,21 @@ it*. Unified at the trace level, you see both as one timeline per cell.
|
|
|
35
35
|
- Compose: register TraceAI's instrumentations on the global tracer
|
|
36
36
|
provider, then either point both at your OTLP collector or at
|
|
37
37
|
TraceAI's hosted backend if you want their UI.
|
|
38
|
+
- **Or use the bridge: `@tangle-network/agent-eval/adapters/traceai`.**
|
|
39
|
+
Forwards finished OTel spans (`ReadableSpan` shape) directly into the
|
|
40
|
+
hosted-tier ingest, lifting `tangle.runId` / `tangle.scenarioId` /
|
|
41
|
+
`tangle.cellId` / `tangle.generation` to first-class wire fields so
|
|
42
|
+
the dashboard pivots correctly. Zero dependency on `@opentelemetry/*`
|
|
43
|
+
at the substrate; consumers pass spans from their own OTel SDK.
|
|
44
|
+
```ts
|
|
45
|
+
import { createHostedClient } from '@tangle-network/agent-eval/hosted'
|
|
46
|
+
import { createTraceAiBridge } from '@tangle-network/agent-eval/adapters/traceai'
|
|
47
|
+
|
|
48
|
+
const client = createHostedClient({ endpoint, apiKey, tenantId })
|
|
49
|
+
const bridge = createTraceAiBridge({ client, defaultRunId: substrateRunId })
|
|
50
|
+
processor.onEnd = (span) => { void bridge.ingest([span]) }
|
|
51
|
+
// ...or call `bridge.ingest(batch)` from a SpanProcessor.onShutdown.
|
|
52
|
+
```
|
|
38
53
|
|
|
39
54
|
### Langfuse SDK
|
|
40
55
|
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# Phase D RFC — hosted-tier substrate
|
|
2
|
+
|
|
3
|
+
Pinned scope decisions for the EXPAND tier. What we built, what we
|
|
4
|
+
deliberately did NOT, and what's gated on Phase B evidence.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## What's in this version
|
|
9
|
+
|
|
10
|
+
**Wire-format substrate (shipped):**
|
|
11
|
+
|
|
12
|
+
1. `@tangle-network/agent-eval/hosted` — public client + types for shipping
|
|
13
|
+
eval-run events + trace spans to any orchestrator that speaks the wire
|
|
14
|
+
format.
|
|
15
|
+
2. `docs/hosted-ingest-spec.md` — semver-committed wire spec
|
|
16
|
+
(`HostedWireVersion = "2026-05-26.v1"`).
|
|
17
|
+
3. `examples/hosted-ingest-server/` — minimal hono-based reference
|
|
18
|
+
receiver (~200 LOC). Executable spec. Stays as the reference even
|
|
19
|
+
after the production orchestrator ships.
|
|
20
|
+
4. `selfImprove({ hostedTenant })` opt-in — when set, the substrate
|
|
21
|
+
POSTs the final eval-run event to the configured endpoint. Failures
|
|
22
|
+
are logged but never fail the loop (LAND tier never blocks on
|
|
23
|
+
EXPAND-tier infra).
|
|
24
|
+
|
|
25
|
+
**Production orchestrator (started):**
|
|
26
|
+
|
|
27
|
+
5. HTTP ingest service in `@tangle-network/monorepo` accepting the wire
|
|
28
|
+
format. Lives under the orchestrator app. Tenant auth + isolation
|
|
29
|
+
+ persistent storage + read endpoints. *Started this session — see
|
|
30
|
+
the @tangle-network/agent-dev-container PR. Not feature-complete:
|
|
31
|
+
tenant CRUD + adversarial isolation tests pending.*
|
|
32
|
+
|
|
33
|
+
## What's deliberately deferred
|
|
34
|
+
|
|
35
|
+
The wedge doc gates these on Phase B evidence — partner-validated
|
|
36
|
+
signal about what the hosted product actually needs to do. Shipping
|
|
37
|
+
them without that signal risks building the wrong thing.
|
|
38
|
+
|
|
39
|
+
| Deferred until Phase B passes | Why |
|
|
40
|
+
|---|---|
|
|
41
|
+
| **Metered billing wire-up (Stripe + cost-ledger)** | The billable units (per-eval-run, per-ingested-MB, per-seat) depend on actual partner consumption patterns. Picking dimensions in a vacuum locks us into wrong pricing. |
|
|
42
|
+
| **Multi-tenant dashboard UX** | Partners' first dashboard request defines the right default views. We have a stub list-runs page; the rest is post-signal. |
|
|
43
|
+
| **Webhook callbacks per tenant** | The events partners want pushed (gate-decided, cost-threshold, regression-alert) are partner-shaped. Add them when a partner asks. |
|
|
44
|
+
| **Cross-tenant aggregation / benchmarking** | This is the "Datadog for agents" tier — explicit roadmap, requires user volume we don't have. |
|
|
45
|
+
| **Sandbox-cost roll-up into hosted billing** | Cross-product billing integration requires PLATFORM-tier partners. Out of scope until at least one. |
|
|
46
|
+
| **Trace UI** | OTel-shape spans store fine. Visualization comes after partners ask. Phoenix / Jaeger / any OTLP-compatible viewer covers it in the interim. |
|
|
47
|
+
| **SOC 2 / compliance audit work** | Required for enterprise; not required for design partners. |
|
|
48
|
+
|
|
49
|
+
## Architecture decisions locked
|
|
50
|
+
|
|
51
|
+
These are committed and won't change without a major-version wire bump
|
|
52
|
+
or a documented migration:
|
|
53
|
+
|
|
54
|
+
1. **Wire format is JSON over HTTP**, not gRPC. Reasons: works in
|
|
55
|
+
browsers + edge + node + curl; OTel-compatible at the trace stream
|
|
56
|
+
level; lowest possible barrier to a self-hosted orchestrator.
|
|
57
|
+
2. **Tenant auth is bearer-token + tenant-id header**, not OIDC /
|
|
58
|
+
service-account / mutual-TLS. Reasons: simplest thing that's
|
|
59
|
+
actually secure with proper key handling; defers complex IAM until
|
|
60
|
+
enterprise demand.
|
|
61
|
+
3. **Idempotency via header, not transactional API**. Servers MUST
|
|
62
|
+
dedupe by `(tenantId, Idempotency-Key)` for 24h. Simpler than
|
|
63
|
+
making clients commit transactions.
|
|
64
|
+
4. **Eval-runs and traces are SEPARATE streams** with pivot keys
|
|
65
|
+
(`tangle.runId` etc.) on spans. Reasons: traces can be best-effort
|
|
66
|
+
(lossy) without corrupting eval-run semantics; orchestrators can
|
|
67
|
+
prioritize eval-run durability without forcing trace durability.
|
|
68
|
+
5. **Wire version is a date.v-N string**, not semver. Reasons: dates
|
|
69
|
+
communicate "when was this contract frozen"; v-N captures
|
|
70
|
+
incremental breaking changes between dates.
|
|
71
|
+
|
|
72
|
+
## Open questions for Phase B to answer
|
|
73
|
+
|
|
74
|
+
When the design-partner pairing happens, capture answers to these
|
|
75
|
+
explicitly:
|
|
76
|
+
|
|
77
|
+
1. **Surface confidentiality**: do partners want the verbatim surface
|
|
78
|
+
(system prompt) shipped, or just the hash? Today the wire format
|
|
79
|
+
has `surface?` as optional; partner default is what we ship.
|
|
80
|
+
2. **Trace sampling**: at what cells-per-second do trace spans become
|
|
81
|
+
noise? What's the right default sampling rate?
|
|
82
|
+
3. **Cost attribution granularity**: per cell? per generation? per
|
|
83
|
+
run? Per judge dimension? Partner needs determine what we surface
|
|
84
|
+
in billing reports.
|
|
85
|
+
4. **Replay**: do partners want to re-run an old eval-run from the
|
|
86
|
+
stored data? That would require us to store more than the summary —
|
|
87
|
+
actual artifacts + prompts. Storage cost implication.
|
|
88
|
+
5. **PII / sensitive scenarios**: how do partners want to handle
|
|
89
|
+
scenarios containing user data? Encryption-at-rest is table stakes;
|
|
90
|
+
redaction-at-ingest may be required for some.
|
|
91
|
+
|
|
92
|
+
The partner pairing kit (`docs/phase-b-pairing-kit.md`) has discovery
|
|
93
|
+
questions that probe these.
|
|
94
|
+
|
|
95
|
+
## Non-goals (explicit)
|
|
96
|
+
|
|
97
|
+
This RFC does NOT plan for:
|
|
98
|
+
|
|
99
|
+
- Replacing Langfuse / Phoenix / Arize. We INGEST OTel; we don't
|
|
100
|
+
build a generic trace viewer. The dashboard is eval-run-shaped, not
|
|
101
|
+
trace-shaped.
|
|
102
|
+
- Becoming a model gateway. Tangle Router exists; the hosted
|
|
103
|
+
orchestrator routes to Tangle Router by default but doesn't
|
|
104
|
+
duplicate its function.
|
|
105
|
+
- Becoming an LLM-call CDN. Caching is the consumer's job (their
|
|
106
|
+
agent code, their HTTP client). We don't intercept LLM calls.
|
|
107
|
+
- Building an "agents IDE." Substrate, not surface.
|
|
108
|
+
|
|
109
|
+
## Migration path (post Phase B)
|
|
110
|
+
|
|
111
|
+
When Phase B passes the gate, the production orchestrator finishes:
|
|
112
|
+
|
|
113
|
+
1. Replace in-memory store with Postgres (tenant data) + S3 (large
|
|
114
|
+
artifacts) OR Cloudflare D1 + R2 (Workers-native).
|
|
115
|
+
2. Wire metered events to Stripe + the cost-ledger.
|
|
116
|
+
3. Tenant CRUD UI + onboarding flow.
|
|
117
|
+
4. Multi-tenant dashboard MVP (list runs, drill into one, diff
|
|
118
|
+
generations, view shipped prompt).
|
|
119
|
+
5. Adversarial tenant-isolation test battery in CI.
|
|
120
|
+
6. Webhooks + observability for the orchestrator itself.
|
|
121
|
+
|
|
122
|
+
Estimated effort post-Phase-B: ~1 week focused work for one engineer.
|
|
123
|
+
This is fast precisely BECAUSE the wire format is locked and the
|
|
124
|
+
reference receiver exists — the production server is a different
|
|
125
|
+
implementation of the same contract.
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Substrate gaps — design-partner readiness
|
|
2
|
+
|
|
3
|
+
What's missing from `@tangle-network/agent-eval` substrate (this repo) and `~/code/agent-dev-container/products/intelligence/` (the orchestrator) to credibly hand to a first design partner.
|
|
4
|
+
|
|
5
|
+
This doc is the engineering-side mirror of `~/company/gtm/experiments/2026-05-27/design-partner-readiness.md` — gtm tracks the partner-facing readiness, this tracks the code that backs each bar.
|
|
6
|
+
|
|
7
|
+
## Current substrate state
|
|
8
|
+
|
|
9
|
+
Shipped in v0.47:
|
|
10
|
+
- `selfImprove({ scenarios, judges, dispatch, hostedTenant })` one-shot helper
|
|
11
|
+
- `defaultProductionGate(deltaThreshold)` autonomous-ship gate
|
|
12
|
+
- Wire format frozen at `HOSTED_WIRE_VERSION = '2026-05-26.v1'`
|
|
13
|
+
- `/hosted/client.ts` — bearer auth, idempotency, bounded retries on 5xx/408/429
|
|
14
|
+
- `examples/hosted-ingest-server/` — reference receiver implementing the spec
|
|
15
|
+
- `docs/hosted-ingest-spec.md` — semver-locked wire spec
|
|
16
|
+
- `docs/design/phase-d-rfc.md` — scope decisions + deferred items
|
|
17
|
+
- `docs/quickstart-external.md` — foreign-agent quickstart
|
|
18
|
+
- `docs/phase-b-pairing-kit.md` — partner discovery script
|
|
19
|
+
- `adapters/langchain` + `adapters/http`
|
|
20
|
+
|
|
21
|
+
## Engineering gaps keyed to first-partner readiness
|
|
22
|
+
|
|
23
|
+
### Substrate gaps (this repo)
|
|
24
|
+
|
|
25
|
+
**S1. TraceAI/OTel adapter (`adapters/traceai`).**
|
|
26
|
+
Future AGI's `traceai` library is the strongest OTel-native instrumentation in the TS ecosystem. Partners using it should be able to wire its emitted spans into our hosted ingest via one config line. The adapter receives OTel spans, normalizes them to `TraceSpanEvent`, ensures the `tangle.runId` attribute is present, and forwards via the existing hosted client.
|
|
27
|
+
|
|
28
|
+
Path: `src/adapters/traceai.ts`. Export from `tsup.config.ts`. Add to `docs/adapters-observability.md`.
|
|
29
|
+
|
|
30
|
+
Estimate: 6-8h. Owner: claude. Priority: medium (Tier C in partner-readiness — defer until first partner asks, but pre-build the contract).
|
|
31
|
+
|
|
32
|
+
**S2. Run-diff data primitive.**
|
|
33
|
+
The orchestrator needs to render "v3 vs v4" comparisons. The substrate should expose a `diffRuns(runA: EvalRunEvent, runB: EvalRunEvent): RunDiff` helper that computes cell-by-cell judge-score deltas, artifact-text diff (using a stable diff algorithm), and lift summary. Without this, every consumer rebuilds the diff logic.
|
|
34
|
+
|
|
35
|
+
Path: `src/contract/diff.ts`. Add to `/contract` entry.
|
|
36
|
+
|
|
37
|
+
Estimate: 4-6h. Owner: claude. Priority: high (orchestrator's run-diff view depends on this).
|
|
38
|
+
|
|
39
|
+
**S3. Sampling controls in hosted client.**
|
|
40
|
+
The Phase D RFC flags trace sampling as an open question. Add `sampling: { traces: number /* 0-1 */ }` to `createHostedClient` options. Default 1.0. Document the cost implication. Reservoir-sample if over budget.
|
|
41
|
+
|
|
42
|
+
Path: `src/hosted/client.ts`. Update `docs/hosted-ingest-spec.md` accordingly.
|
|
43
|
+
|
|
44
|
+
Estimate: 2h. Owner: claude. Priority: medium (Tier B partner-readiness).
|
|
45
|
+
|
|
46
|
+
**S4. Auto-instrumentation library (`@tangle-network/agent-eval/auto`).**
|
|
47
|
+
LangSmith's `@traceable` decorator + auto-wrap of OpenAI/Anthropic SDKs is their highest-leverage adoption tool. Build the equivalent: a `traceable()` HOF that emits OTel spans with `tangle.runId` attribute and forwards via the hosted client. Optional auto-wrap of `OpenAI` / `Anthropic` SDK clients.
|
|
48
|
+
|
|
49
|
+
This is the biggest unlock for non-LangChain TS partners. Defer until at least one partner asks — pre-shipping costs 16-20h and may be the wrong shape without partner signal.
|
|
50
|
+
|
|
51
|
+
Path: `src/auto/index.ts` (new entry). Estimate: 16-20h. Owner: claude. Priority: low (Tier C — defer).
|
|
52
|
+
|
|
53
|
+
**S5. Surface-confidentiality option in wire format.**
|
|
54
|
+
RFC open question 1. Add `surfaceMode: 'verbatim' | 'hashed' | 'omitted'` to `selfImprove` config. When `'hashed'`, ship `surfaceHash` instead of `surface` on the eval-run event. When `'omitted'`, ship neither.
|
|
55
|
+
|
|
56
|
+
This is partner-shaped — wait for the first conversation. But the wire format should accommodate it without a breaking version bump. Add the optional field to types now.
|
|
57
|
+
|
|
58
|
+
Path: `src/hosted/types.ts` (add `surfaceHash?: string`). Estimate: 1h to land the type change; later partner work to wire selfImprove. Priority: low until asked.
|
|
59
|
+
|
|
60
|
+
### Orchestrator gaps (`agent-dev-container/products/intelligence/`)
|
|
61
|
+
|
|
62
|
+
These are the gtm-doc's Tier A items rephrased for engineering tracking.
|
|
63
|
+
|
|
64
|
+
**O1. Adversarial tenant-isolation test suite.**
|
|
65
|
+
`tests/auth.test.ts` exists. Need `tests/isolation.test.ts` covering: cross-tenant header mismatch, cross-tenant `/v1/runs/:id` reads, webhook tenant scoping, idempotency-key tenant scoping, raw-SQL cross-tenant query, JWT replay after revocation. Use `VITEST_INTEGRATION=1` with a real Postgres in CI.
|
|
66
|
+
|
|
67
|
+
Estimate: 4-6h. Owner: claude. Priority: **critical** (blocker for any partner conversation).
|
|
68
|
+
|
|
69
|
+
**O2. Web dashboard MVP.**
|
|
70
|
+
List runs + run detail + login. See gtm doc A2 for shape. Pages: `/login`, `/runs`, `/runs/:id`, `/keys`. Use `intelligence-web` Vite scaffold; wire to `/v1/runs*` reads.
|
|
71
|
+
|
|
72
|
+
Estimate: 12-16h. Owner: claude. Priority: **critical** (without UI, partner can't show anyone in their org).
|
|
73
|
+
|
|
74
|
+
**O3. Free-tier plan limits enforcement.**
|
|
75
|
+
`lib/plans.ts` defines limits. `routes/ingest.ts` does not enforce them. Add per-tenant counters (eval-runs/mo, trace-spans/day), check against plan, return 429 with clear message + reset time on exceed.
|
|
76
|
+
|
|
77
|
+
Estimate: 3-4h. Owner: claude. Priority: medium (Tier B partner-readiness).
|
|
78
|
+
|
|
79
|
+
**O4. Stale README sweep.**
|
|
80
|
+
`api/README.md` lists T0-3..T0-8 as "next/pending" when 5 of them are shipped. Broken link to `../../../docs/intelligence-product-rfc.md` (gitignored path; should be `../RFC.md`).
|
|
81
|
+
|
|
82
|
+
Estimate: 30min. Owner: claude. Priority: **must-fix-now** (quick job; a stale README is an awful first impression for anyone reading the repo).
|
|
83
|
+
|
|
84
|
+
**O5. Onboarding partner-facing doc.**
|
|
85
|
+
Engineer-shaped `quickstart-external.md` exists in this repo. Partner-facing 10-minute walkthrough does not. Lives at `intelligence.tangle.tools/docs` once provisioned; for now, write at `products/intelligence/docs/partner-onboarding.md`.
|
|
86
|
+
|
|
87
|
+
Estimate: 2-3h. Owner: claude. Priority: high (Tier A — needed for any partner call).
|
|
88
|
+
|
|
89
|
+
## Recommended sequencing (engineering view)
|
|
90
|
+
|
|
91
|
+
**Sprint 1 — partner-ready (≤ 1 week):**
|
|
92
|
+
- O4 (README sweep) — ship today
|
|
93
|
+
- O1 (isolation tests) — 4-6h
|
|
94
|
+
- O2 (dashboard MVP) — 12-16h
|
|
95
|
+
- O5 (partner onboarding doc) — 2-3h
|
|
96
|
+
- S2 (run-diff primitive) — 4-6h (substrate side of O2 follow-up)
|
|
97
|
+
|
|
98
|
+
Total: ~23-32h focused work. One engineer-week.
|
|
99
|
+
|
|
100
|
+
**Sprint 2 — concurrent with first partner conversations:**
|
|
101
|
+
- O3 (plan limits enforcement) — partner will hit it
|
|
102
|
+
- S3 (trace sampling) — partner will ask about cost
|
|
103
|
+
|
|
104
|
+
**Sprint 3 — after first partner ships to prod:**
|
|
105
|
+
- S1 (TraceAI adapter)
|
|
106
|
+
- Stripe billing wire-up in orchestrator
|
|
107
|
+
|
|
108
|
+
**Holding for partner signal:**
|
|
109
|
+
- S4 (auto-instrumentation library) — 16-20h speculative without ask
|
|
110
|
+
- S5 wiring (surface confidentiality) — partner-shaped
|
|
111
|
+
|
|
112
|
+
## Cross-references
|
|
113
|
+
|
|
114
|
+
- `docs/design/phase-d-rfc.md` — substrate scope decisions (this doc operationalizes its "what's deferred until Phase B")
|
|
115
|
+
- `docs/hosted-ingest-spec.md` — wire format spec (any change here is a wire-version bump)
|
|
116
|
+
- `~/company/gtm/experiments/2026-05-27/design-partner-readiness.md` — partner-facing readiness, mirrors this
|
|
117
|
+
- `~/company/gtm/products/tangle-intelligence.md` — product hub
|
|
118
|
+
- `~/company/gtm/competitor-analysis/agent-improvement.md` — competitive frame
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# Hosted-ingest wire spec — `2026-05-26.v1`
|
|
2
|
+
|
|
3
|
+
The schema **every** orchestrator (ours, partners' self-hosted ones,
|
|
4
|
+
any future open implementation) must accept. Frozen under semver:
|
|
5
|
+
**new minors only add optional fields. Breaking changes mean a major
|
|
6
|
+
bump and a new `HostedWireVersion` literal.**
|
|
7
|
+
|
|
8
|
+
This is the contract that decouples the LAND-tier substrate
|
|
9
|
+
(`@tangle-network/agent-eval`) from the EXPAND-tier hosted product. A
|
|
10
|
+
foreign builder can:
|
|
11
|
+
|
|
12
|
+
- Use our orchestrator at `https://orchestrator.tangle.tools/v1`.
|
|
13
|
+
- Self-host the reference receiver from
|
|
14
|
+
`examples/hosted-ingest-server/`.
|
|
15
|
+
- Implement their own orchestrator against this spec.
|
|
16
|
+
|
|
17
|
+
All three are wire-compatible by definition.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Transport
|
|
22
|
+
|
|
23
|
+
Two endpoints, both `POST`, both JSON. Headers on every request:
|
|
24
|
+
|
|
25
|
+
| Header | Value |
|
|
26
|
+
|---|---|
|
|
27
|
+
| `Authorization` | `Bearer <tenant-key>` (the orchestrator issues this) |
|
|
28
|
+
| `Content-Type` | `application/json` |
|
|
29
|
+
| `X-Tangle-Tenant-Id` | The tenant's stable id (the orchestrator's primary key for the tenant) |
|
|
30
|
+
| `X-Tangle-Wire-Version` | `2026-05-26.v1` (this spec) |
|
|
31
|
+
| `Idempotency-Key` (optional) | UUID; servers MUST deduplicate requests that repeat a key |
|
|
32
|
+
|
|
33
|
+
Responses are JSON of shape `{ accepted: number, rejected: Array<{ index, reason }> }`. The
|
|
34
|
+
server SHOULD return 202 (accepted, async) or 200 (accepted, synchronous);
|
|
35
|
+
both are equivalent for the wire's purposes.
|
|
36
|
+
|
|
37
|
+
### `POST /v1/ingest/eval-runs`
|
|
38
|
+
|
|
39
|
+
Body: `IngestEvalRunsRequest = { wireVersion, events: EvalRunEvent[] }`.
|
|
40
|
+
|
|
41
|
+
One ingest call per logical eval-run; generations stream in
|
|
42
|
+
incrementally via repeated calls with the same `runId`. The
|
|
43
|
+
orchestrator deduplicates by `(tenantId, runId, generation.index)`.
|
|
44
|
+
|
|
45
|
+
### `POST /v1/ingest/traces`
|
|
46
|
+
|
|
47
|
+
Body: `IngestTracesRequest = { wireVersion, spans: TraceSpanEvent[] }`.
|
|
48
|
+
|
|
49
|
+
Standard OTLP-shaped spans with a few additional attributes
|
|
50
|
+
(`tangle.runId`, `tangle.generation`, `tangle.cellId`,
|
|
51
|
+
`tangle.scenarioId`) so the orchestrator can pivot between the
|
|
52
|
+
eval-run stream and the underlying execution trace.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## `EvalRunEvent`
|
|
57
|
+
|
|
58
|
+
```ts
|
|
59
|
+
interface EvalRunEvent {
|
|
60
|
+
runId: string // stable; same id across all generations of one run
|
|
61
|
+
runDir: string // logical run directory (mem://... or filesystem path)
|
|
62
|
+
timestamp: string // ISO-8601
|
|
63
|
+
status: // lifecycle stage this event represents
|
|
64
|
+
| 'started'
|
|
65
|
+
| 'baseline-complete'
|
|
66
|
+
| 'generation-complete'
|
|
67
|
+
| 'gate-decided'
|
|
68
|
+
| 'finished'
|
|
69
|
+
| 'errored'
|
|
70
|
+
labels: Record<string, string> // free-form (env, branch, model id, etc.)
|
|
71
|
+
baseline?: EvalRunGenerationSnapshot // present when status >= baseline-complete
|
|
72
|
+
generations: EvalRunGenerationSnapshot[]
|
|
73
|
+
gateDecision?: // present when status >= gate-decided
|
|
74
|
+
| 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'
|
|
75
|
+
holdoutLift?: number // winner-on-holdout - baseline-on-holdout
|
|
76
|
+
totalCostUsd: number
|
|
77
|
+
totalDurationMs: number
|
|
78
|
+
errorMessage?: string // present when status === 'errored'
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## `EvalRunGenerationSnapshot`
|
|
83
|
+
|
|
84
|
+
```ts
|
|
85
|
+
interface EvalRunGenerationSnapshot {
|
|
86
|
+
index: number // 0 is baseline; 1..N are improvement generations
|
|
87
|
+
surfaceHash: string // stable hash of the candidate surface (pivot key)
|
|
88
|
+
surface?: MutableSurface // omitted when the consumer prefers not to ship it (e.g. to avoid PII)
|
|
89
|
+
cells: EvalRunCellScore[]
|
|
90
|
+
compositeMean: number
|
|
91
|
+
costUsd: number
|
|
92
|
+
durationMs: number
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## `EvalRunCellScore`
|
|
97
|
+
|
|
98
|
+
```ts
|
|
99
|
+
interface EvalRunCellScore {
|
|
100
|
+
scenarioId: string
|
|
101
|
+
rep: number // 0 for the default; > 0 when reps > 1
|
|
102
|
+
compositeMean: number // composite across all judges + dimensions
|
|
103
|
+
dimensions: Record< // outer key = judge name; inner = dimension name → score
|
|
104
|
+
string,
|
|
105
|
+
Record<string, number>
|
|
106
|
+
>
|
|
107
|
+
errorMessage?: string // present when the dispatch threw
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## `TraceSpanEvent`
|
|
112
|
+
|
|
113
|
+
```ts
|
|
114
|
+
interface TraceSpanEvent {
|
|
115
|
+
// Standard OTel
|
|
116
|
+
traceId: string
|
|
117
|
+
spanId: string
|
|
118
|
+
parentSpanId?: string
|
|
119
|
+
name: string
|
|
120
|
+
startTimeUnixNano: number
|
|
121
|
+
endTimeUnixNano: number
|
|
122
|
+
attributes: Record<string, string | number | boolean>
|
|
123
|
+
events?: Array<{ timeUnixNano, name, attributes? }>
|
|
124
|
+
status?: { code: 'OK' | 'ERROR' | 'UNSET', message? }
|
|
125
|
+
|
|
126
|
+
// Tangle additions (all optional) for pivoting
|
|
127
|
+
'tangle.runId'?: string
|
|
128
|
+
'tangle.generation'?: number
|
|
129
|
+
'tangle.cellId'?: string
|
|
130
|
+
'tangle.scenarioId'?: string
|
|
131
|
+
}
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Server requirements
|
|
137
|
+
|
|
138
|
+
Any orchestrator implementing this spec MUST:
|
|
139
|
+
|
|
140
|
+
1. **Validate auth**: reject without `Authorization` header (401), with a
|
|
141
|
+
mismatched bearer token (401), or without a recognized `X-Tangle-Tenant-Id`
|
|
142
|
+
(404).
|
|
143
|
+
2. **Validate wire version**: reject incompatible wire versions (400 with
|
|
144
|
+
a clear error message). The major component is the breaking-change axis.
|
|
145
|
+
3. **Validate tenant isolation**: queries with `tenantId` X never return
|
|
146
|
+
data tagged with `tenantId` Y. Test this adversarially.
|
|
147
|
+
4. **Honor idempotency**: when an `Idempotency-Key` matches a prior
|
|
148
|
+
request from the same tenant in the last 24h, return the same response
|
|
149
|
+
without double-processing.
|
|
150
|
+
5. **Persist eval-runs durably**: at least the event + cell scores must
|
|
151
|
+
survive an orchestrator restart. Trace spans MAY be best-effort.
|
|
152
|
+
6. **Provide read access**: GET endpoints for the tenant to list + fetch
|
|
153
|
+
their own runs. Wire format for reads is NOT part of this spec — each
|
|
154
|
+
orchestrator can pick its own (REST + JSON, gRPC, GraphQL).
|
|
155
|
+
|
|
156
|
+
Servers SHOULD also:
|
|
157
|
+
|
|
158
|
+
- Provide a webhook callback per tenant for `gate-decided` events.
|
|
159
|
+
- Provide a billable-events emitter (Stripe meter / equivalent) per ingest
|
|
160
|
+
call so consumption can be metered.
|
|
161
|
+
- Provide a dashboard or API to view + diff per-scenario lifts over time.
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Reference implementation
|
|
166
|
+
|
|
167
|
+
`examples/hosted-ingest-server/` — a minimal hono-based receiver. ~200
|
|
168
|
+
LOC. Validates auth, accepts ingest, stores in memory, exposes a
|
|
169
|
+
read endpoint. Runs anywhere Node runs.
|
|
170
|
+
|
|
171
|
+
```sh
|
|
172
|
+
TENANT_KEY=dev-token TENANT_ID=acme pnpm tsx examples/hosted-ingest-server/server.ts
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
In another terminal:
|
|
176
|
+
|
|
177
|
+
```sh
|
|
178
|
+
HOSTED_ENDPOINT=http://localhost:8080 \
|
|
179
|
+
HOSTED_TENANT_KEY=dev-token \
|
|
180
|
+
HOSTED_TENANT_ID=acme \
|
|
181
|
+
pnpm tsx examples/foreign-agent-quickstart/index.ts
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
The quickstart's eval-run gets POSTed to the reference receiver; the
|
|
185
|
+
receiver's `GET /v1/runs` lists it back.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Versioning
|
|
190
|
+
|
|
191
|
+
`HostedWireVersion` is `"2026-05-26.v1"`.
|
|
192
|
+
|
|
193
|
+
- Adding an optional field → no version change.
|
|
194
|
+
- Adding a new endpoint or new event type → minor wire bump
|
|
195
|
+
(`2026-05-26.v2`).
|
|
196
|
+
- Changing the shape of an existing field, removing a field, or
|
|
197
|
+
changing semantics of an existing field → major wire bump
|
|
198
|
+
(`2026-11-XX.v1`); a server may accept both versions during a
|
|
199
|
+
transition window.
|
|
200
|
+
|
|
201
|
+
Servers MUST reject requests with `X-Tangle-Wire-Version` they don't
|
|
202
|
+
support, with a 400 listing the versions they DO accept.
|
|
203
|
+
|
|
204
|
+
The version string IS the spec id — pin against it.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.46.0",
|
|
3
|
+
"version": "0.48.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -119,6 +119,16 @@
|
|
|
119
119
|
"import": "./dist/adapters/http.js",
|
|
120
120
|
"default": "./dist/adapters/http.js"
|
|
121
121
|
},
|
|
122
|
+
"./adapters/traceai": {
|
|
123
|
+
"types": "./dist/adapters/traceai.d.ts",
|
|
124
|
+
"import": "./dist/adapters/traceai.js",
|
|
125
|
+
"default": "./dist/adapters/traceai.js"
|
|
126
|
+
},
|
|
127
|
+
"./hosted": {
|
|
128
|
+
"types": "./dist/hosted/index.d.ts",
|
|
129
|
+
"import": "./dist/hosted/index.js",
|
|
130
|
+
"default": "./dist/hosted/index.js"
|
|
131
|
+
},
|
|
122
132
|
"./openapi.json": {
|
|
123
133
|
"default": "./dist/openapi.json"
|
|
124
134
|
}
|
|
@@ -134,18 +144,6 @@
|
|
|
134
144
|
"publishConfig": {
|
|
135
145
|
"access": "public"
|
|
136
146
|
},
|
|
137
|
-
"scripts": {
|
|
138
|
-
"build": "tsup && pnpm openapi",
|
|
139
|
-
"dev": "tsup --watch",
|
|
140
|
-
"prepare": "husky",
|
|
141
|
-
"prepublishOnly": "pnpm build",
|
|
142
|
-
"test": "vitest run",
|
|
143
|
-
"test:watch": "vitest",
|
|
144
|
-
"typecheck": "tsc --noEmit",
|
|
145
|
-
"lint": "biome check src",
|
|
146
|
-
"format": "biome format --write src",
|
|
147
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
148
|
-
},
|
|
149
147
|
"dependencies": {
|
|
150
148
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
151
149
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -155,20 +153,15 @@
|
|
|
155
153
|
"zod": "^4.3.6"
|
|
156
154
|
},
|
|
157
155
|
"peerDependencies": {
|
|
158
|
-
"@tangle-network/agent-runtime": ">=0.21.0 <0.26.0",
|
|
159
156
|
"@tangle-network/sandbox": ">=0.2.1 <0.4.0"
|
|
160
157
|
},
|
|
161
158
|
"peerDependenciesMeta": {
|
|
162
|
-
"@tangle-network/agent-runtime": {
|
|
163
|
-
"optional": true
|
|
164
|
-
},
|
|
165
159
|
"@tangle-network/sandbox": {
|
|
166
160
|
"optional": true
|
|
167
161
|
}
|
|
168
162
|
},
|
|
169
163
|
"devDependencies": {
|
|
170
164
|
"@biomejs/biome": "^2.4.15",
|
|
171
|
-
"@tangle-network/agent-runtime": ">=0.21.0 <0.26.0",
|
|
172
165
|
"@tangle-network/sandbox": "0.3.0",
|
|
173
166
|
"@types/node": "^25.6.0",
|
|
174
167
|
"husky": "^9.1.7",
|
|
@@ -178,17 +171,6 @@
|
|
|
178
171
|
"typescript": "^5.7.0",
|
|
179
172
|
"vitest": "^3.0.0"
|
|
180
173
|
},
|
|
181
|
-
"pnpm": {
|
|
182
|
-
"minimumReleaseAge": 4320,
|
|
183
|
-
"minimumReleaseAgeExclude": [
|
|
184
|
-
"@tangle-network/sandbox",
|
|
185
|
-
"@tangle-network/agent-runtime"
|
|
186
|
-
],
|
|
187
|
-
"overrides": {
|
|
188
|
-
"postcss@<8.5.10": "^8.5.10",
|
|
189
|
-
"ws@>=8.0.0 <8.20.1": "^8.20.1"
|
|
190
|
-
}
|
|
191
|
-
},
|
|
192
174
|
"engines": {
|
|
193
175
|
"node": ">=20"
|
|
194
176
|
},
|
|
@@ -201,5 +183,14 @@
|
|
|
201
183
|
]
|
|
202
184
|
},
|
|
203
185
|
"license": "MIT",
|
|
204
|
-
"
|
|
205
|
-
|
|
186
|
+
"scripts": {
|
|
187
|
+
"build": "tsup && pnpm openapi",
|
|
188
|
+
"dev": "tsup --watch",
|
|
189
|
+
"test": "vitest run",
|
|
190
|
+
"test:watch": "vitest",
|
|
191
|
+
"typecheck": "tsc --noEmit",
|
|
192
|
+
"lint": "biome check src",
|
|
193
|
+
"format": "biome format --write src",
|
|
194
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
195
|
+
}
|
|
196
|
+
}
|