@tangle-network/agent-eval 0.52.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +7 -6
  5. package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
  6. package/dist/benchmarks/index.d.ts +3 -2
  7. package/dist/builder-eval/index.d.ts +4 -3
  8. package/dist/campaign/index.d.ts +9 -7
  9. package/dist/campaign/index.js +33 -4
  10. package/dist/campaign/index.js.map +1 -1
  11. package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
  12. package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
  13. package/dist/chunk-NCK5QLGT.js.map +1 -0
  14. package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
  15. package/dist/contract/index.d.ts +25 -12
  16. package/dist/contract/index.js +171 -0
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
  19. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
  20. package/dist/control.d.ts +7 -6
  21. package/dist/control.js +2 -2
  22. package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
  23. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
  24. package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
  25. package/dist/governance/index.d.ts +3 -2
  26. package/dist/hosted/index.d.ts +7 -6
  27. package/dist/{index-DQHtWQ57.d.ts → index-D2nT6_KT.d.ts} +66 -2
  28. package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
  29. package/dist/index.d.ts +31 -29
  30. package/dist/index.js +3 -3
  31. package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
  32. package/dist/knowledge/index.d.ts +4 -3
  33. package/dist/meta-eval/index.d.ts +4 -3
  34. package/dist/openapi.json +1 -1
  35. package/dist/pipelines/index.d.ts +7 -6
  36. package/dist/prm/index.d.ts +5 -4
  37. package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
  38. package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
  39. package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
  40. package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
  41. package/dist/reporting.d.ts +7 -6
  42. package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
  43. package/dist/rl.d.ts +11 -10
  44. package/dist/rl.js +2 -2
  45. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
  46. package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
  47. package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
  48. package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
  49. package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
  50. package/dist/store-CKUAgsJz.d.ts +101 -0
  51. package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
  52. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
  53. package/dist/traces.d.ts +7 -6
  54. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
  55. package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
  56. package/dist/wire/index.d.ts +5 -4
  57. package/docs/design/self-improvement-protocol.md +223 -0
  58. package/docs/pilot/README.md +62 -0
  59. package/docs/pilot/customer-checklist.md +90 -0
  60. package/docs/pilot/integration-foreign-stack.md +296 -0
  61. package/docs/pilot/integration-tangle-stack.md +248 -0
  62. package/docs/pilot/one-pager.md +161 -0
  63. package/docs/pilot/sample-insight-report.json +172 -0
  64. package/docs/research/research-roadmap.md +204 -0
  65. package/package.json +1 -1
  66. package/dist/chunk-BWZEGTES.js.map +0 -1
  67. /package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
  68. /package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0
@@ -196,102 +196,4 @@ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
196
196
  declare function isJudgeSpan(s: Span): s is JudgeSpan;
197
197
  declare function isSandboxSpan(s: Span): s is SandboxSpan;
198
198
 
199
- interface RunFilter {
200
- scenarioId?: string;
201
- variantId?: string;
202
- status?: RunStatus;
203
- since?: number;
204
- until?: number;
205
- tag?: {
206
- key: string;
207
- value: string;
208
- };
209
- parentRunId?: string;
210
- projectId?: string;
211
- chatId?: string;
212
- layer?: RunLayer;
213
- }
214
- interface SpanFilter {
215
- runId?: string;
216
- parentSpanId?: string;
217
- kind?: SpanKind;
218
- name?: string;
219
- toolName?: string;
220
- judgeId?: string;
221
- since?: number;
222
- until?: number;
223
- }
224
- interface EventFilter {
225
- runId?: string;
226
- spanId?: string;
227
- kind?: EventKind;
228
- since?: number;
229
- until?: number;
230
- }
231
- interface TraceStore {
232
- appendRun(run: Run): Promise<void>;
233
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
234
- appendSpan(span: Span): Promise<void>;
235
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
236
- appendEvent(event: TraceEvent): Promise<void>;
237
- appendArtifact(artifact: Artifact): Promise<void>;
238
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
239
- getRun(runId: string): Promise<Run | undefined>;
240
- listRuns(filter?: RunFilter): Promise<Run[]>;
241
- spans(filter?: SpanFilter): Promise<Span[]>;
242
- events(filter?: EventFilter): Promise<TraceEvent[]>;
243
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
244
- artifacts(runId: string): Promise<Artifact[]>;
245
- }
246
- declare class InMemoryTraceStore implements TraceStore {
247
- private runs;
248
- private allSpans;
249
- private allEvents;
250
- private allArtifacts;
251
- private allBudget;
252
- appendRun(run: Run): Promise<void>;
253
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
254
- appendSpan(span: Span): Promise<void>;
255
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
256
- appendEvent(event: TraceEvent): Promise<void>;
257
- appendArtifact(artifact: Artifact): Promise<void>;
258
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
259
- getRun(runId: string): Promise<Run | undefined>;
260
- listRuns(filter?: RunFilter): Promise<Run[]>;
261
- spans(filter?: SpanFilter): Promise<Span[]>;
262
- events(filter?: EventFilter): Promise<TraceEvent[]>;
263
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
264
- artifacts(runId: string): Promise<Artifact[]>;
265
- }
266
- interface FileSystemTraceStoreOptions {
267
- dir: string;
268
- /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
269
- maxBytes?: number;
270
- }
271
- declare class FileSystemTraceStore implements TraceStore {
272
- private dir;
273
- private maxBytes;
274
- /** Lazy in-memory index for queries — populated on first read. */
275
- private index?;
276
- private loaded;
277
- constructor(options: FileSystemTraceStoreOptions);
278
- private ensureDir;
279
- private append;
280
- private insertInto;
281
- private load;
282
- appendRun(run: Run): Promise<void>;
283
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
284
- appendSpan(span: Span): Promise<void>;
285
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
286
- appendEvent(event: TraceEvent): Promise<void>;
287
- appendArtifact(artifact: Artifact): Promise<void>;
288
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
289
- getRun(runId: string): Promise<Run | undefined>;
290
- listRuns(filter?: RunFilter): Promise<Run[]>;
291
- spans(filter?: SpanFilter): Promise<Span[]>;
292
- events(filter?: EventFilter): Promise<TraceEvent[]>;
293
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
294
- artifacts(runId: string): Promise<Artifact[]>;
295
- }
296
-
297
- export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
199
+ export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type ToolSpan as T, type TraceEvent as a, type RunOutcome as b, type SpanKind as c, type RetrievalSpan as d, type SandboxSpan as e, type RunStatus as f, type RunLayer as g, type BudgetSpec as h, FAILURE_CLASSES as i, type SpanBase as j, type SpanStatus as k, TRACE_SCHEMA_VERSION as l, isJudgeSpan as m, isLlmSpan as n, isRetrievalSpan as o, isSandboxSpan as p, isToolSpan as q };
@@ -0,0 +1,101 @@
1
+ import { R as Run, S as Span, a as TraceEvent, A as Artifact, B as BudgetLedgerEntry, f as RunStatus, g as RunLayer, c as SpanKind, E as EventKind } from './schema-m0gsnbt3.js';
2
+
3
+ interface RunFilter {
4
+ scenarioId?: string;
5
+ variantId?: string;
6
+ status?: RunStatus;
7
+ since?: number;
8
+ until?: number;
9
+ tag?: {
10
+ key: string;
11
+ value: string;
12
+ };
13
+ parentRunId?: string;
14
+ projectId?: string;
15
+ chatId?: string;
16
+ layer?: RunLayer;
17
+ }
18
+ interface SpanFilter {
19
+ runId?: string;
20
+ parentSpanId?: string;
21
+ kind?: SpanKind;
22
+ name?: string;
23
+ toolName?: string;
24
+ judgeId?: string;
25
+ since?: number;
26
+ until?: number;
27
+ }
28
+ interface EventFilter {
29
+ runId?: string;
30
+ spanId?: string;
31
+ kind?: EventKind;
32
+ since?: number;
33
+ until?: number;
34
+ }
35
+ interface TraceStore {
36
+ appendRun(run: Run): Promise<void>;
37
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
38
+ appendSpan(span: Span): Promise<void>;
39
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
40
+ appendEvent(event: TraceEvent): Promise<void>;
41
+ appendArtifact(artifact: Artifact): Promise<void>;
42
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
43
+ getRun(runId: string): Promise<Run | undefined>;
44
+ listRuns(filter?: RunFilter): Promise<Run[]>;
45
+ spans(filter?: SpanFilter): Promise<Span[]>;
46
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
47
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
48
+ artifacts(runId: string): Promise<Artifact[]>;
49
+ }
50
+ declare class InMemoryTraceStore implements TraceStore {
51
+ private runs;
52
+ private allSpans;
53
+ private allEvents;
54
+ private allArtifacts;
55
+ private allBudget;
56
+ appendRun(run: Run): Promise<void>;
57
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
58
+ appendSpan(span: Span): Promise<void>;
59
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
60
+ appendEvent(event: TraceEvent): Promise<void>;
61
+ appendArtifact(artifact: Artifact): Promise<void>;
62
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
63
+ getRun(runId: string): Promise<Run | undefined>;
64
+ listRuns(filter?: RunFilter): Promise<Run[]>;
65
+ spans(filter?: SpanFilter): Promise<Span[]>;
66
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
67
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
68
+ artifacts(runId: string): Promise<Artifact[]>;
69
+ }
70
+ interface FileSystemTraceStoreOptions {
71
+ dir: string;
72
+ /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
73
+ maxBytes?: number;
74
+ }
75
+ declare class FileSystemTraceStore implements TraceStore {
76
+ private dir;
77
+ private maxBytes;
78
+ /** Lazy in-memory index for queries — populated on first read. */
79
+ private index?;
80
+ private loaded;
81
+ constructor(options: FileSystemTraceStoreOptions);
82
+ private ensureDir;
83
+ private append;
84
+ private insertInto;
85
+ private load;
86
+ appendRun(run: Run): Promise<void>;
87
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
88
+ appendSpan(span: Span): Promise<void>;
89
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
90
+ appendEvent(event: TraceEvent): Promise<void>;
91
+ appendArtifact(artifact: Artifact): Promise<void>;
92
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
93
+ getRun(runId: string): Promise<Run | undefined>;
94
+ listRuns(filter?: RunFilter): Promise<Run[]>;
95
+ spans(filter?: SpanFilter): Promise<Span[]>;
96
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
97
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
98
+ artifacts(runId: string): Promise<Artifact[]>;
99
+ }
100
+
101
+ export { type EventFilter as E, FileSystemTraceStore as F, InMemoryTraceStore as I, type RunFilter as R, type SpanFilter as S, type TraceStore as T, type FileSystemTraceStoreOptions as a };
@@ -1,5 +1,5 @@
1
- import { R as RunRecord } from './run-record-BGY6bHRh.js';
2
- import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
1
+ import { R as RunRecord } from './run-record-etiCMsUq.js';
2
+ import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
3
3
 
4
4
  /**
5
5
  * HeldOutGate — first-class held-out paired-delta promotion gate.
@@ -1,5 +1,6 @@
1
- import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
2
- import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { T as TraceEmitter } from './emitter-DEZwY14K.js';
2
+ import { R as Run, F as FailureClass } from './schema-m0gsnbt3.js';
3
+ import { T as TraceStore } from './store-CKUAgsJz.js';
3
4
 
4
5
  /**
5
6
  * SandboxHarness — executes a scenario in an isolated environment and
package/dist/traces.d.ts CHANGED
@@ -1,12 +1,13 @@
1
1
  import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
2
2
  import { R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
3
3
  export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
4
- import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DP_cSSiw.js';
5
- export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
6
- export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
7
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
8
- export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
9
- export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
4
+ import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
5
+ export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
6
+ export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
7
+ import { T as TraceStore } from './store-CKUAgsJz.js';
8
+ export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
9
+ export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
10
+ export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
10
11
  import { AxAIService, AxFunction } from '@ax-llm/ax';
11
12
  import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
12
13
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
@@ -1,4 +1,5 @@
1
- import { S as Span, b as TraceEvent, T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { S as Span, a as TraceEvent } from './schema-m0gsnbt3.js';
2
+ import { T as TraceStore } from './store-CKUAgsJz.js';
2
3
 
3
4
  /**
4
5
  * Trajectory — ordered, structured view over a run's spans.
@@ -248,6 +248,23 @@ interface CampaignCostMeter {
248
248
  * training scenarios unless explicitly opted in). */
249
249
  type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
250
250
  type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
251
+ /** How much a label can be trusted to evaluate against — the gold-admission
252
+ * gate. Strictly ordered: a record qualifies for a `minTrust` filter when its
253
+ * trust rank is >= the requested rank.
254
+ *
255
+ * - `unverified` — label is a heuristic (e.g. raw outcome success/fail).
256
+ * Fine as corpus; MUST NOT enter a gold set that lift
257
+ * numbers are computed against.
258
+ * - `verified-signal` — an external signal confirmed the outcome (PR merged,
259
+ * tests green, user did not retry, downstream check).
260
+ * - `human-rated` — a human explicitly rated or corrected the artifact.
261
+ *
262
+ * Absent on a write ⇒ treated as `unverified` (fail-closed: a writer must
263
+ * explicitly assert trust to make a record gold-eligible — it never happens
264
+ * by accident). */
265
+ type LabelTrust = 'unverified' | 'verified-signal' | 'human-rated';
266
+ /** Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). */
267
+ declare function labelTrustRank(trust: LabelTrust | undefined): number;
251
268
  /** @experimental Required-provenance write. The store rejects writes that
252
269
  * lack provenance — a default-on flywheel without provenance is the
253
270
  * data-poisoning vector flagged in the alignment review. */
@@ -259,6 +276,11 @@ interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact
259
276
  sourceVersionHash: string;
260
277
  capturedAt: string;
261
278
  redactionStatus: RedactionStatus;
279
+ /** Gold-admission trust tier. Absent ⇒ `unverified` (fail-closed): the
280
+ * record is corpus, never gold. A writer must explicitly assert
281
+ * `verified-signal` or `human-rated` to make it eligible for a gold
282
+ * sample. See {@link LabelTrust}. */
283
+ labelTrust?: LabelTrust;
262
284
  /** Optional per-source rate-limit bucket key (e.g., the tenant id). */
263
285
  rateLimitBucket?: string;
264
286
  }
@@ -282,6 +304,11 @@ interface LabeledScenarioSampleArgs {
282
304
  source?: LabeledScenarioSource | LabeledScenarioSource[];
283
305
  minComposite?: number;
284
306
  maxComposite?: number;
307
+ /** Gold gate: only records whose trust rank is >= this tier are
308
+ * returned. `sample({ split: 'test', minTrust: 'verified-signal' })` is
309
+ * the canonical "give me the gold set" call. Absent ⇒ no trust gate
310
+ * (corpus-level read). */
311
+ minTrust?: LabelTrust;
285
312
  };
286
313
  }
287
314
  interface LabeledScenarioStore {
@@ -291,6 +318,9 @@ interface LabeledScenarioStore {
291
318
  train: number;
292
319
  test: number;
293
320
  bySource: Record<string, number>;
321
+ /** Count by trust tier — tells the flywheel how much gold it has
322
+ * accumulated vs. raw corpus. */
323
+ byTrust: Record<LabelTrust, number>;
294
324
  }>;
295
325
  }
296
326
  interface CampaignCellResult<TArtifact> {
@@ -372,4 +402,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
372
402
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
373
403
  }
374
404
 
375
- export type { CodeSurface as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeScore as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, JudgeConfig as a, DispatchContext as b, LabeledScenarioWrite as c, LabeledScenarioSampleArgs as d, LabeledScenarioRecord as e, CampaignAggregates as f, CampaignArtifactWriter as g, CampaignCellResult as h, CampaignCostMeter as i, CampaignResult as j, CampaignTraceWriter as k, GateContext as l, GateDecision as m, GateResult as n, GenerationCandidate as o, GenerationRecord as p, JudgeAggregate as q, JudgeDimension as r, LabeledScenarioSource as s, Mutator as t, ScenarioAggregate as u, SessionScript as v };
405
+ export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ScenarioAggregate as v, type SessionScript as w, labelTrustRank as x };
@@ -1,12 +1,13 @@
1
- import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-BSxqEpu7.js';
2
- import { T as TraceStore } from '../store-Db2Bv8Cf.js';
1
+ import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-DpUmE90J.js';
2
+ import { T as TraceStore } from '../store-CKUAgsJz.js';
3
3
  import { z } from 'zod';
4
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
5
5
  import * as hono_types from 'hono/types';
6
6
  import { ServerType } from '@hono/node-server';
7
7
  import { Hono } from 'hono';
8
- import '../control-runtime-BZ_lVLYW.js';
9
- import '../emitter-DP_cSSiw.js';
8
+ import '../control-runtime-DuFBYg7A.js';
9
+ import '../emitter-DEZwY14K.js';
10
+ import '../schema-m0gsnbt3.js';
10
11
  import '../dataset-BlwAtYYf.js';
11
12
  import '../errors-mje_cKOs.js';
12
13
 
@@ -0,0 +1,223 @@
1
+ # Self-improvement protocol — the world-class architecture
2
+
3
+ **Status:** Strategic design. The artifact that every roadmap entry maps to.
4
+ **Date:** 2026-05-27.
5
+
6
+ ## Thesis
7
+
8
+ **Self-improvement is a protocol, not a product.** We define the wire formats, surface abstractions, driver interface, gate interface, and insight format. We ship reference implementations. Customers plug in whatever framework, model, or runtime they already use — our infrastructure handles the rigorous middle (analysis, gating, version-safe deployment).
9
+
10
+ No competitor ships this combination. LangSmith / Braintrust / Phoenix / LangFuse ship tracing. Hermes ships an agent. SkillOpt ships an academic optimizer. Anthropic's Claude Code ships skill-creation. **Nobody ships the protocol.**
11
+
12
+ ## The pipeline as a single abstract flow
13
+
14
+ ```
15
+ ┌──────────────────────────────────────────────────────────────────────┐
16
+ │ WHATEVER YOU ALREADY USE │
17
+ │ LangChain · LlamaIndex · Anthropic SDK · OpenAI Assistants · │
18
+ │ Hermes · Claude Code · Codex · agent-runtime · your own stack │
19
+ └─────────────────────────────────┬────────────────────────────────────┘
20
+ │ traces (any format)
21
+
22
+ ┌──────────────────────────────────────────────────────────────────────┐
23
+ │ INGEST — universal trace adapters │
24
+ │ fromOtelSpans · fromFeedbackTable · fromLangChain · fromLlamaIndex ·│
25
+ │ fromAnthropicSDK · fromOpenAISDK · fromHermesProfileLog · BYO │
26
+ │ → canonical RunRecord[] │
27
+ └─────────────────────────────────┬────────────────────────────────────┘
28
+
29
+ ┌──────────────────────────────────────────────────────────────────────┐
30
+ │ ANALYZE — analyzeRuns({ runs, baselineRuns?, userFeedback? }) │
31
+ │ paired-bootstrap CI · Pareto · failure clusters · prior-period │
32
+ │ delta · user-corrective-signal extraction · recommendations │
33
+ │ ← THE STATISTICAL EDGE NOBODY ELSE SHIPS │
34
+ └─────────────────────────────────┬────────────────────────────────────┘
35
+
36
+ ┌──────────────────────────────────────────────────────────────────────┐
37
+ │ IMPROVE — selfImprove() closed loop │
38
+ │ gepaDriver · evolutionaryDriver · BYO ImprovementDriver │
39
+ │ → ProfileDiff (versioned, hashed, content-addressable) │
40
+ └─────────────────────────────────┬────────────────────────────────────┘
41
+
42
+ ┌──────────────────────────────────────────────────────────────────────┐
43
+ │ GATE — defaultProductionGate (paired-CI) · BYO gate │
44
+ │ ship-substrate / ship-harness / merge / inconclusive │
45
+ │ ← STATISTICALLY STRICTER THAN ANY COMPETITOR │
46
+ └─────────────────────────────────┬────────────────────────────────────┘
47
+
48
+ ┌──────────────────────────────────────────────────────────────────────┐
49
+ │ DEPLOY — back into WHATEVER YOU ALREADY USE │
50
+ │ agent-runtime · Hermes profile log · LangChain config · custom hook │
51
+ └──────────────────────────────────────────────────────────────────────┘
52
+ ```
53
+
54
+ ## The integration promise
55
+
56
+ Customers pick one of three integration shapes. All three work today, though adapter coverage for some frameworks is still aspirational. Every shape uses the same canonical types underneath.
57
+
58
+ ### Shape A — offline analysis only
59
+
60
+ You have traces, you want a decision packet. Zero LLM cost. Zero closed loop.
61
+
62
+ ```typescript
63
+ import { fromOtelSpans, analyzeRuns } from '@tangle-network/agent-eval'
64
+
65
+ const runs = fromOtelSpans({ spans: mySpans })
66
+ const report = await analyzeRuns({ runs })
67
+ // → InsightReport with composite, recommendations, Pareto, ...
68
+ ```
69
+
70
+ Use case: dashboards, weekly post-mortems, "did anything regress" checks. The intelligence-kernel ships this.
71
+
72
+ ### Shape B — closed loop, your runtime
73
+
74
+ You have an agent, you want to improve it. We provide drivers + gate + insight. You decide when to deploy.
75
+
76
+ ```typescript
77
+ import { selfImprove, gepaDriver } from '@tangle-network/agent-eval'
78
+
79
+ const result = await selfImprove({
80
+ scenarios,
81
+ agent: yourAgent, // any function (surface, scenario) → artifact
82
+ judge: yourJudge, // any function (artifact) → JudgeScore
83
+ baselineSurface,
84
+ driver: gepaDriver({ llm, model, target }),
85
+ budget: { generations: 3, populationSize: 4, holdoutFraction: 0.3 },
86
+ })
87
+ // → SelfImproveResult { baselineHash, diff, winningHash, lift, gateDecision, insight }
88
+ ```
89
+
90
+ Use case: every product agent we ship. Hermes-on-our-sandbox. Claude Code with skills. Anyone wanting "ship if statistically better, else hold."
91
+
92
+ ### Shape C — hosted, cross-language
93
+
94
+ You stream traces from anywhere and get InsightReports plus selfImprove orchestration. Billing is usage-based.
95
+
96
+ ```sh
97
+ # Stream traces
98
+ curl https://api.tangle.tools/v1/ingest/otel \
99
+ -H "Authorization: Bearer ${TANGLE_KEY}" \
100
+ --data-binary @traces.jsonl
101
+
102
+ # Get the decision packet
103
+ curl https://api.tangle.tools/v1/insight/${runId}
104
+
105
+ # Or run a closed-loop campaign
106
+ curl https://api.tangle.tools/v1/improve \
107
+ -d '{"scenarios": ..., "baselineHash": "...", "budget": {...}}'
108
+ ```
109
+
110
+ Use case: Python customers, Go customers, customers behind firewalls, customers who don't want to operate the substrate.
111
+
112
+ ## The five non-negotiables
113
+
114
+ The protocol claim only holds if all five of these survive integration. Customers shouldn't have to compromise on any.
115
+
116
+ 1. **Universal ingest.** Any trace format → canonical RunRecord. Coverage: OTel ✓, multi-rater feedback ✓, LangChain ⏳, LlamaIndex ⏳, Anthropic SDK ⏳, OpenAI Assistants ⏳, Hermes profile log ⏳.
117
+ 2. **Statistical rigor.** Every claim falsifiable. Paired bootstrap CI on lift, Cohen's d on effect size, MDE-aware sample-size recommendations, p-values. **SkillOpt's gate is literal `cand > current`. Hermes has no gate. Ours has all of the above.** This is the moat.
118
+ 3. **Plug-in everything.** Driver, judge, gate, intake adapter, storage all swappable. Customer brings their LLM, their judge, their scenarios. We bring the rigor.
119
+ 4. **Version-safe deployment.** AgentProfile is content-addressable. Two writers (harness + substrate) can both mutate without lost updates. Gate verdicts are scoped to baseline hash, not absolute. Tracked as #98.
120
+ 5. **Cross-language wire format.** Python client at parity with TypeScript. Hosted ingest spec versioned. Customers in any language consume the same shape.
121
+
122
+ ## Where we are honest about gaps
123
+
124
+ | Component | Status | Customer impact when missing |
125
+ |---|---|---|
126
+ | `fromOtelSpans` ingest adapter | ✓ shipped 0.50.0 | — |
127
+ | `fromFeedbackTable` multi-rater intake | ✓ shipped 0.50.0 | — |
128
+ | `analyzeRuns` decision packet | ✓ shipped 0.50.0 / 0.50.2 actionability | — |
129
+ | `selfImprove` closed loop | ✓ shipped 0.50.0 | — |
130
+ | Paired-bootstrap gate | ✓ shipped early; still our edge | — |
131
+ | `gepaDriver` reflection (not full Pareto — task #101) | ⚠ partial | OK; customers don't need Pareto until plateau hit |
132
+ | **Prior-period comparison** in `analyzeRuns` | ✗ MISSING | "Did my last change help?" — the #1 customer question — has no rigorous answer today |
133
+ | **User-corrective-feedback signal extraction** | ✗ MISSING | Hermes' first-class skill signal. We have the trace data. We don't mine it. |
134
+ | **`init` CLI** scaffolding canonical eval/ layout | ✗ MISSING | Every new consumer wires it by hand; the skill describes 80 lines they have to copy |
135
+ | **Framework-specific intake adapters** (LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants) | ✗ MISSING | Customers using these frameworks can't ingest without writing custom adapter code |
136
+ | **Profile versioning** (task #98) | ✗ MISSING | Offline/online drift; gate verdicts can be stale by the time they're applied |
137
+ | **Composite driver** (optimize all surfaces against one gate) | ✗ MISSING | Customers can optimize prompts OR skills, not both jointly |
138
+ | **Empirical proof drivers work** | ✗ MISSING | We've never published "we ran gepaDriver on real customer data, here's the lift CI" |
139
+ | Hosted-tier production launch | ⚠ in scaffolding (intelligence-kernel) | Customers must self-host today |
140
+
141
+ ## The roadmap — what closes each gap
142
+
143
+ Mapping every roadmap entry back to a concrete protocol gap.
144
+
145
+ ### 0.53.0 (this session-or-next) — answer "did my last change help?"
146
+
147
+ - **`analyzeRuns({ runs, baselineRuns? })`** — when `baselineRuns` is provided, the report includes a `priorPeriodComparison?` block: per-metric delta with paired-bootstrap CI, MDE-aware significance judgment, "regressed metrics" surfaced in `recommendations`.
148
+ - Built on top of existing `diffRuns()` primitive (already shipped 0.48.0).
149
+ - 1 PR. Pure additive surface.
150
+ - **Customer impact**: this is the conversion question for every prospect.
151
+
152
+ ### 0.54.0 — extract Hermes' missing signal
153
+
154
+ - **`extractUserCorrections(runs)`** — new substrate primitive. Mines user messages in traces for corrective markers (regex pass + LLM classifier for nuance). Returns `UserCorrectionEvent[]` keyed by runId.
155
+ - `analyzeRuns({ runs, userFeedback? })` includes a "common corrections" cluster in `recommendations`.
156
+ - Bridge to Hermes-style signal without adopting Hermes' runtime.
157
+ - **Customer impact**: distinctive — no competitor mines this signal.
158
+
159
+ ### 0.55.0 — framework-specific intake adapters
160
+
161
+ - **`fromLangChain(traces)`**, **`fromLlamaIndex(traces)`**, **`fromAnthropicSDK(traces)`**, **`fromOpenAIAssistants(traces)`**.
162
+ - Each maps the framework's native trace shape to RunRecord.
163
+ - Top 4 frameworks = 80% of agent-builder market coverage.
164
+ - **Customer impact**: removes "we don't support your framework" friction.
165
+
166
+ ### 0.56.0 — `init` CLI + worked examples
167
+
168
+ - `pnpm dlx @tangle-network/agent-eval init` scaffolds the canonical `eval/scenarios.json` + 3 pnpm scripts + judges template + `.runs/` directory.
169
+ - Adds 5+ end-to-end runnable examples covering Shapes A/B/C across the 4 framework adapters.
170
+ - **Customer impact**: time-to-first-eval drops from 4 hours to 5 minutes.
171
+
172
+ ### 1.0.0 — profile versioning (#98) + composite driver
173
+
174
+ - Content-addressable `AgentProfileVersion` + `ProfileDiff` + 3-way merge + 4-way `DriftGateDecision`.
175
+ - `compositeDriver` — optimize all surfaces of one AgentProfile against one gate.
176
+ - Hermes-on-sandbox forcing function validates the work before commit.
177
+ - **Customer impact**: production-safe; the moat is locked.
178
+
179
+ ### 1.1.0 — empirical-proof publication
180
+
181
+ - Pick one named customer or one synthetic-realistic corpus (legal-agent canonical).
182
+ - Run gepaDriver end-to-end with real LLM cost.
183
+ - Publish: "n=…, lift=…, CI=…, p=…, $cost=…, vs. no-driver baseline."
184
+ - One blog post, one demo video, one runnable repro.
185
+ - **Customer impact**: every other claim becomes credible because this one is verified.
186
+
187
+ ## Why this design is 100x
188
+
189
+ Not a 10% improvement over LangSmith. A category change.
190
+
191
+ | Capability | LangSmith / Braintrust / Phoenix | Hermes / Claude Code | Tangle (target) |
192
+ |---|---|---|---|
193
+ | Trace ingest | ✓ proprietary | ✓ own runtime | ✓ universal |
194
+ | Decision packet | ⚠ scorecards (no CI) | ✗ | ✓ paired-bootstrap |
195
+ | Closed loop | ✗ | ✓ heuristic | ✓ statistically rigorous |
196
+ | Plug-in drivers | ✗ | ✗ | ✓ |
197
+ | Profile versioning | ✗ | ✗ | ✓ (1.0.0) |
198
+ | Composite multi-surface | ✗ | ✗ | ✓ (1.0.0) |
199
+ | Cross-language | ✗ | ✗ | ✓ (Python at parity) |
200
+ | Empirical-proof publication | ✗ | ✗ | ✓ (1.1.0) |
201
+
202
+ Eight rows. Nobody else has eight. We can be the only one. The work is named, scoped, and queued.
203
+
204
+ ## What's NOT on the roadmap (and why)
205
+
206
+ - **Building our own agent runtime.** Hermes / agent-runtime / Claude Code cover that. We are infrastructure, not a runtime.
207
+ - **Single-vendor LLM.** Substrate stays model-agnostic.
208
+ - **UI-first product.** API-first. UIs are downstream.
209
+ - **LangChain replacement.** Wrong layer.
210
+ - **"Self-improvement" without a held-out gate.** Hermes and SkillOpt both ship this; we explicitly refuse — every selfImprove() requires a holdout.
211
+
212
+ ## Decision log — what we committed to in 0.52.0 → 1.1.0
213
+
214
+ 1. **`skillOptDriver` removed; behavior in `gepaDriver({ constraints })`** — 0.52.0 ✓ shipped
215
+ 2. **Honest spec docs** — 0.52.0 ✓ shipped
216
+ 3. **Profile-versioning spec with symmetric-fork framing** — 0.52.0 ✓ shipped
217
+ 4. **No V2 names anywhere** — enforced
218
+ 5. **Forcing-function gate on profile-versioning work** — Hermes-on-sandbox experiment required before phases 1-5 commit
219
+ 6. **Single-PR-per-repo discipline** — enforced 0.52.0 onwards
220
+ 7. **Prior-period comparison as 0.53.0** — committed; the customer-conversion primitive
221
+ 8. **User-feedback extraction as 0.54.0** — committed; the Hermes-signal bridge
222
+ 9. **Framework intake adapters as 0.55.0** — committed; 80% market coverage
223
+ 10. **Empirical-proof publication as 1.1.0** — committed; the credibility lock
@@ -0,0 +1,62 @@
1
+ # Pilot Kit — customer handoff materials
2
+
3
+ What's here, in order of use:
4
+
5
+ | File | For | When |
6
+ |---|---|---|
7
+ | [one-pager.md](./one-pager.md) | Customer's first read | Send as initial pitch — what they get, why it's different, what it looks like, what it costs. Now includes intake-paths matrix for non-Tangle customers (LangChain / LlamaIndex / Anthropic SDK / OpenAI Assistants / OpenRouter / vLLM / Ollama / custom). |
8
+ | [integration-tangle-stack.md](./integration-tangle-stack.md) | Customer's engineer (Tangle-stack customers) | Send after one-pager when they want to see the code; full integration walkthrough for the canonical Tangle stack (sandbox + tcloud) |
9
+ | [integration-foreign-stack.md](./integration-foreign-stack.md) | Customer's engineer (non-Tangle customers) | Send after one-pager when they're on OTel, LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants, OpenRouter, vLLM, Ollama, or custom. Covers every path. |
10
+ | [sample-insight-report.json](./sample-insight-report.json) | Customer's team meeting | Concrete JSON they can show to demonstrate value pre-integration |
11
+ | [customer-checklist.md](./customer-checklist.md) | Pre-onboarding-call | Send 48h before the call; ensures the 90-minute slot is productive. Provider-agnostic — works for any stack. |
12
+
13
+ ## How to use this kit
14
+
15
+ **For a Tangle customer asking for it RIGHT NOW:**
16
+
17
+ 1. Reply with the one-pager (`one-pager.md`) inline + the sample InsightReport (`sample-insight-report.json`) attached. Their senior engineer reads this and decides if it's worth a call.
18
+ 2. If they say yes, send the integration guide (`integration-tangle-stack.md`) + the checklist (`customer-checklist.md`). Schedule a 90-minute onboarding call.
19
+ 3. On the call: walk through the integration, run a live `analyzeRuns()` against their existing sandbox sessions, render the deterministic packet, fire one small `selfImprove` cycle. By the end of the call they have a working pilot.
20
+
21
+ **For Drew handling the conversation himself:**
22
+
23
+ The whole kit is written in our voice (technical, direct, no marketing fluff). You can paste sections directly into Slack / email / a customer call. The one-pager is meant to read as YOUR pitch, not a generic SaaS handout.
24
+
25
+ ## What this kit assumes
26
+
27
+ - Customer is on the Tangle stack (sandbox + tcloud) OR emits OTel traces
28
+ - Customer has an agent with a clear system-prompt addendum we can optimize
29
+ - Customer has at least 20 scenarios their agent handles
30
+ - Customer is willing to set a `maxUsd` budget for closed-loop campaigns
31
+
32
+ If any of those don't apply, the one-pager still works as a positioning piece. The integration doc gets adapted on the call.
33
+
34
+ ## Where this maps in the substrate
35
+
36
+ - Substrate version: `@tangle-network/agent-eval@0.53.0` (npm), `agent-eval-rpc@0.53.0` (PyPI)
37
+ - agent-runtime version: `@tangle-network/agent-runtime@0.29.0`
38
+ - Key APIs: `fromTangleSandbox`, `fromOtelSpans`, `analyzeRuns`, `selfImprove`, `gepaDriver`, `defaultProductionGate`, `openAutoPr`
39
+ - All ship today; no version-blocking dependencies
40
+
41
+ ## What this kit doesn't yet do
42
+
43
+ - No `npx @tangle-network/intelligence demo` command shipped yet (queued #115 — extend existing `tangle-intel` CLI in ADC (agent-dev-container) with customer-zero-touch subcommands `init` / `demo` / `report` / `improve`)
44
+ - No `staging-intelligence.tangle.tools` live yet (queued #116 — matches existing `staging-{product}.tangle.tools` precedent like sandbox)
45
+ - No live demo video (queued #117 — recorded against legal-agent canonical real data)
46
+ - No screenshot dashboard (gated on Gate 2 task #109 — ADC intelligence frontend renders canonical InsightReport)
47
+ - No published case study with named numbers (Gate 3 task #112 — after first pilot completes 4+ cycles)
48
+
49
+ ## Architectural decisions baked into this kit
50
+
51
+ - **Customer-facing CLI is `@tangle-network/intelligence`** (binary `tangle-intel`), NOT `agent-eval`. `agent-eval` is the substrate package; `intelligence` is the customer product that wraps it. The CLI already exists at `services/intelligence/src/cli/` in agent-dev-container — we extend it with `init` / `demo` / `report` / `improve` subcommands per task #115.
52
+ - **Hosted URL is `staging-intelligence.tangle.tools`** matching `staging-sandbox.tangle.tools` precedent. Production becomes `intelligence.tangle.tools` once Gate 2/3 close.
53
+ - **`agent-eval` mentioned only when customer wants direct programmatic access** (not the default path). 90%+ of customers stay at the CLI + hosted dashboard layer.
54
+
55
+ For the FIRST pilot conversation, the JSON sample is the dashboard substitute. After Gate 2 lands we replace it with live screenshots.
56
+
57
+ ## Update cadence
58
+
59
+ This kit gets updated each time:
60
+ - A substrate version ships that customers should know about
61
+ - A real pilot completes and we have a case study to add
62
+ - A customer gives feedback that re-shapes how we pitch