@tangle-network/agent-eval 0.53.0 → 0.55.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/langchain.d.ts +1 -1
  3. package/dist/adapters/otel.d.ts +7 -6
  4. package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
  5. package/dist/benchmarks/index.d.ts +3 -2
  6. package/dist/builder-eval/index.d.ts +4 -3
  7. package/dist/campaign/index.d.ts +9 -7
  8. package/dist/campaign/index.js +33 -4
  9. package/dist/campaign/index.js.map +1 -1
  10. package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
  11. package/dist/{chunk-5KSDYBYH.js → chunk-LYL4SOKT.js} +3 -2
  12. package/dist/chunk-LYL4SOKT.js.map +1 -0
  13. package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
  14. package/dist/chunk-NCK5QLGT.js.map +1 -0
  15. package/dist/contract/index.d.ts +13 -12
  16. package/dist/contract/index.js +25 -0
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
  19. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
  20. package/dist/control.d.ts +7 -6
  21. package/dist/control.js +2 -2
  22. package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
  23. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
  24. package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
  25. package/dist/governance/index.d.ts +3 -2
  26. package/dist/hosted/index.d.ts +7 -6
  27. package/dist/{index-C7RhhEME.d.ts → index-D2nT6_KT.d.ts} +20 -2
  28. package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
  29. package/dist/index.d.ts +31 -29
  30. package/dist/index.js +3 -3
  31. package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
  32. package/dist/knowledge/index.d.ts +4 -3
  33. package/dist/meta-eval/index.d.ts +4 -3
  34. package/dist/openapi.json +1 -1
  35. package/dist/pipelines/index.d.ts +7 -6
  36. package/dist/prm/index.d.ts +5 -4
  37. package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
  38. package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
  39. package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
  40. package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
  41. package/dist/reporting.d.ts +7 -6
  42. package/dist/{researcher-LZD0qHEa.d.ts → researcher-JP8EvnLv.d.ts} +11 -6
  43. package/dist/rl.d.ts +11 -10
  44. package/dist/rl.js +2 -2
  45. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
  46. package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
  47. package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
  48. package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
  49. package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
  50. package/dist/store-CKUAgsJz.d.ts +101 -0
  51. package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
  52. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
  53. package/dist/traces.d.ts +7 -6
  54. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
  55. package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
  56. package/dist/wire/index.d.ts +5 -4
  57. package/docs/pilot/README.md +62 -0
  58. package/docs/pilot/customer-checklist.md +90 -0
  59. package/docs/pilot/integration-foreign-stack.md +296 -0
  60. package/docs/pilot/integration-tangle-stack.md +248 -0
  61. package/docs/pilot/one-pager.md +161 -0
  62. package/docs/pilot/sample-insight-report.json +172 -0
  63. package/docs/research/research-roadmap.md +204 -0
  64. package/package.json +1 -1
  65. package/dist/chunk-5KSDYBYH.js.map +0 -1
  66. package/dist/chunk-BWZEGTES.js.map +0 -1
  67. package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
@@ -196,102 +196,4 @@ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
196
196
  declare function isJudgeSpan(s: Span): s is JudgeSpan;
197
197
  declare function isSandboxSpan(s: Span): s is SandboxSpan;
198
198
 
199
- interface RunFilter {
200
- scenarioId?: string;
201
- variantId?: string;
202
- status?: RunStatus;
203
- since?: number;
204
- until?: number;
205
- tag?: {
206
- key: string;
207
- value: string;
208
- };
209
- parentRunId?: string;
210
- projectId?: string;
211
- chatId?: string;
212
- layer?: RunLayer;
213
- }
214
- interface SpanFilter {
215
- runId?: string;
216
- parentSpanId?: string;
217
- kind?: SpanKind;
218
- name?: string;
219
- toolName?: string;
220
- judgeId?: string;
221
- since?: number;
222
- until?: number;
223
- }
224
- interface EventFilter {
225
- runId?: string;
226
- spanId?: string;
227
- kind?: EventKind;
228
- since?: number;
229
- until?: number;
230
- }
231
- interface TraceStore {
232
- appendRun(run: Run): Promise<void>;
233
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
234
- appendSpan(span: Span): Promise<void>;
235
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
236
- appendEvent(event: TraceEvent): Promise<void>;
237
- appendArtifact(artifact: Artifact): Promise<void>;
238
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
239
- getRun(runId: string): Promise<Run | undefined>;
240
- listRuns(filter?: RunFilter): Promise<Run[]>;
241
- spans(filter?: SpanFilter): Promise<Span[]>;
242
- events(filter?: EventFilter): Promise<TraceEvent[]>;
243
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
244
- artifacts(runId: string): Promise<Artifact[]>;
245
- }
246
- declare class InMemoryTraceStore implements TraceStore {
247
- private runs;
248
- private allSpans;
249
- private allEvents;
250
- private allArtifacts;
251
- private allBudget;
252
- appendRun(run: Run): Promise<void>;
253
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
254
- appendSpan(span: Span): Promise<void>;
255
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
256
- appendEvent(event: TraceEvent): Promise<void>;
257
- appendArtifact(artifact: Artifact): Promise<void>;
258
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
259
- getRun(runId: string): Promise<Run | undefined>;
260
- listRuns(filter?: RunFilter): Promise<Run[]>;
261
- spans(filter?: SpanFilter): Promise<Span[]>;
262
- events(filter?: EventFilter): Promise<TraceEvent[]>;
263
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
264
- artifacts(runId: string): Promise<Artifact[]>;
265
- }
266
- interface FileSystemTraceStoreOptions {
267
- dir: string;
268
- /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
269
- maxBytes?: number;
270
- }
271
- declare class FileSystemTraceStore implements TraceStore {
272
- private dir;
273
- private maxBytes;
274
- /** Lazy in-memory index for queries — populated on first read. */
275
- private index?;
276
- private loaded;
277
- constructor(options: FileSystemTraceStoreOptions);
278
- private ensureDir;
279
- private append;
280
- private insertInto;
281
- private load;
282
- appendRun(run: Run): Promise<void>;
283
- updateRun(runId: string, patch: Partial<Run>): Promise<void>;
284
- appendSpan(span: Span): Promise<void>;
285
- updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
286
- appendEvent(event: TraceEvent): Promise<void>;
287
- appendArtifact(artifact: Artifact): Promise<void>;
288
- appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
289
- getRun(runId: string): Promise<Run | undefined>;
290
- listRuns(filter?: RunFilter): Promise<Run[]>;
291
- spans(filter?: SpanFilter): Promise<Span[]>;
292
- events(filter?: EventFilter): Promise<TraceEvent[]>;
293
- budget(runId: string): Promise<BudgetLedgerEntry[]>;
294
- artifacts(runId: string): Promise<Artifact[]>;
295
- }
296
-
297
- export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
199
+ export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type ToolSpan as T, type TraceEvent as a, type RunOutcome as b, type SpanKind as c, type RetrievalSpan as d, type SandboxSpan as e, type RunStatus as f, type RunLayer as g, type BudgetSpec as h, FAILURE_CLASSES as i, type SpanBase as j, type SpanStatus as k, TRACE_SCHEMA_VERSION as l, isJudgeSpan as m, isLlmSpan as n, isRetrievalSpan as o, isSandboxSpan as p, isToolSpan as q };
@@ -0,0 +1,101 @@
1
+ import { R as Run, S as Span, a as TraceEvent, A as Artifact, B as BudgetLedgerEntry, f as RunStatus, g as RunLayer, c as SpanKind, E as EventKind } from './schema-m0gsnbt3.js';
2
+
3
+ interface RunFilter {
4
+ scenarioId?: string;
5
+ variantId?: string;
6
+ status?: RunStatus;
7
+ since?: number;
8
+ until?: number;
9
+ tag?: {
10
+ key: string;
11
+ value: string;
12
+ };
13
+ parentRunId?: string;
14
+ projectId?: string;
15
+ chatId?: string;
16
+ layer?: RunLayer;
17
+ }
18
+ interface SpanFilter {
19
+ runId?: string;
20
+ parentSpanId?: string;
21
+ kind?: SpanKind;
22
+ name?: string;
23
+ toolName?: string;
24
+ judgeId?: string;
25
+ since?: number;
26
+ until?: number;
27
+ }
28
+ interface EventFilter {
29
+ runId?: string;
30
+ spanId?: string;
31
+ kind?: EventKind;
32
+ since?: number;
33
+ until?: number;
34
+ }
35
+ interface TraceStore {
36
+ appendRun(run: Run): Promise<void>;
37
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
38
+ appendSpan(span: Span): Promise<void>;
39
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
40
+ appendEvent(event: TraceEvent): Promise<void>;
41
+ appendArtifact(artifact: Artifact): Promise<void>;
42
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
43
+ getRun(runId: string): Promise<Run | undefined>;
44
+ listRuns(filter?: RunFilter): Promise<Run[]>;
45
+ spans(filter?: SpanFilter): Promise<Span[]>;
46
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
47
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
48
+ artifacts(runId: string): Promise<Artifact[]>;
49
+ }
50
+ declare class InMemoryTraceStore implements TraceStore {
51
+ private runs;
52
+ private allSpans;
53
+ private allEvents;
54
+ private allArtifacts;
55
+ private allBudget;
56
+ appendRun(run: Run): Promise<void>;
57
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
58
+ appendSpan(span: Span): Promise<void>;
59
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
60
+ appendEvent(event: TraceEvent): Promise<void>;
61
+ appendArtifact(artifact: Artifact): Promise<void>;
62
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
63
+ getRun(runId: string): Promise<Run | undefined>;
64
+ listRuns(filter?: RunFilter): Promise<Run[]>;
65
+ spans(filter?: SpanFilter): Promise<Span[]>;
66
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
67
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
68
+ artifacts(runId: string): Promise<Artifact[]>;
69
+ }
70
+ interface FileSystemTraceStoreOptions {
71
+ dir: string;
72
+ /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
73
+ maxBytes?: number;
74
+ }
75
+ declare class FileSystemTraceStore implements TraceStore {
76
+ private dir;
77
+ private maxBytes;
78
+ /** Lazy in-memory index for queries — populated on first read. */
79
+ private index?;
80
+ private loaded;
81
+ constructor(options: FileSystemTraceStoreOptions);
82
+ private ensureDir;
83
+ private append;
84
+ private insertInto;
85
+ private load;
86
+ appendRun(run: Run): Promise<void>;
87
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
88
+ appendSpan(span: Span): Promise<void>;
89
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
90
+ appendEvent(event: TraceEvent): Promise<void>;
91
+ appendArtifact(artifact: Artifact): Promise<void>;
92
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
93
+ getRun(runId: string): Promise<Run | undefined>;
94
+ listRuns(filter?: RunFilter): Promise<Run[]>;
95
+ spans(filter?: SpanFilter): Promise<Span[]>;
96
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
97
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
98
+ artifacts(runId: string): Promise<Artifact[]>;
99
+ }
100
+
101
+ export { type EventFilter as E, FileSystemTraceStore as F, InMemoryTraceStore as I, type RunFilter as R, type SpanFilter as S, type TraceStore as T, type FileSystemTraceStoreOptions as a };
@@ -1,5 +1,5 @@
1
- import { R as RunRecord } from './run-record-BGY6bHRh.js';
2
- import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
1
+ import { R as RunRecord } from './run-record-etiCMsUq.js';
2
+ import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
3
3
 
4
4
  /**
5
5
  * HeldOutGate — first-class held-out paired-delta promotion gate.
@@ -1,5 +1,6 @@
1
- import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
2
- import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { T as TraceEmitter } from './emitter-DEZwY14K.js';
2
+ import { R as Run, F as FailureClass } from './schema-m0gsnbt3.js';
3
+ import { T as TraceStore } from './store-CKUAgsJz.js';
3
4
 
4
5
  /**
5
6
  * SandboxHarness — executes a scenario in an isolated environment and
package/dist/traces.d.ts CHANGED
@@ -1,12 +1,13 @@
1
1
  import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
2
2
  import { R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
3
3
  export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
4
- import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DP_cSSiw.js';
5
- export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
6
- export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
7
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
8
- export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
9
- export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
4
+ import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
5
+ export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
6
+ export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
7
+ import { T as TraceStore } from './store-CKUAgsJz.js';
8
+ export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
9
+ export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
10
+ export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
10
11
  import { AxAIService, AxFunction } from '@ax-llm/ax';
11
12
  import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
12
13
  export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
@@ -1,4 +1,5 @@
1
- import { S as Span, b as TraceEvent, T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { S as Span, a as TraceEvent } from './schema-m0gsnbt3.js';
2
+ import { T as TraceStore } from './store-CKUAgsJz.js';
2
3
 
3
4
  /**
4
5
  * Trajectory — ordered, structured view over a run's spans.
@@ -248,6 +248,23 @@ interface CampaignCostMeter {
248
248
  * training scenarios unless explicitly opted in). */
249
249
  type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
250
250
  type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
251
+ /** How much a label can be trusted to evaluate against — the gold-admission
252
+ * gate. Strictly ordered: a record qualifies for a `minTrust` filter when its
253
+ * trust rank is >= the requested rank.
254
+ *
255
+ * - `unverified` — label is a heuristic (e.g. raw outcome success/fail).
256
+ * Fine as corpus; MUST NOT enter a gold set that lift
257
+ * numbers are computed against.
258
+ * - `verified-signal` — an external signal confirmed the outcome (PR merged,
259
+ * tests green, user did not retry, downstream check).
260
+ * - `human-rated` — a human explicitly rated or corrected the artifact.
261
+ *
262
+ * Absent on a write ⇒ treated as `unverified` (fail-closed: a writer must
263
+ * explicitly assert trust to make a record gold-eligible — it never happens
264
+ * by accident). */
265
+ type LabelTrust = 'unverified' | 'verified-signal' | 'human-rated';
266
+ /** Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). */
267
+ declare function labelTrustRank(trust: LabelTrust | undefined): number;
251
268
  /** @experimental Required-provenance write. The store rejects writes that
252
269
  * lack provenance — a default-on flywheel without provenance is the
253
270
  * data-poisoning vector flagged in the alignment review. */
@@ -259,6 +276,11 @@ interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact
259
276
  sourceVersionHash: string;
260
277
  capturedAt: string;
261
278
  redactionStatus: RedactionStatus;
279
+ /** Gold-admission trust tier. Absent ⇒ `unverified` (fail-closed): the
280
+ * record is corpus, never gold. A writer must explicitly assert
281
+ * `verified-signal` or `human-rated` to make it eligible for a gold
282
+ * sample. See {@link LabelTrust}. */
283
+ labelTrust?: LabelTrust;
262
284
  /** Optional per-source rate-limit bucket key (e.g., the tenant id). */
263
285
  rateLimitBucket?: string;
264
286
  }
@@ -282,6 +304,11 @@ interface LabeledScenarioSampleArgs {
282
304
  source?: LabeledScenarioSource | LabeledScenarioSource[];
283
305
  minComposite?: number;
284
306
  maxComposite?: number;
307
+ /** Gold gate: only records whose trust rank is >= this tier are
308
+ * returned. `sample({ split: 'test', minTrust: 'verified-signal' })` is
309
+ * the canonical "give me the gold set" call. Absent ⇒ no trust gate
310
+ * (corpus-level read). */
311
+ minTrust?: LabelTrust;
285
312
  };
286
313
  }
287
314
  interface LabeledScenarioStore {
@@ -291,6 +318,9 @@ interface LabeledScenarioStore {
291
318
  train: number;
292
319
  test: number;
293
320
  bySource: Record<string, number>;
321
+ /** Count by trust tier — tells the flywheel how much gold it has
322
+ * accumulated vs. raw corpus. */
323
+ byTrust: Record<LabelTrust, number>;
294
324
  }>;
295
325
  }
296
326
  interface CampaignCellResult<TArtifact> {
@@ -372,4 +402,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
372
402
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
373
403
  }
374
404
 
375
- export type { CodeSurface as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeScore as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, JudgeConfig as a, DispatchContext as b, LabeledScenarioWrite as c, LabeledScenarioSampleArgs as d, LabeledScenarioRecord as e, CampaignAggregates as f, CampaignArtifactWriter as g, CampaignCellResult as h, CampaignCostMeter as i, CampaignResult as j, CampaignTraceWriter as k, GateContext as l, GateDecision as m, GateResult as n, GenerationCandidate as o, GenerationRecord as p, JudgeAggregate as q, JudgeDimension as r, LabeledScenarioSource as s, Mutator as t, ScenarioAggregate as u, SessionScript as v };
405
+ export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ScenarioAggregate as v, type SessionScript as w, labelTrustRank as x };
@@ -1,12 +1,13 @@
1
- import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-BSxqEpu7.js';
2
- import { T as TraceStore } from '../store-Db2Bv8Cf.js';
1
+ import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-DpUmE90J.js';
2
+ import { T as TraceStore } from '../store-CKUAgsJz.js';
3
3
  import { z } from 'zod';
4
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';
5
5
  import * as hono_types from 'hono/types';
6
6
  import { ServerType } from '@hono/node-server';
7
7
  import { Hono } from 'hono';
8
- import '../control-runtime-BZ_lVLYW.js';
9
- import '../emitter-DP_cSSiw.js';
8
+ import '../control-runtime-DuFBYg7A.js';
9
+ import '../emitter-DEZwY14K.js';
10
+ import '../schema-m0gsnbt3.js';
10
11
  import '../dataset-BlwAtYYf.js';
11
12
  import '../errors-mje_cKOs.js';
12
13
 
@@ -0,0 +1,62 @@
1
+ # Pilot Kit — customer handoff materials
2
+
3
+ What's here, in order of use:
4
+
5
+ | File | For | When |
6
+ |---|---|---|
7
+ | [one-pager.md](./one-pager.md) | Customer's first read | Send as the initial pitch — what they get, why it's different, what it looks like, what it costs. Now includes an intake-paths matrix for non-Tangle customers (LangChain / LlamaIndex / Anthropic SDK / OpenAI Assistants / OpenRouter / vLLM / Ollama / custom). |
8
+ | [integration-tangle-stack.md](./integration-tangle-stack.md) | Customer's engineer (Tangle-stack customers) | Send after one-pager when they want to see the code; full integration walkthrough for the canonical Tangle stack (sandbox + tcloud) |
9
+ | [integration-foreign-stack.md](./integration-foreign-stack.md) | Customer's engineer (non-Tangle customers) | Send after one-pager when they're on OTel, LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants, OpenRouter, vLLM, Ollama, or custom. Covers every path. |
10
+ | [sample-insight-report.json](./sample-insight-report.json) | Customer's team meeting | Concrete JSON they can show to demonstrate value pre-integration |
11
+ | [customer-checklist.md](./customer-checklist.md) | Pre-onboarding call | Send 48h before the call; ensures the 90-minute slot is productive. Provider-agnostic — works for any stack. |
12
+
13
+ ## How to use this kit
14
+
15
+ **For a Tangle customer asking for it RIGHT NOW:**
16
+
17
+ 1. Reply with the one-pager (`one-pager.md`) inline + the sample InsightReport (`sample-insight-report.json`) attached. Their senior engineer reads this and decides if it's worth a call.
18
+ 2. If they say yes, send the integration guide (`integration-tangle-stack.md`) + the checklist (`customer-checklist.md`). Schedule a 90-minute onboarding call.
19
+ 3. On the call: walk through the integration, run a live `analyzeRuns()` against their existing sandbox sessions, render the deterministic packet, fire one small `selfImprove` cycle. By the end of the call they have a working pilot.
20
+
21
+ **For Drew handling the conversation himself:**
22
+
23
+ The whole kit is written in our voice (technical, direct, no marketing fluff). You can paste sections directly into Slack / email / a customer call. The one-pager is meant to read as YOUR pitch, not a generic SaaS handout.
24
+
25
+ ## What this kit assumes
26
+
27
+ - Customer is on the Tangle stack (sandbox + tcloud) OR emits OTel traces
28
+ - Customer has an agent with a clear system-prompt addendum we can optimize
29
+ - Customer has at least 20 scenarios their agent handles
30
+ - Customer is willing to set a `maxUsd` budget for closed-loop campaigns
31
+
32
+ If any of those don't apply, the one-pager still works as a positioning piece. The integration doc gets adapted on the call.
33
+
34
+ ## Where this maps in the substrate
35
+
36
+ - Substrate version: `@tangle-network/agent-eval@0.53.0` (npm), `agent-eval-rpc@0.53.0` (PyPI)
37
+ - agent-runtime version: `@tangle-network/agent-runtime@0.29.0`
38
+ - Key APIs: `fromTangleSandbox`, `fromOtelSpans`, `analyzeRuns`, `selfImprove`, `gepaDriver`, `defaultProductionGate`, `openAutoPr`
39
+ - All ship today; no version-blocking dependencies
40
+
41
+ ## What this kit doesn't yet do
42
+
43
+ - No `npx @tangle-network/intelligence demo` command shipped yet (queued #115 — extend existing `tangle-intel` CLI in ADC with customer-zero-touch subcommands `init` / `demo` / `report` / `improve`)
44
+ - No `staging-intelligence.tangle.tools` live yet (queued #116 — matches existing `staging-{product}.tangle.tools` precedent like sandbox)
45
+ - No live demo video (queued #117 — recorded against legal-agent canonical real data)
46
+ - No screenshot dashboard (gated on Gate 2 task #109 — ADC intelligence frontend renders canonical InsightReport)
47
+ - No published case study with named numbers (Gate 3 task #112 — after first pilot completes 4+ cycles)
48
+
49
+ ## Architectural decisions baked into this kit
50
+
51
+ - **Customer-facing CLI is `@tangle-network/intelligence`** (binary `tangle-intel`), NOT `agent-eval`. `agent-eval` is the substrate package; `intelligence` is the customer product that wraps it. The CLI already exists at `services/intelligence/src/cli/` in agent-dev-container — we extend it with `init` / `demo` / `report` / `improve` subcommands per task #115.
52
+ - **Hosted URL is `staging-intelligence.tangle.tools`** matching `staging-sandbox.tangle.tools` precedent. Production becomes `intelligence.tangle.tools` once Gate 2/3 close.
53
+ - **`agent-eval` is mentioned only when the customer wants direct programmatic access** (not the default path). 90%+ of customers stay at the CLI + hosted dashboard layer.
54
+
55
+ For the FIRST pilot conversation, the JSON sample is the dashboard substitute. After Gate 2 lands we replace it with live screenshots.
56
+
57
+ ## Update cadence
58
+
59
+ This kit gets updated each time:
60
+ - A substrate version ships that customers should know about
61
+ - A real pilot completes and we have a case study to add
62
+ - A customer gives feedback that re-shapes how we pitch
@@ -0,0 +1,90 @@
1
+ # Pre-onboarding checklist — what to have ready
2
+
3
+ Send this to the customer 48h before the onboarding call. If they show up to the call having done this, the 90-minute slot ends with a working pilot.
4
+
5
+ ## What we need from you before the call
6
+
7
+ ### Credentials
8
+
9
+ - [ ] **LLM provider API key** — tcloud key, OpenRouter key, OpenAI key, Anthropic key, or any OpenAI-compat router endpoint
10
+ - [ ] **GitHub token** with PR-write access to your agent repo (optional — required only if you want auto-PR promotion on green gate decisions)
11
+ - [ ] **Sandbox session access** (Tangle stack customers only) — read access to the session IDs we'll analyze
12
+
13
+ ### Data
14
+
15
+ - [ ] **Trace data** — ONE of:
16
+ - Tangle sandbox session IDs (we use `fromTangleSandbox`)
17
+ - OTel spans dumped as JSONL (we use `fromOtelSpans`)
18
+ - Multi-rater feedback table (CSV with runId / rater / score, we use `fromFeedbackTable`)
19
+ - LangChain / LlamaIndex / OpenAI Assistants trace export (we use the corresponding adapter)
20
+ - Custom trace format (we map it together on the call — usually 20 lines of glue)
21
+ - [ ] **Scenarios** — 20-50 representative inputs your agent handles. Any format is fine (YAML, JSON, or a TS array); we'll convert them to the canonical `DatasetScenario[]` shape together
22
+ - [ ] **The system prompt addendum** your agent uses today (or whichever text surface you want to optimize) — the closed loop edits this
23
+
24
+ ### Judge
25
+
26
+ - [ ] **A judge function or rubric** — either:
27
+ - An existing function `(artifact) → { composite, dimensions }`
28
+ - A rubric describing what "good output" means (1-2 paragraphs is enough — we'll build the judge on the call)
29
+ - A set of "good" / "bad" labeled examples (we use these as anchors)
30
+
31
+ ### Constraints
32
+
33
+ - [ ] **LLM cost budget for the closed loop** — default $25 per campaign. Tell us if you want a different ceiling
34
+ - [ ] **Cadence** — how often should the loop run? Default: weekly. Some customers want daily; others want on-demand only
35
+ - [ ] **Deployment gate preference** — do you want:
36
+ - Auto-PR on `ship-substrate` (we open the PR, your team reviews)
37
+ - Manual review only (we report; you decide)
38
+ - Auto-deploy on `ship-substrate` (only with explicit ack; not default)
39
+
40
+ ## Call agenda — 90 minutes
41
+
42
+ | Time | Topic |
43
+ |---|---|
44
+ | 0:00 — 0:10 | Walk through your existing setup — what runs where, what scenarios exist, what success looks like for you |
45
+ | 0:10 — 0:30 | Pick the right intake adapter; pull traces; run `analyzeRuns()` against last week's data — first decision packet rendered live |
46
+ | 0:30 — 0:50 | Build the judge — either wrap your existing one or scaffold a new one from your rubric |
47
+ | 0:50 — 1:10 | Fire one `selfImprove` cycle with a small budget ($5, single generation, 2 candidates) — watch the loop run end-to-end |
48
+ | 1:10 — 1:25 | Wire the cron + auto-PR target; schedule first weekly run |
49
+ | 1:25 — 1:30 | Confirm what we hand back to you between runs and when each item reaches you |
50
+
51
+ If something on the checklist isn't ready, we adapt — just send what you have. Worst case, we spend the first 30 minutes getting unblocked.
52
+
53
+ ## What you'll have at the end of the call
54
+
55
+ - A working `analyzeRuns()` call against YOUR live trace data, returning a real `InsightReport`
56
+ - A judge function (yours or scaffolded) wired to your agent's output shape
57
+ - One completed `selfImprove` cycle with a real `gateDecision` + lift CI
58
+ - A scheduled cron / GitHub Action that runs the loop weekly
59
+ - Optional: an auto-PR target if you want green-gate proposals to land as draft PRs
60
+
61
+ ## After the call
62
+
63
+ - Day 1-7: first weekly run fires; we monitor + jump in if anything breaks
64
+ - Day 7: we send you a `selfImprove`-result summary + the corresponding `InsightReport`
65
+ - Day 14-28: 3 more cycles complete; you have enough data to evaluate the pilot
66
+ - Day 30: pilot review — what we found, what shipped, what's next
67
+
68
+ ## What we send back to you between runs
69
+
70
+ - The full `InsightReport` JSON (you render it however you want, or use our hosted dashboard if it's available for your tier)
71
+ - Slack / email digest of `regressedMetrics` + critical recommendations (opt-in)
72
+ - Cost tally per campaign
73
+ - Auto-PR links if green gate verdicts opened any
74
+
75
+ ## Common pre-call questions
76
+
77
+ **Q: How small a corpus can we start with?**
78
+ A: 15 scenarios works for the deterministic packet. 25+ is recommended for `selfImprove`'s held-out gate (the default `holdoutFraction: 0.3` reserves ~30% of scenarios for the gate).
79
+
80
+ **Q: What if our judge isn't reliable yet?**
81
+ A: Start with multi-rater intake — `fromFeedbackTable` produces inter-rater agreement (κ) so you can see exactly which scenarios humans disagree on. Iterate on the judge until κ > 0.7, then move to the closed loop.
82
+
83
+ **Q: We don't use Tangle's sandbox — can we still pilot?**
84
+ A: Yes. We have intake adapters for OTel, LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants, OpenRouter, multi-rater feedback tables, and custom trace formats. See `integration-foreign-stack.md`.
85
+
86
+ **Q: We use OpenRouter — does the closed-loop driver work with our routing setup?**
87
+ A: Yes. `gepaDriver` accepts any OpenAI-compatible endpoint via its `llm.baseUrl` option. Most customers run their selfImprove campaigns through OpenRouter or their existing provider — no migration required.
88
+
89
+ **Q: What if the pilot fails — what do we get?**
90
+ A: You get the deterministic `InsightReport` weekly regardless. Even if no `selfImprove` cycle ever ships a green gate verdict, you get the failure-cluster analysis, regressed-metric detection, and worst-runs surfacing. Those alone replace what most teams currently get from LangSmith / Braintrust / Phoenix scorecards.