@tangle-network/agent-eval 0.53.0 → 0.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +7 -6
- package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/builder-eval/index.d.ts +4 -3
- package/dist/campaign/index.d.ts +9 -7
- package/dist/campaign/index.js +33 -4
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
- package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
- package/dist/chunk-NCK5QLGT.js.map +1 -0
- package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
- package/dist/contract/index.d.ts +13 -12
- package/dist/contract/index.js +25 -0
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
- package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
- package/dist/control.d.ts +7 -6
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
- package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
- package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
- package/dist/governance/index.d.ts +3 -2
- package/dist/hosted/index.d.ts +7 -6
- package/dist/{index-C7RhhEME.d.ts → index-D2nT6_KT.d.ts} +20 -2
- package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
- package/dist/index.d.ts +31 -29
- package/dist/index.js +3 -3
- package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
- package/dist/knowledge/index.d.ts +4 -3
- package/dist/meta-eval/index.d.ts +4 -3
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.d.ts +7 -6
- package/dist/prm/index.d.ts +5 -4
- package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
- package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
- package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
- package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
- package/dist/reporting.d.ts +7 -6
- package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
- package/dist/rl.d.ts +11 -10
- package/dist/rl.js +2 -2
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
- package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
- package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
- package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
- package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
- package/dist/store-CKUAgsJz.d.ts +101 -0
- package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
- package/dist/traces.d.ts +7 -6
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
- package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
- package/dist/wire/index.d.ts +5 -4
- package/docs/pilot/README.md +62 -0
- package/docs/pilot/customer-checklist.md +90 -0
- package/docs/pilot/integration-foreign-stack.md +296 -0
- package/docs/pilot/integration-tangle-stack.md +248 -0
- package/docs/pilot/one-pager.md +161 -0
- package/docs/pilot/sample-insight-report.json +172 -0
- package/docs/research/research-roadmap.md +204 -0
- package/package.json +1 -1
- package/dist/chunk-BWZEGTES.js.map +0 -1
- /package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
- /package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0
|
@@ -196,102 +196,4 @@ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
|
|
|
196
196
|
declare function isJudgeSpan(s: Span): s is JudgeSpan;
|
|
197
197
|
declare function isSandboxSpan(s: Span): s is SandboxSpan;
|
|
198
198
|
|
|
199
|
-
|
|
200
|
-
scenarioId?: string;
|
|
201
|
-
variantId?: string;
|
|
202
|
-
status?: RunStatus;
|
|
203
|
-
since?: number;
|
|
204
|
-
until?: number;
|
|
205
|
-
tag?: {
|
|
206
|
-
key: string;
|
|
207
|
-
value: string;
|
|
208
|
-
};
|
|
209
|
-
parentRunId?: string;
|
|
210
|
-
projectId?: string;
|
|
211
|
-
chatId?: string;
|
|
212
|
-
layer?: RunLayer;
|
|
213
|
-
}
|
|
214
|
-
interface SpanFilter {
|
|
215
|
-
runId?: string;
|
|
216
|
-
parentSpanId?: string;
|
|
217
|
-
kind?: SpanKind;
|
|
218
|
-
name?: string;
|
|
219
|
-
toolName?: string;
|
|
220
|
-
judgeId?: string;
|
|
221
|
-
since?: number;
|
|
222
|
-
until?: number;
|
|
223
|
-
}
|
|
224
|
-
interface EventFilter {
|
|
225
|
-
runId?: string;
|
|
226
|
-
spanId?: string;
|
|
227
|
-
kind?: EventKind;
|
|
228
|
-
since?: number;
|
|
229
|
-
until?: number;
|
|
230
|
-
}
|
|
231
|
-
interface TraceStore {
|
|
232
|
-
appendRun(run: Run): Promise<void>;
|
|
233
|
-
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
234
|
-
appendSpan(span: Span): Promise<void>;
|
|
235
|
-
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
236
|
-
appendEvent(event: TraceEvent): Promise<void>;
|
|
237
|
-
appendArtifact(artifact: Artifact): Promise<void>;
|
|
238
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
239
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
240
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
241
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
242
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
243
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
244
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
245
|
-
}
|
|
246
|
-
declare class InMemoryTraceStore implements TraceStore {
|
|
247
|
-
private runs;
|
|
248
|
-
private allSpans;
|
|
249
|
-
private allEvents;
|
|
250
|
-
private allArtifacts;
|
|
251
|
-
private allBudget;
|
|
252
|
-
appendRun(run: Run): Promise<void>;
|
|
253
|
-
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
254
|
-
appendSpan(span: Span): Promise<void>;
|
|
255
|
-
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
256
|
-
appendEvent(event: TraceEvent): Promise<void>;
|
|
257
|
-
appendArtifact(artifact: Artifact): Promise<void>;
|
|
258
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
259
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
260
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
261
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
262
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
263
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
264
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
265
|
-
}
|
|
266
|
-
interface FileSystemTraceStoreOptions {
|
|
267
|
-
dir: string;
|
|
268
|
-
/** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
|
|
269
|
-
maxBytes?: number;
|
|
270
|
-
}
|
|
271
|
-
declare class FileSystemTraceStore implements TraceStore {
|
|
272
|
-
private dir;
|
|
273
|
-
private maxBytes;
|
|
274
|
-
/** Lazy in-memory index for queries — populated on first read. */
|
|
275
|
-
private index?;
|
|
276
|
-
private loaded;
|
|
277
|
-
constructor(options: FileSystemTraceStoreOptions);
|
|
278
|
-
private ensureDir;
|
|
279
|
-
private append;
|
|
280
|
-
private insertInto;
|
|
281
|
-
private load;
|
|
282
|
-
appendRun(run: Run): Promise<void>;
|
|
283
|
-
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
284
|
-
appendSpan(span: Span): Promise<void>;
|
|
285
|
-
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
286
|
-
appendEvent(event: TraceEvent): Promise<void>;
|
|
287
|
-
appendArtifact(artifact: Artifact): Promise<void>;
|
|
288
|
-
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
289
|
-
getRun(runId: string): Promise<Run | undefined>;
|
|
290
|
-
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
291
|
-
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
292
|
-
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
293
|
-
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
294
|
-
artifacts(runId: string): Promise<Artifact[]>;
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
|
|
199
|
+
export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type ToolSpan as T, type TraceEvent as a, type RunOutcome as b, type SpanKind as c, type RetrievalSpan as d, type SandboxSpan as e, type RunStatus as f, type RunLayer as g, type BudgetSpec as h, FAILURE_CLASSES as i, type SpanBase as j, type SpanStatus as k, TRACE_SCHEMA_VERSION as l, isJudgeSpan as m, isLlmSpan as n, isRetrievalSpan as o, isSandboxSpan as p, isToolSpan as q };
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import { R as Run, S as Span, a as TraceEvent, A as Artifact, B as BudgetLedgerEntry, f as RunStatus, g as RunLayer, c as SpanKind, E as EventKind } from './schema-m0gsnbt3.js';
|
|
2
|
+
|
|
3
|
+
interface RunFilter {
|
|
4
|
+
scenarioId?: string;
|
|
5
|
+
variantId?: string;
|
|
6
|
+
status?: RunStatus;
|
|
7
|
+
since?: number;
|
|
8
|
+
until?: number;
|
|
9
|
+
tag?: {
|
|
10
|
+
key: string;
|
|
11
|
+
value: string;
|
|
12
|
+
};
|
|
13
|
+
parentRunId?: string;
|
|
14
|
+
projectId?: string;
|
|
15
|
+
chatId?: string;
|
|
16
|
+
layer?: RunLayer;
|
|
17
|
+
}
|
|
18
|
+
interface SpanFilter {
|
|
19
|
+
runId?: string;
|
|
20
|
+
parentSpanId?: string;
|
|
21
|
+
kind?: SpanKind;
|
|
22
|
+
name?: string;
|
|
23
|
+
toolName?: string;
|
|
24
|
+
judgeId?: string;
|
|
25
|
+
since?: number;
|
|
26
|
+
until?: number;
|
|
27
|
+
}
|
|
28
|
+
interface EventFilter {
|
|
29
|
+
runId?: string;
|
|
30
|
+
spanId?: string;
|
|
31
|
+
kind?: EventKind;
|
|
32
|
+
since?: number;
|
|
33
|
+
until?: number;
|
|
34
|
+
}
|
|
35
|
+
interface TraceStore {
|
|
36
|
+
appendRun(run: Run): Promise<void>;
|
|
37
|
+
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
38
|
+
appendSpan(span: Span): Promise<void>;
|
|
39
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
40
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
41
|
+
appendArtifact(artifact: Artifact): Promise<void>;
|
|
42
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
43
|
+
getRun(runId: string): Promise<Run | undefined>;
|
|
44
|
+
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
45
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
46
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
47
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
48
|
+
artifacts(runId: string): Promise<Artifact[]>;
|
|
49
|
+
}
|
|
50
|
+
declare class InMemoryTraceStore implements TraceStore {
|
|
51
|
+
private runs;
|
|
52
|
+
private allSpans;
|
|
53
|
+
private allEvents;
|
|
54
|
+
private allArtifacts;
|
|
55
|
+
private allBudget;
|
|
56
|
+
appendRun(run: Run): Promise<void>;
|
|
57
|
+
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
58
|
+
appendSpan(span: Span): Promise<void>;
|
|
59
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
60
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
61
|
+
appendArtifact(artifact: Artifact): Promise<void>;
|
|
62
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
63
|
+
getRun(runId: string): Promise<Run | undefined>;
|
|
64
|
+
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
65
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
66
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
67
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
68
|
+
artifacts(runId: string): Promise<Artifact[]>;
|
|
69
|
+
}
|
|
70
|
+
interface FileSystemTraceStoreOptions {
|
|
71
|
+
dir: string;
|
|
72
|
+
/** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
|
|
73
|
+
maxBytes?: number;
|
|
74
|
+
}
|
|
75
|
+
declare class FileSystemTraceStore implements TraceStore {
|
|
76
|
+
private dir;
|
|
77
|
+
private maxBytes;
|
|
78
|
+
/** Lazy in-memory index for queries — populated on first read. */
|
|
79
|
+
private index?;
|
|
80
|
+
private loaded;
|
|
81
|
+
constructor(options: FileSystemTraceStoreOptions);
|
|
82
|
+
private ensureDir;
|
|
83
|
+
private append;
|
|
84
|
+
private insertInto;
|
|
85
|
+
private load;
|
|
86
|
+
appendRun(run: Run): Promise<void>;
|
|
87
|
+
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
88
|
+
appendSpan(span: Span): Promise<void>;
|
|
89
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
90
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
91
|
+
appendArtifact(artifact: Artifact): Promise<void>;
|
|
92
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
93
|
+
getRun(runId: string): Promise<Run | undefined>;
|
|
94
|
+
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
95
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
96
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
97
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
98
|
+
artifacts(runId: string): Promise<Artifact[]>;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
export { type EventFilter as E, FileSystemTraceStore as F, InMemoryTraceStore as I, type RunFilter as R, type SpanFilter as S, type TraceStore as T, type FileSystemTraceStoreOptions as a };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { R as RunRecord } from './run-record-
|
|
2
|
-
import { F as FailureClusterReport } from './failure-cluster-
|
|
1
|
+
import { R as RunRecord } from './run-record-etiCMsUq.js';
|
|
2
|
+
import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* HeldOutGate — first-class held-out paired-delta promotion gate.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { T as TraceEmitter } from './emitter-
|
|
2
|
-
import { R as Run, F as FailureClass
|
|
1
|
+
import { T as TraceEmitter } from './emitter-DEZwY14K.js';
|
|
2
|
+
import { R as Run, F as FailureClass } from './schema-m0gsnbt3.js';
|
|
3
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* SandboxHarness — executes a scenario in an isolated environment and
|
package/dist/traces.d.ts
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
|
|
2
2
|
import { R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
|
|
3
3
|
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
4
|
-
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-
|
|
5
|
-
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-
|
|
6
|
-
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-
|
|
7
|
-
import { T as TraceStore } from './store-
|
|
8
|
-
export {
|
|
9
|
-
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-
|
|
4
|
+
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
|
|
5
|
+
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
|
|
6
|
+
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
|
|
7
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
8
|
+
export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
|
|
9
|
+
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
|
|
10
|
+
export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
|
|
10
11
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
11
12
|
import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
|
|
12
13
|
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { S as Span,
|
|
1
|
+
import { S as Span, a as TraceEvent } from './schema-m0gsnbt3.js';
|
|
2
|
+
import { T as TraceStore } from './store-CKUAgsJz.js';
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Trajectory — ordered, structured view over a run's spans.
|
|
@@ -248,6 +248,23 @@ interface CampaignCostMeter {
|
|
|
248
248
|
* training scenarios unless explicitly opted in). */
|
|
249
249
|
type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
|
|
250
250
|
type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
|
|
251
|
+
/** How much a label can be trusted to evaluate against — the gold-admission
|
|
252
|
+
* gate. Strictly ordered: a record qualifies for a `minTrust` filter when its
|
|
253
|
+
* trust rank is >= the requested rank.
|
|
254
|
+
*
|
|
255
|
+
* - `unverified` — label is a heuristic (e.g. raw outcome success/fail).
|
|
256
|
+
* Fine as corpus; MUST NOT enter a gold set that lift
|
|
257
|
+
* numbers are computed against.
|
|
258
|
+
* - `verified-signal` — an external signal confirmed the outcome (PR merged,
|
|
259
|
+
* tests green, user did not retry, downstream check).
|
|
260
|
+
* - `human-rated` — a human explicitly rated or corrected the artifact.
|
|
261
|
+
*
|
|
262
|
+
* Absent on a write ⇒ treated as `unverified` (fail-closed: a writer must
|
|
263
|
+
* explicitly assert trust to make a record gold-eligible — it never happens
|
|
264
|
+
* by accident). */
|
|
265
|
+
type LabelTrust = 'unverified' | 'verified-signal' | 'human-rated';
|
|
266
|
+
/** Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). */
|
|
267
|
+
declare function labelTrustRank(trust: LabelTrust | undefined): number;
|
|
251
268
|
/** @experimental Required-provenance write. The store rejects writes that
|
|
252
269
|
* lack provenance — a default-on flywheel without provenance is the
|
|
253
270
|
* data-poisoning vector flagged in the alignment review. */
|
|
@@ -259,6 +276,11 @@ interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact
|
|
|
259
276
|
sourceVersionHash: string;
|
|
260
277
|
capturedAt: string;
|
|
261
278
|
redactionStatus: RedactionStatus;
|
|
279
|
+
/** Gold-admission trust tier. Absent ⇒ `unverified` (fail-closed): the
|
|
280
|
+
* record is corpus, never gold. A writer must explicitly assert
|
|
281
|
+
* `verified-signal` or `human-rated` to make it eligible for a gold
|
|
282
|
+
* sample. See {@link LabelTrust}. */
|
|
283
|
+
labelTrust?: LabelTrust;
|
|
262
284
|
/** Optional per-source rate-limit bucket key (e.g., the tenant id). */
|
|
263
285
|
rateLimitBucket?: string;
|
|
264
286
|
}
|
|
@@ -282,6 +304,11 @@ interface LabeledScenarioSampleArgs {
|
|
|
282
304
|
source?: LabeledScenarioSource | LabeledScenarioSource[];
|
|
283
305
|
minComposite?: number;
|
|
284
306
|
maxComposite?: number;
|
|
307
|
+
/** Gold gate: only records whose trust rank is >= this tier are
|
|
308
|
+
* returned. `sample({ split: 'test', minTrust: 'verified-signal' })` is
|
|
309
|
+
* the canonical "give me the gold set" call. Absent ⇒ no trust gate
|
|
310
|
+
* (corpus-level read). */
|
|
311
|
+
minTrust?: LabelTrust;
|
|
285
312
|
};
|
|
286
313
|
}
|
|
287
314
|
interface LabeledScenarioStore {
|
|
@@ -291,6 +318,9 @@ interface LabeledScenarioStore {
|
|
|
291
318
|
train: number;
|
|
292
319
|
test: number;
|
|
293
320
|
bySource: Record<string, number>;
|
|
321
|
+
/** Count by trust tier — tells the flywheel how much gold it has
|
|
322
|
+
* accumulated vs. raw corpus. */
|
|
323
|
+
byTrust: Record<LabelTrust, number>;
|
|
294
324
|
}>;
|
|
295
325
|
}
|
|
296
326
|
interface CampaignCellResult<TArtifact> {
|
|
@@ -372,4 +402,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
|
|
|
372
402
|
scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
|
|
373
403
|
}
|
|
374
404
|
|
|
375
|
-
export type
|
|
405
|
+
export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ScenarioAggregate as v, type SessionScript as w, labelTrustRank as x };
|
package/dist/wire/index.d.ts
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
|
-
import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-
|
|
2
|
-
import { T as TraceStore } from '../store-
|
|
1
|
+
import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-DpUmE90J.js';
|
|
2
|
+
import { T as TraceStore } from '../store-CKUAgsJz.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { OpenAPIObject } from 'openapi3-ts/oas31';
|
|
5
5
|
import * as hono_types from 'hono/types';
|
|
6
6
|
import { ServerType } from '@hono/node-server';
|
|
7
7
|
import { Hono } from 'hono';
|
|
8
|
-
import '../control-runtime-
|
|
9
|
-
import '../emitter-
|
|
8
|
+
import '../control-runtime-DuFBYg7A.js';
|
|
9
|
+
import '../emitter-DEZwY14K.js';
|
|
10
|
+
import '../schema-m0gsnbt3.js';
|
|
10
11
|
import '../dataset-BlwAtYYf.js';
|
|
11
12
|
import '../errors-mje_cKOs.js';
|
|
12
13
|
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Pilot Kit — customer handoff materials
|
|
2
|
+
|
|
3
|
+
What's here, in order of use:
|
|
4
|
+
|
|
5
|
+
| File | For | When |
|
|
6
|
+
|---|---|---|
|
|
7
|
+
| [one-pager.md](./one-pager.md) | Customer's first read | Send as the initial pitch — what they get, why it's different, what it looks like, what it costs. Now includes an intake-paths matrix for non-Tangle customers (LangChain / LlamaIndex / Anthropic SDK / OpenAI Assistants / OpenRouter / vLLM / Ollama / custom). |
|
|
8
|
+
| [integration-tangle-stack.md](./integration-tangle-stack.md) | Customer's engineer (Tangle-stack customers) | Send after one-pager when they want to see the code; full integration walkthrough for the canonical Tangle stack (sandbox + tcloud) |
|
|
9
|
+
| [integration-foreign-stack.md](./integration-foreign-stack.md) | Customer's engineer (non-Tangle customers) | Send after one-pager when they're on OTel, LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants, OpenRouter, vLLM, Ollama, or custom. Covers every path. |
|
|
10
|
+
| [sample-insight-report.json](./sample-insight-report.json) | Customer's team meeting | Concrete JSON they can show to demonstrate value pre-integration |
|
|
11
|
+
| [customer-checklist.md](./customer-checklist.md) | Pre-onboarding-call | Send 48 hours before the call; ensures the 90-minute slot is productive. Provider-agnostic — works for any stack. |
|
|
12
|
+
|
|
13
|
+
## How to use this kit
|
|
14
|
+
|
|
15
|
+
**For a Tangle customer asking for it RIGHT NOW:**
|
|
16
|
+
|
|
17
|
+
1. Reply with the one-pager (`one-pager.md`) inline + the sample InsightReport (`sample-insight-report.json`) attached. Their senior engineer reads this and decides if it's worth a call.
|
|
18
|
+
2. If they say yes, send the integration guide (`integration-tangle-stack.md`) + the checklist (`customer-checklist.md`). Schedule a 90-minute onboarding call.
|
|
19
|
+
3. On the call: walk through the integration, run a live `analyzeRuns()` against their existing sandbox sessions, render the deterministic packet, fire one small `selfImprove` cycle. By the end of the call they have a working pilot.
|
|
20
|
+
|
|
21
|
+
**For Drew handling the conversation himself:**
|
|
22
|
+
|
|
23
|
+
The whole kit is written in our voice (technical, direct, no marketing fluff). You can paste sections directly into Slack / email / a customer call. The one-pager is meant to read as YOUR pitch, not a generic SaaS handout.
|
|
24
|
+
|
|
25
|
+
## What this kit assumes
|
|
26
|
+
|
|
27
|
+
- Customer is on the Tangle stack (sandbox + tcloud) OR emits OTel traces
|
|
28
|
+
- Customer has an agent with a clear system-prompt addendum we can optimize
|
|
29
|
+
- Customer has at least 20 scenarios their agent handles
|
|
30
|
+
- Customer is willing to set a `maxUsd` budget for closed-loop campaigns
|
|
31
|
+
|
|
32
|
+
If any of those don't apply, the one-pager still works as a positioning piece. The integration doc gets adapted on the call.
|
|
33
|
+
|
|
34
|
+
## Where this maps in the substrate
|
|
35
|
+
|
|
36
|
+
- Substrate version: `@tangle-network/agent-eval@0.53.0` (npm), `agent-eval-rpc@0.53.0` (PyPI)
|
|
37
|
+
- agent-runtime version: `@tangle-network/agent-runtime@0.29.0`
|
|
38
|
+
- Key APIs: `fromTangleSandbox`, `fromOtelSpans`, `analyzeRuns`, `selfImprove`, `gepaDriver`, `defaultProductionGate`, `openAutoPr`
|
|
39
|
+
- All ship today; no version-blocking dependencies
|
|
40
|
+
|
|
41
|
+
## What this kit doesn't yet do
|
|
42
|
+
|
|
43
|
+
- No `npx @tangle-network/intelligence demo` command shipped yet (queued #115 — extend existing `tangle-intel` CLI in ADC with customer-zero-touch subcommands `init` / `demo` / `report` / `improve`)
|
|
44
|
+
- No `staging-intelligence.tangle.tools` live yet (queued #116 — matches existing `staging-{product}.tangle.tools` precedent like sandbox)
|
|
45
|
+
- No live demo video (queued #117 — recorded against legal-agent canonical real data)
|
|
46
|
+
- No dashboard screenshots yet (gated on Gate 2 task #109 — ADC intelligence frontend renders canonical InsightReport)
|
|
47
|
+
- No published case study with named numbers (Gate 3 task #112 — after first pilot completes 4+ cycles)
|
|
48
|
+
|
|
49
|
+
## Architectural decisions baked into this kit
|
|
50
|
+
|
|
51
|
+
- **Customer-facing CLI is `@tangle-network/intelligence`** (binary `tangle-intel`), NOT `agent-eval`. `agent-eval` is the substrate package; `intelligence` is the customer product that wraps it. The CLI already exists at `services/intelligence/src/cli/` in agent-dev-container — we extend it with `init` / `demo` / `report` / `improve` subcommands per task #115.
|
|
52
|
+
- **Hosted URL is `staging-intelligence.tangle.tools`** matching `staging-sandbox.tangle.tools` precedent. Production becomes `intelligence.tangle.tools` once Gate 2/3 close.
|
|
53
|
+
- **`agent-eval` is mentioned only when the customer wants direct programmatic access** (not the default path). 90%+ of customers stay at the CLI + hosted dashboard layer.
|
|
54
|
+
|
|
55
|
+
For the FIRST pilot conversation, the JSON sample is the dashboard substitute. After Gate 2 lands we replace it with live screenshots.
|
|
56
|
+
|
|
57
|
+
## Update cadence
|
|
58
|
+
|
|
59
|
+
This kit gets updated each time:
|
|
60
|
+
- A substrate version ships that customers should know about
|
|
61
|
+
- A real pilot completes and we have a case study to add
|
|
62
|
+
- A customer gives feedback that re-shapes how we pitch
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Pre-onboarding checklist — what to have ready
|
|
2
|
+
|
|
3
|
+
Send this to the customer 48h before the onboarding call. If they show up to the call having done this, the 90-minute slot ends with a working pilot.
|
|
4
|
+
|
|
5
|
+
## What we need from you before the call
|
|
6
|
+
|
|
7
|
+
### Credentials
|
|
8
|
+
|
|
9
|
+
- [ ] **LLM provider API key** — tcloud key, OpenRouter key, OpenAI key, Anthropic key, or any OpenAI-compatible router endpoint
|
|
10
|
+
- [ ] **GitHub token** with PR-write access to your agent repo (optional — required only if you want auto-PR promotion on green gate decisions)
|
|
11
|
+
- [ ] **Sandbox session access** (Tangle stack customers only) — read access to the session IDs we'll analyze
|
|
12
|
+
|
|
13
|
+
### Data
|
|
14
|
+
|
|
15
|
+
- [ ] **Trace data** — ONE of:
|
|
16
|
+
- Tangle sandbox session IDs (we use `fromTangleSandbox`)
|
|
17
|
+
- OTel spans dumped as JSONL (we use `fromOtelSpans`)
|
|
18
|
+
- Multi-rater feedback table (CSV with runId / rater / score, we use `fromFeedbackTable`)
|
|
19
|
+
- LangChain / LlamaIndex / OpenAI Assistants trace export (we use the corresponding adapter)
|
|
20
|
+
- Custom trace format (we map it together on the call — usually 20 lines of glue)
|
|
21
|
+
- [ ] **Scenarios** — 20-50 representative inputs your agent handles. A YAML file, a JSON file, or a TS array is fine; we'll convert it to the canonical `DatasetScenario[]` shape together
|
|
22
|
+
- [ ] **The system prompt addendum** your agent uses today (or whichever text surface you want to optimize) — the closed loop edits this
|
|
23
|
+
|
|
24
|
+
### Judge
|
|
25
|
+
|
|
26
|
+
- [ ] **A judge function or rubric** — either:
|
|
27
|
+
- An existing function `(artifact) → { composite, dimensions }`
|
|
28
|
+
- A rubric describing what "good output" means (1-2 paragraphs is enough — we'll build the judge on the call)
|
|
29
|
+
- A set of "good" / "bad" labeled examples (we use these as anchors)
|
|
30
|
+
|
|
31
|
+
### Constraints
|
|
32
|
+
|
|
33
|
+
- [ ] **LLM cost budget for the closed loop** — default $25 per campaign. Tell us if you want a different ceiling
|
|
34
|
+
- [ ] **Cadence** — how often should the loop run? Default: weekly. Some customers want daily; others want on-demand only
|
|
35
|
+
- [ ] **Deployment gate preference** — do you want:
|
|
36
|
+
- Auto-PR on `ship-substrate` (we open the PR, your team reviews)
|
|
37
|
+
- Manual review only (we report; you decide)
|
|
38
|
+
- Auto-deploy on `ship-substrate` (only with explicit ack; not default)
|
|
39
|
+
|
|
40
|
+
## Call agenda — 90 minutes
|
|
41
|
+
|
|
42
|
+
| Time | Topic |
|
|
43
|
+
|---|---|
|
|
44
|
+
| 0:00 — 0:10 | Walk through your existing setup — what runs where, what scenarios exist, what success looks like for you |
|
|
45
|
+
| 0:10 — 0:30 | Pick the right intake adapter; pull traces; run `analyzeRuns()` against last week's data — first decision packet rendered live |
|
|
46
|
+
| 0:30 — 0:50 | Build the judge — either wrap your existing one or scaffold a new one from your rubric |
|
|
47
|
+
| 0:50 — 1:10 | Fire one `selfImprove` cycle with a small budget ($5, single generation, 2 candidates) — watch the loop run end-to-end |
|
|
48
|
+
| 1:10 — 1:25 | Wire the cron + auto-PR target; schedule first weekly run |
|
|
49
|
+
| 1:25 — 1:30 | Confirm what we send back to you between runs, and when each item reaches you |
|
|
50
|
+
|
|
51
|
+
If something on the checklist isn't ready, we adapt — just send what you have. Worst case, we spend the first 30 minutes getting unblocked.
|
|
52
|
+
|
|
53
|
+
## What you'll have at the end of the call
|
|
54
|
+
|
|
55
|
+
- A working `analyzeRuns()` call against YOUR live trace data, returning a real `InsightReport`
|
|
56
|
+
- A judge function (yours or scaffolded) wired to your agent's output shape
|
|
57
|
+
- One completed `selfImprove` cycle with a real `gateDecision` + lift CI
|
|
58
|
+
- A scheduled cron / GitHub Action that runs the loop weekly
|
|
59
|
+
- Optional: an auto-PR target if you want green-gate proposals to land as draft PRs
|
|
60
|
+
|
|
61
|
+
## After the call
|
|
62
|
+
|
|
63
|
+
- Day 1-7: first weekly run fires; we monitor + jump in if anything breaks
|
|
64
|
+
- Day 7: we send you a `selfImprove`-result summary + the corresponding `InsightReport`
|
|
65
|
+
- Day 14-28: 3 more cycles complete; you have enough data to evaluate the pilot
|
|
66
|
+
- Day 30: pilot review — what we found, what shipped, what's next
|
|
67
|
+
|
|
68
|
+
## What we send back to you between runs
|
|
69
|
+
|
|
70
|
+
- The full `InsightReport` JSON (you render it however you want, or use our hosted dashboard if it's available for your tier)
|
|
71
|
+
- Slack / email digest of `regressedMetrics` + critical recommendations (opt-in)
|
|
72
|
+
- Cost tally per campaign
|
|
73
|
+
- Auto-PR links if green gate verdicts opened any
|
|
74
|
+
|
|
75
|
+
## Common pre-call questions
|
|
76
|
+
|
|
77
|
+
**Q: How small a corpus can we start with?**
|
|
78
|
+
A: 15 scenarios works for the deterministic packet. 25+ is recommended for `selfImprove`'s held-out gate (the default `holdoutFraction: 0.3` reserves ~30% of scenarios for the gate).
|
|
79
|
+
|
|
80
|
+
**Q: What if our judge isn't reliable yet?**
|
|
81
|
+
A: Start with multi-rater intake — `fromFeedbackTable` produces inter-rater agreement (κ) so you can see exactly which scenarios humans disagree on. Iterate the judge until κ > 0.7, then go to closed loop.
|
|
82
|
+
|
|
83
|
+
**Q: We don't use Tangle's sandbox — can we still pilot?**
|
|
84
|
+
A: Yes. We have intake adapters for OTel, LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants, OpenRouter, multi-rater feedback tables, and custom trace formats. See `integration-foreign-stack.md`.
|
|
85
|
+
|
|
86
|
+
**Q: We use OpenRouter — does the closed-loop driver work with our routing setup?**
|
|
87
|
+
A: Yes. `gepaDriver` accepts any OpenAI-compatible endpoint via its `llm.baseUrl` option. Most customers run their selfImprove campaigns through OpenRouter or their existing provider — no migration required.
|
|
88
|
+
|
|
89
|
+
**Q: What if the pilot fails — what do we get?**
|
|
90
|
+
A: You get the deterministic `InsightReport` weekly regardless. Even if no `selfImprove` cycle ever ships a green gate verdict, you get the failure-cluster analysis, regressed-metric detection, and worst-runs surfacing. Those alone replace what most teams currently get from LangSmith / Braintrust / Phoenix scorecards.
|