@tangle-network/agent-eval 0.20.11 → 0.20.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +99 -170
  2. package/dist/benchmarks/index.d.ts +2 -1
  3. package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
  4. package/dist/chunk-75MCTH7P.js.map +1 -0
  5. package/dist/chunk-HKYRWNHV.js +1354 -0
  6. package/dist/chunk-HKYRWNHV.js.map +1 -0
  7. package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
  8. package/dist/chunk-IKFVX537.js +717 -0
  9. package/dist/chunk-IKFVX537.js.map +1 -0
  10. package/dist/chunk-KWUAAIHR.js +1764 -0
  11. package/dist/chunk-KWUAAIHR.js.map +1 -0
  12. package/dist/chunk-MCMV7DUL.js +1310 -0
  13. package/dist/chunk-MCMV7DUL.js.map +1 -0
  14. package/dist/chunk-ODFINDLQ.js +413 -0
  15. package/dist/chunk-ODFINDLQ.js.map +1 -0
  16. package/dist/chunk-PKCVBYTQ.js +200 -0
  17. package/dist/chunk-PKCVBYTQ.js.map +1 -0
  18. package/dist/chunk-YUFXO3TU.js +148 -0
  19. package/dist/chunk-YUFXO3TU.js.map +1 -0
  20. package/dist/cli.js +2 -2
  21. package/dist/control-C8NKbF3w.d.ts +258 -0
  22. package/dist/control.d.ts +5 -0
  23. package/dist/control.js +30 -0
  24. package/dist/control.js.map +1 -0
  25. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  26. package/dist/emitter-BYO2nSDA.d.ts +387 -0
  27. package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
  28. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  29. package/dist/index.d.ts +115 -2870
  30. package/dist/index.js +1049 -6156
  31. package/dist/index.js.map +1 -1
  32. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  33. package/dist/openapi.json +1 -1
  34. package/dist/optimization.d.ts +145 -0
  35. package/dist/optimization.js +60 -0
  36. package/dist/optimization.js.map +1 -0
  37. package/dist/reporting.d.ts +426 -0
  38. package/dist/reporting.js +32 -0
  39. package/dist/reporting.js.map +1 -0
  40. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  41. package/dist/traces.d.ts +658 -0
  42. package/dist/traces.js +100 -0
  43. package/dist/traces.js.map +1 -0
  44. package/dist/wire/index.js +2 -2
  45. package/docs/concepts.md +16 -11
  46. package/docs/feature-guide.md +10 -17
  47. package/docs/integration-launch-gates.md +77 -0
  48. package/docs/product-eval-adoption.md +27 -0
  49. package/docs/trace-analysis.md +75 -0
  50. package/package.json +21 -1
  51. package/dist/chunk-JAOLXRIA.js.map +0 -1
  52. package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0
@@ -0,0 +1,387 @@
1
+ /**
2
+ * TraceSchema v1 — the canonical data model for agent-eval.
3
+ *
4
+ * Every score, every failure class, every pipeline in the framework is
5
+ * a view over this data. Shape it once, live with it.
6
+ *
7
+ * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
8
+ * but extended with agent-specific span kinds (llm, tool, retrieval,
9
+ * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
10
+ * entities that OTEL leaves as free-form attributes.
11
+ */
12
+ declare const TRACE_SCHEMA_VERSION = "1.0.0";
13
+ type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
14
+ interface BudgetSpec {
15
+ tokens?: number;
16
+ wallMs?: number;
17
+ calls?: number;
18
+ usd?: number;
19
+ }
20
+ interface RunOutcome {
21
+ score?: number;
22
+ pass?: boolean;
23
+ failureClass?: FailureClass;
24
+ notes?: string;
25
+ }
26
+ /**
27
+ * Layer — optional classification in a nested build workflow.
28
+ * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
29
+ * `app-build`: sandbox harness that compiled + tested the generated scaffold.
30
+ * `app-runtime`: a run of the generated agent against a domain scenario.
31
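+ * `meta`: any meta-eval (judge replay, correlation analysis). `custom`: also accepted by the type — presumably for runs that fit none of the layers above (confirm).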
32
+ */
33
+ type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
34
+ interface Run {
35
+ runId: string;
36
+ /**
37
+ * Stable identifier of the scenario being executed.
38
+ *
39
+ * Always populated on the persisted Run — but `TraceEmitter.startRun` accepts
40
+ * input WITHOUT this field, substituting a sensible default
41
+ * (`run.layer ?? run.tags?.['kind'] ?? 'runtime'`) when the caller has no
42
+ * curated scenario to anchor to (runtime / operator / meta-eval runs). This
43
+ * keeps the persisted shape unambiguous for downstream filters + aggregations
44
+ * while removing the boilerplate of inventing placeholder ids at the call site.
45
+ */
46
+ scenarioId: string;
47
+ variantId?: string;
48
+ datasetVersion?: string;
49
+ /** Git SHA of agent code at run time. */
50
+ codeSha?: string;
51
+ /** Hash of the prompt template + any system prompt. */
52
+ promptSha?: string;
53
+ /** Model id + date + system-prompt hash, concatenated. */
54
+ modelFingerprint?: string;
55
+ seed?: number;
56
+ /** Arbitrary environment markers (shell, docker version, tz). */
57
+ envFingerprint?: Record<string, string>;
58
+ /** Version of the redaction rules applied to this run. */
59
+ redactionVersion?: string;
60
+ /** Parent run in a nested build workflow. A builder run's children are
61
+ * app-build runs, whose children in turn are app-runtime runs. */
62
+ parentRunId?: string;
63
+ /** Stable project identifier — groups runs across chats + sessions. */
64
+ projectId?: string;
65
+ /** Chat/conversation identifier within a project. */
66
+ chatId?: string;
67
+ /** Layer classification — hint for aggregation; not enforced. */
68
+ layer?: RunLayer;
69
+ startedAt: number;
70
+ endedAt?: number;
71
+ status: RunStatus;
72
+ outcome?: RunOutcome;
73
+ budget?: BudgetSpec;
74
+ /** Free-form labels for downstream grouping. */
75
+ tags?: Record<string, string>;
76
+ }
77
+ type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
78
+ type SpanStatus = 'ok' | 'error';
79
+ interface SpanBase {
80
+ spanId: string;
81
+ parentSpanId?: string;
82
+ runId: string;
83
+ kind: SpanKind;
84
+ name: string;
85
+ startedAt: number;
86
+ endedAt?: number;
87
+ status?: SpanStatus;
88
+ error?: string;
89
+ /** Anything not covered by typed fields. Kept deliberately free-form. */
90
+ attributes?: Record<string, unknown>;
91
+ }
92
+ interface Message {
93
+ role: 'system' | 'user' | 'assistant' | 'tool';
94
+ content: string;
95
+ tokens?: number;
96
+ /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
97
+ images?: Array<{
98
+ artifactId?: string;
99
+ url?: string;
100
+ mime?: string;
101
+ }>;
102
+ }
103
+ interface LlmSpan extends SpanBase {
104
+ kind: 'llm';
105
+ model: string;
106
+ messages: Message[];
107
+ output?: string;
108
+ inputTokens?: number;
109
+ outputTokens?: number;
110
+ cachedTokens?: number;
111
+ reasoningTokens?: number;
112
+ costUsd?: number;
113
+ finishReason?: string;
114
+ }
115
+ interface ToolSpan extends SpanBase {
116
+ kind: 'tool';
117
+ toolName: string;
118
+ args: unknown;
119
+ result?: unknown;
120
+ latencyMs?: number;
121
+ }
122
+ interface RetrievalSpan extends SpanBase {
123
+ kind: 'retrieval';
124
+ query: string;
125
+ hits: Array<{
126
+ docId: string;
127
+ score: number;
128
+ content?: string;
129
+ }>;
130
+ }
131
+ interface JudgeSpan extends SpanBase {
132
+ kind: 'judge';
133
+ judgeId: string;
134
+ /** Span this judgment applies to. */
135
+ targetSpanId: string;
136
+ dimension: string;
137
+ /** Numeric score (unbounded range; interpretation is up to the judge). */
138
+ score: number;
139
+ rationale?: string;
140
+ evidence?: string;
141
+ }
142
+ interface SandboxSpan extends SpanBase {
143
+ kind: 'sandbox';
144
+ image?: string;
145
+ command?: string;
146
+ exitCode?: number;
147
+ testsTotal?: number;
148
+ testsPassed?: number;
149
+ stdoutHash?: string;
150
+ stderrHash?: string;
151
+ /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
152
+ wallMs?: number;
153
+ }
154
+ interface GenericSpan extends SpanBase {
155
+ kind: 'agent' | 'custom';
156
+ }
157
+ type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
158
+ type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
159
+ interface TraceEvent {
160
+ eventId: string;
161
+ runId: string;
162
+ spanId?: string;
163
+ kind: EventKind;
164
+ timestamp: number;
165
+ payload: Record<string, unknown>;
166
+ }
167
+ interface BudgetLedgerEntry {
168
+ runId: string;
169
+ dimension: keyof BudgetSpec;
170
+ limit: number;
171
+ consumed: number;
172
+ remaining: number;
173
+ timestamp: number;
174
+ breached: boolean;
175
+ /** Span that triggered this entry, if any. */
176
+ spanId?: string;
177
+ }
178
+ interface Artifact {
179
+ artifactId: string;
180
+ runId: string;
181
+ spanId?: string;
182
+ contentType: string;
183
+ sizeBytes: number;
184
+ /** sha256 in hex. */
185
+ hash: string;
186
+ /** External storage URL (R2, S3, filesystem path). */
187
+ storageUrl?: string;
188
+ /** Inline content for small blobs — keep under ~64KB. */
189
+ inlineContent?: string;
190
+ }
191
+ type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'missing_user_data' | 'missing_domain_data' | 'missing_codebase_context' | 'missing_runtime_context' | 'missing_credentials' | 'missing_integration_connection' | 'missing_integration_scope' | 'integration_approval_required' | 'integration_auth_expired' | 'integration_provider_failure' | 'bad_integration_manifest' | 'unsafe_integration_write_denied' | 'stale_external_data' | 'bad_retrieval' | 'insufficient_evidence' | 'contradictory_evidence' | 'ambiguous_user_intent' | 'knowledge_readiness_blocked' | 'unknown';
192
+ declare const FAILURE_CLASSES: readonly FailureClass[];
193
+ declare function isLlmSpan(s: Span): s is LlmSpan;
194
+ declare function isToolSpan(s: Span): s is ToolSpan;
195
+ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
196
+ declare function isJudgeSpan(s: Span): s is JudgeSpan;
197
+ declare function isSandboxSpan(s: Span): s is SandboxSpan;
198
+
199
+ interface RunFilter {
200
+ scenarioId?: string;
201
+ variantId?: string;
202
+ status?: RunStatus;
203
+ since?: number;
204
+ until?: number;
205
+ tag?: {
206
+ key: string;
207
+ value: string;
208
+ };
209
+ parentRunId?: string;
210
+ projectId?: string;
211
+ chatId?: string;
212
+ layer?: RunLayer;
213
+ }
214
+ interface SpanFilter {
215
+ runId?: string;
216
+ parentSpanId?: string;
217
+ kind?: SpanKind;
218
+ name?: string;
219
+ toolName?: string;
220
+ judgeId?: string;
221
+ since?: number;
222
+ until?: number;
223
+ }
224
+ interface EventFilter {
225
+ runId?: string;
226
+ spanId?: string;
227
+ kind?: EventKind;
228
+ since?: number;
229
+ until?: number;
230
+ }
231
+ interface TraceStore {
232
+ appendRun(run: Run): Promise<void>;
233
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
234
+ appendSpan(span: Span): Promise<void>;
235
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
236
+ appendEvent(event: TraceEvent): Promise<void>;
237
+ appendArtifact(artifact: Artifact): Promise<void>;
238
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
239
+ getRun(runId: string): Promise<Run | undefined>;
240
+ listRuns(filter?: RunFilter): Promise<Run[]>;
241
+ spans(filter?: SpanFilter): Promise<Span[]>;
242
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
243
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
244
+ artifacts(runId: string): Promise<Artifact[]>;
245
+ }
246
+ declare class InMemoryTraceStore implements TraceStore {
247
+ private runs;
248
+ private allSpans;
249
+ private allEvents;
250
+ private allArtifacts;
251
+ private allBudget;
252
+ appendRun(run: Run): Promise<void>;
253
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
254
+ appendSpan(span: Span): Promise<void>;
255
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
256
+ appendEvent(event: TraceEvent): Promise<void>;
257
+ appendArtifact(artifact: Artifact): Promise<void>;
258
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
259
+ getRun(runId: string): Promise<Run | undefined>;
260
+ listRuns(filter?: RunFilter): Promise<Run[]>;
261
+ spans(filter?: SpanFilter): Promise<Span[]>;
262
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
263
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
264
+ artifacts(runId: string): Promise<Artifact[]>;
265
+ }
266
+ interface FileSystemTraceStoreOptions {
267
+ dir: string;
268
+ /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
269
+ maxBytes?: number;
270
+ }
271
+ declare class FileSystemTraceStore implements TraceStore {
272
+ private dir;
273
+ private maxBytes;
274
+ /** Lazy in-memory index for queries — populated on first read. */
275
+ private index?;
276
+ private loaded;
277
+ constructor(options: FileSystemTraceStoreOptions);
278
+ private ensureDir;
279
+ private append;
280
+ private insertInto;
281
+ private load;
282
+ appendRun(run: Run): Promise<void>;
283
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
284
+ appendSpan(span: Span): Promise<void>;
285
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
286
+ appendEvent(event: TraceEvent): Promise<void>;
287
+ appendArtifact(artifact: Artifact): Promise<void>;
288
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
289
+ getRun(runId: string): Promise<Run | undefined>;
290
+ listRuns(filter?: RunFilter): Promise<Run[]>;
291
+ spans(filter?: SpanFilter): Promise<Span[]>;
292
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
293
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
294
+ artifacts(runId: string): Promise<Artifact[]>;
295
+ }
296
+
297
+ /**
298
+ * TraceEmitter — hierarchical span builder that auto-parents using an
299
+ * internal stack. One emitter per Run; emitters do NOT share state.
300
+ *
301
+ * Convenience methods (`llm`, `tool`, `retrieval`, `sandbox`) return a
302
+ * `SpanHandle` with `.end()` / `.fail()` so callers don't have to thread
303
+ * spanIds manually; `recordJudge` instead returns the recorded `JudgeSpan`
304
+ * directly. For async workflows that can't use the stack (e.g. fan-out
305
+ * parallel calls), pass `parentSpanId` explicitly.
306
+ */
307
+
308
+ interface SpanHandle<S extends Span = Span> {
309
+ span: S;
310
+ end(patch?: Partial<S>): Promise<void>;
311
+ fail(error: string | Error, patch?: Partial<S>): Promise<void>;
312
+ }
313
+ interface TraceEmitterOptions {
314
+ runId?: string;
315
+ /** Inject a clock for deterministic tests. */
316
+ now?: () => number;
317
+ /** Inject an id generator for deterministic tests. */
318
+ id?: () => string;
319
+ }
320
+ declare class TraceEmitter {
321
+ private store;
322
+ private stack;
323
+ private _runId;
324
+ private now;
325
+ private id;
326
+ constructor(store: TraceStore, options?: TraceEmitterOptions);
327
+ get runId(): string;
328
+ /**
329
+ * Begin a Run.
330
+ *
331
+ * `scenarioId` is required on the persisted Run shape — every Run downstream
332
+ * gets a non-empty scenarioId so filters and aggregations stay simple — but
333
+ * the INPUT here accepts it as optional. When omitted, startRun substitutes
334
+ * a sensible default (`run.layer ?? run.tags?.['kind'] ?? 'runtime'`) so
335
+ * runtime / operator / meta-eval runs that have no curated-scenario corpus
336
+ * to anchor to don't have to invent placeholder strings at the call site.
337
+ */
338
+ startRun(run: Omit<Run, 'runId' | 'scenarioId' | 'startedAt' | 'status'> & {
339
+ scenarioId?: string;
340
+ }): Promise<Run>;
341
+ endRun(outcome?: RunOutcome): Promise<void>;
342
+ abortRun(reason: string): Promise<void>;
343
+ span<S extends Span = Span>(init: {
344
+ kind: SpanKind;
345
+ name: string;
346
+ parentSpanId?: string;
347
+ attributes?: Record<string, unknown>;
348
+ } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
349
+ private handle;
350
+ private pop;
351
+ llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
352
+ tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
353
+ retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
354
+ recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
355
+ sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
356
+ emit(event: {
357
+ kind: EventKind;
358
+ spanId?: string;
359
+ payload?: Record<string, unknown>;
360
+ }): Promise<TraceEvent>;
361
+ recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
362
+ timestamp?: number;
363
+ }): Promise<BudgetLedgerEntry>;
364
+ recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
365
+ /**
366
+ * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
367
+ * Returns the fn's return value. Use this for the 95% case.
368
+ */
369
+ within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
370
+ }
371
+ /** Helper that builds the init object accepted by `TraceEmitter.llm` from a provider-shaped response. */
372
+ declare function llmSpanFromProvider(args: {
373
+ name?: string;
374
+ model: string;
375
+ messages: Message[];
376
+ output: string;
377
+ usage?: {
378
+ inputTokens?: number;
379
+ outputTokens?: number;
380
+ cachedTokens?: number;
381
+ reasoningTokens?: number;
382
+ };
383
+ costUsd?: number;
384
+ finishReason?: string;
385
+ }): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
386
+
387
+ export { type Artifact as A, type BudgetLedgerEntry as B, type RunOutcome as C, type EventFilter as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, TraceEmitter as T, type TraceStore as a, type TraceEvent as b, type BudgetSpec as c, type ToolSpan as d, type RunFilter as e, type EventKind as f, FAILURE_CLASSES as g, FileSystemTraceStore as h, type FileSystemTraceStoreOptions as i, type RetrievalSpan as j, type RunLayer as k, type RunStatus as l, type SandboxSpan as m, type SpanBase as n, type SpanFilter as o, type SpanHandle as p, type SpanKind as q, type SpanStatus as r, TRACE_SCHEMA_VERSION as s, type TraceEmitterOptions as t, isJudgeSpan as u, isLlmSpan as v, isRetrievalSpan as w, isSandboxSpan as x, isToolSpan as y, llmSpanFromProvider as z };