@tangle-network/agent-eval 0.36.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/traces.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
2
2
  import { R as RawProviderSink, f as RawProviderEvent } from './integrity-DYR5gWlb.js';
3
3
  export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DYR5gWlb.js';
4
- import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
4
+ import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DP_cSSiw.js';
5
5
  export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
6
6
  import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
7
  export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
@@ -65,6 +65,85 @@ interface OtlpExport {
65
65
  /** Export a single run's spans + events in OTLP/JSON. */
66
66
  declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
67
67
 
68
+ /**
69
+ * OTEL span exporter — streams spans to an OTLP/HTTP collector.
70
+ *
71
+ * Reads OTEL_EXPORTER_OTLP_ENDPOINT + OTEL_EXPORTER_OTLP_HEADERS from env
72
+ * when no explicit config is given. Batches spans and flushes periodically
73
+ * or when the batch fills. No @opentelemetry SDK dependency — minimal
74
+ * OTLP/JSON serializer (~120 LOC) using the existing otel.ts helpers.
75
+ */
76
+ interface OtelExportConfig {
77
+ /** OTLP endpoint. Reads OTEL_EXPORTER_OTLP_ENDPOINT env by default. */
78
+ endpoint?: string;
79
+ /** OTLP headers. Reads OTEL_EXPORTER_OTLP_HEADERS env by default. */
80
+ headers?: Record<string, string>;
81
+ /** Batch size before flush. Default 64. */
82
+ batchSize?: number;
83
+ /** Flush interval ms. Default 5000. */
84
+ flushIntervalMs?: number;
85
+ /** Resource attributes stamped on every export. */
86
+ resourceAttributes?: Record<string, string | number | boolean>;
87
+ /** Service name. Default 'agent-eval'. */
88
+ serviceName?: string;
89
+ }
90
+ interface OtelExporter {
91
+ /** Called by the TraceEmitter on every span close. */
92
+ exportSpan(span: ExportableSpan): void;
93
+ /** Force flush pending spans. */
94
+ flush(): Promise<void>;
95
+ /** Shutdown cleanly — flushes remaining spans and stops the timer. */
96
+ shutdown(): Promise<void>;
97
+ }
98
+ interface ExportableSpan {
99
+ traceId: string;
100
+ spanId: string;
101
+ parentSpanId?: string;
102
+ name: string;
103
+ kind: string;
104
+ startedAt: number;
105
+ endedAt?: number;
106
+ status?: string;
107
+ error?: string;
108
+ model?: string;
109
+ inputTokens?: number;
110
+ outputTokens?: number;
111
+ costUsd?: number;
112
+ attributes?: Record<string, unknown>;
113
+ }
114
+ /**
115
+ * Create an OTEL exporter. Returns undefined when no endpoint is configured
116
+ * (neither via config nor env) — callers should check before attaching.
117
+ */
118
+ declare function createOtelExporter(config?: OtelExportConfig): OtelExporter | undefined;
119
+
120
+ /**
121
+ * OTEL bridge — connects TraceEmitter span lifecycle to the OtelExporter.
122
+ *
123
+ * When an OtelExporter is active, every span that closes through the
124
+ * TraceEmitter is also pushed to the exporter for real-time streaming to
125
+ * the user's OTEL collector.
126
+ *
127
+ * The bridge is opt-in: attach via `otelRunCompleteHook(exporter)` as a
128
+ * RunCompleteHook, or wrap the store with `createOtelTracingStore` for
129
+ * real-time per-span export.
130
+ */
131
+
132
+ /**
133
+ * Create a RunCompleteHook that exports all spans from the completed run
134
+ * to the OTEL exporter, then flushes.
135
+ */
136
+ declare function otelRunCompleteHook(exporter: OtelExporter): RunCompleteHook;
137
+ /**
138
+ * Create an auto-exporting TraceStore wrapper that intercepts updateSpan
139
+ * calls. When a span gets an endedAt, it's exported immediately. This
140
+ * gives real-time streaming instead of batch-at-end.
141
+ *
142
+ * This is the preferred integration path: wrap the store before
143
+ * constructing the TraceEmitter.
144
+ */
145
+ declare function createOtelTracingStore(inner: TraceStore, exporter: OtelExporter, traceId: string): TraceStore;
146
+
68
147
  /**
69
148
  * Redaction — remove PII / secrets from trace payloads before persist.
70
149
  *
@@ -807,4 +886,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
807
886
  spanId?: string;
808
887
  }): AsyncGenerator<ReplayCacheEntry>;
809
888
 
810
- export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
889
+ export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
package/dist/traces.js CHANGED
@@ -19,6 +19,8 @@ import {
19
19
  buildTraceAnalystTools,
20
20
  buildTraceInsightContext,
21
21
  buildTraceInsightPrompt,
22
+ createOtelExporter,
23
+ createOtelTracingStore,
22
24
  createReplayFetch,
23
25
  defaultTraceInsightPanel,
24
26
  describeTraceInsightScope,
@@ -26,6 +28,7 @@ import {
26
28
  exportRunAsOtlp,
27
29
  inferDomainKeywords,
28
30
  iterateRawCalls,
31
+ otelRunCompleteHook,
29
32
  planTraceInsightQuestions,
30
33
  redactString,
31
34
  redactValue,
@@ -33,7 +36,7 @@ import {
33
36
  tokenizeDomainWords,
34
37
  traceAnalystFunctionGroup,
35
38
  traceAnalystOnRunComplete
36
- } from "./chunk-HIO4UIS5.js";
39
+ } from "./chunk-L5UNCDAJ.js";
37
40
  import {
38
41
  aggregateLlm,
39
42
  argHash,
@@ -103,6 +106,8 @@ export {
103
106
  buildTraceAnalystTools,
104
107
  buildTraceInsightContext,
105
108
  buildTraceInsightPrompt,
109
+ createOtelExporter,
110
+ createOtelTracingStore,
106
111
  createReplayFetch,
107
112
  defaultProviderRedactor,
108
113
  defaultTraceInsightPanel,
@@ -120,6 +125,7 @@ export {
120
125
  judgeSpans,
121
126
  llmSpanFromProvider,
122
127
  llmSpans,
128
+ otelRunCompleteHook,
123
129
  planTraceInsightQuestions,
124
130
  providerFromBaseUrl,
125
131
  redactString,
@@ -0,0 +1,110 @@
1
+ import { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
2
+
3
+ /**
4
+ * @experimental
5
+ *
6
+ * N-axis cartesian matrix over substrate types — types module.
7
+ *
8
+ * The matrix is a runner + aggregator. It iterates the cartesian product of
9
+ * caller-provided axes (any value type — `AgentProfile` from sandbox, `Driver`
10
+ * / `Validator` from agent-runtime, rubric records, thinking levels, anything)
11
+ * and aggregates per-axis pass/score/cost summaries. Substrate types are
12
+ * imported at the boundary by JSDoc only; the matrix never wraps them.
13
+ */
14
+
15
+ /** One axis = one dimension to iterate. `V` is the value type — pass any
16
+ * substrate type (AgentProfile, Driver, Validator, rubric record). */
17
+ interface MatrixAxis<V> {
18
+ /** Axis name. Becomes the key in `MatrixResult.byAxis`. */
19
+ name: string;
20
+ /** Stable id per value. Used as the bucket key in aggregation. */
21
+ values: Array<{
22
+ id: string;
23
+ value: V;
24
+ }>;
25
+ /** Optional bucket label override. Receives the same `(value, id)` the
26
+ * runner stored on the cell; default label is `id`. */
27
+ label?: (value: V, id: string) => string;
28
+ }
29
+ /** A cell carries one picked value from each axis, keyed by axis name. */
30
+ interface MatrixCell {
31
+ axes: Record<string, {
32
+ id: string;
33
+ value: unknown;
34
+ }>;
35
+ /** 0-based replicate index within the same axis combination. */
36
+ rep: number;
37
+ /** Stable sort key — preserves cartesian order across concurrent execution. */
38
+ ordinal: number;
39
+ }
40
+ interface CellResult<Output> {
41
+ output: Output;
42
+ verdict: DefaultVerdict;
43
+ costUsd: number;
44
+ durationMs: number;
45
+ runId?: string;
46
+ /** Populated when `runCell` threw. The cell contributes 0 to passRate AND
47
+ * meanScore regardless of `verdict`. */
48
+ error?: {
49
+ message: string;
50
+ kind: string;
51
+ };
52
+ }
53
+ interface AxisSummary {
54
+ axisName: string;
55
+ axisValue: string;
56
+ cells: number;
57
+ passRate: number;
58
+ meanScore: number;
59
+ p50Score: number;
60
+ p90Score: number;
61
+ totalCostUsd: number;
62
+ meanDurationMs: number;
63
+ }
64
+ interface MatrixResult<Output> {
65
+ cells: Array<{
66
+ cell: MatrixCell;
67
+ runs: CellResult<Output>[];
68
+ }>;
69
+ /** `byAxis[axisName][axisValueId] = summary`. Populated only for axes
70
+ * named in `aggregateBy` (default = every axis in `axes`). */
71
+ byAxis: Record<string, Record<string, AxisSummary>>;
72
+ summary: {
73
+ totalCells: number;
74
+ runsExecuted: number;
75
+ /** Cells removed by `filter` plus cells unscheduled after the cost
76
+ * ceiling or abort signal tripped. */
77
+ cellsSkipped: number;
78
+ overallPassRate: number;
79
+ overallMeanScore: number;
80
+ totalCostUsd: number;
81
+ durationMs: number;
82
+ };
83
+ /** Stable id-like string generated at the end of the run. */
84
+ matrixId: string;
85
+ }
86
+ interface RunAgentMatrixOptions<Output> {
87
+ axes: MatrixAxis<unknown>[];
88
+ /** User-supplied cell executor. May throw; the matrix captures throws as
89
+ * `CellResult.error` and continues. */
90
+ runCell: (cell: MatrixCell) => Promise<CellResult<Output>>;
91
+ /** Replicates per cell. Default 1. */
92
+ reps?: number;
93
+ /** Prune cells from the cartesian BEFORE rep expansion. */
94
+ filter?: (cell: Omit<MatrixCell, 'rep' | 'ordinal'>) => boolean;
95
+ /** Axes to aggregate into `byAxis`. Default: every axis in `axes`. */
96
+ aggregateBy?: string[];
97
+ /** Max concurrent in-flight `runCell` invocations. Default 4. */
98
+ maxConcurrency?: number;
99
+ /** Cumulative-cost abort threshold (USD). When the running sum of
100
+ * `result.costUsd` crosses this value, no new cells are scheduled.
101
+ * In-flight cells finish. Default `Infinity`. */
102
+ costCeiling?: number;
103
+ /** Fires once per executed cell, after its promise settles. */
104
+ onCellComplete?: (cell: MatrixCell, result: CellResult<Output>) => void;
105
+ /** External cancellation. Aborts in-flight cells via a forwarded signal
106
+ * and suppresses scheduling of new ones. */
107
+ signal?: AbortSignal;
108
+ }
109
+
110
+ export type { AxisSummary as A, CellResult as C, MatrixResult as M, RunAgentMatrixOptions as R, MatrixAxis as a, MatrixCell as b };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.36.0",
3
+ "version": "0.38.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -99,6 +99,11 @@
99
99
  "import": "./dist/matrix/index.js",
100
100
  "default": "./dist/matrix/index.js"
101
101
  },
102
+ "./multishot": {
103
+ "types": "./dist/multishot/index.d.ts",
104
+ "import": "./dist/multishot/index.js",
105
+ "default": "./dist/multishot/index.js"
106
+ },
102
107
  "./openapi.json": {
103
108
  "default": "./dist/openapi.json"
104
109
  }