@tangle-network/agent-eval 0.36.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-HIO4UIS5.js → chunk-L5UNCDAJ.js} +207 -1
- package/dist/chunk-L5UNCDAJ.js.map +1 -0
- package/dist/chunk-QWV226SL.js +276 -0
- package/dist/chunk-QWV226SL.js.map +1 -0
- package/dist/index.d.ts +111 -3
- package/dist/index.js +204 -1
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -109
- package/dist/matrix/index.js +5 -270
- package/dist/matrix/index.js.map +1 -1
- package/dist/multishot/index.d.ts +276 -0
- package/dist/multishot/index.js +467 -0
- package/dist/multishot/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/traces.d.ts +81 -2
- package/dist/traces.js +7 -1
- package/dist/types-DHqkLwEU.d.ts +110 -0
- package/package.json +6 -1
- package/dist/chunk-HIO4UIS5.js.map +0 -1
package/dist/traces.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
|
|
2
2
|
import { R as RawProviderSink, f as RawProviderEvent } from './integrity-DYR5gWlb.js';
|
|
3
3
|
export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DYR5gWlb.js';
|
|
4
|
-
import {
|
|
4
|
+
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DP_cSSiw.js';
|
|
5
5
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
|
|
6
6
|
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
7
|
export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
|
|
@@ -65,6 +65,85 @@ interface OtlpExport {
|
|
|
65
65
|
/** Export a single run's spans + events in OTLP/JSON. */
|
|
66
66
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
67
67
|
|
|
68
|
+
/**
|
|
69
|
+
* OTEL span exporter — streams spans to an OTLP/HTTP collector.
|
|
70
|
+
*
|
|
71
|
+
* Reads OTEL_EXPORTER_OTLP_ENDPOINT + OTEL_EXPORTER_OTLP_HEADERS from env
|
|
72
|
+
* when no explicit config is given. Batches spans and flushes periodically
|
|
73
|
+
* or when the batch fills. No @opentelemetry SDK dependency — minimal
|
|
74
|
+
* OTLP/JSON serializer (~120 LOC) using the existing otel.ts helpers.
|
|
75
|
+
*/
|
|
76
|
+
interface OtelExportConfig {
|
|
77
|
+
/** OTLP endpoint. Reads OTEL_EXPORTER_OTLP_ENDPOINT env by default. */
|
|
78
|
+
endpoint?: string;
|
|
79
|
+
/** OTLP headers. Reads OTEL_EXPORTER_OTLP_HEADERS env by default. */
|
|
80
|
+
headers?: Record<string, string>;
|
|
81
|
+
/** Batch size before flush. Default 64. */
|
|
82
|
+
batchSize?: number;
|
|
83
|
+
/** Flush interval ms. Default 5000. */
|
|
84
|
+
flushIntervalMs?: number;
|
|
85
|
+
/** Resource attributes stamped on every export. */
|
|
86
|
+
resourceAttributes?: Record<string, string | number | boolean>;
|
|
87
|
+
/** Service name. Default 'agent-eval'. */
|
|
88
|
+
serviceName?: string;
|
|
89
|
+
}
|
|
90
|
+
interface OtelExporter {
|
|
91
|
+
/** Called by the TraceEmitter on every span close. */
|
|
92
|
+
exportSpan(span: ExportableSpan): void;
|
|
93
|
+
/** Force flush pending spans. */
|
|
94
|
+
flush(): Promise<void>;
|
|
95
|
+
/** Shutdown cleanly — flushes remaining spans and stops the timer. */
|
|
96
|
+
shutdown(): Promise<void>;
|
|
97
|
+
}
|
|
98
|
+
interface ExportableSpan {
|
|
99
|
+
traceId: string;
|
|
100
|
+
spanId: string;
|
|
101
|
+
parentSpanId?: string;
|
|
102
|
+
name: string;
|
|
103
|
+
kind: string;
|
|
104
|
+
startedAt: number;
|
|
105
|
+
endedAt?: number;
|
|
106
|
+
status?: string;
|
|
107
|
+
error?: string;
|
|
108
|
+
model?: string;
|
|
109
|
+
inputTokens?: number;
|
|
110
|
+
outputTokens?: number;
|
|
111
|
+
costUsd?: number;
|
|
112
|
+
attributes?: Record<string, unknown>;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Create an OTEL exporter. Returns undefined when no endpoint is configured
|
|
116
|
+
* (neither via config nor env) — callers should check before attaching.
|
|
117
|
+
*/
|
|
118
|
+
declare function createOtelExporter(config?: OtelExportConfig): OtelExporter | undefined;
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* OTEL bridge — connects TraceEmitter span lifecycle to the OtelExporter.
|
|
122
|
+
*
|
|
123
|
+
* When an OtelExporter is active, every span that closes through the
|
|
124
|
+
* TraceEmitter is also pushed to the exporter for real-time streaming to
|
|
125
|
+
* the user's OTEL collector.
|
|
126
|
+
*
|
|
127
|
+
* The bridge is opt-in: attach via `otelRunCompleteHook(exporter)` as a
|
|
128
|
+
* RunCompleteHook, or wrap the store with `createOtelTracingStore` for
|
|
129
|
+
* real-time per-span export.
|
|
130
|
+
*/
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Create a RunCompleteHook that exports all spans from the completed run
|
|
134
|
+
* to the OTEL exporter, then flushes.
|
|
135
|
+
*/
|
|
136
|
+
declare function otelRunCompleteHook(exporter: OtelExporter): RunCompleteHook;
|
|
137
|
+
/**
|
|
138
|
+
* Create an auto-exporting TraceStore wrapper that intercepts updateSpan
|
|
139
|
+
* calls. When a span gets an endedAt, it's exported immediately. This
|
|
140
|
+
* gives real-time streaming instead of batch-at-end.
|
|
141
|
+
*
|
|
142
|
+
* This is the preferred integration path: wrap the store before
|
|
143
|
+
* constructing the TraceEmitter.
|
|
144
|
+
*/
|
|
145
|
+
declare function createOtelTracingStore(inner: TraceStore, exporter: OtelExporter, traceId: string): TraceStore;
|
|
146
|
+
|
|
68
147
|
/**
|
|
69
148
|
* Redaction — remove PII / secrets from trace payloads before persist.
|
|
70
149
|
*
|
|
@@ -807,4 +886,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
|
|
|
807
886
|
spanId?: string;
|
|
808
887
|
}): AsyncGenerator<ReplayCacheEntry>;
|
|
809
888
|
|
|
810
|
-
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
|
889
|
+
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
package/dist/traces.js
CHANGED
|
@@ -19,6 +19,8 @@ import {
|
|
|
19
19
|
buildTraceAnalystTools,
|
|
20
20
|
buildTraceInsightContext,
|
|
21
21
|
buildTraceInsightPrompt,
|
|
22
|
+
createOtelExporter,
|
|
23
|
+
createOtelTracingStore,
|
|
22
24
|
createReplayFetch,
|
|
23
25
|
defaultTraceInsightPanel,
|
|
24
26
|
describeTraceInsightScope,
|
|
@@ -26,6 +28,7 @@ import {
|
|
|
26
28
|
exportRunAsOtlp,
|
|
27
29
|
inferDomainKeywords,
|
|
28
30
|
iterateRawCalls,
|
|
31
|
+
otelRunCompleteHook,
|
|
29
32
|
planTraceInsightQuestions,
|
|
30
33
|
redactString,
|
|
31
34
|
redactValue,
|
|
@@ -33,7 +36,7 @@ import {
|
|
|
33
36
|
tokenizeDomainWords,
|
|
34
37
|
traceAnalystFunctionGroup,
|
|
35
38
|
traceAnalystOnRunComplete
|
|
36
|
-
} from "./chunk-HIO4UIS5.js";
|
|
39
|
+
} from "./chunk-L5UNCDAJ.js";
|
|
37
40
|
import {
|
|
38
41
|
aggregateLlm,
|
|
39
42
|
argHash,
|
|
@@ -103,6 +106,8 @@ export {
|
|
|
103
106
|
buildTraceAnalystTools,
|
|
104
107
|
buildTraceInsightContext,
|
|
105
108
|
buildTraceInsightPrompt,
|
|
109
|
+
createOtelExporter,
|
|
110
|
+
createOtelTracingStore,
|
|
106
111
|
createReplayFetch,
|
|
107
112
|
defaultProviderRedactor,
|
|
108
113
|
defaultTraceInsightPanel,
|
|
@@ -120,6 +125,7 @@ export {
|
|
|
120
125
|
judgeSpans,
|
|
121
126
|
llmSpanFromProvider,
|
|
122
127
|
llmSpans,
|
|
128
|
+
otelRunCompleteHook,
|
|
123
129
|
planTraceInsightQuestions,
|
|
124
130
|
providerFromBaseUrl,
|
|
125
131
|
redactString,
|
|
package/dist/types-DHqkLwEU.d.ts
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { DefaultVerdict } from '@tangle-network/agent-runtime/loops';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @experimental
|
|
5
|
+
*
|
|
6
|
+
* N-axis cartesian matrix over substrate types — types module.
|
|
7
|
+
*
|
|
8
|
+
* The matrix is a runner + aggregator. It iterates the cartesian product of
|
|
9
|
+
* caller-provided axes (any value type — `AgentProfile` from sandbox, `Driver`
|
|
10
|
+
* / `Validator` from agent-runtime, rubric records, thinking levels, anything)
|
|
11
|
+
* and aggregates per-axis pass/score/cost summaries. Substrate types are
|
|
12
|
+
* imported at the boundary by JSDoc only; the matrix never wraps them.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/** One axis = one dimension to iterate. `V` is the value type — pass any
|
|
16
|
+
* substrate type (AgentProfile, Driver, Validator, rubric record). */
|
|
17
|
+
interface MatrixAxis<V> {
|
|
18
|
+
/** Axis name. Becomes the key in `MatrixResult.byAxis`. */
|
|
19
|
+
name: string;
|
|
20
|
+
/** Stable id per value. Used as the bucket key in aggregation. */
|
|
21
|
+
values: Array<{
|
|
22
|
+
id: string;
|
|
23
|
+
value: V;
|
|
24
|
+
}>;
|
|
25
|
+
/** Optional bucket label override. Receives the same `(value, id)` the
|
|
26
|
+
* runner stored on the cell; default label is `id`. */
|
|
27
|
+
label?: (value: V, id: string) => string;
|
|
28
|
+
}
|
|
29
|
+
/** A cell carries one picked value from each axis, keyed by axis name. */
|
|
30
|
+
interface MatrixCell {
|
|
31
|
+
axes: Record<string, {
|
|
32
|
+
id: string;
|
|
33
|
+
value: unknown;
|
|
34
|
+
}>;
|
|
35
|
+
/** 0-based replicate index within the same axis combination. */
|
|
36
|
+
rep: number;
|
|
37
|
+
/** Stable sort key — preserves cartesian order across concurrent execution. */
|
|
38
|
+
ordinal: number;
|
|
39
|
+
}
|
|
40
|
+
interface CellResult<Output> {
|
|
41
|
+
output: Output;
|
|
42
|
+
verdict: DefaultVerdict;
|
|
43
|
+
costUsd: number;
|
|
44
|
+
durationMs: number;
|
|
45
|
+
runId?: string;
|
|
46
|
+
/** Populated when `runCell` threw. The cell contributes 0 to passRate AND
|
|
47
|
+
* meanScore regardless of `verdict`. */
|
|
48
|
+
error?: {
|
|
49
|
+
message: string;
|
|
50
|
+
kind: string;
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
interface AxisSummary {
|
|
54
|
+
axisName: string;
|
|
55
|
+
axisValue: string;
|
|
56
|
+
cells: number;
|
|
57
|
+
passRate: number;
|
|
58
|
+
meanScore: number;
|
|
59
|
+
p50Score: number;
|
|
60
|
+
p90Score: number;
|
|
61
|
+
totalCostUsd: number;
|
|
62
|
+
meanDurationMs: number;
|
|
63
|
+
}
|
|
64
|
+
interface MatrixResult<Output> {
|
|
65
|
+
cells: Array<{
|
|
66
|
+
cell: MatrixCell;
|
|
67
|
+
runs: CellResult<Output>[];
|
|
68
|
+
}>;
|
|
69
|
+
/** `byAxis[axisName][axisValueId] = summary`. Populated only for axes
|
|
70
|
+
* named in `aggregateBy` (default = every axis in `axes`). */
|
|
71
|
+
byAxis: Record<string, Record<string, AxisSummary>>;
|
|
72
|
+
summary: {
|
|
73
|
+
totalCells: number;
|
|
74
|
+
runsExecuted: number;
|
|
75
|
+
/** Cells removed by `filter` plus cells unscheduled after the cost
|
|
76
|
+
* ceiling or abort signal tripped. */
|
|
77
|
+
cellsSkipped: number;
|
|
78
|
+
overallPassRate: number;
|
|
79
|
+
overallMeanScore: number;
|
|
80
|
+
totalCostUsd: number;
|
|
81
|
+
durationMs: number;
|
|
82
|
+
};
|
|
83
|
+
/** Stable id-like string generated at the end of the run. */
|
|
84
|
+
matrixId: string;
|
|
85
|
+
}
|
|
86
|
+
interface RunAgentMatrixOptions<Output> {
|
|
87
|
+
axes: MatrixAxis<unknown>[];
|
|
88
|
+
/** User-supplied cell executor. May throw; the matrix captures throws as
|
|
89
|
+
* `CellResult.error` and continues. */
|
|
90
|
+
runCell: (cell: MatrixCell) => Promise<CellResult<Output>>;
|
|
91
|
+
/** Replicates per cell. Default 1. */
|
|
92
|
+
reps?: number;
|
|
93
|
+
/** Prune cells from the cartesian BEFORE rep expansion. */
|
|
94
|
+
filter?: (cell: Omit<MatrixCell, 'rep' | 'ordinal'>) => boolean;
|
|
95
|
+
/** Axes to aggregate into `byAxis`. Default: every axis in `axes`. */
|
|
96
|
+
aggregateBy?: string[];
|
|
97
|
+
/** Max concurrent in-flight `runCell` invocations. Default 4. */
|
|
98
|
+
maxConcurrency?: number;
|
|
99
|
+
/** Cumulative-cost abort threshold (USD). When the running sum of
|
|
100
|
+
* `result.costUsd` crosses this value, no new cells are scheduled.
|
|
101
|
+
* In-flight cells finish. Default `Infinity`. */
|
|
102
|
+
costCeiling?: number;
|
|
103
|
+
/** Fires once per executed cell, after its promise settles. */
|
|
104
|
+
onCellComplete?: (cell: MatrixCell, result: CellResult<Output>) => void;
|
|
105
|
+
/** External cancellation. Aborts in-flight cells via a forwarded signal
|
|
106
|
+
* and suppresses scheduling of new ones. */
|
|
107
|
+
signal?: AbortSignal;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
export type { AxisSummary as A, CellResult as C, MatrixResult as M, RunAgentMatrixOptions as R, MatrixAxis as a, MatrixCell as b };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.36.0",
|
|
3
|
+
"version": "0.38.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -99,6 +99,11 @@
|
|
|
99
99
|
"import": "./dist/matrix/index.js",
|
|
100
100
|
"default": "./dist/matrix/index.js"
|
|
101
101
|
},
|
|
102
|
+
"./multishot": {
|
|
103
|
+
"types": "./dist/multishot/index.d.ts",
|
|
104
|
+
"import": "./dist/multishot/index.js",
|
|
105
|
+
"default": "./dist/multishot/index.js"
|
|
106
|
+
},
|
|
102
107
|
"./openapi.json": {
|
|
103
108
|
"default": "./dist/openapi.json"
|
|
104
109
|
}
|