@tangle-network/agent-eval 0.58.2 → 0.59.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5GLYP2IQ.js → chunk-PIEAE33T.js} +71 -2
- package/dist/chunk-PIEAE33T.js.map +1 -0
- package/dist/contract/index.d.ts +2 -2
- package/dist/index.d.ts +4 -4
- package/dist/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{registry-BSWy0rvH.d.ts → registry-DK9kqXvb.d.ts} +1 -1
- package/dist/{store-CJbzDxZ2.d.ts → store-jzKpMl16.d.ts} +28 -0
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +1 -1
- package/package.json +3 -3
- package/dist/chunk-5GLYP2IQ.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -12,12 +12,12 @@ import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
|
12
12
|
export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
|
|
13
13
|
import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
|
|
14
14
|
export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
15
|
-
import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
|
|
16
|
-
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
|
|
15
|
+
import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
|
|
16
|
+
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
|
|
17
17
|
import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
|
|
18
18
|
export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
|
|
19
|
-
import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-BSWy0rvH.js';
|
|
20
|
-
export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BSWy0rvH.js';
|
|
19
|
+
import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-DK9kqXvb.js';
|
|
20
|
+
export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-DK9kqXvb.js';
|
|
21
21
|
import { TCloud } from '@tangle-network/tcloud';
|
|
22
22
|
import { z } from 'zod';
|
|
23
23
|
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
|
package/dist/index.js
CHANGED
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.59.1",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
|
|
2
2
|
import { R as RunRecord } from './run-record-etiCMsUq.js';
|
|
3
|
-
import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
|
|
3
|
+
import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
|
|
4
4
|
import { J as JudgeInput } from './types-DhqpAi_z.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
@@ -69,6 +69,29 @@ interface TraceAnalystFilters {
|
|
|
69
69
|
* expensive on large datasets. Use the indexed filters above first. */
|
|
70
70
|
regex_pattern?: string;
|
|
71
71
|
}
|
|
72
|
+
/** One distinct error signature across the dataset — the deterministic unit of
|
|
73
|
+
* failure coverage. Signatures normalize volatile tokens (digits, hex/uuids,
|
|
74
|
+
* paths, durations) out of the span `status_message` so semantically identical
|
|
75
|
+
* failures collapse into one cluster. An analyst that accounts for every
|
|
76
|
+
* cluster has, by construction, covered every distinct failure mode. */
|
|
77
|
+
interface ErrorCluster {
|
|
78
|
+
/** Normalized status_message — the cluster key. */
|
|
79
|
+
signature: string;
|
|
80
|
+
/** A verbatim, un-normalized exemplar message (for exact-string citation). */
|
|
81
|
+
status_message_sample: string;
|
|
82
|
+
/** The span name that most often carries this signature, if any. */
|
|
83
|
+
span_name: string | null;
|
|
84
|
+
/** The tool that most often carries this signature, if any. */
|
|
85
|
+
tool_name: string | null;
|
|
86
|
+
trace_count: number;
|
|
87
|
+
span_count: number;
|
|
88
|
+
/** trace_count / total error traces in the matched set (0..1). */
|
|
89
|
+
prevalence: number;
|
|
90
|
+
/** Real trace ids carrying this signature (capped), passable to view/search. */
|
|
91
|
+
exemplar_trace_ids: string[];
|
|
92
|
+
/** Real span ids carrying this signature (capped). */
|
|
93
|
+
exemplar_span_ids: string[];
|
|
94
|
+
}
|
|
72
95
|
interface DatasetOverview {
|
|
73
96
|
total_traces: number;
|
|
74
97
|
raw_jsonl_bytes: number;
|
|
@@ -82,6 +105,11 @@ interface DatasetOverview {
|
|
|
82
105
|
trace_count: number;
|
|
83
106
|
span_count: number;
|
|
84
107
|
};
|
|
108
|
+
/** The COMPLETE deterministic error-signature population, sorted by
|
|
109
|
+
* trace_count desc. This is the failure-coverage checklist: an analysis is
|
|
110
|
+
* complete only when every cluster here is accounted for. Empty when the
|
|
111
|
+
* matched set has no error spans. */
|
|
112
|
+
error_clusters: ErrorCluster[];
|
|
85
113
|
time_range: {
|
|
86
114
|
earliest: string;
|
|
87
115
|
latest: string;
|
package/dist/traces.d.ts
CHANGED
|
@@ -9,8 +9,8 @@ export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreO
|
|
|
9
9
|
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
|
|
10
10
|
export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
|
|
11
11
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
12
|
-
import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
|
|
13
|
-
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
|
|
12
|
+
import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-jzKpMl16.js';
|
|
13
|
+
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-jzKpMl16.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* `captureFetchToRawSink` — wrap a `fetch` so every request / response / error
|
package/dist/traces.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.59.1",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -153,7 +153,7 @@
|
|
|
153
153
|
"zod": "^4.3.6"
|
|
154
154
|
},
|
|
155
155
|
"peerDependencies": {
|
|
156
|
-
"@tangle-network/sandbox": ">=0.2.1 <0.
|
|
156
|
+
"@tangle-network/sandbox": ">=0.2.1 <0.5.0"
|
|
157
157
|
},
|
|
158
158
|
"peerDependenciesMeta": {
|
|
159
159
|
"@tangle-network/sandbox": {
|
|
@@ -162,7 +162,7 @@
|
|
|
162
162
|
},
|
|
163
163
|
"devDependencies": {
|
|
164
164
|
"@biomejs/biome": "^2.4.15",
|
|
165
|
-
"@tangle-network/sandbox": "0.
|
|
165
|
+
"@tangle-network/sandbox": "0.4.0",
|
|
166
166
|
"@types/node": "^25.6.0",
|
|
167
167
|
"husky": "^9.1.7",
|
|
168
168
|
"lint-staged": "^17.0.5",
|