@tangle-network/agent-eval 0.58.2 → 0.59.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -12,12 +12,12 @@ import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
12
12
  export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
13
13
  import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
14
14
  export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
15
- import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
16
- export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
15
+ import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
16
+ export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
17
17
  import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
18
18
  export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
19
- import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-BSWy0rvH.js';
20
- export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BSWy0rvH.js';
19
+ import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-DK9kqXvb.js';
20
+ export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-DK9kqXvb.js';
21
21
  import { TCloud } from '@tangle-network/tcloud';
22
22
  import { z } from 'zod';
23
23
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
package/dist/index.js CHANGED
@@ -189,7 +189,7 @@ import {
189
189
  tokenizeDomainWords,
190
190
  traceAnalystFunctionGroup,
191
191
  traceAnalystOnRunComplete
192
- } from "./chunk-5GLYP2IQ.js";
192
+ } from "./chunk-PIEAE33T.js";
193
193
  import {
194
194
  DEFAULT_REDACTION_RULES,
195
195
  REDACTION_VERSION,
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.58.2",
5
+ "version": "0.59.1",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,6 +1,6 @@
1
1
  import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
2
2
  import { R as RunRecord } from './run-record-etiCMsUq.js';
3
- import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
3
+ import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
4
4
  import { J as JudgeInput } from './types-DhqpAi_z.js';
5
5
 
6
6
  /**
@@ -69,6 +69,29 @@ interface TraceAnalystFilters {
69
69
  * expensive on large datasets. Use the indexed filters above first. */
70
70
  regex_pattern?: string;
71
71
  }
72
+ /** One distinct error signature across the dataset — the deterministic unit of
73
+ * failure coverage. Signatures normalize volatile tokens (digits, hex/uuids,
74
+ * paths, durations) out of the span `status_message` so semantically identical
75
+ * failures collapse into one cluster. An analyst that accounts for every
76
+ * cluster has, by construction, covered every distinct failure mode. */
77
+ interface ErrorCluster {
78
+ /** Normalized status_message — the cluster key. */
79
+ signature: string;
80
+ /** A verbatim, un-normalized exemplar message (for exact-string citation). */
81
+ status_message_sample: string;
82
+ /** The span name that most often carries this signature, if any. */
83
+ span_name: string | null;
84
+ /** The tool that most often carries this signature, if any. */
85
+ tool_name: string | null;
86
+ trace_count: number;
87
+ span_count: number;
88
+ /** trace_count / total error traces in the matched set (0..1). */
89
+ prevalence: number;
90
+ /** Real trace ids carrying this signature (capped), passable to view/search. */
91
+ exemplar_trace_ids: string[];
92
+ /** Real span ids carrying this signature (capped). */
93
+ exemplar_span_ids: string[];
94
+ }
72
95
  interface DatasetOverview {
73
96
  total_traces: number;
74
97
  raw_jsonl_bytes: number;
@@ -82,6 +105,11 @@ interface DatasetOverview {
82
105
  trace_count: number;
83
106
  span_count: number;
84
107
  };
108
+ /** The COMPLETE deterministic error-signature population, sorted by
109
+ * trace_count desc. This is the failure-coverage checklist: an analysis is
110
+ * complete only when every cluster here is accounted for. Empty when the
111
+ * matched set has no error spans. */
112
+ error_clusters: ErrorCluster[];
85
113
  time_range: {
86
114
  earliest: string;
87
115
  latest: string;
package/dist/traces.d.ts CHANGED
@@ -9,8 +9,8 @@ export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreO
9
9
  export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
10
10
  export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
11
11
  import { AxAIService, AxFunction } from '@ax-llm/ax';
12
- import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
13
- export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
12
+ import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-jzKpMl16.js';
13
+ export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-jzKpMl16.js';
14
14
 
15
15
  /**
16
16
  * `captureFetchToRawSink` — wrap a `fetch` so every request / response / error
package/dist/traces.js CHANGED
@@ -34,7 +34,7 @@ import {
34
34
  tokenizeDomainWords,
35
35
  traceAnalystFunctionGroup,
36
36
  traceAnalystOnRunComplete
37
- } from "./chunk-5GLYP2IQ.js";
37
+ } from "./chunk-PIEAE33T.js";
38
38
  import {
39
39
  DEFAULT_REDACTION_RULES,
40
40
  REDACTION_VERSION,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.58.2",
3
+ "version": "0.59.1",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -153,7 +153,7 @@
153
153
  "zod": "^4.3.6"
154
154
  },
155
155
  "peerDependencies": {
156
- "@tangle-network/sandbox": ">=0.2.1 <0.4.0"
156
+ "@tangle-network/sandbox": ">=0.2.1 <0.5.0"
157
157
  },
158
158
  "peerDependenciesMeta": {
159
159
  "@tangle-network/sandbox": {
@@ -162,7 +162,7 @@
162
162
  },
163
163
  "devDependencies": {
164
164
  "@biomejs/biome": "^2.4.15",
165
- "@tangle-network/sandbox": "0.3.0",
165
+ "@tangle-network/sandbox": "0.4.0",
166
166
  "@types/node": "^25.6.0",
167
167
  "husky": "^9.1.7",
168
168
  "lint-staged": "^17.0.5",