npm - @tangle-network/agent-eval - Versions diffs - 0.58.2 → 0.59.1 - Mend

@tangle-network/agent-eval 0.58.2 → 0.59.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/{chunk-5GLYP2IQ.js → chunk-PIEAE33T.js} +71 -2
package/dist/chunk-PIEAE33T.js.map +1 -0
package/dist/contract/index.d.ts +2 -2
package/dist/index.d.ts +4 -4
package/dist/index.js +1 -1
package/dist/openapi.json +1 -1
package/dist/{registry-BSWy0rvH.d.ts → registry-DK9kqXvb.d.ts} +1 -1
package/dist/{store-CJbzDxZ2.d.ts → store-jzKpMl16.d.ts} +28 -0
package/dist/traces.d.ts +2 -2
package/dist/traces.js +1 -1
package/package.json +3 -3
package/dist/chunk-5GLYP2IQ.js.map +0 -1

package/dist/index.d.ts CHANGED Viewed

@@ -12,12 +12,12 @@ import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
 export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
 import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
 export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
-import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
-export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
+import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
+export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
 import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
 export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
-import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-BSWy0rvH.js';
-export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BSWy0rvH.js';
+import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-DK9kqXvb.js';
+export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-DK9kqXvb.js';
 import { TCloud } from '@tangle-network/tcloud';
 import { z } from 'zod';
 export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';

package/dist/index.js CHANGED Viewed

@@ -189,7 +189,7 @@ import {
   tokenizeDomainWords,
   traceAnalystFunctionGroup,
   traceAnalystOnRunComplete
-} from "./chunk-5GLYP2IQ.js";
+} from "./chunk-PIEAE33T.js";
 import {
   DEFAULT_REDACTION_RULES,
   REDACTION_VERSION,

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.58.2",
+    "version": "0.59.1",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/{registry-BSWy0rvH.d.ts → registry-DK9kqXvb.d.ts} RENAMED Viewed

@@ -1,6 +1,6 @@
 import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
 import { R as RunRecord } from './run-record-etiCMsUq.js';
-import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
+import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
 import { J as JudgeInput } from './types-DhqpAi_z.js';
 /**

package/dist/{store-CJbzDxZ2.d.ts → store-jzKpMl16.d.ts} RENAMED Viewed

@@ -69,6 +69,29 @@ interface TraceAnalystFilters {
      *  expensive on large datasets. Use the indexed filters above first. */
     regex_pattern?: string;
 }
+/** One distinct error signature across the dataset — the deterministic unit of
+ *  failure coverage. Signatures normalize volatile tokens (digits, hex/uuids,
+ *  paths, durations) out of the span `status_message` so semantically identical
+ *  failures collapse into one cluster. An analyst that accounts for every
+ *  cluster has, by construction, covered every distinct failure mode. */
+interface ErrorCluster {
+    /** Normalized status_message — the cluster key. */
+    signature: string;
+    /** A verbatim, un-normalized exemplar message (for exact-string citation). */
+    status_message_sample: string;
+    /** The span name that most often carries this signature, if any. */
+    span_name: string | null;
+    /** The tool that most often carries this signature, if any. */
+    tool_name: string | null;
+    trace_count: number;
+    span_count: number;
+    /** trace_count / total error traces in the matched set (0..1). */
+    prevalence: number;
+    /** Real trace ids carrying this signature (capped), passable to view/search. */
+    exemplar_trace_ids: string[];
+    /** Real span ids carrying this signature (capped). */
+    exemplar_span_ids: string[];
+}
 interface DatasetOverview {
     total_traces: number;
     raw_jsonl_bytes: number;
@@ -82,6 +105,11 @@ interface DatasetOverview {
         trace_count: number;
         span_count: number;
     };
+    /** The COMPLETE deterministic error-signature population, sorted by
+     *  trace_count desc. This is the failure-coverage checklist: an analysis is
+     *  complete only when every cluster here is accounted for. Empty when the
+     *  matched set has no error spans. */
+    error_clusters: ErrorCluster[];
     time_range: {
         earliest: string;
         latest: string;

package/dist/traces.d.ts CHANGED Viewed

@@ -9,8 +9,8 @@ export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreO
 export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
 export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
-import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
-export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
+import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-jzKpMl16.js';
+export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-jzKpMl16.js';
 /**
  * `captureFetchToRawSink` — wrap a `fetch` so every request / response / error

package/dist/traces.js CHANGED Viewed

@@ -34,7 +34,7 @@ import {
   tokenizeDomainWords,
   traceAnalystFunctionGroup,
   traceAnalystOnRunComplete
-} from "./chunk-5GLYP2IQ.js";
+} from "./chunk-PIEAE33T.js";
 import {
   DEFAULT_REDACTION_RULES,
   REDACTION_VERSION,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.58.2",
+  "version": "0.59.1",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -153,7 +153,7 @@
     "zod": "^4.3.6"
   },
   "peerDependencies": {
-    "@tangle-network/sandbox": ">=0.2.1 <0.4.0"
+    "@tangle-network/sandbox": ">=0.2.1 <0.5.0"
   },
   "peerDependenciesMeta": {
     "@tangle-network/sandbox": {
@@ -162,7 +162,7 @@
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.15",
-    "@tangle-network/sandbox": "0.3.0",
+    "@tangle-network/sandbox": "0.4.0",
     "@types/node": "^25.6.0",
     "husky": "^9.1.7",
     "lint-staged": "^17.0.5",