@tangle-network/agent-eval 0.56.0 → 0.58.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-MAOZCN36.js → chunk-5GLYP2IQ.js} +207 -1
- package/dist/chunk-5GLYP2IQ.js.map +1 -0
- package/dist/index.d.ts +2 -2
- package/dist/index.js +18 -3
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/traces.d.ts +86 -3
- package/dist/traces.js +5 -1
- package/package.json +1 -1
- package/dist/chunk-MAOZCN36.js.map +0 -1
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.56.0",
|
|
5
|
+
"version": "0.58.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/traces.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
|
|
2
|
-
import { R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
|
|
3
|
-
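export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';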
|
|
2
|
+
import { P as ProviderRedactor, R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
|
|
3
|
+
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
4
4
|
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
|
|
5
5
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
|
|
6
6
|
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
|
|
@@ -12,6 +12,45 @@ import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
|
12
12
|
import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
|
|
13
13
|
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
|
|
14
14
|
|
|
15
|
+
/**
|
|
16
|
+
* `captureFetchToRawSink` — wrap a `fetch` so every request / response / error
|
|
17
|
+
* against a provider is recorded into a `RawProviderSink` as the canonical
|
|
18
|
+
* `RawProviderEvent` triple. The one substrate copy of the fetch-capture
|
|
19
|
+
* pattern four consumers hand-roll (legal ships two copies).
|
|
20
|
+
*
|
|
21
|
+
* The returned value is a plain `typeof fetch` — pass it as the `fetchImpl` to
|
|
22
|
+
* any OpenAI-compatible backend factory. Capture is best-effort by default: a
|
|
23
|
+
* sink write that throws does NOT take down the underlying LLM call (set
|
|
24
|
+
* `failClosed` to change that). Uses the existing `defaultProviderRedactor` +
|
|
25
|
+
* `providerFromBaseUrl` — no new redaction policy.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
interface CaptureFetchContext {
|
|
29
|
+
/** Logical run id stamped on every captured event. Required — without it
|
|
30
|
+
* the raw events can't be paired with their parent `Run`. */
|
|
31
|
+
runId: string;
|
|
32
|
+
/** Optional logical span id (enables span-level sink filtering). */
|
|
33
|
+
spanId?: string;
|
|
34
|
+
/** Resolved base URL (normalised, no trailing slash). Used for the event's
|
|
35
|
+
* `baseUrl` and for endpoint-path extraction. */
|
|
36
|
+
baseUrl: string;
|
|
37
|
+
/** Model id the caller intends to invoke. Stamped on every event. */
|
|
38
|
+
model: string;
|
|
39
|
+
/** Provider override. When omitted, `providerFromBaseUrl(baseUrl)`. */
|
|
40
|
+
provider?: string;
|
|
41
|
+
}
|
|
42
|
+
interface CaptureFetchOptions {
|
|
43
|
+
/** Override the capture-time redactor. Default `defaultProviderRedactor`. */
|
|
44
|
+
redactor?: ProviderRedactor;
|
|
45
|
+
/** Cap on captured response-body bytes; beyond it the body is truncated and
|
|
46
|
+
* `body_truncated` is added to `redactedFields`. Default 2 MiB. */
|
|
47
|
+
responseBodyByteCap?: number;
|
|
48
|
+
/** When true, a sink-write failure propagates to the caller. Default false
|
|
49
|
+
* — capture is best-effort so a sink failure never kills the LLM call. */
|
|
50
|
+
failClosed?: boolean;
|
|
51
|
+
}
|
|
52
|
+
declare function captureFetchToRawSink(fetch: typeof globalThis.fetch, sink: RawProviderSink, ctx: CaptureFetchContext, opts?: CaptureFetchOptions): typeof globalThis.fetch;
|
|
53
|
+
|
|
15
54
|
/**
|
|
16
55
|
* OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
|
|
17
56
|
* traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
|
|
@@ -401,6 +440,50 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
|
|
|
401
440
|
declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
|
|
402
441
|
declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
|
|
403
442
|
|
|
443
|
+
/**
|
|
444
|
+
* `flattenOtlpExportToNdjson` — flatten an `OtlpExport` (the shape
|
|
445
|
+
* `exportRunAsOtlp` produces) into the per-line JSON the analyst's
|
|
446
|
+
* `OtlpFileTraceStore` index reads. Replaces three per-consumer OTLP
|
|
447
|
+
* flatteners with one canonical projection.
|
|
448
|
+
*
|
|
449
|
+
* Pure function, no I/O — the caller does `.map(JSON.stringify).join('\n')`
|
|
450
|
+
* and writes the file (consumers want control over rotation + naming).
|
|
451
|
+
*/
|
|
452
|
+
|
|
453
|
+
interface OtlpFlatLine {
|
|
454
|
+
trace_id: string;
|
|
455
|
+
span_id: string;
|
|
456
|
+
parent_span_id: string | null;
|
|
457
|
+
name: string;
|
|
458
|
+
kind: string;
|
|
459
|
+
start_time: string;
|
|
460
|
+
end_time: string;
|
|
461
|
+
status: {
|
|
462
|
+
code: 'STATUS_CODE_OK' | 'STATUS_CODE_ERROR' | 'STATUS_CODE_UNSET';
|
|
463
|
+
message?: string;
|
|
464
|
+
};
|
|
465
|
+
resource: {
|
|
466
|
+
attributes: Record<string, string | number | boolean>;
|
|
467
|
+
};
|
|
468
|
+
attributes: Record<string, string | number | boolean>;
|
|
469
|
+
events?: Array<{
|
|
470
|
+
name: string;
|
|
471
|
+
timeUnixNano?: string;
|
|
472
|
+
attributes?: Record<string, unknown>;
|
|
473
|
+
}>;
|
|
474
|
+
}
|
|
475
|
+
interface FlattenOtlpOptions {
|
|
476
|
+
/** `'openinference'` (default) mirrors per-span attributes into the
|
|
477
|
+
* OpenInference vocabulary the analyst's `inferKind` reads
|
|
478
|
+
* (`llm.model`→`llm.model_name`, `tool.name`→`inference.tool.name`,
|
|
479
|
+
* `span.kind`→`openinference.span.kind` uppercased). `'none'` passes
|
|
480
|
+
* attributes through untouched. */
|
|
481
|
+
attributeVocabulary?: 'openinference' | 'none';
|
|
482
|
+
/** Override the numeric-kind → otlp-string mapping. */
|
|
483
|
+
kindMap?: Partial<Record<number, string>>;
|
|
484
|
+
}
|
|
485
|
+
declare function flattenOtlpExportToNdjson(otlpExport: OtlpExport, opts?: FlattenOtlpOptions): OtlpFlatLine[];
|
|
486
|
+
|
|
404
487
|
/** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
|
|
405
488
|
declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. 
ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. 
Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n ```js\n await final(\"Format the per-item verdict report from the evidence below.\", {\n findings: [\n { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n ...all items\n ],\n systemic_summary: '3 sentences I wrote based on the evidence above'\n });\n ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. `spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. 
read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
|
|
406
489
|
declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
|
|
@@ -671,4 +754,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
|
|
|
671
754
|
spanId?: string;
|
|
672
755
|
}): AsyncGenerator<ReplayCacheEntry>;
|
|
673
756
|
|
|
674
|
-
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, SearchSpanResult, SearchTraceResult, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalysisStore, TraceAnalystFilters, type TraceAnalystHookOptions, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, ViewSpansResult, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
|
757
|
+
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type CaptureFetchContext, type CaptureFetchOptions, DEFAULT_REDACTION_RULES, DatasetOverview, type ExportableSpan, type FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpFlatLine, type OtlpResourceSpans, type OtlpSpan, ProviderRedactor, QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, SearchSpanResult, SearchTraceResult, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalysisStore, TraceAnalystFilters, type TraceAnalystHookOptions, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, ViewSpansResult, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
package/dist/traces.js
CHANGED
|
@@ -17,6 +17,7 @@ import {
|
|
|
17
17
|
buildTraceAnalystTools,
|
|
18
18
|
buildTraceInsightContext,
|
|
19
19
|
buildTraceInsightPrompt,
|
|
20
|
+
captureFetchToRawSink,
|
|
20
21
|
createOtelExporter,
|
|
21
22
|
createOtelTracingStore,
|
|
22
23
|
createReplayFetch,
|
|
@@ -24,6 +25,7 @@ import {
|
|
|
24
25
|
describeTraceInsightScope,
|
|
25
26
|
domainEvidencePattern,
|
|
26
27
|
exportRunAsOtlp,
|
|
28
|
+
flattenOtlpExportToNdjson,
|
|
27
29
|
inferDomainKeywords,
|
|
28
30
|
iterateRawCalls,
|
|
29
31
|
otelRunCompleteHook,
|
|
@@ -32,7 +34,7 @@ import {
|
|
|
32
34
|
tokenizeDomainWords,
|
|
33
35
|
traceAnalystFunctionGroup,
|
|
34
36
|
traceAnalystOnRunComplete
|
|
35
|
-
} from "./chunk-MAOZCN36.js";
|
|
37
|
+
} from "./chunk-5GLYP2IQ.js";
|
|
36
38
|
import {
|
|
37
39
|
DEFAULT_REDACTION_RULES,
|
|
38
40
|
REDACTION_VERSION,
|
|
@@ -108,6 +110,7 @@ export {
|
|
|
108
110
|
buildTraceAnalystTools,
|
|
109
111
|
buildTraceInsightContext,
|
|
110
112
|
buildTraceInsightPrompt,
|
|
113
|
+
captureFetchToRawSink,
|
|
111
114
|
createOtelExporter,
|
|
112
115
|
createOtelTracingStore,
|
|
113
116
|
createReplayFetch,
|
|
@@ -116,6 +119,7 @@ export {
|
|
|
116
119
|
describeTraceInsightScope,
|
|
117
120
|
domainEvidencePattern,
|
|
118
121
|
exportRunAsOtlp,
|
|
122
|
+
flattenOtlpExportToNdjson,
|
|
119
123
|
groupBy,
|
|
120
124
|
inferDomainKeywords,
|
|
121
125
|
isJudgeSpan,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.56.0",
|
|
3
|
+
"version": "0.58.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|