npm - @tangle-network/agent-eval - Versions diffs - 0.72.0 → 0.72.4 - Mend

@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

package/CHANGELOG.md +39 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +518 -9
package/dist/campaign/index.js +672 -22
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +13 -7
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +353 -2496
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-SL55X4VN.js +0 -186
package/dist/chunk-SL55X4VN.js.map +0 -1
package/dist/chunk-UD6EF73X.js.map +0 -1
package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0

package/dist/traces.d.ts CHANGED Viewed

@@ -1,319 +1,20 @@
 import { N as NotFoundError, R as ReplayError } from './errors-Dwqw-T_m.js';
 import { P as ProviderRedactor, R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
 export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
-import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
+import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-DEZwY14K.js';
 export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
 export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CJzrpUua.js';
 import { T as TraceStore } from './store-CKUAgsJz.js';
 export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
 export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
+export { D as DEFAULT_REDACTION_RULES, b as REDACTION_VERSION, a as RedactionReport, R as RedactionRule, r as redactString, c as redactValue } from './redact-B40YG2M_.js';
 export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
-import { AxAIService, AxFunction } from '@ax-llm/ax';
-import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-jzKpMl16.js';
-export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-jzKpMl16.js';
-/**
- * `captureFetchToRawSink` — wrap a `fetch` so every request / response / error
- * against a provider is recorded into a `RawProviderSink` as the canonical
- * `RawProviderEvent` triple. The one substrate copy of the fetch-capture
- * pattern four consumers hand-roll (legal ships two copies).
- *
- * The returned value is a plain `typeof fetch` — pass it as the `fetchImpl` to
- * any OpenAI-compatible backend factory. Capture is best-effort by default: a
- * sink write that throws does NOT take down the underlying LLM call (set
- * `failClosed` to change that). Uses the existing `defaultProviderRedactor` +
- * `providerFromBaseUrl` — no new redaction policy.
- */
-interface CaptureFetchContext {
-    /** Logical run id stamped on every captured event. Required — without it
-     *  the raw events can't be paired with their parent `Run`. */
-    runId: string;
-    /** Optional logical span id (enables span-level sink filtering). */
-    spanId?: string;
-    /** Resolved base URL (normalised, no trailing slash). Used for the event's
-     *  `baseUrl` and for endpoint-path extraction. */
-    baseUrl: string;
-    /** Model id the caller intends to invoke. Stamped on every event. */
-    model: string;
-    /** Provider override. When omitted, `providerFromBaseUrl(baseUrl)`. */
-    provider?: string;
-}
-interface CaptureFetchOptions {
-    /** Override the capture-time redactor. Default `defaultProviderRedactor`. */
-    redactor?: ProviderRedactor;
-    /** Cap on captured response-body bytes; beyond it the body is truncated and
-     *  `body_truncated` is added to `redactedFields`. Default 2 MiB. */
-    responseBodyByteCap?: number;
-    /** When true, a sink-write failure propagates to the caller. Default false
-     *  — capture is best-effort so a sink failure never kills the LLM call. */
-    failClosed?: boolean;
-}
-declare function captureFetchToRawSink(fetch: typeof globalThis.fetch, sink: RawProviderSink, ctx: CaptureFetchContext, opts?: CaptureFetchOptions): typeof globalThis.fetch;
-/**
- * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
- * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
- *
- * Wire format only. We do NOT depend on the @opentelemetry SDK — that
- * would drag in polyfills incompatible with Workers/Edge. Consumers
- * push the JSON to their collector of choice via HTTP.
- *
- * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
- */
-declare const OTEL_AGENT_EVAL_SCOPE: {
-    name: string;
-    version: string;
-};
-interface OtlpSpan {
-    traceId: string;
-    spanId: string;
-    parentSpanId?: string;
-    name: string;
-    kind: number;
-    startTimeUnixNano: string;
-    endTimeUnixNano: string;
-    attributes: Array<{
-        key: string;
-        value: {
-            stringValue?: string;
-            intValue?: string;
-            doubleValue?: number;
-            boolValue?: boolean;
-        };
-    }>;
-    events?: Array<{
-        timeUnixNano: string;
-        name: string;
-        attributes?: OtlpSpan['attributes'];
-    }>;
-    status?: {
-        code: number;
-        message?: string;
-    };
-}
-interface OtlpResourceSpans {
-    resource: {
-        attributes: OtlpSpan['attributes'];
-    };
-    scopeSpans: Array<{
-        scope: typeof OTEL_AGENT_EVAL_SCOPE;
-        spans: OtlpSpan[];
-    }>;
-}
-interface OtlpExport {
-    resourceSpans: OtlpResourceSpans[];
-}
-/** Export a single run's spans + events in OTLP/JSON. */
-declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
-/**
- * OTEL span exporter — streams spans to an OTLP/HTTP collector.
- *
- * Reads OTEL_EXPORTER_OTLP_ENDPOINT + OTEL_EXPORTER_OTLP_HEADERS from env
- * when no explicit config is given. Batches spans and flushes periodically
- * or when the batch fills. No @opentelemetry SDK dependency — minimal
- * OTLP/JSON serializer (~120 LOC) using the existing otel.ts helpers.
- */
-interface OtelExportConfig {
-    /** OTLP endpoint. Reads OTEL_EXPORTER_OTLP_ENDPOINT env by default. */
-    endpoint?: string;
-    /** OTLP headers. Reads OTEL_EXPORTER_OTLP_HEADERS env by default. */
-    headers?: Record<string, string>;
-    /** Batch size before flush. Default 64. */
-    batchSize?: number;
-    /** Flush interval ms. Default 5000. */
-    flushIntervalMs?: number;
-    /** Resource attributes stamped on every export. */
-    resourceAttributes?: Record<string, string | number | boolean>;
-    /** Service name. Default 'agent-eval'. */
-    serviceName?: string;
-}
-interface OtelExporter {
-    /** Called by the TraceEmitter on every span close. */
-    exportSpan(span: ExportableSpan): void;
-    /** Force flush pending spans. */
-    flush(): Promise<void>;
-    /** Shutdown cleanly — flushes remaining spans and stops the timer. */
-    shutdown(): Promise<void>;
-}
-interface ExportableSpan {
-    traceId: string;
-    spanId: string;
-    parentSpanId?: string;
-    name: string;
-    kind: string;
-    startedAt: number;
-    endedAt?: number;
-    status?: string;
-    error?: string;
-    model?: string;
-    inputTokens?: number;
-    outputTokens?: number;
-    costUsd?: number;
-    attributes?: Record<string, unknown>;
-}
-/**
- * Create an OTEL exporter. Returns undefined when no endpoint is configured
- * (neither via config nor env) — callers should check before attaching.
- */
-declare function createOtelExporter(config?: OtelExportConfig): OtelExporter | undefined;
-/**
- * OTEL bridge — connects TraceEmitter span lifecycle to the OtelExporter.
- *
- * When an OtelExporter is active, every span that closes through the
- * TraceEmitter is also pushed to the exporter for real-time streaming to
- * the user's OTEL collector.
- *
- * The bridge is opt-in: attach via `otelRunCompleteHook(exporter)` as a
- * RunCompleteHook, or wrap the store with `createOtelTracingStore` for
- * real-time per-span export.
- */
-/**
- * Create a RunCompleteHook that exports all spans from the completed run
- * to the OTEL exporter, then flushes.
- */
-declare function otelRunCompleteHook(exporter: OtelExporter): RunCompleteHook;
-/**
- * Create an auto-exporting TraceStore wrapper that intercepts updateSpan
- * calls. When a span gets an endedAt, it's exported immediately. This
- * gives real-time streaming instead of batch-at-end.
- *
- * This is the preferred integration path: wrap the store before
- * constructing the TraceEmitter.
- */
-declare function createOtelTracingStore(inner: TraceStore, exporter: OtelExporter, traceId: string): TraceStore;
-/**
- * Redaction — remove PII / secrets from trace payloads before persist.
- *
- * Pre-persistence rules mean raw traces in storage are already scrubbed.
- * Unredacted variants (for debugging / post-mortems) live in a separate
- * storage layer with stricter access controls; this module only covers
- * the default scrub-then-persist path.
- *
- * Rules compose: pass an array of `RedactionRule`, each is applied in
- * order. Strings that match get replaced with a tagged sentinel so the
- * eval framework can count how many redactions happened per run
- * (surfaced via `redaction_applied` events).
- */
-interface RedactionRule {
-    id: string;
-    pattern: RegExp;
-    /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
-    replacement?: string;
-}
-interface RedactionReport {
-    redactionCount: number;
-    byRule: Record<string, number>;
-}
-/** OWASP / common-sense defaults — extend per-domain. */
-declare const DEFAULT_REDACTION_RULES: RedactionRule[];
-declare const REDACTION_VERSION = "1.0.0";
-/**
- * Redact a single string. Returns the new string and a per-rule count of
- * how many substitutions fired.
- */
-declare function redactString(input: string, rules?: RedactionRule[]): {
-    output: string;
-    report: RedactionReport;
-};
-/**
- * Walk a JSON-ish value applying `redactString` to every string leaf.
- * Arrays and plain objects are recursed; other types pass through
- * untouched. Circular references throw — traces should be tree-shaped.
- */
-declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
-    value: unknown;
-    report: RedactionReport;
-};
-interface AnalyzeTracesInput {
-    /** The user-facing question. Domain framing belongs here, not in the
-     *  actor description. */
-    question: string;
-}
-interface AnalyzeTracesResult {
-    /** The responder's prose answer. */
-    answer: string;
-    /** Bulleted findings extracted from the responder's structured output. */
-    findings: string[];
-    /** Per-actor-turn snapshots captured via `actorTurnCallback`. */
-    turns: AnalyzeTracesTurnSnapshot[];
-    /** Total turns the actor took. */
-    turnCount: number;
-    /** Token usage by role. */
-    usage: TraceAnalystUsage;
-    /** Full system + assistant + tool message log by role. */
-    chatLog: TraceAnalystChatLog;
-    /** Prompt version that produced this run. */
-    actorPromptVersion: string;
-}
-interface TraceAnalystUsage {
-    actor: TraceAnalystUsageEntry[];
-    responder: TraceAnalystUsageEntry[];
-}
-interface TraceAnalystUsageEntry {
-    [key: string]: unknown;
-}
-interface TraceAnalystChatLog {
-    actor: TraceAnalystChatMessage[];
-    responder: TraceAnalystChatMessage[];
-}
-interface TraceAnalystChatMessage {
-    [key: string]: unknown;
-}
-interface AnalyzeTracesTurnSnapshot {
-    turn: number;
-    isError: boolean;
-    /** The JS code the actor produced for this turn. */
-    code: string;
-    /** The formatted action-log entry the actor sees on the next turn. */
-    output: string;
-    /** Provider thought (when `actorOptions.showThoughts` is true and the
-     *  provider returns it). */
-    thought?: string;
-}
-interface AnalyzeTracesOptions {
-    /** Trace data source. Pass either an OTLP-JSONL path or a custom store. */
-    source: string | TraceAnalysisStore;
-    /** Caller-provided AxAIService. */
-    ai: AxAIService;
-    /** Model id forwarded to actor + responder. */
-    model?: string;
-    /** Recursion depth. 0 = no sub-agent dispatch. Default 1. */
-    maxDepth?: number;
-    /** Maximum actor turns. Default 12. */
-    maxTurns?: number;
-    /** Maximum parallel sub-agent calls in batched llmQuery. Default 2. */
-    maxParallelSubagents?: number;
-    /** Override the actor description. */
-    actorDescription?: string;
-    /** Override the subagent description. */
-    subagentDescription?: string;
-    /** Per-turn observability hook. */
-    onTurn?: (turn: AnalyzeTracesTurnSnapshot) => void | Promise<void>;
-    /** Override max runtime characters per turn. Default 6000. */
-    maxRuntimeChars?: number;
-    /** When set, every turn's snapshot is appended to this JSONL file
-     *  immediately. If the analyst crashes mid-loop (provider 503,
-     *  network error, validator reject) the partial reasoning is still
-     *  on disk. Replay the file with the responder afterward to recover
-     *  evidence. */
-    progressLogPath?: string;
-}
-/**
- * Run the trace analyst.
- *
- * Throws:
- *   - `TraceFileMissingError` if `source` is a path and doesn't exist.
- *   - `AxAgentClarificationError` if the analyst asks for clarification.
- *   - Provider errors (auth, rate limits) propagate from the AI service.
- */
-declare function analyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions): Promise<AnalyzeTracesResult>;
+import { A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t7zZS3TV.js';
+export { a as AnalyzeTracesInput, c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
+import { h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, T as TraceAnalysisStore, g as TraceAnalystFilters, b as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, c as SearchTraceResult, S as SearchSpanResult } from './store-GmBE2pZZ.js';
+export { D as DEFAULT_TRACE_ANALYST_BUDGETS, d as SpanMatchRecord, e as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, f as TraceAnalystByteBudgets, a as TraceAnalystSpan, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-GmBE2pZZ.js';
+import { b as RunSplitTag, a as RunTokenUsage, R as RunRecord } from './run-record-BgTFzO2r.js';
+import { AxFunction } from '@ax-llm/ax';
 /**
  * Trace-analyst auto-execution hook.
@@ -440,6 +141,63 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
 declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
 declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
+/**
+ * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
+ * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
+ *
+ * Wire format only. We do NOT depend on the @opentelemetry SDK — that
+ * would drag in polyfills incompatible with Workers/Edge. Consumers
+ * push the JSON to their collector of choice via HTTP.
+ *
+ * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
+ */
+declare const OTEL_AGENT_EVAL_SCOPE: {
+    name: string;
+    version: string;
+};
+interface OtlpSpan {
+    traceId: string;
+    spanId: string;
+    parentSpanId?: string;
+    name: string;
+    kind: number;
+    startTimeUnixNano: string;
+    endTimeUnixNano: string;
+    attributes: Array<{
+        key: string;
+        value: {
+            stringValue?: string;
+            intValue?: string;
+            doubleValue?: number;
+            boolValue?: boolean;
+        };
+    }>;
+    events?: Array<{
+        timeUnixNano: string;
+        name: string;
+        attributes?: OtlpSpan['attributes'];
+    }>;
+    status?: {
+        code: number;
+        message?: string;
+    };
+}
+interface OtlpResourceSpans {
+    resource: {
+        attributes: OtlpSpan['attributes'];
+    };
+    scopeSpans: Array<{
+        scope: typeof OTEL_AGENT_EVAL_SCOPE;
+        spans: OtlpSpan[];
+    }>;
+}
+interface OtlpExport {
+    resourceSpans: OtlpResourceSpans[];
+}
+/** Export a single run's spans + events in OTLP/JSON. */
+declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
 /**
  * `flattenOtlpExportToNdjson` — flatten an `OtlpExport` (the shape
  * `exportRunAsOtlp` produces) into the per-line JSON the analyst's
@@ -484,6 +242,193 @@ interface FlattenOtlpOptions {
 }
 declare function flattenOtlpExportToNdjson(otlpExport: OtlpExport, opts?: FlattenOtlpOptions): OtlpFlatLine[];
+/**
+ * Canonical OTLP-flat-line readers shared by every consumer of the
+ * OTLP-JSONL wire shape (one OTLP span per line; the form
+ * `flattenOtlpExportToNdjson` produces and the form AppWorld / HALO
+ * emit via their OpenInference OTLP exporter).
+ *
+ * `OtlpFileTraceStore` indexes spans with these; `otlpToRunRecords`
+ * aggregates spans into `RunRecord`s with the same readers. One parser,
+ * one vocabulary — a divergence between the analyst's view of a trace and
+ * the RunRecord projected from it is a class of bug this consolidation
+ * removes by construction.
+ *
+ * Vocabulary. The readers understand BOTH dialects that appear in the
+ * wild:
+ *   - the substrate's own `llm.*` / `tool.*` / `span.kind` attributes
+ *     (`flattenSpanAttributes` in `trace/otel.ts`), and
+ *   - the OpenInference / inference-export attributes AppWorld / HALO
+ *     emit (`openinference.span.kind`, `inference.observation_kind`,
+ *     `inference.llm.input_tokens`, `llm.token_count.prompt`, …).
+ *
+ * Pure, no I/O.
+ */
+/**
+ * The structural fields a flat OTLP-JSONL line projects to. `attributes`
+ * is the merged resource+span attribute map (span overrides resource);
+ * the named fields are the pivots every reader of a trace needs without
+ * paying the full attribute materialisation.
+ */
+interface ProjectedOtlpSpan {
+    trace_id: string;
+    span_id: string;
+    parent_span_id: string | null;
+    name: string;
+    kind: TraceAnalystSpanKind;
+    start_time: string;
+    end_time: string;
+    duration_ms: number;
+    status: TraceAnalystSpanStatus;
+    status_message: string | undefined;
+    service_name: string | null;
+    agent_name: string | null;
+    model_name: string | null;
+    tool_name: string | null;
+    /** Merged resource + span attributes, span winning on overlap. */
+    attributes: Record<string, unknown>;
+}
+/**
+ * Project one parsed OTLP-JSONL object to `ProjectedOtlpSpan`, or `null`
+ * when the line is missing the mandatory `trace_id` + `span_id`.
+ */
+declare function projectOtlpFlatLine(raw: Record<string, unknown>): ProjectedOtlpSpan | null;
+declare function readOtlpStatus(raw: Record<string, unknown>): {
+    code: TraceAnalystSpanStatus;
+    message: string | undefined;
+};
+declare function inferOtlpKind(attrs: Record<string, unknown>): TraceAnalystSpanKind;
+/**
+ * Flatten OTLP `attributes` + `resource.attributes` into a single
+ * dotted-key map. Span attributes override resource attributes when keys
+ * overlap. Nested objects/arrays are preserved as-is.
+ */
+declare function extractOtlpAttributes(raw: Record<string, unknown>): Record<string, unknown>;
+declare function stringField(raw: Record<string, unknown>, key: string): string | undefined;
+declare function asString(v: unknown): string | null;
+/** Read a numeric attribute, tolerating numeric strings; `null` if absent/NaN. */
+declare function asNumber(v: unknown): number | null;
+/** First finite numeric value across a list of candidate attribute keys. */
+declare function firstNumberAttr(attrs: Record<string, unknown>, keys: readonly string[]): number | null;
+/** First non-empty string value across a list of candidate attribute keys. */
+declare function firstStringAttr(attrs: Record<string, unknown>, keys: readonly string[]): string | null;
+/**
+ * `otlpToRunRecords` — fold an OTLP traces.jsonl (one OTLP span per line;
+ * the form AppWorld / HALO emit via their OpenInference OTLP exporter, the
+ * same shape `flattenOtlpExportToNdjson` produces) into validated
+ * `RunRecord[]` — one record per `trace_id` (one trace == one task).
+ *
+ * This is the offline ingestion primitive the AppWorld driver bench and the
+ * hosted Intelligence product both stand on: traces in, paper-grade rows
+ * out, ready for `compareDrivers` / `analyzeRuns` / the promotion gate.
+ *
+ * Aggregation per trace:
+ *   - tokenUsage: sum LLM-span `input` / `output` (+ `cached` when present)
+ *     across every LLM span in the trace.
+ *   - costUsd: sum per-span LLM cost when present; else priced via
+ *     `opts.priceUsdPerToken` from the aggregated tokens; else 0 with a
+ *     loud `raw.cost_unpriced = 1` marker so a missing price is visible, not
+ *     a silent zero folded into a gate.
+ *   - failureMode: the first `STATUS_CODE_ERROR` span's normalized status
+ *     message (carries the real failure signature, not a generic class).
+ *   - model: the dominant LLM model in the trace (snapshot-padded to satisfy
+ *     `validateRunRecord` when the trace's model is a bare alias).
+ *   - outcome score: `opts.scoreForTrace` (AppWorld `world.evaluate()` →
+ *     TGC/SGC) when supplied; else 1 when the trace had no error span, 0
+ *     when it did — a defensible default the caller can override.
+ *   - prompt / completion: carried into `raw` as token-count signals and,
+ *     when the first/last LLM span exposes `input.value` / `output.value`,
+ *     the verbatim text is preserved on the optional `promptText` /
+ *     `completionText` of the returned `OtlpTraceRunRecord`.
+ *
+ * Fail-loud: an OTLP file with zero valid spans throws. A trace with no
+ * spans is impossible (a trace exists only because a span referenced it).
+ * `validateRunRecord` runs on every row — a malformed projection throws
+ * rather than silently producing a half-record.
+ */
+interface OtlpToRunRecordsOptions {
+    /** Logical experiment grouping for every produced record. */
+    experimentId: string;
+    /** Candidate (variant) id — the surface these traces exercised. The
+     *  bench passes the driver label here so `compareDrivers` can pair rows. */
+    candidateId: string;
+    /** Split assignment for every produced record. Default `'holdout'` —
+     *  ingested traces are evidence, not the optimizer's training pool. */
+    splitTag?: RunSplitTag;
+    /** Git SHA the traces were produced from. Default `'unknown'`. */
+    commitSha?: string;
+    /** sha256 of the effective prompt surface. Default `'unknown'`. */
+    promptHash?: string;
+    /** sha256 of the effective config. Default `'unknown'`. */
+    configHash?: string;
+    /** RNG seed recorded on every row. Default 0. */
+    seed?: number;
+    /**
+     * Fallback model snapshot when the trace exposes no LLM model attribute
+     * OR exposes a bare alias `validateRunRecord` would reject. The trace's
+     * own model wins when it already carries a snapshot. Default
+     * `'unknown@otlp'` (opaque-snapshot form the validator accepts).
+     */
+    fallbackModel?: string;
+    /**
+     * USD per total token (input+output) used to price a trace when no
+     * per-span cost attribute is present. When unset, an unpriced trace
+     * records `costUsd: 0` AND `raw.cost_unpriced = 1` — the zero is flagged,
+     * never silent.
+     */
+    priceUsdPerToken?: number;
+    /**
+     * Score for a trace's outcome (AppWorld `world.evaluate()` → TGC/SGC, or
+     * any [0,1] task-success signal). Called with the `trace_id` and that
+     * trace's aggregate; falls through to the error-derived default (1 = no
+     * error span, 0 = had one) when the function returns undefined.
+     */
+    scoreForTrace?: (traceId: string, span: TraceAggregate) => number | undefined;
+    /**
+     * Per-record judge metadata when an external judge produced the score.
+     * Keyed by `trace_id`.
+     */
+    judgeMetadataForTrace?: (traceId: string) => RunRecord['judgeMetadata'] | undefined;
+}
+/** A `RunRecord` plus the verbatim prompt/completion text when the trace's
+ *  LLM spans exposed it. The text is NOT on the validated `RunRecord`
+ *  (`outcome.raw` is numeric-only) but consumers ingesting full traces want
+ *  it — so it rides alongside. */
+interface OtlpTraceRunRecord {
+    record: RunRecord;
+    /** Verbatim first-LLM-span `input.value`, when present. */
+    promptText?: string;
+    /** Verbatim last-LLM-span `output.value`, when present. */
+    completionText?: string;
+}
+/** Per-trace rollup the score callback can inspect. */
+interface TraceAggregate {
+    traceId: string;
+    spanCount: number;
+    llmSpanCount: number;
+    toolSpanCount: number;
+    agentSpanCount: number;
+    errorSpanCount: number;
+    tokenUsage: RunTokenUsage;
+    /** First error span's normalized status message, if any. */
+    firstErrorMessage?: string;
+    model: string;
+    startTime: string;
+    endTime: string;
+    wallMs: number;
+}
+/**
+ * Parse + aggregate an OTLP traces.jsonl string into validated
+ * `RunRecord[]` (one per trace). Use {@link otlpToTraceRunRecords} when you
+ * also want the verbatim prompt/completion text alongside each record.
+ */
+declare function otlpToRunRecords(otlpJsonl: string, opts: OtlpToRunRecordsOptions): RunRecord[];
+/** As {@link otlpToRunRecords} but returns the prompt/completion text too. */
+declare function otlpToTraceRunRecords(otlpJsonl: string, opts: OtlpToRunRecordsOptions): OtlpTraceRunRecord[];
 /** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
 declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n   - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n   - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n   - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. 
ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. 
Examples:\n\n    const reviews = await llmQuery([\n      { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n      { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n    ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n  ```js\n  await final(\"Format the per-item verdict report from the evidence below.\", {\n    findings: [\n      { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n      ...all items\n    ],\n    systemic_summary: '3 sentences I wrote based on the evidence above'\n  });\n  ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. 
`spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
 declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06"; // Version tag for TRACE_ANALYST_ACTOR_DESCRIPTION; presumably bumped whenever the prompt text changes — confirm.
@@ -635,6 +580,124 @@ declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
     functions: AxFunction[];
 };
+/**
+ * `captureFetchToRawSink` — wrap a `fetch` so every request / response / error
+ * against a provider is recorded into a `RawProviderSink` as the canonical
+ * `RawProviderEvent` triple. This is the single shared ("substrate") copy of
+ * the fetch-capture pattern that four consumers hand-roll (legal ships two
+ * copies).
+ *
+ * The returned value is a plain `typeof fetch` — pass it as the `fetchImpl` to
+ * any OpenAI-compatible backend factory. Capture is best-effort by default: a
+ * sink write that throws does NOT take down the underlying LLM call (set
+ * `failClosed` to change that). Uses the existing `defaultProviderRedactor` +
+ * `providerFromBaseUrl` — no new redaction policy.
+ *
+ * `CaptureFetchContext`, declared directly below, is the `ctx` argument of
+ * that function: the run / span / endpoint / model identity recorded with the
+ * captured events.
+ */
+interface CaptureFetchContext {
+    /** Logical run id stamped on every captured event. Required — without it
+     *  the raw events can't be paired with their parent `Run`. */
+    runId: string;
+    /** Optional logical span id (enables span-level sink filtering). */
+    spanId?: string;
+    /** Resolved base URL (normalised, no trailing slash). Used for the event's
+     *  `baseUrl` and for endpoint-path extraction. */
+    baseUrl: string;
+    /** Model id the caller intends to invoke. Stamped on every event. */
+    model: string;
+    /** Provider override. When omitted, `providerFromBaseUrl(baseUrl)`. */
+    provider?: string;
+}
+interface CaptureFetchOptions { // Optional tuning for `captureFetchToRawSink` (its `opts` argument); every field has a documented default.
+    /** Override the capture-time redactor. Default `defaultProviderRedactor`. */
+    redactor?: ProviderRedactor;
+    /** Cap on captured response-body bytes; beyond it the body is truncated and
+     *  `body_truncated` is added to `redactedFields`. Default 2 MiB. */
+    responseBodyByteCap?: number;
+    /** When true, a sink-write failure propagates to the caller. Default false
+     *  — capture is best-effort so a sink failure never kills the LLM call. */
+    failClosed?: boolean;
+}
+declare function captureFetchToRawSink(fetch: typeof globalThis.fetch, sink: RawProviderSink, ctx: CaptureFetchContext, opts?: CaptureFetchOptions): typeof globalThis.fetch; // Returns a drop-in `fetch` that records each request / response / error to `sink`; a sink-write failure only propagates when `opts.failClosed` is true.
+/**
+ * OTEL span exporter — streams spans to an OTLP/HTTP collector.
+ *
+ * Reads OTEL_EXPORTER_OTLP_ENDPOINT + OTEL_EXPORTER_OTLP_HEADERS from env
+ * when no explicit config is given. Batches spans and flushes periodically
+ * or when the batch fills. No @opentelemetry SDK dependency — minimal
+ * OTLP/JSON serializer (~120 LOC) using the existing otel.ts helpers.
+ *
+ * `OtelExportConfig`, declared directly below, is the optional argument to
+ * `createOtelExporter`; every field on it is optional.
+ */
+interface OtelExportConfig {
+    /** OTLP endpoint. Reads OTEL_EXPORTER_OTLP_ENDPOINT env by default. */
+    endpoint?: string;
+    /** OTLP headers. Reads OTEL_EXPORTER_OTLP_HEADERS env by default. */
+    headers?: Record<string, string>;
+    /** Batch size before flush. Default 64. */
+    batchSize?: number;
+    /** Flush interval ms. Default 5000. */
+    flushIntervalMs?: number;
+    /** Resource attributes stamped on every export. */
+    resourceAttributes?: Record<string, string | number | boolean>;
+    /** Service name. Default 'agent-eval'. */
+    serviceName?: string;
+}
+interface OtelExporter { // Handle returned by `createOtelExporter`; accepted by `otelRunCompleteHook` and `createOtelTracingStore`.
+    /** Called by the TraceEmitter on every span close. */
+    exportSpan(span: ExportableSpan): void;
+    /** Force flush pending spans. */
+    flush(): Promise<void>;
+    /** Shutdown cleanly — flushes remaining spans and stops the timer. */
+    shutdown(): Promise<void>;
+}
+interface ExportableSpan { // Span record handed to `OtelExporter.exportSpan`.
+    traceId: string;
+    spanId: string;
+    parentSpanId?: string; // absent presumably means a root span — TODO confirm
+    name: string;
+    kind: string;
+    startedAt: number; // NOTE(review): unit not visible here (likely epoch ms) — confirm
+    endedAt?: number; // optional; the tracing-store wrapper exports a span once this is set
+    status?: string;
+    error?: string;
+    model?: string;
+    inputTokens?: number;
+    outputTokens?: number;
+    costUsd?: number; // presumably cost in US dollars, going by the name — verify
+    attributes?: Record<string, unknown>;
+}
+/**
+ * Create an OTEL exporter. Returns undefined when no endpoint is configured
+ * (neither via config nor env) — callers should check before attaching.
+ *
+ * @param config Optional exporter settings; see `OtelExportConfig`.
+ * @returns The exporter, or `undefined` when no OTLP endpoint is available.
+ */
+declare function createOtelExporter(config?: OtelExportConfig): OtelExporter | undefined;
+/**
+ * OTEL bridge — connects TraceEmitter span lifecycle to the OtelExporter.
+ *
+ * When an OtelExporter is active, every span that closes through the
+ * TraceEmitter is also pushed to the exporter for real-time streaming to
+ * the user's OTEL collector.
+ *
+ * The bridge is opt-in: attach via `otelRunCompleteHook(exporter)` as a
+ * RunCompleteHook, or wrap the store with `createOtelTracingStore` for
+ * real-time per-span export.
+ */
+/**
+ * Create a RunCompleteHook that exports all spans from the completed run
+ * to the OTEL exporter, then flushes.
+ *
+ * @param exporter Exporter that receives the run's spans and is then flushed.
+ * @returns A hook to register as a `RunCompleteHook` (batch-at-end export).
+ */
+declare function otelRunCompleteHook(exporter: OtelExporter): RunCompleteHook;
+/**
+ * Create an auto-exporting TraceStore wrapper that intercepts updateSpan
+ * calls. When a span gets an endedAt, it's exported immediately. This
+ * gives real-time streaming instead of batch-at-end.
+ *
+ * This is the preferred integration path: wrap the store before
+ * constructing the TraceEmitter.
+ *
+ * @param inner The TraceStore being wrapped.
+ * @param exporter Exporter that receives each span once it has an `endedAt`.
+ * @param traceId NOTE(review): presumably the trace id stamped on exported
+ *   spans — not visible from this declaration; confirm in the implementation.
+ * @returns A TraceStore wrapper to pass to the TraceEmitter in place of `inner`.
+ */
+declare function createOtelTracingStore(inner: TraceStore, exporter: OtelExporter, traceId: string): TraceStore;
 /**
  * Replay-from-raw-events — turn every captured campaign run into a
  * re-runnable artifact.
@@ -754,4 +817,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
     spanId?: string;
 }): AsyncGenerator<ReplayCacheEntry>;
-export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, type CaptureFetchContext, type CaptureFetchOptions, DEFAULT_REDACTION_RULES, DatasetOverview, type ExportableSpan, type FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpFlatLine, type OtlpResourceSpans, type OtlpSpan, ProviderRedactor, QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, SearchSpanResult, SearchTraceResult, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalysisStore, TraceAnalystFilters, type TraceAnalystHookOptions, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, ViewSpansResult, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
+export { AnalyzeTracesOptions, AnalyzeTracesResult, type CaptureFetchContext, type CaptureFetchOptions, DatasetOverview, type ExportableSpan, type FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpFlatLine, type OtlpResourceSpans, type OtlpSpan, type OtlpToRunRecordsOptions, type OtlpTraceRunRecord, type ProjectedOtlpSpan, ProviderRedactor, QueryTracesPage, RawProviderEvent, RawProviderSink, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, SearchSpanResult, SearchTraceResult, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, type TraceAggregate, TraceAnalysisStore, TraceAnalystFilters, type TraceAnalystHookOptions, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, ViewSpansResult, ViewTraceResult, asNumber, asString, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, extractOtlpAttributes, firstNumberAttr, firstStringAttr, flattenOtlpExportToNdjson, inferDomainKeywords, inferOtlpKind, iterateRawCalls, otelRunCompleteHook, otlpToRunRecords, otlpToTraceRunRecords, planTraceInsightQuestions, projectOtlpFlatLine, readOtlpStatus, scoreTraceInsightReadiness, stringField, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };