npm - @tangle-network/agent-eval - Versions diffs - 0.48.0 → 0.50.0 - Mend

@tangle-network/agent-eval 0.48.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

package/README.md +7 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/{traceai.d.ts → otel.d.ts} +29 -29
package/dist/adapters/{traceai.js → otel.js} +9 -5
package/dist/adapters/otel.js.map +1 -0
package/dist/campaign/index.d.ts +3 -3
package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
package/dist/chunk-EGIPWXHL.js.map +1 -0
package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
package/dist/chunk-FQK2CCIM.js.map +1 -0
package/dist/chunk-MAZ26DC7.js +99 -0
package/dist/chunk-MAZ26DC7.js.map +1 -0
package/dist/chunk-SHTXZ4O2.js +113 -0
package/dist/chunk-SHTXZ4O2.js.map +1 -0
package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
package/dist/contract/index.d.ts +206 -9
package/dist/contract/index.js +751 -3
package/dist/contract/index.js.map +1 -1
package/dist/governance/index.d.ts +1 -1
package/dist/hosted/index.d.ts +8 -192
package/dist/hosted/index.js +1 -1
package/dist/index-BRxz6qov.d.ts +409 -0
package/dist/index.d.ts +18 -462
package/dist/index.js +14 -106
package/dist/index.js.map +1 -1
package/dist/meta-eval/index.d.ts +3 -3
package/dist/openapi.json +1 -1
package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
package/dist/registry-8KAs18kY.d.ts +457 -0
package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
package/dist/reporting.d.ts +6 -4
package/dist/reporting.js +6 -4
package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
package/dist/rl.d.ts +9 -8
package/dist/rl.js +3 -2
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
package/dist/sequential-5iSVfzl2.d.ts +139 -0
package/dist/store-CJbzDxZ2.d.ts +220 -0
package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
package/dist/traces.d.ts +3 -220
package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
package/dist/types-DhqpAi_z.d.ts +296 -0
package/docs/adapters-observability.md +3 -3
package/package.json +5 -5
package/dist/adapters/traceai.js.map +0 -1
package/dist/chunk-MNL6LXGQ.js.map +0 -1
package/dist/chunk-OYI6RZJK.js.map +0 -1
/package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
/package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
/package/docs/design/{substrate-gaps-2026-05-27.md → substrate-gaps.md} +0 -0

package/dist/traces.d.ts CHANGED Viewed

@@ -8,6 +8,8 @@ import { T as TraceStore } from './store-Db2Bv8Cf.js';
 export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
 export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
+import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
+export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
 /**
  * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
@@ -189,225 +191,6 @@ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: R
     report: RedactionReport;
 };
-/**
- * Shared types for the trace-analyst module.
- *
- * Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
- * line per span, OTLP-shaped. We do NOT depend on a specific tracing
- * vendor at the type level. Adapter
- * layers map upstream shapes onto this interface.
- *
- * Design constraint. Every read operation that can return arbitrary
- * payload must carry a byte budget so the agent's tool result stays
- * bounded regardless of input trace size. Oversized responses
- * substitute a deterministic summary instead of bytes — see
- * `ViewTraceOversized`.
- */
-/** OTLP span kind (subset we actually use). */
-type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
-type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
-/** Subset of OTLP span fields the analyst exposes to the agent. The
- *  store's job is to project upstream's full span shape down to this
- *  view — the analyst never sees vendor extensions directly. */
-interface TraceAnalystSpan {
-    trace_id: string;
-    span_id: string;
-    parent_span_id: string | null;
-    name: string;
-    kind: TraceAnalystSpanKind;
-    start_time: string;
-    end_time: string;
-    duration_ms: number;
-    status: TraceAnalystSpanStatus;
-    status_message?: string;
-    service_name: string | null;
-    agent_name: string | null;
-    model_name: string | null;
-    tool_name: string | null;
-    /** Raw JSON-serialisable attribute map. May contain large strings;
-     *  callers must respect the per-attribute byte cap. */
-    attributes: Record<string, unknown>;
-}
-interface TraceAnalystTraceSummary {
-    trace_id: string;
-    service_name: string | null;
-    agent_name: string | null;
-    span_count: number;
-    has_errors: boolean;
-    start_time: string;
-    end_time: string;
-    duration_ms: number;
-    raw_jsonl_bytes: number;
-    models: string[];
-    tools: string[];
-}
-interface TraceAnalystFilters {
-    /** Restrict to traces that contain at least one error span. */
-    has_errors?: boolean;
-    /** Match if any span's `service.name` is in this list. */
-    service_names?: string[];
-    /** Match if any span's `agent.name` is in this list. */
-    agent_names?: string[];
-    /** Match if any LLM span's `llm.model_name` is in this list. */
-    model_names?: string[];
-    /** Match if any tool span's `tool.name` is in this list. */
-    tool_names?: string[];
-    /** ISO-8601 lower bound on the trace's earliest start time. */
-    start_time_after?: string;
-    /** ISO-8601 upper bound on the trace's earliest start time. */
-    start_time_before?: string;
-    /** Single regex applied to raw JSONL bytes for the trace. Opt-in;
-     *  expensive on large datasets. Use the indexed filters above first. */
-    regex_pattern?: string;
-}
-interface DatasetOverview {
-    total_traces: number;
-    raw_jsonl_bytes: number;
-    services: string[];
-    agents: string[];
-    models: string[];
-    tool_names: string[];
-    /** Up to 20 real trace ids the agent may pass to view/search tools. */
-    sample_trace_ids: string[];
-    errors: {
-        trace_count: number;
-        span_count: number;
-    };
-    time_range: {
-        earliest: string;
-        latest: string;
-    } | null;
-}
-interface QueryTracesPage {
-    traces: TraceAnalystTraceSummary[];
-    total: number;
-    has_more: boolean;
-}
-/** Full-trace view. When the response would exceed the per-call byte
- *  budget, `oversized` is populated INSTEAD of `spans` so the agent
- *  knows to switch to `searchTrace` / `viewSpans`. */
-interface ViewTraceResult {
-    trace_id: string;
-    spans?: TraceAnalystSpan[];
-    oversized?: ViewTraceOversized;
-}
-interface ViewTraceOversized {
-    span_count: number;
-    /** Names with their counts, sorted desc. Capped at 20 entries. */
-    top_span_names: Array<[string, number]>;
-    /** Largest single span body (bytes after attribute-cap projection). */
-    span_response_bytes_max: number;
-    error_span_count: number;
-}
-interface ViewSpansResult {
-    trace_id: string;
-    spans: TraceAnalystSpan[];
-    /** Requested span ids that were not found in the trace. */
-    missing_span_ids: string[];
-    /** Number of attribute fields truncated to fit the per-attribute cap. */
-    truncated_attribute_count: number;
-}
-interface SpanMatchRecord {
-    trace_id: string;
-    span_id: string;
-    span_name: string;
-    span_kind: TraceAnalystSpanKind;
-    /** JSON pointer-style path to the matched value, e.g.
-     *  `attributes."llm.input_messages"[2].content`. */
-    attribute_path: string;
-    matched_text: string;
-    context_before: string;
-    context_after: string;
-    match_offset: number;
-}
-interface SearchTraceResult {
-    trace_id: string;
-    hits: SpanMatchRecord[];
-    total_matches: number;
-    has_more: boolean;
-}
-interface SearchSpanResult {
-    trace_id: string;
-    span_id: string;
-    hits: SpanMatchRecord[];
-    total_matches: number;
-    has_more: boolean;
-}
-/** Tunable byte budgets for bounded RLM tool output. */
-interface TraceAnalystByteBudgets {
-    /** Max bytes any single tool response may emit. Hard ceiling enforced
-     *  by the store; oversized → summary. Default 150_000. */
-    perCallByteCeiling: number;
-    /** Per-attribute string truncation cap on `viewTrace` (discovery scan).
-     *  Default 4096. */
-    perAttributeViewBudget: number;
-    /** Per-attribute string truncation cap on `viewSpans` (surgical reads).
-     *  Default 16384. */
-    perAttributeSpanBudget: number;
-    /** Per-attribute cap on a single match record's `matched_text` and
-     *  context window. Default 1024. */
-    perMatchTextBudget: number;
-}
-declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
-/** Marker substituted in place of truncated string payloads. Callers
- *  parsing tool output can detect it deterministically. */
-declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
-/**
- * `TraceAnalysisStore` — read-side interface the trace-analyst calls
- * through. Seven operations, all bounded:
- *
- *   - `getOverview(filters?)` — dataset rollup + sample trace ids.
- *   - `queryTraces(filters?, limit, offset)` — paginated summaries.
- *   - `countTraces(filters?)` — cheap count without materialisation.
- *   - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
- *   - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
- *   - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
- *   - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
- *
- * Multiple implementations ship in the core (`OtlpFileTraceStore`).
- * Downstream callers can supply their own — e.g. a DuckDB-backed
- * adapter or an in-memory adapter for tests — by implementing this
- * interface.
- *
- * Filters compose with AND semantics. Empty/undefined fields impose
- * no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
- * implementations may skip it via `count`/`overview` when not set.
- */
-interface TraceAnalysisStore {
-    getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
-    queryTraces(opts: {
-        filters?: TraceAnalystFilters;
-        limit: number;
-        offset?: number;
-    }): Promise<QueryTracesPage>;
-    countTraces(filters?: TraceAnalystFilters): Promise<number>;
-    viewTrace(opts: {
-        trace_id: string;
-        /** Override per-attribute byte cap. Defaults to discovery budget. */
-        per_attribute_byte_cap?: number;
-    }): Promise<ViewTraceResult>;
-    viewSpans(opts: {
-        trace_id: string;
-        span_ids: readonly string[];
-        /** Override per-attribute byte cap. Defaults to surgical budget. */
-        per_attribute_byte_cap?: number;
-    }): Promise<ViewSpansResult>;
-    searchTrace(opts: {
-        trace_id: string;
-        regex_pattern: string;
-        /** Hard cap on matches returned. Default 50. */
-        max_matches?: number;
-    }): Promise<SearchTraceResult>;
-    searchSpan(opts: {
-        trace_id: string;
-        span_id: string;
-        regex_pattern: string;
-        max_matches?: number;
-    }): Promise<SearchSpanResult>;
-}
 interface AnalyzeTracesInput {
     /** The user-facing question. Domain framing belongs here, not in the
      *  actor description. */
@@ -887,4 +670,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
     spanId?: string;
 }): AsyncGenerator<ReplayCacheEntry>;
-export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
+export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, SearchSpanResult, SearchTraceResult, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalysisStore, TraceAnalystFilters, type TraceAnalystHookOptions, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, ViewSpansResult, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };

package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} RENAMED Viewed

@@ -372,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
     scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
 }
-export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
+export type { CodeSurface as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeScore as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, JudgeConfig as a, DispatchContext as b, LabeledScenarioWrite as c, LabeledScenarioSampleArgs as d, LabeledScenarioRecord as e, CampaignAggregates as f, CampaignArtifactWriter as g, CampaignCellResult as h, CampaignCostMeter as i, CampaignResult as j, CampaignTraceWriter as k, GateContext as l, GateDecision as m, GateResult as n, GenerationCandidate as o, GenerationRecord as p, JudgeAggregate as q, JudgeDimension as r, LabeledScenarioSource as s, Mutator as t, ScenarioAggregate as u, SessionScript as v };

package/dist/types-DhqpAi_z.d.ts ADDED Viewed

@@ -0,0 +1,296 @@
+import { TCloud } from '@tangle-network/tcloud';
+interface Scenario {
+    id: string;
+    persona: string;
+    label: string;
+    thesis: string;
+    dimensions: string[];
+    turns: Turn[];
+    artifactChecks: ArtifactCheck[];
+    systemPromptAppend?: string;
+}
+interface Turn {
+    user: string;
+    expectedBehaviors: string[];
+    adversarial?: boolean;
+    feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation';
+}
+interface ArtifactCheck {
+    type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string;
+    target: string;
+    contains?: string;
+    minCount?: number;
+    description: string;
+}
+interface JudgeConfig {
+    model: string;
+    temperature: number;
+    rubric: JudgeRubric;
+}
+interface JudgeRubric {
+    name: string;
+    description: string;
+    dimensions: RubricDimension[];
+}
+interface RubricDimension {
+    name: string;
+    description: string;
+    anchor_low: string;
+    anchor_high: string;
+    weight: number;
+}
+interface ScenarioResult {
+    scenarioId: string;
+    persona: string;
+    turns: TurnResult[];
+    artifactResults: ArtifactResult[];
+    judgeScores: JudgeScore[];
+    judgeErrors: number;
+    overallScore: number;
+    totalDurationMs: number;
+    artifacts: CollectedArtifacts;
+}
+interface TurnResult {
+    turnIndex: number;
+    userMessage: string;
+    agentResponse: string;
+    durationMs: number;
+    blocksExtracted: {
+        type: string;
+        title: string;
+    }[];
+    containsCode: boolean;
+    containsToolCall: boolean;
+}
+interface ArtifactResult {
+    check: ArtifactCheck;
+    passed: boolean;
+    detail?: string;
+}
+interface JudgeScore {
+    judgeName: string;
+    dimension: string;
+    score: number;
+    reasoning: string;
+    evidence?: string;
+}
+interface CollectedArtifacts {
+    vaultFiles: {
+        path: string;
+        content: string;
+    }[];
+    blocksExtracted: {
+        type: string;
+        fields: Record<string, string>;
+    }[];
+    codeBlocks: {
+        language: string;
+        code: string;
+    }[];
+    toolCalls: string[];
+}
+interface BenchmarkReport {
+    timestamp: string;
+    generation: number;
+    promptVersion: string;
+    scenarioCount: number;
+    results: ScenarioResult[];
+    summary: {
+        overallAvg: number;
+        byPersona: Record<string, {
+            avg: number;
+            passed: number;
+            total: number;
+        }>;
+        byDimension: Record<string, {
+            avg: number;
+            scores: number[];
+        }>;
+        weakest: {
+            scenario: string;
+            score: number;
+            reason: string;
+        }[];
+        strongest: {
+            scenario: string;
+            score: number;
+            reason: string;
+        }[];
+    };
+}
+interface RouteMap {
+    signup?: string;
+    login?: string;
+    workspaces?: string;
+    threads?: string;
+    chat?: string;
+    tasks?: string;
+    events?: string;
+    approvals?: string;
+    vault?: string;
+    generations?: string;
+    [key: string]: string | undefined;
+}
+interface ProductClientConfig {
+    baseUrl: string;
+    routes: RouteMap;
+}
+interface ScenarioFile {
+    id: string;
+    category: string;
+    persona: string;
+    label: string;
+    thesis: string;
+    isControl?: boolean;
+    rubric?: {
+        dimensions: {
+            name: string;
+            description: string;
+            weight: number;
+        }[];
+    };
+    turns: Turn[];
+    artifactChecks: ArtifactCheck[];
+}
+interface CompletionCriterion {
+    name: string;
+    check: (state: DriverState) => boolean;
+    progress?: (state: DriverState) => number;
+}
+interface FeedbackPattern {
+    trigger: string;
+    response: string;
+}
+/**
+ * How hard the simulated user pushes back. The driver LLM scales its tone
+ * and follow-up aggression to this:
+ *   cooperative — forgiving early adopter; accepts reasonable answers.
+ *   demanding   — experienced professional; rejects vague or hedged answers.
+ *   relentless  — senior partner reviewing for a client who will litigate;
+ *                 interrogates every claim, accepts nothing undefended.
+ */
+type PersonaRigor = 'cooperative' | 'demanding' | 'relentless';
+interface PersonaConfig {
+    id: string;
+    role: string;
+    goal: string;
+    completionCriteria: CompletionCriterion[];
+    feedbackPatterns?: FeedbackPattern[];
+    maxTurns: number;
+    driverModel?: string;
+    /** How adversarial the simulated user is. Defaults to 'demanding'. */
+    rigor?: PersonaRigor;
+    /**
+     * Domain expertise the simulated user holds — quoted into the driver
+     * prompt so it challenges the agent with authority instead of vague
+     * dissatisfaction. e.g. "a 15-year M&A partner who knows GAAP
+     * working-capital mechanics cold".
+     */
+    expertise?: string;
+    /**
+     * Substantive issues a senior professional in this role would
+     * interrogate — traps the scenario hides, claims that must be defended.
+     * The driver probes these without revealing them verbatim; the agent
+     * must surface them on its own.
+     */
+    pressurePoints?: string[];
+    /**
+     * Curveballs the driver may inject once the agent is coasting — changed
+     * facts, a hostile counterparty position, a new constraint. Forces the
+     * agent to re-derive rather than recite.
+     */
+    curveballs?: string[];
+}
+interface DriverState {
+    tasks: number;
+    events: number;
+    proposals: {
+        pending: number;
+        approved: number;
+        rejected: number;
+    };
+    vaultFiles: string[];
+    codeBlocks: number;
+    generations: number;
+}
+interface TurnMetrics {
+    turn: number;
+    timestamp: string;
+    tasks: number;
+    events: number;
+    proposals: {
+        pending: number;
+        approved: number;
+        rejected: number;
+    };
+    vaultFiles: number;
+    responseLatencyMs: number;
+    responseChars: number;
+    codeBlocksProduced: number;
+    blocksExtracted: number;
+    qualityScore?: number;
+    inputTokens: number;
+    outputTokens: number;
+    estimatedCostUsd: number;
+    totalCostUsd: number;
+    completionPercent: number;
+}
+interface DriverResult {
+    personaId: string;
+    /** True when the simulated user professionally signed off (driver said DONE). */
+    completed: boolean;
+    /** Turn at which the simulated user signed off, or null if it never did. */
+    turnsToCompletion: number | null;
+    /**
+     * Turn at which nominal completionCriteria were first all met, or null.
+     * Distinct from turnsToCompletion: criteria can be met while the
+     * simulated professional is still unsatisfied with the work's rigor.
+     */
+    criteriaMetAtTurn: number | null;
+    totalTurns: number;
+    metrics: TurnMetrics[];
+    finalState: DriverState;
+    convergenceCurve: number[];
+    totalCostUsd: number;
+    finalQualityScore: number | null;
+}
+interface BenchmarkRunnerConfig {
+    scenarios: Scenario[];
+    judges: JudgeFn[];
+    systemPrompt: string;
+    model?: string;
+    judgeModel?: string;
+    passThreshold?: number;
+    generation?: number;
+    promptVersion?: string;
+}
+interface JudgeInput {
+    scenario: Scenario;
+    turns: TurnResult[];
+    artifacts: CollectedArtifacts;
+}
+type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>;
+interface TestResult {
+    name: string;
+    passed: boolean;
+    duration: number;
+    detail?: string;
+    checks: CheckResult[];
+}
+interface CheckResult {
+    name: string;
+    passed: boolean;
+    expected: string;
+    actual: string;
+}
+interface EvalResult {
+    scenario: string;
+    status: 'pass' | 'fail' | 'skip';
+    duration: number;
+    detail?: string;
+    artifact?: string;
+}
+export type { ArtifactCheck as A, BenchmarkRunnerConfig as B, CheckResult as C, DriverResult as D, EvalResult as E, FeedbackPattern as F, JudgeInput as J, ProductClientConfig as P, RouteMap as R, Scenario as S, TestResult as T, JudgeScore as a, JudgeFn as b, BenchmarkReport as c, PersonaConfig as d, DriverState as e, CollectedArtifacts as f, ScenarioResult as g, TurnMetrics as h, ScenarioFile as i, CompletionCriterion as j, ArtifactResult as k, JudgeConfig as l, JudgeRubric as m, PersonaRigor as n, RubricDimension as o, Turn as p, TurnResult as q };

package/docs/adapters-observability.md CHANGED Viewed

@@ -35,7 +35,7 @@ it*. Unified at the trace level, you see both as one timeline per cell.
 - Compose: register TraceAI's instrumentations on the global tracer
   provider, then either point both at your OTLP collector or at
   TraceAI's hosted backend if you want their UI.
-- **Or use the bridge: `@tangle-network/agent-eval/adapters/traceai`.**
+- **Or use the bridge: `@tangle-network/agent-eval/adapters/otel`.**
   Forwards finished OTel spans (`ReadableSpan` shape) directly into the
   hosted-tier ingest, lifting `tangle.runId` / `tangle.scenarioId` /
   `tangle.cellId` / `tangle.generation` to first-class wire fields so
@@ -43,10 +43,10 @@ it*. Unified at the trace level, you see both as one timeline per cell.
   at the substrate; consumers pass spans from their own OTel SDK.
   ```ts
   import { createHostedClient } from '@tangle-network/agent-eval/hosted'
-  import { createTraceAiBridge } from '@tangle-network/agent-eval/adapters/traceai'
+  import { createOtelBridge } from '@tangle-network/agent-eval/adapters/otel'
   const client = createHostedClient({ endpoint, apiKey, tenantId })
-  const bridge = createTraceAiBridge({ client, defaultRunId: substrateRunId })
+  const bridge = createOtelBridge({ client, defaultRunId: substrateRunId })
   processor.onEnd = (span) => { void bridge.ingest([span]) }
   // ...or call `bridge.ingest(batch)` from a SpanProcessor.onShutdown.
   ```

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.48.0",
+  "version": "0.50.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -119,10 +119,10 @@
       "import": "./dist/adapters/http.js",
       "default": "./dist/adapters/http.js"
     },
-    "./adapters/traceai": {
-      "types": "./dist/adapters/traceai.d.ts",
-      "import": "./dist/adapters/traceai.js",
-      "default": "./dist/adapters/traceai.js"
+    "./adapters/otel": {
+      "types": "./dist/adapters/otel.d.ts",
+      "import": "./dist/adapters/otel.js",
+      "default": "./dist/adapters/otel.js"
     },
     "./hosted": {
       "types": "./dist/hosted/index.d.ts",

package/dist/adapters/traceai.js.map DELETED Viewed

@@ -1 +0,0 @@

- {"version":3,"sources":["../../src/adapters/traceai.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/traceai` — OTel→hosted bridge.\n *\n * Forwards OpenTelemetry-shaped spans (from `future-agi/traceai`, from the\n * OTel SDK directly, or from any library that emits OTel `ReadableSpan`s)\n * into the hosted-tier ingest endpoint via `createHostedClient`.\n *\n * **Why this exists:** future-agi ships the strongest OTel-native\n * instrumentation library in the TypeScript-agent ecosystem. Partners using\n * traceai for tracing should be able to plug it into Tangle Intelligence\n * with one config line — not rebuild OTel emission from scratch. Adapter\n * shape applies equally to any OTel SpanProcessor pipeline.\n *\n * **Pattern:**\n *\n * ```ts\n * import { createHostedClient } from '@tangle-network/agent-eval/hosted'\n * import { createTraceAiBridge } from '@tangle-network/agent-eval/adapters/traceai'\n *\n * const client = createHostedClient({ endpoint, apiKey, tenantId })\n * const bridge = createTraceAiBridge({ client, defaultRunId: substrateRunId })\n *\n * // Wherever your OTel SpanProcessor hands you a finished span:\n * processor.onEnd = (span) => bridge.ingest([span])\n * // …or in a SpanProcessor.onShutdown / batch flush:\n * await bridge.ingest(batchedSpans)\n * ```\n *\n * No `@opentelemetry/*` dependency is declared here — the adapter accepts\n * a structurally-typed `OtelLikeSpan`. This keeps the substrate dep graph\n * lean while remaining compatible with OTel SDK `ReadableSpan` instances\n * and with traceai's emitted spans. If a consumer's span shape differs\n * (e.g. 
`parentSpanId` as a top-level field rather than via\n * `parentSpanContext()`), the adapter accepts both forms.\n */\n\nimport type { HostedClient } from '../hosted/client'\nimport type { TraceSpanEvent } from '../hosted/types'\n\n// ── OTel-compatible structural types ─────────────────────────────────\n\n/**\n * `[seconds, nanoseconds]` — the OTel SDK's `HrTime` shape. Spans emitted\n * by the OTel SDK carry timestamps in this representation; we convert to\n * a single unix-nano number for the wire format.\n */\nexport type HrTime = [number, number]\n\n/** Standard OTel `SpanStatusCode` numeric values: 0 = UNSET, 1 = OK, 2 = ERROR. */\nexport const OTEL_STATUS_UNSET = 0\nexport const OTEL_STATUS_OK = 1\nexport const OTEL_STATUS_ERROR = 2\n\nexport type OtelAttributeValue = string | number | boolean | null | undefined\n\n/**\n * Structural surface compatible with `@opentelemetry/sdk-trace-base`'s\n * `ReadableSpan`. Consumers pass instances they get from their OTel SDK\n * (or from `future-agi/traceai`, which produces spans of this shape).\n */\nexport interface OtelLikeSpan {\n spanContext: () => { traceId: string; spanId: string; traceFlags?: number }\n /** Set on the span itself by some SDKs (legacy / OTLP-shape). Some SDKs\n * expose the parent via `parentSpanContext()` instead — the adapter\n * checks both. */\n parentSpanId?: string\n parentSpanContext?: () => { spanId: string } | undefined\n name: string\n startTime: HrTime\n endTime: HrTime\n attributes: Record<string, OtelAttributeValue>\n events?: Array<{\n name: string\n time: HrTime\n attributes?: Record<string, OtelAttributeValue>\n }>\n status?: { code: number; message?: string }\n}\n\n// ── Conversion ───────────────────────────────────────────────────────\n\n/** `[seconds, nanoseconds]` → unix-nano number. 
*/\nexport function hrTimeToUnixNano(hr: HrTime): number {\n const [seconds, nanos] = hr\n return seconds * 1_000_000_000 + nanos\n}\n\nfunction statusCodeName(code: number | undefined): 'OK' | 'ERROR' | 'UNSET' {\n if (code === OTEL_STATUS_OK) return 'OK'\n if (code === OTEL_STATUS_ERROR) return 'ERROR'\n return 'UNSET'\n}\n\n/** Drop null/undefined attribute values; keep string/number/boolean. */\nfunction cleanAttributes(\n attrs: Record<string, OtelAttributeValue> | undefined,\n): Record<string, string | number | boolean> {\n const out: Record<string, string | number | boolean> = {}\n if (!attrs) return out\n for (const [k, v] of Object.entries(attrs)) {\n if (v === null || v === undefined) continue\n if (typeof v === 'string' || typeof v === 'number' || typeof v === 'boolean') {\n out[k] = v\n }\n }\n return out\n}\n\nfunction readPivotString(\n attrs: Record<string, OtelAttributeValue>,\n key: string,\n): string | undefined {\n const v = attrs[key]\n return typeof v === 'string' ? v : undefined\n}\n\nfunction readPivotNumber(\n attrs: Record<string, OtelAttributeValue>,\n key: string,\n): number | undefined {\n const v = attrs[key]\n return typeof v === 'number' ? v : undefined\n}\n\nfunction resolveParentSpanId(span: OtelLikeSpan): string | undefined {\n if (span.parentSpanId) return span.parentSpanId\n const ctx = span.parentSpanContext?.()\n return ctx?.spanId\n}\n\n// ── Bridge ───────────────────────────────────────────────────────────\n\nexport interface TraceAiBridgeOptions {\n /** Hosted client to forward spans to. */\n client: HostedClient\n /** When set, spans missing a `tangle.runId` attribute receive this value\n * on the way out. Useful when the OTel emitter doesn't know which\n * substrate run it's serving. */\n defaultRunId?: string\n /** Max spans per ingest call. Default 200. The hosted ingest endpoint\n * caps at 5000 per call; we batch smaller by default to keep individual\n * retries cheap. 
*/\n batchSize?: number\n /** Called when a batch fails to ingest. Defaults to a console.warn. Hook\n * this when you need backpressure or to spill to a fallback. */\n onError?: (err: unknown, batch: TraceSpanEvent[]) => void | Promise<void>\n}\n\nexport interface TraceAiBridge {\n /** Convert + ingest a batch of OTel-shape spans. */\n ingest(spans: OtelLikeSpan[]): Promise<void>\n /** Convert one OTel span to the wire-format event. Useful for tests or\n * custom batching pipelines. */\n spanToEvent(span: OtelLikeSpan): TraceSpanEvent\n}\n\nexport function createTraceAiBridge(opts: TraceAiBridgeOptions): TraceAiBridge {\n const batchSize = opts.batchSize ?? 200\n const onError =\n opts.onError ??\n ((err) => {\n console.warn('[traceai-bridge] ingest batch failed:', err)\n })\n\n function convert(span: OtelLikeSpan): TraceSpanEvent {\n const ctx = span.spanContext()\n const attributes = cleanAttributes(span.attributes)\n // Pull pivot attributes off the cleaned attribute map so they round-trip\n // through the wire format's first-class fields. They REMAIN in\n // `attributes` as well so downstream OTel viewers see the same values.\n const runId = readPivotString(attributes, 'tangle.runId') ?? 
opts.defaultRunId\n const scenarioId = readPivotString(attributes, 'tangle.scenarioId')\n const cellId = readPivotString(attributes, 'tangle.cellId')\n const generation = readPivotNumber(attributes, 'tangle.generation')\n\n if (runId && !attributes['tangle.runId']) {\n attributes['tangle.runId'] = runId\n }\n\n const event: TraceSpanEvent = {\n traceId: ctx.traceId,\n spanId: ctx.spanId,\n name: span.name,\n startTimeUnixNano: hrTimeToUnixNano(span.startTime),\n endTimeUnixNano: hrTimeToUnixNano(span.endTime),\n attributes,\n }\n const parentSpanId = resolveParentSpanId(span)\n if (parentSpanId) event.parentSpanId = parentSpanId\n if (span.events && span.events.length > 0) {\n event.events = span.events.map((e) => {\n const eventAttrs = cleanAttributes(e.attributes)\n const node: {\n timeUnixNano: number\n name: string\n attributes?: Record<string, string | number | boolean>\n } = {\n timeUnixNano: hrTimeToUnixNano(e.time),\n name: e.name,\n }\n if (Object.keys(eventAttrs).length > 0) node.attributes = eventAttrs\n return node\n })\n }\n if (span.status) {\n event.status = { code: statusCodeName(span.status.code), message: span.status.message }\n }\n if (runId) event['tangle.runId'] = runId\n if (scenarioId) event['tangle.scenarioId'] = scenarioId\n if (cellId) event['tangle.cellId'] = cellId\n if (generation !== undefined) event['tangle.generation'] = generation\n return event\n }\n\n async function ingest(spans: OtelLikeSpan[]): Promise<void> {\n if (spans.length === 0) return\n const events = spans.map(convert)\n for (let i = 0; i < events.length; i += batchSize) {\n const batch = events.slice(i, i + batchSize)\n try {\n await opts.client.ingestTraces(batch)\n } catch (err) {\n await onError(err, batch)\n }\n }\n }\n\n return { ingest, spanToEvent: convert 
}\n}\n"],"mappings":";;;AAiDO,IAAM,oBAAoB;AAC1B,IAAM,iBAAiB;AACvB,IAAM,oBAAoB;AA+B1B,SAAS,iBAAiB,IAAoB;AACnD,QAAM,CAAC,SAAS,KAAK,IAAI;AACzB,SAAO,UAAU,MAAgB;AACnC;AAEA,SAAS,eAAe,MAAoD;AAC1E,MAAI,SAAS,eAAgB,QAAO;AACpC,MAAI,SAAS,kBAAmB,QAAO;AACvC,SAAO;AACT;AAGA,SAAS,gBACP,OAC2C;AAC3C,QAAM,MAAiD,CAAC;AACxD,MAAI,CAAC,MAAO,QAAO;AACnB,aAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,KAAK,GAAG;AAC1C,QAAI,MAAM,QAAQ,MAAM,OAAW;AACnC,QAAI,OAAO,MAAM,YAAY,OAAO,MAAM,YAAY,OAAO,MAAM,WAAW;AAC5E,UAAI,CAAC,IAAI;AAAA,IACX;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,gBACP,OACA,KACoB;AACpB,QAAM,IAAI,MAAM,GAAG;AACnB,SAAO,OAAO,MAAM,WAAW,IAAI;AACrC;AAEA,SAAS,gBACP,OACA,KACoB;AACpB,QAAM,IAAI,MAAM,GAAG;AACnB,SAAO,OAAO,MAAM,WAAW,IAAI;AACrC;AAEA,SAAS,oBAAoB,MAAwC;AACnE,MAAI,KAAK,aAAc,QAAO,KAAK;AACnC,QAAM,MAAM,KAAK,oBAAoB;AACrC,SAAO,KAAK;AACd;AA4BO,SAAS,oBAAoB,MAA2C;AAC7E,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,UACJ,KAAK,YACJ,CAAC,QAAQ;AACR,YAAQ,KAAK,yCAAyC,GAAG;AAAA,EAC3D;AAEF,WAAS,QAAQ,MAAoC;AACnD,UAAM,MAAM,KAAK,YAAY;AAC7B,UAAM,aAAa,gBAAgB,KAAK,UAAU;AAIlD,UAAM,QAAQ,gBAAgB,YAAY,cAAc,KAAK,KAAK;AAClE,UAAM,aAAa,gBAAgB,YAAY,mBAAmB;AAClE,UAAM,SAAS,gBAAgB,YAAY,eAAe;AAC1D,UAAM,aAAa,gBAAgB,YAAY,mBAAmB;AAElE,QAAI,SAAS,CAAC,WAAW,cAAc,GAAG;AACxC,iBAAW,cAAc,IAAI;AAAA,IAC/B;AAEA,UAAM,QAAwB;AAAA,MAC5B,SAAS,IAAI;AAAA,MACb,QAAQ,IAAI;AAAA,MACZ,MAAM,KAAK;AAAA,MACX,mBAAmB,iBAAiB,KAAK,SAAS;AAAA,MAClD,iBAAiB,iBAAiB,KAAK,OAAO;AAAA,MAC9C;AAAA,IACF;AACA,UAAM,eAAe,oBAAoB,IAAI;AAC7C,QAAI,aAAc,OAAM,eAAe;AACvC,QAAI,KAAK,UAAU,KAAK,OAAO,SAAS,GAAG;AACzC,YAAM,SAAS,KAAK,OAAO,IAAI,CAAC,MAAM;AACpC,cAAM,aAAa,gBAAgB,EAAE,UAAU;AAC/C,cAAM,OAIF;AAAA,UACF,cAAc,iBAAiB,EAAE,IAAI;AAAA,UACrC,MAAM,EAAE;AAAA,QACV;AACA,YAAI,OAAO,KAAK,UAAU,EAAE,SAAS,EAAG,MAAK,aAAa;AAC1D,eAAO;AAAA,MACT,CAAC;AAAA,IACH;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,SAAS,EAAE,MAAM,eAAe,KAAK,OAAO,IAAI,GAAG,SAAS,KAAK,OAAO,QAAQ;AAAA,IACxF;AACA,QAAI,MAAO,OAAM,cAAc,IAAI;AACnC,QAAI,WAAY,OAAM,mBAAmB,IAAI;AAC7C,QAAI,OAAQ,OAAM,eAAe,IAAI;AACrC,QAAI,eAAe,OAAW,OAAM,mBAAmB,IAAI;AAC3D,WAAO;AAAA,EACT;AAEA,iBAAe,OAAO,OAAsC;AAC1
D,QAAI,MAAM,WAAW,EAAG;AACxB,UAAM,SAAS,MAAM,IAAI,OAAO;AAChC,aAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK,WAAW;AACjD,YAAM,QAAQ,OAAO,MAAM,GAAG,IAAI,SAAS;AAC3C,UAAI;AACF,cAAM,KAAK,OAAO,aAAa,KAAK;AAAA,MACtC,SAAS,KAAK;AACZ,cAAM,QAAQ,KAAK,KAAK;AAAA,MAC1B;AAAA,IACF;AAAA,EACF;AAEA,SAAO,EAAE,QAAQ,aAAa,QAAQ;AACxC;","names":[]}