npm - @tangle-network/agent-eval - Versions diffs - 0.49.0 → 0.50.1 - Mend

@tangle-network/agent-eval 0.49.0 → 0.50.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

package/CHANGELOG.md +135 -0
package/README.md +235 -331
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +8 -2
package/dist/campaign/index.d.ts +3 -3
package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
package/dist/chunk-EGIPWXHL.js.map +1 -0
package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
package/dist/chunk-FQK2CCIM.js.map +1 -0
package/dist/chunk-MAZ26DC7.js +99 -0
package/dist/chunk-MAZ26DC7.js.map +1 -0
package/dist/chunk-SHTXZ4O2.js +113 -0
package/dist/chunk-SHTXZ4O2.js.map +1 -0
package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
package/dist/contract/index.d.ts +206 -9
package/dist/contract/index.js +751 -3
package/dist/contract/index.js.map +1 -1
package/dist/governance/index.d.ts +1 -1
package/dist/hosted/index.d.ts +8 -192
package/dist/hosted/index.js +1 -1
package/dist/index-BRxz6qov.d.ts +409 -0
package/dist/index.d.ts +18 -462
package/dist/index.js +14 -106
package/dist/index.js.map +1 -1
package/dist/meta-eval/index.d.ts +3 -3
package/dist/openapi.json +1 -1
package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
package/dist/registry-8KAs18kY.d.ts +457 -0
package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
package/dist/reporting.d.ts +6 -4
package/dist/reporting.js +6 -4
package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
package/dist/rl.d.ts +9 -8
package/dist/rl.js +3 -2
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
package/dist/sequential-5iSVfzl2.d.ts +139 -0
package/dist/store-CJbzDxZ2.d.ts +220 -0
package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
package/dist/traces.d.ts +3 -220
package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
package/dist/types-DhqpAi_z.d.ts +296 -0
package/docs/concepts.md +20 -0
package/docs/customer-journeys.md +208 -0
package/docs/insight-report.md +337 -0
package/package.json +1 -1
package/dist/chunk-MNL6LXGQ.js.map +0 -1
package/dist/chunk-OYI6RZJK.js.map +0 -1
/package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
/package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0

package/dist/traces.d.ts CHANGED Viewed

@@ -8,6 +8,8 @@ import { T as TraceStore } from './store-Db2Bv8Cf.js';
 export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
 export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
+import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
+export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
 /**
  * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
@@ -189,225 +191,6 @@ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: R
     report: RedactionReport;
 };
-/**
- * Shared types for the trace-analyst module.
- *
- * Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
- * line per span, OTLP-shaped. We do NOT depend on a specific tracing
- * vendor at the type level. Adapter
- * layers map upstream shapes onto this interface.
- *
- * Design constraint. Every read operation that can return arbitrary
- * payload must carry a byte budget so the agent's tool result stays
- * bounded regardless of input trace size. Oversized responses
- * substitute a deterministic summary instead of bytes — see
- * `ViewTraceOversized`.
- */
-/** OTLP span kind (subset we actually use). */
-type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
-type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
-/** Subset of OTLP span fields the analyst exposes to the agent. The
- *  store's job is to project upstream's full span shape down to this
- *  view — the analyst never sees vendor extensions directly. */
-interface TraceAnalystSpan {
-    trace_id: string;
-    span_id: string;
-    parent_span_id: string | null;
-    name: string;
-    kind: TraceAnalystSpanKind;
-    start_time: string;
-    end_time: string;
-    duration_ms: number;
-    status: TraceAnalystSpanStatus;
-    status_message?: string;
-    service_name: string | null;
-    agent_name: string | null;
-    model_name: string | null;
-    tool_name: string | null;
-    /** Raw JSON-serialisable attribute map. May contain large strings;
-     *  callers must respect the per-attribute byte cap. */
-    attributes: Record<string, unknown>;
-}
-interface TraceAnalystTraceSummary {
-    trace_id: string;
-    service_name: string | null;
-    agent_name: string | null;
-    span_count: number;
-    has_errors: boolean;
-    start_time: string;
-    end_time: string;
-    duration_ms: number;
-    raw_jsonl_bytes: number;
-    models: string[];
-    tools: string[];
-}
-interface TraceAnalystFilters {
-    /** Restrict to traces that contain at least one error span. */
-    has_errors?: boolean;
-    /** Match if any span's `service.name` is in this list. */
-    service_names?: string[];
-    /** Match if any span's `agent.name` is in this list. */
-    agent_names?: string[];
-    /** Match if any LLM span's `llm.model_name` is in this list. */
-    model_names?: string[];
-    /** Match if any tool span's `tool.name` is in this list. */
-    tool_names?: string[];
-    /** ISO-8601 lower bound on the trace's earliest start time. */
-    start_time_after?: string;
-    /** ISO-8601 upper bound on the trace's earliest start time. */
-    start_time_before?: string;
-    /** Single regex applied to raw JSONL bytes for the trace. Opt-in;
-     *  expensive on large datasets. Use the indexed filters above first. */
-    regex_pattern?: string;
-}
-interface DatasetOverview {
-    total_traces: number;
-    raw_jsonl_bytes: number;
-    services: string[];
-    agents: string[];
-    models: string[];
-    tool_names: string[];
-    /** Up to 20 real trace ids the agent may pass to view/search tools. */
-    sample_trace_ids: string[];
-    errors: {
-        trace_count: number;
-        span_count: number;
-    };
-    time_range: {
-        earliest: string;
-        latest: string;
-    } | null;
-}
-interface QueryTracesPage {
-    traces: TraceAnalystTraceSummary[];
-    total: number;
-    has_more: boolean;
-}
-/** Full-trace view. When the response would exceed the per-call byte
- *  budget, `oversized` is populated INSTEAD of `spans` so the agent
- *  knows to switch to `searchTrace` / `viewSpans`. */
-interface ViewTraceResult {
-    trace_id: string;
-    spans?: TraceAnalystSpan[];
-    oversized?: ViewTraceOversized;
-}
-interface ViewTraceOversized {
-    span_count: number;
-    /** Names with their counts, sorted desc. Capped at 20 entries. */
-    top_span_names: Array<[string, number]>;
-    /** Largest single span body (bytes after attribute-cap projection). */
-    span_response_bytes_max: number;
-    error_span_count: number;
-}
-interface ViewSpansResult {
-    trace_id: string;
-    spans: TraceAnalystSpan[];
-    /** Requested span ids that were not found in the trace. */
-    missing_span_ids: string[];
-    /** Number of attribute fields truncated to fit the per-attribute cap. */
-    truncated_attribute_count: number;
-}
-interface SpanMatchRecord {
-    trace_id: string;
-    span_id: string;
-    span_name: string;
-    span_kind: TraceAnalystSpanKind;
-    /** JSON pointer-style path to the matched value, e.g.
-     *  `attributes."llm.input_messages"[2].content`. */
-    attribute_path: string;
-    matched_text: string;
-    context_before: string;
-    context_after: string;
-    match_offset: number;
-}
-interface SearchTraceResult {
-    trace_id: string;
-    hits: SpanMatchRecord[];
-    total_matches: number;
-    has_more: boolean;
-}
-interface SearchSpanResult {
-    trace_id: string;
-    span_id: string;
-    hits: SpanMatchRecord[];
-    total_matches: number;
-    has_more: boolean;
-}
-/** Tunable byte budgets for bounded RLM tool output. */
-interface TraceAnalystByteBudgets {
-    /** Max bytes any single tool response may emit. Hard ceiling enforced
-     *  by the store; oversized → summary. Default 150_000. */
-    perCallByteCeiling: number;
-    /** Per-attribute string truncation cap on `viewTrace` (discovery scan).
-     *  Default 4096. */
-    perAttributeViewBudget: number;
-    /** Per-attribute string truncation cap on `viewSpans` (surgical reads).
-     *  Default 16384. */
-    perAttributeSpanBudget: number;
-    /** Per-attribute cap on a single match record's `matched_text` and
-     *  context window. Default 1024. */
-    perMatchTextBudget: number;
-}
-declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
-/** Marker substituted in place of truncated string payloads. Callers
- *  parsing tool output can detect it deterministically. */
-declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
-/**
- * `TraceAnalysisStore` — read-side interface the trace-analyst calls
- * through. Seven operations, all bounded:
- *
- *   - `getOverview(filters?)` — dataset rollup + sample trace ids.
- *   - `queryTraces(filters?, limit, offset)` — paginated summaries.
- *   - `countTraces(filters?)` — cheap count without materialisation.
- *   - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
- *   - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
- *   - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
- *   - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
- *
- * One implementation ships in the core (`OtlpFileTraceStore`).
- * Downstream callers can supply their own — e.g. a DuckDB-backed
- * adapter or an in-memory adapter for tests — by implementing this
- * interface.
- *
- * Filters compose with AND semantics. Empty/undefined fields impose
- * no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
- * implementations may skip it via `count`/`overview` when not set.
- */
-interface TraceAnalysisStore {
-    getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
-    queryTraces(opts: {
-        filters?: TraceAnalystFilters;
-        limit: number;
-        offset?: number;
-    }): Promise<QueryTracesPage>;
-    countTraces(filters?: TraceAnalystFilters): Promise<number>;
-    viewTrace(opts: {
-        trace_id: string;
-        /** Override per-attribute byte cap. Defaults to discovery budget. */
-        per_attribute_byte_cap?: number;
-    }): Promise<ViewTraceResult>;
-    viewSpans(opts: {
-        trace_id: string;
-        span_ids: readonly string[];
-        /** Override per-attribute byte cap. Defaults to surgical budget. */
-        per_attribute_byte_cap?: number;
-    }): Promise<ViewSpansResult>;
-    searchTrace(opts: {
-        trace_id: string;
-        regex_pattern: string;
-        /** Hard cap on matches returned. Default 50. */
-        max_matches?: number;
-    }): Promise<SearchTraceResult>;
-    searchSpan(opts: {
-        trace_id: string;
-        span_id: string;
-        regex_pattern: string;
-        max_matches?: number;
-    }): Promise<SearchSpanResult>;
-}
 interface AnalyzeTracesInput {
     /** The user-facing question. Domain framing belongs here, not in the
      *  actor description. */
@@ -887,4 +670,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
     spanId?: string;
 }): AsyncGenerator<ReplayCacheEntry>;
-export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
+export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, SearchSpanResult, SearchTraceResult, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalysisStore, TraceAnalystFilters, type TraceAnalystHookOptions, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, ViewSpansResult, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };

package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} RENAMED Viewed

@@ -372,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
     scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
 }
-export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
+export type { CodeSurface as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeScore as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, JudgeConfig as a, DispatchContext as b, LabeledScenarioWrite as c, LabeledScenarioSampleArgs as d, LabeledScenarioRecord as e, CampaignAggregates as f, CampaignArtifactWriter as g, CampaignCellResult as h, CampaignCostMeter as i, CampaignResult as j, CampaignTraceWriter as k, GateContext as l, GateDecision as m, GateResult as n, GenerationCandidate as o, GenerationRecord as p, JudgeAggregate as q, JudgeDimension as r, LabeledScenarioSource as s, Mutator as t, ScenarioAggregate as u, SessionScript as v };

package/dist/types-DhqpAi_z.d.ts ADDED Viewed

@@ -0,0 +1,296 @@
+import { TCloud } from '@tangle-network/tcloud';
+interface Scenario {
+    id: string;
+    persona: string;
+    label: string;
+    thesis: string;
+    dimensions: string[];
+    turns: Turn[];
+    artifactChecks: ArtifactCheck[];
+    systemPromptAppend?: string;
+}
+interface Turn {
+    user: string;
+    expectedBehaviors: string[];
+    adversarial?: boolean;
+    feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation';
+}
+interface ArtifactCheck {
+    type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string;
+    target: string;
+    contains?: string;
+    minCount?: number;
+    description: string;
+}
+interface JudgeConfig {
+    model: string;
+    temperature: number;
+    rubric: JudgeRubric;
+}
+interface JudgeRubric {
+    name: string;
+    description: string;
+    dimensions: RubricDimension[];
+}
+interface RubricDimension {
+    name: string;
+    description: string;
+    anchor_low: string;
+    anchor_high: string;
+    weight: number;
+}
+interface ScenarioResult {
+    scenarioId: string;
+    persona: string;
+    turns: TurnResult[];
+    artifactResults: ArtifactResult[];
+    judgeScores: JudgeScore[];
+    judgeErrors: number;
+    overallScore: number;
+    totalDurationMs: number;
+    artifacts: CollectedArtifacts;
+}
+interface TurnResult {
+    turnIndex: number;
+    userMessage: string;
+    agentResponse: string;
+    durationMs: number;
+    blocksExtracted: {
+        type: string;
+        title: string;
+    }[];
+    containsCode: boolean;
+    containsToolCall: boolean;
+}
+interface ArtifactResult {
+    check: ArtifactCheck;
+    passed: boolean;
+    detail?: string;
+}
+interface JudgeScore {
+    judgeName: string;
+    dimension: string;
+    score: number;
+    reasoning: string;
+    evidence?: string;
+}
+interface CollectedArtifacts {
+    vaultFiles: {
+        path: string;
+        content: string;
+    }[];
+    blocksExtracted: {
+        type: string;
+        fields: Record<string, string>;
+    }[];
+    codeBlocks: {
+        language: string;
+        code: string;
+    }[];
+    toolCalls: string[];
+}
+interface BenchmarkReport {
+    timestamp: string;
+    generation: number;
+    promptVersion: string;
+    scenarioCount: number;
+    results: ScenarioResult[];
+    summary: {
+        overallAvg: number;
+        byPersona: Record<string, {
+            avg: number;
+            passed: number;
+            total: number;
+        }>;
+        byDimension: Record<string, {
+            avg: number;
+            scores: number[];
+        }>;
+        weakest: {
+            scenario: string;
+            score: number;
+            reason: string;
+        }[];
+        strongest: {
+            scenario: string;
+            score: number;
+            reason: string;
+        }[];
+    };
+}
+interface RouteMap {
+    signup?: string;
+    login?: string;
+    workspaces?: string;
+    threads?: string;
+    chat?: string;
+    tasks?: string;
+    events?: string;
+    approvals?: string;
+    vault?: string;
+    generations?: string;
+    [key: string]: string | undefined;
+}
+interface ProductClientConfig {
+    baseUrl: string;
+    routes: RouteMap;
+}
+interface ScenarioFile {
+    id: string;
+    category: string;
+    persona: string;
+    label: string;
+    thesis: string;
+    isControl?: boolean;
+    rubric?: {
+        dimensions: {
+            name: string;
+            description: string;
+            weight: number;
+        }[];
+    };
+    turns: Turn[];
+    artifactChecks: ArtifactCheck[];
+}
+interface CompletionCriterion {
+    name: string;
+    check: (state: DriverState) => boolean;
+    progress?: (state: DriverState) => number;
+}
+interface FeedbackPattern {
+    trigger: string;
+    response: string;
+}
+/**
+ * How hard the simulated user pushes back. The driver LLM scales its tone
+ * and follow-up aggression to this:
+ *   cooperative — forgiving early adopter; accepts reasonable answers.
+ *   demanding   — experienced professional; rejects vague or hedged answers.
+ *   relentless  — senior partner reviewing for a client who will litigate;
+ *                 interrogates every claim, accepts nothing undefended.
+ */
+type PersonaRigor = 'cooperative' | 'demanding' | 'relentless';
+interface PersonaConfig {
+    id: string;
+    role: string;
+    goal: string;
+    completionCriteria: CompletionCriterion[];
+    feedbackPatterns?: FeedbackPattern[];
+    maxTurns: number;
+    driverModel?: string;
+    /** How adversarial the simulated user is. Defaults to 'demanding'. */
+    rigor?: PersonaRigor;
+    /**
+     * Domain expertise the simulated user holds — quoted into the driver
+     * prompt so it challenges the agent with authority instead of vague
+     * dissatisfaction. e.g. "a 15-year M&A partner who knows GAAP
+     * working-capital mechanics cold".
+     */
+    expertise?: string;
+    /**
+     * Substantive issues a senior professional in this role would
+     * interrogate — traps the scenario hides, claims that must be defended.
+     * The driver probes these without revealing them verbatim; the agent
+     * must surface them on its own.
+     */
+    pressurePoints?: string[];
+    /**
+     * Curveballs the driver may inject once the agent is coasting — changed
+     * facts, a hostile counterparty position, a new constraint. Forces the
+     * agent to re-derive rather than recite.
+     */
+    curveballs?: string[];
+}
+interface DriverState {
+    tasks: number;
+    events: number;
+    proposals: {
+        pending: number;
+        approved: number;
+        rejected: number;
+    };
+    vaultFiles: string[];
+    codeBlocks: number;
+    generations: number;
+}
+interface TurnMetrics {
+    turn: number;
+    timestamp: string;
+    tasks: number;
+    events: number;
+    proposals: {
+        pending: number;
+        approved: number;
+        rejected: number;
+    };
+    vaultFiles: number;
+    responseLatencyMs: number;
+    responseChars: number;
+    codeBlocksProduced: number;
+    blocksExtracted: number;
+    qualityScore?: number;
+    inputTokens: number;
+    outputTokens: number;
+    estimatedCostUsd: number;
+    totalCostUsd: number;
+    completionPercent: number;
+}
+interface DriverResult {
+    personaId: string;
+    /** True when the simulated user professionally signed off (driver said DONE). */
+    completed: boolean;
+    /** Turn at which the simulated user signed off, or null if it never did. */
+    turnsToCompletion: number | null;
+    /**
+     * Turn at which nominal completionCriteria were first all met, or null.
+     * Distinct from turnsToCompletion: criteria can be met while the
+     * simulated professional is still unsatisfied with the work's rigor.
+     */
+    criteriaMetAtTurn: number | null;
+    totalTurns: number;
+    metrics: TurnMetrics[];
+    finalState: DriverState;
+    convergenceCurve: number[];
+    totalCostUsd: number;
+    finalQualityScore: number | null;
+}
+interface BenchmarkRunnerConfig {
+    scenarios: Scenario[];
+    judges: JudgeFn[];
+    systemPrompt: string;
+    model?: string;
+    judgeModel?: string;
+    passThreshold?: number;
+    generation?: number;
+    promptVersion?: string;
+}
+interface JudgeInput {
+    scenario: Scenario;
+    turns: TurnResult[];
+    artifacts: CollectedArtifacts;
+}
+type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>;
+interface TestResult {
+    name: string;
+    passed: boolean;
+    duration: number;
+    detail?: string;
+    checks: CheckResult[];
+}
+interface CheckResult {
+    name: string;
+    passed: boolean;
+    expected: string;
+    actual: string;
+}
+interface EvalResult {
+    scenario: string;
+    status: 'pass' | 'fail' | 'skip';
+    duration: number;
+    detail?: string;
+    artifact?: string;
+}
+export type { ArtifactCheck as A, BenchmarkRunnerConfig as B, CheckResult as C, DriverResult as D, EvalResult as E, FeedbackPattern as F, JudgeInput as J, ProductClientConfig as P, RouteMap as R, Scenario as S, TestResult as T, JudgeScore as a, JudgeFn as b, BenchmarkReport as c, PersonaConfig as d, DriverState as e, CollectedArtifacts as f, ScenarioResult as g, TurnMetrics as h, ScenarioFile as i, CompletionCriterion as j, ArtifactResult as k, JudgeConfig as l, JudgeRubric as m, PersonaRigor as n, RubricDimension as o, Turn as p, TurnResult as q };

package/docs/concepts.md CHANGED Viewed

@@ -9,6 +9,26 @@ connected, or the answer lacks required sources. The package gives products a
 shared way to record runs, check outcomes, classify failures, compare variants,
 and make release decisions.
+## The three top-level functions
+Everything funnels through `/contract`. Three entries, one shape coming back:
+| Function | When to call it | What you give it | What you get back |
+|---|---|---|---|
+| **`selfImprove()`** | You have a closed loop — scenarios, judge, agent in hand, and you want the substrate to propose better candidates + gate them. | scenarios, agent, judge, baseline surface | `SelfImproveResult.insight: InsightReport` + ship/hold verdict + winner surface |
+| **`analyzeRuns()`** | You have observed runs (production traces, an approve/reject corpus, a CSV gold set) and want the same rigor packet without invoking an agent. | `RunRecord[]` + optional flags | `InsightReport` |
+| **Intake adapters** (`fromFeedbackTable`, `fromOtelSpans`) | Your data isn't already in `RunRecord` shape — it's in Obsidian, Sheets, an OTel collector, etc. | source-specific input | `RunRecord[]` ready to pipe into `analyzeRuns()` |
+The three customer maturity stages — logs only → ratings → closed loop — map exactly to the three functions. See [`customer-journeys.md`](./customer-journeys.md) for the runnable walkthroughs.
+The shape of the answer — `InsightReport` — is identical across all three paths. Distributional summary, paired-bootstrap lift CI, judge stats, inter-rater agreement, cost-quality Pareto, failure clusters, contamination check, outcome correlation, release axes, and a ranked recommendations array. Walked through section-by-section in [`insight-report.md`](./insight-report.md).
+## The layering rule
+`agent-eval` is the **substrate** at the bottom of the Tangle agent stack. `agent-runtime` and `agent-knowledge` depend on it; `agent-eval` MUST NOT import from either. Primitives that "feel like" they belong in a consumer but are actually substrate-shaped (validator verdicts, run records, scenarios, judge scores) live here. Primitives that genuinely require a running agent loop (`ValidationCtx` with iteration + signal + traceEmitter, sandbox `AgentRunSpec`) stay in `agent-runtime`.
+The test: *does this concept make sense WITHOUT a running agent loop?* If yes, it's substrate. If no, it's runtime. The full rule is in [`/CLAUDE.md`](../CLAUDE.md#repo-layering--this-package-is-the-substrate).
 ## Main Objects
 | Thing | What it is | One-line example |