npm - @tangle-network/agent-eval - Versions diffs - 0.72.0 → 0.72.4 - Mend

@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

package/CHANGELOG.md +39 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +518 -9
package/dist/campaign/index.js +672 -22
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +13 -7
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +353 -2496
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-SL55X4VN.js +0 -186
package/dist/chunk-SL55X4VN.js.map +0 -1
package/dist/chunk-UD6EF73X.js.map +0 -1
package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0

package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} RENAMED Viewed

@@ -1,4 +1,3 @@
-import { M as MutableSurface, j as GateDecision } from './types-CnmZ2bkP.js';
 import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-ByiOUrHj.js';
 import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
@@ -290,223 +289,4 @@ interface Recommendation {
     evidencePath?: string;
 }
-/**
- * # Hosted-tier wire format — the schema that EVERY orchestrator (ours,
- * a partner's self-hosted one, a future open implementation) must accept.
- *
- * **Stability:** every type in this file is committed under semver. New
- * minors only ADD optional fields. Breaking changes mean a major bump
- * (`HostedWireVersion` literal increment).
- *
- * The wire format is two event streams in one transport:
- *
- *   1. **Eval-run events** (`POST /v1/ingest/eval-runs`). Posted when a
- *      campaign / improvement-loop completes (or per-generation if
- *      streaming). Carries the structured result + per-cell scores +
- *      surface diffs the orchestrator stores for the dashboard.
- *
- *   2. **Trace spans** (`POST /v1/ingest/traces`). Standard OTLP-shaped
- *      spans with a few additional attributes so the orchestrator can
- *      pivot from eval-run → underlying execution. Compatible with any
- *      OTel collector.
- *
- * Both endpoints are authenticated with a bearer token + a tenant id
- * header. Tenants isolate everything downstream of ingest; no tenant
- * ever sees another tenant's data.
- */
-declare const HOSTED_WIRE_VERSION: "2026-05-26.v1";
-type HostedWireVersion = typeof HOSTED_WIRE_VERSION;
-/** Every ingest request carries these. */
-interface HostedIngestHeaders {
-    /** Bearer token. The orchestrator validates against the tenant key. */
-    authorization: `Bearer ${string}`;
-    /** Stable tenant id (the orchestrator-side primary key for the tenant). */
-    'x-tangle-tenant-id': string;
-    /** Wire-version pin so the server can reject incompatible payloads. */
-    'x-tangle-wire-version': HostedWireVersion;
-    /** Optional idempotency key for retry-safe ingest. */
-    'idempotency-key'?: string;
-}
-/** Lifecycle stages of an eval-run as the substrate reports them. */
-type EvalRunStatus = 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored';
-interface EvalRunCellScore {
-    /** Stable scenario id from the consumer's scenario set. */
-    scenarioId: string;
-    /** Repetition index when reps > 1; 0 for the default. */
-    rep: number;
-    /** Composite score across all judges + dimensions for this cell. */
-    compositeMean: number;
-    /** Per-judge → per-dimension scores; null where the judge did not run. */
-    dimensions: Record<string, Record<string, number>>;
-    /** Per-cell error message if the dispatch threw. Null on success. */
-    errorMessage?: string;
-}
-interface EvalRunGenerationSnapshot {
-    /** Generation index. 0 is baseline. */
-    index: number;
-    /** Candidate surface fingerprint (stable hash) — pivot key into the
-     *  trace stream to fetch the underlying execution. */
-    surfaceHash: string;
-    /** The candidate surface itself. May be omitted to avoid PII when the
-     *  consumer prefers not to ship verbatim prompts. */
-    surface?: MutableSurface;
-    /** Per-cell scores for this generation. */
-    cells: EvalRunCellScore[];
-    /** Aggregate composite mean across all cells in this generation. */
-    compositeMean: number;
-    /** Total $ spent across this generation. */
-    costUsd: number;
-    /** Wall-clock duration of this generation. */
-    durationMs: number;
-}
-/**
- * The top-level eval-run event. One ingest call per logical eval-run;
- * generations stream in incrementally via repeated calls with the same
- * `runId`. The orchestrator deduplicates by `(runId, generation.index)`.
- */
-interface EvalRunEvent {
-    /** Stable run id (the substrate's `runId`). UUID or substrate-generated. */
-    runId: string;
-    /** Where this run was happening — derived from `RunCampaignOptions.runDir`. */
-    runDir: string;
-    /** ISO-8601 timestamp the substrate recorded the event. */
-    timestamp: string;
-    /** Lifecycle stage this event represents. */
-    status: EvalRunStatus;
-    /** Free-form consumer tags (env, branch, model id, etc.). Searchable. */
-    labels: Record<string, string>;
-    /** Baseline campaign snapshot. Present when status >= baseline-complete. */
-    baseline?: EvalRunGenerationSnapshot;
-    /** Per-generation snapshots. Streams in; orchestrator appends. */
-    generations: EvalRunGenerationSnapshot[];
-    /** Final gate decision. Present when status >= gate-decided. */
-    gateDecision?: GateDecision;
-    /** Held-out lift = winner-on-holdout - baseline-on-holdout. */
-    holdoutLift?: number;
-    /** Total $ spent across baseline + every generation. */
-    totalCostUsd: number;
-    /** Total wall-clock duration. */
-    totalDurationMs: number;
-    /** Error message if status === 'errored'. */
-    errorMessage?: string;
-    /** Rigor packet emitted alongside the run — distributional summary,
-     *  paired-bootstrap lift CI, judge stats, inter-rater agreement,
-     *  contamination check, failure clusters (when an analyst is wired),
-     *  outcome correlation (when downstream signal is supplied), and the
-     *  recommendations the dashboard surfaces verbatim. Additive; older
-     *  clients that don't know about this field continue to work. */
-    insightReport?: InsightReport;
-}
-/**
- * OTel-shape span with a few additional attributes for eval-run pivoting.
- * Compatible with any OTLP collector — `name`, `traceId`, `spanId`,
- * `startTimeUnixNano`, `endTimeUnixNano`, `attributes` are stock OTel.
- */
-interface TraceSpanEvent {
-    traceId: string;
-    spanId: string;
-    parentSpanId?: string;
-    name: string;
-    startTimeUnixNano: number;
-    endTimeUnixNano: number;
-    attributes: Record<string, string | number | boolean>;
-    events?: Array<{
-        timeUnixNano: number;
-        name: string;
-        attributes?: Record<string, string | number | boolean>;
-    }>;
-    status?: {
-        code: 'OK' | 'ERROR' | 'UNSET';
-        message?: string;
-    };
-    /** Pivot back into the eval-run stream. */
-    'tangle.runId'?: string;
-    /** Pivot to the specific generation. */
-    'tangle.generation'?: number;
-    /** Pivot to the specific cell. */
-    'tangle.cellId'?: string;
-    /** Pivot to the specific scenario. */
-    'tangle.scenarioId'?: string;
-}
-interface IngestEvalRunsRequest {
-    wireVersion: HostedWireVersion;
-    events: EvalRunEvent[];
-}
-interface IngestTracesRequest {
-    wireVersion: HostedWireVersion;
-    spans: TraceSpanEvent[];
-}
-interface IngestResponse {
-    /** Accepted events / spans count. */
-    accepted: number;
-    /** Rejected events with reasons (validation failures, dup idempotency key, etc.). */
-    rejected: Array<{
-        index: number;
-        reason: string;
-    }>;
-}
-/**
- * # Hosted-tier ingest client.
- *
- * Ships eval-run events + trace spans to any orchestrator (ours, a
- * partner's self-hosted one, or a future open implementation) that
- * speaks the wire format in `./types.ts`.
- *
- * Three modes:
- *   - **Ours:** point at `https://orchestrator.tangle.tools` (the host root —
- *     the client appends the versioned `/v1/ingest/...` path itself; a trailing
- *     `/v1` on the endpoint is tolerated and normalized away). We handle ingest
- *     + storage + dashboard.
- *   - **Self-hosted:** point at whatever URL runs the reference receiver
- *     from `examples/hosted-ingest-server/`.
- *   - **Off (default):** when `hostedTenant` is unset, nothing is sent.
- *     Everything stays local.
- */
-interface HostedTenant {
-    /** Orchestrator endpoint base URL (no trailing slash). Required. */
-    endpoint: string;
-    /** Bearer token issued by the orchestrator. Required. */
-    apiKey: string;
-    /** Tenant id — the orchestrator's primary key for this consumer. Required. */
-    tenantId: string;
-    /** Optional `fetch` override (auth wrappers, custom agent, test mocks). */
-    fetchImpl?: typeof fetch;
-    /** Per-call timeout in ms. Default 30s. */
-    timeoutMs?: number;
-    /** Retries on 5xx / network errors. Default 2. */
-    retries?: number;
-}
-interface HostedClient {
-    ingestEvalRun(event: EvalRunEvent, idempotencyKey?: string): Promise<IngestResponse>;
-    ingestEvalRuns(events: EvalRunEvent[], idempotencyKey?: string): Promise<IngestResponse>;
-    ingestTraces(spans: TraceSpanEvent[], idempotencyKey?: string): Promise<IngestResponse>;
-    readonly tenant: HostedTenant;
-    readonly wireVersion: HostedWireVersion;
-}
-declare function createHostedClient(tenant: HostedTenant): HostedClient;
-/**
- * Build a `HostedClient` from environment, or `undefined` when ingest is not
- * configured — the canonical, fail-soft wiring every product uses so eval-run +
- * trace provenance lands in the Intelligence dashboard with ONE call:
- *
- *   const hosted = hostedClientFromEnv()
- *   // ...run the loop...
- *   await emitLoopProvenance({ ..., hostedClient: hosted })  // no-op if undefined
- *
- * Returns `undefined` (NOT an error) when any of endpoint / apiKey / tenantId is
- * missing — so a product wires the ship call unconditionally and it stays a
- * no-op until the env is set. Env precedence:
- *   - endpoint:  `TANGLE_INGEST_URL` → `TANGLE_ORCHESTRATOR_URL`
- *   - apiKey:    `TANGLE_INGEST_API_KEY` → `TANGLE_API_KEY`
- *   - tenantId:  `TANGLE_TENANT_ID`
- * A trailing slash on the endpoint is stripped. Pass `overrides` to supply any
- * field directly (e.g. a fixed `tenantId` per product) — overrides win over env.
- */
-declare function hostedClientFromEnv(overrides?: Partial<HostedTenant> & {
-    env?: Record<string, string | undefined>;
-}): HostedClient | undefined;
-export { type EvalRunCellScore as E, type FailureClusterInsight as F, type HostedClient as H, type InsightReport as I, type JudgeInsight as J, type LiftInsight as L, type OutcomeCorrelationInsight as O, type Recommendation as R, type ScalarDistribution as S, type TraceSpanEvent as T, type HostedTenant as a, type InterRaterInsight as b, type ReleaseSummary as c, type EvalRunEvent as d, type EvalRunGenerationSnapshot as e, type EvalRunStatus as f, HOSTED_WIRE_VERSION as g, type HostedIngestHeaders as h, type HostedWireVersion as i, type IngestEvalRunsRequest as j, type IngestResponse as k, type IngestTracesRequest as l, createHostedClient as m, hostedClientFromEnv as n };
+export type { FailureClusterInsight as F, InsightReport as I, JudgeInsight as J, LiftInsight as L, OutcomeCorrelationInsight as O, Recommendation as R, ScalarDistribution as S, InterRaterInsight as a, ReleaseSummary as b };

package/dist/kind-factory-DW9XWPvM.d.ts ADDED Viewed

@@ -0,0 +1,172 @@
+import { AxAIService, AxFunction } from '@ax-llm/ax';
+import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
+import { z } from 'zod';
+import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-CRD68aH7.js';
+/**
+ * Typed Ax output for analyst findings.
+ *
+ * Replaces the legacy `findings:string[]` pattern (where every bullet
+ * became a flat-severity `AnalystFinding`) with a structured object
+ * array. Ax binds the field as `findings:json[]` so the provider emits
+ * native structured output; at the kind-factory boundary we Zod-validate
+ * each emitted finding so malformed rows fail loud instead of being
+ * silently lifted with default severity.
+ *
+ * Why not `f.object().array()` directly in the signature? The Ax
+ * signature string `question:string -> findings:json[]` already lets
+ * the provider emit JSON arrays. A Zod boundary is required either
+ * way (the provider can return any JSON), and Zod gives us a single
+ * validation surface independent of which Ax version is installed.
+ */
+declare const ANALYST_SEVERITIES: readonly ["critical", "high", "medium", "low", "info"];
+declare const RawAnalystFindingSchema: z.ZodObject<{
+    severity: z.ZodEnum<{
+        info: "info";
+        critical: "critical";
+        medium: "medium";
+        low: "low";
+        high: "high";
+    }>;
+    claim: z.ZodString;
+    subject: z.ZodOptional<z.ZodString>;
+    evidence_uri: z.ZodString;
+    evidence_excerpt: z.ZodOptional<z.ZodString>;
+    confidence: z.ZodNumber;
+    rationale: z.ZodOptional<z.ZodString>;
+    recommended_action: z.ZodOptional<z.ZodString>;
+}, z.core.$strict>;
+type RawAnalystFinding = z.infer<typeof RawAnalystFindingSchema>;
+/**
+ * Description embedded into the actor prompt so the LLM knows what
+ * shape to emit. Kept here so kinds share one source of truth rather
+ * than restating the schema in every prompt.
+ */
+declare const RAW_FINDING_SCHEMA_PROMPT = "Each finding MUST be a JSON object with these fields:\n  - severity: one of \"critical\" | \"high\" | \"medium\" | \"low\" | \"info\"\n  - claim: one-sentence statement (max 2000 chars)\n  - subject?: the routing locus this finding is about. It MUST be one of the exact subject forms listed in this kind's instructions above (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`, `tool-doc:<tool>`). A free phrase, a bare noun, or any form not in that list is REJECTED at parse time and the finding is discarded \u2014 omit subject entirely rather than guess a form.\n  - evidence_uri: REQUIRED, never blank. Exactly one of \"span://<trace_id>/<span_id>\" (trace evidence), \"artifact://<relative-path>\" (files), \"metric://<name>\" (named scalars) \u2014 ALWAYS cite a real id surfaced by the tools. If you have no citable id, do not emit the finding.\n  - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact\n  - confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative\n  - rationale?: one or two sentences explaining the reasoning\n  - recommended_action?: concrete change phrased as an imperative (\"Add ...\", \"Replace ...\", \"Stop ...\") \u2014 omit when the finding is purely descriptive\n\nEmit an empty array when the question has no findings to report. Do not fabricate evidence.";
+/**
+ * Validate one row emitted by the LLM. Returns the typed finding on
+ * success; returns `null` and logs the reason on failure so the kind
+ * factory can skip-and-count rather than abort the whole analyst run.
+ */
+declare function parseRawFinding(row: unknown, log?: (msg: string, fields?: Record<string, unknown>) => void): RawAnalystFinding | null;
+/**
+ * Analyst-kind factory — the typed, focused replacement for the
+ * legacy `createTraceAnalystAdapter`.
+ *
+ * A "kind" is a specialized analyst whose actor prompt, tool subset,
+ * and Ax recursion config target one failure-mode lens (failure-mode
+ * classification, knowledge gap discovery, knowledge poisoning, recursive
+ * self-improvement, ...). Kinds emit findings in the typed `RawAnalystFinding`
+ * shape via a JSON-array Ax output; the factory validates each row with
+ * Zod and lifts it into `AnalystFinding[]` with no shape guessing.
+ *
+ * Composition rules:
+ *   - Each kind owns its actor description. No generic "answer this
+ *     question" prompt — the prompt names the failure lens.
+ *   - Each kind picks a narrow tool subset from `ANALYST_TOOL_GROUPS`.
+ *     A kind that never needs full-trace dumps can drop `viewTrace` /
+ *     `viewSpans` and stay cheap.
+ *   - Each kind declares its recursion + parallelism budget. Discovery-
+ *     heavy kinds (failure-mode) get higher `maxDepth`; lens kinds
+ *     (poisoning) usually stay at 0 since they have a tighter brief.
+ *
+ * Optimizer hook: kinds may declare `goldens` — labeled examples used
+ * by `AxMiPRO` / `AxBootstrapFewShot` / `AxGEPA` to fit the actor
+ * description programmatically. Stored on the kind, not the registry,
+ * because the right metric is kind-specific.
+ */
+/**
+ * Per-kind specification. The factory turns this into a regular
+ * `Analyst<TraceAnalysisStore>` ready for `AnalystRegistry.register()`.
+ */
+interface TraceAnalystKindSpec {
+    /** Stable id. Appears in finding_id, telemetry, and registry exclusions. */
+    id: string;
+    /** One-sentence description shown in `registry.list()`. */
+    description: string;
+    /** Coarse classification stamped on every emitted finding (`failure-mode`, `knowledge-gap`, ...). */
+    area: string;
+    /** Bump on any breaking change to the actor prompt or output schema. */
+    version: string;
+    /** Actor system prompt. Must instruct the LLM to emit `findings` per the schema. */
+    actorDescription: string;
+    /** Responder system prompt; falls back to a minimal "format the findings" instruction. */
+    responderDescription?: string;
+    /** Tool functions the actor may call. Pick narrow subsets via `ANALYST_TOOL_GROUPS`. */
+    buildTools: (store: TraceAnalysisStore) => AxFunction[];
+    /** Recursion budget. `maxDepth: 0` disables subagents. */
+    recursion?: {
+        maxDepth: number;
+        maxParallelSubagents?: number;
+    };
+    /** Actor turn cap. Default 12. */
+    maxTurns?: number;
+    /** Runtime char cap. Default 6000. */
+    maxRuntimeChars?: number;
+    /** Cost classification surfaced in `registry.list()` and budget enforcement. */
+    cost: AnalystCost;
+    /** Per-finding-row hook — kinds may reject / rewrite before lifting. */
+    postProcess?: (row: RawAnalystFinding, ctx: AnalystContext) => RawAnalystFinding | null;
+    /** Optional optimizer hook — populated when a kind wants to fit its prompt against labeled examples. */
+    goldens?: TraceAnalystGolden[];
+}
+/**
+ * One labeled example consumed by Ax optimizers (MIPRO / GEPA / Bootstrap).
+ * Each input is the same `{question}` an analyst would receive; `expected`
+ * is the ground-truth finding set a fitted prompt should produce on this
+ * input. Metric: kind-specific (default: F1 on `finding_id` overlap).
+ */
+interface TraceAnalystGolden {
+    question: string;
+    expected: ReadonlyArray<Omit<RawAnalystFinding, 'confidence'>>;
+}
+interface CreateTraceAnalystKindOpts {
+    /** AxAIService bound at registration time. */
+    ai: AxAIService;
+    /** Optional model override; falls back to the AI service's default. */
+    model?: string;
+    /** Override the spec's `version` (e.g. when an optimizer has fitted a new prompt). */
+    versionSuffix?: string;
+    /**
+     * Optional two-phase recovery: when the agentic harvest is empty but the
+     * actor produced a substantive free-form `report`, extract findings from that
+     * prose via a tolerant chat-completions pass (`structureFindings`) — no
+     * strict-emission contract, so it works on weak models. Omit to leave the
+     * actor's harvest as-is (the report is still surfaced fail-loud either way).
+     */
+    recovery?: {
+        baseUrl: string;
+        apiKey?: string;
+        model?: string;
+        fetchImpl?: typeof fetch;
+    };
+}
+/**
+ * Build an `Analyst<TraceAnalysisStore>` from a kind spec.
+ *
+ * Lifts the Ax pipeline once at registration time so the registry
+ * gets a stateless analyst. The Ax agent is freshly constructed per
+ * `analyze()` call (the agent carries chat-log + usage state we don't
+ * want shared across analyst runs).
+ */
+declare function createTraceAnalystKind(spec: TraceAnalystKindSpec, opts: CreateTraceAnalystKindOpts): Analyst<TraceAnalysisStore>;
+/**
+ * Render a compact prior-findings block the actor reads alongside its
+ * brief. Each row is one line so the actor can scan dozens cheaply.
+ * The kind's prompt instructs the actor to (a) check whether a new
+ * cluster matches a prior `finding_id` (carry the id forward via
+ * `id_basis` to keep diffs stable) and (b) raise severity / confidence
+ * when a prior finding has reappeared without remediation.
+ *
+ * Returns the empty string when there are no prior findings — most
+ * runs are "first-of-its-kind" and the prompt stays unchanged.
+ *
+ * Exported for tests + for consumers that build their own actor
+ * prompts (e.g. specialized analysts living outside the default kinds).
+ */
+declare function renderPriorFindings(prior: AnalystContext['priorFindings']): string;
+export { ANALYST_SEVERITIES as A, type CreateTraceAnalystKindOpts as C, RAW_FINDING_SCHEMA_PROMPT as R, type TraceAnalystGolden as T, type RawAnalystFinding as a, RawAnalystFindingSchema as b, type TraceAnalystKindSpec as c, createTraceAnalystKind as d, parseRawFinding as p, renderPriorFindings as r };

package/dist/multi-layer-verifier-DlWCXuxL.d.ts ADDED Viewed

@@ -0,0 +1,141 @@
+/**
+ * Multi-layer verifier — ordered pipeline of verification layers.
+ *
+ * Different contract from {@link JudgeRunner} (which runs parallel
+ * specs against a sandbox). MultiLayerVerifier is a DAG of layers
+ * (install → typecheck → build → lint → serve → semantic → …) with
+ * dependency-based skip, per-layer findings, soft-fail semantics, and
+ * an aggregated `blendedScore` across all passed layers.
+ *
+ * Use when you want:
+ *   - ordered stages where a failing upstream stage skips downstream ones
+ *   - each stage produces rich `findings` (severity + message + evidence)
+ *   - a single composite score across stages with per-stage weights
+ *   - soft-fail stages whose failure doesn't abort the pipeline
+ *
+ * Use {@link JudgeRunner} when you want:
+ *   - N independent judges running in parallel against the same artifact
+ *   - no inter-judge dependencies
+ *   - boolean `passed` per judge + overall
+ *
+ * Both primitives compose — JudgeRunner can be invoked as a single
+ * layer inside a MultiLayerVerifier if that suits the caller.
+ */
+type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
+type Severity = 'critical' | 'major' | 'minor' | 'info';
+interface Finding {
+    severity: Severity;
+    message: string;
+    evidence?: string;
+    /** Optional layer name the finding belongs to (set by the verifier if omitted). */
+    layer?: string;
+    /**
+     * Free-form structured payload — used by `multiToolchainLayer` to attach
+     * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
+     * Renderers MAY interrogate; agent-eval primitives never assume shape.
+     */
+    detail?: Record<string, unknown>;
+}
+interface LayerResult {
+    layer: string;
+    status: LayerStatus;
+    /** 0..1 score, optional — layers that don't produce a numeric score omit. */
+    score?: number;
+    durationMs: number;
+    findings: Finding[];
+    /** Short human-readable summary (one line). */
+    reason?: string;
+    /**
+     * Numeric layer-level diagnostics: error counts, warning counts,
+     * cyclomatic complexity, total adapter wall-time, etc. Keyed by
+     * diagnostic name; null = "diagnostic not applicable / not measured."
+     * Renderers that know the keys can display them; ones that don't,
+     * ignore. Free-form on purpose — consumers type the value shape in
+     * their own namespace.
+     */
+    diagnostics?: Record<string, number | null>;
+    /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
+    detail?: Record<string, unknown>;
+}
+interface VerifyContext<Env = unknown> {
+    /** Per-run opaque context the caller provides. Layers destructure what they need. */
+    env: Env;
+    /** Previously-computed results from layers that already ran. */
+    prior: Record<string, LayerResult>;
+    /** Signal — if aborted, layers MUST bail within a reasonable wall-clock time. */
+    signal: AbortSignal;
+}
+interface Layer<Env = unknown> {
+    name: string;
+    /** Stages that must have `status: 'pass'` before this layer runs. */
+    dependsOn?: string[];
+    /**
+     * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
+     * contribute findings but not score.
+     */
+    weight?: number;
+    /**
+     * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
+     * being dropped — use for layers whose failure is a real signal. Default:
+     * fail drops from numerator + denominator, matching VB's existing semantics.
+     */
+    failContributesToScore?: boolean;
+    /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
+    capMs?: number;
+    run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
+}
+interface VerifyOptions<Env = unknown> {
+    env: Env;
+    /**
+     * Overall wall-clock cap. Default: sum of layer capMs, or Infinity if any layer
+     * omits a cap. The verifier short-circuits remaining layers once the overall cap is reached.
+     */
+    overallCapMs?: number;
+    /** Called with each layer result as it completes. */
+    onLayer?: (result: LayerResult) => void;
+}
+interface VerificationReport {
+    layers: LayerResult[];
+    passCount: number;
+    failCount: number;
+    skippedCount: number;
+    errorCount: number;
+    /** True iff at least one scored layer ran AND every scored layer passed. */
+    allPass: boolean;
+    /**
+     * Weighted mean of `score` across contributing layers. 0 when no layers
+     * contributed. See {@link Layer.failContributesToScore} for fail semantics.
+     */
+    blendedScore: number;
+    durationMs: number;
+    startedAt: string;
+    finishedAt: string;
+}
+/**
+ * Grade a semantic-concept-style judge result into a single layer status.
+ *
+ * Pass when overall score >= threshold AND no critical-severity concept gap.
+ * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
+ *
+ * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
+ * too strict — a single concept at 6/10 failed the entire layer despite
+ * overall score being >= 0.7. Now we trust the judge's own `severity` field:
+ * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
+ */
+declare function gradeSemanticStatus(input: {
+    score: number;
+    findings: Array<{
+        severity: Severity;
+        present?: boolean;
+        score?: number;
+    }>;
+    available: boolean;
+    threshold?: number;
+}): LayerStatus;
+declare class MultiLayerVerifier<Env = unknown> {
+    private readonly layers;
+    constructor(layers: Layer<Env>[]);
+    run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
+}
+export { type Finding as F, type LayerResult as L, MultiLayerVerifier as M, type Severity as S, type VerifyOptions as V, type VerificationReport as a, type Layer as b, type VerifyContext as c, type LayerStatus as d, gradeSemanticStatus as g };

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.72.0",
+    "version": "0.72.4",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/pareto-E-pembql.d.ts ADDED Viewed

@@ -0,0 +1,81 @@
+/**
+ * Pareto frontier — multi-objective optimization over candidate runs.
+ *
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
+ * ttfb), you rarely have a single "winner" — you have a set of
+ * non-dominated candidates. This module exposes:
+ *
+ *   - `paretoFrontier`: filter a set of candidates to the non-dominated ones
+ *   - `dominates`: does A dominate B across all objectives?
+ *
+ * Each objective is declared with a direction: 'maximize' (higher=better)
+ * or 'minimize' (lower=better). Candidates are any object; pass an
+ * `objective(candidate)` accessor.
+ */
+type Direction = 'maximize' | 'minimize';
+interface Objective<T> {
+    /** Stable label used in reports. */
+    name: string;
+    direction: Direction;
+    value: (candidate: T) => number;
+}
+interface ParetoResult<T> {
+    frontier: T[];
+    dominated: T[];
+    /** Dominance map: each entry pairs a frontier member (`dominator`) with the candidates it dominates (`dominated`). */
+    dominanceMap: Array<{
+        dominator: T;
+        dominated: T[];
+    }>;
+}
+/** Does candidate A Pareto-dominate B — i.e. is A strictly better on at least one objective and no worse on any? */
+declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
+/**
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
+ * objective are excluded (can't rank them). A candidate enters the frontier
+ * iff no other candidate dominates it.
+ */
+declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
+/**
+ * Weighted-sum scalarisation. Use as a tie-break / single-winner selector
+ * when callers don't want to consume a frontier. Each objective contributes
+ * its normalised value (0..1 via min-max across the candidate pool) times
+ * its weight; missing weights default to 1/N.
+ *
+ * Direction is honoured automatically — `minimize` axes have their values
+ * inverted before scaling so "higher scalar = better" always holds.
+ */
+declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
+    weights?: Partial<Record<string, number>>;
+}): Array<{
+    candidate: T;
+    score: number;
+}>;
+/**
+ * NSGA-II crowding distance — secondary sort for ties on the frontier.
+ *
+ * When the Pareto front collapses to a single point (or many candidates tie
+ * on dominance), naive selection picks arbitrarily and the population
+ * degenerates over generations. NSGA-II preserves diversity by preferring
+ * candidates with more empty space around them on the frontier.
+ *
+ * Returns an array of `{ candidate, distance }` in the SAME order as the
+ * input. Higher distance = more isolated = should be preferred when
+ * preserving diversity.
+ */
+declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
+    candidate: T;
+    distance: number;
+}>;
+/**
+ * Pareto frontier with tie-break by crowding distance — the canonical
+ * NSGA-II selection step. Returns the frontier sorted by descending crowding
+ * distance so callers can `.slice(0, k)` to pick K diverse winners.
+ */
+declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
+    candidate: T;
+    distance: number;
+}>;
+export { type Direction as D, type Objective as O, type ParetoResult as P, paretoFrontierWithCrowding as a, crowdingDistance as c, dominates as d, paretoFrontier as p, scalarScore as s };