npm - @tangle-network/agent-eval - Versions diffs - 0.72.0 → 0.72.4 - Mend

@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

package/CHANGELOG.md +39 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +518 -9
package/dist/campaign/index.js +672 -22
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +13 -7
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +353 -2496
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-SL55X4VN.js +0 -186
package/dist/chunk-SL55X4VN.js.map +0 -1
package/dist/chunk-UD6EF73X.js.map +0 -1
package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0

package/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,45 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
 ---
+## [0.72.3] — 2026-06-01 — workflow trace hardening and driver backtests
+### Added
+- **Canonical workflow branch events in `/workflow`.** Runtime traces now project branch start/end/failure counts into workflow summaries, RunRecords, and feedback trajectories so fanout topology failures are measurable instead of hidden in raw trace blobs.
+- **`workflowPhaseGraph` in `/workflow`.** Builds phase nodes and branch edges from workflow trace events with per-phase calls, branch failures, cost, and token counters. Product adopters can consume this instead of maintaining local graph mirrors.
+- **Stricter workflow event schema validation.** Workflow traces now reject unknown event kinds, malformed typed payloads, non-monotonic timestamps, missing `workflow.started`, multiple terminal events, and events after terminal completion.
+- **Driver comparison substrate proof.** `compareDrivers` now carries analyst findings through the canonical campaign path and includes GSM8K/AppWorld driver backtest examples.
+### Fixed
+- **Publish skew guard.** PyPI publishing depends on successful npm publishing, and the npm publish job now checks registry authentication and `@tangle-network` package access before building or attempting a publish.
+---
+## [0.72.2] — 2026-06-01 — workflow driver promotion gates
+### Added
+- **`decideWorkflowDriverPromotion` in `/workflow`.** Compares a dynamic workflow driver against the reviewer-loop baseline using paired heldout `RunRecord`s keyed by `scenarioId::seed`, then fails closed on missing pairs, too few pairs, insufficient lift, or candidate cost ceilings.
+- **Explicit workflow comparison axis.** `expectedScenarioIds` defines the promotion gate's comparison set so unrelated scenarios cannot skew the lift or confidence interval.
+### Fixed
+- **No seed-only workflow pairing.** Promotion records without `scenarioId` are rejected instead of being paired by seed alone.
+---
+## [0.72.1] — 2026-06-01 — workflow execution summaries for dynamic drivers
+### Added
+- **`summarizeWorkflowExecution` in `/workflow`.** Builds the canonical rich projection from a workflow trace: event-kind counts, phase order, agent and loop delegate summaries, verifier/analyst/reviewer checkpoint outputs, cost, tokens, and failure status.
+- **Checkpoint output extraction.** Verifier, analyst, and reviewer traces preserve the returned output through `trace.checkpointOutput`, with `trace.output` accepted for compatibility.
+### Fixed
+- **npm/PyPI version lock.** The Python RPC package version is bumped back into lockstep with the npm package so the publish workflow can release both artifacts from one tag.
 ## [0.72.0] — 2026-05-31 — cost axis prices unpriced-at-source models (every run carries a real, labeled cost)
 A live tax-agent full-loop run (real sandbox, `deepseek-v4-pro`, real tokens) exposed the second root of the cost-ledger split: the sandbox reported `totalCostUsd: 0` despite `17537` input / `622` output tokens — not a stub, not a mis-wired ledger, but a model the **source** can't rate. The cost / Pareto / `tokens_per_dollar` axes blanked even though the substrate's pricing table prices `deepseek` correctly; the table was simply never consulted on the matrix cost projection. A $0 cost on a run that burned real tokens reads as "free," which is the more misleading state.

package/dist/adapters/http.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-CnmZ2bkP.js';
+import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-Bba0vl1V.js';
 import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';

package/dist/adapters/langchain.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-CnmZ2bkP.js';
+import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-Bba0vl1V.js';
 import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';

package/dist/adapters/otel.d.ts CHANGED Viewed

@@ -1,8 +1,9 @@
-import { T as TraceSpanEvent, H as HostedClient } from '../index-BGBrVS24.js';
-import '../types-CnmZ2bkP.js';
+import { TraceSpanEvent, HostedClient } from '../hosted/index.js';
+import '../types-Bba0vl1V.js';
 import '../run-record-BgTFzO2r.js';
 import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';
+import '../insight-report-Df3lxYXM.js';
 import '../summary-report-ByiOUrHj.js';
 import '../failure-cluster-CL7IVgkJ.js';
 import '../store-CKUAgsJz.js';

package/dist/agent-profile-DYRboYWu.d.ts ADDED Viewed

@@ -0,0 +1,364 @@
+import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
+import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { TCloud } from '@tangle-network/tcloud';
+/**
+ * Backend-integrity guard: distinguish "agent failed" from "eval ran against
+ * a stub / unconfigured backend." Without this guard a canonical eval can
+ * silently report `0/N passed` and look like an agent-quality problem when
+ * the LLM was never actually called — the failure mode we just hit running
+ * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
+ * char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
+ *
+ * The shape:
+ *
+ *   const report = summarizeBackendIntegrity(records)
+ *   assertRealBackend(records)   // throws BackendIntegrityError if 100% stub
+ *
+ * A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
+ * (`costUsd` alone is unreliable — some backends successfully call LLMs but
+ *  don't propagate pricing, producing real tokens with $0 cost.)
+ *
+ * Verdicts:
+ *   - `real`   — every record has nonzero token usage (no stub records)
+ *   - `stub`   — every record is stub-mode (eval ran blind)
+ *   - `mixed`  — some records real, some stub (partial backend failure;
+ *                often the 429-cascade or auth-half-failed case)
+ */
+interface BackendIntegrityReport {
+    /** Total records inspected. */
+    totalRecords: number;
+    /** Records with input=0 AND output=0 (a stub fingerprint). */
+    stubRecords: number;
+    /** Records with nonzero token usage (real LLM activity). */
+    realRecords: number;
+    /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */
+    uncostedRecords: number;
+    /** Sum of input tokens across all records. */
+    totalInputTokens: number;
+    /** Sum of output tokens across all records. */
+    totalOutputTokens: number;
+    /** Sum of costUsd across all records. */
+    totalCostUsd: number;
+    /** Worst-case integrity verdict. */
+    verdict: 'real' | 'mixed' | 'stub';
+    /** Human-readable diagnosis suitable for terminal output. */
+    diagnosis: string;
+}
+/**
+ * Error thrown when an integrity assertion fails. Caller can pattern-match
+ * by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
+ * errors.
+ */
+declare class BackendIntegrityError extends AgentEvalError {
+    readonly report: BackendIntegrityReport;
+    constructor(message: string, report: BackendIntegrityReport);
+}
+/**
+ * Inspect a batch of RunRecords and return an integrity report. Pure
+ * function — no I/O, no logging. The caller decides what to do with the
+ * verdict (print warning, throw, gate CI, etc.).
+ */
+declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
+/**
+ * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
+ * shows zero LLM activity. Stricter callers can pass `{ allowMixed: false }`
+ * to also reject mixed verdicts (recommended for CI gates).
+ *
+ * Real backends pass through silently.
+ */
+declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
+    allowMixed?: boolean;
+}): BackendIntegrityReport;
+/**
+ * Artifact validators.
+ *
+ * Generic "score a produced artifact" primitive. Tax uses it for PDF form
+ * correctness, research for sourced briefs, browser for task assertions, coding
+ * for social posts. One interface, many validators; all plug into
+ * `BenchmarkRunner` the same way.
+ *
+ * A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
+ * plus a `ValidationContext` (scenario id, the turns that produced it) and
+ * returns a `ValidationResult` with pass/fail + 0..1 score + structured
+ * issues.
+ */
+interface Artifact {
+    /** Logical kind — validators type-guard on this */
+    kind: 'file' | 'json' | 'text' | 'binary' | string;
+    /** Filesystem-style path, optional */
+    path?: string;
+    /** String content for text/json/file kinds */
+    content?: string;
+    /** Binary content (if kind === 'binary') */
+    bytes?: Uint8Array;
+    /** Caller-supplied metadata (mimeType, sha256, size, etc.) */
+    metadata?: Record<string, unknown>;
+}
+interface ValidationContext {
+    scenarioId: string;
+    turnIndex?: number;
+    /** Prior artifacts for multi-artifact scenarios */
+    priorArtifacts?: Artifact[];
+    /** Free-form hints the validator uses for domain-specific checks */
+    hints?: Record<string, unknown>;
+}
+interface ValidationIssue {
+    severity: 'error' | 'warning' | 'info';
+    message: string;
+    /** Optional path into the artifact (e.g. JSON path or byte offset) */
+    locus?: string;
+}
+interface ValidationResult {
+    pass: boolean;
+    /** 0–1 normalized score. Validators should be monotonic in pass-ness. */
+    score: number;
+    issues: ValidationIssue[];
+    /** Diagnostic payload for reporters */
+    evidence?: Record<string, unknown>;
+}
+interface ArtifactValidator {
+    /** Stable identifier for the validator; appears in reports. */
+    name: string;
+    /** Optional description for human-facing reports. */
+    description?: string;
+    /** Called once per artifact; validators are expected to be pure + idempotent. */
+    validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
+}
+/**
+ * Run every validator on the same artifact; aggregate pass as AND, score as
+ * (weighted) mean, issues concatenated. Weights default to 1 each.
+ */
+declare function composeValidators(validators: ArtifactValidator[], options?: {
+    name?: string;
+    weights?: number[];
+}): ArtifactValidator;
+/** Pass if the artifact body matches a provided regex. */
+declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
+/** Pass if JSON parses and every required key is present. */
+declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
+/** Pass if min ≤ byte length ≤ max. */
+declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
+/** Pass if the artifact contains every required substring (case-insensitive by default). */
+declare function containsAll(name: string, required: string[], options?: {
+    caseSensitive?: boolean;
+}): ArtifactValidator;
+/**
+ * Completion verifier — the task-completion oracle.
+ *
+ * Answers the only eval question that is not a proxy: did the agent actually
+ * COMPLETE the task — produce every required deliverable, persisted and
+ * correct — rather than describe what should be done. A fluent transcript
+ * that never produces the artifact scores zero here.
+ *
+ * Per requirement, a two-stage check:
+ *   1. Structural — a produced item (vault artifact / approved proposal /
+ *      tool call) of the right kind is matched against the requirement and
+ *      carries non-empty content. Deterministic; no LLM.
+ *   2. Correctness — only if structurally present AND the matched item
+ *      carries content, one targeted check decides whether that item
+ *      actually fulfils the requirement. A hallucinated artifact fails here;
+ *      an absent one already failed stage 1.
+ *
+ * `completionRate` is satisfied / total. Quality dimensions are meaningless
+ * on an incomplete task — callers gate on `fullyComplete` / `completionRate`
+ * before scoring quality.
+ */
+/** What kind of produced state can satisfy a requirement structurally. */
+type SatisfiedBy = 'artifact' | 'proposal' | 'tool-call' | 'any';
+interface CompletionRequirement {
+    /** Stable id from the task gold (e.g. a persona's `expected_requirements[].req_id`). */
+    reqId: string;
+    /** Human-readable description of the required deliverable. */
+    title: string;
+    /** Optional kind/category hint, matched against a produced item's kind. */
+    category?: string;
+    /** What produced state satisfies this requirement. Defaults to 'any'. */
+    satisfiedBy?: SatisfiedBy;
+}
+interface TaskGold {
+    taskId: string;
+    requirements: CompletionRequirement[];
+}
+interface ProducedProposal {
+    id: string;
+    title: string;
+    status: 'pending' | 'approved' | 'rejected';
+    /** Optional persisted body — when present, enables a correctness check. */
+    content?: string;
+}
+/** Everything observable about what a run actually produced. */
+interface ProducedState {
+    /** Persisted vault artifacts. Reuses the shared `Artifact` shape. */
+    artifacts: Artifact[];
+    /** Proposals / filings the agent created. */
+    proposals: ProducedProposal[];
+    /** Names of tools the agent invoked. */
+    toolCalls: string[];
+}
+interface RequirementCheck {
+    reqId: string;
+    title: string;
+    /** A produced item of the right kind matched the requirement, non-empty. */
+    structurallyPresent: boolean;
+    /**
+     * Whether the matched item actually fulfils the requirement. `null` when
+     * not structurally present, or when the matched item carries no content
+     * to assess.
+     */
+    correct: boolean | null;
+    /** structurallyPresent && correct !== false. */
+    satisfied: boolean;
+    /** Human-readable evidence for the verdict. */
+    evidence: string[];
+}
+interface CompletionVerdict {
+    taskId: string;
+    requirements: RequirementCheck[];
+    /** satisfied / total requirements. */
+    completionRate: number;
+    /** Every requirement satisfied. */
+    fullyComplete: boolean;
+}
+/**
+ * Decides whether a produced item's content actually fulfils a requirement.
+ * Injected so the structural verifier stays pure and unit-testable; the
+ * production implementation is `createLlmCorrectnessChecker`.
+ */
+type CorrectnessChecker = (requirement: CompletionRequirement, content: string) => Promise<{
+    correct: boolean;
+    reason: string;
+}>;
+/**
+ * Verify whether a run completed the task. `checkCorrectness` is injected —
+ * `createLlmCorrectnessChecker` for production, a deterministic stub in tests.
+ *
+ * Throws on a gold spec with no requirements: an eval task that requires
+ * nothing is a misconfiguration, not a vacuously-complete task.
+ */
+declare function verifyCompletion(gold: TaskGold, state: ProducedState, checkCorrectness: CorrectnessChecker): Promise<CompletionVerdict>;
+interface LlmCorrectnessCheckerOpts {
+    model?: string;
+    /** Max chars of artifact content sent to the checker. */
+    maxContentChars?: number;
+}
+/** Parse the correctness checker's model response. Fails loud on a bad shape. */
+declare function parseCorrectnessResponse(raw: string): {
+    correct: boolean;
+    reason: string;
+};
+/**
+ * Production `CorrectnessChecker` — one LLM call per matched artifact,
+ * deterministic (temperature 0), structured JSON out. Judges fulfilment
+ * only: a plan, a gesture, or a description of what should be done does not
+ * fulfil a requirement — the artifact must BE the deliverable.
+ */
+declare function createLlmCorrectnessChecker(tc: TCloud, opts?: LlmCorrectnessCheckerOpts): CorrectnessChecker;
+/**
+ * Produced-state extraction — normalize a run's runtime event stream into the
+ * typed `ProducedState` the completion oracle consumes.
+ *
+ * `ProducedState` answers "what did the agent actually produce" — vault
+ * artifacts, proposals, tool calls. The runtime emits these as a stream of
+ * events; this module is the single normalization point from that stream to
+ * the shape `verifyCompletion` expects.
+ *
+ * Input is structurally typed (`RuntimeEventLike`) so this module does not
+ * depend on agent-runtime — agent-runtime's `RuntimeStreamEvent` satisfies it
+ * structurally. The `content` on `ArtifactEventLike` and the whole
+ * `proposal_created` variant are the runtime-side enrichments this contract
+ * requires; the runtime emits them, this module consumes them.
+ */
+/** A tool the agent invoked. */
+interface ToolCallEventLike {
+    type: 'tool_call';
+    toolName: string;
+}
+/**
+ * An artifact the agent produced. `content` is the enriched field — the
+ * runtime's base `artifact` event carries only metadata; the completion
+ * oracle needs the body to verify the deliverable, so the runtime emits it.
+ */
+interface ArtifactEventLike {
+    type: 'artifact';
+    artifactId: string;
+    name?: string;
+    mimeType?: string;
+    uri?: string;
+    content?: string;
+}
+/** A proposal / filing the agent created. */
+interface ProposalEventLike {
+    type: 'proposal_created';
+    proposalId: string;
+    title: string;
+    status?: 'pending' | 'approved' | 'rejected';
+}
+/**
+ * The subset of runtime stream events `extractProducedState` consumes.
+ * agent-runtime's full `RuntimeStreamEvent` union satisfies this structurally;
+ * the `{ type: string }` catch-all keeps the input permissive so callers can
+ * pass the whole unfiltered telemetry stream — unrecognized events are skipped.
+ */
+type RuntimeEventLike = ToolCallEventLike | ArtifactEventLike | ProposalEventLike | {
+    type: string;
+};
+/**
+ * Normalize a run's runtime event stream into `ProducedState`.
+ *
+ * Pure and total — unrecognized event types are skipped. `toolCalls` is
+ * deduplicated by name in first-seen order (completion cares about a tool's
+ * presence, not its call count). An artifact with neither a name nor a uri
+ * still yields an entry keyed by its `artifactId` so it is never silently
+ * dropped; an artifact with no `content` yields empty content, which the
+ * completion oracle's structural check then rejects on its own.
+ */
+declare function extractProducedState(events: readonly RuntimeEventLike[]): ProducedState;
+/**
+ * @stable
+ *
+ * AgentProfile — the eval harness's unit of variation.
+ *
+ * A profile pins everything that changes agent behaviour for a benchmark
+ * cell: the model, the active skills, the prompt version, the available
+ * tools. Vary the profile — swap a model, add a skill — and re-run the suite
+ * to benchmark the change. The scorecard keys a cell on
+ * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
+ * inside the profile, and two profiles with the same model but different
+ * skills are different cells.
+ *
+ * `agentProfileHash` is the profile's behaviour identity. Two profiles that
+ * produce the same agent behaviour share a hash (and a scorecard cell);
+ * reordering `skills` or `tools` does not change it; the human-facing `id`
+ * label does not affect it.
+ */
+interface AgentProfile {
+    /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
+    id: string;
+    /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
+    model: string;
+    /** Skill ids/versions active in this profile — the primary behaviour lever. */
+    skills?: string[];
+    /** Prompt version identifier. */
+    promptVersion?: string;
+    /** Tool ids available to the agent. */
+    tools?: string[];
+    /** Any other behaviour-bearing knobs that should fingerprint into the hash. */
+    metadata?: Record<string, string | number | boolean>;
+}
+/**
+ * Deterministic behaviour identity of a profile — a sha256 over the
+ * behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
+ * `id` label is excluded. Throws on a profile with no `model` — an unkeyable
+ * profile must fail loud rather than collapse into a blank-model cell.
+ */
+declare function agentProfileHash(profile: AgentProfile): string;
+export { type AgentProfile as A, type BackendIntegrityReport as B, type CompletionRequirement as C, type LlmCorrectnessCheckerOpts as L, type ProducedState as P, type RuntimeEventLike as R, type SatisfiedBy as S, type TaskGold as T, type ValidationContext as V, type CompletionVerdict as a, type CorrectnessChecker as b, type Artifact as c, type ArtifactEventLike as d, type ArtifactValidator as e, BackendIntegrityError as f, type ProducedProposal as g, type ProposalEventLike as h, type RequirementCheck as i, type ToolCallEventLike as j, type ValidationIssue as k, type ValidationResult as l, agentProfileHash as m, assertRealBackend as n, byteLengthRange as o, composeValidators as p, containsAll as q, createLlmCorrectnessChecker as r, extractProducedState as s, jsonHasKeys as t, parseCorrectnessResponse as u, regexMatch as v, summarizeBackendIntegrity as w, verifyCompletion as x };

package/dist/analyst/index.d.ts ADDED Viewed

@@ -0,0 +1,221 @@
+import { AxAIService, AxFunction } from '@ax-llm/ax';
+import { M as MultiLayerVerifier, V as VerifyOptions, S as Severity } from '../multi-layer-verifier-DlWCXuxL.js';
+import { c as RunCritic, a as RunTrace } from '../run-critic-BAIjX99r.js';
+import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-CV9Wlx4t.js';
+export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-CV9Wlx4t.js';
+import { A as AnalyzeTracesOptions } from '../analyst-t7zZS3TV.js';
+import { T as TraceAnalysisStore } from '../store-GmBE2pZZ.js';
+import { b as JudgeFn, a as JudgeInput } from '../types-Croy5h7V.js';
+import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../types-CRD68aH7.js';
+export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-CRD68aH7.js';
+import { TCloud } from '@tangle-network/tcloud';
+export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-DW9XWPvM.js';
+export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-DuVYiTvw.js';
+import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
+import '../schema-m0gsnbt3.js';
+import '../store-CKUAgsJz.js';
+import 'zod';
+import '../run-record-BgTFzO2r.js';
+import '../errors-Dwqw-T_m.js';
+import '../raw-provider-sink-C46HDghv.js';
+/**
+ * Adapter factories — lift each existing agent-eval primitive into the
+ * Analyst contract without re-implementing it.
+ *
+ * Five primitives, five factories. Each one:
+ *   - Builds an Analyst with a stable id (caller chooses; defaults
+ *     given), a sensible default `inputKind`, a version derived from
+ *     the wrapped primitive's version + an adapter revision, and an
+ *     `analyze()` that calls the primitive and lifts its output to
+ *     AnalystFinding[] using `makeFinding()`.
+ *   - Maps severities: the existing `Severity` ('critical' | 'major' |
+ *     'minor' | 'info') projects onto AnalystSeverity ('critical' |
+ *     'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →
+ *     'medium'. Domain analysts that want finer-grained mapping override.
+ *
+ * Adapters never own state. Calling the same factory twice with the
+ * same primitive instance is safe.
+ */
+/**
+ * Project a verifier `Severity` onto the `AnalystSeverity` scale:
+ * 'major' → 'high', 'minor' → 'medium'. 'critical' and 'info' exist on both
+ * scales and presumably map to themselves — confirm against the implementation.
+ *
+ * @param s - Severity reported by the wrapped primitive.
+ * @returns The corresponding `AnalystSeverity`.
+ */
+declare function liftSeverity(s: Severity): AnalystSeverity;
+/** Options accepted by `createTraceAnalystAdapter`. */
+interface TraceAnalystAdapterOpts {
+    /** Stable analyst id chosen by the caller; a default is used when omitted. */
+    id?: string;
+    /** Optional area label — presumably the classification stamped on findings; verify. */
+    area?: string;
+    /** The natural-language question(s) put to the analyst. One finding per question. */
+    questions: string[];
+    /** Caller-provided AxAI service — same one trace-analyst.ts expects. */
+    ai: AxAIService;
+    /** Optional model identifier — presumably passed through to `analyzeTraces`; verify. */
+    model?: string;
+    /** Forwarded to analyzeTraces. `source`, `ai` and `model` are excluded from this bag. */
+    extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>;
+}
+/**
+ * Wraps the legacy `analyzeTraces` flow as an `Analyst` over a
+ * `TraceAnalysisStore`.
+ *
+ * @deprecated Prefer `createTraceAnalystKind` + one of the failure /
+ * improvement kinds from `./kinds`. This adapter wraps the legacy
+ * `analyzeTraces` flow whose output is `findings:string[]` — every
+ * bullet gets flat-defaulted severity `medium` / confidence `0.6`,
+ * which loses the per-finding grading kinds provide via Ax structured
+ * output + Zod validation. Kept for one minor while consumers migrate.
+ *
+ * @param opts - Questions, AxAI service and optional id/area/model/extra options.
+ * @returns An analyst whose input is a `TraceAnalysisStore`.
+ */
+declare function createTraceAnalystAdapter(opts: TraceAnalystAdapterOpts): Analyst<TraceAnalysisStore>;
+/**
+ * Options accepted by `createVerifierAdapter`.
+ * `Env` is the environment type the wrapped verifier runs against.
+ */
+interface VerifierAdapterOpts<Env> {
+    /** Stable analyst id chosen by the caller; a default is used when omitted. */
+    id?: string;
+    /** Optional area label — presumably the classification stamped on findings; verify. */
+    area?: string;
+    /** The `MultiLayerVerifier` being lifted into the Analyst contract. */
+    verifier: MultiLayerVerifier<Env>;
+    /**
+     * The verifier expects an `env` per run. Adapters take it from
+     * `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.
+     */
+    options?: Omit<VerifyOptions<Env>, 'env'>;
+}
+/**
+ * Lift a `MultiLayerVerifier` into the Analyst contract.
+ *
+ * @param opts - The verifier to wrap plus optional id/area/verify options.
+ * @returns An analyst whose input is the verifier's `Env`.
+ */
+declare function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env>;
+/** Options accepted by `createRunCriticAdapter`. Every field is optional. */
+interface RunCriticAdapterOpts {
+    /** Stable analyst id chosen by the caller; a default is used when omitted. */
+    id?: string;
+    /** Optional area label — presumably the classification stamped on findings; verify. */
+    area?: string;
+    /** Critic to wrap. NOTE(review): presumably a default critic is used when omitted — confirm. */
+    critic?: RunCritic;
+    /** Optional threshold below which a dimension is reported as a finding. Default 0.5. */
+    threshold?: number;
+}
+/**
+ * Lift a `RunCritic` into the Analyst contract.
+ *
+ * @param opts - Optional id/area/critic/threshold; may be omitted entirely.
+ * @returns An analyst whose input is a `RunTrace`.
+ */
+declare function createRunCriticAdapter(opts?: RunCriticAdapterOpts): Analyst<RunTrace>;
+/** Options accepted by `createJudgeAdapter`. */
+interface JudgeAdapterOpts {
+    /** Stable analyst id chosen by the caller; a default is used when omitted. */
+    id?: string;
+    /** Optional area label — presumably the classification stamped on findings; verify. */
+    area?: string;
+    /** The judge function being lifted into the Analyst contract. */
+    judge: JudgeFn;
+    /** TCloud handle the JudgeFn calls. */
+    tcloud: TCloud;
+    /** Optional cost classification — most judges call an LLM. */
+    cost?: Analyst['cost'];
+    /** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */
+    threshold?: number;
+}
+/**
+ * Lift a `JudgeFn` into the Analyst contract.
+ *
+ * @param opts - The judge, its TCloud handle, and optional id/area/cost/threshold.
+ * @returns An analyst whose input is a `JudgeInput`.
+ */
+declare function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput>;
+/** Options accepted by `createSemanticConceptJudgeAdapter`. Every field is optional. */
+interface SemanticConceptJudgeAdapterOpts {
+    /** Stable analyst id chosen by the caller; a default is used when omitted. */
+    id?: string;
+    /** Optional area label — presumably the classification stamped on findings; verify. */
+    area?: string;
+    /** Options for the wrapped semantic concept judge — presumably forwarded unchanged; verify. */
+    options?: SemanticConceptJudgeOptions;
+}
+/**
+ * Lift the semantic concept judge into the Analyst contract.
+ *
+ * @param opts - Optional id/area/judge options; may be omitted entirely.
+ * @returns An analyst whose input is a `SemanticConceptJudgeInput`.
+ */
+declare function createSemanticConceptJudgeAdapter(opts?: SemanticConceptJudgeAdapterOpts): Analyst<SemanticConceptJudgeInput>;
+/**
+ * `behavioralAnalyst` — a DETERMINISTIC analyst (cost.kind = 'deterministic',
+ * never calls the LLM). It produces the efficiency/behavioral findings a
+ * tolerant agentic analyzer (HALO) re-derives per run inside the model —
+ * context bloat, output decay, tool monoculture, missing self-verification —
+ * directly from arithmetic over spans (`computeTraceMetrics`).
+ *
+ * Why it matters: these findings are model-agnostic BY CONSTRUCTION (no model
+ * in the loop), so they cannot return 0 on a weak model the way the Ax-RLM
+ * does — and they are strictly more reliable than HALO, which spends tokens
+ * re-deriving the same numbers and can hallucinate the trend. The agentic
+ * RLM kinds remain for SEMANTIC findings that genuinely need a model; this
+ * analyst owns the behavioral class.
+ */
+/**
+ * Map computed signals → structured AnalystFindings. Pure: no LLM, no clock
+ * dependence beyond `produced_at` (overridable for deterministic tests).
+ *
+ * @param metrics - Precomputed behavioral metrics to turn into findings.
+ * @param opts - Optional overrides. `producedAt` replaces the `produced_at`
+ *   timestamp so output is reproducible in tests; `analystId` is presumably
+ *   the analyst id recorded on each finding — verify.
+ * @returns The derived findings.
+ */
+declare function deriveEfficiencyFindings(metrics: BehavioralMetrics, opts?: {
+    analystId?: string;
+    producedAt?: string;
+}): AnalystFinding[];
+/**
+ * The deterministic behavioral/efficiency analyst (no LLM, any-model).
+ *
+ * @returns An analyst whose input is a `TraceAnalysisStore`.
+ */
+declare function behavioralAnalyst(): Analyst<TraceAnalysisStore>;
+/**
+ * Forgiving pre-parse for analyst findings. Weak models routinely emit
+ * schema-correct content in an unusable wrapper — fenced ```json blocks, a
+ * single object where an array is expected, trailing commas. Measured: GPT-4o
+ * drops to 0% usable output purely from markdown-fence wrapping
+ * (arXiv:2605.02363). A five-line de-fence recovers most of it. This module is
+ * the de-fence/coerce step that runs BEFORE Zod, so a recoverable finding is
+ * repaired, not dropped.
+ *
+ * Pure + deterministic. No model, no network.
+ */
+/**
+ * Strip a ```lang ... ``` (or bare ``` ... ```) code fence, if the string is one.
+ *
+ * @param text - Raw model output that may be wrapped in a code fence.
+ * @returns The fence's inner text; presumably `text` unchanged when it is not
+ *   a fenced block — confirm against the implementation.
+ */
+declare function stripCodeFences(text: string): string;
+/**
+ * Best-effort parse of a string into JSON. De-fences, drops trailing commas,
+ * then `JSON.parse`. Returns `undefined` (never throws) when unrecoverable.
+ *
+ * @param text - Raw text expected to contain JSON, possibly fenced.
+ * @returns The parsed value, or `undefined` when the text cannot be recovered.
+ *   Typed `unknown`: callers must narrow or validate before use.
+ */
+declare function coerceJson(text: string): unknown;
+/**
+ * Coerce arbitrary actor/structurer output into an array of candidate finding
+ * rows: a JSON string → parse; a single object → 1-element array; an array →
+ * as-is; anything else → []. Callers still run each row through Zod
+ * (`parseRawFinding`) — this only fixes the SHAPE, never invents fields.
+ *
+ * @param raw - Untyped output from the actor or structurer.
+ * @returns Candidate rows, still unvalidated; empty when `raw` is unusable.
+ */
+declare function coerceToFindingRows(raw: unknown): unknown[];
+/**
+ * `structureFindings` — the deferred structuring pass (DSPy TwoStepAdapter /
+ * HALO `synthesize_traces` analog). The agentic actor reasons FREE-FORM and
+ * emits a prose `report` (which any model does reliably); this separate, cheap
+ * call's ONLY job is to turn that report into `AnalystFinding[]`. Decoupling
+ * reasoning from structuring is what makes the SEMANTIC findings model-agnostic
+ * — the reasoning model never has to satisfy a strict typed-array contract
+ * while it diagnoses.
+ *
+ * Forgiving: the response runs through `coerceToFindingRows` (de-fence, lift
+ * single→array) before Zod, and on a zero-finding extraction from a substantive
+ * report it reasks ONCE with the schema restated. Returns a typed outcome so a
+ * legitimate "nothing to report" is distinguishable from a failed extraction
+ * (no silent empty).
+ */
+interface StructureFindingsOptions {
+    /** The actor's free-form diagnosis prose. */
+    report: string;
+    /** Analyst id — presumably recorded on each extracted finding; verify. */
+    analystId: string;
+    /** Coarse classification stamped on every extracted finding. */
+    area: string;
+    /** Model identifier — presumably the model used for the structuring call; verify. */
+    model: string;
+    /** Base URL — presumably of the LLM endpoint the structuring call targets; verify. */
+    baseUrl: string;
+    /** Optional API key — presumably for that endpoint; verify. */
+    apiKey?: string;
+    /** Max reask attempts after a zero/invalid extraction. Default 1. */
+    maxReasks?: number;
+    /** Test seam: inject a fetch (no network in unit tests). */
+    fetchImpl?: LlmClientOptions['fetch'];
+}
+/**
+ * Typed result of `structureFindings`. The `outcome` tag lets callers tell a
+ * legitimate "nothing to report" apart from a failed extraction.
+ */
+interface StructureFindingsResult {
+    /** Findings extracted from the report; may be empty. */
+    findings: AnalystFinding[];
+    /** `'ok'` when extraction succeeded (even with no findings); `'extraction_failed'` otherwise. */
+    outcome: 'ok' | 'extraction_failed';
+}
+/**
+ * Turn a free-form analyst report into structured findings.
+ *
+ * @param opts - The report plus the analyst/area labels, model endpoint
+ *   settings, reask limit and optional fetch override.
+ * @returns A promise of the extracted findings together with an `outcome` tag.
+ */
+declare function structureFindings(opts: StructureFindingsOptions): Promise<StructureFindingsResult>;
+/**
+ * Pre-curated tool subsets for analyst kinds.
+ *
+ * The full trace-analyst tool set is seven functions. Most kinds only
+ * need three or four. Picking from named groups instead of importing
+ * the whole bundle keeps every kind's actor-context budget tight and
+ * makes "what can this analyst see?" obvious at registration time.
+ *
+ * Each function in the group keeps its full `name`/`description` from
+ * `buildTraceAnalystTools` — we filter, we don't re-implement.
+ */
+/** Named tool groups. Pass one of these names to `buildTraceToolsForGroup` together with the trace store. */
+type TraceToolGroupName =
+/** All seven tools. Use for open-ended discovery kinds. */
+'all'
+/** Overview + paginated query + count. No deep reads. Cheap. */
+ | 'discovery'
+/** Discovery + viewTrace + viewSpans. Deep-read but no regex search. */
+ | 'discoveryAndRead'
+/** Discovery + search tools. For pattern-matching across many traces. */
+ | 'discoveryAndSearch'
+/** Discovery + viewSpans + searchSpan. Targeted-span work after another kind narrows down. */
+ | 'targeted';
+/**
+ * Build the tool set for a named group bound to a specific trace store.
+ *
+ * `all` returns every tool. Other groups filter `buildTraceAnalystTools`
+ * by name to the documented subset. An unrecognised group name throws —
+ * silently returning all tools would defeat the cost-control point.
+ *
+ * @param group - Name of the tool group to build.
+ * @param store - Trace store the returned tools are bound to.
+ * @returns The tools belonging to `group`.
+ * @throws When `group` is not a recognised group name.
+ */
+declare function buildTraceToolsForGroup(group: TraceToolGroupName, store: TraceAnalysisStore): AxFunction[];
+export { Analyst, AnalystFinding, AnalystSeverity, type JudgeAdapterOpts, type RunCriticAdapterOpts, type SemanticConceptJudgeAdapterOpts, type StructureFindingsOptions, type StructureFindingsResult, type TraceAnalystAdapterOpts, type TraceToolGroupName, type VerifierAdapterOpts, behavioralAnalyst, buildTraceToolsForGroup, coerceJson, coerceToFindingRows, createJudgeAdapter, createRunCriticAdapter, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createVerifierAdapter, deriveEfficiencyFindings, liftSeverity, stripCodeFences, structureFindings };