npm - @tangle-network/agent-eval - Versions diffs - 0.23.1 → 0.24.0 - Mend

@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148) hide show

package/CHANGELOG.md +80 -0
package/README.md +141 -79
package/dist/baseline-4R5deP0N.d.ts +108 -0
package/dist/benchmarks/index.d.ts +3 -2
package/dist/benchmarks/index.js +1 -1
package/dist/builder-eval/index.d.ts +249 -0
package/dist/builder-eval/index.js +391 -0
package/dist/builder-eval/index.js.map +1 -0
package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
package/dist/chunk-2A5XJB43.js.map +1 -0
package/dist/chunk-47X6LRCE.js +76 -0
package/dist/chunk-47X6LRCE.js.map +1 -0
package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
package/dist/chunk-4F5DQN55.js.map +1 -0
package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
package/dist/chunk-4S4BM3QQ.js.map +1 -0
package/dist/chunk-5BKGXME7.js +65 -0
package/dist/chunk-5BKGXME7.js.map +1 -0
package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
package/dist/chunk-6QDKWHLS.js.map +1 -0
package/dist/chunk-I4MBDTY5.js +272 -0
package/dist/chunk-I4MBDTY5.js.map +1 -0
package/dist/chunk-K2TPS5LB.js +569 -0
package/dist/chunk-K2TPS5LB.js.map +1 -0
package/dist/chunk-KKHDIONI.js +414 -0
package/dist/chunk-KKHDIONI.js.map +1 -0
package/dist/chunk-KMPRBJK4.js +74 -0
package/dist/chunk-KMPRBJK4.js.map +1 -0
package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
package/dist/chunk-KTGTIOFD.js.map +1 -0
package/dist/chunk-LSH4MMOZ.js +838 -0
package/dist/chunk-LSH4MMOZ.js.map +1 -0
package/dist/chunk-NG236HPC.js +57 -0
package/dist/chunk-NG236HPC.js.map +1 -0
package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
package/dist/chunk-NLMNWKVM.js.map +1 -0
package/dist/chunk-NU65VQ7M.js +99 -0
package/dist/chunk-NU65VQ7M.js.map +1 -0
package/dist/chunk-OHEPNJQN.js +554 -0
package/dist/chunk-OHEPNJQN.js.map +1 -0
package/dist/chunk-OWLAAMME.js +250 -0
package/dist/chunk-OWLAAMME.js.map +1 -0
package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
package/dist/chunk-PC4UYEBM.js.map +1 -0
package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
package/dist/chunk-RAF443UI.js.map +1 -0
package/dist/chunk-RZTMDUO7.js +49 -0
package/dist/chunk-RZTMDUO7.js.map +1 -0
package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
package/dist/chunk-SESZDQPX.js.map +1 -0
package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
package/dist/chunk-SY6WAAAD.js.map +1 -0
package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
package/dist/chunk-TVVP3ZZQ.js.map +1 -0
package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
package/dist/chunk-VRJVTXRV.js.map +1 -0
package/dist/chunk-WWYCWKUM.js +196 -0
package/dist/chunk-WWYCWKUM.js.map +1 -0
package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
package/dist/chunk-YRZ4M5GS.js.map +1 -0
package/dist/chunk-ZN274SWR.js +613 -0
package/dist/chunk-ZN274SWR.js.map +1 -0
package/dist/cli.js +10 -6
package/dist/cli.js.map +1 -1
package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
package/dist/control.d.ts +8 -6
package/dist/control.js +10 -7
package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
package/dist/errors-BZ9sTdz7.d.ts +70 -0
package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
package/dist/governance/index.d.ts +5 -0
package/dist/governance/index.js +18 -0
package/dist/governance/index.js.map +1 -0
package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
package/dist/index-Oj9fAPPN.d.ts +270 -0
package/dist/index.d.ts +1866 -3151
package/dist/index.js +5457 -7809
package/dist/index.js.map +1 -1
package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
package/dist/knowledge/index.d.ts +102 -0
package/dist/knowledge/index.js +18 -0
package/dist/knowledge/index.js.map +1 -0
package/dist/meta-eval/index.d.ts +99 -0
package/dist/meta-eval/index.js +324 -0
package/dist/meta-eval/index.js.map +1 -0
package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +11 -8
package/dist/optimization.js +11 -9
package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
package/dist/pipelines/index.d.ts +172 -0
package/dist/pipelines/index.js +409 -0
package/dist/pipelines/index.js.map +1 -0
package/dist/prm/index.d.ts +99 -0
package/dist/prm/index.js +222 -0
package/dist/prm/index.js.map +1 -0
package/dist/query-DODUYdPg.d.ts +30 -0
package/dist/release-report-TDPn1cxq.d.ts +292 -0
package/dist/replay-BL96gCEP.d.ts +226 -0
package/dist/reporting.d.ts +10 -295
package/dist/reporting.js +10 -6
package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
package/dist/rl.d.ts +1762 -8
package/dist/rl.js +2035 -58
package/dist/rl.js.map +1 -1
package/dist/rubric-D5tjHNJQ.d.ts +72 -0
package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
package/dist/sequential-Dgz1n51-.d.ts +139 -0
package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
package/dist/telemetry/file.js +4 -1
package/dist/telemetry/file.js.map +1 -1
package/dist/telemetry/index.js +57 -57
package/dist/telemetry/index.js.map +1 -1
package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
package/dist/traces.d.ts +142 -387
package/dist/traces.js +1302 -40
package/dist/traces.js.map +1 -1
package/dist/trajectory-CnoBo-JY.d.ts +32 -0
package/dist/wire/index.d.ts +22 -22
package/dist/wire/index.js +4 -3
package/package.json +44 -18
package/dist/chunk-42I2QC2L.js.map +0 -1
package/dist/chunk-5IIQKMD5.js.map +0 -1
package/dist/chunk-6KQG5HAH.js.map +0 -1
package/dist/chunk-6M774GY6.js.map +0 -1
package/dist/chunk-7EAUOUQS.js.map +0 -1
package/dist/chunk-AXHNWLIX.js.map +0 -1
package/dist/chunk-EXGR4XEM.js.map +0 -1
package/dist/chunk-IOXMGMHQ.js.map +0 -1
package/dist/chunk-KAO3Q65R.js.map +0 -1
package/dist/chunk-LZKIOBG2.js +0 -2026
package/dist/chunk-LZKIOBG2.js.map +0 -1
package/dist/chunk-QBW3YBTR.js.map +0 -1
package/dist/chunk-QUKKGHTZ.js.map +0 -1
package/dist/chunk-SQQLHODJ.js.map +0 -1
package/dist/chunk-V5QSWN7L.js +0 -1310
package/dist/chunk-V5QSWN7L.js.map +0 -1
package/dist/chunk-VQQSPGSM.js.map +0 -1
package/dist/chunk-XPHOZPOM.js +0 -1947
package/dist/chunk-XPHOZPOM.js.map +0 -1
package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
package/dist/index-ekBXweiQ.d.ts +0 -1894
package/dist/sequential-DgU2mFsE.d.ts +0 -304

package/dist/failure-cluster-C2EGSDiT.d.ts ADDED Viewed

@@ -0,0 +1,76 @@
+import { R as Run, S as Span, b as TraceEvent, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
+/**
+ * Failure taxonomy — canonical classes + a default classifier.
+ *
+ * Every failed run should end up in a named class. The classifier here
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
+ * the consumer for novel cases and trained into the rule base over time.
+ *
+ * Consumers call `classifyFailure({ run, spans, events })` and persist the
+ * returned class as `Run.outcome.failureClass`.
+ */
+interface FailureContext {
+    run: Run;
+    spans: Span[];
+    events: TraceEvent[];
+}
+interface FailureClassification {
+    failureClass: FailureClass;
+    reason: string;
+    triggerSpanId?: string;
+    triggerEventId?: string;
+}
+/** Ordered rules — first match wins. */
+interface FailureRule {
+    id: string;
+    match: (ctx: FailureContext) => {
+        failureClass: FailureClass;
+        reason: string;
+        triggerSpanId?: string;
+        triggerEventId?: string;
+    } | null;
+}
+declare const DEFAULT_RULES: FailureRule[];
+/** Classify the failure mode of a run using an ordered rule list. */
+declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
+/**
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
+ *
+ * Each cluster includes: N runs, scenarios affected, representative
+ * error message, a proposed mitigation hint (rule → action table).
+ */
+interface FailureCluster {
+    failureClass: FailureClass;
+    /** Tool name when the trigger was a tool span, else undefined. */
+    toolName?: string;
+    /** First 16 chars of argHash — clusters similar args. */
+    argPrefix?: string;
+    /**
+     * Source dimension when the trigger was a judge span (e.g. `'format'`,
+     * `'safety'`, `'correctness'`). Lets cross-template aggregators
+     * group failures by the dimension that fired without overloading
+     * `argPrefix`. Optional — legacy clusters without this field
+     * deserialize cleanly.
+     */
+    dimension?: string;
+    runCount: number;
+    scenarioIds: string[];
+    exampleError?: string;
+    exampleRunId: string;
+}
+interface FailureClusterReport {
+    clusters: FailureCluster[];
+    totalFailures: number;
+    totalRuns: number;
+}
+declare function failureClusterView(store: TraceStore, options?: {
+    rules?: FailureRule[];
+    minClusterSize?: number;
+}): Promise<FailureClusterReport>;
+export { DEFAULT_RULES as D, type FailureClusterReport as F, type FailureCluster as a, type FailureClassification as b, type FailureContext as c, type FailureRule as d, classifyFailure as e, failureClusterView as f };

package/dist/feedback-trajectory-DfFdrraJ.d.ts ADDED Viewed

@@ -0,0 +1,169 @@
+import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BuJHoLg0.js';
+import { D as DatasetSplit, a as DatasetScenario } from './dataset-CiK_3LDr.js';
+type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
+type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
+type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
+type FeedbackSeverity = 'info' | 'warning' | 'error' | 'critical';
+interface FeedbackTask {
+    intent: string;
+    context?: unknown;
+}
+interface ProposedSideEffect {
+    type: string;
+    risk?: 'low' | 'medium' | 'high';
+    costUsd?: number;
+    externalSideEffect?: boolean;
+    requiresApproval?: boolean;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackLabel {
+    id?: string;
+    source: FeedbackLabelSource;
+    kind: FeedbackLabelKind;
+    value: unknown;
+    reason?: string;
+    severity?: FeedbackSeverity;
+    createdAt: string;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackAttempt {
+    id: string;
+    stepIndex: number;
+    artifactType: FeedbackArtifactType;
+    artifact: unknown;
+    options?: unknown[];
+    proposedAction?: ProposedSideEffect;
+    evals?: ControlEvalResult[];
+    feedback?: FeedbackLabel[];
+    createdAt: string;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackOutcome {
+    success?: boolean;
+    score?: number;
+    metrics?: Record<string, number>;
+    costUsd?: number;
+    detail?: string;
+    observedAt?: string;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackTrajectory {
+    id: string;
+    projectId?: string;
+    scenarioId?: string;
+    task: FeedbackTask;
+    attempts: FeedbackAttempt[];
+    labels: FeedbackLabel[];
+    outcome?: FeedbackOutcome;
+    split?: DatasetSplit;
+    tags?: Record<string, string>;
+    createdAt: string;
+    updatedAt?: string;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackTrajectoryStore {
+    save(trajectory: FeedbackTrajectory): Promise<void>;
+    get(id: string): Promise<FeedbackTrajectory | null>;
+    list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
+    appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
+    appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
+}
+interface FeedbackTrajectoryFilter {
+    projectId?: string;
+    scenarioId?: string;
+    split?: DatasetSplit;
+    tag?: [string, string];
+}
+interface FeedbackSplitPolicy {
+    trainPct?: number;
+    devPct?: number;
+    testPct?: number;
+    holdoutPct?: number;
+}
+interface PreferenceMemoryEntry {
+    instruction: string;
+    rationale: string;
+    weight: number;
+    sourceTrajectoryId: string;
+    sourceLabelId?: string;
+    category?: string;
+}
+interface FeedbackOptimizerRow {
+    scenarioId: string;
+    trajectoryId: string;
+    labelKinds: FeedbackLabelKind[];
+    score?: number;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackReplayResult {
+    trajectoryId: string;
+    pass: boolean;
+    score?: number;
+    labels: FeedbackLabel[];
+    outcome?: FeedbackOutcome;
+    metadata?: Record<string, unknown>;
+}
+interface FeedbackReplayAdapter {
+    replay(trajectory: FeedbackTrajectory): Promise<Omit<FeedbackReplayResult, 'trajectoryId'>> | Omit<FeedbackReplayResult, 'trajectoryId'>;
+}
+declare class InMemoryFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
+    private readonly trajectories;
+    save(trajectory: FeedbackTrajectory): Promise<void>;
+    get(id: string): Promise<FeedbackTrajectory | null>;
+    list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
+    appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
+    appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
+}
+declare class FileSystemFeedbackTrajectoryStore implements FeedbackTrajectoryStore {
+    private readonly dir;
+    private readonly memory;
+    private loaded;
+    constructor(options: {
+        dir: string;
+    });
+    save(trajectory: FeedbackTrajectory): Promise<void>;
+    get(id: string): Promise<FeedbackTrajectory | null>;
+    list(filter?: FeedbackTrajectoryFilter): Promise<FeedbackTrajectory[]>;
+    appendAttempt(id: string, attempt: FeedbackAttempt): Promise<FeedbackTrajectory>;
+    appendLabel(id: string, label: FeedbackLabel, attemptId?: string): Promise<FeedbackTrajectory>;
+    private append;
+    private load;
+}
+declare function createFeedbackTrajectory(input: {
+    id?: string;
+    projectId?: string;
+    scenarioId?: string;
+    task: FeedbackTask;
+    attempts?: FeedbackAttempt[];
+    labels?: FeedbackLabel[];
+    outcome?: FeedbackOutcome;
+    split?: DatasetSplit;
+    tags?: Record<string, string>;
+    createdAt?: string;
+    metadata?: Record<string, unknown>;
+}): FeedbackTrajectory;
+declare function assignFeedbackSplit(trajectory: Pick<FeedbackTrajectory, 'id' | 'projectId' | 'scenarioId' | 'task'>, policy?: FeedbackSplitPolicy): DatasetSplit;
+declare function withAssignedFeedbackSplit(trajectory: FeedbackTrajectory, policy?: FeedbackSplitPolicy): FeedbackTrajectory;
+declare function feedbackTrajectoryToDatasetScenario(trajectory: FeedbackTrajectory): DatasetScenario;
+declare function feedbackTrajectoriesToDatasetScenarios(trajectories: FeedbackTrajectory[]): DatasetScenario[];
+declare function feedbackTrajectoryToOptimizerRow(trajectory: FeedbackTrajectory): FeedbackOptimizerRow;
+declare function feedbackTrajectoriesToOptimizerRows(trajectories: FeedbackTrajectory[]): FeedbackOptimizerRow[];
+declare function replayFeedbackTrajectory(trajectory: FeedbackTrajectory, adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult>;
+declare function replayFeedbackTrajectories(trajectories: FeedbackTrajectory[], adapter: FeedbackReplayAdapter): Promise<FeedbackReplayResult[]>;
+declare function summarizePreferenceMemory(trajectories: FeedbackTrajectory[], options?: {
+    maxEntries?: number;
+}): PreferenceMemoryEntry[];
+declare function renderPreferenceMemoryMarkdown(entries: PreferenceMemoryEntry[]): string;
+declare function serializeFeedbackTrajectoriesJsonl(trajectories: FeedbackTrajectory[]): string;
+declare function parseFeedbackTrajectoriesJsonl(jsonl: string): FeedbackTrajectory[];
+declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(run: ControlRunResult<TState, TAction, TActionResult>, options?: {
+    projectId?: string;
+    scenarioId?: string;
+    artifactType?: FeedbackArtifactType;
+    artifactFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => unknown;
+    proposedActionFromStep?: (step: ControlStep<TState, TAction, TActionResult>) => ProposedSideEffect | undefined;
+    createdAt?: string;
+}): FeedbackTrajectory;
+export { replayFeedbackTrajectory as A, serializeFeedbackTrajectoriesJsonl as B, summarizePreferenceMemory as C, withAssignedFeedbackSplit as D, type FeedbackArtifactType as F, InMemoryFeedbackTrajectoryStore as I, type PreferenceMemoryEntry as P, type FeedbackAttempt as a, type FeedbackLabel as b, type FeedbackLabelKind as c, type FeedbackLabelSource as d, type FeedbackOptimizerRow as e, type FeedbackOutcome as f, type FeedbackReplayAdapter as g, type FeedbackReplayResult as h, type FeedbackSeverity as i, type FeedbackSplitPolicy as j, type FeedbackTask as k, type FeedbackTrajectory as l, type FeedbackTrajectoryFilter as m, type FeedbackTrajectoryStore as n, FileSystemFeedbackTrajectoryStore as o, type ProposedSideEffect as p, assignFeedbackSplit as q, controlRunToFeedbackTrajectory as r, createFeedbackTrajectory as s, feedbackTrajectoriesToDatasetScenarios as t, feedbackTrajectoriesToOptimizerRows as u, feedbackTrajectoryToDatasetScenario as v, feedbackTrajectoryToOptimizerRow as w, parseFeedbackTrajectoriesJsonl as x, renderPreferenceMemoryMarkdown as y, replayFeedbackTrajectories as z };

package/dist/governance/index.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+export { E as EuRiskClass, b as GovernanceContext, c as GovernanceFinding, d as GovernanceReport, U as UseCaseSignals, j as classifyEuAiRisk, k as euAiActReport, n as nistAiRmfReport, m as renderMarkdown, q as soc2Report, t as summarize } from '../index-Oj9fAPPN.js';
+import '../dataset-CiK_3LDr.js';
+import '../errors-BZ9sTdz7.js';
+import '../outcome-store-D6KWmYvj.js';
+import '../store-Db2Bv8Cf.js';

package/dist/governance/index.js ADDED Viewed

@@ -0,0 +1,18 @@
+import {
+  classifyEuAiRisk,
+  euAiActReport,
+  nistAiRmfReport,
+  renderMarkdown,
+  soc2Report,
+  summarize
+} from "../chunk-KKHDIONI.js";
+import "../chunk-PZ5AY32C.js";
+export {
+  classifyEuAiRisk,
+  euAiActReport,
+  nistAiRmfReport,
+  renderMarkdown,
+  soc2Report,
+  summarize
+};
+//# sourceMappingURL=index.js.map

package/dist/governance/index.js.map ADDED Viewed

@@ -0,0 +1 @@
+{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}

package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { a as RunSplitTag } from './run-record-DNiOMBrZ.js';
+import { a as RunSplitTag } from './run-record-CqzahIbx.js';
 /**
  * Shared types for the reference benchmark wrappers under

package/dist/index-Oj9fAPPN.d.ts ADDED Viewed

@@ -0,0 +1,270 @@
+import { a as DatasetScenario, c as Dataset, b as DatasetManifest } from './dataset-CiK_3LDr.js';
+import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
+import { T as TraceStore } from './store-Db2Bv8Cf.js';
+/**
+ * Judge calibration — measure judge quality against human gold + bias.
+ *
+ * Workflow:
+ *   1. Build a golden set: {itemId, humanScore}[].
+ *   2. Run candidate judges; each produces {itemId, score}.
+ *   3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
+ *   4. Run bias probes (positional, verbosity, self-preference) to
+ *      detect systematic score inflation.
+ *
+ * Returns actionable diagnostics, not a single number. Consumers then
+ * decide whether to trust the judge, retrain it, or add a tie-breaker.
+ */
+interface GoldenItem {
+    itemId: string;
+    humanScore: number;
+    /** Optional group used for per-group bias audits (e.g. model-of-output family). */
+    group?: string;
+}
+interface CandidateScore {
+    itemId: string;
+    score: number;
+    /** Optional — enables positional-bias analysis (did order matter?). */
+    positionOfAInput?: 'first' | 'second';
+}
+interface CalibrationResult {
+    n: number;
+    pearson: number;
+    /** Cohen's κ with quadratic weights over integer-rounded scores. */
+    kappa: number;
+    /** Mean absolute error vs human. */
+    mae: number;
+    /** Worst-5 miscalibrations (largest |judge - human|). */
+    worstItems: Array<{
+        itemId: string;
+        judge: number;
+        human: number;
+        delta: number;
+    }>;
+}
+declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
+interface PositionalBiasResult {
+    /**
+     * Score delta (first-position - second-position) averaged across items
+     * presented in both positions. Non-zero = positional bias.
+     */
+    avgDelta: number;
+    n: number;
+}
+/**
+ * Feed the same items to the judge twice with A/B swapped and pass all
+ * results here. Items that don't appear in both positions are ignored.
+ */
+declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
+interface VerbosityBiasResult {
+    /** Pearson correlation between output length and score. Strong positive = verbosity bias. */
+    pearson: number;
+    n: number;
+}
+declare function verbosityBias(samples: Array<{
+    outputLen: number;
+    score: number;
+}>): VerbosityBiasResult;
+interface SelfPreferenceResult {
+    /** Mean judge score when judge's family matches output's family. */
+    inFamilyMean: number;
+    outOfFamilyMean: number;
+    deltaMean: number;
+    n: number;
+}
+/**
+ * Pass the same scenarios scored with judge-model X grading outputs from
+ * model X (in-family) and model Y (out-of-family). Non-zero delta
+ * indicates self-preference.
+ */
+declare function selfPreference(samples: Array<{
+    score: number;
+    inFamily: boolean;
+}>): SelfPreferenceResult;
+/**
+ * Red-team battery — adversarial scenario corpus with per-category
+ * scorers.
+ *
+ * Every scenario is a `DatasetScenario` with a `payload: {...}` field
+ * that declares: category, expected refusal, attack surface. Scorers
+ * inspect the agent's output + trajectory and return a PASS/FAIL per
+ * category. Aggregated via `redTeamReport`.
+ *
+ * The built-in corpus is intentionally conservative — 40 cases
+ * covering the most-seen attack patterns. Extend by passing extra cases to `redTeamDataset`.
+ */
+type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
+interface RedTeamPayload {
+    category: RedTeamCategory;
+    /** The hostile input the agent sees. */
+    input: string;
+    /**
+     * What a safe agent should do: "refuse" means explicit refusal,
+     * "ignore" means process benignly without executing the attack.
+     */
+    expected: 'refuse' | 'ignore';
+    /** Substring(s) that MUST NOT appear in output. */
+    forbiddenStrings?: string[];
+    /** Tool names the agent MUST NOT call for this scenario. */
+    forbiddenTools?: string[];
+}
+interface RedTeamCase extends DatasetScenario {
+    payload: RedTeamPayload;
+}
+interface RedTeamFinding {
+    scenarioId: string;
+    category: RedTeamCategory;
+    passed: boolean;
+    reason: string;
+    evidence?: string;
+}
+interface RedTeamReport {
+    findings: RedTeamFinding[];
+    passRateByCategory: Record<RedTeamCategory, number>;
+    overallPassRate: number;
+}
+/** Ship a canonical, small corpus. Consumers extend via `redTeamDataset(extraCases)`. */
+declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
+declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
+/**
+ * Score a single agent response against the red-team case.
+ * `toolCalls` should be the names of tools the agent invoked during the run.
+ */
+declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
+/** Aggregate red-team findings into per-category pass rates. */
+declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
+/**
+ * Extract the tool-call names from a corpus run — convenience for the
+ * common pipeline (run the scenario → score the run).
+ */
+declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
+/**
+ * Governance reporting — shared types.
+ *
+ * The framework collects a `GovernanceContext` (traces + outcomes +
+ * dataset manifests + red-team results + judge calibration) and each
+ * specific template (NIST AI RMF, SOC2, EU AI Act) renders a
+ * structured report from it.
+ *
+ * Reports are machine-readable JSON first; human-readable Markdown is a
+ * pure transform on top. External auditors consume the Markdown; CI
+ * consumes the JSON.
+ */
+interface GovernanceContext {
+    /** Legal / org identity for the report. */
+    organization: string;
+    /** System / agent identifier. */
+    systemName: string;
+    /** ISO8601 period the report covers. */
+    periodStart: string;
+    periodEnd: string;
+    /** Versioned dataset manifests used during the period. */
+    datasets: DatasetManifest[];
+    traceStore: TraceStore;
+    outcomeStore?: OutcomeStore;
+    /** Cached red-team results for the period, if available. */
+    redTeam?: RedTeamReport;
+    /** Judge-vs-human calibration results, if measured. */
+    judgeCalibration?: CalibrationResult[];
+    /** Responsible owner for the system — role + name + email. */
+    owner: {
+        role: string;
+        name: string;
+        email: string;
+    };
+}
+interface GovernanceFinding {
+    id: string;
+    severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
+    /** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
+    control: string;
+    summary: string;
+    evidence?: string;
+    remediation?: string;
+}
+interface GovernanceReport {
+    framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
+    version: string;
+    context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
+    summary: {
+        findings: number;
+        bySeverity: Record<GovernanceFinding['severity'], number>;
+        overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
+    };
+    findings: GovernanceFinding[];
+    /** Framework-specific structured payload (mapped controls, risk class, etc.). */
+    payload: Record<string, unknown>;
+    generatedAt: string;
+}
+declare function renderMarkdown(report: GovernanceReport): string;
+declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
+/**
+ * EU AI Act — risk-class classification + compliance checklist.
+ *
+ * Classification is declarative: caller supplies the domain/use-case
+ * signals (biometric? critical infrastructure? education? employment?
+ * access to services?) and we map to the Act's risk tiers:
+ *   - "unacceptable" (prohibited)
+ *   - "high"        (Annex III — strict obligations)
+ *   - "limited"     (transparency obligations)
+ *   - "minimal"     (voluntary codes of conduct)
+ *
+ * Then the compliance checklist enumerates Article 9 (risk mgmt),
+ * 10 (data + data governance), 11 (technical documentation), 13
+ * (transparency), 14 (human oversight), 15 (accuracy + robustness)
+ * requirements and flags gaps.
+ */
+type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
+interface UseCaseSignals {
+    /** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
+    biometricPublic?: boolean;
+    /** Social scoring by public authorities? (Art. 5). */
+    socialScoring?: boolean;
+    /** Subliminal manipulation? (Art. 5). */
+    subliminal?: boolean;
+    /** Annex III sector: critical infrastructure / education / employment /
+     *  access to essential services / law enforcement / migration /
+     *  administration of justice / democratic processes? */
+    annexIII?: boolean;
+    /** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
+    chatbot?: boolean;
+    /** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
+    generatesSyntheticMedia?: boolean;
+}
+declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
+declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
+/**
+ * NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
+ *
+ * Each subcategory derives its status from concrete framework state:
+ *   MEASURE 2.x: do we have a calibration regime? contamination controls?
+ *   MEASURE 2.7: are red-team results available?
+ *   MANAGE 1.x: are outcome metrics captured? correlation measured?
+ *   GOVERN 1.x: dataset + prompt provenance recorded?
+ *
+ * We ship the mapping and the derivation rules; consumers supply the
+ * GovernanceContext.
+ */
+declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
+/**
+ * SOC 2 — Common Criteria 7 (system operations + change management)
+ * audit trail derived from the trace corpus.
+ *
+ * This is NOT a formal SOC2 report — that requires an external
+ * auditor. What we ship is the machine-readable *evidence* package
+ * that an auditor consumes: run counts, deploy events, access log
+ * summary, anomaly tracking, response-time SLOs.
+ */
+declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
+export { type CalibrationResult as C, DEFAULT_RED_TEAM_CORPUS as D, type EuRiskClass as E, type GoldenItem as G, type PositionalBiasResult as P, type RedTeamCase as R, type SelfPreferenceResult as S, type UseCaseSignals as U, type VerbosityBiasResult as V, type CandidateScore as a, type GovernanceContext as b, type GovernanceFinding as c, type GovernanceReport as d, type RedTeamCategory as e, type RedTeamFinding as f, type RedTeamPayload as g, type RedTeamReport as h, calibrateJudge as i, classifyEuAiRisk as j, euAiActReport as k, redTeamReport as l, renderMarkdown as m, nistAiRmfReport as n, selfPreference as o, positionalBias as p, soc2Report as q, redTeamDataset as r, scoreRedTeamOutput as s, summarize as t, toolNamesForRun as u, verbosityBias as v };