npm - @tangle-network/agent-eval - Versions diffs - 0.20.11 → 0.20.12 - Mend

@tangle-network/agent-eval 0.20.11 → 0.20.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/README.md +99 -170
package/dist/benchmarks/index.d.ts +2 -1
package/dist/{chunk-JAOLXRIA.js → chunk-75MCTH7P.js} +8 -2
package/dist/chunk-75MCTH7P.js.map +1 -0
package/dist/chunk-HKYRWNHV.js +1354 -0
package/dist/chunk-HKYRWNHV.js.map +1 -0
package/dist/{chunk-LSR4IAYN.js → chunk-HNJLMAJ2.js} +2 -2
package/dist/chunk-IKFVX537.js +717 -0
package/dist/chunk-IKFVX537.js.map +1 -0
package/dist/chunk-KWUAAIHR.js +1764 -0
package/dist/chunk-KWUAAIHR.js.map +1 -0
package/dist/chunk-MCMV7DUL.js +1310 -0
package/dist/chunk-MCMV7DUL.js.map +1 -0
package/dist/chunk-ODFINDLQ.js +413 -0
package/dist/chunk-ODFINDLQ.js.map +1 -0
package/dist/chunk-PKCVBYTQ.js +200 -0
package/dist/chunk-PKCVBYTQ.js.map +1 -0
package/dist/chunk-YUFXO3TU.js +148 -0
package/dist/chunk-YUFXO3TU.js.map +1 -0
package/dist/cli.js +2 -2
package/dist/control-C8NKbF3w.d.ts +258 -0
package/dist/control.d.ts +5 -0
package/dist/control.js +30 -0
package/dist/control.js.map +1 -0
package/dist/dataset-B9qvlm_o.d.ts +112 -0
package/dist/emitter-BYO2nSDA.d.ts +387 -0
package/dist/feedback-trajectory-BGQ_ANCN.d.ts +345 -0
package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
package/dist/index.d.ts +115 -2870
package/dist/index.js +1049 -6156
package/dist/index.js.map +1 -1
package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +145 -0
package/dist/optimization.js +60 -0
package/dist/optimization.js.map +1 -0
package/dist/reporting.d.ts +426 -0
package/dist/reporting.js +32 -0
package/dist/reporting.js.map +1 -0
package/dist/run-record-CX_jcAyr.d.ts +134 -0
package/dist/traces.d.ts +658 -0
package/dist/traces.js +100 -0
package/dist/traces.js.map +1 -0
package/dist/wire/index.js +2 -2
package/docs/concepts.md +16 -11
package/docs/feature-guide.md +10 -17
package/docs/integration-launch-gates.md +77 -0
package/docs/product-eval-adoption.md +27 -0
package/docs/trace-analysis.md +75 -0
package/package.json +21 -1
package/dist/chunk-JAOLXRIA.js.map +0 -1
package/dist/{chunk-LSR4IAYN.js.map → chunk-HNJLMAJ2.js.map} +0 -0

package/dist/control-C8NKbF3w.d.ts ADDED Viewed

@@ -0,0 +1,258 @@
+import { c as ControlEvalResult, i as ControlRunResult, F as FeedbackLabel, A as ProposedSideEffect, j as ControlRuntimeConfig } from './feedback-trajectory-BGQ_ANCN.js';
+import { R as RunSplitTag, e as RunTokenUsage, a as RunRecord } from './run-record-CX_jcAyr.js';
+import { T as TraceEmitter, a as TraceStore, F as FailureClass } from './emitter-BYO2nSDA.js';
+interface RunEvidenceMetadata {
+    experimentId: string;
+    candidateId: string;
+    seed: number;
+    model: string;
+    promptHash: string;
+    configHash: string;
+    commitSha: string;
+    splitTag: RunSplitTag;
+    tokenUsage: RunTokenUsage;
+    queueMs?: number;
+    judgeMetadata?: RunRecord['judgeMetadata'];
+    raw?: Record<string, number>;
+}
+interface ControlRunToRunRecordOptions extends RunEvidenceMetadata {
+    runId?: string;
+    score?: number;
+    failureMode?: string;
+}
+/**
+ * Project a completed control-loop run into the strict RunRecord shape used by
+ * release gates, optimizer tables, and research reports.
+ *
+ * The control loop owns live execution evidence. The caller still supplies the
+ * experimental cell metadata because prompt/config hashes, split assignment,
+ * model snapshot, and commit SHA are product/harness concerns.
+ */
+declare function controlRunToRunRecord<TState, TAction, TActionResult, TEval extends ControlEvalResult = ControlEvalResult>(run: ControlRunResult<TState, TAction, TActionResult, TEval>, options: ControlRunToRunRecordOptions): RunRecord;
+declare function scoreFromEvals(evals: readonly ControlEvalResult[]): number | undefined;
+interface ActionExecutionPolicy {
+    allowedTypes?: string[];
+    blockedTypes?: string[];
+    alwaysRequireApprovalTypes?: string[];
+    autoApproveTypes?: string[];
+    requireApprovalForExternalSideEffects?: boolean;
+    requireApprovalAboveCostUsd?: number;
+    maxActionCostUsd?: number;
+    remainingBudgetUsd?: number;
+    expectedOutcomeRequired?: boolean;
+    killCriteriaRequired?: boolean;
+}
+interface ActionPolicyDecision {
+    allowed: boolean;
+    blocked: boolean;
+    requiresApproval: boolean;
+    reasons: string[];
+    label?: FeedbackLabel;
+}
+declare function evaluateActionPolicy(action: ProposedSideEffect, policy?: ActionExecutionPolicy, options?: {
+    createdAt?: string;
+}): ActionPolicyDecision;
+/**
+ * Propose / Verify / Review — the core multi-shot primitive.
+ *
+ *   shot N:  propose(state, priorReview) → new state
+ *            verify(state)               → pass/fail, optional layers
+ *            review(state, verification, memory) → observations + next-shot
+ *                                                   instruction + shouldContinue
+ *            memory.append(entry)
+ *
+ * Roles are strictly separated:
+ *
+ *   - The WORKER is whatever the caller wraps in `propose`. It is
+ *     stateful — caller owns its resume/session mechanism.
+ *   - The VERIFIER grades the state. It produces the ground truth.
+ *     The reviewer cannot overturn or downgrade a verification layer.
+ *   - The REVIEWER is stateless per call. Its continuity is the
+ *     `ReviewMemoryStore` — durable JSONL by default, or any store
+ *     implementing the interface. It reads memory + trace summary +
+ *     verification and directs the NEXT proposer shot.
+ *
+ * This shape is load-bearing. The reviewer never grades; the verifier
+ * never directs. Two processes, two prompts, two concerns — which is
+ * what keeps the loop from confirmation-biasing itself into "all
+ * passed" when it didn't.
+ *
+ * Short-circuits and soft-fails are both first-class:
+ *   - verify.pass === true  → reviewer LLM call is skipped, memory
+ *     records a success entry, loop exits.
+ *   - review throws         → the shot still counts; the loop uses the
+ *     last-known instruction (or `fallbackInstruction`) for the next
+ *     propose call. A transient reviewer failure must NEVER abort a
+ *     valid arc.
+ *
+ * Composable: `propose` itself can be another `runProposeReview` call.
+ * That's the dogfooding path — a harness built on this primitive is in
+ * turn evaluable by it.
+ */
+interface Verification {
+    pass: boolean;
+    score?: number;
+    failingLayers?: string[];
+    details?: unknown;
+}
+interface Review {
+    observations: string;
+    diagnosis: string;
+    nextShotInstruction: string;
+    shouldContinue: boolean;
+    confidence: number;
+}
+interface ReviewMemoryEntry extends Review {
+    shot: number;
+    timestamp: number;
+    verification: {
+        pass: boolean;
+        score?: number;
+        failingLayers?: string[];
+    };
+}
+interface ProposeInput<State> {
+    shot: number;
+    goal: string;
+    state: State;
+    priorReview: Review | null;
+    abortSignal: AbortSignal;
+    emitter?: TraceEmitter;
+}
+interface ProposeOutput<State, Summary = unknown> {
+    state: State;
+    traceSummary?: Summary;
+}
+interface ReviewInput<State, Summary = unknown> {
+    shot: number;
+    goal: string;
+    state: State;
+    verification: Verification;
+    traceSummary: Summary | undefined;
+    memory: ReviewMemoryEntry[];
+}
+type ProposeFn<State, Summary = unknown> = (input: ProposeInput<State>) => Promise<ProposeOutput<State, Summary>>;
+type VerifyFn<State> = (state: State) => Promise<Verification>;
+type ReviewFn<State, Summary = unknown> = (input: ReviewInput<State, Summary>) => Promise<Review>;
+interface ReviewMemoryStore {
+    load(): Promise<ReviewMemoryEntry[]>;
+    append(entry: ReviewMemoryEntry): Promise<void>;
+}
+interface ProposeReviewConfig<State, Summary = unknown> {
+    goal: string;
+    initialState: State;
+    propose: ProposeFn<State, Summary>;
+    verify: VerifyFn<State>;
+    review: ReviewFn<State, Summary>;
+    /** Hard shot cap. Default 10. */
+    maxShots?: number;
+    /** Wall-clock cap in ms. Default 10 min. */
+    maxWallMs?: number;
+    /**
+     * If the reviewer returns confidence ≤ floor on `confidenceFloorWindow`
+     * consecutive shots, terminate early. Default floor 0.3, window 2.
+     * Set window to 0 or floor to <0 to disable.
+     */
+    confidenceFloor?: number;
+    confidenceFloorWindow?: number;
+    /** Defaults to an in-memory store if omitted. */
+    memory?: ReviewMemoryStore;
+    /** If provided, emit a Run + per-shot spans. */
+    store?: TraceStore;
+    scenarioId?: string;
+    projectId?: string;
+    variantId?: string;
+    /**
+     * Used when the reviewer soft-fails on shot 1 (no prior instruction to
+     * fall back to). Default is a generic "inspect failures and fix".
+     */
+    fallbackInstruction?: string;
+}
+interface ProposeReviewShot<State, Summary = unknown> {
+    shot: number;
+    state: State;
+    verification: Verification;
+    traceSummary: Summary | undefined;
+    review: Review;
+    reviewAvailable: boolean;
+    reviewError?: string;
+    durationMs: number;
+}
+interface ProposeReviewReport<State, Summary = unknown> {
+    runId: string | null;
+    completed: boolean;
+    shots: ProposeReviewShot<State, Summary>[];
+    finalState: State;
+    finalVerification: Verification;
+    failureClass?: FailureClass;
+    wallMs: number;
+    score: number;
+}
+declare function inMemoryReviewStore(initial?: ReviewMemoryEntry[]): ReviewMemoryStore;
+declare function jsonlReviewStore(path: string): ReviewMemoryStore;
+declare function runProposeReview<State, Summary = unknown>(config: ProposeReviewConfig<State, Summary>): Promise<ProposeReviewReport<State, Summary>>;
+interface LlmJsonCall {
+    (req: {
+        system: string;
+        user: string;
+    }): Promise<unknown>;
+}
+interface LlmReviewerConfig<State, Summary = unknown> {
+    callJson: LlmJsonCall;
+    renderState?: (state: State) => string;
+    renderTraceSummary?: (summary: Summary | undefined) => string;
+    /** Appended to the default system prompt. */
+    systemPromptAddendum?: string;
+}
+declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
+interface ProposeReviewControlState<State, Summary = unknown> {
+    shot: number;
+    state: State;
+    priorReview: Review | null;
+    verification: Verification;
+    traceSummary?: Summary;
+    memory: ReviewMemoryEntry[];
+    completed: boolean;
+    reviewAvailable: boolean;
+    reviewError?: string;
+}
+interface ProposeReviewControlAction {
+    type: 'propose-review-shot';
+    shot: number;
+}
+interface ProposeReviewControlResult<State, Summary = unknown> {
+    state: State;
+    verification: Verification;
+    traceSummary?: Summary;
+    review: Review | null;
+    reviewAvailable: boolean;
+    reviewError?: string;
+}
+interface ProposeReviewControlConfig<State, Summary = unknown> {
+    goal: string;
+    initialState: State;
+    propose: ProposeFn<State, Summary>;
+    verify: VerifyFn<State>;
+    review: ReviewFn<State, Summary>;
+    maxShots?: number;
+    maxWallMs?: number;
+    memory?: ReviewMemoryStore;
+    store?: TraceStore;
+    scenarioId?: string;
+    projectId?: string;
+    variantId?: string;
+    fallbackInstruction?: string;
+    confidenceFloor?: number;
+    confidenceFloorWindow?: number;
+    failureClassFromVerification?: (verification: Verification) => FailureClass | undefined;
+    actionFailure?: ControlRuntimeConfig<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>['actionFailure'];
+}
+declare function runProposeReviewAsControlLoop<State, Summary = unknown>(config: ProposeReviewControlConfig<State, Summary>): Promise<ControlRunResult<ProposeReviewControlState<State, Summary>, ProposeReviewControlAction, ProposeReviewControlResult<State, Summary>>>;
+declare function controlFailureClassFromVerification(verification: Verification): FailureClass | undefined;
+export { type ActionExecutionPolicy as A, type ControlRunToRunRecordOptions as C, type LlmJsonCall as L, type ProposeFn as P, type Review as R, type Verification as V, type ActionPolicyDecision as a, type LlmReviewerConfig as b, type ProposeInput as c, type ProposeOutput as d, type ProposeReviewConfig as e, type ProposeReviewControlAction as f, type ProposeReviewControlConfig as g, type ProposeReviewControlResult as h, type ProposeReviewControlState as i, type ProposeReviewReport as j, type ProposeReviewShot as k, type ReviewFn as l, type ReviewInput as m, type ReviewMemoryEntry as n, type ReviewMemoryStore as o, type RunEvidenceMetadata as p, type VerifyFn as q, controlFailureClassFromVerification as r, controlRunToRunRecord as s, createLlmReviewer as t, evaluateActionPolicy as u, inMemoryReviewStore as v, jsonlReviewStore as w, runProposeReview as x, runProposeReviewAsControlLoop as y, scoreFromEvals as z };

package/dist/control.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, c as ControlEvalResult, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, C as ControlSeverity, l as ControlStep, m as ControlStopPolicies, S as StopDecision, B as allCriticalPassed, M as objectiveEval, T as runAgentControlLoop, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval } from './feedback-trajectory-BGQ_ANCN.js';
+export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-C8NKbF3w.js';
+import './dataset-B9qvlm_o.js';
+import './emitter-BYO2nSDA.js';
+import './run-record-CX_jcAyr.js';

package/dist/control.js ADDED Viewed

@@ -0,0 +1,30 @@
+import {
+  allCriticalPassed,
+  controlRunToRunRecord,
+  evaluateActionPolicy,
+  objectiveEval,
+  runAgentControlLoop,
+  runProposeReview,
+  runProposeReviewAsControlLoop,
+  scoreFromEvals,
+  stopOnNoProgress,
+  stopOnRepeatedAction,
+  subjectiveEval
+} from "./chunk-MCMV7DUL.js";
+import "./chunk-YUFXO3TU.js";
+import "./chunk-PKCVBYTQ.js";
+import "./chunk-PZ5AY32C.js";
+export {
+  allCriticalPassed,
+  controlRunToRunRecord,
+  evaluateActionPolicy,
+  objectiveEval,
+  runAgentControlLoop,
+  runProposeReview,
+  runProposeReviewAsControlLoop,
+  scoreFromEvals,
+  stopOnNoProgress,
+  stopOnRepeatedAction,
+  subjectiveEval
+};
+//# sourceMappingURL=control.js.map

package/dist/control.js.map ADDED Viewed

@@ -0,0 +1 @@
+{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}

package/dist/dataset-B9qvlm_o.d.ts ADDED Viewed

@@ -0,0 +1,112 @@
+/**
+ * Dataset — versioned, sliceable, content-hashed scenario collection.
+ *
+ * Scenarios stop being ephemeral arrays and become first-class
+ * artifacts. Every Dataset carries:
+ *   - content hash (sha256 over canonicalized scenario array)
+ *   - provenance (contributor, createdAt, sourceUrl)
+ *   - split labels (train | dev | test | holdout)
+ *   - difficulty tiers (easy | medium | hard | extreme)
+ *   - tags (free-form, per-scenario)
+ *
+ * `Dataset.slice({ difficulty, split, holdout, seed })` returns a
+ * deterministic, reproducible subset. Holdout slices are locked: you
+ * can read them but `mutate` throws, which prevents "oh I'll just
+ * tweak that one scenario" contamination drift.
+ */
+type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
+type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
+interface DatasetScenario {
+    id: string;
+    /** Arbitrary payload; the framework doesn't interpret it. */
+    payload: unknown;
+    split?: DatasetSplit;
+    difficulty?: DatasetDifficulty;
+    /** Canary token that MUST NOT round-trip through a correct agent output. */
+    canary?: string;
+    /**
+     * Behavioral-canary forbidden pattern. A string OR a serialized regex
+     * (`/.../flags`) that the agent under test MUST NOT emit. Used by
+     * {@link import('./canary').checkBehavioralCanary | checkBehavioralCanary},
+     * which inverts the contamination-style semantic: presence in the
+     * agent output is a LEAK / failure, not a positive signal.
+     *
+     * Falls back to {@link canary} when omitted.
+     */
+    forbiddenPattern?: string;
+    tags?: Record<string, string>;
+}
+interface DatasetProvenance {
+    contributor?: string;
+    createdAt: string;
+    sourceUrl?: string;
+    license?: string;
+    description?: string;
+    /** Monotonic human-readable version (e.g. "2026.04.20"). */
+    version: string;
+}
+interface DatasetManifest {
+    name: string;
+    provenance: DatasetProvenance;
+    /** sha256 hex over canonicalized scenarios. */
+    contentHash: string;
+    scenarioCount: number;
+    splitCounts: Record<DatasetSplit, number>;
+}
+interface SliceOptions {
+    split?: DatasetSplit;
+    difficulty?: DatasetDifficulty;
+    /** Number of scenarios (random sample, seeded). Omit to take all that match. */
+    limit?: number;
+    seed?: number;
+    /** Predicate narrowing. Applied after split/difficulty filters. */
+    filter?: (scenario: DatasetScenario) => boolean;
+    /** If true, include scenarios marked as holdout. Default false. */
+    includeHoldout?: boolean;
+}
+/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
+declare class HoldoutLockedError extends Error {
+    constructor(datasetName: string);
+}
+declare class Dataset {
+    readonly name: string;
+    readonly provenance: DatasetProvenance;
+    private scenarios;
+    private locked;
+    constructor(init: {
+        name: string;
+        provenance: DatasetProvenance;
+        scenarios: DatasetScenario[];
+        locked?: boolean;
+    });
+    /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
+    all(): readonly DatasetScenario[];
+    get size(): number;
+    /**
+     * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
+     * the same arguments always produce the same slice across machines.
+     */
+    slice(options?: SliceOptions): DatasetScenario[];
+    /**
+     * Assemble the manifest (name + provenance + content hash + counts).
+     * Content hash is deterministic over canonicalized scenarios.
+     */
+    manifest(): Promise<DatasetManifest>;
+    /** Fresh unlocked copy — for post-release forks when mutation is needed. */
+    clone(overrides?: Partial<{
+        name: string;
+        version: string;
+    }>): Dataset;
+    lock(): void;
+    add(scenario: DatasetScenario): void;
+    remove(scenarioId: string): void;
+    /**
+     * Stable JSON-Lines serialization — deterministic byte-for-byte.
+     * Write to disk for contamination-verifiable archives.
+     */
+    toJsonl(): string;
+    static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
+}
+declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
+export { type DatasetSplit as D, HoldoutLockedError as H, type SliceOptions as S, type DatasetScenario as a, Dataset as b, type DatasetManifest as c, type DatasetDifficulty as d, type DatasetProvenance as e, hashScenarios as h };