npm - @tangle-network/agent-eval - Versions diffs - 0.65.0 → 0.66.0 - Mend

@tangle-network/agent-eval 0.65.0 → 0.66.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/CHANGELOG.md +12 -0
package/dist/adapters/otel.d.ts +1 -1
package/dist/campaign/index.d.ts +4 -3
package/dist/campaign/index.js +18 -19
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
package/dist/chunk-6XQIEUQ2.js.map +1 -0
package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
package/dist/chunk-DFS3FEXO.js.map +1 -0
package/dist/{chunk-4ODZXQV2.js → chunk-Q56RRLEC.js} +635 -2
package/dist/chunk-Q56RRLEC.js.map +1 -0
package/dist/chunk-RDK3P4JE.js +482 -0
package/dist/chunk-RDK3P4JE.js.map +1 -0
package/dist/contract/index.d.ts +10 -8
package/dist/contract/index.js +11 -12
package/dist/contract/index.js.map +1 -1
package/dist/hosted/index.d.ts +1 -1
package/dist/hosted/index.js +1 -1
package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
package/dist/index.d.ts +246 -3
package/dist/index.js +292 -2
package/dist/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/provenance-BZUFC1_D.d.ts +292 -0
package/dist/{registry-DPly4_hZ.d.ts → registry-BzAEvqAt.d.ts} +1 -1
package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
package/package.json +1 -1
package/dist/chunk-4ODZXQV2.js.map +0 -1
package/dist/chunk-7TPYV2ER.js.map +0 -1
package/dist/chunk-CZRKD2X2.js +0 -1104
package/dist/chunk-CZRKD2X2.js.map +0 -1
package/dist/chunk-E22YUOAL.js +0 -111
package/dist/chunk-E22YUOAL.js.map +0 -1
package/dist/chunk-HKINEDRZ.js.map +0 -1
package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.65.0",
+    "version": "0.66.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/provenance-BZUFC1_D.d.ts ADDED Viewed

@@ -0,0 +1,292 @@
+import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, f as CampaignResult, M as MutableSurface, k as GateResult, j as GateDecision } from './types-c2R2kfmv.js';
+import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
+import { R as RunRecord } from './run-record-BgTFzO2r.js';
+import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BKpM5T4t.js';
+import { H as HostedClient, T as TraceSpanEvent } from './index-DSEHMwvS.js';
+/**
+ * @experimental
+ *
+ * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
+ * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
+ * the evolutionary strategy: each generation, mutate the current best surface
+ * into N candidates, measure, select. No generation memory beyond the current
+ * surface; the loop body handles ranking + promotion.
+ *
+ * The reflective alternative is agent-runtime's `improvementDriver` with a
+ * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +
+ * trace findings to propose targeted edits rather than blind mutations. Both
+ * conform to `ImprovementDriver`; the improvement loop is identical regardless
+ * of which drives it.
+ */
+interface EvolutionaryDriverOptions<TFindings = unknown> {
+    mutator: Mutator<TFindings>;
+    /** External findings fed to the mutator each generation. Default: []. */
+    findings?: TFindings[];
+}
+declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
+/**
+ * @experimental
+ *
+ * Compose multiple `Gate` implementations — every gate must pass for the
+ * composite to ship. Closes the alignment reviewer's "default-only
+ * heldOutGate + costGate would happily promote a reward-hacked prompt"
+ * concern by making safety gates first-class composable defaults.
+ */
+/** Compose gates — all must `ship` for the composite to `ship`. The first
+ *  non-ship verdict decides the composite verdict, but ALL gates still run
+ *  (so the result records every gate's reason — useful for diagnostics). */
+declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
+/**
+ * @experimental
+ *
+ * `defaultProductionGate` — composes the substrate's existing safety
+ * primitives (red-team / reward-hacking / canary / heldout) into a single
+ * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
+ * primitives are off the critical path" blocker.
+ *
+ * The composition is opinionated — when consumers wire `runImprovementLoop`,
+ * THIS gate is the default. Consumers can still pass a custom gate to
+ * override; the recommended pattern is to compose THIS gate with whatever
+ * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
+ */
+interface DefaultProductionGateOptions {
+    /** Required: scenarios held out from training; substrate compares
+     *  candidate-on-holdout vs baseline-on-holdout. */
+    holdoutScenarios: Scenario[];
+    /** Minimum mean-composite improvement required to ship. Default 0.5. */
+    deltaThreshold?: number;
+    /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
+     *  The composite verdict refuses to ship when spend exceeds the budget. */
+    budgetUsd?: number;
+    /** Red-team cases to probe candidate outputs against. When omitted the
+     *  substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
+     *  battery for tighter coverage. */
+    redTeamBattery?: RedTeamCase[];
+    /** Run records (oldest-first) needed for the reward-hacking detector.
+     *  Substrate populates from prior production-loop generations. */
+    recentRuns?: RunRecord[];
+    /** When true, the gate refuses to ship if the reward-hacking detector
+     *  fires at the `gaming` severity. Default true. */
+    blockOnRewardHackingGaming?: boolean;
+}
+declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
+/**
+ * @experimental
+ *
+ * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
+ * `Gate`. Use when you want held-out as one of N composed gates instead of
+ * the full `defaultProductionGate` stack.
+ */
+interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
+    scenarios: TScenario[];
+    deltaThreshold?: number;
+}
+declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
+/**
+ * @experimental
+ *
+ * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
+ * gate, no auto-PR. Just: run scenarios through dispatch, score with
+ * judges, return CampaignResult.
+ *
+ * The 80% case for consumers who want a scorecard, not an improvement loop.
+ */
+interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
+    runDir: string;
+}
+declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
+/**
+ * @experimental
+ *
+ * Loop provenance — the durable, queryable record of WHAT a self-improvement
+ * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
+ * an eval-run to the underlying candidate→cell→gate→promote chain.
+ *
+ * Two artifacts, one source of truth:
+ *
+ *   1. `LoopProvenanceRecord` — a structured JSON record capturing every
+ *      candidate (surfaceHash + label + rationale), its measured composite,
+ *      the gate decision + reasons + delta, the held-out lift, the explicit
+ *      baseline→candidate diff, and BACKEND PROVENANCE (the
+ *      `assertRealBackend` verdict + worker call count + model). This is the
+ *      ingestable audit artifact: the +lift recomputes from it, the "because
+ *      Z" rationale survives in it, and a stub backend is detectable from it.
+ *
+ *   2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
+ *      `TraceSpanEvent`s, pivoted on the substrate's standard
+ *      `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
+ *      `tangle.generation` attributes (the same pivots `/adapters/otel`
+ *      reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
+ *      not just the `cost.*` spans `runCampaign` already emits per cell.
+ *
+ * The record is built from the substrate's own loop result + the per-call
+ * `RunRecord`s the worker emitted — no new measurement, no recomputation that
+ * could drift from what the gate actually saw.
+ */
+/** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
+ *  their worktree+base identity since the content lives in git. Distinct from
+ *  `surfaceHash` (16-char content fingerprint used as a loop identity key);
+ *  this is the byte-identical-verifiable content hash the provenance record +
+ *  `RunRecord.promptHash` carry. */
+declare function surfaceContentHash(surface: MutableSurface): string;
+interface LoopProvenanceCandidate {
+    /** Generation index this candidate was proposed in. */
+    generation: number;
+    /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
+    surfaceHash: string;
+    /** Full sha256 content hash — byte-identical-verifiable. */
+    contentHash: string;
+    /** Driver label, when the driver returned a `ProposedCandidate`. */
+    label?: string;
+    /** Driver rationale — the "because Z". When the driver returned a bare
+     *  surface (blind mutator) this is absent. */
+    rationale?: string;
+    /** Mean composite this candidate scored on the search split. */
+    composite: number;
+    /** Whether this candidate was promoted out of its generation. */
+    promoted: boolean;
+}
+interface LoopProvenanceBackend {
+    /** `assertRealBackend`-grade verdict over the worker call records. */
+    verdict: 'real' | 'mixed' | 'stub';
+    /** Number of worker LLM calls captured (the audit's "worker call count"). */
+    workerCallCount: number;
+    /** Distinct model ids observed across worker calls. */
+    models: string[];
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    totalCostUsd: number;
+}
+/**
+ * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
+ * ADDS the rationale + the explicit baseline→candidate diff (both omitted from
+ * the bare hosted event) + backend provenance.
+ */
+interface LoopProvenanceRecord {
+    schema: 'tangle.loop-provenance.v1';
+    runId: string;
+    runDir: string;
+    timestamp: string;
+    /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
+    baselineContentHash: string;
+    winnerContentHash: string;
+    /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
+    winnerLabel?: string;
+    winnerRationale?: string;
+    /** The explicit baseline→winner unified diff the gate decided on. */
+    diff: string;
+    /** Every candidate across every generation, each carrying its rationale. */
+    candidates: LoopProvenanceCandidate[];
+    /** The gate verdict — decision + reasons + contributing gates + delta. */
+    gate: {
+        decision: GateDecision;
+        reasons: string[];
+        delta?: number;
+        contributingGates: Array<{
+            name: string;
+            passed: boolean;
+        }>;
+    };
+    /** baseline-on-holdout composite mean. */
+    baselineHoldoutComposite: number;
+    /** winner-on-holdout composite mean. */
+    winnerHoldoutComposite: number;
+    /** winnerHoldoutComposite - baselineHoldoutComposite — RECOMPUTABLE from this record. */
+    heldOutLift: number;
+    /** Backend provenance: stub-vs-real verdict + worker call count + models. */
+    backend: LoopProvenanceBackend;
+    totalCostUsd: number;
+    totalDurationMs: number;
+}
+interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
+    runId: string;
+    runDir: string;
+    timestamp: string;
+    baselineSurface: MutableSurface;
+    winnerSurface: MutableSurface;
+    winnerLabel?: string;
+    winnerRationale?: string;
+    diff: string;
+    /** Per-generation candidate records straight off the loop result. */
+    generations: Array<{
+        generationIndex: number;
+        candidates: Array<{
+            surfaceHash: string;
+            composite: number;
+            label?: string;
+            rationale?: string;
+        }>;
+        promoted: string[];
+        /** Surfaces measured this generation, keyed positionally to candidates so
+         *  the content hash can be computed from the real surface text. */
+        surfaces: Array<{
+            surfaceHash: string;
+            surface: MutableSurface;
+        }>;
+    }>;
+    gate: GateResult;
+    baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
+    winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
+    /** Worker call records — the source for backend provenance. */
+    workerRecords: ReadonlyArray<RunRecord>;
+    totalCostUsd: number;
+    totalDurationMs: number;
+}
+/** Build the durable provenance record from a completed loop result. */
+declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
+/**
+ * Build the loop's OTLP-ingestable spans from a provenance record. One root
+ * span per loop (`tangle.runId`), one span per generation, one span per
+ * candidate (carrying its surfaceHash + label), and one span for the gate
+ * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
+ * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
+ * reads, so the hosted collector reconstructs the full tree.
+ *
+ * Times are synthesized monotonically off a single base so the span tree is
+ * orderable; the substrate does not retain per-candidate wall-clock starts.
+ */
+declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
+    baseTimeMs?: number;
+}): TraceSpanEvent[];
+/** Canonical durable paths under the run dir. */
+declare function provenanceRecordPath(runDir: string): string;
+declare function provenanceSpansPath(runDir: string): string;
+interface EmitLoopProvenanceResult {
+    record: LoopProvenanceRecord;
+    spans: TraceSpanEvent[];
+    /** Absolute paths the record + spans were written to, when storage persists. */
+    recordPath: string;
+    spansPath: string;
+}
+interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
+    /** Storage the record + spans are written through. */
+    storage: CampaignStorage;
+    /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
+     *  endpoint so the collector receives the full loop, not just `cost.*`. */
+    hostedClient?: HostedClient;
+}
+/**
+ * Build the provenance record + OTel spans and persist them durably under the
+ * run dir (and ship spans to a hosted collector when one is wired). Returns
+ * both artifacts so the caller can assert on / re-derive from them.
+ *
+ * Fail-loud: the durable write throws on storage failure (a swallowed write is
+ * exactly the "emitted but lost" failure this closes). The hosted span ship is
+ * the one best-effort leg — its failure is logged, not thrown, so an offline
+ * collector never fails the loop (the durable artifact is the source of truth).
+ */
+declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
+export { type BuildLoopProvenanceArgs as B, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type HeldOutGateOptions as H, type LoopProvenanceRecord as L, type RunEvalOptions as R, type EmitLoopProvenanceArgs as a, type EmitLoopProvenanceResult as b, composeGate as c, defaultProductionGate as d, evolutionaryDriver as e, type LoopProvenanceBackend as f, type LoopProvenanceCandidate as g, heldOutGate as h, buildLoopProvenanceRecord as i, emitLoopProvenance as j, provenanceSpansPath as k, loopProvenanceSpans as l, provenanceRecordPath as p, runEval as r, surfaceContentHash as s };

package/dist/{registry-DPly4_hZ.d.ts → registry-BzAEvqAt.d.ts} RENAMED Viewed

@@ -454,4 +454,4 @@ declare class AnalystRegistry {
     private routeInput;
 }
-export { AnalystRegistry as A, type BudgetPolicy as B, type ChatCallOpts as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type AnalystHooks as f, type AnalystInputKind as g, type AnalystRegistryOptions as h, type AnalystRequirements as i, type AnalystRunEvent as j, type AnalystRunInputs as k, type AnalystRunResult as l, type AnalystRunSummary as m, type ChatClient as n, type ChatRequest as o, type ChatResponse as p, type ChatTransport as q, type CliBridgeTransportOpts as r, type CreateChatClientOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };
+export { AnalystRegistry as A, type BudgetPolicy as B, type ChatRequest as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type CreateChatClientOpts as f, type AnalystHooks as g, type AnalystInputKind as h, type AnalystRegistryOptions as i, type AnalystRequirements as j, type AnalystRunEvent as k, type AnalystRunInputs as l, type AnalystRunResult as m, type AnalystRunSummary as n, type ChatCallOpts as o, type ChatClient as p, type ChatResponse as q, type ChatTransport as r, type CliBridgeTransportOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };

package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} RENAMED Viewed

@@ -1,11 +1,10 @@
 import {
   runCampaign
-} from "./chunk-7TPYV2ER.js";
-import "./chunk-E22YUOAL.js";
+} from "./chunk-6XQIEUQ2.js";
 import "./chunk-ITBRCT73.js";
 import "./chunk-3BFEG2F6.js";
 import "./chunk-PZ5AY32C.js";
 export {
   runCampaign
 };
-//# sourceMappingURL=run-campaign-5J3ED2UJ.js.map
+//# sourceMappingURL=run-campaign-BVY3RGAZ.js.map