npm - @tangle-network/agent-eval - Versions diffs - 0.23.1 → 0.24.0 - Mend

@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148) hide show

package/CHANGELOG.md +80 -0
package/README.md +141 -79
package/dist/baseline-4R5deP0N.d.ts +108 -0
package/dist/benchmarks/index.d.ts +3 -2
package/dist/benchmarks/index.js +1 -1
package/dist/builder-eval/index.d.ts +249 -0
package/dist/builder-eval/index.js +391 -0
package/dist/builder-eval/index.js.map +1 -0
package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
package/dist/chunk-2A5XJB43.js.map +1 -0
package/dist/chunk-47X6LRCE.js +76 -0
package/dist/chunk-47X6LRCE.js.map +1 -0
package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
package/dist/chunk-4F5DQN55.js.map +1 -0
package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
package/dist/chunk-4S4BM3QQ.js.map +1 -0
package/dist/chunk-5BKGXME7.js +65 -0
package/dist/chunk-5BKGXME7.js.map +1 -0
package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
package/dist/chunk-6QDKWHLS.js.map +1 -0
package/dist/chunk-I4MBDTY5.js +272 -0
package/dist/chunk-I4MBDTY5.js.map +1 -0
package/dist/chunk-K2TPS5LB.js +569 -0
package/dist/chunk-K2TPS5LB.js.map +1 -0
package/dist/chunk-KKHDIONI.js +414 -0
package/dist/chunk-KKHDIONI.js.map +1 -0
package/dist/chunk-KMPRBJK4.js +74 -0
package/dist/chunk-KMPRBJK4.js.map +1 -0
package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
package/dist/chunk-KTGTIOFD.js.map +1 -0
package/dist/chunk-LSH4MMOZ.js +838 -0
package/dist/chunk-LSH4MMOZ.js.map +1 -0
package/dist/chunk-NG236HPC.js +57 -0
package/dist/chunk-NG236HPC.js.map +1 -0
package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
package/dist/chunk-NLMNWKVM.js.map +1 -0
package/dist/chunk-NU65VQ7M.js +99 -0
package/dist/chunk-NU65VQ7M.js.map +1 -0
package/dist/chunk-OHEPNJQN.js +554 -0
package/dist/chunk-OHEPNJQN.js.map +1 -0
package/dist/chunk-OWLAAMME.js +250 -0
package/dist/chunk-OWLAAMME.js.map +1 -0
package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
package/dist/chunk-PC4UYEBM.js.map +1 -0
package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
package/dist/chunk-RAF443UI.js.map +1 -0
package/dist/chunk-RZTMDUO7.js +49 -0
package/dist/chunk-RZTMDUO7.js.map +1 -0
package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
package/dist/chunk-SESZDQPX.js.map +1 -0
package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
package/dist/chunk-SY6WAAAD.js.map +1 -0
package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
package/dist/chunk-TVVP3ZZQ.js.map +1 -0
package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
package/dist/chunk-VRJVTXRV.js.map +1 -0
package/dist/chunk-WWYCWKUM.js +196 -0
package/dist/chunk-WWYCWKUM.js.map +1 -0
package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
package/dist/chunk-YRZ4M5GS.js.map +1 -0
package/dist/chunk-ZN274SWR.js +613 -0
package/dist/chunk-ZN274SWR.js.map +1 -0
package/dist/cli.js +10 -6
package/dist/cli.js.map +1 -1
package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
package/dist/control.d.ts +8 -6
package/dist/control.js +10 -7
package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
package/dist/errors-BZ9sTdz7.d.ts +70 -0
package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
package/dist/governance/index.d.ts +5 -0
package/dist/governance/index.js +18 -0
package/dist/governance/index.js.map +1 -0
package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
package/dist/index-Oj9fAPPN.d.ts +270 -0
package/dist/index.d.ts +1866 -3151
package/dist/index.js +5457 -7809
package/dist/index.js.map +1 -1
package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
package/dist/knowledge/index.d.ts +102 -0
package/dist/knowledge/index.js +18 -0
package/dist/knowledge/index.js.map +1 -0
package/dist/meta-eval/index.d.ts +99 -0
package/dist/meta-eval/index.js +324 -0
package/dist/meta-eval/index.js.map +1 -0
package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +11 -8
package/dist/optimization.js +11 -9
package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
package/dist/pipelines/index.d.ts +172 -0
package/dist/pipelines/index.js +409 -0
package/dist/pipelines/index.js.map +1 -0
package/dist/prm/index.d.ts +99 -0
package/dist/prm/index.js +222 -0
package/dist/prm/index.js.map +1 -0
package/dist/query-DODUYdPg.d.ts +30 -0
package/dist/release-report-TDPn1cxq.d.ts +292 -0
package/dist/replay-BL96gCEP.d.ts +226 -0
package/dist/reporting.d.ts +10 -295
package/dist/reporting.js +10 -6
package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
package/dist/rl.d.ts +1762 -8
package/dist/rl.js +2035 -58
package/dist/rl.js.map +1 -1
package/dist/rubric-D5tjHNJQ.d.ts +72 -0
package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
package/dist/sequential-Dgz1n51-.d.ts +139 -0
package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
package/dist/telemetry/file.js +4 -1
package/dist/telemetry/file.js.map +1 -1
package/dist/telemetry/index.js +57 -57
package/dist/telemetry/index.js.map +1 -1
package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
package/dist/traces.d.ts +142 -387
package/dist/traces.js +1302 -40
package/dist/traces.js.map +1 -1
package/dist/trajectory-CnoBo-JY.d.ts +32 -0
package/dist/wire/index.d.ts +22 -22
package/dist/wire/index.js +4 -3
package/package.json +44 -18
package/dist/chunk-42I2QC2L.js.map +0 -1
package/dist/chunk-5IIQKMD5.js.map +0 -1
package/dist/chunk-6KQG5HAH.js.map +0 -1
package/dist/chunk-6M774GY6.js.map +0 -1
package/dist/chunk-7EAUOUQS.js.map +0 -1
package/dist/chunk-AXHNWLIX.js.map +0 -1
package/dist/chunk-EXGR4XEM.js.map +0 -1
package/dist/chunk-IOXMGMHQ.js.map +0 -1
package/dist/chunk-KAO3Q65R.js.map +0 -1
package/dist/chunk-LZKIOBG2.js +0 -2026
package/dist/chunk-LZKIOBG2.js.map +0 -1
package/dist/chunk-QBW3YBTR.js.map +0 -1
package/dist/chunk-QUKKGHTZ.js.map +0 -1
package/dist/chunk-SQQLHODJ.js.map +0 -1
package/dist/chunk-V5QSWN7L.js +0 -1310
package/dist/chunk-V5QSWN7L.js.map +0 -1
package/dist/chunk-VQQSPGSM.js.map +0 -1
package/dist/chunk-XPHOZPOM.js +0 -1947
package/dist/chunk-XPHOZPOM.js.map +0 -1
package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
package/dist/index-ekBXweiQ.d.ts +0 -1894
package/dist/sequential-DgU2mFsE.d.ts +0 -304

package/dist/sequential-DgU2mFsE.d.ts DELETED Viewed

@@ -1,304 +0,0 @@
-import { R as RunRecord } from './run-record-DNiOMBrZ.js';
-/**
- * OutcomeStore — deployment outcomes attached to Run IDs.
- *
- * Outcomes arrive asynchronously from production telemetry after the
- * eval run completed: user ratings, retention flags, conversion events,
- * revenue, support-ticket rate, anything a product team can measure.
- * The store is a peer to TraceStore — separate lifecycle, same runId
- * foreign key.
- *
- * The whole point of this module is to make the meta-eval correlation
- * question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
- */
-interface DeploymentOutcome {
-    runId: string;
-    capturedAt: number;
-    /** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
-    metrics: Record<string, number>;
-    /** Dimensions for stratified analysis — cohort, region, user_segment. */
-    labels?: Record<string, string>;
-    /** Free-form provenance (source system, pipeline version). */
-    source?: string;
-}
-interface OutcomeFilter {
-    runIds?: string[];
-    since?: number;
-    until?: number;
-    label?: {
-        key: string;
-        value: string;
-    };
-    source?: string;
-}
-interface OutcomeStore {
-    append(outcome: DeploymentOutcome): Promise<void>;
-    /** All outcomes attached to this run (a single run can have many — multiple
-     *  capture windows over deployment time). */
-    forRun(runId: string): Promise<DeploymentOutcome[]>;
-    list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
-}
-declare class InMemoryOutcomeStore implements OutcomeStore {
-    private items;
-    append(outcome: DeploymentOutcome): Promise<void>;
-    forRun(runId: string): Promise<DeploymentOutcome[]>;
-    list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
-}
-interface FileSystemOutcomeStoreOptions {
-    dir: string;
-    maxBytes?: number;
-}
-declare class FileSystemOutcomeStore implements OutcomeStore {
-    private dir;
-    private maxBytes;
-    private memo?;
-    private loaded;
-    constructor(options: FileSystemOutcomeStoreOptions);
-    private ensureDir;
-    append(outcome: DeploymentOutcome): Promise<void>;
-    private load;
-    forRun(runId: string): Promise<DeploymentOutcome[]>;
-    list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
-}
-/**
- * Rubric predictive validity — does our eval rubric predict deployment
- * outcomes?
- *
- * `correlationStudy` (already in this package) joins a `TraceStore` to an
- * `OutcomeStore` and computes Pearson + Spearman + bootstrap CI for each
- * (eval-metric, outcome-metric) pair. That answers "does X correlate with
- * Y at all." `rubricPredictiveValidity` is the campaign-shaped wrapper
- * around it: take a sequence of `RunRecord`s (the canonical campaign
- * artifact) and an `OutcomeStore`, join on `runId`, return a
- * ranked verdict on every rubric whose dimension scores were captured in
- * `outcome.raw`.
- *
- * The point — quoting the methodology doc — is that **without this loop
- * every rubric is faith-based**. Once it's wired, you know which rubrics
- * have earned their promotion power and which ones are decoration.
- *
- *   const validity = await rubricPredictiveValidity({
- *     runs: lastQuarter,
- *     outcomes: shipFlagOutcomeStore,
- *     outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
- *     rubrics: ['anti_slop', 'semantic_concept', 'tool_recovery'],
- *   })
- *   for (const r of validity.ranked) {
- *     console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)}`)
- *   }
- *
- * The function is intentionally read-only. Use the verdict to deprecate
- * decorative rubrics, re-weight composite scores, or trigger a
- * recalibration sweep when predictive validity drops below a threshold.
- */
-interface RubricPredictiveValidityInput {
-    /**
-     * Canonical campaign output. Each record's `outcome.raw[<rubricId>]`
-     * provides the eval score; missing keys are silently skipped per pair.
-     */
-    runs: RunRecord[];
-    outcomes: OutcomeStore;
-    /**
-     * Outcome metric names to evaluate against. Each must appear in at
-     * least one `DeploymentOutcome.metrics` keyspace; pairs with too few
-     * joined samples are excluded from the result.
-     */
-    outcomeMetrics: string[];
-    /**
-     * Rubric ids to evaluate. Must appear as keys in `RunRecord.outcome.raw`.
-     * If omitted, every numeric key in `outcome.raw` across the run set is
-     * treated as a rubric.
-     */
-    rubrics?: string[];
-    /** Minimum joined-sample count before a pair is reported. Default 8. */
-    minSamples?: number;
-    /** Bootstrap resamples for CI. Default 500. */
-    bootstrapResamples?: number;
-    /** Random seed for the bootstrap (mulberry32). Default unset (Math.random). */
-    seed?: number;
-    /**
-     * Reduction when multiple outcomes attach to one runId. Default `'latest'`
-     * (most recently captured).
-     */
-    reduction?: 'latest' | 'mean' | 'max';
-}
-interface RubricOutcomePair {
-    rubric: string;
-    outcome: string;
-    n: number;
-    pearson: number;
-    spearman: number;
-    ci95: {
-        low: number;
-        high: number;
-    };
-    /**
-     * Verdict bucket. `load_bearing` ≥ 0.7, `informative` ≥ 0.4,
-     * `decorative` < 0.4 in absolute correlation. A negative correlation
-     * with a desired outcome is also `decorative` — actively misleading
-     * is worse than uninformative.
-     */
-    verdict: 'load_bearing' | 'informative' | 'decorative';
-}
-interface RubricRanking {
-    rubric: string;
-    /** Outcome metric this rubric correlated best with. */
-    bestOutcome: string;
-    spearman: number;
-    pearson: number;
-    n: number;
-    verdict: RubricOutcomePair['verdict'];
-}
-interface RubricPredictiveValidityReport {
-    pairs: RubricOutcomePair[];
-    /** Per-rubric best pair, sorted descending by |spearman|. */
-    ranked: RubricRanking[];
-    joinedSamples: number;
-    skippedRuns: number;
-    /** Rubrics that were declared but never produced a usable score. */
-    rubricsWithoutData: string[];
-}
-declare function rubricPredictiveValidity(input: RubricPredictiveValidityInput): Promise<RubricPredictiveValidityReport>;
-/**
- * Always-valid sequential evaluation.
- *
- * `researchReport` (0.21+) assumes a single pre-specified analysis. Real
- * consumers run campaigns weekly / nightly / per-PR; each new run silently
- * inflates the false-discovery rate, because the BH-FDR guarantee was for
- * the *first* look, not the 47th. Without time-uniform inference,
- * launch-decision teams either (a) don't peek, which forfeits the cost
- * advantage of stop-when-decisive, or (b) peek and pretend they didn't,
- * which forfeits scientific validity.
- *
- * This module ships **e-value-based confidence sequences** for paired
- * bounded outcomes. The methodology is the predictable plug-in betting
- * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
- * stopping time. Concretely:
- *
- *   For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
- *   a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
- *   plug-in), and the running e-value is
- *
- *     E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
- *
- *   E_t is a non-negative supermartingale under H_0 with E[E_t] ≤ 1, so by
- *   Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
- *   at any time without inflating the type-I error.
- *
- * Combined with `runEvalCampaign`, every consumer running rolling
- * campaigns gains the ability to ship the moment evidence is decisive,
- * stop-early on dead-on-arrival variants, and accumulate evidence across
- * partial runs without spending the FDR budget. No new sweep is wasted.
- *
- * References:
- *   - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
- *     Time-uniform, nonparametric, nonasymptotic confidence sequences.
- *     Annals of Statistics, 49(2), 1055–1080.
- *   - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
- *     random variables by betting. JRSS B, 86(1), 1–27.
- */
-type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
-interface PairedEvalueOptions {
-    /**
-     * Bound on |delta|. Default 1 (matching most score scales). Must satisfy
-     * bound > 0; deltas outside [-bound, bound] are clipped, which is reported
-     * via the `clipped` flag on the returned sequence.
-     */
-    bound?: number;
-    /** Target Type-I error. Default 0.05. */
-    alpha?: number;
-    /**
-     * Region of Practical Equivalence on the *mean* paired delta. When
-     * supplied, the verdict can return `'equivalent'` once the running
-     * confidence sequence on the mean is fully contained in [low, high].
-     */
-    rope?: {
-        low: number;
-        high: number;
-    };
-    /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
-    initialBetShrinkage?: number;
-}
-interface PairedEvalueStep {
-    /** 1-indexed observation count. */
-    t: number;
-    delta: number;
-    /** Running e-value E_t = ∏ (1 + λ_i · D_i). */
-    evalue: number;
-    /** Time-uniform p-value at stopping time t. */
-    pValue: number;
-    /** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
-    csLow: number;
-    csHigh: number;
-    /** Verdict at this stopping time. */
-    decision: SequentialDecision;
-}
-interface PairedEvalueSequence {
-    steps: PairedEvalueStep[];
-    /** The decision at the final step. */
-    finalDecision: SequentialDecision;
-    /** Index (1-based) at which a non-`continue` decision first fired, or null. */
-    decisionFiredAt: number | null;
-    /** True if any deltas were clipped to [-bound, bound]. */
-    clipped: boolean;
-}
-/**
- * Run the paired e-value sequence over an in-order delta stream.
- *
- * Use for *streaming* / interim analyses: pass the deltas you have so
- * far, get the verdict at every prefix length. The decision is
- * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
- * fires, the verdict at later steps remains decisive (the e-value is a
- * non-negative supermartingale; once it crosses the threshold, it's crossed).
- */
-declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
-interface InterimReleaseConfidenceInput {
-    /**
-     * One delta series per candidate (paired deltas vs comparator). Order
-     * within a series is the order the campaigns were run.
-     */
-    deltaSeries: Array<{
-        candidateId: string;
-        deltas: number[];
-    }>;
-    alpha?: number;
-    bound?: number;
-    rope?: {
-        low: number;
-        high: number;
-    };
-}
-interface InterimReleaseConfidence {
-    candidates: Array<{
-        candidateId: string;
-        decision: SequentialDecision;
-        decisionFiredAt: number | null;
-        finalEvalue: number;
-        finalPValue: number;
-        pairs: number;
-        csLow: number;
-        csHigh: number;
-    }>;
-    /**
-     * Campaign-level recommendation: pick the strongest 'promote_now', else
-     * 'continue' if any candidate is still live, else 'reject_now' if every
-     * candidate is dead, else 'equivalent'.
-     */
-    recommendation: {
-        decision: SequentialDecision;
-        candidateId: string | null;
-    };
-}
-/**
- * Run interim sequential analyses across many candidates at once,
- * preserving the time-uniform α guarantee for each candidate's series and
- * synthesising a campaign-level recommendation. Designed to be called on
- * every campaign tick — the recommendation is anytime-valid.
- */
-declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
-export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeFilter as O, type PairedEvalueOptions as P, type RubricOutcomePair as R, type SequentialDecision as S, type OutcomeStore as a, type FileSystemOutcomeStoreOptions as b, type InterimReleaseConfidence as c, type InterimReleaseConfidenceInput as d, type PairedEvalueSequence as e, type PairedEvalueStep as f, type RubricPredictiveValidityInput as g, type RubricPredictiveValidityReport as h, type RubricRanking as i, evaluateInterimReleaseConfidence as j, pairedEvalueSequence as p, rubricPredictiveValidity as r };