npm - @tangle-network/agent-eval - Versions diffs - 0.21.0 → 0.22.0 - Mend

@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +102 -1
package/README.md +4 -0
package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
package/dist/chunk-4W4NCYM2.js.map +1 -0
package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
package/dist/chunk-6M774GY6.js +53 -0
package/dist/chunk-6M774GY6.js.map +1 -0
package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
package/dist/chunk-IOXMGMHQ.js.map +1 -0
package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
package/dist/chunk-QUKKGHTZ.js +121 -0
package/dist/chunk-QUKKGHTZ.js.map +1 -0
package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
package/dist/chunk-UAND2LOT.js +738 -0
package/dist/chunk-UAND2LOT.js.map +1 -0
package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
package/dist/chunk-USHQBPMH.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/index.d.ts +10 -284
package/dist/index.js +39 -19
package/dist/index.js.map +1 -1
package/dist/integrity-K2oVlF57.d.ts +210 -0
package/dist/openapi.json +1 -1
package/dist/optimization-UVDNKaO6.d.ts +574 -0
package/dist/optimization.d.ts +6 -144
package/dist/optimization.js +9 -2
package/dist/reporting-B82RSv9C.d.ts +593 -0
package/dist/reporting.d.ts +2 -2
package/dist/reporting.js +15 -8
package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
package/dist/traces.d.ts +101 -181
package/dist/traces.js +16 -5
package/dist/wire/index.js +3 -3
package/docs/research-report-methodology.md +19 -4
package/docs/wire-protocol.md +1 -1
package/package.json +2 -2
package/dist/chunk-3IX6QTB7.js.map +0 -1
package/dist/chunk-HRZELXCR.js.map +0 -1
package/dist/chunk-KRR4VMH7.js +0 -423
package/dist/chunk-KRR4VMH7.js.map +0 -1
package/dist/chunk-WOK2RTWG.js.map +0 -1
package/dist/reporting-Da2ihlcM.d.ts +0 -672
/package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
/package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0

package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} RENAMED Viewed

@@ -1,4 +1,5 @@
 import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
+import { a as Run, S as Span, f as TraceEvent, F as FailureClass, T as TraceStore } from './store-u47QaJ9G.js';
 /**
  * HeldOutGate — first-class held-out paired-delta promotion gate.
@@ -595,4 +596,383 @@ declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig
 declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
 declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
-export { type ActionableSideInfo as A, type TrialTrace as B, buildReflectionPrompt as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, crowdingDistance as F, type GateDecision as G, HeldOutGate as H, InMemoryTrialCache as I, defaultMultiShotObjectives as J, dominates as K, paretoFrontier as L, type MutateAdapter as M, paretoFrontierWithCrowding as N, type Objective as O, type ParetoResult as P, parseReflectionResponse as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, runMultiShotOptimization as U, type VariantAggregate as V, runPromptEvolution as W, scalarScore as X, trialTraceFromMultiShotTrial as Y, type TrialResult as a, type AsiSeverity as b, type Direction as c, type GateEvidence as d, type GenerationReport as e, type HeldOutGateConfig as f, type HeldOutGateRejectionCode as g, type MultiShotGateConfig as h, type MultiShotGateResult as i, type MultiShotMutateAdapter as j, type MultiShotOptimizationConfig as k, type MultiShotOptimizationResult as l, type MultiShotRun as m, type MultiShotRunInput as n, type MultiShotRunner as o, type MultiShotScore as p, type MultiShotScorer as q, type MultiShotSplit as r, type MultiShotTrace as s, type MultiShotTrialResult as t, type MultiShotVariant as u, type PromptEvolutionConfig as v, type PromptEvolutionEvent as w, type PromptEvolutionResult as x, type ReflectionProposal as y, type ScoreAdapter as z };
+/**
+ * Failure taxonomy — canonical classes + a default classifier.
+ *
+ * Every failed run should end up in a named class. The classifier here
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
+ * the consumer for novel cases and trained into the rule base over time.
+ *
+ * Consumers call `classifyFailure({ run, spans, events })` and persist the
+ * returned `failureClass` as `Run.outcome.failureClass`.
+ */
+interface FailureContext {
+    run: Run;
+    spans: Span[];
+    events: TraceEvent[];
+}
+interface FailureClassification {
+    failureClass: FailureClass;
+    reason: string;
+    triggerSpanId?: string;
+    triggerEventId?: string;
+}
+/** One rule in an ordered rule list — rules are tried in order and the first non-null match wins. */
+interface FailureRule {
+    id: string;
+    match: (ctx: FailureContext) => {
+        failureClass: FailureClass;
+        reason: string;
+        triggerSpanId?: string;
+        triggerEventId?: string;
+    } | null;
+}
+declare const DEFAULT_RULES: FailureRule[];
+/** Classify the failure mode of a run using an ordered rule list. */
+declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
+/**
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
+ *
+ * Each cluster includes: run count, scenarios affected, a representative
+ * error message, and an example run id. NOTE: the proposed mitigation hint
+ * (rule → action table) is not a field of the `FailureCluster` shape.
+ */
+interface FailureCluster {
+    failureClass: FailureClass;
+    /** Tool name when the trigger was a tool span, else undefined. */
+    toolName?: string;
+    /** First 16 chars of argHash — clusters similar args. */
+    argPrefix?: string;
+    /**
+     * Source dimension when the trigger was a judge span (e.g. `'format'`,
+     * `'safety'`, `'correctness'`). Lets cross-template aggregators
+     * group failures by the dimension that fired without overloading
+     * `argPrefix`. Optional — legacy clusters without this field
+     * deserialize cleanly.
+     */
+    dimension?: string;
+    runCount: number;
+    scenarioIds: string[];
+    exampleError?: string;
+    exampleRunId: string;
+}
+interface FailureClusterReport {
+    clusters: FailureCluster[];
+    totalFailures: number;
+    totalRuns: number;
+}
+declare function failureClusterView(store: TraceStore, options?: {
+    rules?: FailureRule[];
+    minClusterSize?: number;
+}): Promise<FailureClusterReport>;
+/**
+ * Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
+ * than replacing it.
+ *
+ * Three artefacts:
+ *
+ *   - `summaryTable`   Markdown table of per-candidate means,
+ *                      95% bootstrap CIs, BH-adjusted Wilcoxon
+ *                      p-values, and Cohen's d versus a
+ *                      comparator candidate.
+ *   - `paretoChart`    Abstract spec for a cost vs quality
+ *                      scatter, with gate decisions overlaid.
+ *                      Returns numbers + labels — caller
+ *                      chooses the plotting library.
+ *   - `gainHistogram`  Per-item paired holdout deltas as a
+ *                      histogram spec (bins + counts + median +
+ *                      CI). Same "data, not images" contract.
+ *
+ * The figure types are PlotSpecs — JSON-friendly, library-agnostic.
+ * They aren't React components and they aren't PNGs; they are
+ * what you'd hand to vega-lite, plotly, matplotlib, or your own
+ * Canvas renderer to draw the actual figure.
+ */
+interface SummaryTableOptions {
+    /** Comparator candidate id. Wilcoxon + Cohen's d are computed
+     *  versus this candidate. Required for paired stats columns. */
+    comparator?: string;
+    /** Which split to read scores from. Default 'holdout'. */
+    split?: 'search' | 'holdout';
+    /** Confidence level for the bootstrap CI on the mean. Default 0.95. */
+    confidence?: number;
+    /** FDR for BH adjustment of the comparison p-values. Default 0.05. */
+    fdr?: number;
+}
+interface SummaryTableRow {
+    candidateId: string;
+    n: number;
+    mean: number;
+    ciLow: number;
+    ciHigh: number;
+    /** BH-adjusted q-value vs comparator. NaN if no comparator. */
+    qValue: number;
+    /** Cohen's d vs comparator. NaN if no comparator. */
+    cohensD: number;
+}
+interface SummaryTable {
+    rows: SummaryTableRow[];
+    comparator: string | null;
+    split: 'search' | 'holdout';
+    /** Pre-rendered markdown — drop into a paper or PR. */
+    markdown: string;
+}
+/**
+ * Table 1 helper. Buckets runs by `candidateId`, computes mean +
+ * bootstrap CI on the chosen split, and (when a comparator is given)
+ * BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
+ */
+declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
+interface ParetoPoint {
+    candidateId: string;
+    /** Mean USD cost per run on the chosen split. */
+    cost: number;
+    /** Mean score on the chosen split. */
+    quality: number;
+    /** Number of runs that informed this point. */
+    n: number;
+    /** Whether this candidate is on the Pareto frontier — high
+     *  quality, low cost, no dominator. */
+    onFrontier: boolean;
+    /** Optional gate verdict for this candidate, if a `GateDecision`
+     *  for it was passed in. */
+    gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
+}
+interface ParetoFigureSpec {
+    kind: 'pareto-cost-quality';
+    split: 'search' | 'holdout';
+    points: ParetoPoint[];
+    axes: {
+        x: 'costUsd';
+        y: 'score';
+    };
+}
+/**
+ * Cost vs quality scatter spec. `gateDecisions` is keyed by
+ * candidate id; if present, every point picks up the gate verdict
+ * for overlay.
+ */
+declare function paretoChart(runs: RunRecord[], opts?: {
+    split?: 'search' | 'holdout';
+    gateDecisions?: Record<string, GateDecision>;
+}): ParetoFigureSpec;
+interface GainDistributionBin {
+    /** Inclusive lower edge. */
+    lo: number;
+    /** Exclusive upper edge (or inclusive if it's the last bin). */
+    hi: number;
+    /** Number of pairs whose delta lands in this bin. */
+    count: number;
+}
+interface GainDistributionFigureSpec {
+    kind: 'gain-distribution';
+    candidateId: string;
+    comparator: string;
+    split: 'search' | 'holdout';
+    /** Number of pairs used. */
+    n: number;
+    bins: GainDistributionBin[];
+    median: number;
+    ci: {
+        low: number;
+        high: number;
+    };
+}
+interface GainDistributionOptions {
+    /** Number of histogram bins. Default 11 (so the centre is exact at 0). */
+    bins?: number;
+    /** Which split to use. Default 'holdout'. */
+    split?: 'search' | 'holdout';
+    /** Confidence level for the CI. Default 0.95. */
+    confidence?: number;
+    /** Bootstrap resamples. Default 2000. */
+    resamples?: number;
+    /** Deterministic seed. */
+    seed?: number;
+}
+/**
+ * Held-out improvement distribution: per-pair delta (candidate −
+ * comparator), histogrammed. Includes the bootstrap CI on the median
+ * delta — same primitive the promotion gate uses.
+ */
+declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
+type ResearchReportDecision = 'promote' | 'hold' | 'reject' | 'equivalent' | 'needs_more_data';
+/**
+ * Hard floor below which a paired comparison is treated as uninformative
+ * regardless of `minPairs`. Mirrors the lower limit on Wilcoxon signed-rank
+ * exact tables; below this the test has no power to separate effect sizes.
+ */
+declare const RESEARCH_REPORT_HARD_PAIR_FLOOR = 6;
+interface ResearchReportOptions {
+    /** Human-readable report title. */
+    title?: string;
+    /** Comparator candidate id. Required for statistical decision guidance. */
+    comparator?: string;
+    /** Which split to use for the primary decision. Default 'holdout'. */
+    split?: 'search' | 'holdout';
+    /** Confidence level used by lower-level report helpers. Default 0.95. */
+    confidence?: number;
+    /** FDR threshold for q-values. Default 0.05. */
+    fdr?: number;
+    /**
+     * Soft floor on paired observations before issuing a directional
+     * promote / reject. Below this we report `needs_more_data` and surface the
+     * minimum detectable effect at the current N. Default 20 — chosen so the
+     * Wilcoxon signed-rank approximation is reasonable and so the paired
+     * bootstrap CI has non-degenerate coverage. Hard floor is enforced at
+     * `RESEARCH_REPORT_HARD_PAIR_FLOOR` (6) regardless of this value.
+     */
+    minPairs?: number;
+    /**
+     * Region of Practical Equivalence on the paired delta. When a candidate's
+     * paired-delta CI is fully contained in `[low, high]`, the decision is
+     * `equivalent` rather than `hold`. Sourced from the domain owner — there is
+     * no statistically-defensible default.
+     */
+    rope?: {
+        low: number;
+        high: number;
+    };
+    /**
+     * Power for the minimum detectable effect (MDE) reported on each candidate.
+     * Default 0.8.
+     */
+    mdePower?: number;
+    /**
+     * Two-sided alpha for the MDE. Default matches `fdr` so the reported MDE
+     * lines up with the test the report actually runs.
+     */
+    mdeAlpha?: number;
+    /** Optional held-out gate decisions keyed by candidate id. */
+    gateDecisions?: Record<string, GateDecision>;
+    /** Optional failure clusters from failureClusterView. */
+    failureClusters?: FailureClusterReport;
+    /** Build gain histograms for these candidates. Defaults to all non-comparator candidates. */
+    candidateIds?: string[];
+    /** Deterministic bootstrap seed passed to gainHistogram and the posterior helper. */
+    seed?: number;
+    /** Report timestamp. Defaults to current time. */
+    generatedAt?: string;
+    /**
+     * Hash of a preregistered protocol (e.g. `signManifest({...}).contentHash`).
+     * Embedded verbatim in the report so the analysis can be cited as the
+     * preregistered one rather than a post-hoc fishing expedition.
+     */
+    preregistrationHash?: string;
+}
+interface ResearchReportRecommendation {
+    decision: ResearchReportDecision;
+    candidateId: string | null;
+    rationale: string[];
+    risks: string[];
+    nextActions: string[];
+}
+interface ResearchReportCandidate {
+    candidateId: string;
+    n: number;
+    mean: number;
+    ciLow: number;
+    ciHigh: number;
+    qValue: number;
+    cohensD: number;
+    meanDeltaVsComparator: number | null;
+    pairedN: number;
+    medianGain: number | null;
+    meanGain: number | null;
+    gainCi: {
+        low: number;
+        high: number;
+    } | null;
+    /**
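+     * Bayesian-bootstrap-style posterior summaries on the paired delta. Computed
+     * from the same resamples that produce the gain CI. `prGreaterThanZero` is
+     * interpretable as "fraction of resamples in which the candidate beats the
+     * comparator on matched pairs"; `prInRope` is the analogous fraction of
+     * resamples whose paired delta falls inside the configured ROPE.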
+     */
+    prGreaterThanZero: number | null;
+    prInRope: number | null;
+    /**
+     * Minimum detectable effect (in score units) at the candidate's paired N,
+     * the configured power, and the configured alpha. Standardised by the
+     * observed paired-delta SD and inverted via `requiredSampleSize`. Reported
+     * for every candidate so a `needs_more_data` verdict is actionable.
+     */
+    mde: number | null;
+    onParetoFrontier: boolean;
+    gate?: ParetoPoint['gate'];
+    decision: ResearchReportDecision;
+    decisionReason: string;
+}
+interface ResearchReportMethodology {
+    /**
+     * Plain-language assumptions the report depends on. Read these first when
+     * deciding whether the verdict is load-bearing for a launch decision.
+     */
+    assumptions: string[];
+    /** Tests and estimators the verdict was computed from. */
+    methods: string[];
+    /** Alternatives the author considered and why this report didn't take them. */
+    alternatives: string[];
+    /** Failure modes — when this report should NOT drive a decision. */
+    whenNotToApply: string[];
+    /** Citations for the methodological choices above. */
+    citations: string[];
+}
+interface ResearchReport {
+    kind: 'agent-eval-research-report';
+    title: string;
+    generatedAt: string;
+    split: 'search' | 'holdout';
+    comparator: string | null;
+    /**
+     * SHA-256 over the canonicalised set of `(runId, candidateId, split)` triples
+     * the report was computed from, plus the comparator and split. Stable across
+     * key insertion order; recomputable by the reader to verify provenance.
+     */
+    runFingerprint: string;
+    preregistrationHash: string | null;
+    rope: {
+        low: number;
+        high: number;
+    } | null;
+    executiveSummary: string[];
+    recommendation: ResearchReportRecommendation;
+    candidates: ResearchReportCandidate[];
+    summary: SummaryTable;
+    charts: {
+        pareto: ParetoFigureSpec;
+        gains: GainDistributionFigureSpec[];
+    };
+    methodology: ResearchReportMethodology;
+    failureClusters?: FailureClusterReport;
+    markdown: string;
+    html: string;
+}
+/**
+ * Executive research report for CPO / AI-lead / launch-review consumption.
+ *
+ * Composes:
+ *   - `summaryTable`         marginal stats with BH-FDR-adjusted q-values
+ *   - `paretoChart`           cost-vs-quality frontier with gate overlay
+ *   - `gainHistogram`         per-candidate paired-delta distribution
+ *   - paired posterior (this file): bootstrap CI on median, Pr(Δ>0),
+ *                              Pr(Δ∈ROPE), MDE at the configured power
+ *
+ * Decisions are made on paired evidence — never on marginal means alone —
+ * and respect any held-out gate decision the caller passes through. The
+ * report embeds a SHA-256 fingerprint of the input run set and, optionally,
+ * the hash of a preregistered protocol so a downstream reader can verify
+ * provenance and that the analysis was the preregistered one.
+ *
+ * Async because the fingerprint uses Web Crypto via `hashJson`; deterministic
+ * for any fixed `runs`, `seed`, and ROPE, provided `generatedAt` is also
+ * pinned (it otherwise defaults to the current time).
+ */
+declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
+export { type ResearchReportOptions as $, type ActionableSideInfo as A, type MultiShotTrace as B, type MultiShotTrialResult as C, DEFAULT_RULES as D, type EvolvableVariant as E, type FailureClassification as F, type GainDistributionBin as G, HeldOutGate as H, InMemoryTrialCache as I, type MultiShotVariant as J, type ParetoFigureSpec as K, type ParetoPoint as L, type MutateAdapter as M, type PromptEvolutionConfig as N, type Objective as O, type ParetoResult as P, type PromptEvolutionEvent as Q, type PromptEvolutionResult as R, RESEARCH_REPORT_HARD_PAIR_FLOOR as S, type TrialCache as T, type ReflectionContext as U, type VariantAggregate as V, type ReflectionProposal as W, type ResearchReport as X, type ResearchReportCandidate as Y, type ResearchReportDecision as Z, type ResearchReportMethodology as _, type TrialResult as a, type ResearchReportRecommendation as a0, type ScenarioAggregate as a1, type ScoreAdapter as a2, type SummaryTable as a3, type SummaryTableOptions as a4, type SummaryTableRow as a5, type TrialTrace as a6, buildReflectionPrompt as a7, classifyFailure as a8, crowdingDistance as a9, defaultMultiShotObjectives as aa, dominates as ab, failureClusterView as ac, gainHistogram as ad, paretoChart as ae, paretoFrontier as af, paretoFrontierWithCrowding as ag, parseReflectionResponse as ah, researchReport as ai, runMultiShotOptimization as aj, runPromptEvolution as ak, scalarScore as al, summaryTable as am, trialTraceFromMultiShotTrial as an, type AsiSeverity as b, DEFAULT_MUTATION_PRIMITIVES as c, type Direction as d, type FailureCluster as e, type FailureClusterReport as f, type FailureContext as g, type FailureRule as h, type GainDistributionFigureSpec as i, type GainDistributionOptions as j, type GateDecision as k, type GateEvidence as l, type GenerationReport as m, type HeldOutGateConfig as n, type HeldOutGateRejectionCode as o, type MultiShotGateConfig as p, type MultiShotGateResult as q, type MultiShotMutateAdapter as r, type 
MultiShotOptimizationConfig as s, type MultiShotOptimizationResult as t, type MultiShotRun as u, type MultiShotRunInput as v, type MultiShotRunner as w, type MultiShotScore as x, type MultiShotScorer as y, type MultiShotSplit as z };