@tangle-network/agent-eval 0.20.12 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +177 -0
  2. package/README.md +43 -1
  3. package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  6. package/dist/chunk-5IIQKMD5.js.map +1 -0
  7. package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
  8. package/dist/chunk-6M774GY6.js +53 -0
  9. package/dist/chunk-6M774GY6.js.map +1 -0
  10. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  11. package/dist/chunk-IOXMGMHQ.js +1226 -0
  12. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  13. package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
  14. package/dist/chunk-KAO3Q65R.js.map +1 -0
  15. package/dist/chunk-QUKKGHTZ.js +121 -0
  16. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  17. package/dist/chunk-SQQLHODJ.js +163 -0
  18. package/dist/chunk-SQQLHODJ.js.map +1 -0
  19. package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
  20. package/dist/chunk-UAND2LOT.js.map +1 -0
  21. package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
  22. package/dist/chunk-USHQBPMH.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  26. package/dist/control.d.ts +4 -3
  27. package/dist/control.js +2 -2
  28. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  29. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  30. package/dist/index.d.ts +16 -302
  31. package/dist/index.js +70 -62
  32. package/dist/index.js.map +1 -1
  33. package/dist/integrity-K2oVlF57.d.ts +210 -0
  34. package/dist/openapi.json +1 -1
  35. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  36. package/dist/optimization.d.ts +7 -144
  37. package/dist/optimization.js +9 -2
  38. package/dist/reporting-B82RSv9C.d.ts +593 -0
  39. package/dist/reporting.d.ts +5 -426
  40. package/dist/reporting.js +17 -6
  41. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  42. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  43. package/dist/traces.d.ts +179 -3
  44. package/dist/traces.js +35 -4
  45. package/dist/wire/index.js +3 -2
  46. package/docs/research-report-methodology.md +170 -0
  47. package/docs/wire-protocol.md +1 -1
  48. package/package.json +11 -13
  49. package/dist/chunk-75MCTH7P.js.map +0 -1
  50. package/dist/chunk-HKYRWNHV.js.map +0 -1
  51. package/dist/chunk-IKFVX537.js.map +0 -1
  52. package/dist/chunk-KWUAAIHR.js.map +0 -1
  53. package/dist/chunk-ODFINDLQ.js +0 -413
  54. package/dist/chunk-ODFINDLQ.js.map +0 -1
  55. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  56. /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
  57. /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
@@ -1,4 +1,5 @@
1
1
  import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
2
+ import { a as Run, S as Span, f as TraceEvent, F as FailureClass, T as TraceStore } from './store-u47QaJ9G.js';
2
3
 
3
4
  /**
4
5
  * HeldOutGate — first-class held-out paired-delta promotion gate.
@@ -595,4 +596,383 @@ declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig
595
596
  declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
596
597
  declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
597
598
 
598
- export { type ActionableSideInfo as A, type TrialTrace as B, buildReflectionPrompt as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, crowdingDistance as F, type GateDecision as G, HeldOutGate as H, InMemoryTrialCache as I, defaultMultiShotObjectives as J, dominates as K, paretoFrontier as L, type MutateAdapter as M, paretoFrontierWithCrowding as N, type Objective as O, type ParetoResult as P, parseReflectionResponse as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, runMultiShotOptimization as U, type VariantAggregate as V, runPromptEvolution as W, scalarScore as X, trialTraceFromMultiShotTrial as Y, type TrialResult as a, type AsiSeverity as b, type Direction as c, type GateEvidence as d, type GenerationReport as e, type HeldOutGateConfig as f, type HeldOutGateRejectionCode as g, type MultiShotGateConfig as h, type MultiShotGateResult as i, type MultiShotMutateAdapter as j, type MultiShotOptimizationConfig as k, type MultiShotOptimizationResult as l, type MultiShotRun as m, type MultiShotRunInput as n, type MultiShotRunner as o, type MultiShotScore as p, type MultiShotScorer as q, type MultiShotSplit as r, type MultiShotTrace as s, type MultiShotTrialResult as t, type MultiShotVariant as u, type PromptEvolutionConfig as v, type PromptEvolutionEvent as w, type PromptEvolutionResult as x, type ReflectionProposal as y, type ScoreAdapter as z };
599
+ /**
600
+ * Failure taxonomy — canonical classes + a default classifier.
601
+ *
602
+ * Every failed run should end up in a named class. The classifier here
603
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
604
+ * the consumer for novel cases and trained into the rule base over time.
605
+ *
606
+ * Consumers call `classifyFailure(run, spans, events)` and persist the
607
+ * returned class as `Run.outcome.failureClass`.
608
+ */
609
+
610
+ interface FailureContext {
611
+ run: Run;
612
+ spans: Span[];
613
+ events: TraceEvent[];
614
+ }
615
+ interface FailureClassification {
616
+ failureClass: FailureClass;
617
+ reason: string;
618
+ triggerSpanId?: string;
619
+ triggerEventId?: string;
620
+ }
621
+ /** Ordered rules — first match wins. */
622
+ interface FailureRule {
623
+ id: string;
624
+ match: (ctx: FailureContext) => {
625
+ failureClass: FailureClass;
626
+ reason: string;
627
+ triggerSpanId?: string;
628
+ triggerEventId?: string;
629
+ } | null;
630
+ }
631
+ declare const DEFAULT_RULES: FailureRule[];
632
+ /** Classify the failure mode of a run using an ordered rule list. */
633
+ declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
634
+
635
+ /**
636
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
637
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
638
+ *
639
+ * Each cluster includes: N runs, scenarios affected, representative
640
+ * error message, a proposed mitigation hint (rule → action table).
641
+ */
642
+
643
+ interface FailureCluster {
644
+ failureClass: FailureClass;
645
+ /** Tool name when the trigger was a tool span, else undefined. */
646
+ toolName?: string;
647
+ /** First 16 chars of argHash — clusters similar args. */
648
+ argPrefix?: string;
649
+ /**
650
+ * Source dimension when the trigger was a judge span (e.g. `'format'`,
651
+ * `'safety'`, `'correctness'`). Lets cross-template aggregators
652
+ * group failures by the dimension that fired without overloading
653
+ * `argPrefix`. Optional — legacy clusters without this field
654
+ * deserialize cleanly.
655
+ */
656
+ dimension?: string;
657
+ runCount: number;
658
+ scenarioIds: string[];
659
+ exampleError?: string;
660
+ exampleRunId: string;
661
+ }
662
+ interface FailureClusterReport {
663
+ clusters: FailureCluster[];
664
+ totalFailures: number;
665
+ totalRuns: number;
666
+ }
667
+ declare function failureClusterView(store: TraceStore, options?: {
668
+ rules?: FailureRule[];
669
+ minClusterSize?: number;
670
+ }): Promise<FailureClusterReport>;
671
+
672
+ /**
673
+ * Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
674
+ * than replacing it.
675
+ *
676
+ * Three artefacts:
677
+ *
678
+ * - `summaryTable` Markdown table of per-candidate means,
679
+ * 95% bootstrap CIs, BH-adjusted Wilcoxon
680
+ * p-values, and Cohen's d versus a
681
+ * comparator candidate.
682
+ * - `paretoChart` Abstract spec for a cost vs quality
683
+ * scatter, with gate decisions overlaid.
684
+ * Returns numbers + labels — caller
685
+ * chooses the plotting library.
686
+ * - `gainHistogram`
687
+ * Per-item paired holdout deltas as a
688
+ * histogram spec (bins + counts + median +
689
+ * CI). Same "data, not images" contract.
690
+ *
691
+ * The figure types are PlotSpecs — JSON-friendly, library-agnostic.
692
+ * They aren't React components and they aren't PNGs; they are
693
+ * what you'd hand to vega-lite, plotly, matplotlib, or your own
694
+ * Canvas renderer to draw the actual figure.
695
+ */
696
+
697
+ interface SummaryTableOptions {
698
+ /** Comparator candidate id. Wilcoxon + Cohen's d are computed
699
+ * versus this candidate. Required for paired stats columns. */
700
+ comparator?: string;
701
+ /** Which split to read scores from. Default 'holdout'. */
702
+ split?: 'search' | 'holdout';
703
+ /** Confidence level for the bootstrap CI on the mean. Default 0.95. */
704
+ confidence?: number;
705
+ /** FDR for BH adjustment of the comparison p-values. Default 0.05. */
706
+ fdr?: number;
707
+ }
708
+ interface SummaryTableRow {
709
+ candidateId: string;
710
+ n: number;
711
+ mean: number;
712
+ ciLow: number;
713
+ ciHigh: number;
714
+ /** BH-adjusted q-value vs comparator. NaN if no comparator. */
715
+ qValue: number;
716
+ /** Cohen's d vs comparator. NaN if no comparator. */
717
+ cohensD: number;
718
+ }
719
+ interface SummaryTable {
720
+ rows: SummaryTableRow[];
721
+ comparator: string | null;
722
+ split: 'search' | 'holdout';
723
+ /** Pre-rendered markdown — drop into a paper or PR. */
724
+ markdown: string;
725
+ }
726
+ /**
727
+ * Table 1 helper. Buckets runs by `candidateId`, computes mean +
728
+ * bootstrap CI on the chosen split, and (when a comparator is given)
729
+ * BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
730
+ */
731
+ declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
732
+ interface ParetoPoint {
733
+ candidateId: string;
734
+ /** Mean USD cost per run on the chosen split. */
735
+ cost: number;
736
+ /** Mean score on the chosen split. */
737
+ quality: number;
738
+ /** Number of runs that informed this point. */
739
+ n: number;
740
+ /** Whether this candidate is on the Pareto frontier — high
741
+ * quality, low cost, no dominator. */
742
+ onFrontier: boolean;
743
+ /** Optional gate verdict for this candidate, if a `GateDecision`
744
+ * for it was passed in. */
745
+ gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
746
+ }
747
+ interface ParetoFigureSpec {
748
+ kind: 'pareto-cost-quality';
749
+ split: 'search' | 'holdout';
750
+ points: ParetoPoint[];
751
+ axes: {
752
+ x: 'costUsd';
753
+ y: 'score';
754
+ };
755
+ }
756
+ /**
757
+ * Cost vs quality scatter spec. `gateDecisions` is keyed by
758
+ * candidate id; if present, every point picks up the gate verdict
759
+ * for overlay.
760
+ */
761
+ declare function paretoChart(runs: RunRecord[], opts?: {
762
+ split?: 'search' | 'holdout';
763
+ gateDecisions?: Record<string, GateDecision>;
764
+ }): ParetoFigureSpec;
765
+ interface GainDistributionBin {
766
+ /** Inclusive lower edge. */
767
+ lo: number;
768
+ /** Exclusive upper edge (or inclusive if it's the last bin). */
769
+ hi: number;
770
+ /** Number of pairs whose delta lands in this bin. */
771
+ count: number;
772
+ }
773
+ interface GainDistributionFigureSpec {
774
+ kind: 'gain-distribution';
775
+ candidateId: string;
776
+ comparator: string;
777
+ split: 'search' | 'holdout';
778
+ /** Number of pairs used. */
779
+ n: number;
780
+ bins: GainDistributionBin[];
781
+ median: number;
782
+ ci: {
783
+ low: number;
784
+ high: number;
785
+ };
786
+ }
787
+ interface GainDistributionOptions {
788
+ /** Number of histogram bins. Default 11 (so the centre is exact at 0). */
789
+ bins?: number;
790
+ /** Which split to use. Default 'holdout'. */
791
+ split?: 'search' | 'holdout';
792
+ /** Confidence level for the CI. Default 0.95. */
793
+ confidence?: number;
794
+ /** Bootstrap resamples. Default 2000. */
795
+ resamples?: number;
796
+ /** Deterministic seed. */
797
+ seed?: number;
798
+ }
799
+ /**
800
+ * Held-out improvement distribution: per-pair delta (candidate −
801
+ * comparator), histogrammed. Includes the bootstrap CI on the median
802
+ * delta — same primitive the promotion gate uses.
803
+ */
804
+ declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
805
+ type ResearchReportDecision = 'promote' | 'hold' | 'reject' | 'equivalent' | 'needs_more_data';
806
+ /**
807
+ * Hard floor below which a paired comparison is treated as uninformative
808
+ * regardless of `minPairs`. Mirrors the lower limit on Wilcoxon signed-rank
809
+ * exact tables; below this the test has no power to separate effect sizes.
810
+ */
811
+ declare const RESEARCH_REPORT_HARD_PAIR_FLOOR = 6;
812
+ interface ResearchReportOptions {
813
+ /** Human-readable report title. */
814
+ title?: string;
815
+ /** Comparator candidate id. Required for statistical decision guidance. */
816
+ comparator?: string;
817
+ /** Which split to use for the primary decision. Default 'holdout'. */
818
+ split?: 'search' | 'holdout';
819
+ /** Confidence level used by lower-level report helpers. Default 0.95. */
820
+ confidence?: number;
821
+ /** FDR threshold for q-values. Default 0.05. */
822
+ fdr?: number;
823
+ /**
824
+ * Soft floor on paired observations before issuing a directional
825
+ * promote / reject. Below this we report `needs_more_data` and surface the
826
+ * minimum detectable effect at the current N. Default 20 — chosen so the
827
+ * Wilcoxon signed-rank approximation is reasonable and so the paired
828
+ * bootstrap CI has non-degenerate coverage. Hard floor is enforced at
829
+ * `RESEARCH_REPORT_HARD_PAIR_FLOOR` (6) regardless of this value.
830
+ */
831
+ minPairs?: number;
832
+ /**
833
+ * Region of Practical Equivalence on the paired delta. When a candidate's
834
+ * paired-delta CI is fully contained in `[low, high]`, the decision is
835
+ * `equivalent` rather than `hold`. Sourced from the domain owner — there is
836
+ * no statistically-defensible default.
837
+ */
838
+ rope?: {
839
+ low: number;
840
+ high: number;
841
+ };
842
+ /**
843
+ * Power for the minimum detectable effect (MDE) reported on each candidate.
844
+ * Default 0.8.
845
+ */
846
+ mdePower?: number;
847
+ /**
848
+ * Two-sided alpha for the MDE. Default matches `fdr` so the reported MDE
849
+ * lines up with the test the report actually runs.
850
+ */
851
+ mdeAlpha?: number;
852
+ /** Optional held-out gate decisions keyed by candidate id. */
853
+ gateDecisions?: Record<string, GateDecision>;
854
+ /** Optional failure clusters from failureClusterView. */
855
+ failureClusters?: FailureClusterReport;
856
+ /** Build gain histograms for these candidates. Defaults to all non-comparator candidates. */
857
+ candidateIds?: string[];
858
+ /** Deterministic bootstrap seed passed to gainHistogram and the posterior helper. */
859
+ seed?: number;
860
+ /** Report timestamp. Defaults to current time. */
861
+ generatedAt?: string;
862
+ /**
863
+ * Hash of a preregistered protocol (e.g. `signManifest({...}).contentHash`).
864
+ * Embedded verbatim in the report so the analysis can be cited as the
865
+ * preregistered one rather than a post-hoc fishing expedition.
866
+ */
867
+ preregistrationHash?: string;
868
+ }
869
+ interface ResearchReportRecommendation {
870
+ decision: ResearchReportDecision;
871
+ candidateId: string | null;
872
+ rationale: string[];
873
+ risks: string[];
874
+ nextActions: string[];
875
+ }
876
+ interface ResearchReportCandidate {
877
+ candidateId: string;
878
+ n: number;
879
+ mean: number;
880
+ ciLow: number;
881
+ ciHigh: number;
882
+ qValue: number;
883
+ cohensD: number;
884
+ meanDeltaVsComparator: number | null;
885
+ pairedN: number;
886
+ medianGain: number | null;
887
+ meanGain: number | null;
888
+ gainCi: {
889
+ low: number;
890
+ high: number;
891
+ } | null;
892
+ /**
893
+ * Bayesian-bootstrap-style posterior summaries on the paired delta. Computed
894
+ * from the same resamples that produce the gain CI; interpretable as
895
+ * "fraction of resamples in which the candidate beats the comparator on
896
+ * matched pairs."
897
+ */
898
+ prGreaterThanZero: number | null;
899
+ prInRope: number | null;
900
+ /**
901
+ * Minimum detectable effect (in score units) at the candidate's paired N,
902
+ * the configured power, and the configured alpha. Standardised by the
903
+ * observed paired-delta SD and inverted via `requiredSampleSize`. Reported
904
+ * for every candidate so a `needs_more_data` verdict is actionable.
905
+ */
906
+ mde: number | null;
907
+ onParetoFrontier: boolean;
908
+ gate?: ParetoPoint['gate'];
909
+ decision: ResearchReportDecision;
910
+ decisionReason: string;
911
+ }
912
+ interface ResearchReportMethodology {
913
+ /**
914
+ * Plain-language assumptions the report depends on. Read these first when
915
+ * deciding whether the verdict is load-bearing for a launch decision.
916
+ */
917
+ assumptions: string[];
918
+ /** Tests and estimators the verdict was computed from. */
919
+ methods: string[];
920
+ /** Alternatives the author considered and why this report didn't take them. */
921
+ alternatives: string[];
922
+ /** Failure modes — when this report should NOT drive a decision. */
923
+ whenNotToApply: string[];
924
+ /** Citations for the methodological choices above. */
925
+ citations: string[];
926
+ }
927
+ interface ResearchReport {
928
+ kind: 'agent-eval-research-report';
929
+ title: string;
930
+ generatedAt: string;
931
+ split: 'search' | 'holdout';
932
+ comparator: string | null;
933
+ /**
934
+ * SHA-256 over the canonicalised set of `(runId, candidateId, split)` triples
935
+ * the report was computed from, plus the comparator and split. Stable across
936
+ * key insertion order; recomputable by the reader to verify provenance.
937
+ */
938
+ runFingerprint: string;
939
+ preregistrationHash: string | null;
940
+ rope: {
941
+ low: number;
942
+ high: number;
943
+ } | null;
944
+ executiveSummary: string[];
945
+ recommendation: ResearchReportRecommendation;
946
+ candidates: ResearchReportCandidate[];
947
+ summary: SummaryTable;
948
+ charts: {
949
+ pareto: ParetoFigureSpec;
950
+ gains: GainDistributionFigureSpec[];
951
+ };
952
+ methodology: ResearchReportMethodology;
953
+ failureClusters?: FailureClusterReport;
954
+ markdown: string;
955
+ html: string;
956
+ }
957
+ /**
958
+ * Executive research report for CPO / AI-lead / launch-review consumption.
959
+ *
960
+ * Composes:
961
+ * - `summaryTable` marginal stats with BH-FDR-adjusted q-values
962
+ * - `paretoChart` cost-vs-quality frontier with gate overlay
963
+ * - `gainHistogram` per-candidate paired-delta distribution
964
+ * - paired posterior (this file): bootstrap CI on median, Pr(Δ>0),
965
+ * Pr(Δ∈ROPE), MDE at the configured power
966
+ *
967
+ * Decisions are made on paired evidence — never on marginal means alone —
968
+ * and respect any held-out gate decision the caller passes through. The
969
+ * report embeds a SHA-256 fingerprint of the input run set and, optionally,
970
+ * the hash of a preregistered protocol so a downstream reader can verify
971
+ * provenance and that the analysis was the preregistered one.
972
+ *
973
+ * Async because the fingerprint uses Web Crypto via `hashJson`; deterministic
974
+ * for any fixed `runs`, `seed`, and ROPE.
975
+ */
976
+ declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
977
+
978
+ export { type ResearchReportOptions as $, type ActionableSideInfo as A, type MultiShotTrace as B, type MultiShotTrialResult as C, DEFAULT_RULES as D, type EvolvableVariant as E, type FailureClassification as F, type GainDistributionBin as G, HeldOutGate as H, InMemoryTrialCache as I, type MultiShotVariant as J, type ParetoFigureSpec as K, type ParetoPoint as L, type MutateAdapter as M, type PromptEvolutionConfig as N, type Objective as O, type ParetoResult as P, type PromptEvolutionEvent as Q, type PromptEvolutionResult as R, RESEARCH_REPORT_HARD_PAIR_FLOOR as S, type TrialCache as T, type ReflectionContext as U, type VariantAggregate as V, type ReflectionProposal as W, type ResearchReport as X, type ResearchReportCandidate as Y, type ResearchReportDecision as Z, type ResearchReportMethodology as _, type TrialResult as a, type ResearchReportRecommendation as a0, type ScenarioAggregate as a1, type ScoreAdapter as a2, type SummaryTable as a3, type SummaryTableOptions as a4, type SummaryTableRow as a5, type TrialTrace as a6, buildReflectionPrompt as a7, classifyFailure as a8, crowdingDistance as a9, defaultMultiShotObjectives as aa, dominates as ab, failureClusterView as ac, gainHistogram as ad, paretoChart as ae, paretoFrontier as af, paretoFrontierWithCrowding as ag, parseReflectionResponse as ah, researchReport as ai, runMultiShotOptimization as aj, runPromptEvolution as ak, scalarScore as al, summaryTable as am, trialTraceFromMultiShotTrial as an, type AsiSeverity as b, DEFAULT_MUTATION_PRIMITIVES as c, type Direction as d, type FailureCluster as e, type FailureClusterReport as f, type FailureContext as g, type FailureRule as h, type GainDistributionFigureSpec as i, type GainDistributionOptions as j, type GateDecision as k, type GateEvidence as l, type GenerationReport as m, type HeldOutGateConfig as n, type HeldOutGateRejectionCode as o, type MultiShotGateConfig as p, type MultiShotGateResult as q, type MultiShotMutateAdapter as r, type 
MultiShotOptimizationConfig as s, type MultiShotOptimizationResult as t, type MultiShotRun as u, type MultiShotRunInput as v, type MultiShotRunner as w, type MultiShotScore as x, type MultiShotScorer as y, type MultiShotSplit as z };
package/dist/traces.d.ts CHANGED
@@ -1,5 +1,9 @@
1
- import { a as TraceStore, L as LlmSpan, J as JudgeSpan, R as Run, F as FailureClass, d as ToolSpan } from './emitter-BYO2nSDA.js';
2
- export { A as Artifact, B as BudgetLedgerEntry, c as BudgetSpec, E as EventFilter, f as EventKind, g as FAILURE_CLASSES, h as FileSystemTraceStore, i as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, j as RetrievalSpan, e as RunFilter, k as RunLayer, C as RunOutcome, l as RunStatus, m as SandboxSpan, S as Span, n as SpanBase, o as SpanFilter, p as SpanHandle, q as SpanKind, r as SpanStatus, s as TRACE_SCHEMA_VERSION, T as TraceEmitter, t as TraceEmitterOptions, b as TraceEvent, u as isJudgeSpan, v as isLlmSpan, w as isRetrievalSpan, x as isSandboxSpan, y as isToolSpan, z as llmSpanFromProvider } from './emitter-BYO2nSDA.js';
1
+ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureClass, c as ToolSpan } from './store-u47QaJ9G.js';
2
+ export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
3
+ import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
4
+ export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
5
+ import { d as RawProviderSink, c as RawProviderEvent } from './integrity-K2oVlF57.js';
6
+ export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
3
7
  import { AxAIService, AxFunction } from '@ax-llm/ax';
4
8
 
5
9
  /**
@@ -130,6 +134,124 @@ interface OtlpExport {
130
134
  /** Export a single run's spans + events in OTLP/JSON. */
131
135
  declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
132
136
 
137
+ /**
138
+ * Replay-from-raw-events — turn every captured campaign run into a
139
+ * re-runnable artifact.
140
+ *
141
+ * The premise: 0.21 made `RawProviderSink` capture every provider HTTP
142
+ * envelope. 0.22's `runEvalCampaign` makes capture the default. Together
143
+ * they mean every past run is a complete fingerprint of what happened on
144
+ * the wire — and that fingerprint is enough to replay the run without
145
+ * burning new LLM cost.
146
+ *
147
+ * Three use cases this primitive enables:
148
+ *
149
+ * 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
150
+ * to last week's runs without re-calling any LLM. The cost of trying
151
+ * a new rubric drops from "another full sweep" to a CPU-bound replay.
152
+ * 2. **Determinism audits** — replay the same campaign and verify the
153
+ * raw responses match byte-for-byte. Any drift is a non-determinism
154
+ * bug (in the harness, the prompt builder, the sandbox, …).
155
+ * 3. **Free judge calibration** — run two judges on identical responses
156
+ * and measure inter-judge agreement without doubling LLM spend.
157
+ *
158
+ * The interface is deliberately fetch-shaped. Inject `createReplayFetch`
159
+ * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
160
+ * from the cache instead of calling the network. No new code path through
161
+ * the LLM client is needed; the cache hit is invisible to the runner.
162
+ */
163
+
164
+ declare class ReplayCacheMissError extends Error {
165
+ readonly url: string;
166
+ readonly requestKey: string;
167
+ constructor(url: string, requestKey: string, message?: string);
168
+ }
169
+ interface ReplayCacheEntry {
170
+ request: RawProviderEvent;
171
+ response: RawProviderEvent;
172
+ }
173
+ interface ReplayCacheStats {
174
+ total: number;
175
+ byProvider: Record<string, number>;
176
+ byModel: Record<string, number>;
177
+ /** Spans for which we have a request but no response (run aborted mid-call). */
178
+ orphanRequests: number;
179
+ }
180
+ /**
181
+ * In-memory deterministic cache of (request → response) keyed on a stable
182
+ * hash of the request body. Built from a `RawProviderSink` containing
183
+ * paired `request` and `response` events from a previous run.
184
+ *
185
+ * The cache is the source of truth for replay; `createReplayFetch` is a
186
+ * thin wrapper that reads from it.
187
+ */
188
+ declare class ReplayCache {
189
+ private byKey;
190
+ private orphans;
191
+ private byProvider;
192
+ private byModel;
193
+ /**
194
+ * Build a cache from a sink's events. The sink must implement `list()`.
195
+ * Filter by `runId` / `spanId` to scope to a specific replay.
196
+ */
197
+ static fromSink(sink: RawProviderSink, filter?: {
198
+ runId?: string;
199
+ spanId?: string;
200
+ }): Promise<ReplayCache>;
201
+ /** Build a cache from an in-memory event list. */
202
+ static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
203
+ /** Number of cacheable (request, response) pairs in the cache. */
204
+ size(): number;
205
+ stats(): ReplayCacheStats;
206
+ /**
207
+ * Look up a cached response by hashing the (model, messages, temperature,
208
+ * maxTokens, response_format) shape. Returns `undefined` on miss; the
209
+ * caller decides whether to throw, fall back to the network, or skip.
210
+ */
211
+ lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
212
+ }
213
+ interface ReplayFetchOptions {
214
+ /**
215
+ * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
216
+ * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
217
+ * still complete; `'fail-closed'` returns a synthetic 599 response so the
218
+ * call site sees a non-retriable failure.
219
+ */
220
+ onMiss?: 'throw' | 'fallback' | 'fail-closed';
221
+ fallbackFetch?: typeof fetch;
222
+ /** Optional callback fired once per replayed call (for telemetry / counters). */
223
+ onHit?: (info: {
224
+ url: string;
225
+ provider: string;
226
+ model: string;
227
+ }) => void;
228
+ /** Optional callback fired on cache miss before the `onMiss` policy applies. */
229
+ onMissNotify?: (info: {
230
+ url: string;
231
+ requestBody: unknown;
232
+ }) => void;
233
+ }
234
+ /**
235
+ * Build a `fetch`-shaped function that serves cached responses out of a
236
+ * `ReplayCache` for any URL ending in `/chat/completions`. Pass through
237
+ * `LlmClientOptions.fetch` and `callLlm` becomes free.
238
+ *
239
+ * Non-`/chat/completions` URLs are passed straight to the fallback fetch
240
+ * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
241
+ * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
242
+ * `fetch` and shouldn't be intercepted.
243
+ */
244
+ declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
245
+ /**
246
+ * Convenience iterator over `(request, response)` pairs in a sink — for
247
+ * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
248
+ * runs purely in-process over cached LLM outputs.
249
+ */
250
+ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
251
+ runId?: string;
252
+ spanId?: string;
253
+ }): AsyncGenerator<ReplayCacheEntry>;
254
+
133
255
  /**
134
256
  * Shared types for the trace-analyst module.
135
257
  *
@@ -578,6 +700,60 @@ declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
578
700
  functions: AxFunction[];
579
701
  };
580
702
 
703
+ /**
704
+ * Trace-analyst auto-execution hook.
705
+ *
706
+ * Wires `analyzeTraces` into a `TraceEmitter`'s `onRunComplete` so a
707
+ * direct matrix run produces an analysis artifact without an out-of-band
708
+ * step. Designed for the case where the consumer reports "the analyst
709
+ * never ran" — the cause is almost always orchestration, not the analyst.
710
+ *
711
+ * Usage:
712
+ *
713
+ * const emitter = new TraceEmitter(store, {
714
+ * onRunComplete: [traceAnalystOnRunComplete({ analyze: opts, save })],
715
+ * })
716
+ *
717
+ * Hooks are best-effort by default — they never crash the underlying run.
718
+ * The caller decides whether to gate the run on the analysis result via
719
+ * the `gateOn` callback.
720
+ */
721
+
722
+ interface TraceAnalystHookOptions {
723
+ /**
724
+ * Options forwarded to `analyzeTraces` (`source` may be omitted here). The
725
+ * hook supplies the question if you don't pass one — defaulting to a launch-grade
726
+ * prompt that asks for failure modes, surprising findings, and a recommendation.
727
+ */
728
+ analyze: Omit<AnalyzeTracesOptions, 'source'> & {
729
+ source?: AnalyzeTracesOptions['source'];
730
+ };
731
+ /**
732
+ * Override the question. The default is intentionally generic:
733
+ * "Summarise what happened in this run, surface any failure modes,
734
+ * surprising findings, or evidence the verdict is wrong."
735
+ */
736
+ question?: string;
737
+ /**
738
+ * Optional: persist the result. The hook calls this with the analysis output
739
+ * and the run context. Common implementations write to a TraceAnalysisStore
740
+ * or append to a per-run JSONL.
741
+ */
742
+ save?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => Promise<void>;
743
+ /**
744
+ * Optional predicate gating execution per run. Default: every completed run.
745
+ * Use to skip aborted runs, debug runs, or runs without LLM activity.
746
+ */
747
+ shouldRun?: (ctx: RunCompleteHookContext) => boolean;
748
+ /**
749
+ * Optional gate: if set and it returns false, the hook records the failure
750
+ * as a log event on the run instead of staying quiet. The caller can
751
+ * then trigger downstream alerts off `analyst_gate_failed` log events.
752
+ */
753
+ gateOn?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => boolean;
754
+ }
755
+ /** Builds the `RunCompleteHook` to list in a `TraceEmitter`'s `onRunComplete` option, configured by `opts`. */ declare function traceAnalystOnRunComplete(opts: TraceAnalystHookOptions): RunCompleteHook;
756
+
581
757
  /** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
582
758
  declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. 
ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. 
Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n ```js\n await final(\"Format the per-item verdict report from the evidence below.\", {\n findings: [\n { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n ...all items\n ],\n systemic_summary: '3 sentences I wrote based on the evidence above'\n });\n ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. `spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. 
read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
583
759
  declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
@@ -655,4 +831,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
655
831
  declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
656
832
  declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
657
833
 
658
- export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, type RedactionReport, type RedactionRule, Run, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup };
834
+ export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, Run, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };