@tangle-network/agent-eval 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,57 +1,32 @@
1
1
  # @tangle-network/agent-eval
2
2
 
3
- Domain-agnostic evaluation framework for Tangle agent apps. Multi-turn scenario execution, multi-judge scoring, agent-driver meta-testing, convergence tracking. Every agent (tax, legal, film, gtm) imports this to get a reproducible quality harness.
3
+ Trace-first evaluation framework for Tangle agents. Core (spans, pipelines, sandbox harness, OTLP export), trust (dataset, red-team, calibration, behavior DSL), builder-of-builders (three-layer eval, resumable sessions, meta-runtime correlation), and frontier (meta-eval correlation study, Process Reward Modeling, bisector).
4
4
 
5
5
  ## Install
6
6
 
7
7
  ```bash
8
- npm install @tangle-network/agent-eval
8
+ pnpm add @tangle-network/agent-eval
9
9
  ```
10
10
 
11
11
  ## Usage
12
12
 
13
- ```ts
14
- import { BenchmarkRunner, ProductClient, defaultJudges } from '@tangle-network/agent-eval'
15
-
16
- const client = new ProductClient({
17
- baseUrl: 'https://my-agent.tangle.tools',
18
- routes: {
19
- signup: '/api/auth/sign-up/email',
20
- chat: '/api/chat',
21
- // ...
22
- },
23
- })
24
-
25
- const runner = new BenchmarkRunner(client, {
26
- scenarios: myScenarios,
27
- judges: defaultJudges('film production'),
28
- systemPrompt: MY_SYSTEM_PROMPT,
29
- })
30
-
31
- const report = await runner.run()
32
- ```
33
-
34
- ## What's in the box
13
+ **→ [`.claude/skills/agent-eval/SKILL.md`](./.claude/skills/agent-eval/SKILL.md)** — single source of truth for every usage pattern. Covers: minimal builder-of-builders path, the seven muffled-gate footguns paid for in shipped bugs, the three-layer eval contract, regression tests worth writing, and "when to use what" for the 100+ exports.
35
14
 
36
- - **ProductClient** — configurable HTTP client (routes are config, not code)
37
- - **ScenarioRegistry** — auto-discovery + filtering
38
- - **executeScenario** — multi-turn executor with artifact collection
39
- - **BenchmarkRunner** — orchestrates scenarios + judges + scoring
40
- - **AgentDriver** — meta-agent that plays personas against a real product
41
- - **MetricsCollector** — per-turn product state metrics
42
- - **ConvergenceTracker** — completion% over turns
43
- - **Reporter** — markdown + console output
44
- - **Judges** — 4 built-in (domain expert, code execution, coherence, adversarial) + `createCustomJudge` factory
15
+ If you're an LLM or agent reading this, load the skill file before writing integration code — it encodes 10+ incident-driven directives that will save you from rediscovering them.
45
16
 
46
- ## Tier
17
+ ## Dev
47
18
 
48
- Marketplace tier of the [agent-builder](https://github.com/drewstone/tangle-agent-builder) three-tier architecture. Uses [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud) for judge LLM calls.
19
+ ```bash
20
+ pnpm build # tsup
21
+ pnpm test # vitest
22
+ pnpm typecheck # tsc --noEmit
23
+ ```
49
24
 
50
25
  ## Related
51
26
 
52
- - [`@tangle-network/agent-gateway`](https://github.com/tangle-network/agent-gateway) — the gateway agents published through
53
- - [`@tangle-network/agent-client`](https://github.com/tangle-network/agent-client) — consumer SDK for those endpoints
54
- - [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud) — platform SDK (used internally by judges)
27
+ - [`@tangle-network/agent-gateway`](https://github.com/tangle-network/agent-gateway)
28
+ - [`@tangle-network/agent-client`](https://github.com/tangle-network/agent-client)
29
+ - [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud)
55
30
 
56
31
  ## License
57
32
 
package/dist/index.d.ts CHANGED
@@ -1065,6 +1065,8 @@ interface RunScore {
1065
1065
  toolUseQuality: number;
1066
1066
  patchQuality: number;
1067
1067
  testReality: number;
1068
+ finalGate: number;
1069
+ reviewerBlockers: number;
1068
1070
  costUsd: number;
1069
1071
  wallSeconds: number;
1070
1072
  notes?: string[];
@@ -1077,6 +1079,8 @@ interface RunScoreWeights {
1077
1079
  toolUseQuality: number;
1078
1080
  patchQuality: number;
1079
1081
  testReality: number;
1082
+ finalGate: number;
1083
+ reviewerBlockers: number;
1080
1084
  costUsd: number;
1081
1085
  wallSeconds: number;
1082
1086
  }
@@ -1707,6 +1711,124 @@ declare class AxGepaSteeringOptimizer {
1707
1711
  optimize(rows: SteeringOptimizationRow[]): Promise<SteeringOptimizationResult>;
1708
1712
  }
1709
1713
 
1714
+ /**
1715
+ * Pareto frontier — multi-objective optimization over candidate runs.
1716
+ *
1717
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
1718
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
1719
+ * ttfb), you rarely have a single "winner" — you have a set of
1720
+ * non-dominated candidates. This module exposes:
1721
+ *
1722
+ * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
1723
+ * - `dominates`: does A dominate B across all objectives?
1724
+ *
1725
+ * Each objective is declared with a direction: 'maximize' (higher=better)
1726
+ * or 'minimize' (lower=better). Candidates are any object; pass an
1727
+ * `objective(candidate)` accessor.
1728
+ */
1729
+ type Direction = 'maximize' | 'minimize';
1730
+ interface Objective<T> {
1731
+ /** Stable label used in reports. */
1732
+ name: string;
1733
+ direction: Direction;
1734
+ value: (candidate: T) => number;
1735
+ }
1736
+ interface ParetoResult<T> {
1737
+ frontier: T[];
1738
+ dominated: T[];
1739
+ /** Index map: frontier[i] dominates each of dominatedBy[i]. */
1740
+ dominanceMap: Array<{
1741
+ dominator: T;
1742
+ dominated: T[];
1743
+ }>;
1744
+ }
1745
+ /** Does candidate A weakly dominate B — strictly better on at least one objective and no worse on any? */
1746
+ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
1747
+ /**
1748
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
1749
+ * objective are excluded (can't rank them). A candidate enters the frontier
1750
+ * iff no other candidate dominates it.
1751
+ */
1752
+ declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
1753
+
1754
+ type HarnessIntervention = 'continue' | 'plan' | 'audit' | 'recover' | 'repair' | 'verify' | 'final_gate' | 'wait_for_measurement' | 'abort';
1755
+ interface WorkflowTopology {
1756
+ id: string;
1757
+ interventions: HarnessIntervention[];
1758
+ maxParallelBranches?: number;
1759
+ metadata?: Record<string, unknown>;
1760
+ }
1761
+ interface MeasurementPolicy {
1762
+ required: string[];
1763
+ optional?: string[];
1764
+ promoteOn?: Array<keyof RunScore | 'aggregate'>;
1765
+ }
1766
+ interface HarnessVariant {
1767
+ id: string;
1768
+ steering?: SteeringBundle;
1769
+ topology?: WorkflowTopology;
1770
+ measurement?: MeasurementPolicy;
1771
+ budgets?: Record<string, number>;
1772
+ models?: Record<string, string>;
1773
+ reviewers?: Record<string, string>;
1774
+ metadata?: Record<string, unknown>;
1775
+ }
1776
+ interface HarnessScenario {
1777
+ id: string;
1778
+ task: string;
1779
+ split?: 'train' | 'validation' | 'test' | string;
1780
+ metadata?: Record<string, unknown>;
1781
+ }
1782
+ interface HarnessRunRequest {
1783
+ variant: HarnessVariant;
1784
+ scenario: HarnessScenario;
1785
+ trialIndex: number;
1786
+ }
1787
+ interface HarnessAdapter {
1788
+ run(request: HarnessRunRequest): Promise<RunTrace>;
1789
+ }
1790
+ interface HarnessRunResult {
1791
+ variant: HarnessVariant;
1792
+ scenario: HarnessScenario;
1793
+ trialIndex: number;
1794
+ trace: RunTrace;
1795
+ score: RunScore;
1796
+ aggregate: number;
1797
+ }
1798
+ interface HarnessVariantReport {
1799
+ variant: HarnessVariant;
1800
+ runs: HarnessRunResult[];
1801
+ aggregateMean: number;
1802
+ passRate: number;
1803
+ costUsdMean: number;
1804
+ wallSecondsMean: number;
1805
+ scoreMean: RunScore;
1806
+ }
1807
+ interface HarnessSelection {
1808
+ winner: HarnessVariantReport;
1809
+ frontier: ParetoResult<HarnessVariantReport>;
1810
+ reports: HarnessVariantReport[];
1811
+ }
1812
+ interface HarnessExperimentResult {
1813
+ results: HarnessRunResult[];
1814
+ selection: HarnessSelection;
1815
+ }
1816
+ interface HarnessExperimentConfig {
1817
+ adapter: HarnessAdapter;
1818
+ variants: HarnessVariant[];
1819
+ scenarios: HarnessScenario[];
1820
+ trialsPerScenario?: number;
1821
+ parallelism?: number;
1822
+ weights?: Partial<RunScoreWeights>;
1823
+ objectives?: Array<Objective<HarnessVariantReport>>;
1824
+ score?: (trace: RunTrace, request: HarnessRunRequest) => RunScore | Promise<RunScore>;
1825
+ onResult?: (result: HarnessRunResult) => void | Promise<void>;
1826
+ }
1827
+ declare const DEFAULT_HARNESS_OBJECTIVES: Array<Objective<HarnessVariantReport>>;
1828
+ declare function runHarnessExperiment(config: HarnessExperimentConfig): Promise<HarnessExperimentResult>;
1829
+ declare function selectHarnessVariant(results: HarnessRunResult[], objectives?: Array<Objective<HarnessVariantReport>>): HarnessSelection;
1830
+ declare function summarizeHarnessResults(results: HarnessRunResult[]): HarnessVariantReport[];
1831
+
1710
1832
  /**
1711
1833
  * SandboxHarness — executes a scenario in an isolated environment and
1712
1834
  * emits a rich SandboxSpan into the trace.
@@ -1767,8 +1889,27 @@ declare const pytestTestParser: TestOutputParser;
1767
1889
  declare const jestTestParser: TestOutputParser;
1768
1890
  /** Composite parser — tries a list of parsers in order. */
1769
1891
  declare function composeParsers(...parsers: TestOutputParser[]): TestOutputParser;
1892
+ interface SubprocessSandboxDriverOptions {
1893
+ /**
1894
+ * Default cwd for all `exec` calls. Used when the per-call `HarnessConfig`
1895
+ * does not set its own `cwd`. Lets callers bind the driver to a working
1896
+ * directory once instead of spreading cwd into every harness config —
1897
+ * useful when the harness config is constructed far from the call site
1898
+ * (e.g. starter-foundry's promoter passes a static HarnessConfig per
1899
+ * family taxonomy but needs a per-run composed-scaffold cwd).
1900
+ */
1901
+ cwd?: string;
1902
+ /**
1903
+ * Default env merged into every `exec` call's env (per-call `HarnessConfig.env`
1904
+ * still wins on key collision). Same ergonomic rationale as `cwd` above.
1905
+ */
1906
+ env?: Record<string, string>;
1907
+ }
1770
1908
  declare class SubprocessSandboxDriver implements SandboxDriver {
1771
1909
  id: string;
1910
+ private defaultCwd?;
1911
+ private defaultEnv?;
1912
+ constructor(options?: SubprocessSandboxDriverOptions);
1772
1913
  exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
1773
1914
  }
1774
1915
  declare class DockerSandboxDriver implements SandboxDriver {
@@ -2687,6 +2828,26 @@ declare class CostTracker {
2687
2828
  timestamp?: number;
2688
2829
  }): CostEntry;
2689
2830
  markOutcome(scenarioId: string, completed: boolean): void;
2831
+ /**
2832
+ * Convenience: record + markOutcome in one call from a
2833
+ * `{ usage, verdict }`-shaped response (starter-foundry's
2834
+ * `invokeMetaJudge` returns this shape; consumers that wrap any
2835
+ * judge/critic can follow the same convention).
2836
+ *
2837
+ * `usage.model` must be present in `MODEL_PRICING` for cost math to
2838
+ * populate; otherwise totalCostUsd stays at 0 for the entry but
2839
+ * tokens still aggregate.
2840
+ */
2841
+ recordVerdict(verdict: {
2842
+ usage?: {
2843
+ inputTokens: number;
2844
+ outputTokens: number;
2845
+ model: string;
2846
+ cachedTokens?: number;
2847
+ reasoningTokens?: number;
2848
+ };
2849
+ verdict?: 'pass' | 'fail' | 'borderline' | string;
2850
+ }, scenarioId: string, tags?: Record<string, string>): CostEntry | null;
2690
2851
  get(scenarioId: string): ScenarioCost | undefined;
2691
2852
  list(): ScenarioCost[];
2692
2853
  summary(): CostSummary;
@@ -2703,44 +2864,106 @@ interface CostSummary {
2703
2864
  }
2704
2865
 
2705
2866
  /**
2706
- * Pareto frontier — multi-objective optimization over candidate runs.
2867
+ * muffled-gate-scanner — test helper that greps consumer source for
2868
+ * gate + measurement anti-patterns and fails with file:line locations.
2707
2869
  *
2708
- * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
2709
- * trading off (cost, latency, quality) or (passRate, tokenBudget,
2710
- * ttfb), you rarely have a single "winner" — you have a set of
2711
- * non-dominated candidates. This module exposes:
2870
+ * Named pattern lives at starter-foundry's `.evolve/patterns/muffled-gate.md`;
2871
+ * same shape applies to every consumer (a gate that should fail loud
2872
+ * returns silent success; a metric that should emit a real number
2873
+ * reports noise/empty).
2712
2874
  *
2713
- * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
2714
- * - `dominates`: does A dominate B across all objectives?
2875
+ * Usage (in a consumer project's test file):
2715
2876
  *
2716
- * Each objective is declared with a direction: 'maximize' (higher=better)
2717
- * or 'minimize' (lower=better). Candidates are any object; pass an
2718
- * `objective(candidate)` accessor.
2877
+ * import { scanForMuffledGates, DEFAULT_FINDERS } from '@tangle-network/agent-eval'
2878
+ *
2879
+ * test('no muffled gates in eval surface', () => {
2880
+ * const findings = scanForMuffledGates({
2881
+ * repoRoot: process.cwd(),
2882
+ * scanFiles: ['src/eval/scaffold.ts', 'scripts/promote.mjs'],
2883
+ * finders: DEFAULT_FINDERS,
2884
+ * })
2885
+ * if (findings.length) assert.fail(formatFindings(findings))
2886
+ * })
2887
+ *
2888
+ * Customize by passing your own `finders` — each finder is
2889
+ * `(file, text) => Finding[]` and runs per-file.
2890
+ *
2891
+ * Escape hatch: any line containing `muffle-ok:` is excluded from all
2892
+ * finders, letting consumers opt a legitimate fallback out explicitly.
2719
2893
  */
2720
- type Direction = 'maximize' | 'minimize';
2721
- interface Objective<T> {
2722
- /** Stable label used in reports. */
2723
- name: string;
2724
- direction: Direction;
2725
- value: (candidate: T) => number;
2726
- }
2727
- interface ParetoResult<T> {
2728
- frontier: T[];
2729
- dominated: T[];
2730
- /** Index map: frontier[i] dominates each of dominatedBy[i]. */
2731
- dominanceMap: Array<{
2732
- dominator: T;
2733
- dominated: T[];
2734
- }>;
2894
+ interface MuffledFinding {
2895
+ file: string;
2896
+ line: number;
2897
+ lineText: string;
2898
+ pattern: string;
2899
+ }
2900
+ type MuffledFinder = (file: string, text: string) => MuffledFinding[];
2901
+ interface ScanOptions {
2902
+ /** Absolute path to the repo root. */
2903
+ repoRoot: string;
2904
+ /** Explicit file list (paths relative to repoRoot) for context-specific finders. */
2905
+ scanFiles: string[];
2906
+ /**
2907
+ * Auto-derived scan: walk these dirs for files matching importGlob + the
2908
+ * string `importsContain` and run the universal finders on them. Pattern
2909
+ * from starter-foundry H4 (research/decisions/001) — catches new files
2910
+ * with agent-eval import that would otherwise escape context-specific
2911
+ * scan lists.
2912
+ */
2913
+ autoDerive?: {
2914
+ roots: string[];
2915
+ extensions: RegExp;
2916
+ importsContain: string;
2917
+ universalFinders: MuffledFinder[];
2918
+ };
2919
+ /** Per-file finders (context-specific patterns). */
2920
+ finders: MuffledFinder[];
2735
2921
  }
2736
- /** Does candidate A weakly dominate B — strictly better on at least one objective and no worse on any? */
2737
- declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
2738
2922
  /**
2739
- * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
2740
- * objective are excluded (can't rank them). A candidate enters the frontier
2741
- * iff no other candidate dominates it.
2923
+ * Default finder: `command || true` in a testCommand/setupCommand/cmd/command
2924
+ * string. Swallows exit codes.
2742
2925
  */
2743
- declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
2926
+ declare const findFallbackToPass: MuffledFinder;
2927
+ /**
2928
+ * `testCommand: 'true'` literal silent-pass — an unknown-language dispatch
2929
+ * arm that returns a no-op instead of throwing.
2930
+ */
2931
+ declare const findLiteralTruePass: MuffledFinder;
2932
+ /**
2933
+ * `new SubprocessSandboxDriver({ cwd: ... })` — constructor arg silently
2934
+ * dropped in agent-eval <0.7.1. 0.7.1+ honors as fallback, but the form
2935
+ * still invites confusion; prefer `new SubprocessSandboxDriver()` with
2936
+ * cwd in the per-call HarnessConfig.
2937
+ */
2938
+ declare const findConstructorCwdDropped: MuffledFinder;
2939
+ /**
2940
+ * `if (!expected) return true` — matcher auto-passes when ground truth is
2941
+ * absent. Inflates accuracy metrics for scenarios without expectations.
2942
+ */
2943
+ declare const findAutoMatchNoExpectation: MuffledFinder;
2944
+ /**
2945
+ * `if (p.skipped) return true` — skip-counts-as-pass in quality scorers.
2946
+ * Use three-valued `true | false | 'skipped'` return + explicit partial
2947
+ * credit instead.
2948
+ */
2949
+ declare const findSkipCountsAsPass: MuffledFinder;
2950
+ /**
2951
+ * The canonical default bundle. Callers can import these individually,
2952
+ * replace them, or append custom finders for project-specific patterns.
2953
+ */
2954
+ declare const DEFAULT_FINDERS: MuffledFinder[];
2955
+ /** Finders that should run on EVERY file with the target import, not just SCAN_FILES. */
2956
+ declare const UNIVERSAL_FINDERS: MuffledFinder[];
2957
+ /**
2958
+ * Run all finders against the configured files. Returns a flat list of
2959
+ * findings. Callers format + assert as they prefer.
2960
+ */
2961
+ declare function scanForMuffledGates(opts: ScanOptions): MuffledFinding[];
2962
+ /**
2963
+ * Format findings into a single assert.fail-ready message. Each finding
2964
+ * carries file:line + pattern name + the offending line.
2965
+ */
2966
+ declare function formatFindings(findings: MuffledFinding[]): string;
2744
2967
 
2745
2968
  /**
2746
2969
  * Series convergence — detects whether a sequence of scalar measurements
@@ -4573,4 +4796,4 @@ interface UseCaseSignals {
4573
4796
  declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
4574
4797
  declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
4575
4798
 
4576
- export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type 
DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type 
OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type 
SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, 
dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runJudgeFleet, runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, 
wilcoxonSignedRank };
4799
+ export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type 
DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, 
MetricsCollector, type MuffledFinder, type MuffledFinding, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, 
checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, formatBenchmarkReport, formatDriverReport, formatFindings, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, 
runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };