@tangle-network/agent-eval 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1539 -171
- package/dist/index.js +4841 -2552
- package/dist/index.js.map +1 -1
- package/package.json +6 -2
package/dist/index.d.ts
CHANGED
|
@@ -1033,86 +1033,56 @@ declare class PromptOptimizer {
|
|
|
1033
1033
|
run(config: OptimizationConfig): Promise<OptimizationResult>;
|
|
1034
1034
|
}
|
|
1035
1035
|
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
* Pattern lifted from tax-agent + legal-agent: two agents take turns until
|
|
1040
|
-
* they converge on a consensus artifact. One proposes, the other critiques;
|
|
1041
|
-
* the proposer revises; repeat until a score threshold is hit or max rounds.
|
|
1042
|
-
*
|
|
1043
|
-
* Generalized so any two "agents" (gateways, local functions, anything with
|
|
1044
|
-
* `propose` + `critique`) compose in. Returns convergence rounds per
|
|
1045
|
-
* scenario + whether convergence happened.
|
|
1046
|
-
*/
|
|
1047
|
-
interface DualAgentScenario {
|
|
1048
|
-
id: string;
|
|
1049
|
-
initialPrompt: string;
|
|
1050
|
-
/** Optional context the agents can read (e.g. source documents). */
|
|
1051
|
-
context?: Record<string, unknown>;
|
|
1036
|
+
interface SteeringRolePrompt {
|
|
1037
|
+
system?: string;
|
|
1038
|
+
append?: string;
|
|
1052
1039
|
}
|
|
1053
|
-
interface
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
converged: boolean;
|
|
1062
|
-
roundsToConverge: number | null;
|
|
1063
|
-
finalProposal: string;
|
|
1064
|
-
history: DualAgentRound[];
|
|
1065
|
-
finalScore: number;
|
|
1066
|
-
}
|
|
1067
|
-
interface DualAgentBenchConfig {
|
|
1068
|
-
scenarios: DualAgentScenario[];
|
|
1069
|
-
maxRounds?: number;
|
|
1070
|
-
/** Convergence threshold in 0..1 (default 0.85). */
|
|
1071
|
-
convergenceThreshold?: number;
|
|
1072
|
-
/**
|
|
1073
|
-
* Propose an answer given the scenario + the critic's prior critique (if any).
|
|
1074
|
-
* Returns the proposal string.
|
|
1075
|
-
*/
|
|
1076
|
-
propose: (args: {
|
|
1077
|
-
scenario: DualAgentScenario;
|
|
1078
|
-
roundIndex: number;
|
|
1079
|
-
priorProposal?: string;
|
|
1080
|
-
priorCritique?: string;
|
|
1081
|
-
}) => Promise<string>;
|
|
1082
|
-
/**
|
|
1083
|
-
* Critique the proposer's current output. Returns a structured critique
|
|
1084
|
-
* (free text) plus a convergence score: how close the proposal is to
|
|
1085
|
-
* acceptable. 1.0 = accept, 0.0 = totally off.
|
|
1086
|
-
*/
|
|
1087
|
-
critique: (args: {
|
|
1088
|
-
scenario: DualAgentScenario;
|
|
1089
|
-
roundIndex: number;
|
|
1090
|
-
proposal: string;
|
|
1091
|
-
}) => Promise<{
|
|
1092
|
-
critique: string;
|
|
1093
|
-
convergenceScore: number;
|
|
1094
|
-
}>;
|
|
1095
|
-
/** Optional per-round hook for progress + tracing. */
|
|
1096
|
-
onRoundComplete?: (info: {
|
|
1097
|
-
scenarioId: string;
|
|
1098
|
-
round: DualAgentRound;
|
|
1099
|
-
}) => void;
|
|
1040
|
+
interface SteeringBundle {
|
|
1041
|
+
id: string;
|
|
1042
|
+
coderPrompt?: string;
|
|
1043
|
+
continuePrompt?: string;
|
|
1044
|
+
reviewerPrompts?: Record<string, string>;
|
|
1045
|
+
skills?: string[];
|
|
1046
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1047
|
+
metadata?: Record<string, unknown>;
|
|
1100
1048
|
}
|
|
1101
|
-
interface
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
config: {
|
|
1109
|
-
maxRounds: number;
|
|
1110
|
-
convergenceThreshold: number;
|
|
1111
|
-
};
|
|
1049
|
+
interface SteeringDelta {
|
|
1050
|
+
coderPrompt?: string;
|
|
1051
|
+
continuePrompt?: string;
|
|
1052
|
+
reviewerPrompts?: Record<string, string>;
|
|
1053
|
+
skills?: string[];
|
|
1054
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1055
|
+
metadata?: Record<string, unknown>;
|
|
1112
1056
|
}
|
|
1113
|
-
declare
|
|
1114
|
-
|
|
1057
|
+
declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
|
|
1058
|
+
declare function renderSteeringText(bundle: SteeringBundle): string;
|
|
1059
|
+
|
|
1060
|
+
interface RunScore {
|
|
1061
|
+
success: number;
|
|
1062
|
+
goalProgress: number;
|
|
1063
|
+
repoGroundedness: number;
|
|
1064
|
+
driftPenalty: number;
|
|
1065
|
+
toolUseQuality: number;
|
|
1066
|
+
patchQuality: number;
|
|
1067
|
+
testReality: number;
|
|
1068
|
+
costUsd: number;
|
|
1069
|
+
wallSeconds: number;
|
|
1070
|
+
notes?: string[];
|
|
1071
|
+
}
|
|
1072
|
+
interface RunScoreWeights {
|
|
1073
|
+
success: number;
|
|
1074
|
+
goalProgress: number;
|
|
1075
|
+
repoGroundedness: number;
|
|
1076
|
+
driftPenalty: number;
|
|
1077
|
+
toolUseQuality: number;
|
|
1078
|
+
patchQuality: number;
|
|
1079
|
+
testReality: number;
|
|
1080
|
+
costUsd: number;
|
|
1081
|
+
wallSeconds: number;
|
|
1115
1082
|
}
|
|
1083
|
+
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
1084
|
+
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
1085
|
+
declare function clamp01(value: number): number;
|
|
1116
1086
|
|
|
1117
1087
|
/**
|
|
1118
1088
|
* TraceSchema v1 — the canonical data model for agent-eval.
|
|
@@ -1606,6 +1576,137 @@ interface OtlpExport {
|
|
|
1606
1576
|
/** Export a single run's spans + events in OTLP/JSON. */
|
|
1607
1577
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
1608
1578
|
|
|
1579
|
+
interface RunTrace {
|
|
1580
|
+
run: Run;
|
|
1581
|
+
spans: Span[];
|
|
1582
|
+
events: TraceEvent[];
|
|
1583
|
+
artifacts: Artifact[];
|
|
1584
|
+
budget: BudgetLedgerEntry[];
|
|
1585
|
+
}
|
|
1586
|
+
interface RunCriticOptions {
|
|
1587
|
+
weights?: Partial<RunScoreWeights>;
|
|
1588
|
+
driftPatterns?: RegExp[];
|
|
1589
|
+
}
|
|
1590
|
+
declare class RunCritic {
|
|
1591
|
+
private readonly weights?;
|
|
1592
|
+
private readonly driftPatterns;
|
|
1593
|
+
constructor(options?: RunCriticOptions);
|
|
1594
|
+
score(store: TraceStore, runId: string): Promise<RunScore>;
|
|
1595
|
+
scoreTrace(trace: RunTrace): RunScore;
|
|
1596
|
+
rank(score: RunScore): number;
|
|
1597
|
+
private isDrift;
|
|
1598
|
+
}
|
|
1599
|
+
|
|
1600
|
+
interface PlaybookEntry {
|
|
1601
|
+
instruction: string;
|
|
1602
|
+
rationale: string;
|
|
1603
|
+
category?: string;
|
|
1604
|
+
evidence?: string;
|
|
1605
|
+
weight?: number;
|
|
1606
|
+
sourceRunId?: string;
|
|
1607
|
+
}
|
|
1608
|
+
interface Playbook {
|
|
1609
|
+
entries: PlaybookEntry[];
|
|
1610
|
+
}
|
|
1611
|
+
declare function distillPlaybook(entries: PlaybookEntry[], options?: {
|
|
1612
|
+
maxEntries?: number;
|
|
1613
|
+
}): Playbook;
|
|
1614
|
+
declare function renderPlaybookMarkdown(playbook: Playbook): string;
|
|
1615
|
+
|
|
1616
|
+
interface OptimizationExample {
|
|
1617
|
+
scenarioId: string;
|
|
1618
|
+
metadata?: Record<string, unknown>;
|
|
1619
|
+
}
|
|
1620
|
+
interface SteeringEvaluation {
|
|
1621
|
+
variant: SteeringBundle;
|
|
1622
|
+
example: OptimizationExample;
|
|
1623
|
+
trialIndex: number;
|
|
1624
|
+
}
|
|
1625
|
+
interface SteeringVariantReport {
|
|
1626
|
+
variantId: string;
|
|
1627
|
+
bundle: SteeringBundle;
|
|
1628
|
+
mean: number;
|
|
1629
|
+
ci95: {
|
|
1630
|
+
lower: number;
|
|
1631
|
+
upper: number;
|
|
1632
|
+
};
|
|
1633
|
+
scenarioScores: Record<string, {
|
|
1634
|
+
mean: number;
|
|
1635
|
+
n: number;
|
|
1636
|
+
samples: number[];
|
|
1637
|
+
}>;
|
|
1638
|
+
}
|
|
1639
|
+
interface OptimizationLoopResult {
|
|
1640
|
+
winner: SteeringBundle;
|
|
1641
|
+
significant: boolean;
|
|
1642
|
+
reports: SteeringVariantReport[];
|
|
1643
|
+
pairwise: Array<{
|
|
1644
|
+
variantA: string;
|
|
1645
|
+
variantB: string;
|
|
1646
|
+
pValue: number;
|
|
1647
|
+
qValue: number;
|
|
1648
|
+
significant: boolean;
|
|
1649
|
+
meanDelta: number;
|
|
1650
|
+
}>;
|
|
1651
|
+
}
|
|
1652
|
+
interface OptimizationLoopConfig {
|
|
1653
|
+
variants: SteeringBundle[];
|
|
1654
|
+
examples: OptimizationExample[];
|
|
1655
|
+
evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
|
|
1656
|
+
scoreWeights?: Partial<RunScoreWeights>;
|
|
1657
|
+
trialsPerScenario?: number;
|
|
1658
|
+
}
|
|
1659
|
+
declare class OptimizationLoop {
|
|
1660
|
+
private readonly optimizer;
|
|
1661
|
+
constructor(optimizer?: PromptOptimizer);
|
|
1662
|
+
run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
|
|
1663
|
+
}
|
|
1664
|
+
|
|
1665
|
+
type SteeringOptimizerBackend = 'pairwise' | 'ax-gepa';
|
|
1666
|
+
interface SteeringOptimizationRow {
|
|
1667
|
+
variantId: string;
|
|
1668
|
+
scenarioId: string;
|
|
1669
|
+
bundle: SteeringBundle;
|
|
1670
|
+
score: RunScore;
|
|
1671
|
+
metadata?: Record<string, unknown>;
|
|
1672
|
+
}
|
|
1673
|
+
interface SteeringOptimizationSelector {
|
|
1674
|
+
backend: SteeringOptimizerBackend;
|
|
1675
|
+
signature?: string;
|
|
1676
|
+
labels?: string[];
|
|
1677
|
+
rationale?: string;
|
|
1678
|
+
}
|
|
1679
|
+
interface SteeringOptimizationResult {
|
|
1680
|
+
backend: SteeringOptimizerBackend;
|
|
1681
|
+
recommendedVariantId: string;
|
|
1682
|
+
rationale: string;
|
|
1683
|
+
rankings: Array<{
|
|
1684
|
+
variantId: string;
|
|
1685
|
+
mean: number;
|
|
1686
|
+
runs: number;
|
|
1687
|
+
}>;
|
|
1688
|
+
selector?: SteeringOptimizationSelector;
|
|
1689
|
+
skipped?: boolean;
|
|
1690
|
+
}
|
|
1691
|
+
interface SteeringOptimizerConfig {
|
|
1692
|
+
weights?: Partial<RunScoreWeights>;
|
|
1693
|
+
}
|
|
1694
|
+
interface AxSteeringOptimizerConfig extends SteeringOptimizerConfig {
|
|
1695
|
+
provider: 'openai' | 'anthropic';
|
|
1696
|
+
apiKey: string;
|
|
1697
|
+
model: string;
|
|
1698
|
+
teacherModel?: string;
|
|
1699
|
+
minRows?: number;
|
|
1700
|
+
}
|
|
1701
|
+
declare class PairwiseSteeringOptimizer {
|
|
1702
|
+
optimize(rows: SteeringOptimizationRow[], config?: SteeringOptimizerConfig): SteeringOptimizationResult;
|
|
1703
|
+
}
|
|
1704
|
+
declare class AxGepaSteeringOptimizer {
|
|
1705
|
+
private readonly config;
|
|
1706
|
+
constructor(config: AxSteeringOptimizerConfig);
|
|
1707
|
+
optimize(rows: SteeringOptimizationRow[]): Promise<SteeringOptimizationResult>;
|
|
1708
|
+
}
|
|
1709
|
+
|
|
1609
1710
|
/**
|
|
1610
1711
|
* SandboxHarness — executes a scenario in an isolated environment and
|
|
1611
1712
|
* emits a rich SandboxSpan into the trace.
|
|
@@ -1689,113 +1790,434 @@ declare class SandboxHarness {
|
|
|
1689
1790
|
run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
|
|
1690
1791
|
}
|
|
1691
1792
|
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
*
|
|
1695
|
-
* This is the SWE-bench pattern generalized. The scenario ships:
|
|
1696
|
-
* - fixture data (setup instructions)
|
|
1697
|
-
* - a test command the harness runs
|
|
1698
|
-
* - optional assertion overrides
|
|
1699
|
-
*
|
|
1700
|
-
* The runner emits a run, delegates to SandboxHarness, records the
|
|
1701
|
-
* outcome, and returns a structured verdict. Consumers bind their own
|
|
1702
|
-
* agent execution to this contract.
|
|
1703
|
-
*/
|
|
1704
|
-
|
|
1705
|
-
interface TestGradedScenario {
|
|
1793
|
+
type SandboxJudgeKind = 'compiler' | 'test' | 'linter' | 'security';
|
|
1794
|
+
interface SandboxJudgeSpec {
|
|
1706
1795
|
id: string;
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
/** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
|
|
1710
|
-
passThreshold?: number;
|
|
1711
|
-
/** Provenance for dataset tracking. */
|
|
1712
|
-
datasetVersion?: string;
|
|
1713
|
-
/** Free-form tags (difficulty, category, etc.). */
|
|
1714
|
-
tags?: Record<string, string>;
|
|
1796
|
+
kind: SandboxJudgeKind;
|
|
1797
|
+
config: HarnessConfig;
|
|
1715
1798
|
}
|
|
1716
|
-
interface
|
|
1717
|
-
|
|
1799
|
+
interface SandboxJudgeResult {
|
|
1800
|
+
id: string;
|
|
1801
|
+
kind: SandboxJudgeKind;
|
|
1802
|
+
passed: boolean;
|
|
1803
|
+
score: number;
|
|
1804
|
+
summary: string;
|
|
1805
|
+
detail: SandboxHarnessResult;
|
|
1806
|
+
}
|
|
1807
|
+
interface JudgeFleetOptions {
|
|
1718
1808
|
driver?: SandboxDriver;
|
|
1719
|
-
|
|
1720
|
-
provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
1809
|
+
parallel?: boolean;
|
|
1721
1810
|
}
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
pass: boolean;
|
|
1727
|
-
score: number;
|
|
1728
|
-
failureClass?: FailureClass;
|
|
1811
|
+
declare class JudgeRunner {
|
|
1812
|
+
private readonly driver;
|
|
1813
|
+
constructor(driver?: SandboxDriver);
|
|
1814
|
+
run(spec: SandboxJudgeSpec): Promise<SandboxJudgeResult>;
|
|
1729
1815
|
}
|
|
1730
|
-
declare function
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
* `BudgetBreachError` when a cap is hit.
|
|
1736
|
-
*
|
|
1737
|
-
* Wraps a TraceEmitter. The emitter persists ledger entries + breach
|
|
1738
|
-
* events so the classifier, pipelines, and reports can all read
|
|
1739
|
-
* budget state from the trace corpus — no separate accounting.
|
|
1740
|
-
*/
|
|
1816
|
+
declare function runJudgeFleet(specs: SandboxJudgeSpec[], options?: JudgeFleetOptions): Promise<SandboxJudgeResult[]>;
|
|
1817
|
+
declare function compilerJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1818
|
+
declare function testJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1819
|
+
declare function linterJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1820
|
+
declare function securityJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1741
1821
|
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
constructor(dimension: keyof BudgetSpec, limit: number, attempted: number);
|
|
1822
|
+
interface HostedJudgeDimension {
|
|
1823
|
+
name: string;
|
|
1824
|
+
weight: number;
|
|
1825
|
+
rubric: string;
|
|
1747
1826
|
}
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1827
|
+
interface HostedJudgeConfig {
|
|
1828
|
+
model: string;
|
|
1829
|
+
mode?: 'llm' | 'sandbox' | 'composite';
|
|
1830
|
+
systemPrompt?: string;
|
|
1831
|
+
rubricTemplate?: string;
|
|
1832
|
+
temperature?: number;
|
|
1833
|
+
maxTurns?: number;
|
|
1834
|
+
tools?: string[];
|
|
1835
|
+
dimensions?: HostedJudgeDimension[];
|
|
1836
|
+
setupCommand?: string;
|
|
1837
|
+
scripts?: Record<string, string>;
|
|
1838
|
+
}
|
|
1839
|
+
interface HostedJudgeRequest {
|
|
1840
|
+
prompt: string;
|
|
1841
|
+
response: string;
|
|
1842
|
+
rubric?: string;
|
|
1843
|
+
reference?: string;
|
|
1844
|
+
judge: HostedJudgeConfig;
|
|
1759
1845
|
}
|
|
1846
|
+
interface HostedJudgeResponse {
|
|
1847
|
+
score: number;
|
|
1848
|
+
reasoning: string;
|
|
1849
|
+
cost: number;
|
|
1850
|
+
dimensions?: Array<{
|
|
1851
|
+
name: string;
|
|
1852
|
+
score: number;
|
|
1853
|
+
reasoning: string;
|
|
1854
|
+
}>;
|
|
1855
|
+
evidence?: Array<{
|
|
1856
|
+
type: string;
|
|
1857
|
+
content: string;
|
|
1858
|
+
}>;
|
|
1859
|
+
turns?: number;
|
|
1860
|
+
parseFailed?: boolean;
|
|
1861
|
+
rawOutput?: string;
|
|
1862
|
+
}
|
|
1863
|
+
interface HostedRunScoreRequest {
|
|
1864
|
+
trace: RunTrace;
|
|
1865
|
+
weights?: Partial<RunScoreWeights>;
|
|
1866
|
+
driftPatterns?: string[];
|
|
1867
|
+
}
|
|
1868
|
+
interface HostedRunScoreResponse {
|
|
1869
|
+
score: RunScore;
|
|
1870
|
+
aggregate: number;
|
|
1871
|
+
weights: RunScoreWeights;
|
|
1872
|
+
notes: string[];
|
|
1873
|
+
}
|
|
1874
|
+
type HostedRunCriticConfig = Pick<RunCriticOptions, 'weights'> & {
|
|
1875
|
+
driftPatterns?: string[];
|
|
1876
|
+
};
|
|
1760
1877
|
|
|
1761
1878
|
/**
|
|
1762
|
-
*
|
|
1879
|
+
* Dual-agent convergence bench.
|
|
1763
1880
|
*
|
|
1764
|
-
*
|
|
1765
|
-
*
|
|
1766
|
-
* the
|
|
1881
|
+
* Pattern lifted from tax-agent + legal-agent: two agents take turns until
|
|
1882
|
+
* they converge on a consensus artifact. One proposes, the other critiques;
|
|
1883
|
+
* the proposer revises; repeat until a score threshold is hit or max rounds.
|
|
1767
1884
|
*
|
|
1768
|
-
*
|
|
1769
|
-
*
|
|
1885
|
+
* Generalized so any two "agents" (gateways, local functions, anything with
|
|
1886
|
+
* `propose` + `critique`) compose in. Returns convergence rounds per
|
|
1887
|
+
* scenario + whether convergence happened.
|
|
1770
1888
|
*/
|
|
1771
|
-
|
|
1772
|
-
interface FailureContext {
|
|
1773
|
-
run: Run;
|
|
1774
|
-
spans: Span[];
|
|
1775
|
-
events: TraceEvent[];
|
|
1776
|
-
}
|
|
1777
|
-
interface FailureClassification {
|
|
1778
|
-
failureClass: FailureClass;
|
|
1779
|
-
reason: string;
|
|
1780
|
-
triggerSpanId?: string;
|
|
1781
|
-
triggerEventId?: string;
|
|
1782
|
-
}
|
|
1783
|
-
/** Ordered rules — first match wins. */
|
|
1784
|
-
interface FailureRule {
|
|
1889
|
+
interface DualAgentScenario {
|
|
1785
1890
|
id: string;
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
triggerSpanId?: string;
|
|
1790
|
-
triggerEventId?: string;
|
|
1791
|
-
} | null;
|
|
1891
|
+
initialPrompt: string;
|
|
1892
|
+
/** Optional context the agents can read (e.g. source documents). */
|
|
1893
|
+
context?: Record<string, unknown>;
|
|
1792
1894
|
}
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1895
|
+
interface DualAgentRound {
|
|
1896
|
+
roundIndex: number;
|
|
1897
|
+
proposal: string;
|
|
1898
|
+
critique: string;
|
|
1899
|
+
convergenceScore: number;
|
|
1900
|
+
}
|
|
1901
|
+
interface DualAgentScenarioResult {
|
|
1902
|
+
scenarioId: string;
|
|
1903
|
+
converged: boolean;
|
|
1904
|
+
roundsToConverge: number | null;
|
|
1905
|
+
finalProposal: string;
|
|
1906
|
+
history: DualAgentRound[];
|
|
1907
|
+
finalScore: number;
|
|
1908
|
+
}
|
|
1909
|
+
interface DualAgentBenchConfig {
|
|
1910
|
+
scenarios: DualAgentScenario[];
|
|
1911
|
+
maxRounds?: number;
|
|
1912
|
+
/** Convergence threshold in 0..1 (default 0.85). */
|
|
1913
|
+
convergenceThreshold?: number;
|
|
1914
|
+
/**
|
|
1915
|
+
* Propose an answer given the scenario + the critic's prior critique (if any).
|
|
1916
|
+
* Returns the proposal string.
|
|
1917
|
+
*/
|
|
1918
|
+
propose: (args: {
|
|
1919
|
+
scenario: DualAgentScenario;
|
|
1920
|
+
roundIndex: number;
|
|
1921
|
+
priorProposal?: string;
|
|
1922
|
+
priorCritique?: string;
|
|
1923
|
+
}) => Promise<string>;
|
|
1924
|
+
/**
|
|
1925
|
+
* Critique the proposer's current output. Returns a structured critique
|
|
1926
|
+
* (free text) plus a convergence score: how close the proposal is to
|
|
1927
|
+
* acceptable. 1.0 = accept, 0.0 = totally off.
|
|
1928
|
+
*/
|
|
1929
|
+
critique: (args: {
|
|
1930
|
+
scenario: DualAgentScenario;
|
|
1931
|
+
roundIndex: number;
|
|
1932
|
+
proposal: string;
|
|
1933
|
+
}) => Promise<{
|
|
1934
|
+
critique: string;
|
|
1935
|
+
convergenceScore: number;
|
|
1936
|
+
}>;
|
|
1937
|
+
/** Optional per-round hook for progress + tracing. */
|
|
1938
|
+
onRoundComplete?: (info: {
|
|
1939
|
+
scenarioId: string;
|
|
1940
|
+
round: DualAgentRound;
|
|
1941
|
+
}) => void;
|
|
1942
|
+
}
|
|
1943
|
+
interface DualAgentReport {
|
|
1944
|
+
scenarios: DualAgentScenarioResult[];
|
|
1945
|
+
aggregate: {
|
|
1946
|
+
convergenceRate: number;
|
|
1947
|
+
avgRoundsToConverge: number | null;
|
|
1948
|
+
avgFinalScore: number;
|
|
1949
|
+
};
|
|
1950
|
+
config: {
|
|
1951
|
+
maxRounds: number;
|
|
1952
|
+
convergenceThreshold: number;
|
|
1953
|
+
};
|
|
1954
|
+
}
|
|
1955
|
+
declare class DualAgentBench {
|
|
1956
|
+
run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
|
|
1957
|
+
}
|
|
1958
|
+
|
|
1959
|
+
/**
|
|
1960
|
+
* Propose / Verify / Review — the core multi-shot primitive.
|
|
1961
|
+
*
|
|
1962
|
+
* shot N: propose(state, priorReview) → new state
|
|
1963
|
+
* verify(state) → pass/fail, optional layers
|
|
1964
|
+
* review(state, verification, memory) → observations + next-shot
|
|
1965
|
+
* instruction + shouldContinue
|
|
1966
|
+
* memory.append(entry)
|
|
1967
|
+
*
|
|
1968
|
+
* Roles are strictly separated:
|
|
1969
|
+
*
|
|
1970
|
+
* - The WORKER is whatever the caller wraps in `propose`. It is
|
|
1971
|
+
* stateful — caller owns its resume/session mechanism.
|
|
1972
|
+
* - The VERIFIER grades the state. It produces the ground truth.
|
|
1973
|
+
* The reviewer cannot overturn or downgrade a verification layer.
|
|
1974
|
+
* - The REVIEWER is stateless per call. Its continuity is the
|
|
1975
|
+
* `ReviewMemoryStore` — durable JSONL by default, or any store
|
|
1976
|
+
* implementing the interface. It reads memory + trace summary +
|
|
1977
|
+
* verification and directs the NEXT proposer shot.
|
|
1978
|
+
*
|
|
1979
|
+
* This shape is load-bearing. The reviewer never grades; the verifier
|
|
1980
|
+
* never directs. Two processes, two prompts, two concerns — which is
|
|
1981
|
+
* what keeps the loop from confirmation-biasing itself into "all
|
|
1982
|
+
* passed" when it didn't.
|
|
1983
|
+
*
|
|
1984
|
+
* Short-circuits and soft-fails are both first-class:
|
|
1985
|
+
* - verify.pass === true → reviewer LLM call is skipped, memory
|
|
1986
|
+
* records a success entry, loop exits.
|
|
1987
|
+
* - review throws → the shot still counts; the loop uses the
|
|
1988
|
+
* last-known instruction (or `fallbackInstruction`) for the next
|
|
1989
|
+
* propose call. A transient reviewer failure must NEVER abort a
|
|
1990
|
+
* valid arc.
|
|
1991
|
+
*
|
|
1992
|
+
* Composable: `propose` itself can be another `runProposeReview` call.
|
|
1993
|
+
* That's the dogfooding path — a harness built on this primitive is in
|
|
1994
|
+
* turn evaluable by it.
|
|
1995
|
+
*/
|
|
1996
|
+
|
|
1997
|
+
interface Verification {
|
|
1998
|
+
pass: boolean;
|
|
1999
|
+
score?: number;
|
|
2000
|
+
failingLayers?: string[];
|
|
2001
|
+
details?: unknown;
|
|
2002
|
+
}
|
|
2003
|
+
interface Review {
|
|
2004
|
+
observations: string;
|
|
2005
|
+
diagnosis: string;
|
|
2006
|
+
nextShotInstruction: string;
|
|
2007
|
+
shouldContinue: boolean;
|
|
2008
|
+
confidence: number;
|
|
2009
|
+
}
|
|
2010
|
+
interface ReviewMemoryEntry extends Review {
|
|
2011
|
+
shot: number;
|
|
2012
|
+
timestamp: number;
|
|
2013
|
+
verification: {
|
|
2014
|
+
pass: boolean;
|
|
2015
|
+
score?: number;
|
|
2016
|
+
failingLayers?: string[];
|
|
2017
|
+
};
|
|
2018
|
+
}
|
|
2019
|
+
interface ProposeInput<State> {
|
|
2020
|
+
shot: number;
|
|
2021
|
+
goal: string;
|
|
2022
|
+
state: State;
|
|
2023
|
+
priorReview: Review | null;
|
|
2024
|
+
abortSignal: AbortSignal;
|
|
2025
|
+
emitter?: TraceEmitter;
|
|
2026
|
+
}
|
|
2027
|
+
interface ProposeOutput<State, Summary = unknown> {
|
|
2028
|
+
state: State;
|
|
2029
|
+
traceSummary?: Summary;
|
|
2030
|
+
}
|
|
2031
|
+
interface ReviewInput<State, Summary = unknown> {
|
|
2032
|
+
shot: number;
|
|
2033
|
+
goal: string;
|
|
2034
|
+
state: State;
|
|
2035
|
+
verification: Verification;
|
|
2036
|
+
traceSummary: Summary | undefined;
|
|
2037
|
+
memory: ReviewMemoryEntry[];
|
|
2038
|
+
}
|
|
2039
|
+
type ProposeFn<State, Summary = unknown> = (input: ProposeInput<State>) => Promise<ProposeOutput<State, Summary>>;
|
|
2040
|
+
type VerifyFn<State> = (state: State) => Promise<Verification>;
|
|
2041
|
+
type ReviewFn<State, Summary = unknown> = (input: ReviewInput<State, Summary>) => Promise<Review>;
|
|
2042
|
+
interface ReviewMemoryStore {
|
|
2043
|
+
load(): Promise<ReviewMemoryEntry[]>;
|
|
2044
|
+
append(entry: ReviewMemoryEntry): Promise<void>;
|
|
2045
|
+
}
|
|
2046
|
+
interface ProposeReviewConfig<State, Summary = unknown> {
|
|
2047
|
+
goal: string;
|
|
2048
|
+
initialState: State;
|
|
2049
|
+
propose: ProposeFn<State, Summary>;
|
|
2050
|
+
verify: VerifyFn<State>;
|
|
2051
|
+
review: ReviewFn<State, Summary>;
|
|
2052
|
+
/** Hard shot cap. Default 10. */
|
|
2053
|
+
maxShots?: number;
|
|
2054
|
+
/** Wall-clock cap in ms. Default 10 min. */
|
|
2055
|
+
maxWallMs?: number;
|
|
2056
|
+
/**
|
|
2057
|
+
* If the reviewer returns confidence ≤ floor on `confidenceFloorWindow`
|
|
2058
|
+
* consecutive shots, terminate early. Default floor 0.3, window 2.
|
|
2059
|
+
* Set window to 0 or floor to <0 to disable.
|
|
2060
|
+
*/
|
|
2061
|
+
confidenceFloor?: number;
|
|
2062
|
+
confidenceFloorWindow?: number;
|
|
2063
|
+
/** Defaults to an in-memory store if omitted. */
|
|
2064
|
+
memory?: ReviewMemoryStore;
|
|
2065
|
+
/** If provided, emit a Run + per-shot spans. */
|
|
2066
|
+
store?: TraceStore;
|
|
2067
|
+
scenarioId?: string;
|
|
2068
|
+
projectId?: string;
|
|
2069
|
+
variantId?: string;
|
|
2070
|
+
/**
|
|
2071
|
+
* Used when the reviewer soft-fails on shot 1 (no prior instruction to
|
|
2072
|
+
* fall back to). Default is a generic "inspect failures and fix".
|
|
2073
|
+
*/
|
|
2074
|
+
fallbackInstruction?: string;
|
|
2075
|
+
}
|
|
2076
|
+
interface ProposeReviewShot<State, Summary = unknown> {
|
|
2077
|
+
shot: number;
|
|
2078
|
+
state: State;
|
|
2079
|
+
verification: Verification;
|
|
2080
|
+
traceSummary: Summary | undefined;
|
|
2081
|
+
review: Review;
|
|
2082
|
+
reviewAvailable: boolean;
|
|
2083
|
+
reviewError?: string;
|
|
2084
|
+
durationMs: number;
|
|
2085
|
+
}
|
|
2086
|
+
interface ProposeReviewReport<State, Summary = unknown> {
|
|
2087
|
+
runId: string | null;
|
|
2088
|
+
completed: boolean;
|
|
2089
|
+
shots: ProposeReviewShot<State, Summary>[];
|
|
2090
|
+
finalState: State;
|
|
2091
|
+
finalVerification: Verification;
|
|
2092
|
+
failureClass?: FailureClass;
|
|
2093
|
+
wallMs: number;
|
|
2094
|
+
score: number;
|
|
2095
|
+
}
|
|
2096
|
+
declare function inMemoryReviewStore(initial?: ReviewMemoryEntry[]): ReviewMemoryStore;
|
|
2097
|
+
declare function jsonlReviewStore(path: string): ReviewMemoryStore;
|
|
2098
|
+
declare function runProposeReview<State, Summary = unknown>(config: ProposeReviewConfig<State, Summary>): Promise<ProposeReviewReport<State, Summary>>;
|
|
2099
|
+
interface LlmJsonCall {
|
|
2100
|
+
(req: {
|
|
2101
|
+
system: string;
|
|
2102
|
+
user: string;
|
|
2103
|
+
}): Promise<unknown>;
|
|
2104
|
+
}
|
|
2105
|
+
interface LlmReviewerConfig<State, Summary = unknown> {
|
|
2106
|
+
callJson: LlmJsonCall;
|
|
2107
|
+
renderState?: (state: State) => string;
|
|
2108
|
+
renderTraceSummary?: (summary: Summary | undefined) => string;
|
|
2109
|
+
/** Appended to the default system prompt. */
|
|
2110
|
+
systemPromptAddendum?: string;
|
|
2111
|
+
}
|
|
2112
|
+
declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
|
|
2113
|
+
|
|
2114
|
+
/**
|
|
2115
|
+
* TestGradedScenario — a scenario whose score comes from a test suite.
|
|
2116
|
+
*
|
|
2117
|
+
* This is the SWE-bench pattern generalized. The scenario ships:
|
|
2118
|
+
* - fixture data (setup instructions)
|
|
2119
|
+
* - a test command the harness runs
|
|
2120
|
+
* - optional assertion overrides
|
|
2121
|
+
*
|
|
2122
|
+
* The runner emits a run, delegates to SandboxHarness, records the
|
|
2123
|
+
* outcome, and returns a structured verdict. Consumers bind their own
|
|
2124
|
+
* agent execution to this contract.
|
|
2125
|
+
*/
|
|
2126
|
+
|
|
2127
|
+
interface TestGradedScenario {
|
|
2128
|
+
id: string;
|
|
2129
|
+
description?: string;
|
|
2130
|
+
harness: HarnessConfig;
|
|
2131
|
+
/** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
|
|
2132
|
+
passThreshold?: number;
|
|
2133
|
+
/** Provenance for dataset tracking. */
|
|
2134
|
+
datasetVersion?: string;
|
|
2135
|
+
/** Free-form tags (difficulty, category, etc.). */
|
|
2136
|
+
tags?: Record<string, string>;
|
|
2137
|
+
}
|
|
2138
|
+
interface TestGradedRunOptions {
|
|
2139
|
+
variantId?: string;
|
|
2140
|
+
driver?: SandboxDriver;
|
|
2141
|
+
/** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed). */
|
|
2142
|
+
provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
2143
|
+
}
|
|
2144
|
+
interface TestGradedRunResult {
|
|
2145
|
+
runId: string;
|
|
2146
|
+
scenario: TestGradedScenario;
|
|
2147
|
+
harness: SandboxHarnessResult;
|
|
2148
|
+
pass: boolean;
|
|
2149
|
+
score: number;
|
|
2150
|
+
failureClass?: FailureClass;
|
|
2151
|
+
}
|
|
2152
|
+
declare function runTestGradedScenario(scenario: TestGradedScenario, store: TraceStore, options?: TestGradedRunOptions): Promise<TestGradedRunResult>;
|
|
2153
|
+
|
|
2154
|
+
/**
|
|
2155
|
+
* BudgetGuard — enforces token / wall-clock / call / $ caps, records
|
|
2156
|
+
* a ledger entry on every decrement, emits `budget_breach` + throws
|
|
2157
|
+
* `BudgetBreachError` when a cap is hit.
|
|
2158
|
+
*
|
|
2159
|
+
* Wraps a TraceEmitter. The emitter persists ledger entries + breach
|
|
2160
|
+
* events so the classifier, pipelines, and reports can all read
|
|
2161
|
+
* budget state from the trace corpus — no separate accounting.
|
|
2162
|
+
*/
|
|
2163
|
+
|
|
2164
|
+
declare class BudgetBreachError extends Error {
|
|
2165
|
+
dimension: keyof BudgetSpec;
|
|
2166
|
+
limit: number;
|
|
2167
|
+
attempted: number;
|
|
2168
|
+
constructor(dimension: keyof BudgetSpec, limit: number, attempted: number);
|
|
2169
|
+
}
|
|
2170
|
+
declare class BudgetGuard {
|
|
2171
|
+
private consumed;
|
|
2172
|
+
private emitter;
|
|
2173
|
+
private budget;
|
|
2174
|
+
private startedAt;
|
|
2175
|
+
constructor(emitter: TraceEmitter, budget: BudgetSpec, now?: () => number);
|
|
2176
|
+
/** Record consumption. Throws `BudgetBreachError` if any dimension exceeds its cap. */
|
|
2177
|
+
charge(delta: Partial<Record<keyof BudgetSpec, number>>, spanId?: string): Promise<void>;
|
|
2178
|
+
/** Convenience: advance wall-clock budget based on elapsed wall time. */
|
|
2179
|
+
tickWall(nowMs: number, spanId?: string): Promise<void>;
|
|
2180
|
+
get state(): Record<keyof BudgetSpec, number>;
|
|
2181
|
+
}
|
|
2182
|
+
|
|
2183
|
+
/**
|
|
2184
|
+
* Failure taxonomy — canonical classes + a default classifier.
|
|
2185
|
+
*
|
|
2186
|
+
* Every failed run should end up in a named class. The classifier here
|
|
2187
|
+
* is rule-based (fast, deterministic); an LLM fallback can be added by
|
|
2188
|
+
* the consumer for novel cases and trained into the rule base over time.
|
|
2189
|
+
*
|
|
2190
|
+
* Consumers call `classifyFailure(run, spans, events)` and persist the
|
|
2191
|
+
* returned class as `Run.outcome.failureClass`.
|
|
2192
|
+
*/
|
|
2193
|
+
|
|
2194
|
+
interface FailureContext {
|
|
2195
|
+
run: Run;
|
|
2196
|
+
spans: Span[];
|
|
2197
|
+
events: TraceEvent[];
|
|
2198
|
+
}
|
|
2199
|
+
interface FailureClassification {
|
|
2200
|
+
failureClass: FailureClass;
|
|
2201
|
+
reason: string;
|
|
2202
|
+
triggerSpanId?: string;
|
|
2203
|
+
triggerEventId?: string;
|
|
2204
|
+
}
|
|
2205
|
+
/** Ordered rules — first match wins. */
|
|
2206
|
+
interface FailureRule {
|
|
2207
|
+
id: string;
|
|
2208
|
+
match: (ctx: FailureContext) => {
|
|
2209
|
+
failureClass: FailureClass;
|
|
2210
|
+
reason: string;
|
|
2211
|
+
triggerSpanId?: string;
|
|
2212
|
+
triggerEventId?: string;
|
|
2213
|
+
} | null;
|
|
2214
|
+
}
|
|
2215
|
+
declare const DEFAULT_RULES: FailureRule[];
|
|
2216
|
+
/** Classify the failure mode of a run using an ordered rule list. */
|
|
2217
|
+
declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
|
|
2218
|
+
|
|
2219
|
+
/**
|
|
2220
|
+
* Trajectory — ordered, structured view over a run's spans.
|
|
1799
2221
|
*
|
|
1800
2222
|
* A pure function `buildTrajectory(store, runId) → Trajectory` returns
|
|
1801
2223
|
* a topologically ordered list of `TrajectoryStep` with parent-child
|
|
@@ -3102,10 +3524,25 @@ declare function resumeBuilderSession(store: TraceStore, projectId: string): Pro
|
|
|
3102
3524
|
* is the highest-leverage signal the framework computes — if
|
|
3103
3525
|
* meta_score doesn't predict runtime_score, the builder's self-scoring
|
|
3104
3526
|
* is broken.
|
|
3527
|
+
*
|
|
3528
|
+
* Scaffold-only mode: when a project has no `app-runtime` runs (e.g. a
|
|
3529
|
+
* scaffold-builder eval that grades compose + build without driving a
|
|
3530
|
+
* runtime scenario), `kind` is `'scaffold-only'` and `complete` measures
|
|
3531
|
+
* meta + build only. Consumers can tell the two apart without having to
|
|
3532
|
+
* interpret null-runtime as either "not yet computed" or "N/A for this
|
|
3533
|
+
* project shape".
|
|
3105
3534
|
*/
|
|
3106
3535
|
|
|
3536
|
+
type ProjectKind = 'full' | 'scaffold-only';
|
|
3107
3537
|
interface ThreeLayerProjectReport {
|
|
3108
3538
|
projectId: string;
|
|
3539
|
+
/**
|
|
3540
|
+
* `'full'` when the project has at least one `app-runtime` run;
|
|
3541
|
+
* `'scaffold-only'` when it only has meta + build layers. Lets
|
|
3542
|
+
* downstream consumers treat a null runtime score as expected
|
|
3543
|
+
* (scaffold-only) vs. missing (full, pipeline broke).
|
|
3544
|
+
*/
|
|
3545
|
+
kind: ProjectKind;
|
|
3109
3546
|
builderRunId?: string;
|
|
3110
3547
|
/** Judge-verdict score on the builder run (0..1 after normalization). */
|
|
3111
3548
|
metaScore: number | null;
|
|
@@ -3113,10 +3550,14 @@ interface ThreeLayerProjectReport {
|
|
|
3113
3550
|
/** 0..1 from the sandbox harness (testsPassed / testsTotal). */
|
|
3114
3551
|
buildScore: number | null;
|
|
3115
3552
|
appRuntimeRunIds: string[];
|
|
3116
|
-
/** Mean of outcome.score over app-runtime runs, 0..1. */
|
|
3553
|
+
/** Mean of outcome.score over app-runtime runs, 0..1. Always null in scaffold-only mode. */
|
|
3117
3554
|
runtimeScore: number | null;
|
|
3118
3555
|
runtimePassRate: number | null;
|
|
3119
|
-
/**
|
|
3556
|
+
/**
|
|
3557
|
+
* Layer-aware completeness:
|
|
3558
|
+
* - `kind='full'`: all three layers scored
|
|
3559
|
+
* - `kind='scaffold-only'`: meta + build scored (runtime not applicable)
|
|
3560
|
+
*/
|
|
3120
3561
|
complete: boolean;
|
|
3121
3562
|
}
|
|
3122
3563
|
declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>;
|
|
@@ -3205,4 +3646,931 @@ declare class ProjectRegistry {
|
|
|
3205
3646
|
projectChats(projectId: string): Promise<ChatSummary[]>;
|
|
3206
3647
|
}
|
|
3207
3648
|
|
|
3208
|
-
|
|
3649
|
+
/**
|
|
3650
|
+
* OutcomeStore — deployment outcomes attached to Run IDs.
|
|
3651
|
+
*
|
|
3652
|
+
* Outcomes arrive asynchronously from production telemetry after the
|
|
3653
|
+
* eval run completed: user ratings, retention flags, conversion events,
|
|
3654
|
+
* revenue, support-ticket rate, anything a product team can measure.
|
|
3655
|
+
* The store is a peer to TraceStore — separate lifecycle, same runId
|
|
3656
|
+
* foreign key.
|
|
3657
|
+
*
|
|
3658
|
+
* The whole point of this module is to make the meta-eval correlation
|
|
3659
|
+
* question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
|
|
3660
|
+
*/
|
|
3661
|
+
interface DeploymentOutcome {
|
|
3662
|
+
runId: string;
|
|
3663
|
+
capturedAt: number;
|
|
3664
|
+
/** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
|
|
3665
|
+
metrics: Record<string, number>;
|
|
3666
|
+
/** Dimensions for stratified analysis — cohort, region, user_segment. */
|
|
3667
|
+
labels?: Record<string, string>;
|
|
3668
|
+
/** Free-form provenance (source system, pipeline version). */
|
|
3669
|
+
source?: string;
|
|
3670
|
+
}
|
|
3671
|
+
interface OutcomeFilter {
|
|
3672
|
+
runIds?: string[];
|
|
3673
|
+
since?: number;
|
|
3674
|
+
until?: number;
|
|
3675
|
+
label?: {
|
|
3676
|
+
key: string;
|
|
3677
|
+
value: string;
|
|
3678
|
+
};
|
|
3679
|
+
source?: string;
|
|
3680
|
+
}
|
|
3681
|
+
interface OutcomeStore {
|
|
3682
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
3683
|
+
/** All outcomes attached to this run (a single run can have many — multiple
|
|
3684
|
+
* capture windows over deployment time). */
|
|
3685
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
3686
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
3687
|
+
}
|
|
3688
|
+
declare class InMemoryOutcomeStore implements OutcomeStore {
|
|
3689
|
+
private items;
|
|
3690
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
3691
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
3692
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
3693
|
+
}
|
|
3694
|
+
interface FileSystemOutcomeStoreOptions {
|
|
3695
|
+
dir: string;
|
|
3696
|
+
maxBytes?: number;
|
|
3697
|
+
}
|
|
3698
|
+
declare class FileSystemOutcomeStore implements OutcomeStore {
|
|
3699
|
+
private dir;
|
|
3700
|
+
private maxBytes;
|
|
3701
|
+
private memo?;
|
|
3702
|
+
private loaded;
|
|
3703
|
+
constructor(options: FileSystemOutcomeStoreOptions);
|
|
3704
|
+
private ensureDir;
|
|
3705
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
3706
|
+
private load;
|
|
3707
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
3708
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
3709
|
+
}
|
|
3710
|
+
|
|
3711
|
+
/**
|
|
3712
|
+
* Correlation study — "does our eval score predict real-world outcomes?"
|
|
3713
|
+
*
|
|
3714
|
+
* This is the load-bearing signal. Takes a TraceStore + OutcomeStore,
|
|
3715
|
+
* joins on runId, computes Pearson + Spearman + bootstrap CI for every
|
|
3716
|
+
* (evalMetric, outcomeMetric) pair the caller declares.
|
|
3717
|
+
*
|
|
3718
|
+
* Without this number the framework is ornamental. With it and r > 0.6
|
|
3719
|
+
* the framework is a moat — no other agent-eval tool publishes one.
|
|
3720
|
+
*/
|
|
3721
|
+
|
|
3722
|
+
interface EvalMetricSpec {
|
|
3723
|
+
id: string;
|
|
3724
|
+
/** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
|
|
3725
|
+
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
3726
|
+
}
|
|
3727
|
+
interface OutcomePair {
|
|
3728
|
+
evalMetric: string;
|
|
3729
|
+
outcomeMetric: string;
|
|
3730
|
+
}
|
|
3731
|
+
interface CorrelationResult {
|
|
3732
|
+
evalMetric: string;
|
|
3733
|
+
outcomeMetric: string;
|
|
3734
|
+
n: number;
|
|
3735
|
+
pearson: number;
|
|
3736
|
+
spearman: number;
|
|
3737
|
+
/** 95% bootstrap CI for Pearson. */
|
|
3738
|
+
pearsonCi95: {
|
|
3739
|
+
lower: number;
|
|
3740
|
+
upper: number;
|
|
3741
|
+
};
|
|
3742
|
+
/** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */
|
|
3743
|
+
verdict: 'strong' | 'moderate' | 'weak';
|
|
3744
|
+
}
|
|
3745
|
+
interface CorrelationStudyResult {
|
|
3746
|
+
pairs: CorrelationResult[];
|
|
3747
|
+
joinedSamples: number;
|
|
3748
|
+
skippedRuns: number;
|
|
3749
|
+
}
|
|
3750
|
+
interface CorrelationStudyOptions {
|
|
3751
|
+
/** Only join outcomes captured within this window after run.startedAt. */
|
|
3752
|
+
maxCaptureLagMs?: number;
|
|
3753
|
+
/** Restrict to a subset of outcomes (cohort, region, source). */
|
|
3754
|
+
outcomeFilter?: OutcomeFilter;
|
|
3755
|
+
/** Which outcome per run to use when multiple exist. Default 'latest'. */
|
|
3756
|
+
reduction?: 'latest' | 'mean' | 'max';
|
|
3757
|
+
/** Bootstrap iterations for the CI. Default 500. */
|
|
3758
|
+
bootstrapIterations?: number;
|
|
3759
|
+
}
|
|
3760
|
+
declare function correlationStudy(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetrics: EvalMetricSpec[], outcomeMetricNames: string[], options?: CorrelationStudyOptions): Promise<CorrelationStudyResult>;
|
|
3761
|
+
|
|
3762
|
+
/**
|
|
3763
|
+
* Calibration curve — binned "if eval says X, what does reality show?"
|
|
3764
|
+
*
|
|
3765
|
+
* Companion to correlationStudy. Raw correlation is a single number;
|
|
3766
|
+
* the calibration curve shows *where* the eval is well-calibrated vs
|
|
3767
|
+
* overconfident / underconfident. Buckets the eval metric, computes
|
|
3768
|
+
* mean outcome per bucket, reports expected-calibration-error (ECE).
|
|
3769
|
+
*/
|
|
3770
|
+
|
|
3771
|
+
interface CalibrationBin {
|
|
3772
|
+
lower: number;
|
|
3773
|
+
upper: number;
|
|
3774
|
+
n: number;
|
|
3775
|
+
evalMean: number;
|
|
3776
|
+
outcomeMean: number;
|
|
3777
|
+
/** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */
|
|
3778
|
+
gap: number;
|
|
3779
|
+
}
|
|
3780
|
+
interface CalibrationReport {
|
|
3781
|
+
evalMetric: string;
|
|
3782
|
+
outcomeMetric: string;
|
|
3783
|
+
n: number;
|
|
3784
|
+
bins: CalibrationBin[];
|
|
3785
|
+
/** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */
|
|
3786
|
+
ece: number;
|
|
3787
|
+
/** Max bin gap — upper bound on miscalibration. */
|
|
3788
|
+
maxGap: number;
|
|
3789
|
+
}
|
|
3790
|
+
interface CalibrationOptions {
|
|
3791
|
+
bins?: number;
|
|
3792
|
+
/** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */
|
|
3793
|
+
binning?: 'equal-width' | 'equal-frequency';
|
|
3794
|
+
/** Clip eval values to [lo, hi] before binning. */
|
|
3795
|
+
range?: {
|
|
3796
|
+
lo: number;
|
|
3797
|
+
hi: number;
|
|
3798
|
+
};
|
|
3799
|
+
}
|
|
3800
|
+
declare function calibrationCurve(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetric: EvalMetricSpec, outcomeMetric: string, options?: CalibrationOptions): Promise<CalibrationReport | null>;
|
|
3801
|
+
|
|
3802
|
+
/**
|
|
3803
|
+
* Process Reward Modeling — per-step rubric grading.
|
|
3804
|
+
*
|
|
3805
|
+
* A StepRubric inspects one span and returns a score + rationale.
|
|
3806
|
+
* PrmGrader applies an array of rubrics to every LLM span in a
|
|
3807
|
+
* trajectory (consumers can broaden to tool/retrieval spans via the
|
|
3808
|
+
* `kinds` filter on each rubric).
|
|
3809
|
+
*
|
|
3810
|
+
* Why this matters: outcome-only eval (did the final artifact work?)
|
|
3811
|
+
* gives sparse reward — most agent turns are unattributable. PRMs
|
|
3812
|
+
* densify the signal so optimizers and RL fine-tuning can assign
|
|
3813
|
+
* credit per turn.
|
|
3814
|
+
*/
|
|
3815
|
+
|
|
3816
|
+
interface StepContext {
|
|
3817
|
+
trajectory: Trajectory;
|
|
3818
|
+
step: TrajectoryStep;
|
|
3819
|
+
/** Steps preceding `step` in trajectory order. */
|
|
3820
|
+
prior: TrajectoryStep[];
|
|
3821
|
+
/** Steps following `step`. */
|
|
3822
|
+
next: TrajectoryStep[];
|
|
3823
|
+
}
|
|
3824
|
+
interface StepRubric {
|
|
3825
|
+
id: string;
|
|
3826
|
+
/** Only grade spans of these kinds (default: all). */
|
|
3827
|
+
kinds?: Array<Span['kind']>;
|
|
3828
|
+
/** Weight in the aggregate score. Default 1. */
|
|
3829
|
+
weight?: number;
|
|
3830
|
+
/** Returns score in 0..1 + optional rationale/evidence. Return `null` to
|
|
3831
|
+
* skip grading (rubric doesn't apply to this step). */
|
|
3832
|
+
grade: (ctx: StepContext) => Promise<{
|
|
3833
|
+
score: number;
|
|
3834
|
+
rationale?: string;
|
|
3835
|
+
evidence?: string;
|
|
3836
|
+
} | null>;
|
|
3837
|
+
}
|
|
3838
|
+
interface GradedStep {
|
|
3839
|
+
spanId: string;
|
|
3840
|
+
rubricId: string;
|
|
3841
|
+
score: number;
|
|
3842
|
+
weight: number;
|
|
3843
|
+
rationale?: string;
|
|
3844
|
+
evidence?: string;
|
|
3845
|
+
}
|
|
3846
|
+
interface PrmGradedTrace {
|
|
3847
|
+
runId: string;
|
|
3848
|
+
steps: GradedStep[];
|
|
3849
|
+
/** Weighted mean of all graded steps; 0..1. */
|
|
3850
|
+
aggregateScore: number;
|
|
3851
|
+
/** Number of spans graded — useful for sanity-checking coverage. */
|
|
3852
|
+
gradedCount: number;
|
|
3853
|
+
/** Number of spans in the trajectory that no rubric matched. */
|
|
3854
|
+
ungradedCount: number;
|
|
3855
|
+
}
|
|
3856
|
+
declare class PrmGrader {
|
|
3857
|
+
private rubrics;
|
|
3858
|
+
constructor(rubrics: StepRubric[]);
|
|
3859
|
+
/**
|
|
3860
|
+
* Grade every eligible span in a run. Emits a JudgeVerdict span for each
|
|
3861
|
+
* (rubric × span) verdict so the result is visible to downstream pipelines
|
|
3862
|
+
* (judgeAgreementView, etc.) — PRM is just "a judge that runs per span."
|
|
3863
|
+
*/
|
|
3864
|
+
grade(store: TraceStore, runId: string): Promise<PrmGradedTrace>;
|
|
3865
|
+
}
|
|
3866
|
+
/** Helper: reads JudgeVerdict spans that PRM emitted so downstream pipelines
|
|
3867
|
+
* can distinguish PRM verdicts from human or top-level LLM judges. */
|
|
3868
|
+
declare function isPrmVerdict(verdict: JudgeSpan): boolean;
|
|
3869
|
+
|
|
3870
|
+
/**
|
|
3871
|
+
* Built-in reference rubrics. Consumers combine these with domain
|
|
3872
|
+
* rubrics. All are deterministic, rule-based — cheap to run + easy
|
|
3873
|
+
* to unit-test. LLM-based rubrics are trivially authored by
|
|
3874
|
+
* following the StepRubric contract.
|
|
3875
|
+
*/
|
|
3876
|
+
|
|
3877
|
+
/** Penalize very short or very long assistant outputs. */
|
|
3878
|
+
declare function outputLengthRubric(args?: {
|
|
3879
|
+
minChars?: number;
|
|
3880
|
+
maxChars?: number;
|
|
3881
|
+
weight?: number;
|
|
3882
|
+
}): StepRubric;
|
|
3883
|
+
/** Reward tool calls that succeeded (status='ok') with an informative result. */
|
|
3884
|
+
declare function toolSuccessRubric(args?: {
|
|
3885
|
+
weight?: number;
|
|
3886
|
+
}): StepRubric;
|
|
3887
|
+
/** Penalize tool calls that duplicate a prior call with identical args. */
|
|
3888
|
+
declare function toolNonRedundantRubric(args?: {
|
|
3889
|
+
weight?: number;
|
|
3890
|
+
}): StepRubric;
|
|
3891
|
+
/** Penalize LLM outputs that contain common refusal markers when a refusal
|
|
3892
|
+
* is NOT expected (caller inverts weight for scenarios where refusal IS expected). */
|
|
3893
|
+
declare function nonRefusalRubric(args?: {
|
|
3894
|
+
markers?: RegExp[];
|
|
3895
|
+
weight?: number;
|
|
3896
|
+
}): StepRubric;
|
|
3897
|
+
/** Reward outputs that invoke the next-step tool the trajectory actually uses
|
|
3898
|
+
* (i.e. the LLM span announced "I will call X" and the following tool span IS X). */
|
|
3899
|
+
declare function toolIntentAlignmentRubric(args?: {
|
|
3900
|
+
weight?: number;
|
|
3901
|
+
}): StepRubric;
|
|
3902
|
+
|
|
3903
|
+
/**
|
|
3904
|
+
* Export PRM-graded traces as training data for downstream reward-model
|
|
3905
|
+
* fine-tuning. Canonical format is NDJSON of
|
|
3906
|
+
* `{ trajectory_text, step_index, rubric, score }` so a small model can
|
|
3907
|
+
* learn to predict step rewards from step context.
|
|
3908
|
+
*
|
|
3909
|
+
* The framework doesn't train the model — we emit the data; callers
|
|
3910
|
+
* plug it into their preferred trainer (TRL, Unsloth, custom).
|
|
3911
|
+
*/
|
|
3912
|
+
|
|
3913
|
+
interface PrmTrainingSample {
|
|
3914
|
+
runId: string;
|
|
3915
|
+
spanId: string;
|
|
3916
|
+
rubricId: string;
|
|
3917
|
+
score: number;
|
|
3918
|
+
/** Serialized step context — step + surrounding conversation. */
|
|
3919
|
+
context: {
|
|
3920
|
+
priorTurns: Array<{
|
|
3921
|
+
role: string;
|
|
3922
|
+
content: string;
|
|
3923
|
+
}>;
|
|
3924
|
+
step: {
|
|
3925
|
+
kind: Span['kind'];
|
|
3926
|
+
text: string;
|
|
3927
|
+
};
|
|
3928
|
+
};
|
|
3929
|
+
/** Optional evidence + rationale for auditability. */
|
|
3930
|
+
rationale?: string;
|
|
3931
|
+
evidence?: string;
|
|
3932
|
+
}
|
|
3933
|
+
declare function exportTrainingData(store: TraceStore, graded: PrmGradedTrace[], options?: {
|
|
3934
|
+
contextWindow?: number;
|
|
3935
|
+
}): Promise<PrmTrainingSample[]>;
|
|
3936
|
+
/** NDJSON serialization — write to file or stream directly to a trainer. */
|
|
3937
|
+
declare function toNdjson(samples: PrmTrainingSample[]): string;
|
|
3938
|
+
|
|
3939
|
+
/**
|
|
3940
|
+
* Inference-time PRM scoring — pick the best of N candidate trajectories
|
|
3941
|
+
* using a trained reward model (or a rule-based PRM as a proxy).
|
|
3942
|
+
*
|
|
3943
|
+
* The canonical Best-of-N pattern: generate N completions, score each
|
|
3944
|
+
* with a PRM, pick the winner. Here the scoring loop is framework-agnostic
|
|
3945
|
+
* — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.
|
|
3946
|
+
*/
|
|
3947
|
+
|
|
3948
|
+
interface BestOfNResult {
|
|
3949
|
+
winner: PrmGradedTrace;
|
|
3950
|
+
ranked: PrmGradedTrace[];
|
|
3951
|
+
/** Standard deviation of aggregate scores — small = candidates were homogeneous. */
|
|
3952
|
+
stdDev: number;
|
|
3953
|
+
}
|
|
3954
|
+
declare function prmBestOfN(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<BestOfNResult>;
|
|
3955
|
+
/**
|
|
3956
|
+
* Weighted vote across multiple graders — use when you want a PRM ensemble
|
|
3957
|
+
* (e.g. rule-based + LLM-based + trained model). Each grader produces its
|
|
3958
|
+
* own ranking; we aggregate via rank-sum (Borda count) so no single grader
|
|
3959
|
+
* dominates via a different score scale.
|
|
3960
|
+
*/
|
|
3961
|
+
declare function prmEnsembleBestOfN(store: TraceStore, graders: PrmGrader[], runIds: string[]): Promise<BestOfNResult>;
|
|
3962
|
+
|
|
3963
|
+
/**
|
|
3964
|
+
* Bisector — auto-locate the change that introduced an eval regression.
|
|
3965
|
+
*
|
|
3966
|
+
* Two shapes:
|
|
3967
|
+
* - `commitBisect` — walk an ordered SHA list, binary-search for the
|
|
3968
|
+
* first commit that fails.
|
|
3969
|
+
* - `promptBisect` — given a good and bad prompt, progressively port
|
|
3970
|
+
* paragraphs from bad into good to localize the breaking change.
|
|
3971
|
+
*
|
|
3972
|
+
* Generic `bisect<T>` lets callers drive any ordered state space
|
|
3973
|
+
* (dataset versions, config files, CLI flag combinations).
|
|
3974
|
+
*/
|
|
3975
|
+
interface BisectOptions<T> {
|
|
3976
|
+
/** State known to pass. */
|
|
3977
|
+
good: T;
|
|
3978
|
+
/** State known to fail. */
|
|
3979
|
+
bad: T;
|
|
3980
|
+
/** Equality test on state values — default Object.is. */
|
|
3981
|
+
equals?: (a: T, b: T) => boolean;
|
|
3982
|
+
/** Pick the halfway state between good + bad. Return null when no further
|
|
3983
|
+
* split is possible (e.g. adjacent commits). */
|
|
3984
|
+
halfway: (good: T, bad: T) => T | null;
|
|
3985
|
+
/** Produce a verdict for a state. */
|
|
3986
|
+
runEval: (state: T) => Promise<{
|
|
3987
|
+
score: number;
|
|
3988
|
+
pass: boolean;
|
|
3989
|
+
}>;
|
|
3990
|
+
/** Hard cap on iterations (default 40 — covers ~1T ordered states). */
|
|
3991
|
+
maxIterations?: number;
|
|
3992
|
+
}
|
|
3993
|
+
interface BisectStep<T> {
|
|
3994
|
+
state: T;
|
|
3995
|
+
score: number;
|
|
3996
|
+
pass: boolean;
|
|
3997
|
+
}
|
|
3998
|
+
interface BisectResult<T> {
|
|
3999
|
+
/** The first bad state — typically `bad` in the final (good, bad) adjacent pair. */
|
|
4000
|
+
culprit: T;
|
|
4001
|
+
/** Ordered trace of all states evaluated. */
|
|
4002
|
+
path: BisectStep<T>[];
|
|
4003
|
+
/** True when we narrowed to an adjacent (good, bad) pair. */
|
|
4004
|
+
converged: boolean;
|
|
4005
|
+
/** True when `good` itself failed or `bad` itself passed — the caller's
|
|
4006
|
+
* premise was broken. */
|
|
4007
|
+
inputInconsistent: boolean;
|
|
4008
|
+
}
|
|
4009
|
+
declare function bisect<T>(options: BisectOptions<T>): Promise<BisectResult<T>>;
|
|
4010
|
+
/**
|
|
4011
|
+
* Commit bisect — `commits` is an ordered SHA list, oldest to newest.
|
|
4012
|
+
* `good` and `bad` must both be present in the list.
|
|
4013
|
+
*/
|
|
4014
|
+
declare function commitBisect(options: {
|
|
4015
|
+
commits: string[];
|
|
4016
|
+
good: string;
|
|
4017
|
+
bad: string;
|
|
4018
|
+
runEval: (sha: string) => Promise<{
|
|
4019
|
+
score: number;
|
|
4020
|
+
pass: boolean;
|
|
4021
|
+
}>;
|
|
4022
|
+
maxIterations?: number;
|
|
4023
|
+
}): Promise<BisectResult<string>>;
|
|
4024
|
+
/**
|
|
4025
|
+
* Prompt bisect — splits the good and bad prompts into paragraphs, then
|
|
4026
|
+
* progressively replaces paragraphs in `good` with their counterparts
|
|
4027
|
+
* from `bad` to localize the offending change. Only works when the two
|
|
4028
|
+
* prompts have the same paragraph count (a common editorial workflow
|
|
4029
|
+
* constraint — one paragraph = one change unit).
|
|
4030
|
+
*/
|
|
4031
|
+
declare function promptBisect(options: {
|
|
4032
|
+
good: string;
|
|
4033
|
+
bad: string;
|
|
4034
|
+
runEval: (prompt: string) => Promise<{
|
|
4035
|
+
score: number;
|
|
4036
|
+
pass: boolean;
|
|
4037
|
+
}>;
|
|
4038
|
+
maxIterations?: number;
|
|
4039
|
+
paragraphSplitter?: (prompt: string) => string[];
|
|
4040
|
+
}): Promise<BisectResult<string> & {
|
|
4041
|
+
offendingParagraphIndex?: number;
|
|
4042
|
+
}>;
|
|
4043
|
+
|
|
4044
|
+
/**
|
|
4045
|
+
* Counterfactual replay — "what would have happened if we'd changed
|
|
4046
|
+
* exactly one thing at turn N?"
|
|
4047
|
+
*
|
|
4048
|
+
* The framework does NOT drive the agent — it sets up the replay
|
|
4049
|
+
* context (prior spans, prior state, mutation spec) and records the
|
|
4050
|
+
* resulting divergence. Consumers supply an `executeFrom(ctx)` callback
|
|
4051
|
+
* that runs their agent starting from turn N with the mutation applied.
|
|
4052
|
+
*
|
|
4053
|
+
* Counterfactual runs are recorded as a new Run with `layer='meta'` and
|
|
4054
|
+
* `parentRunId = originalRunId`, so downstream diff + correlation
|
|
4055
|
+
* pipelines see them natively.
|
|
4056
|
+
*/
|
|
4057
|
+
|
|
4058
|
+
type CounterfactualMutation = {
|
|
4059
|
+
kind: 'swap-model';
|
|
4060
|
+
at: number;
|
|
4061
|
+
newModel: string;
|
|
4062
|
+
} | {
|
|
4063
|
+
kind: 'swap-tool-result';
|
|
4064
|
+
at: number;
|
|
4065
|
+
newResult: unknown;
|
|
4066
|
+
} | {
|
|
4067
|
+
kind: 'truncate-after';
|
|
4068
|
+
at: number;
|
|
4069
|
+
} | {
|
|
4070
|
+
kind: 'inject-system-message';
|
|
4071
|
+
at: number;
|
|
4072
|
+
content: string;
|
|
4073
|
+
} | {
|
|
4074
|
+
kind: 'custom';
|
|
4075
|
+
at: number;
|
|
4076
|
+
describe: string;
|
|
4077
|
+
apply: (step: TrajectoryStep) => TrajectoryStep;
|
|
4078
|
+
};
|
|
4079
|
+
interface CounterfactualContext {
|
|
4080
|
+
originalRunId: string;
|
|
4081
|
+
originalTrajectory: Trajectory;
|
|
4082
|
+
/** Steps up to (but not including) the mutation point — the prefix the
|
|
4083
|
+
* replayed agent inherits as its prior conversation/tool history. */
|
|
4084
|
+
prefix: TrajectoryStep[];
|
|
4085
|
+
mutation: CounterfactualMutation;
|
|
4086
|
+
/** Pre-applied mutation on the step at `mutation.at`. Consumers use this
|
|
4087
|
+
* as the FIRST step the replayed agent emits (they decide whether to
|
|
4088
|
+
* re-emit it or continue from there). */
|
|
4089
|
+
mutatedStep: TrajectoryStep;
|
|
4090
|
+
}
|
|
4091
|
+
interface CounterfactualResult {
|
|
4092
|
+
counterfactualRunId: string;
|
|
4093
|
+
originalRunId: string;
|
|
4094
|
+
mutation: CounterfactualMutation;
|
|
4095
|
+
/** Structured delta summary — caller can extend via scoring. */
|
|
4096
|
+
delta: {
|
|
4097
|
+
originalOutcomeScore: number | null;
|
|
4098
|
+
counterfactualOutcomeScore: number | null;
|
|
4099
|
+
deltaScore: number | null;
|
|
4100
|
+
};
|
|
4101
|
+
}
|
|
4102
|
+
interface CounterfactualRunner {
|
|
4103
|
+
/**
|
|
4104
|
+
* Execute the agent from `ctx.prefix` with the mutation applied.
|
|
4105
|
+
* MUST emit spans into the provided emitter so they become part of
|
|
4106
|
+
* the counterfactual run. MUST call emitter.endRun() with a verdict.
|
|
4107
|
+
*/
|
|
4108
|
+
executeFrom: (ctx: CounterfactualContext, emitter: TraceEmitter) => Promise<void>;
|
|
4109
|
+
}
|
|
4110
|
+
declare function runCounterfactual(store: TraceStore, originalRunId: string, mutation: CounterfactualMutation, runner: CounterfactualRunner): Promise<CounterfactualResult>;
|
|
4111
|
+
/**
|
|
4112
|
+
* Aggregate a batch of counterfactuals into a simple attribution table:
|
|
4113
|
+
* which mutation kinds move outcomes most? (Useful when you run a grid
|
|
4114
|
+
* over the same trajectory — swap-model at every llm span, swap-tool
|
|
4115
|
+
* at every tool span — and want a ranked summary.)
|
|
4116
|
+
*/
|
|
4117
|
+
declare function attributeCounterfactuals(results: CounterfactualResult[]): Array<{
|
|
4118
|
+
mutationKind: CounterfactualMutation['kind'];
|
|
4119
|
+
n: number;
|
|
4120
|
+
meanAbsDelta: number;
|
|
4121
|
+
meanSignedDelta: number;
|
|
4122
|
+
}>;
|
|
4123
|
+
|
|
4124
|
+
/**
|
|
4125
|
+
* Full cross-trace diff — align two trajectories step-by-step, report
|
|
4126
|
+
* per-step score deltas, attribute a variant's total outcome lead to
|
|
4127
|
+
* specific turns.
|
|
4128
|
+
*
|
|
4129
|
+
* 0.5 shipped `firstDivergenceView` (finds the first differing step).
|
|
4130
|
+
* This does the heavier work: full alignment via LCS, per-step
|
|
4131
|
+
* contribution to score delta using PRM verdicts when available,
|
|
4132
|
+
* fallback to structural heuristics (latency, token count, tool
|
|
4133
|
+
* outcome) otherwise.
|
|
4134
|
+
*/
|
|
4135
|
+
|
|
4136
|
+
type AlignmentOp = {
|
|
4137
|
+
op: 'match';
|
|
4138
|
+
a: TrajectoryStep;
|
|
4139
|
+
b: TrajectoryStep;
|
|
4140
|
+
} | {
|
|
4141
|
+
op: 'insert';
|
|
4142
|
+
b: TrajectoryStep;
|
|
4143
|
+
} | {
|
|
4144
|
+
op: 'delete';
|
|
4145
|
+
a: TrajectoryStep;
|
|
4146
|
+
} | {
|
|
4147
|
+
op: 'replace';
|
|
4148
|
+
a: TrajectoryStep;
|
|
4149
|
+
b: TrajectoryStep;
|
|
4150
|
+
};
|
|
4151
|
+
interface StepAttribution {
|
|
4152
|
+
op: AlignmentOp;
|
|
4153
|
+
/** Difference in PRM score (or null when not scored by a matching judge). */
|
|
4154
|
+
prmDelta: number | null;
|
|
4155
|
+
/** Difference in latency (endedAt - startedAt). */
|
|
4156
|
+
latencyDeltaMs: number | null;
|
|
4157
|
+
/** Difference in token count (LLM spans). */
|
|
4158
|
+
tokenDelta: number | null;
|
|
4159
|
+
/** Reason this step is / isn't considered a contributor to the outcome delta. */
|
|
4160
|
+
note: string;
|
|
4161
|
+
}
|
|
4162
|
+
interface CrossTraceDiff {
|
|
4163
|
+
runA: string;
|
|
4164
|
+
runB: string;
|
|
4165
|
+
alignment: AlignmentOp[];
|
|
4166
|
+
attributions: StepAttribution[];
|
|
4167
|
+
/** Total score delta (B - A). */
|
|
4168
|
+
totalScoreDelta: number | null;
|
|
4169
|
+
/** Sum of PRM deltas across matched/replaced steps. Close to
|
|
4170
|
+
* `totalScoreDelta` when PRM covers the trajectory; gap indicates
|
|
4171
|
+
* unmodeled variance. */
|
|
4172
|
+
prmDeltaSum: number;
|
|
4173
|
+
}
|
|
4174
|
+
interface CrossTraceDiffOptions {
|
|
4175
|
+
stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
|
|
4176
|
+
}
|
|
4177
|
+
declare function crossTraceDiff(store: TraceStore, runA: string, runB: string, options?: CrossTraceDiffOptions): Promise<CrossTraceDiff>;
|
|
4178
|
+
|
|
4179
|
+
/**
|
|
4180
|
+
* Pre-registered hypotheses — declare what you're testing BEFORE the
|
|
4181
|
+
* run, check it AFTER. Prevents p-hacking, optional stopping, and the
|
|
4182
|
+
* "we ran until it looked good" failure mode.
|
|
4183
|
+
*
|
|
4184
|
+
* Manifest is a plain JSON-friendly object. Sign it with a content hash
|
|
4185
|
+
* + timestamp; the registered record becomes immutable. Post-run,
|
|
4186
|
+
* evaluate the manifest against observed results — the library refuses
|
|
4187
|
+
* to let you re-interpret a different metric as the declared one.
|
|
4188
|
+
*/
|
|
4189
|
+
interface HypothesisManifest {
|
|
4190
|
+
id: string;
|
|
4191
|
+
/** Human prose — goes into the audit trail. */
|
|
4192
|
+
hypothesis: string;
|
|
4193
|
+
/** Metric the hypothesis claims to move. */
|
|
4194
|
+
metric: string;
|
|
4195
|
+
/** 'increase' = candidate should score higher than baseline; 'decrease' = lower. */
|
|
4196
|
+
direction: 'increase' | 'decrease';
|
|
4197
|
+
/** Minimum effect size to count (same units as the metric). */
|
|
4198
|
+
minEffect: number;
|
|
4199
|
+
/** Alpha threshold. */
|
|
4200
|
+
alpha: number;
|
|
4201
|
+
/** Target statistical power at which sample size was pre-computed. */
|
|
4202
|
+
power: number;
|
|
4203
|
+
/** Declared N per arm before running. */
|
|
4204
|
+
preRegisteredN: number;
|
|
4205
|
+
/** ISO8601 timestamp the manifest was registered. */
|
|
4206
|
+
registeredAt: string;
|
|
4207
|
+
/** Optional identifiers to tie into the trace corpus. */
|
|
4208
|
+
baselineLabel?: string;
|
|
4209
|
+
candidateLabel?: string;
|
|
4210
|
+
}
|
|
4211
|
+
interface SignedManifest extends HypothesisManifest {
|
|
4212
|
+
/** sha256 hex of canonicalized manifest (everything except contentHash). */
|
|
4213
|
+
contentHash: string;
|
|
4214
|
+
}
|
|
4215
|
+
interface HypothesisResult {
|
|
4216
|
+
manifest: SignedManifest;
|
|
4217
|
+
observedN: number;
|
|
4218
|
+
observedEffect: number;
|
|
4219
|
+
observedPValue: number;
|
|
4220
|
+
/** True iff the observed effect hits the pre-declared direction with
|
|
4221
|
+
* magnitude ≥ minEffect AND p < alpha. */
|
|
4222
|
+
confirmed: boolean;
|
|
4223
|
+
/** Enumerated reasons the hypothesis was rejected (each a machine-tag). */
|
|
4224
|
+
rejectionReasons: Array<'wrong_direction' | 'effect_too_small' | 'not_significant' | 'undersampled'>;
|
|
4225
|
+
notes?: string;
|
|
4226
|
+
}
|
|
4227
|
+
declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
|
|
4228
|
+
/** Verify that a signed manifest has not been tampered with. */
|
|
4229
|
+
declare function verifyManifest(m: SignedManifest): Promise<boolean>;
|
|
4230
|
+
/**
|
|
4231
|
+
* Evaluate a pre-registered hypothesis against observed results.
|
|
4232
|
+
* Mechanical — no re-interpretation permitted.
|
|
4233
|
+
*/
|
|
4234
|
+
declare function evaluateHypothesis(manifest: SignedManifest, observed: {
|
|
4235
|
+
n: number;
|
|
4236
|
+
effect: number;
|
|
4237
|
+
pValue: number;
|
|
4238
|
+
}): Promise<HypothesisResult>;
|
|
4239
|
+
|
|
4240
|
+
/**
|
|
4241
|
+
* Self-play scenario evolution — agents generate adversarial scenarios
|
|
4242
|
+
* against each other; survivors become part of the eval corpus.
|
|
4243
|
+
*
|
|
4244
|
+
* Framework-agnostic about how scenarios are generated. Caller supplies:
|
|
4245
|
+
* - `propose`: asks a "proposer" agent for candidate scenarios
|
|
4246
|
+
* - `scoreAgainst`: runs a target agent against a scenario and returns
|
|
4247
|
+
* its score
|
|
4248
|
+
*
|
|
4249
|
+
* A scenario *survives* if it reveals a meaningful score difference
|
|
4250
|
+
* between two target agents (or between a target agent and itself on
|
|
4251
|
+
* different runs). Survivors are promoted to a Dataset; the caller
|
|
4252
|
+
* decides what to do with them (hold-out, training, regression set).
|
|
4253
|
+
*
|
|
4254
|
+
* Guard rails: minimum absolute score delta to consider a scenario
|
|
4255
|
+
* informative; floor on absolute target score so degenerate break-all
|
|
4256
|
+
* scenarios (noise, gibberish) don't flood the corpus.
|
|
4257
|
+
*/
|
|
4258
|
+
|
|
4259
|
+
interface CandidateScenario {
|
|
4260
|
+
id: string;
|
|
4261
|
+
payload: unknown;
|
|
4262
|
+
/** Free-form tags (domain, generation, parent). */
|
|
4263
|
+
tags?: Record<string, string>;
|
|
4264
|
+
}
|
|
4265
|
+
interface ScoredTarget {
|
|
4266
|
+
targetId: string;
|
|
4267
|
+
score: number;
|
|
4268
|
+
}
|
|
4269
|
+
interface EvolutionRound {
|
|
4270
|
+
round: number;
|
|
4271
|
+
proposed: CandidateScenario[];
|
|
4272
|
+
survived: CandidateScenario[];
|
|
4273
|
+
rejected: Array<{
|
|
4274
|
+
candidate: CandidateScenario;
|
|
4275
|
+
reason: string;
|
|
4276
|
+
}>;
|
|
4277
|
+
scoredBreakdown: Array<{
|
|
4278
|
+
candidate: CandidateScenario;
|
|
4279
|
+
scores: ScoredTarget[];
|
|
4280
|
+
spread: number;
|
|
4281
|
+
}>;
|
|
4282
|
+
}
|
|
4283
|
+
interface SelfPlayOptions {
|
|
4284
|
+
/** Minimum score spread across targets for a scenario to survive. Default 0.1. */
|
|
4285
|
+
minSpread?: number;
|
|
4286
|
+
/** Minimum floor score across targets — keeps degenerate break-all scenarios
|
|
4287
|
+
* out. Default 0.1 (if every target scores below this, discard). */
|
|
4288
|
+
minAbsoluteFloor?: number;
|
|
4289
|
+
/** Hard cap on survivors per round. Default 50. */
|
|
4290
|
+
maxSurvivors?: number;
|
|
4291
|
+
/** Rounds to run. Default 1. Each round's survivors can be fed back into
|
|
4292
|
+
* `propose` to compound. */
|
|
4293
|
+
rounds?: number;
|
|
4294
|
+
/** Seed for scenario id generation if proposer doesn't provide one. */
|
|
4295
|
+
seed?: number;
|
|
4296
|
+
}
|
|
4297
|
+
interface SelfPlayProposer {
|
|
4298
|
+
propose(round: number, priorSurvivors: CandidateScenario[]): Promise<CandidateScenario[]>;
|
|
4299
|
+
}
|
|
4300
|
+
interface SelfPlayScorer {
|
|
4301
|
+
/** Score one candidate against every target; returns parallel array. */
|
|
4302
|
+
scoreCandidate(candidate: CandidateScenario, targets: string[]): Promise<ScoredTarget[]>;
|
|
4303
|
+
}
|
|
4304
|
+
declare function runSelfPlay(proposer: SelfPlayProposer, scorer: SelfPlayScorer, targets: string[], options?: SelfPlayOptions): Promise<{
|
|
4305
|
+
rounds: EvolutionRound[];
|
|
4306
|
+
dataset: Dataset;
|
|
4307
|
+
}>;
|
|
4308
|
+
|
|
4309
|
+
/**
|
|
4310
|
+
* Causal attribution via factorial experiments.
|
|
4311
|
+
*
|
|
4312
|
+
* Run every combination of {model × prompt × scenario × seed}, then
|
|
4313
|
+
* decompose observed score variance into main effects + interactions.
|
|
4314
|
+
* Moves from correlational "variant B is better" to causal "the model
|
|
4315
|
+
* swap accounts for 42% of the lead; the prompt change accounts for 28%;
|
|
4316
|
+
* interaction is 30%."
|
|
4317
|
+
*
|
|
4318
|
+
* Minimal implementation: 2-way factorial (two factors at a time) with
|
|
4319
|
+
* main-effect + interaction decomposition via variance of cell means.
|
|
4320
|
+
* Consumers run the factorial design themselves (we don't schedule
|
|
4321
|
+
* runs); this module consumes the (factorLevels, observedScores)
|
|
4322
|
+
* table and does the attribution math.
|
|
4323
|
+
*/
|
|
4324
|
+
interface FactorialCell {
|
|
4325
|
+
/** Map factor name → level id. e.g. { model: 'claude', prompt: 'v2' } */
|
|
4326
|
+
levels: Record<string, string>;
|
|
4327
|
+
/** Observed score for this cell (mean over replications if n > 1). */
|
|
4328
|
+
score: number;
|
|
4329
|
+
/** Number of replications averaged to produce `score`. */
|
|
4330
|
+
n: number;
|
|
4331
|
+
}
|
|
4332
|
+
interface FactorContribution {
|
|
4333
|
+
factor: string;
|
|
4334
|
+
/** Variance attributed to this factor's main effect, as a fraction of total. */
|
|
4335
|
+
shareOfVariance: number;
|
|
4336
|
+
/** Range of cell means across levels of this factor. */
|
|
4337
|
+
range: number;
|
|
4338
|
+
}
|
|
4339
|
+
interface InteractionContribution {
|
|
4340
|
+
factors: [string, string];
|
|
4341
|
+
shareOfVariance: number;
|
|
4342
|
+
}
|
|
4343
|
+
interface CausalAttributionReport {
|
|
4344
|
+
totalVariance: number;
|
|
4345
|
+
mainEffects: FactorContribution[];
|
|
4346
|
+
interactions: InteractionContribution[];
|
|
4347
|
+
/** Residual = variance unexplained by main effects + modeled interactions. */
|
|
4348
|
+
residualShare: number;
|
|
4349
|
+
/** Sanity: shares sum to 1 (within fp). */
|
|
4350
|
+
sharesSum: number;
|
|
4351
|
+
}
|
|
4352
|
+
declare function causalAttribution(cells: FactorialCell[]): CausalAttributionReport;
|
|
4353
|
+
|
|
4354
|
+
/**
|
|
4355
|
+
* Active learning — agent-as-scenario-author.
|
|
4356
|
+
*
|
|
4357
|
+
* Analyzes an existing Dataset + trace corpus for coverage gaps and
|
|
4358
|
+
* weak spots, returns a prioritized list of *synthesis targets*:
|
|
4359
|
+
* (gap description, existing-neighbor examples, suggested direction).
|
|
4360
|
+
*
|
|
4361
|
+
* Does NOT call an LLM itself — the proposer agent is caller-supplied.
|
|
4362
|
+
* This module's job is to identify WHERE new scenarios would compound
|
|
4363
|
+
* the most information, not to author them.
|
|
4364
|
+
*
|
|
4365
|
+
* Gaps we detect:
|
|
4366
|
+
* - dimensions with high score variance (unstable, need more data)
|
|
4367
|
+
* - dimensions with low coverage count (undersampled)
|
|
4368
|
+
* - failure classes with clusters (systematic weakness)
|
|
4369
|
+
* - difficulty bins with no coverage
|
|
4370
|
+
*/
|
|
4371
|
+
|
|
4372
|
+
type SynthesisReason = 'high-variance' | 'undersampled' | 'failure-cluster' | 'difficulty-gap';
|
|
4373
|
+
interface SynthesisTarget {
|
|
4374
|
+
reason: SynthesisReason;
|
|
4375
|
+
description: string;
|
|
4376
|
+
/** Existing scenarios that are closest to the gap; caller feeds these to
|
|
4377
|
+
* their LLM proposer as few-shot examples. */
|
|
4378
|
+
neighbors: DatasetScenario[];
|
|
4379
|
+
/** Suggested direction — e.g. "harder variants", "edge cases of X", "failure class Y". */
|
|
4380
|
+
direction: string;
|
|
4381
|
+
/** Priority score — higher = more information-dense gap. 0..1. */
|
|
4382
|
+
priority: number;
|
|
4383
|
+
}
|
|
4384
|
+
interface ActiveLearningOptions {
|
|
4385
|
+
/** Minimum scenarios per difficulty band to count as "covered". */
|
|
4386
|
+
minPerBand?: number;
|
|
4387
|
+
/** Variance threshold above which a scenario's dimension is "unstable". */
|
|
4388
|
+
varianceThreshold?: number;
|
|
4389
|
+
/** Max synthesis targets returned. */
|
|
4390
|
+
topK?: number;
|
|
4391
|
+
}
|
|
4392
|
+
declare function proposeSynthesisTargets(dataset: Dataset, traceStore: TraceStore, options?: ActiveLearningOptions): Promise<SynthesisTarget[]>;
|
|
4393
|
+
|
|
4394
|
+
/**
|
|
4395
|
+
* Reward-model export — the productizable wrapper around PRM training
|
|
4396
|
+
* data. Takes a TraceStore + PrmGrader, produces an embeddable
|
|
4397
|
+
* inference scorer that customers plug into their own agent stack.
|
|
4398
|
+
*
|
|
4399
|
+
* Two export forms:
|
|
4400
|
+
* - `exportRewardModel(store, grader, runIds)` — serializes the (step-context,
|
|
4401
|
+
* score) corpus to a framework-agnostic payload. Customer fine-tunes
|
|
4402
|
+
* their own model; we ship the scaffolding.
|
|
4403
|
+
* - `loadScorerFromGrader(grader)` — a zero-deps "reward model"
|
|
4404
|
+
* that literally replays the trained rubric at inference time. Works
|
|
4405
|
+
* as a reference baseline + deterministic fallback.
|
|
4406
|
+
*/
|
|
4407
|
+
|
|
4408
|
+
interface ExportedRewardModel {
|
|
4409
|
+
/** Version of the export format. Bump when payload shape changes. */
|
|
4410
|
+
version: '1.0';
|
|
4411
|
+
/** Metadata about the training corpus. */
|
|
4412
|
+
metadata: {
|
|
4413
|
+
nTraces: number;
|
|
4414
|
+
nSamples: number;
|
|
4415
|
+
rubrics: string[];
|
|
4416
|
+
exportedAt: string;
|
|
4417
|
+
/** Mean reward across training corpus — use as sanity check at load. */
|
|
4418
|
+
meanReward: number;
|
|
4419
|
+
};
|
|
4420
|
+
/** NDJSON training payload suitable for most fine-tuning frameworks. */
|
|
4421
|
+
trainingNdjson: string;
|
|
4422
|
+
}
|
|
4423
|
+
declare function exportRewardModel(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<ExportedRewardModel>;
|
|
4424
|
+
/**
|
|
4425
|
+
* Zero-deps inference scorer — apply a grader to a trajectory and return
|
|
4426
|
+
* its aggregate score. This is the "reward model" customers embed when
|
|
4427
|
+
* they don't want (or can't) fine-tune one. Deterministic + portable.
|
|
4428
|
+
*/
|
|
4429
|
+
interface InferenceScorer {
|
|
4430
|
+
/** Score a completed trajectory. Higher is better. */
|
|
4431
|
+
score(trajectory: Trajectory, store: TraceStore): Promise<number>;
|
|
4432
|
+
metadata: {
|
|
4433
|
+
rubrics: string[];
|
|
4434
|
+
deterministic: true;
|
|
4435
|
+
};
|
|
4436
|
+
}
|
|
4437
|
+
declare function loadScorerFromGrader(grader: PrmGrader): InferenceScorer;
|
|
4438
|
+
/**
|
|
4439
|
+
* Replay a trace corpus through a scorer — produces the canonical
|
|
4440
|
+
* "what would this reward model have said about every run?" table.
|
|
4441
|
+
* Callers use this to validate a trained model against the training
|
|
4442
|
+
* corpus (expect high agreement; drift indicates overfitting).
|
|
4443
|
+
*/
|
|
4444
|
+
declare function replayScorerOverCorpus(store: TraceStore, scorer: InferenceScorer, runIds: string[]): Promise<Array<{
|
|
4445
|
+
runId: string;
|
|
4446
|
+
score: number;
|
|
4447
|
+
outcomeScore: number | null;
|
|
4448
|
+
}>>;
|
|
4449
|
+
|
|
4450
|
+
/**
|
|
4451
|
+
* Governance reporting — shared types.
|
|
4452
|
+
*
|
|
4453
|
+
* The framework collects a `GovernanceContext` (traces + outcomes +
|
|
4454
|
+
* dataset manifests + red-team results + judge calibration) and each
|
|
4455
|
+
* specific template (NIST AI RMF, SOC2, EU AI Act) renders a
|
|
4456
|
+
* structured report from it.
|
|
4457
|
+
*
|
|
4458
|
+
* Reports are machine-readable JSON first; human-readable Markdown is a
|
|
4459
|
+
* pure transform on top. External auditors consume the Markdown; CI
|
|
4460
|
+
* consumes the JSON.
|
|
4461
|
+
*/
|
|
4462
|
+
|
|
4463
|
+
interface GovernanceContext {
|
|
4464
|
+
/** Legal / org identity for the report. */
|
|
4465
|
+
organization: string;
|
|
4466
|
+
/** System / agent identifier. */
|
|
4467
|
+
systemName: string;
|
|
4468
|
+
/** ISO8601 period the report covers. */
|
|
4469
|
+
periodStart: string;
|
|
4470
|
+
periodEnd: string;
|
|
4471
|
+
/** Versioned dataset manifests used during the period. */
|
|
4472
|
+
datasets: DatasetManifest[];
|
|
4473
|
+
traceStore: TraceStore;
|
|
4474
|
+
outcomeStore?: OutcomeStore;
|
|
4475
|
+
/** Cached red-team results for the period, if available. */
|
|
4476
|
+
redTeam?: RedTeamReport;
|
|
4477
|
+
/** Judge-vs-human calibration results, if measured. */
|
|
4478
|
+
judgeCalibration?: CalibrationResult[];
|
|
4479
|
+
/** Responsible owner for the system — role + name + email. */
|
|
4480
|
+
owner: {
|
|
4481
|
+
role: string;
|
|
4482
|
+
name: string;
|
|
4483
|
+
email: string;
|
|
4484
|
+
};
|
|
4485
|
+
}
|
|
4486
|
+
interface GovernanceFinding {
|
|
4487
|
+
id: string;
|
|
4488
|
+
severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
|
|
4489
|
+
/** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
|
|
4490
|
+
control: string;
|
|
4491
|
+
summary: string;
|
|
4492
|
+
evidence?: string;
|
|
4493
|
+
remediation?: string;
|
|
4494
|
+
}
|
|
4495
|
+
interface GovernanceReport {
|
|
4496
|
+
framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
|
|
4497
|
+
version: string;
|
|
4498
|
+
context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
|
|
4499
|
+
summary: {
|
|
4500
|
+
findings: number;
|
|
4501
|
+
bySeverity: Record<GovernanceFinding['severity'], number>;
|
|
4502
|
+
overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
|
|
4503
|
+
};
|
|
4504
|
+
findings: GovernanceFinding[];
|
|
4505
|
+
/** Framework-specific structured payload (mapped controls, risk class, etc.). */
|
|
4506
|
+
payload: Record<string, unknown>;
|
|
4507
|
+
generatedAt: string;
|
|
4508
|
+
}
|
|
4509
|
+
declare function renderMarkdown(report: GovernanceReport): string;
|
|
4510
|
+
declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
|
|
4511
|
+
|
|
4512
|
+
/**
|
|
4513
|
+
* NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
|
|
4514
|
+
*
|
|
4515
|
+
* Each subcategory derives its status from concrete framework state:
|
|
4516
|
+
* MEASURE 2.x: do we have a calibration regime? contamination controls?
|
|
4517
|
+
* MEASURE 2.7: are red-team results available?
|
|
4518
|
+
* MANAGE 1.x: are outcome metrics captured? correlation measured?
|
|
4519
|
+
* GOVERN 1.x: dataset + prompt provenance recorded?
|
|
4520
|
+
*
|
|
4521
|
+
* We ship the mapping and the derivation rules; consumers supply the
|
|
4522
|
+
* GovernanceContext.
|
|
4523
|
+
*/
|
|
4524
|
+
|
|
4525
|
+
declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
4526
|
+
|
|
4527
|
+
/**
|
|
4528
|
+
* SOC 2 — Common Criteria 7 (system operations + change management)
|
|
4529
|
+
* audit trail derived from the trace corpus.
|
|
4530
|
+
*
|
|
4531
|
+
* This is NOT a formal SOC2 report — that requires an external
|
|
4532
|
+
* auditor. What we ship is the machine-readable *evidence* package
|
|
4533
|
+
* that an auditor consumes: run counts, deploy events, access log
|
|
4534
|
+
* summary, anomaly tracking, response-time SLOs.
|
|
4535
|
+
*/
|
|
4536
|
+
|
|
4537
|
+
declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
4538
|
+
|
|
4539
|
+
/**
|
|
4540
|
+
* EU AI Act — risk-class classification + compliance checklist.
|
|
4541
|
+
*
|
|
4542
|
+
* Classification is declarative: caller supplies the domain/use-case
|
|
4543
|
+
* signals (biometric? critical infrastructure? education? employment?
|
|
4544
|
+
* access to services?) and we map to the Act's risk tiers:
|
|
4545
|
+
* - "unacceptable" (prohibited)
|
|
4546
|
+
* - "high" (Annex III — strict obligations)
|
|
4547
|
+
* - "limited" (transparency obligations)
|
|
4548
|
+
* - "minimal" (voluntary codes of conduct)
|
|
4549
|
+
*
|
|
4550
|
+
* Then the compliance checklist enumerates Article 9 (risk mgmt),
|
|
4551
|
+
* 10 (data + data governance), 11 (technical documentation), 13
|
|
4552
|
+
* (transparency), 14 (human oversight), 15 (accuracy + robustness)
|
|
4553
|
+
* requirements and flags gaps.
|
|
4554
|
+
*/
|
|
4555
|
+
|
|
4556
|
+
type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
|
|
4557
|
+
interface UseCaseSignals {
|
|
4558
|
+
/** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
|
|
4559
|
+
biometricPublic?: boolean;
|
|
4560
|
+
/** Social scoring by public authorities? (Art. 5). */
|
|
4561
|
+
socialScoring?: boolean;
|
|
4562
|
+
/** Subliminal manipulation? (Art. 5). */
|
|
4563
|
+
subliminal?: boolean;
|
|
4564
|
+
/** Annex III sector: critical infrastructure / education / employment /
|
|
4565
|
+
* access to essential services / law enforcement / migration /
|
|
4566
|
+
* administration of justice / democratic processes? */
|
|
4567
|
+
annexIII?: boolean;
|
|
4568
|
+
/** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
|
|
4569
|
+
chatbot?: boolean;
|
|
4570
|
+
/** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
|
|
4571
|
+
generatesSyntheticMedia?: boolean;
|
|
4572
|
+
}
|
|
4573
|
+
declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
|
|
4574
|
+
declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
|
|
4575
|
+
|
|
4576
|
+
export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type 
DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type 
OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type 
SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, 
dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runJudgeFleet, runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, 
wilcoxonSignedRank };
|