@tangle-network/agent-eval 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +520 -79
- package/dist/index.js +1035 -322
- package/dist/index.js.map +1 -1
- package/package.json +5 -1
package/dist/index.d.ts
CHANGED
|
@@ -1033,86 +1033,56 @@ declare class PromptOptimizer {
|
|
|
1033
1033
|
run(config: OptimizationConfig): Promise<OptimizationResult>;
|
|
1034
1034
|
}
|
|
1035
1035
|
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
* Pattern lifted from tax-agent + legal-agent: two agents take turns until
|
|
1040
|
-
* they converge on a consensus artifact. One proposes, the other critiques;
|
|
1041
|
-
* the proposer revises; repeat until a score threshold is hit or max rounds.
|
|
1042
|
-
*
|
|
1043
|
-
* Generalized so any two "agents" (gateways, local functions, anything with
|
|
1044
|
-
* `propose` + `critique`) compose in. Returns convergence rounds per
|
|
1045
|
-
* scenario + whether convergence happened.
|
|
1046
|
-
*/
|
|
1047
|
-
interface DualAgentScenario {
|
|
1048
|
-
id: string;
|
|
1049
|
-
initialPrompt: string;
|
|
1050
|
-
/** Optional context the agents can read (e.g. source documents). */
|
|
1051
|
-
context?: Record<string, unknown>;
|
|
1052
|
-
}
|
|
1053
|
-
interface DualAgentRound {
|
|
1054
|
-
roundIndex: number;
|
|
1055
|
-
proposal: string;
|
|
1056
|
-
critique: string;
|
|
1057
|
-
convergenceScore: number;
|
|
1058
|
-
}
|
|
1059
|
-
interface DualAgentScenarioResult {
|
|
1060
|
-
scenarioId: string;
|
|
1061
|
-
converged: boolean;
|
|
1062
|
-
roundsToConverge: number | null;
|
|
1063
|
-
finalProposal: string;
|
|
1064
|
-
history: DualAgentRound[];
|
|
1065
|
-
finalScore: number;
|
|
1036
|
+
interface SteeringRolePrompt {
|
|
1037
|
+
system?: string;
|
|
1038
|
+
append?: string;
|
|
1066
1039
|
}
|
|
1067
|
-
interface
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
*/
|
|
1076
|
-
propose: (args: {
|
|
1077
|
-
scenario: DualAgentScenario;
|
|
1078
|
-
roundIndex: number;
|
|
1079
|
-
priorProposal?: string;
|
|
1080
|
-
priorCritique?: string;
|
|
1081
|
-
}) => Promise<string>;
|
|
1082
|
-
/**
|
|
1083
|
-
* Critique the proposer's current output. Returns a structured critique
|
|
1084
|
-
* (free text) plus a convergence score: how close the proposal is to
|
|
1085
|
-
* acceptable. 1.0 = accept, 0.0 = totally off.
|
|
1086
|
-
*/
|
|
1087
|
-
critique: (args: {
|
|
1088
|
-
scenario: DualAgentScenario;
|
|
1089
|
-
roundIndex: number;
|
|
1090
|
-
proposal: string;
|
|
1091
|
-
}) => Promise<{
|
|
1092
|
-
critique: string;
|
|
1093
|
-
convergenceScore: number;
|
|
1094
|
-
}>;
|
|
1095
|
-
/** Optional per-round hook for progress + tracing. */
|
|
1096
|
-
onRoundComplete?: (info: {
|
|
1097
|
-
scenarioId: string;
|
|
1098
|
-
round: DualAgentRound;
|
|
1099
|
-
}) => void;
|
|
1040
|
+
interface SteeringBundle {
|
|
1041
|
+
id: string;
|
|
1042
|
+
coderPrompt?: string;
|
|
1043
|
+
continuePrompt?: string;
|
|
1044
|
+
reviewerPrompts?: Record<string, string>;
|
|
1045
|
+
skills?: string[];
|
|
1046
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1047
|
+
metadata?: Record<string, unknown>;
|
|
1100
1048
|
}
|
|
1101
|
-
interface
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
config: {
|
|
1109
|
-
maxRounds: number;
|
|
1110
|
-
convergenceThreshold: number;
|
|
1111
|
-
};
|
|
1049
|
+
interface SteeringDelta {
|
|
1050
|
+
coderPrompt?: string;
|
|
1051
|
+
continuePrompt?: string;
|
|
1052
|
+
reviewerPrompts?: Record<string, string>;
|
|
1053
|
+
skills?: string[];
|
|
1054
|
+
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
1055
|
+
metadata?: Record<string, unknown>;
|
|
1112
1056
|
}
|
|
1113
|
-
declare
|
|
1114
|
-
|
|
1057
|
+
declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
|
|
1058
|
+
declare function renderSteeringText(bundle: SteeringBundle): string;
|
|
1059
|
+
|
|
1060
|
+
interface RunScore {
|
|
1061
|
+
success: number;
|
|
1062
|
+
goalProgress: number;
|
|
1063
|
+
repoGroundedness: number;
|
|
1064
|
+
driftPenalty: number;
|
|
1065
|
+
toolUseQuality: number;
|
|
1066
|
+
patchQuality: number;
|
|
1067
|
+
testReality: number;
|
|
1068
|
+
costUsd: number;
|
|
1069
|
+
wallSeconds: number;
|
|
1070
|
+
notes?: string[];
|
|
1071
|
+
}
|
|
1072
|
+
interface RunScoreWeights {
|
|
1073
|
+
success: number;
|
|
1074
|
+
goalProgress: number;
|
|
1075
|
+
repoGroundedness: number;
|
|
1076
|
+
driftPenalty: number;
|
|
1077
|
+
toolUseQuality: number;
|
|
1078
|
+
patchQuality: number;
|
|
1079
|
+
testReality: number;
|
|
1080
|
+
costUsd: number;
|
|
1081
|
+
wallSeconds: number;
|
|
1115
1082
|
}
|
|
1083
|
+
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
1084
|
+
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
1085
|
+
declare function clamp01(value: number): number;
|
|
1116
1086
|
|
|
1117
1087
|
/**
|
|
1118
1088
|
* TraceSchema v1 — the canonical data model for agent-eval.
|
|
@@ -1606,6 +1576,137 @@ interface OtlpExport {
|
|
|
1606
1576
|
/** Export a single run's spans + events in OTLP/JSON. */
|
|
1607
1577
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
1608
1578
|
|
|
1579
|
+
interface RunTrace {
|
|
1580
|
+
run: Run;
|
|
1581
|
+
spans: Span[];
|
|
1582
|
+
events: TraceEvent[];
|
|
1583
|
+
artifacts: Artifact[];
|
|
1584
|
+
budget: BudgetLedgerEntry[];
|
|
1585
|
+
}
|
|
1586
|
+
interface RunCriticOptions {
|
|
1587
|
+
weights?: Partial<RunScoreWeights>;
|
|
1588
|
+
driftPatterns?: RegExp[];
|
|
1589
|
+
}
|
|
1590
|
+
declare class RunCritic {
|
|
1591
|
+
private readonly weights?;
|
|
1592
|
+
private readonly driftPatterns;
|
|
1593
|
+
constructor(options?: RunCriticOptions);
|
|
1594
|
+
score(store: TraceStore, runId: string): Promise<RunScore>;
|
|
1595
|
+
scoreTrace(trace: RunTrace): RunScore;
|
|
1596
|
+
rank(score: RunScore): number;
|
|
1597
|
+
private isDrift;
|
|
1598
|
+
}
|
|
1599
|
+
|
|
1600
|
+
interface PlaybookEntry {
|
|
1601
|
+
instruction: string;
|
|
1602
|
+
rationale: string;
|
|
1603
|
+
category?: string;
|
|
1604
|
+
evidence?: string;
|
|
1605
|
+
weight?: number;
|
|
1606
|
+
sourceRunId?: string;
|
|
1607
|
+
}
|
|
1608
|
+
interface Playbook {
|
|
1609
|
+
entries: PlaybookEntry[];
|
|
1610
|
+
}
|
|
1611
|
+
declare function distillPlaybook(entries: PlaybookEntry[], options?: {
|
|
1612
|
+
maxEntries?: number;
|
|
1613
|
+
}): Playbook;
|
|
1614
|
+
declare function renderPlaybookMarkdown(playbook: Playbook): string;
|
|
1615
|
+
|
|
1616
|
+
interface OptimizationExample {
|
|
1617
|
+
scenarioId: string;
|
|
1618
|
+
metadata?: Record<string, unknown>;
|
|
1619
|
+
}
|
|
1620
|
+
interface SteeringEvaluation {
|
|
1621
|
+
variant: SteeringBundle;
|
|
1622
|
+
example: OptimizationExample;
|
|
1623
|
+
trialIndex: number;
|
|
1624
|
+
}
|
|
1625
|
+
interface SteeringVariantReport {
|
|
1626
|
+
variantId: string;
|
|
1627
|
+
bundle: SteeringBundle;
|
|
1628
|
+
mean: number;
|
|
1629
|
+
ci95: {
|
|
1630
|
+
lower: number;
|
|
1631
|
+
upper: number;
|
|
1632
|
+
};
|
|
1633
|
+
scenarioScores: Record<string, {
|
|
1634
|
+
mean: number;
|
|
1635
|
+
n: number;
|
|
1636
|
+
samples: number[];
|
|
1637
|
+
}>;
|
|
1638
|
+
}
|
|
1639
|
+
interface OptimizationLoopResult {
|
|
1640
|
+
winner: SteeringBundle;
|
|
1641
|
+
significant: boolean;
|
|
1642
|
+
reports: SteeringVariantReport[];
|
|
1643
|
+
pairwise: Array<{
|
|
1644
|
+
variantA: string;
|
|
1645
|
+
variantB: string;
|
|
1646
|
+
pValue: number;
|
|
1647
|
+
qValue: number;
|
|
1648
|
+
significant: boolean;
|
|
1649
|
+
meanDelta: number;
|
|
1650
|
+
}>;
|
|
1651
|
+
}
|
|
1652
|
+
interface OptimizationLoopConfig {
|
|
1653
|
+
variants: SteeringBundle[];
|
|
1654
|
+
examples: OptimizationExample[];
|
|
1655
|
+
evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
|
|
1656
|
+
scoreWeights?: Partial<RunScoreWeights>;
|
|
1657
|
+
trialsPerScenario?: number;
|
|
1658
|
+
}
|
|
1659
|
+
declare class OptimizationLoop {
|
|
1660
|
+
private readonly optimizer;
|
|
1661
|
+
constructor(optimizer?: PromptOptimizer);
|
|
1662
|
+
run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
|
|
1663
|
+
}
|
|
1664
|
+
|
|
1665
|
+
type SteeringOptimizerBackend = 'pairwise' | 'ax-gepa';
|
|
1666
|
+
interface SteeringOptimizationRow {
|
|
1667
|
+
variantId: string;
|
|
1668
|
+
scenarioId: string;
|
|
1669
|
+
bundle: SteeringBundle;
|
|
1670
|
+
score: RunScore;
|
|
1671
|
+
metadata?: Record<string, unknown>;
|
|
1672
|
+
}
|
|
1673
|
+
interface SteeringOptimizationSelector {
|
|
1674
|
+
backend: SteeringOptimizerBackend;
|
|
1675
|
+
signature?: string;
|
|
1676
|
+
labels?: string[];
|
|
1677
|
+
rationale?: string;
|
|
1678
|
+
}
|
|
1679
|
+
interface SteeringOptimizationResult {
|
|
1680
|
+
backend: SteeringOptimizerBackend;
|
|
1681
|
+
recommendedVariantId: string;
|
|
1682
|
+
rationale: string;
|
|
1683
|
+
rankings: Array<{
|
|
1684
|
+
variantId: string;
|
|
1685
|
+
mean: number;
|
|
1686
|
+
runs: number;
|
|
1687
|
+
}>;
|
|
1688
|
+
selector?: SteeringOptimizationSelector;
|
|
1689
|
+
skipped?: boolean;
|
|
1690
|
+
}
|
|
1691
|
+
interface SteeringOptimizerConfig {
|
|
1692
|
+
weights?: Partial<RunScoreWeights>;
|
|
1693
|
+
}
|
|
1694
|
+
interface AxSteeringOptimizerConfig extends SteeringOptimizerConfig {
|
|
1695
|
+
provider: 'openai' | 'anthropic';
|
|
1696
|
+
apiKey: string;
|
|
1697
|
+
model: string;
|
|
1698
|
+
teacherModel?: string;
|
|
1699
|
+
minRows?: number;
|
|
1700
|
+
}
|
|
1701
|
+
declare class PairwiseSteeringOptimizer {
|
|
1702
|
+
optimize(rows: SteeringOptimizationRow[], config?: SteeringOptimizerConfig): SteeringOptimizationResult;
|
|
1703
|
+
}
|
|
1704
|
+
declare class AxGepaSteeringOptimizer {
|
|
1705
|
+
private readonly config;
|
|
1706
|
+
constructor(config: AxSteeringOptimizerConfig);
|
|
1707
|
+
optimize(rows: SteeringOptimizationRow[]): Promise<SteeringOptimizationResult>;
|
|
1708
|
+
}
|
|
1709
|
+
|
|
1609
1710
|
/**
|
|
1610
1711
|
* SandboxHarness — executes a scenario in an isolated environment and
|
|
1611
1712
|
* emits a rich SandboxSpan into the trace.
|
|
@@ -1689,6 +1790,327 @@ declare class SandboxHarness {
|
|
|
1689
1790
|
run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
|
|
1690
1791
|
}
|
|
1691
1792
|
|
|
1793
|
+
type SandboxJudgeKind = 'compiler' | 'test' | 'linter' | 'security';
|
|
1794
|
+
interface SandboxJudgeSpec {
|
|
1795
|
+
id: string;
|
|
1796
|
+
kind: SandboxJudgeKind;
|
|
1797
|
+
config: HarnessConfig;
|
|
1798
|
+
}
|
|
1799
|
+
interface SandboxJudgeResult {
|
|
1800
|
+
id: string;
|
|
1801
|
+
kind: SandboxJudgeKind;
|
|
1802
|
+
passed: boolean;
|
|
1803
|
+
score: number;
|
|
1804
|
+
summary: string;
|
|
1805
|
+
detail: SandboxHarnessResult;
|
|
1806
|
+
}
|
|
1807
|
+
interface JudgeFleetOptions {
|
|
1808
|
+
driver?: SandboxDriver;
|
|
1809
|
+
parallel?: boolean;
|
|
1810
|
+
}
|
|
1811
|
+
declare class JudgeRunner {
|
|
1812
|
+
private readonly driver;
|
|
1813
|
+
constructor(driver?: SandboxDriver);
|
|
1814
|
+
run(spec: SandboxJudgeSpec): Promise<SandboxJudgeResult>;
|
|
1815
|
+
}
|
|
1816
|
+
declare function runJudgeFleet(specs: SandboxJudgeSpec[], options?: JudgeFleetOptions): Promise<SandboxJudgeResult[]>;
|
|
1817
|
+
declare function compilerJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1818
|
+
declare function testJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1819
|
+
declare function linterJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1820
|
+
declare function securityJudge(id: string, config: HarnessConfig): SandboxJudgeSpec;
|
|
1821
|
+
|
|
1822
|
+
interface HostedJudgeDimension {
|
|
1823
|
+
name: string;
|
|
1824
|
+
weight: number;
|
|
1825
|
+
rubric: string;
|
|
1826
|
+
}
|
|
1827
|
+
interface HostedJudgeConfig {
|
|
1828
|
+
model: string;
|
|
1829
|
+
mode?: 'llm' | 'sandbox' | 'composite';
|
|
1830
|
+
systemPrompt?: string;
|
|
1831
|
+
rubricTemplate?: string;
|
|
1832
|
+
temperature?: number;
|
|
1833
|
+
maxTurns?: number;
|
|
1834
|
+
tools?: string[];
|
|
1835
|
+
dimensions?: HostedJudgeDimension[];
|
|
1836
|
+
setupCommand?: string;
|
|
1837
|
+
scripts?: Record<string, string>;
|
|
1838
|
+
}
|
|
1839
|
+
interface HostedJudgeRequest {
|
|
1840
|
+
prompt: string;
|
|
1841
|
+
response: string;
|
|
1842
|
+
rubric?: string;
|
|
1843
|
+
reference?: string;
|
|
1844
|
+
judge: HostedJudgeConfig;
|
|
1845
|
+
}
|
|
1846
|
+
interface HostedJudgeResponse {
|
|
1847
|
+
score: number;
|
|
1848
|
+
reasoning: string;
|
|
1849
|
+
cost: number;
|
|
1850
|
+
dimensions?: Array<{
|
|
1851
|
+
name: string;
|
|
1852
|
+
score: number;
|
|
1853
|
+
reasoning: string;
|
|
1854
|
+
}>;
|
|
1855
|
+
evidence?: Array<{
|
|
1856
|
+
type: string;
|
|
1857
|
+
content: string;
|
|
1858
|
+
}>;
|
|
1859
|
+
turns?: number;
|
|
1860
|
+
parseFailed?: boolean;
|
|
1861
|
+
rawOutput?: string;
|
|
1862
|
+
}
|
|
1863
|
+
interface HostedRunScoreRequest {
|
|
1864
|
+
trace: RunTrace;
|
|
1865
|
+
weights?: Partial<RunScoreWeights>;
|
|
1866
|
+
driftPatterns?: string[];
|
|
1867
|
+
}
|
|
1868
|
+
interface HostedRunScoreResponse {
|
|
1869
|
+
score: RunScore;
|
|
1870
|
+
aggregate: number;
|
|
1871
|
+
weights: RunScoreWeights;
|
|
1872
|
+
notes: string[];
|
|
1873
|
+
}
|
|
1874
|
+
type HostedRunCriticConfig = Pick<RunCriticOptions, 'weights'> & {
|
|
1875
|
+
driftPatterns?: string[];
|
|
1876
|
+
};
|
|
1877
|
+
|
|
1878
|
+
/**
|
|
1879
|
+
* Dual-agent convergence bench.
|
|
1880
|
+
*
|
|
1881
|
+
* Pattern lifted from tax-agent + legal-agent: two agents take turns until
|
|
1882
|
+
* they converge on a consensus artifact. One proposes, the other critiques;
|
|
1883
|
+
* the proposer revises; repeat until a score threshold is hit or max rounds.
|
|
1884
|
+
*
|
|
1885
|
+
* Generalized so any two "agents" (gateways, local functions, anything with
|
|
1886
|
+
* `propose` + `critique`) compose in. Returns convergence rounds per
|
|
1887
|
+
* scenario + whether convergence happened.
|
|
1888
|
+
*/
|
|
1889
|
+
interface DualAgentScenario {
|
|
1890
|
+
id: string;
|
|
1891
|
+
initialPrompt: string;
|
|
1892
|
+
/** Optional context the agents can read (e.g. source documents). */
|
|
1893
|
+
context?: Record<string, unknown>;
|
|
1894
|
+
}
|
|
1895
|
+
interface DualAgentRound {
|
|
1896
|
+
roundIndex: number;
|
|
1897
|
+
proposal: string;
|
|
1898
|
+
critique: string;
|
|
1899
|
+
convergenceScore: number;
|
|
1900
|
+
}
|
|
1901
|
+
interface DualAgentScenarioResult {
|
|
1902
|
+
scenarioId: string;
|
|
1903
|
+
converged: boolean;
|
|
1904
|
+
roundsToConverge: number | null;
|
|
1905
|
+
finalProposal: string;
|
|
1906
|
+
history: DualAgentRound[];
|
|
1907
|
+
finalScore: number;
|
|
1908
|
+
}
|
|
1909
|
+
interface DualAgentBenchConfig {
|
|
1910
|
+
scenarios: DualAgentScenario[];
|
|
1911
|
+
maxRounds?: number;
|
|
1912
|
+
/** Convergence threshold in 0..1 (default 0.85). */
|
|
1913
|
+
convergenceThreshold?: number;
|
|
1914
|
+
/**
|
|
1915
|
+
* Propose an answer given the scenario + the critic's prior critique (if any).
|
|
1916
|
+
* Returns the proposal string.
|
|
1917
|
+
*/
|
|
1918
|
+
propose: (args: {
|
|
1919
|
+
scenario: DualAgentScenario;
|
|
1920
|
+
roundIndex: number;
|
|
1921
|
+
priorProposal?: string;
|
|
1922
|
+
priorCritique?: string;
|
|
1923
|
+
}) => Promise<string>;
|
|
1924
|
+
/**
|
|
1925
|
+
* Critique the proposer's current output. Returns a structured critique
|
|
1926
|
+
* (free text) plus a convergence score: how close the proposal is to
|
|
1927
|
+
* acceptable. 1.0 = accept, 0.0 = totally off.
|
|
1928
|
+
*/
|
|
1929
|
+
critique: (args: {
|
|
1930
|
+
scenario: DualAgentScenario;
|
|
1931
|
+
roundIndex: number;
|
|
1932
|
+
proposal: string;
|
|
1933
|
+
}) => Promise<{
|
|
1934
|
+
critique: string;
|
|
1935
|
+
convergenceScore: number;
|
|
1936
|
+
}>;
|
|
1937
|
+
/** Optional per-round hook for progress + tracing. */
|
|
1938
|
+
onRoundComplete?: (info: {
|
|
1939
|
+
scenarioId: string;
|
|
1940
|
+
round: DualAgentRound;
|
|
1941
|
+
}) => void;
|
|
1942
|
+
}
|
|
1943
|
+
interface DualAgentReport {
|
|
1944
|
+
scenarios: DualAgentScenarioResult[];
|
|
1945
|
+
aggregate: {
|
|
1946
|
+
convergenceRate: number;
|
|
1947
|
+
avgRoundsToConverge: number | null;
|
|
1948
|
+
avgFinalScore: number;
|
|
1949
|
+
};
|
|
1950
|
+
config: {
|
|
1951
|
+
maxRounds: number;
|
|
1952
|
+
convergenceThreshold: number;
|
|
1953
|
+
};
|
|
1954
|
+
}
|
|
1955
|
+
declare class DualAgentBench {
|
|
1956
|
+
run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
|
|
1957
|
+
}
|
|
1958
|
+
|
|
1959
|
+
/**
|
|
1960
|
+
* Propose / Verify / Review — the core multi-shot primitive.
|
|
1961
|
+
*
|
|
1962
|
+
* shot N: propose(state, priorReview) → new state
|
|
1963
|
+
* verify(state) → pass/fail, optional layers
|
|
1964
|
+
* review(state, verification, memory) → observations + next-shot
|
|
1965
|
+
* instruction + shouldContinue
|
|
1966
|
+
* memory.append(entry)
|
|
1967
|
+
*
|
|
1968
|
+
* Roles are strictly separated:
|
|
1969
|
+
*
|
|
1970
|
+
* - The WORKER is whatever the caller wraps in `propose`. It is
|
|
1971
|
+
* stateful — caller owns its resume/session mechanism.
|
|
1972
|
+
* - The VERIFIER grades the state. It produces the ground truth.
|
|
1973
|
+
* The reviewer cannot overturn or downgrade a verification layer.
|
|
1974
|
+
* - The REVIEWER is stateless per call. Its continuity is the
|
|
1975
|
+
* `ReviewMemoryStore` — durable JSONL by default, or any store
|
|
1976
|
+
* implementing the interface. It reads memory + trace summary +
|
|
1977
|
+
* verification and directs the NEXT proposer shot.
|
|
1978
|
+
*
|
|
1979
|
+
* This shape is load-bearing. The reviewer never grades; the verifier
|
|
1980
|
+
* never directs. Two processes, two prompts, two concerns — which is
|
|
1981
|
+
* what keeps the loop from confirmation-biasing itself into "all
|
|
1982
|
+
* passed" when it didn't.
|
|
1983
|
+
*
|
|
1984
|
+
* Short-circuits and soft-fails are both first-class:
|
|
1985
|
+
* - verify.pass === true → reviewer LLM call is skipped, memory
|
|
1986
|
+
* records a success entry, loop exits.
|
|
1987
|
+
* - review throws → the shot still counts; the loop uses the
|
|
1988
|
+
* last-known instruction (or `fallbackInstruction`) for the next
|
|
1989
|
+
* propose call. A transient reviewer failure must NEVER abort a
|
|
1990
|
+
* valid arc.
|
|
1991
|
+
*
|
|
1992
|
+
* Composable: `propose` itself can be another `runProposeReview` call.
|
|
1993
|
+
* That's the dogfooding path — a harness built on this primitive is in
|
|
1994
|
+
* turn evaluable by it.
|
|
1995
|
+
*/
|
|
1996
|
+
|
|
1997
|
+
interface Verification {
|
|
1998
|
+
pass: boolean;
|
|
1999
|
+
score?: number;
|
|
2000
|
+
failingLayers?: string[];
|
|
2001
|
+
details?: unknown;
|
|
2002
|
+
}
|
|
2003
|
+
interface Review {
|
|
2004
|
+
observations: string;
|
|
2005
|
+
diagnosis: string;
|
|
2006
|
+
nextShotInstruction: string;
|
|
2007
|
+
shouldContinue: boolean;
|
|
2008
|
+
confidence: number;
|
|
2009
|
+
}
|
|
2010
|
+
interface ReviewMemoryEntry extends Review {
|
|
2011
|
+
shot: number;
|
|
2012
|
+
timestamp: number;
|
|
2013
|
+
verification: {
|
|
2014
|
+
pass: boolean;
|
|
2015
|
+
score?: number;
|
|
2016
|
+
failingLayers?: string[];
|
|
2017
|
+
};
|
|
2018
|
+
}
|
|
2019
|
+
interface ProposeInput<State> {
|
|
2020
|
+
shot: number;
|
|
2021
|
+
goal: string;
|
|
2022
|
+
state: State;
|
|
2023
|
+
priorReview: Review | null;
|
|
2024
|
+
abortSignal: AbortSignal;
|
|
2025
|
+
emitter?: TraceEmitter;
|
|
2026
|
+
}
|
|
2027
|
+
interface ProposeOutput<State, Summary = unknown> {
|
|
2028
|
+
state: State;
|
|
2029
|
+
traceSummary?: Summary;
|
|
2030
|
+
}
|
|
2031
|
+
interface ReviewInput<State, Summary = unknown> {
|
|
2032
|
+
shot: number;
|
|
2033
|
+
goal: string;
|
|
2034
|
+
state: State;
|
|
2035
|
+
verification: Verification;
|
|
2036
|
+
traceSummary: Summary | undefined;
|
|
2037
|
+
memory: ReviewMemoryEntry[];
|
|
2038
|
+
}
|
|
2039
|
+
type ProposeFn<State, Summary = unknown> = (input: ProposeInput<State>) => Promise<ProposeOutput<State, Summary>>;
|
|
2040
|
+
type VerifyFn<State> = (state: State) => Promise<Verification>;
|
|
2041
|
+
type ReviewFn<State, Summary = unknown> = (input: ReviewInput<State, Summary>) => Promise<Review>;
|
|
2042
|
+
interface ReviewMemoryStore {
|
|
2043
|
+
load(): Promise<ReviewMemoryEntry[]>;
|
|
2044
|
+
append(entry: ReviewMemoryEntry): Promise<void>;
|
|
2045
|
+
}
|
|
2046
|
+
interface ProposeReviewConfig<State, Summary = unknown> {
|
|
2047
|
+
goal: string;
|
|
2048
|
+
initialState: State;
|
|
2049
|
+
propose: ProposeFn<State, Summary>;
|
|
2050
|
+
verify: VerifyFn<State>;
|
|
2051
|
+
review: ReviewFn<State, Summary>;
|
|
2052
|
+
/** Hard shot cap. Default 10. */
|
|
2053
|
+
maxShots?: number;
|
|
2054
|
+
/** Wall-clock cap in ms. Default 10 min. */
|
|
2055
|
+
maxWallMs?: number;
|
|
2056
|
+
/**
|
|
2057
|
+
* If the reviewer returns confidence ≤ floor on `confidenceFloorWindow`
|
|
2058
|
+
* consecutive shots, terminate early. Default floor 0.3, window 2.
|
|
2059
|
+
* Set window to 0 or floor to <0 to disable.
|
|
2060
|
+
*/
|
|
2061
|
+
confidenceFloor?: number;
|
|
2062
|
+
confidenceFloorWindow?: number;
|
|
2063
|
+
/** Defaults to an in-memory store if omitted. */
|
|
2064
|
+
memory?: ReviewMemoryStore;
|
|
2065
|
+
/** If provided, emit a Run + per-shot spans. */
|
|
2066
|
+
store?: TraceStore;
|
|
2067
|
+
scenarioId?: string;
|
|
2068
|
+
projectId?: string;
|
|
2069
|
+
variantId?: string;
|
|
2070
|
+
/**
|
|
2071
|
+
* Used when the reviewer soft-fails on shot 1 (no prior instruction to
|
|
2072
|
+
* fall back to). Default is a generic "inspect failures and fix".
|
|
2073
|
+
*/
|
|
2074
|
+
fallbackInstruction?: string;
|
|
2075
|
+
}
|
|
2076
|
+
interface ProposeReviewShot<State, Summary = unknown> {
|
|
2077
|
+
shot: number;
|
|
2078
|
+
state: State;
|
|
2079
|
+
verification: Verification;
|
|
2080
|
+
traceSummary: Summary | undefined;
|
|
2081
|
+
review: Review;
|
|
2082
|
+
reviewAvailable: boolean;
|
|
2083
|
+
reviewError?: string;
|
|
2084
|
+
durationMs: number;
|
|
2085
|
+
}
|
|
2086
|
+
interface ProposeReviewReport<State, Summary = unknown> {
|
|
2087
|
+
runId: string | null;
|
|
2088
|
+
completed: boolean;
|
|
2089
|
+
shots: ProposeReviewShot<State, Summary>[];
|
|
2090
|
+
finalState: State;
|
|
2091
|
+
finalVerification: Verification;
|
|
2092
|
+
failureClass?: FailureClass;
|
|
2093
|
+
wallMs: number;
|
|
2094
|
+
score: number;
|
|
2095
|
+
}
|
|
2096
|
+
declare function inMemoryReviewStore(initial?: ReviewMemoryEntry[]): ReviewMemoryStore;
|
|
2097
|
+
declare function jsonlReviewStore(path: string): ReviewMemoryStore;
|
|
2098
|
+
declare function runProposeReview<State, Summary = unknown>(config: ProposeReviewConfig<State, Summary>): Promise<ProposeReviewReport<State, Summary>>;
|
|
2099
|
+
interface LlmJsonCall {
|
|
2100
|
+
(req: {
|
|
2101
|
+
system: string;
|
|
2102
|
+
user: string;
|
|
2103
|
+
}): Promise<unknown>;
|
|
2104
|
+
}
|
|
2105
|
+
interface LlmReviewerConfig<State, Summary = unknown> {
|
|
2106
|
+
callJson: LlmJsonCall;
|
|
2107
|
+
renderState?: (state: State) => string;
|
|
2108
|
+
renderTraceSummary?: (summary: Summary | undefined) => string;
|
|
2109
|
+
/** Appended to the default system prompt. */
|
|
2110
|
+
systemPromptAddendum?: string;
|
|
2111
|
+
}
|
|
2112
|
+
declare function createLlmReviewer<State, Summary = unknown>(cfg: LlmReviewerConfig<State, Summary>): ReviewFn<State, Summary>;
|
|
2113
|
+
|
|
1692
2114
|
/**
|
|
1693
2115
|
* TestGradedScenario — a scenario whose score comes from a test suite.
|
|
1694
2116
|
*
|
|
@@ -3102,10 +3524,25 @@ declare function resumeBuilderSession(store: TraceStore, projectId: string): Pro
|
|
|
3102
3524
|
* is the highest-leverage signal the framework computes — if
|
|
3103
3525
|
* meta_score doesn't predict runtime_score, the builder's self-scoring
|
|
3104
3526
|
* is broken.
|
|
3527
|
+
*
|
|
3528
|
+
* Scaffold-only mode: when a project has no `app-runtime` runs (e.g. a
|
|
3529
|
+
* scaffold-builder eval that grades compose + build without driving a
|
|
3530
|
+
* runtime scenario), `kind` is `'scaffold-only'` and `complete` measures
|
|
3531
|
+
* meta + build only. Consumers can tell the two apart without having to
|
|
3532
|
+
* interpret null-runtime as either "not yet computed" or "N/A for this
|
|
3533
|
+
* project shape".
|
|
3105
3534
|
*/
|
|
3106
3535
|
|
|
3536
|
+
type ProjectKind = 'full' | 'scaffold-only';
|
|
3107
3537
|
interface ThreeLayerProjectReport {
|
|
3108
3538
|
projectId: string;
|
|
3539
|
+
/**
|
|
3540
|
+
* `'full'` when the project has at least one `app-runtime` run;
|
|
3541
|
+
* `'scaffold-only'` when it only has meta + build layers. Lets
|
|
3542
|
+
* downstream consumers treat a null runtime score as expected
|
|
3543
|
+
* (scaffold-only) vs. missing (full, pipeline broke).
|
|
3544
|
+
*/
|
|
3545
|
+
kind: ProjectKind;
|
|
3109
3546
|
builderRunId?: string;
|
|
3110
3547
|
/** Judge-verdict score on the builder run (0..1 after normalization). */
|
|
3111
3548
|
metaScore: number | null;
|
|
@@ -3113,10 +3550,14 @@ interface ThreeLayerProjectReport {
|
|
|
3113
3550
|
/** 0..1 from the sandbox harness (testsPassed / testsTotal). */
|
|
3114
3551
|
buildScore: number | null;
|
|
3115
3552
|
appRuntimeRunIds: string[];
|
|
3116
|
-
/** Mean of outcome.score over app-runtime runs, 0..1. */
|
|
3553
|
+
/** Mean of outcome.score over app-runtime runs, 0..1. Always null in scaffold-only mode. */
|
|
3117
3554
|
runtimeScore: number | null;
|
|
3118
3555
|
runtimePassRate: number | null;
|
|
3119
|
-
/**
|
|
3556
|
+
/**
|
|
3557
|
+
* Layer-aware completeness:
|
|
3558
|
+
* - `kind='full'`: all three layers scored
|
|
3559
|
+
* - `kind='scaffold-only'`: meta + build scored (runtime not applicable)
|
|
3560
|
+
*/
|
|
3120
3561
|
complete: boolean;
|
|
3121
3562
|
}
|
|
3122
3563
|
declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>;
|
|
@@ -4132,4 +4573,4 @@ interface UseCaseSignals {
|
|
|
4132
4573
|
declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
|
|
4133
4574
|
declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
|
|
4134
4575
|
|
|
4135
|
-
/**
 * Barrel export list for this declaration file.
 *
 * - Specifiers written as `type X` are type-only exports.
 * - Specifiers without the `type` modifier are exported as regular bindings
 *   (for example `PromptOptimizer`, which is a `declare class` earlier in this file).
 * - Three local names are renamed on export:
 *   `Artifact$1` is published as `ArtifactCheckArtifact`,
 *   `Run$1` is published as `ExperimentRun`,
 *   `DEFAULT_RULES` is published as `DEFAULT_FAILURE_RULES`.
 *
 * NOTE(review): the `$1` suffixes and the ordering by exported name look like
 * bundler output; presumably this list is generated, so changes should be made in
 * the source entry point rather than here. Confirm against the build setup.
 */
export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type 
EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, 
type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type 
WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, crossTraceDiff, defaultJudges, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runSelfPlay, runTestGradedScenario, runsForScenario, 
scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
|
4576
|
+
/**
 * Barrel export list for this declaration file.
 *
 * - Specifiers written as `type X` are type-only exports (for example
 *   `SteeringRolePrompt`, declared as an interface earlier in this file).
 * - Specifiers without the `type` modifier are exported as regular bindings
 *   (for example `PromptOptimizer`, which is a `declare class` earlier in this file).
 * - Three local names are renamed on export:
 *   `Artifact$1` is published as `ArtifactCheckArtifact`,
 *   `Run$1` is published as `ExperimentRun`,
 *   `DEFAULT_RULES` is published as `DEFAULT_FAILURE_RULES`.
 *
 * NOTE(review): the `$1` suffixes and the ordering by exported name look like
 * bundler output; presumably this list is generated, so changes should be made in
 * the source entry point rather than here. Confirm against the build setup.
 */
export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type 
DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type 
OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type 
SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, 
dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runJudgeFleet, runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, 
wilcoxonSignedRank };
|