@remnic/bench 9.3.681 → 9.3.682
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +372 -2
- package/dist/index.js +605 -53
- package/package.json +6 -3
- package/profiles/README.md +113 -0
- package/profiles/local-lab-3090.json +39 -0
package/dist/index.d.ts
CHANGED
|
@@ -211,7 +211,7 @@ type BenchmarkMode = "full" | "quick";
|
|
|
211
211
|
type BenchmarkTier = "published" | "remnic" | "custom";
|
|
212
212
|
type BenchmarkStatus = "ready" | "planned";
|
|
213
213
|
type BenchmarkCategory = "agentic" | "retrieval" | "conversational" | "ingestion";
|
|
214
|
-
type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain";
|
|
214
|
+
type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain" | "local-lab";
|
|
215
215
|
type AmaBenchJudgeProtocol = "default" | "recommended";
|
|
216
216
|
/**
|
|
217
217
|
* Built-in LLM providers supported by the bench harness.
|
|
@@ -244,6 +244,18 @@ interface ProviderConfig {
|
|
|
244
244
|
reasoningEffort?: BenchReasoningEffort;
|
|
245
245
|
responderContextBudgetChars?: number;
|
|
246
246
|
responderPromptBudgetChars?: number;
|
|
247
|
+
/**
|
|
248
|
+
* Sampling temperature forwarded from a runtime profile manifest (e.g. the
|
|
249
|
+
* local-lab manifest pins this to 0 for reproducibility). Optional; providers
|
|
250
|
+
* that do not read it ignore the value.
|
|
251
|
+
*/
|
|
252
|
+
temperature?: number;
|
|
253
|
+
/**
|
|
254
|
+
* Sampling seed forwarded from a runtime profile manifest so local-lab runs
|
|
255
|
+
* are reproducible across invocations. Optional; providers that do not read
|
|
256
|
+
* it ignore the value.
|
|
257
|
+
*/
|
|
258
|
+
seed?: number;
|
|
247
259
|
}
|
|
248
260
|
interface TaskTokenUsage {
|
|
249
261
|
input: number;
|
|
@@ -648,6 +660,18 @@ interface ProviderBaseConfig {
|
|
|
648
660
|
};
|
|
649
661
|
/** Suppress thinking/reasoning tokens for thinking-capable models (Qwen 3.5, Gemma 4, DeepSeek). */
|
|
650
662
|
disableThinking?: boolean;
|
|
663
|
+
/**
|
|
664
|
+
* Sampling temperature pinned by a runtime profile manifest (e.g. local-lab
|
|
665
|
+
* pins 0 for reproducibility). Providers use this as the fallback when the
|
|
666
|
+
* per-call CompletionOpts does not override it.
|
|
667
|
+
*/
|
|
668
|
+
temperature?: number;
|
|
669
|
+
/**
|
|
670
|
+
* Sampling seed pinned by a runtime profile manifest so reruns reproduce the
|
|
671
|
+
* same draws. Providers forward this to the backend on every completion call
|
|
672
|
+
* (Ollama options.seed, OpenAI-compatible top-level seed).
|
|
673
|
+
*/
|
|
674
|
+
seed?: number;
|
|
651
675
|
/**
|
|
652
676
|
* Optional answering-only memory-context budget. Benchmark artifacts keep the
|
|
653
677
|
* full recalled text, but provider-backed responders may receive this compact
|
|
@@ -1723,6 +1747,340 @@ interface AssistantRunnerOptions {
|
|
|
1723
1747
|
random?: () => number;
|
|
1724
1748
|
}
|
|
1725
1749
|
|
|
1750
|
+
/**
|
|
1751
|
+
* Local-lab runtime profile manifest (issue #1573 PR2).
|
|
1752
|
+
*
|
|
1753
|
+
* A `local-lab` profile is a JSON manifest — never hardcoded model strings
|
|
1754
|
+
* (rule 30/55) — that pins a single-locale bench run (responder, judge,
|
|
1755
|
+
* optional embedding) to operator-hosted models. It is resolved by
|
|
1756
|
+
* `resolveBenchRuntimeProfile` to drive sequential phase scheduling and
|
|
1757
|
+
* endpoint preflight (see `preflight.ts` and `sequential-phases.ts`).
|
|
1758
|
+
*
|
|
1759
|
+
* Field contract:
|
|
1760
|
+
*
|
|
1761
|
+
* - `provider` is one of `LOCAL_LAB_PROVIDER_KINDS` ("openai-compatible"
|
|
1762
|
+
* for llama.cpp / vLLM / LM Studio; "ollama" for native Ollama). Any
|
|
1763
|
+
* other value is REJECTED with the valid kinds listed (rule 51 — never
|
|
1764
|
+
* silently fall through to a default).
|
|
1765
|
+
* - `temperature` is pinned to 0 and `seed` is required so local-lab
|
|
1766
|
+
* runs are reproducible.
|
|
1767
|
+
* - `ctx` is the manifest-declared serving context (tokens). Preflight
|
|
1768
|
+
* verifies the live endpoint reports at least this much.
|
|
1769
|
+
*
|
|
1770
|
+
* The manifest is content (no command strings interpolated into shells;
|
|
1771
|
+
* rule 10) — `baseUrl`/`model` are only ever fetch targets.
|
|
1772
|
+
*/
|
|
1773
|
+
/**
|
|
1774
|
+
* Provider kinds accepted by a local-lab manifest role.
|
|
1775
|
+
*
|
|
1776
|
+
* `"openai-compatible"` maps to the OpenAI-compatible transport
|
|
1777
|
+
* (`/v1/chat/completions` + `/v1/models`) used by llama.cpp, vLLM, LM
|
|
1778
|
+
* Studio, etc. `"ollama"` maps to Ollama's native transport
|
|
1779
|
+
* (`/api/generate` + `/api/tags`).
|
|
1780
|
+
*/
|
|
1781
|
+
declare const LOCAL_LAB_PROVIDER_KINDS: readonly ["openai-compatible", "ollama"];
|
|
1782
|
+
type LocalLabProviderKind = (typeof LOCAL_LAB_PROVIDER_KINDS)[number];
|
|
1783
|
+
interface LocalLabRoleConfig {
|
|
1784
|
+
provider: LocalLabProviderKind;
|
|
1785
|
+
/** Base URL of the operator-hosted endpoint (e.g. `http://localhost:1234/v1`). */
|
|
1786
|
+
baseUrl: string;
|
|
1787
|
+
/** Exact model id the endpoint reports (no aliases, no shell interpolation). */
|
|
1788
|
+
model: string;
|
|
1789
|
+
/** Optional quantization label (informational; recorded in artifacts). */
|
|
1790
|
+
quantization?: string;
|
|
1791
|
+
/** Manifest-declared serving context length in tokens. */
|
|
1792
|
+
ctx: number;
|
|
1793
|
+
/** Sampling temperature. Local-lab pins this to 0 for reproducibility. */
|
|
1794
|
+
temperature: 0;
|
|
1795
|
+
/** Sampling seed; required so reruns reproduce the same draws. */
|
|
1796
|
+
seed: number;
|
|
1797
|
+
}
|
|
1798
|
+
interface LocalLabManifestNotes {
|
|
1799
|
+
/**
|
|
1800
|
+
* Free-form operator guidance printed between responder and judge phases
|
|
1801
|
+
* when the two roles live on different endpoints. When both roles share
|
|
1802
|
+
* an endpoint the runner skips the hand-off (see sequential-phases.ts).
|
|
1803
|
+
*/
|
|
1804
|
+
responderToJudgeHandoff?: string;
|
|
1805
|
+
[key: string]: unknown;
|
|
1806
|
+
}
|
|
1807
|
+
interface LocalLabManifest {
|
|
1808
|
+
/** Manifest discriminator; always the literal `"local-lab"`. */
|
|
1809
|
+
profile: "local-lab";
|
|
1810
|
+
responder: LocalLabRoleConfig;
|
|
1811
|
+
judge: LocalLabRoleConfig;
|
|
1812
|
+
embedding?: LocalLabRoleConfig;
|
|
1813
|
+
/** Phase scheduling mode. PR2 ships `"sequential"` only. */
|
|
1814
|
+
phases: "sequential";
|
|
1815
|
+
notes?: LocalLabManifestNotes;
|
|
1816
|
+
}
|
|
1817
|
+
/**
|
|
1818
|
+
* Parse and validate a local-lab manifest from an unknown parsed JSON value.
|
|
1819
|
+
* Throws a rule-51-shaped error (lists valid kinds) on any violation.
|
|
1820
|
+
*/
|
|
1821
|
+
declare function parseLocalLabManifest(raw: unknown): LocalLabManifest;
|
|
1822
|
+
/**
|
|
1823
|
+
* Read and parse a local-lab manifest from disk. The path is opened read-only;
|
|
1824
|
+
* nothing in the manifest is ever interpolated into a shell (rule 10).
|
|
1825
|
+
*/
|
|
1826
|
+
declare function loadLocalLabManifest(filePath: string): Promise<LocalLabManifest>;
|
|
1827
|
+
|
|
1828
|
+
/**
|
|
1829
|
+
* Resolve a parsed local-lab manifest into runtime ProviderConfigs.
|
|
1830
|
+
*
|
|
1831
|
+
* Resolution is the bridge between the manifest (operator-authored JSON) and
|
|
1832
|
+
* the harness's existing `ProviderConfig` shape: each manifest role becomes a
|
|
1833
|
+
* `ProviderConfig` with `temperature` and `seed` forwarded verbatim so a test
|
|
1834
|
+
* can assert on the resolved config (issue #1573 PR2 test list).
|
|
1835
|
+
*
|
|
1836
|
+
* Provider kind mapping (kept dumb and explicit, never silent fallback):
|
|
1837
|
+
*
|
|
1838
|
+
* - `"openai-compatible"` → `BuiltInProvider` `"local-llm"` (the bench
|
|
1839
|
+
* provider that talks `/v1/chat/completions` + `/v1/models`, requiring an
|
|
1840
|
+
* explicit `baseUrl` — exactly what the manifest pins).
|
|
1841
|
+
* - `"ollama"` → `"ollama"` (native `/api/generate` + `/api/tags`).
|
|
1842
|
+
*
|
|
1843
|
+
* Both baseUrl and model are copied as-is; they are only ever fetch targets,
|
|
1844
|
+
* never interpolated into a shell (rule 10). API keys are not part of the
|
|
1845
|
+
* manifest — local-lab endpoints are operator-hosted on the loopback or a
|
|
1846
|
+
* private host, so persisting a key into the manifest would be a footgun.
|
|
1847
|
+
* Operators with auth'd local endpoints pass the key out-of-band.
|
|
1848
|
+
*
|
|
1849
|
+
* `quantization` is informational only; the bench `ProviderConfig` does not
|
|
1850
|
+
* have a quantization field, so it is kept on the resolved role and surfaces
|
|
1851
|
+
* in the bench artifact (PR3's tier/hardware metadata).
|
|
1852
|
+
*/
|
|
1853
|
+
|
|
1854
|
+
/**
|
|
1855
|
+
* A manifest role paired with its resolved `ProviderConfig`. The PR2 test
|
|
1856
|
+
* list asserts temperature/seed on `providerConfig` directly.
|
|
1857
|
+
*/
|
|
1858
|
+
interface ResolvedLocalLabRole {
|
|
1859
|
+
readonly provider: LocalLabRoleConfig["provider"];
|
|
1860
|
+
readonly baseUrl: string;
|
|
1861
|
+
readonly model: string;
|
|
1862
|
+
readonly quantization?: string;
|
|
1863
|
+
readonly ctx: number;
|
|
1864
|
+
readonly temperature: number;
|
|
1865
|
+
readonly seed: number;
|
|
1866
|
+
readonly providerConfig: ProviderConfig;
|
|
1867
|
+
}
|
|
1868
|
+
interface ResolvedLocalLabProfile {
|
|
1869
|
+
/** The parsed manifest this resolution was produced from. */
|
|
1870
|
+
readonly manifest: LocalLabManifest;
|
|
1871
|
+
readonly responder: ResolvedLocalLabRole;
|
|
1872
|
+
readonly judge: ResolvedLocalLabRole;
|
|
1873
|
+
readonly embedding?: ResolvedLocalLabRole;
|
|
1874
|
+
/** Phase scheduling mode. PR2 ships `"sequential"` only. */
|
|
1875
|
+
readonly phases: "sequential";
|
|
1876
|
+
/** Operator-authored manifest notes, including the optional hand-off note (undefined when not authored). */
|
|
1877
|
+
readonly notes?: LocalLabManifestNotes;
|
|
1878
|
+
}
|
|
1879
|
+
/**
|
|
1880
|
+
* Resolve a single manifest role into a `ResolvedLocalLabRole`, forwarding
|
|
1881
|
+
* `temperature` and `seed` into the `ProviderConfig` so providers can read
|
|
1882
|
+
* them off the config directly.
|
|
1883
|
+
*
|
|
1884
|
+
* Ollama `baseUrl`s are normalized to include `/api` so the provider posts to
|
|
1885
|
+
* `${baseUrl}/generate` → `…/api/generate` rather than `…/generate` (404).
|
|
1886
|
+
* This mirrors `discoveryEndpointFor` which already appends `/api/tags` for
|
|
1887
|
+
* preflight. Operators can write `http://127.0.0.1:11434` or
|
|
1888
|
+
* `http://127.0.0.1:11434/api` interchangeably (codex review, #1573 PR2).
|
|
1889
|
+
*/
|
|
1890
|
+
declare function resolveLocalLabRole(role: LocalLabRoleConfig): ResolvedLocalLabRole;
|
|
1891
|
+
/**
|
|
1892
|
+
* Resolve the full manifest into a `ResolvedLocalLabProfile`. Used by
|
|
1893
|
+
* `resolveBenchRuntimeProfile` for `runtimeProfile: "local-lab"`, and
|
|
1894
|
+
* directly by the unit test for the temperature/seed forwarding assertion.
|
|
1895
|
+
*/
|
|
1896
|
+
declare function resolveLocalLabProfile(manifest: LocalLabManifest): ResolvedLocalLabProfile;
|
|
1897
|
+
|
|
1898
|
+
/**
|
|
1899
|
+
* Local-lab endpoint preflight (issue #1573 PR2).
|
|
1900
|
+
*
|
|
1901
|
+
* Before each phase the runner verifies the operator-hosted endpoint is
|
|
1902
|
+
* actually serving the model the manifest claims, with at least the
|
|
1903
|
+
* manifest-declared context length. The harness never manages model
|
|
1904
|
+
* processes — it asks the endpoint what's live and refuses to proceed
|
|
1905
|
+
* ("hard error listing what was found vs expected", rule 51) on any
|
|
1906
|
+
* mismatch. Silent fallback is explicitly forbidden.
|
|
1907
|
+
*
|
|
1908
|
+
* Discovery endpoints by provider kind:
|
|
1909
|
+
*
|
|
1910
|
+
* - `"openai-compatible"` → `GET <baseUrl>/models` (or `/v1/models` when
|
|
1911
|
+
* the baseUrl does not already end in `/v1`). The body shape mirrors the
|
|
1912
|
+
* OpenAI `/v1/models` contract: `{ data: [{ id, context_length?, ... }] }`.
|
|
1913
|
+
* - `"ollama"` → `GET <baseUrl>/tags` (or `<baseUrl>/api/tags` when the baseUrl does not already end in `/api`). The body shape
|
|
1914
|
+
* mirrors Ollama's `/api/tags` contract: `{ models: [{ name,
|
|
1915
|
+
* details?.parameter_size?, ... }] }`.
|
|
1916
|
+
*
|
|
1917
|
+
* `baseUrl` is composed into a fetch URL only — never a shell string
|
|
1918
|
+
* (rule 10). Failures carry the endpoint's actual reported model list so an
|
|
1919
|
+
* operator can immediately see why their manifest doesn't match.
|
|
1920
|
+
*/
|
|
1921
|
+
|
|
1922
|
+
/** Minimal shape the preflight reader needs from a discovered model entry. */
|
|
1923
|
+
interface PreflightDiscoveredModel {
|
|
1924
|
+
id: string;
|
|
1925
|
+
contextLength?: number;
|
|
1926
|
+
}
|
|
1927
|
+
interface LocalLabPreflightInput {
|
|
1928
|
+
provider: LocalLabProviderKind;
|
|
1929
|
+
baseUrl: string;
|
|
1930
|
+
/** Exact model id the endpoint is expected to report. */
|
|
1931
|
+
model: string;
|
|
1932
|
+
/** Manifest-declared serving context length; the live endpoint must meet or exceed it. */
|
|
1933
|
+
ctx: number;
|
|
1934
|
+
}
|
|
1935
|
+
interface LocalLabPreflightSuccess {
|
|
1936
|
+
ok: true;
|
|
1937
|
+
provider: LocalLabProviderKind;
|
|
1938
|
+
endpoint: string;
|
|
1939
|
+
expectedModel: string;
|
|
1940
|
+
foundModels: PreflightDiscoveredModel[];
|
|
1941
|
+
/** Resolved context length for the matched model, when the endpoint reports one. */
|
|
1942
|
+
matchedContextLength?: number;
|
|
1943
|
+
}
|
|
1944
|
+
interface LocalLabPreflightFailure {
|
|
1945
|
+
ok: false;
|
|
1946
|
+
provider: LocalLabProviderKind;
|
|
1947
|
+
endpoint: string;
|
|
1948
|
+
expectedModel: string;
|
|
1949
|
+
foundModels: PreflightDiscoveredModel[];
|
|
1950
|
+
expectedCtx: number;
|
|
1951
|
+
matchedContextLength?: number;
|
|
1952
|
+
reason: string;
|
|
1953
|
+
}
|
|
1954
|
+
type LocalLabPreflightResult = LocalLabPreflightSuccess | LocalLabPreflightFailure;
|
|
1955
|
+
interface LocalLabPreflightOptions {
|
|
1956
|
+
signal?: AbortSignal;
|
|
1957
|
+
/** Per-request timeout in milliseconds. Defaults to 5000 (5 s) — preflight should be fast. */
|
|
1958
|
+
timeoutMs?: number;
|
|
1959
|
+
/** Inject a fetch implementation (tests). Defaults to global fetch. */
|
|
1960
|
+
fetchImpl?: typeof fetch;
|
|
1961
|
+
}
|
|
1962
|
+
/**
|
|
1963
|
+
* Preflight a single manifest role. Resolves to a result object — never
|
|
1964
|
+
* throws for endpoint/discovery failures (those are preflight failures, not
|
|
1965
|
+
* runtime errors). Throws only on truly exceptional conditions (invalid
|
|
1966
|
+
* baseUrl shape, fetchImpl contract violation).
|
|
1967
|
+
*/
|
|
1968
|
+
declare function preflightLocalLabRole(input: LocalLabPreflightInput, options?: LocalLabPreflightOptions): Promise<LocalLabPreflightResult>;
|
|
1969
|
+
/**
|
|
1970
|
+
* Compose the discovery URL for a provider kind + baseUrl. Never
|
|
1971
|
+
* interpolates into a shell — only into a fetch URL. Tolerant of trailing
|
|
1972
|
+
* slashes and the common `/v1` (OpenAI-compatible) / `/api` (Ollama)
|
|
1973
|
+
* suffixes already being present.
|
|
1974
|
+
*/
|
|
1975
|
+
declare function discoveryEndpointFor(provider: LocalLabProviderKind, baseUrl: string): string;
|
|
1976
|
+
|
|
1977
|
+
/**
|
|
1978
|
+
* Local-lab sequential phase scheduler (issue #1573 PR2).
|
|
1979
|
+
*
|
|
1980
|
+
* On a single-GPU lab box (e.g. RTX 3090, 24 GB VRAM) the responder and
|
|
1981
|
+
* judge models cannot be co-resident: the harness runs them in two distinct
|
|
1982
|
+
* phases, with the operator physically swapping which model is loaded
|
|
1983
|
+
* between them. The harness DOES NOT manage model processes — it tells the
|
|
1984
|
+
* operator what to do and waits for them to confirm via the next phase's
|
|
1985
|
+
* endpoint preflight succeeding.
|
|
1986
|
+
*
|
|
1987
|
+
* The scheduler:
|
|
1988
|
+
*
|
|
1989
|
+
* 1. Preflights the phase's endpoint (reachable + serving the manifest
|
|
1990
|
+
* model with enough context — see `preflight.ts`). Hard error on
|
|
1991
|
+
* mismatch; no silent fallback.
|
|
1992
|
+
* 2. Calls the phase's `execute()` callback — the runner supplies the
|
|
1993
|
+
* actual ingest/answer or judge work; the scheduler only sequences.
|
|
1994
|
+
* 3. Between phases prints the operator hand-off note from
|
|
1995
|
+
* `manifest.notes.responderToJudgeHandoff`, OR proceeds silently when
|
|
1996
|
+
* the next phase's endpoint is the same baseUrl (Ollama can hot-swap
|
|
1997
|
+
* models within one endpoint, so no physical swap is needed).
|
|
1998
|
+
*
|
|
1999
|
+
* `execute` receives the resolved role so the runner can build the right
|
|
2000
|
+
* provider config without re-reading the manifest. The phase scheduler is
|
|
2001
|
+
* intentionally process-supervision-free per the issue's "do not add
|
|
2002
|
+
* process supervision in v1" instruction.
|
|
2003
|
+
*/
|
|
2004
|
+
|
|
2005
|
+
type LocalLabPhaseName = "responder" | "judge" | "embedding";
|
|
2006
|
+
interface LocalLabPhaseDescriptor {
|
|
2007
|
+
name: LocalLabPhaseName;
|
|
2008
|
+
role: LocalLabRoleConfig;
|
|
2009
|
+
}
|
|
2010
|
+
/**
|
|
2011
|
+
* A unit of phase work supplied by the runner. The callback receives the
|
|
2012
|
+
* phase's resolved role so the caller can build its own provider/client
|
|
2013
|
+
* without re-reading the manifest. Returning a value (e.g. judge verdicts)
|
|
2014
|
+
* is supported; the scheduler does not inspect it.
|
|
2015
|
+
*/
|
|
2016
|
+
type LocalLabPhaseExecute<T = unknown> = (role: ResolvedLocalLabRole) => Promise<T>;
|
|
2017
|
+
interface LocalLabPhase<T = unknown> {
|
|
2018
|
+
name: LocalLabPhaseName;
|
|
2019
|
+
role: LocalLabRoleConfig;
|
|
2020
|
+
execute: LocalLabPhaseExecute<T>;
|
|
2021
|
+
}
|
|
2022
|
+
interface LocalLabPhaseOutcome<T> {
|
|
2023
|
+
phase: LocalLabPhaseDescriptor;
|
|
2024
|
+
preflight: LocalLabPreflightResult;
|
|
2025
|
+
result: T;
|
|
2026
|
+
}
|
|
2027
|
+
interface SequentialPhaseHooks {
|
|
2028
|
+
/**
|
|
2029
|
+
* Called after a phase's preflight succeeds but before its `execute`.
|
|
2030
|
+
* Useful for logging "phase X starting against <endpoint>".
|
|
2031
|
+
*/
|
|
2032
|
+
onPhasePreflight?: (result: LocalLabPreflightResult) => void;
|
|
2033
|
+
/** Called immediately before invoking the phase's execute callback. */
|
|
2034
|
+
onPhaseStart?: (phase: LocalLabPhaseDescriptor) => void;
|
|
2035
|
+
/** Called immediately after a phase's execute resolves. */
|
|
2036
|
+
onPhaseComplete?: <T>(outcome: LocalLabPhaseOutcome<T>) => void;
|
|
2037
|
+
/**
|
|
2038
|
+
* Called between two phases when their endpoints differ. Receives the
|
|
2039
|
+
* manifest hand-off note (or undefined). The scheduler does NOT print to
|
|
2040
|
+
* stdout itself — it delegates to this hook so tests can observe it
|
|
2041
|
+
* without capturing process output.
|
|
2042
|
+
*/
|
|
2043
|
+
onPhaseHandoff?: (from: LocalLabPhaseDescriptor, to: LocalLabPhaseDescriptor, note: string | undefined) => void;
|
|
2044
|
+
}
|
|
2045
|
+
interface RunSequentialPhasesOptions {
|
|
2046
|
+
/** Forwarded to each phase's preflight. */
|
|
2047
|
+
preflight?: LocalLabPreflightOptions;
|
|
2048
|
+
hooks?: SequentialPhaseHooks;
|
|
2049
|
+
}
|
|
2050
|
+
/**
|
|
2051
|
+
* Run a sequence of phases against the manifest's declared endpoints, with
|
|
2052
|
+
* preflight + hand-off enforcement between them. Resolves with each phase's
|
|
2053
|
+
* outcome in order, or rejects with a `LocalLabPreflightError`
|
|
2054
|
+
* carrying the failing preflight (which surfaces the endpoint's found model
|
|
2055
|
+
* list per rule 51).
|
|
2056
|
+
*
|
|
2057
|
+
* Phases are required to be supplied in run order; the scheduler does not
|
|
2058
|
+
* reorder them. Empty phase lists are a no-op (the harness intentionally
|
|
2059
|
+
* supports an empty responder/judge pair for preflight-only smoke checks).
|
|
2060
|
+
*/
|
|
2061
|
+
declare function runSequentialPhases<T>(manifest: LocalLabManifest, phases: LocalLabPhase<T>[], options?: RunSequentialPhasesOptions): Promise<LocalLabPhaseOutcome<T>[]>;
|
|
2062
|
+
/**
|
|
2063
|
+
* Error thrown when a phase's preflight fails. Carries the failing result
|
|
2064
|
+
* so callers (and tests) can inspect what the endpoint reported.
|
|
2065
|
+
*/
|
|
2066
|
+
declare class LocalLabPreflightError extends Error {
|
|
2067
|
+
readonly phase: LocalLabPhaseDescriptor;
|
|
2068
|
+
readonly preflight: LocalLabPreflightResult;
|
|
2069
|
+
readonly phaseIndex: number;
|
|
2070
|
+
constructor(args: {
|
|
2071
|
+
phase: LocalLabPhaseDescriptor;
|
|
2072
|
+
preflight: LocalLabPreflightResult;
|
|
2073
|
+
phaseIndex: number;
|
|
2074
|
+
});
|
|
2075
|
+
}
|
|
2076
|
+
/**
|
|
2077
|
+
* Render the operator hand-off string the scheduler passes to the
|
|
2078
|
+
* `onPhaseHandoff` hook. Returns the manifest note when authored, otherwise
|
|
2079
|
+
* a default instruction naming the next phase and its endpoint. This is a
|
|
2080
|
+
* pure formatting helper — tests assert on it directly.
|
|
2081
|
+
*/
|
|
2082
|
+
declare function formatHandoffNote(from: LocalLabPhaseDescriptor, to: LocalLabPhaseDescriptor, manifestNote: string | undefined): string;
|
|
2083
|
+
|
|
1726
2084
|
type BenchModelSource = "plugin" | "gateway";
|
|
1727
2085
|
interface ResolveBenchRuntimeProfileOptions {
|
|
1728
2086
|
runtimeProfile?: BenchRuntimeProfile;
|
|
@@ -1754,6 +2112,12 @@ interface ResolveBenchRuntimeProfileOptions {
|
|
|
1754
2112
|
drainTimeout?: number;
|
|
1755
2113
|
max429WaitMs?: number;
|
|
1756
2114
|
disableThinking?: boolean;
|
|
2115
|
+
/**
|
|
2116
|
+
* Path to a local-lab manifest JSON file (issue #1573 PR2). Required when
|
|
2117
|
+
* `runtimeProfile: "local-lab"`. The manifest pins responder/judge/embedding
|
|
2118
|
+
* to operator-hosted models with temperature=0 and a fixed seed.
|
|
2119
|
+
*/
|
|
2120
|
+
localLabManifestPath?: string;
|
|
1757
2121
|
}
|
|
1758
2122
|
interface ResolvedBenchRuntimeProfile {
|
|
1759
2123
|
profile: BenchRuntimeProfile;
|
|
@@ -1769,6 +2133,12 @@ interface ResolvedBenchRuntimeProfile {
|
|
|
1769
2133
|
systemProvider: ProviderConfig | null;
|
|
1770
2134
|
judgeProvider: ProviderConfig | null;
|
|
1771
2135
|
internalProvider: ProviderConfig | null;
|
|
2136
|
+
/**
|
|
2137
|
+
* Resolved local-lab profile (issue #1573 PR2). Present only when
|
|
2138
|
+
* `runtimeProfile: "local-lab"`. Drives sequential phase scheduling +
|
|
2139
|
+
* endpoint preflight in the bench runner.
|
|
2140
|
+
*/
|
|
2141
|
+
localLab?: ResolvedLocalLabProfile;
|
|
1772
2142
|
}
|
|
1773
2143
|
declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
|
|
1774
2144
|
|
|
@@ -3188,4 +3558,4 @@ interface MitigatedTargetConfig {
|
|
|
3188
3558
|
*/
|
|
3189
3559
|
declare function createMitigatedTarget(config: MitigatedTargetConfig): ExtractionAttackTarget;
|
|
3190
3560
|
|
|
3191
|
-
export { AMA_BENCH_DIAGNOSTIC_VARIANTS, ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AmaBenchDiagnosticAdapterOptions, type AmaBenchDiagnosticAnswererMode, type AmaBenchDiagnosticBreakdown, type AmaBenchDiagnosticMatrixArtifact, type AmaBenchDiagnosticRecallMode, type AmaBenchDiagnosticRunContext, type AmaBenchDiagnosticTaskEvidence, type AmaBenchDiagnosticTaskRow, type AmaBenchDiagnosticVariant, type AmaBenchDiagnosticVariantSummary, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, type AttackRecallOptions, type AttackRetrievalHit, type AttackerMode, BENCHMARK_ARTIFACT_SCHEMA_VERSION, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_REPRO_MANIFEST_FILENAME, BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BaselineRow, type BaselineScenario, type BeamDatasetPreview, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchReasoningEffort, type BenchRecallOptions, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkArtifact, type BenchmarkArtifactEnvironment, type BenchmarkArtifactPerTaskScore, type BenchmarkArtifactSystem, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type 
BenchmarkReproManifest, type BenchmarkReproManifestDataset, type BenchmarkReproManifestFile, type BenchmarkReproManifestResult, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkArtifactInput, type BuildBenchmarkPublishFeedOptions, type BuildBenchmarkReproManifestOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type CodexCliProviderConfig, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ABLATION_BOOTSTRAP_SEED, DEFAULT_ASSISTANT_RUBRIC_ID, DEFAULT_BASELINE_SCENARIOS, type DatasetSource, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type ExtractionAttackOptions, type ExtractionAttackResult, type ExtractionAttackTarget, type FixtureGenerator, type FixtureOutput, type FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, type HarnessRng, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, LOCOMO_DATASET_FILENAMES, LONG_MEM_EVAL_DATASET_FILENAMES, type LeaderboardArtifactWrite, type LlmJudge, type LlmProvider, type LoadDatasetOptions, type LoadSealedQrelsOptions, type LoadedDataset, type LocalLlmProviderConfig, MEMORY_EVAL_DIMENSIONS, MEMORY_EVAL_PUBLIC_LINE, MITIGATED_BASELINE_SCENARIOS, type MemoryEvalCategory, type MemoryEvalDimension, type MemoryEvalDimensionId, type MemoryEvalMetric, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type 
MitigatedBaselineConfig, type MitigatedTargetConfig, type MultipleChoiceQuestion, OTHER_NAMESPACE_MEMORIES, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, PROCEDURAL_REAL_SCENARIOS, PROCEDURAL_REAL_SCENARIOS_SMOKE, PUBLISHED_BENCHMARK_ARTIFACT_IDS, type PersonalizationRetrievalCase, type ProceduralAblationArtifact, type ProceduralAblationPerCase, type ProceduralAblationScenario, type ProceduralRealScenario, type ProceduralRealScenarioCategory, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, type PublishedBenchmarkId, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RecoveredMemory, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, type RunProceduralAblationCliArgs, type RunProceduralAblationOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, SYNTHETIC_MEMORIES, type SanitizedDiagnosticProvider, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededMemory, type SeededRng, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type SyntheticEmailIngestionAdapterOptions, type SyntheticTargetOptions, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TimelineEntry, type TokenUsage, type WriteBenchmarkArtifactResult, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, 
assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildAmaBenchDiagnosticMatrixArtifact, buildAmaBenchDiagnosticVariantSummary, buildAmaBenchLeaderboardRows, buildBenchmarkArtifact, buildBenchmarkArtifactFilename, buildBenchmarkPublishFeed, buildBenchmarkReproManifest, buildBenchmarkRunSeeds, buildJudgePayload, buildOracleTrajectoryRecall, buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createSeededRng as createAdamSeededRng, createAmaBenchDiagnosticAdapter, createAnthropicProvider, createCanaryAdapter, createCodexCliProvider, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createLocalLlmProvider, createMitigatedTarget, createOllamaProvider, createOpenAiCompatibleProvider, createSeededRandom as createProceduralAblationSeededRandom, createProvider, createProviderBackedAmaBenchRecommendedJudge, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng$1 as createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, createSyntheticEmailIngestionAdapter, createSyntheticTarget, createTimeoutGuardedAdapter, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, emailFixture, entityRecall, exactMatch, extractMarkdownSectionsByTitle, f1Score, fixtureToAblationScenarios, formatMissingDatasetError, generateReport, getBenchmark, getBenchmarkLowerIsBetter, getMemoryEvalDimension, getRemnicVersion, hashBenchmarkArtifact, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isAmaBenchUnknownLikeAnswer, 
isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, listMemoryEvalBenchmarkIds, listMemoryEvalDimensions, llmJudgeScore, llmJudgeScoreDetailed, loadAblationFixture, loadBaseline, loadBeamDatasetPreview, loadBenchmarkArtifact, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadLoCoMo10, loadLongMemEvalS, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseBenchmarkArtifact, parseCustomBenchmark, parseRubricResponse, parseSealedQrels, precisionAtK, projectFolderFixture, recallAtK, redactBenchmarkResultSecrets, renderBaselineMarkdown, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkPhaseTimeoutMs, resolveBenchmarkProgressLogging, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, runAssistantNextBestActionBenchmark, runAssistantSynthesisBenchmark, runBaseline, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runExtractionAttack, runMitigatedBaseline, runProceduralAblation, runProceduralAblationCli, runSealedJudge, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectAmaBenchDiagnosticVariants, selectFixtureVariant, serializeBenchmarkArtifact, serializeJsonl, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkArtifact, writeBenchmarkPublishFeed, writeBenchmarkReproManifest, writeBenchmarkResult, writeLeaderboardArtifactsForResult, zeroScores };
|
|
3561
|
+
export { AMA_BENCH_DIAGNOSTIC_VARIANTS, ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AmaBenchDiagnosticAdapterOptions, type AmaBenchDiagnosticAnswererMode, type AmaBenchDiagnosticBreakdown, type AmaBenchDiagnosticMatrixArtifact, type AmaBenchDiagnosticRecallMode, type AmaBenchDiagnosticRunContext, type AmaBenchDiagnosticTaskEvidence, type AmaBenchDiagnosticTaskRow, type AmaBenchDiagnosticVariant, type AmaBenchDiagnosticVariantSummary, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, type AttackRecallOptions, type AttackRetrievalHit, type AttackerMode, BENCHMARK_ARTIFACT_SCHEMA_VERSION, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_REPRO_MANIFEST_FILENAME, BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BaselineRow, type BaselineScenario, type BeamDatasetPreview, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchReasoningEffort, type BenchRecallOptions, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkArtifact, type BenchmarkArtifactEnvironment, type BenchmarkArtifactPerTaskScore, type BenchmarkArtifactSystem, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type 
BenchmarkReproManifest, type BenchmarkReproManifestDataset, type BenchmarkReproManifestFile, type BenchmarkReproManifestResult, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkArtifactInput, type BuildBenchmarkPublishFeedOptions, type BuildBenchmarkReproManifestOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type CodexCliProviderConfig, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ABLATION_BOOTSTRAP_SEED, DEFAULT_ASSISTANT_RUBRIC_ID, DEFAULT_BASELINE_SCENARIOS, type DatasetSource, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type ExtractionAttackOptions, type ExtractionAttackResult, type ExtractionAttackTarget, type FixtureGenerator, type FixtureOutput, type FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, type HarnessRng, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, LOCAL_LAB_PROVIDER_KINDS, LOCOMO_DATASET_FILENAMES, LONG_MEM_EVAL_DATASET_FILENAMES, type LeaderboardArtifactWrite, type LlmJudge, type LlmProvider, type LoadDatasetOptions, type LoadSealedQrelsOptions, type LoadedDataset, type LocalLabManifest, type LocalLabManifestNotes, type LocalLabPhase, type LocalLabPhaseDescriptor, type LocalLabPhaseExecute, type LocalLabPhaseName, type LocalLabPhaseOutcome, LocalLabPreflightError, type LocalLabPreflightFailure, type LocalLabPreflightInput, type LocalLabPreflightOptions, 
type LocalLabPreflightResult, type LocalLabPreflightSuccess, type LocalLabProviderKind, type LocalLabRoleConfig, type LocalLlmProviderConfig, MEMORY_EVAL_DIMENSIONS, MEMORY_EVAL_PUBLIC_LINE, MITIGATED_BASELINE_SCENARIOS, type MemoryEvalCategory, type MemoryEvalDimension, type MemoryEvalDimensionId, type MemoryEvalMetric, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type MitigatedBaselineConfig, type MitigatedTargetConfig, type MultipleChoiceQuestion, OTHER_NAMESPACE_MEMORIES, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, PROCEDURAL_REAL_SCENARIOS, PROCEDURAL_REAL_SCENARIOS_SMOKE, PUBLISHED_BENCHMARK_ARTIFACT_IDS, type PersonalizationRetrievalCase, type PreflightDiscoveredModel, type ProceduralAblationArtifact, type ProceduralAblationPerCase, type ProceduralAblationScenario, type ProceduralRealScenario, type ProceduralRealScenarioCategory, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, type PublishedBenchmarkId, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RecoveredMemory, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedLocalLabProfile, type ResolvedLocalLabRole, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, type RunProceduralAblationCliArgs, type RunProceduralAblationOptions, type RunSequentialPhasesOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, SYNTHETIC_MEMORIES, type SanitizedDiagnosticProvider, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type 
SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededMemory, type SeededRng, type SequentialPhaseHooks, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type SyntheticEmailIngestionAdapterOptions, type SyntheticTargetOptions, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TimelineEntry, type TokenUsage, type WriteBenchmarkArtifactResult, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildAmaBenchDiagnosticMatrixArtifact, buildAmaBenchDiagnosticVariantSummary, buildAmaBenchLeaderboardRows, buildBenchmarkArtifact, buildBenchmarkArtifactFilename, buildBenchmarkPublishFeed, buildBenchmarkReproManifest, buildBenchmarkRunSeeds, buildJudgePayload, buildOracleTrajectoryRecall, buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createSeededRng as createAdamSeededRng, createAmaBenchDiagnosticAdapter, createAnthropicProvider, createCanaryAdapter, createCodexCliProvider, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createLocalLlmProvider, createMitigatedTarget, createOllamaProvider, createOpenAiCompatibleProvider, createSeededRandom as createProceduralAblationSeededRandom, createProvider, createProviderBackedAmaBenchRecommendedJudge, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng$1 as createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, 
createSyntheticEmailIngestionAdapter, createSyntheticTarget, createTimeoutGuardedAdapter, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, discoveryEndpointFor, emailFixture, entityRecall, exactMatch, extractMarkdownSectionsByTitle, f1Score, fixtureToAblationScenarios, formatHandoffNote, formatMissingDatasetError, generateReport, getBenchmark, getBenchmarkLowerIsBetter, getMemoryEvalDimension, getRemnicVersion, hashBenchmarkArtifact, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isAmaBenchUnknownLikeAnswer, isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, listMemoryEvalBenchmarkIds, listMemoryEvalDimensions, llmJudgeScore, llmJudgeScoreDetailed, loadAblationFixture, loadBaseline, loadBeamDatasetPreview, loadBenchmarkArtifact, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadLoCoMo10, loadLocalLabManifest, loadLongMemEvalS, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseBenchmarkArtifact, parseCustomBenchmark, parseLocalLabManifest, parseRubricResponse, parseSealedQrels, precisionAtK, preflightLocalLabRole, projectFolderFixture, recallAtK, redactBenchmarkResultSecrets, renderBaselineMarkdown, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkPhaseTimeoutMs, resolveBenchmarkProgressLogging, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveLocalLabProfile, resolveLocalLabRole, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, 
runAssistantNextBestActionBenchmark, runAssistantSynthesisBenchmark, runBaseline, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runExtractionAttack, runMitigatedBaseline, runProceduralAblation, runProceduralAblationCli, runSealedJudge, runSequentialPhases, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectAmaBenchDiagnosticVariants, selectFixtureVariant, serializeBenchmarkArtifact, serializeJsonl, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkArtifact, writeBenchmarkPublishFeed, writeBenchmarkReproManifest, writeBenchmarkResult, writeLeaderboardArtifactsForResult, zeroScores };
|