@remnic/bench 9.3.680 → 9.3.682

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -211,7 +211,7 @@ type BenchmarkMode = "full" | "quick";
211
211
  type BenchmarkTier = "published" | "remnic" | "custom";
212
212
  type BenchmarkStatus = "ready" | "planned";
213
213
  type BenchmarkCategory = "agentic" | "retrieval" | "conversational" | "ingestion";
214
- type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain";
214
+ type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain" | "local-lab";
215
215
  type AmaBenchJudgeProtocol = "default" | "recommended";
216
216
  /**
217
217
  * Built-in LLM providers supported by the bench harness.
@@ -244,6 +244,18 @@ interface ProviderConfig {
244
244
  reasoningEffort?: BenchReasoningEffort;
245
245
  responderContextBudgetChars?: number;
246
246
  responderPromptBudgetChars?: number;
247
+ /**
248
+ * Sampling temperature forwarded from a runtime profile manifest (e.g. the
249
+ * local-lab manifest pins this to 0 for reproducibility). Optional; providers
250
+ * that do not read it ignore the value.
251
+ */
252
+ temperature?: number;
253
+ /**
254
+ * Sampling seed forwarded from a runtime profile manifest so local-lab runs
255
+ * are reproducible across invocations. Optional; providers that do not read
256
+ * it ignore the value.
257
+ */
258
+ seed?: number;
247
259
  }
248
260
  interface TaskTokenUsage {
249
261
  input: number;
@@ -648,6 +660,18 @@ interface ProviderBaseConfig {
648
660
  };
649
661
  /** Suppress thinking/reasoning tokens for thinking-capable models (Qwen 3.5, Gemma 4, DeepSeek). */
650
662
  disableThinking?: boolean;
663
+ /**
664
+ * Sampling temperature pinned by a runtime profile manifest (e.g. local-lab
665
+ * pins 0 for reproducibility). Providers use this as the fallback when the
666
+ * per-call CompletionOpts does not override it.
667
+ */
668
+ temperature?: number;
669
+ /**
670
+ * Sampling seed pinned by a runtime profile manifest so reruns reproduce the
671
+ * same draws. Providers forward this to the backend on every completion call
672
+ * (Ollama options.seed, OpenAI-compatible top-level seed).
673
+ */
674
+ seed?: number;
651
675
  /**
652
676
  * Optional answering-only memory-context budget. Benchmark artifacts keep the
653
677
  * full recalled text, but provider-backed responders may receive this compact
@@ -1723,6 +1747,340 @@ interface AssistantRunnerOptions {
1723
1747
  random?: () => number;
1724
1748
  }
1725
1749
 
1750
+ /**
1751
+ * Local-lab runtime profile manifest (issue #1573 PR2).
1752
+ *
1753
+ * A `local-lab` profile is a JSON manifest — never hardcoded model strings
1754
+ * (rule 30/55) — that pins a single-locale bench run (responder, judge,
1755
+ * optional embedding) to operator-hosted models. It is resolved by
1756
+ * `resolveBenchRuntimeProfile` to drive sequential phase scheduling and
1757
+ * endpoint preflight (see `preflight.ts` and `sequential-phases.ts`).
1758
+ *
1759
+ * Field contract:
1760
+ *
1761
+ * - `provider` is one of `LOCAL_LAB_PROVIDER_KINDS` ("openai-compatible"
1762
+ * for llama.cpp / vLLM / LM Studio; "ollama" for native Ollama). Any
1763
+ * other value is REJECTED with the valid kinds listed (rule 51 — never
1764
+ * silently fall through to a default).
1765
+ * - `temperature` is pinned to 0 and `seed` is required so local-lab
1766
+ * runs are reproducible.
1767
+ * - `ctx` is the manifest-declared serving context (tokens). Preflight
1768
+ * verifies the live endpoint reports at least this much.
1769
+ *
1770
+ * The manifest is content (no command strings interpolated into shells;
1771
+ * rule 10) — `baseUrl`/`model` are only ever fetch targets.
1772
+ */
1773
+ /**
1774
+ * Provider kinds accepted by a local-lab manifest role.
1775
+ *
1776
+ * `"openai-compatible"` maps to the OpenAI-compatible transport
1777
+ * (`/v1/chat/completions` + `/v1/models`) used by llama.cpp, vLLM, LM
1778
+ * Studio, etc. `"ollama"` maps to Ollama's native transport
1779
+ * (`/api/generate` + `/api/tags`).
1780
+ */
1781
+ declare const LOCAL_LAB_PROVIDER_KINDS: readonly ["openai-compatible", "ollama"];
1782
+ type LocalLabProviderKind = (typeof LOCAL_LAB_PROVIDER_KINDS)[number];
1783
+ interface LocalLabRoleConfig {
1784
+ provider: LocalLabProviderKind;
1785
+ /** Base URL of the operator-hosted endpoint (e.g. `http://localhost:1234/v1`). */
1786
+ baseUrl: string;
1787
+ /** Exact model id the endpoint reports (no aliases, no shell interpolation). */
1788
+ model: string;
1789
+ /** Optional quantization label (informational; recorded in artifacts). */
1790
+ quantization?: string;
1791
+ /** Manifest-declared serving context length in tokens. */
1792
+ ctx: number;
1793
+ /** Sampling temperature. Local-lab pins this to 0 for reproducibility. */
1794
+ temperature: 0;
1795
+ /** Sampling seed; required so reruns reproduce the same draws. */
1796
+ seed: number;
1797
+ }
1798
+ interface LocalLabManifestNotes {
1799
+ /**
1800
+ * Free-form operator guidance printed between responder and judge phases
1801
+ * when the two roles live on different endpoints. When both roles share
1802
+ * an endpoint the runner skips the hand-off (see sequential-phases.ts).
1803
+ */
1804
+ responderToJudgeHandoff?: string;
1805
+ [key: string]: unknown;
1806
+ }
1807
+ interface LocalLabManifest {
1808
+ /** Manifest discriminator; always the literal `"local-lab"`. */
1809
+ profile: "local-lab";
1810
+ responder: LocalLabRoleConfig;
1811
+ judge: LocalLabRoleConfig;
1812
+ embedding?: LocalLabRoleConfig;
1813
+ /** Phase scheduling mode. PR2 ships `"sequential"` only. */
1814
+ phases: "sequential";
1815
+ notes?: LocalLabManifestNotes;
1816
+ }
1817
+ /**
1818
+ * Parse and validate a local-lab manifest from an unknown parsed JSON value.
1819
+ * Throws a rule-51-shaped error (lists valid kinds) on any violation.
1820
+ */
1821
+ declare function parseLocalLabManifest(raw: unknown): LocalLabManifest;
1822
+ /**
1823
+ * Read and parse a local-lab manifest from disk. The path is opened read-only;
1824
+ * nothing in the manifest is ever interpolated into a shell (rule 10).
1825
+ */
1826
+ declare function loadLocalLabManifest(filePath: string): Promise<LocalLabManifest>;
1827
+
1828
+ /**
1829
+ * Resolve a parsed local-lab manifest into runtime ProviderConfigs.
1830
+ *
1831
+ * Resolution is the bridge between the manifest (operator-authored JSON) and
1832
+ * the harness's existing `ProviderConfig` shape: each manifest role becomes a
1833
+ * `ProviderConfig` with `temperature` and `seed` forwarded verbatim so a test
1834
+ * can assert on the resolved config (issue #1573 PR2 test list).
1835
+ *
1836
+ * Provider kind mapping (kept dumb and explicit, never silent fallback):
1837
+ *
1838
+ * - `"openai-compatible"` → `BuiltInProvider` `"local-llm"` (the bench
1839
+ * provider that talks `/v1/chat/completions` + `/v1/models`, requiring an
1840
+ * explicit `baseUrl` — exactly what the manifest pins).
1841
+ * - `"ollama"` → `"ollama"` (native `/api/generate` + `/api/tags`).
1842
+ *
1843
+ * Both baseUrl and model are copied as-is; they are only ever fetch targets,
1844
+ * never interpolated into a shell (rule 10). API keys are not part of the
1845
+ * manifest — local-lab endpoints are operator-hosted on the loopback or a
1846
+ * private host, so persisting a key into the manifest would be a footgun.
1847
+ * Operators with auth'd local endpoints pass the key out-of-band.
1848
+ *
1849
+ * `quantization` is informational only; the bench `ProviderConfig` does not
1850
+ * have a quantization field, so it is kept on the resolved role and surfaces
1851
+ * in the bench artifact (PR3's tier/hardware metadata).
1852
+ */
1853
+
1854
+ /**
1855
+ * A manifest role paired with its resolved `ProviderConfig`. The PR2 test
1856
+ * list asserts temperature/seed on `providerConfig` directly.
1857
+ */
1858
+ interface ResolvedLocalLabRole {
1859
+ readonly provider: LocalLabRoleConfig["provider"];
1860
+ readonly baseUrl: string;
1861
+ readonly model: string;
1862
+ readonly quantization?: string;
1863
+ readonly ctx: number;
1864
+ readonly temperature: number;
1865
+ readonly seed: number;
1866
+ readonly providerConfig: ProviderConfig;
1867
+ }
1868
+ interface ResolvedLocalLabProfile {
1869
+ /** The parsed manifest this resolution was produced from. */
1870
+ readonly manifest: LocalLabManifest;
1871
+ readonly responder: ResolvedLocalLabRole;
1872
+ readonly judge: ResolvedLocalLabRole;
1873
+ readonly embedding?: ResolvedLocalLabRole;
1874
+ /** Phase scheduling mode. PR2 ships `"sequential"` only. */
1875
+ readonly phases: "sequential";
1876
+ /** Operator-authored manifest notes, including the optional hand-off note (undefined when not authored). */
1877
+ readonly notes?: LocalLabManifestNotes;
1878
+ }
1879
+ /**
1880
+ * Resolve a single manifest role into a `ResolvedLocalLabRole`, forwarding
1881
+ * `temperature` and `seed` into the `ProviderConfig` so providers can read
1882
+ * them off the config directly.
1883
+ *
1884
+ * Ollama `baseUrl`s are normalized to include `/api` so the provider posts to
1885
+ * `${baseUrl}/generate` → `…/api/generate` rather than `…/generate` (404).
1886
+ * This mirrors `discoveryEndpointFor` which already appends `/api/tags` for
1887
+ * preflight. Operators can write `http://127.0.0.1:11434` or
1888
+ * `http://127.0.0.1:11434/api` interchangeably (codex review, #1573 PR2).
1889
+ */
1890
+ declare function resolveLocalLabRole(role: LocalLabRoleConfig): ResolvedLocalLabRole;
1891
+ /**
1892
+ * Resolve the full manifest into a `ResolvedLocalLabProfile`. Used by
1893
+ * `resolveBenchRuntimeProfile` for `runtimeProfile: "local-lab"`, and
1894
+ * directly by the unit test for the temperature/seed forwarding assertion.
1895
+ */
1896
+ declare function resolveLocalLabProfile(manifest: LocalLabManifest): ResolvedLocalLabProfile;
1897
+
1898
+ /**
1899
+ * Local-lab endpoint preflight (issue #1573 PR2).
1900
+ *
1901
+ * Before each phase the runner verifies the operator-hosted endpoint is
1902
+ * actually serving the model the manifest claims, with at least the
1903
+ * manifest-declared context length. The harness never manages model
1904
+ * processes — it asks the endpoint what's live and refuses to proceed
1905
+ * ("hard error listing what was found vs expected", rule 51) on any
1906
+ * mismatch. Silent fallback is explicitly forbidden.
1907
+ *
1908
+ * Discovery endpoints by provider kind:
1909
+ *
1910
+ * - `"openai-compatible"` → `GET <baseUrl>/models` (or `/v1/models` when
1911
+ * the baseUrl does not already end in `/v1`). The body shape mirrors the
1912
+ * OpenAI `/v1/models` contract: `{ data: [{ id, context_length?, ... }] }`.
1913
+ * - `"ollama"` → `GET <baseUrl>/tags` when the baseUrl already ends in `/api`, otherwise `GET <baseUrl>/api/tags`. The body shape
1914
+ * mirrors Ollama's `/api/tags` contract: `{ models: [{ name,
1915
+ * details?.parameter_size?, ... }] }`.
1916
+ *
1917
+ * `baseUrl` is composed into a fetch URL only — never a shell string
1918
+ * (rule 10). Failures carry the endpoint's actual reported model list so an
1919
+ * operator can immediately see why their manifest doesn't match.
1920
+ */
1921
+
1922
+ /** Minimal shape the preflight reader needs from a discovered model entry. */
1923
+ interface PreflightDiscoveredModel {
1924
+ id: string;
1925
+ contextLength?: number;
1926
+ }
1927
+ interface LocalLabPreflightInput {
1928
+ provider: LocalLabProviderKind;
1929
+ baseUrl: string;
1930
+ /** Exact model id the endpoint is expected to report. */
1931
+ model: string;
1932
+ /** Manifest-declared serving context length; the live endpoint must meet or exceed it. */
1933
+ ctx: number;
1934
+ }
1935
+ interface LocalLabPreflightSuccess {
1936
+ ok: true;
1937
+ provider: LocalLabProviderKind;
1938
+ endpoint: string;
1939
+ expectedModel: string;
1940
+ foundModels: PreflightDiscoveredModel[];
1941
+ /** Resolved context length for the matched model, when the endpoint reports one. */
1942
+ matchedContextLength?: number;
1943
+ }
1944
+ interface LocalLabPreflightFailure {
1945
+ ok: false;
1946
+ provider: LocalLabProviderKind;
1947
+ endpoint: string;
1948
+ expectedModel: string;
1949
+ foundModels: PreflightDiscoveredModel[];
1950
+ expectedCtx: number;
1951
+ matchedContextLength?: number;
1952
+ reason: string;
1953
+ }
1954
+ type LocalLabPreflightResult = LocalLabPreflightSuccess | LocalLabPreflightFailure;
1955
+ interface LocalLabPreflightOptions {
1956
+ signal?: AbortSignal;
1957
+ /** Per-request timeout. Defaults to 5 s — preflight should be fast. */
1958
+ timeoutMs?: number;
1959
+ /** Inject a fetch implementation (tests). Defaults to global fetch. */
1960
+ fetchImpl?: typeof fetch;
1961
+ }
1962
+ /**
1963
+ * Preflight a single manifest role. Resolves to a result object — never
1964
+ * throws for endpoint/discovery failures (those are preflight failures, not
1965
+ * runtime errors). Throws only on truly exceptional conditions (invalid
1966
+ * baseUrl shape, fetchImpl contract violation).
1967
+ */
1968
+ declare function preflightLocalLabRole(input: LocalLabPreflightInput, options?: LocalLabPreflightOptions): Promise<LocalLabPreflightResult>;
1969
+ /**
1970
+ * Compose the discovery URL for a provider kind + baseUrl. Never
1971
+ * interpolates into a shell — only into a fetch URL. Tolerant of trailing
1972
+ * slashes and the common `/v1` (OpenAI-compatible) / `/api` (Ollama)
1973
+ * suffixes already being present.
1974
+ */
1975
+ declare function discoveryEndpointFor(provider: LocalLabProviderKind, baseUrl: string): string;
1976
+
1977
+ /**
1978
+ * Local-lab sequential phase scheduler (issue #1573 PR2).
1979
+ *
1980
+ * On a single-GPU lab box (e.g. RTX 3090, 24 GB VRAM) the responder and
1981
+ * judge models cannot be co-resident: the harness runs them in two distinct
1982
+ * phases, with the operator physically swapping which model is loaded
1983
+ * between them. The harness DOES NOT manage model processes — it tells the
1984
+ * operator what to do and waits for them to confirm via the next phase's
1985
+ * endpoint preflight succeeding.
1986
+ *
1987
+ * The scheduler:
1988
+ *
1989
+ * 1. Preflights the phase's endpoint (reachable + serving the manifest
1990
+ * model with enough context — see `preflight.ts`). Hard error on
1991
+ * mismatch; no silent fallback.
1992
+ * 2. Calls the phase's `execute()` callback — the runner supplies the
1993
+ * actual ingest/answer or judge work; the scheduler only sequences.
1994
+ * 3. Between phases hands the `onPhaseHandoff` hook the operator hand-off note from
1995
+ * `manifest.notes.responderToJudgeHandoff`, OR proceeds silently when
1996
+ * the next phase's endpoint is the same baseUrl (Ollama can hot-swap
1997
+ * models within one endpoint, so no physical swap is needed).
1998
+ *
1999
+ * `execute` receives the resolved role so the runner can build the right
2000
+ * provider config without re-reading the manifest. The phase scheduler is
2001
+ * intentionally process-supervision-free per the issue's "do not add
2002
+ * process supervision in v1" instruction.
2003
+ */
2004
+
2005
+ type LocalLabPhaseName = "responder" | "judge" | "embedding";
2006
+ interface LocalLabPhaseDescriptor {
2007
+ name: LocalLabPhaseName;
2008
+ role: LocalLabRoleConfig;
2009
+ }
2010
+ /**
2011
+ * A unit of phase work supplied by the runner. The callback receives the
2012
+ * phase's resolved role so the caller can build its own provider/client
2013
+ * without re-reading the manifest. Returning a value (e.g. judge verdicts)
2014
+ * is supported; the scheduler does not inspect it.
2015
+ */
2016
+ type LocalLabPhaseExecute<T = unknown> = (role: ResolvedLocalLabRole) => Promise<T>;
2017
+ interface LocalLabPhase<T = unknown> {
2018
+ name: LocalLabPhaseName;
2019
+ role: LocalLabRoleConfig;
2020
+ execute: LocalLabPhaseExecute<T>;
2021
+ }
2022
+ interface LocalLabPhaseOutcome<T> {
2023
+ phase: LocalLabPhaseDescriptor;
2024
+ preflight: LocalLabPreflightResult;
2025
+ result: T;
2026
+ }
2027
+ interface SequentialPhaseHooks {
2028
+ /**
2029
+ * Called after a phase's preflight succeeds but before its `execute`.
2030
+ * Useful for logging "phase X starting against <endpoint>".
2031
+ */
2032
+ onPhasePreflight?: (result: LocalLabPreflightResult) => void;
2033
+ /** Called immediately before invoking the phase's execute callback. */
2034
+ onPhaseStart?: (phase: LocalLabPhaseDescriptor) => void;
2035
+ /** Called immediately after a phase's execute resolves. */
2036
+ onPhaseComplete?: <T>(outcome: LocalLabPhaseOutcome<T>) => void;
2037
+ /**
2038
+ * Called between two phases when their endpoints differ. Receives the
2039
+ * manifest hand-off note (or undefined). The scheduler does NOT print to
2040
+ * stdout itself — it delegates to this hook so tests can observe it
2041
+ * without capturing process output.
2042
+ */
2043
+ onPhaseHandoff?: (from: LocalLabPhaseDescriptor, to: LocalLabPhaseDescriptor, note: string | undefined) => void;
2044
+ }
2045
+ interface RunSequentialPhasesOptions {
2046
+ /** Forwarded to each phase's preflight. */
2047
+ preflight?: LocalLabPreflightOptions;
2048
+ hooks?: SequentialPhaseHooks;
2049
+ }
2050
+ /**
2051
+ * Run a sequence of phases against the manifest's declared endpoints, with
2052
+ * preflight + hand-off enforcement between them. Resolves with each phase's
2053
+ * outcome in order, or rejects with a `LocalLabPreflightError`
2054
+ * carrying the failing preflight (which surfaces the endpoint's found model
2055
+ * list per rule 51).
2056
+ *
2057
+ * Phases are required to be supplied in run order; the scheduler does not
2058
+ * reorder them. Empty phase lists are a no-op (the harness intentionally
2059
+ * supports an empty responder/judge pair for preflight-only smoke checks).
2060
+ */
2061
+ declare function runSequentialPhases<T>(manifest: LocalLabManifest, phases: LocalLabPhase<T>[], options?: RunSequentialPhasesOptions): Promise<LocalLabPhaseOutcome<T>[]>;
2062
+ /**
2063
+ * Error thrown when a phase's preflight fails. Carries the failing result
2064
+ * so callers (and tests) can inspect what the endpoint reported.
2065
+ */
2066
+ declare class LocalLabPreflightError extends Error {
2067
+ readonly phase: LocalLabPhaseDescriptor;
2068
+ readonly preflight: LocalLabPreflightResult;
2069
+ readonly phaseIndex: number;
2070
+ constructor(args: {
2071
+ phase: LocalLabPhaseDescriptor;
2072
+ preflight: LocalLabPreflightResult;
2073
+ phaseIndex: number;
2074
+ });
2075
+ }
2076
+ /**
2077
+ * Render the operator hand-off string the scheduler passes to the
2078
+ * `onPhaseHandoff` hook. Returns the manifest note when authored, otherwise
2079
+ * a default instruction naming the next phase and its endpoint. This is a
2080
+ * pure formatting helper — tests assert on it directly.
2081
+ */
2082
+ declare function formatHandoffNote(from: LocalLabPhaseDescriptor, to: LocalLabPhaseDescriptor, manifestNote: string | undefined): string;
2083
+
1726
2084
  type BenchModelSource = "plugin" | "gateway";
1727
2085
  interface ResolveBenchRuntimeProfileOptions {
1728
2086
  runtimeProfile?: BenchRuntimeProfile;
@@ -1754,6 +2112,12 @@ interface ResolveBenchRuntimeProfileOptions {
1754
2112
  drainTimeout?: number;
1755
2113
  max429WaitMs?: number;
1756
2114
  disableThinking?: boolean;
2115
+ /**
2116
+ * Path to a local-lab manifest JSON file (issue #1573 PR2). Required when
2117
+ * `runtimeProfile: "local-lab"`. The manifest pins responder/judge/embedding
2118
+ * to operator-hosted models with temperature=0 and a fixed seed.
2119
+ */
2120
+ localLabManifestPath?: string;
1757
2121
  }
1758
2122
  interface ResolvedBenchRuntimeProfile {
1759
2123
  profile: BenchRuntimeProfile;
@@ -1769,6 +2133,12 @@ interface ResolvedBenchRuntimeProfile {
1769
2133
  systemProvider: ProviderConfig | null;
1770
2134
  judgeProvider: ProviderConfig | null;
1771
2135
  internalProvider: ProviderConfig | null;
2136
+ /**
2137
+ * Resolved local-lab profile (issue #1573 PR2). Present only when
2138
+ * `runtimeProfile: "local-lab"`. Drives sequential phase scheduling +
2139
+ * endpoint preflight in the bench runner.
2140
+ */
2141
+ localLab?: ResolvedLocalLabProfile;
1772
2142
  }
1773
2143
  declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
1774
2144
 
@@ -3188,4 +3558,4 @@ interface MitigatedTargetConfig {
3188
3558
  */
3189
3559
  declare function createMitigatedTarget(config: MitigatedTargetConfig): ExtractionAttackTarget;
3190
3560
 
3191
- export { AMA_BENCH_DIAGNOSTIC_VARIANTS, ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AmaBenchDiagnosticAdapterOptions, type AmaBenchDiagnosticAnswererMode, type AmaBenchDiagnosticBreakdown, type AmaBenchDiagnosticMatrixArtifact, type AmaBenchDiagnosticRecallMode, type AmaBenchDiagnosticRunContext, type AmaBenchDiagnosticTaskEvidence, type AmaBenchDiagnosticTaskRow, type AmaBenchDiagnosticVariant, type AmaBenchDiagnosticVariantSummary, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, type AttackRecallOptions, type AttackRetrievalHit, type AttackerMode, BENCHMARK_ARTIFACT_SCHEMA_VERSION, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_REPRO_MANIFEST_FILENAME, BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BaselineRow, type BaselineScenario, type BeamDatasetPreview, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchReasoningEffort, type BenchRecallOptions, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkArtifact, type BenchmarkArtifactEnvironment, type BenchmarkArtifactPerTaskScore, type BenchmarkArtifactSystem, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type 
BenchmarkReproManifest, type BenchmarkReproManifestDataset, type BenchmarkReproManifestFile, type BenchmarkReproManifestResult, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkArtifactInput, type BuildBenchmarkPublishFeedOptions, type BuildBenchmarkReproManifestOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type CodexCliProviderConfig, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ABLATION_BOOTSTRAP_SEED, DEFAULT_ASSISTANT_RUBRIC_ID, DEFAULT_BASELINE_SCENARIOS, type DatasetSource, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type ExtractionAttackOptions, type ExtractionAttackResult, type ExtractionAttackTarget, type FixtureGenerator, type FixtureOutput, type FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, type HarnessRng, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, LOCOMO_DATASET_FILENAMES, LONG_MEM_EVAL_DATASET_FILENAMES, type LeaderboardArtifactWrite, type LlmJudge, type LlmProvider, type LoadDatasetOptions, type LoadSealedQrelsOptions, type LoadedDataset, type LocalLlmProviderConfig, MEMORY_EVAL_DIMENSIONS, MEMORY_EVAL_PUBLIC_LINE, MITIGATED_BASELINE_SCENARIOS, type MemoryEvalCategory, type MemoryEvalDimension, type MemoryEvalDimensionId, type MemoryEvalMetric, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type 
MitigatedBaselineConfig, type MitigatedTargetConfig, type MultipleChoiceQuestion, OTHER_NAMESPACE_MEMORIES, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, PROCEDURAL_REAL_SCENARIOS, PROCEDURAL_REAL_SCENARIOS_SMOKE, PUBLISHED_BENCHMARK_ARTIFACT_IDS, type PersonalizationRetrievalCase, type ProceduralAblationArtifact, type ProceduralAblationPerCase, type ProceduralAblationScenario, type ProceduralRealScenario, type ProceduralRealScenarioCategory, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, type PublishedBenchmarkId, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RecoveredMemory, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, type RunProceduralAblationCliArgs, type RunProceduralAblationOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, SYNTHETIC_MEMORIES, type SanitizedDiagnosticProvider, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededMemory, type SeededRng, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type SyntheticEmailIngestionAdapterOptions, type SyntheticTargetOptions, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TimelineEntry, type TokenUsage, type WriteBenchmarkArtifactResult, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, 
assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildAmaBenchDiagnosticMatrixArtifact, buildAmaBenchDiagnosticVariantSummary, buildAmaBenchLeaderboardRows, buildBenchmarkArtifact, buildBenchmarkArtifactFilename, buildBenchmarkPublishFeed, buildBenchmarkReproManifest, buildBenchmarkRunSeeds, buildJudgePayload, buildOracleTrajectoryRecall, buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createSeededRng as createAdamSeededRng, createAmaBenchDiagnosticAdapter, createAnthropicProvider, createCanaryAdapter, createCodexCliProvider, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createLocalLlmProvider, createMitigatedTarget, createOllamaProvider, createOpenAiCompatibleProvider, createSeededRandom as createProceduralAblationSeededRandom, createProvider, createProviderBackedAmaBenchRecommendedJudge, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng$1 as createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, createSyntheticEmailIngestionAdapter, createSyntheticTarget, createTimeoutGuardedAdapter, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, emailFixture, entityRecall, exactMatch, extractMarkdownSectionsByTitle, f1Score, fixtureToAblationScenarios, formatMissingDatasetError, generateReport, getBenchmark, getBenchmarkLowerIsBetter, getMemoryEvalDimension, getRemnicVersion, hashBenchmarkArtifact, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isAmaBenchUnknownLikeAnswer, 
isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, listMemoryEvalBenchmarkIds, listMemoryEvalDimensions, llmJudgeScore, llmJudgeScoreDetailed, loadAblationFixture, loadBaseline, loadBeamDatasetPreview, loadBenchmarkArtifact, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadLoCoMo10, loadLongMemEvalS, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseBenchmarkArtifact, parseCustomBenchmark, parseRubricResponse, parseSealedQrels, precisionAtK, projectFolderFixture, recallAtK, redactBenchmarkResultSecrets, renderBaselineMarkdown, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkPhaseTimeoutMs, resolveBenchmarkProgressLogging, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, runAssistantNextBestActionBenchmark, runAssistantSynthesisBenchmark, runBaseline, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runExtractionAttack, runMitigatedBaseline, runProceduralAblation, runProceduralAblationCli, runSealedJudge, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectAmaBenchDiagnosticVariants, selectFixtureVariant, serializeBenchmarkArtifact, serializeJsonl, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkArtifact, writeBenchmarkPublishFeed, writeBenchmarkReproManifest, writeBenchmarkResult, writeLeaderboardArtifactsForResult, zeroScores };
3561
+ export { AMA_BENCH_DIAGNOSTIC_VARIANTS, ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AmaBenchDiagnosticAdapterOptions, type AmaBenchDiagnosticAnswererMode, type AmaBenchDiagnosticBreakdown, type AmaBenchDiagnosticMatrixArtifact, type AmaBenchDiagnosticRecallMode, type AmaBenchDiagnosticRunContext, type AmaBenchDiagnosticTaskEvidence, type AmaBenchDiagnosticTaskRow, type AmaBenchDiagnosticVariant, type AmaBenchDiagnosticVariantSummary, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, type AttackRecallOptions, type AttackRetrievalHit, type AttackerMode, BENCHMARK_ARTIFACT_SCHEMA_VERSION, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_REPRO_MANIFEST_FILENAME, BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BaselineRow, type BaselineScenario, type BeamDatasetPreview, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchReasoningEffort, type BenchRecallOptions, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkArtifact, type BenchmarkArtifactEnvironment, type BenchmarkArtifactPerTaskScore, type BenchmarkArtifactSystem, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type 
BenchmarkReproManifest, type BenchmarkReproManifestDataset, type BenchmarkReproManifestFile, type BenchmarkReproManifestResult, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkArtifactInput, type BuildBenchmarkPublishFeedOptions, type BuildBenchmarkReproManifestOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type CodexCliProviderConfig, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ABLATION_BOOTSTRAP_SEED, DEFAULT_ASSISTANT_RUBRIC_ID, DEFAULT_BASELINE_SCENARIOS, type DatasetSource, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type ExtractionAttackOptions, type ExtractionAttackResult, type ExtractionAttackTarget, type FixtureGenerator, type FixtureOutput, type FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, type HarnessRng, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, LOCAL_LAB_PROVIDER_KINDS, LOCOMO_DATASET_FILENAMES, LONG_MEM_EVAL_DATASET_FILENAMES, type LeaderboardArtifactWrite, type LlmJudge, type LlmProvider, type LoadDatasetOptions, type LoadSealedQrelsOptions, type LoadedDataset, type LocalLabManifest, type LocalLabManifestNotes, type LocalLabPhase, type LocalLabPhaseDescriptor, type LocalLabPhaseExecute, type LocalLabPhaseName, type LocalLabPhaseOutcome, LocalLabPreflightError, type LocalLabPreflightFailure, type LocalLabPreflightInput, type LocalLabPreflightOptions, 
type LocalLabPreflightResult, type LocalLabPreflightSuccess, type LocalLabProviderKind, type LocalLabRoleConfig, type LocalLlmProviderConfig, MEMORY_EVAL_DIMENSIONS, MEMORY_EVAL_PUBLIC_LINE, MITIGATED_BASELINE_SCENARIOS, type MemoryEvalCategory, type MemoryEvalDimension, type MemoryEvalDimensionId, type MemoryEvalMetric, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type MitigatedBaselineConfig, type MitigatedTargetConfig, type MultipleChoiceQuestion, OTHER_NAMESPACE_MEMORIES, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, PROCEDURAL_REAL_SCENARIOS, PROCEDURAL_REAL_SCENARIOS_SMOKE, PUBLISHED_BENCHMARK_ARTIFACT_IDS, type PersonalizationRetrievalCase, type PreflightDiscoveredModel, type ProceduralAblationArtifact, type ProceduralAblationPerCase, type ProceduralAblationScenario, type ProceduralRealScenario, type ProceduralRealScenarioCategory, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, type PublishedBenchmarkId, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RecoveredMemory, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedLocalLabProfile, type ResolvedLocalLabRole, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, type RunProceduralAblationCliArgs, type RunProceduralAblationOptions, type RunSequentialPhasesOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, SYNTHETIC_MEMORIES, type SanitizedDiagnosticProvider, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type 
SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededMemory, type SeededRng, type SequentialPhaseHooks, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type SyntheticEmailIngestionAdapterOptions, type SyntheticTargetOptions, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TimelineEntry, type TokenUsage, type WriteBenchmarkArtifactResult, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildAmaBenchDiagnosticMatrixArtifact, buildAmaBenchDiagnosticVariantSummary, buildAmaBenchLeaderboardRows, buildBenchmarkArtifact, buildBenchmarkArtifactFilename, buildBenchmarkPublishFeed, buildBenchmarkReproManifest, buildBenchmarkRunSeeds, buildJudgePayload, buildOracleTrajectoryRecall, buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createSeededRng as createAdamSeededRng, createAmaBenchDiagnosticAdapter, createAnthropicProvider, createCanaryAdapter, createCodexCliProvider, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createLocalLlmProvider, createMitigatedTarget, createOllamaProvider, createOpenAiCompatibleProvider, createSeededRandom as createProceduralAblationSeededRandom, createProvider, createProviderBackedAmaBenchRecommendedJudge, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng$1 as createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, 
createSyntheticEmailIngestionAdapter, createSyntheticTarget, createTimeoutGuardedAdapter, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, discoveryEndpointFor, emailFixture, entityRecall, exactMatch, extractMarkdownSectionsByTitle, f1Score, fixtureToAblationScenarios, formatHandoffNote, formatMissingDatasetError, generateReport, getBenchmark, getBenchmarkLowerIsBetter, getMemoryEvalDimension, getRemnicVersion, hashBenchmarkArtifact, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isAmaBenchUnknownLikeAnswer, isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, listMemoryEvalBenchmarkIds, listMemoryEvalDimensions, llmJudgeScore, llmJudgeScoreDetailed, loadAblationFixture, loadBaseline, loadBeamDatasetPreview, loadBenchmarkArtifact, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadLoCoMo10, loadLocalLabManifest, loadLongMemEvalS, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseBenchmarkArtifact, parseCustomBenchmark, parseLocalLabManifest, parseRubricResponse, parseSealedQrels, precisionAtK, preflightLocalLabRole, projectFolderFixture, recallAtK, redactBenchmarkResultSecrets, renderBaselineMarkdown, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkPhaseTimeoutMs, resolveBenchmarkProgressLogging, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveLocalLabProfile, resolveLocalLabRole, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, 
runAssistantNextBestActionBenchmark, runAssistantSynthesisBenchmark, runBaseline, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runExtractionAttack, runMitigatedBaseline, runProceduralAblation, runProceduralAblationCli, runSealedJudge, runSequentialPhases, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectAmaBenchDiagnosticVariants, selectFixtureVariant, serializeBenchmarkArtifact, serializeJsonl, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkArtifact, writeBenchmarkPublishFeed, writeBenchmarkReproManifest, writeBenchmarkResult, writeLeaderboardArtifactsForResult, zeroScores };