@remnic/bench 9.3.681 → 9.3.683
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +372 -2
- package/dist/index.js +605 -53
- package/package.json +6 -3
- package/profiles/README.md +113 -0
- package/profiles/local-lab-3090.json +39 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@remnic/bench",
|
|
3
|
-
"version": "9.3.681",
|
|
3
|
+
"version": "9.3.683",
|
|
4
4
|
"description": "Retrieval latency ladder benchmarks + CI regression gates for @remnic/core",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
},
|
|
13
13
|
"./baselines/*": "./baselines/*",
|
|
14
14
|
"./baselines/*.json": "./baselines/*.json",
|
|
15
|
+
"./profiles/*": "./profiles/*",
|
|
16
|
+
"./profiles/*.json": "./profiles/*.json",
|
|
15
17
|
"./package.json": "./package.json"
|
|
16
18
|
},
|
|
17
19
|
"license": "MIT",
|
|
@@ -34,7 +36,7 @@
|
|
|
34
36
|
"dependencies": {
|
|
35
37
|
"hyparquet": "^1.25.7",
|
|
36
38
|
"yaml": "^2.4.2",
|
|
37
|
-
"@remnic/core": "^9.3.681"
|
|
39
|
+
"@remnic/core": "^9.3.683"
|
|
38
40
|
},
|
|
39
41
|
"devDependencies": {
|
|
40
42
|
"tsup": "^8.5.1",
|
|
@@ -42,7 +44,8 @@
|
|
|
42
44
|
},
|
|
43
45
|
"files": [
|
|
44
46
|
"dist",
|
|
45
|
-
"baselines"
|
|
47
|
+
"baselines",
|
|
48
|
+
"profiles"
|
|
46
49
|
],
|
|
47
50
|
"scripts": {
|
|
48
51
|
"prebuild": "node ../../scripts/ensure-bench-build-deps.mjs",
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# Local-lab runtime profiles
|
|
2
|
+
|
|
3
|
+
This directory ships reference JSON manifests for the **`local-lab`** bench
|
|
4
|
+
runtime profile (issue #1573 PR2). A local-lab profile pins responder,
|
|
5
|
+
judge, and (optionally) embedding to operator-hosted models so the bench
|
|
6
|
+
runs entirely on local hardware — no paid API calls, no rate-limit cliffs.
|
|
7
|
+
|
|
8
|
+
These manifests are **templates**. They use placeholder model ids on
|
|
9
|
+
purpose: the operator must replace them with the ids their endpoint
|
|
10
|
+
actually reports (`GET /v1/models` for OpenAI-compatible servers,
|
|
11
|
+
`GET /api/tags` for Ollama) before running. The bench refuses to start a
|
|
12
|
+
phase whose preflight cannot find the manifest model id (rule 51 — no
|
|
13
|
+
silent fallback).
|
|
14
|
+
|
|
15
|
+
## Schema
|
|
16
|
+
|
|
17
|
+
|Field|Type|Meaning|
|
|
18
|
+
|---|---|---|
|
|
19
|
+
|`profile`|`"local-lab"`|Discriminator. Required.|
|
|
20
|
+
|`responder`|role|Answering model (must report this id at the endpoint).|
|
|
21
|
+
|`judge`|role|Scoring model. Runs in the judge phase after the responder.|
|
|
22
|
+
|`embedding`|role?|Optional embedding model for retrieval indexing.|
|
|
23
|
+
|`phases`|`"sequential"`|Phase scheduling mode. PR2 ships `sequential` only.|
|
|
24
|
+
|`notes.responderToJudgeHandoff`|string?|Operator guidance for multi-endpoint swaps. PR2 ships same-endpoint profiles only (multi-endpoint sequential execution is PR3 scope).|
|
|
25
|
+
|
|
26
|
+
Each role has shape:
|
|
27
|
+
|
|
28
|
+
|Field|Type|Meaning|
|
|
29
|
+
|---|---|---|
|
|
30
|
+
|`provider`|`"openai-compatible"` \| `"ollama"`|Transport the bench uses for preflight + completion.|
|
|
31
|
+
|`baseUrl`|string|Endpoint URL. Fetch target only — never interpolated into a shell.|
|
|
32
|
+
|`model`|string|Exact id the endpoint reports. No aliases.|
|
|
33
|
+
|`quantization`|string?|Informational; recorded in artifacts.|
|
|
34
|
+
|`ctx`|positive int|Manifest-declared serving context (tokens). Preflight verifies the live endpoint reports at least this much.|
|
|
35
|
+
|`temperature`|`0`|Pinned to 0 for reproducibility. Non-zero is rejected.|
|
|
36
|
+
|`seed`|int|Sampling seed; required so reruns reproduce the same draws.|
|
|
37
|
+
|
|
38
|
+
## `local-lab-3090.json`
|
|
39
|
+
|
|
40
|
+
Reference profile targeting a single-GPU lab box. The numbers below come
|
|
41
|
+
from issue #1573's *Hardware envelope* section; they are VRAM math, not
|
|
42
|
+
performance claims, and are reproduced here so the manifest's defaults
|
|
43
|
+
can be checked against the design constraints.
|
|
44
|
+
|
|
45
|
+
### Hardware envelope (RTX 3090, 24 GB VRAM, Ampere; 256 GB RAM, NVMe)
|
|
46
|
+
|
|
47
|
+
|Class|Rough VRAM @ 4-bit|Practical serving ctx|Notes|
|
|
48
|
+
|---|---|---|---|
|
|
49
|
+
|~14B dense (e.g. Qwen 2.5 14B; a step up from Llama 3 8B-class)|9–10 GB|32k ctx (with KV-cache headroom)|Comfortably fits; responder-only or judge-only.|
|
|
50
|
+
|~24–32B dense or ~30B-class MoE|14–19 GB|≤16k ctx (tight)|Responder OR judge, not both; KV budget tight.|
|
|
51
|
+
|Embedding model (e.g. BGE-M3, Qwen 3 embedding)|1–3 GB|8k ctx|Fits alongside one of the above on the same GPU in some setups.|
|
|
52
|
+
|
|
53
|
+
Constraints the profile respects:
|
|
54
|
+
|
|
55
|
+
- **24 GB VRAM fits one serious model at a time** — responder and judge
|
|
56
|
+
MUST run in sequential phases, never co-resident.
|
|
57
|
+
- **Ampere: bf16 OK, no FP8.** Prefer AWQ/GPTQ-int4 or GGUF Q4_K_M
|
|
58
|
+
quantizations.
|
|
59
|
+
- Remnic's memory-based answering keeps prompts small (that is the
|
|
60
|
+
product); LongMemEval_S's ~115k-token histories are ingested through
|
|
61
|
+
observe/extraction, not stuffed in context — so **16–32k serving context
|
|
62
|
+
is sufficient**.
|
|
63
|
+
- **256 GB RAM**: model files stay in page cache; phase swaps from NVMe
|
|
64
|
+
are seconds, not minutes. The operator can hot-swap models between
|
|
65
|
+
phases without long stalls.
|
|
66
|
+
|
|
67
|
+
### Defaults chosen in this manifest
|
|
68
|
+
|
|
69
|
+
The reference manifest pins both `responder` and `judge` to a **16 384
|
|
70
|
+
token** serving context. That value:
|
|
71
|
+
|
|
72
|
+
- Lies inside the 16–32k envelope above.
|
|
73
|
+
- Sits at (not above) the "tight" ≤16k ceiling for 24–32B dense models.
|
|
74
|
+
- Matches what GGUF Q4_K_M builds of mainstream ~14B / ~30B-class models
|
|
75
|
+
expose out of the box, so an operator swapping in their own model id is
|
|
76
|
+
unlikely to need to lower `ctx`.
|
|
77
|
+
|
|
78
|
+
`temperature: 0` and a fixed `seed: 1573` make reruns reproducible (the
|
|
79
|
+
manifest parser rejects non-zero temperatures and missing seeds).
|
|
80
|
+
|
|
81
|
+
The shipped profile pins all roles to a single Ollama endpoint
|
|
82
|
+
(`http://127.0.0.1:11434`). On a single 3090 the operator runs one
|
|
83
|
+
`ollama serve` instance and hot-swaps models between phases — no manual
|
|
84
|
+
server restart is needed. **Multi-endpoint manifests** (responder and
|
|
85
|
+
judge on different URLs) are not yet supported by the benchmark runner:
|
|
86
|
+
the published harness executes recall→answer→judge per trial, which
|
|
87
|
+
requires both models to be co-resident. Full sequential phase execution
|
|
88
|
+
(answer-all-then-judge-all with an operator swap between) is tracked for
|
|
89
|
+
PR3's calibration work.
|
|
90
|
+
|
|
91
|
+
## Using a profile
|
|
92
|
+
|
|
93
|
+
Resolution goes through `resolveBenchRuntimeProfile` (see
|
|
94
|
+
`packages/bench/src/runtime-profiles.ts`); the resolver turns each role
|
|
95
|
+
into a `ProviderConfig` with `temperature` and `seed` forwarded verbatim.
|
|
96
|
+
At run time, the CLI preflights both responder and judge endpoints before
|
|
97
|
+
any trial starts. Neither layer manages model processes — that is the
|
|
98
|
+
operator's responsibility. The sequential phase scheduler
|
|
99
|
+
(`runSequentialPhases`) is implemented and tested but not yet wired into
|
|
100
|
+
the benchmark trial loop (PR3 calibration scope).
|
|
101
|
+
|
|
102
|
+
```ts
|
|
103
|
+
import {
|
|
104
|
+
loadLocalLabManifest,
|
|
105
|
+
resolveLocalLabProfile,
|
|
106
|
+
runSequentialPhases,
|
|
107
|
+
} from "@remnic/bench";
|
|
108
|
+
|
|
109
|
+
const manifest = await loadLocalLabManifest("packages/bench/profiles/local-lab-3090.json");
|
|
110
|
+
const profile = resolveLocalLabProfile(manifest);
|
|
111
|
+
// profile.responder.providerConfig.temperature === 0
|
|
112
|
+
// profile.responder.providerConfig.seed === 1573
|
|
113
|
+
```
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"profile": "local-lab",
|
|
3
|
+
"responder": {
|
|
4
|
+
"provider": "ollama",
|
|
5
|
+
"baseUrl": "http://127.0.0.1:11434",
|
|
6
|
+
"model": "PLACEHOLDER-RESPONDER-MODEL",
|
|
7
|
+
"quantization": "Q4_K_M",
|
|
8
|
+
"ctx": 16384,
|
|
9
|
+
"temperature": 0,
|
|
10
|
+
"seed": 1573
|
|
11
|
+
},
|
|
12
|
+
"judge": {
|
|
13
|
+
"provider": "ollama",
|
|
14
|
+
"baseUrl": "http://127.0.0.1:11434",
|
|
15
|
+
"model": "PLACEHOLDER-JUDGE-MODEL",
|
|
16
|
+
"quantization": "Q4_K_M",
|
|
17
|
+
"ctx": 16384,
|
|
18
|
+
"temperature": 0,
|
|
19
|
+
"seed": 1573
|
|
20
|
+
},
|
|
21
|
+
"embedding": {
|
|
22
|
+
"provider": "ollama",
|
|
23
|
+
"baseUrl": "http://127.0.0.1:11434",
|
|
24
|
+
"model": "PLACEHOLDER-EMBEDDING-MODEL",
|
|
25
|
+
"ctx": 8192,
|
|
26
|
+
"temperature": 0,
|
|
27
|
+
"seed": 1573
|
|
28
|
+
},
|
|
29
|
+
"phases": "sequential",
|
|
30
|
+
"notes": {
|
|
31
|
+
"responderToJudgeHandoff": "Both responder and judge models run on the same Ollama instance (hot-swap). Ensure both PLACEHOLDER-RESPONDER-MODEL and PLACEHOLDER-JUDGE-MODEL have been fetched with `ollama pull` before starting the bench.",
|
|
32
|
+
"hardware": {
|
|
33
|
+
"gpu": "NVIDIA RTX 3090",
|
|
34
|
+
"vramGb": 24,
|
|
35
|
+
"ramGb": 256,
|
|
36
|
+
"ampereBf16Ok": true
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|