opencode-lmstudio-warm 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,47 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+ While the version is `0.x`, a MINOR bump may include breaking changes (per the
8
+ SemVer 0.x rule); such changes are called out explicitly below.
9
+
10
+ ## [Unreleased]
11
+
12
+ ## [0.1.0] - 2026-07-03
13
+
14
+ Initial public release.
15
+
16
+ ### Added
17
+
18
+ - `lmstudio-warm` opencode plugin (`src/index.ts`): a deterministic pre-warm
19
+ gate on the awaited `chat.params` hook that guarantees the target LM Studio
20
+ model is addressable before every request, healing cold JIT loads,
21
+ "no model loaded" errors, and mid-session idle-TTL evictions.
22
+ - Background eager warm of `model` + `small_model` at instance start.
23
+ - Cross-process `mkdir` mutex with dead-holder liveness detection so parallel
24
+ opencode workers never race `lms load` (no `:2` duplicate instances).
25
+ - Configurable via `~/.config/opencode/lmstudio-warm.json` or plugin options:
26
+ `providers`, `ttlSeconds`, `parallel`, `contextLength`, `perModel`,
27
+ `verifyCacheMs`, `retryCooldownMs`, `failMode`, `reconcileDuplicates`,
28
+ `eager`, and more.
29
+ - Three install paths: npm package, single-file copy, and project-local.
30
+ - A live E2E fixture under `test/e2e/` — a 9-check harness (cold load /
31
+ eviction heal / thundering herd / orphaned-duplicate reconcile).
32
+ - Vitest unit tests (`test/`) for the exported pure logic: config merge,
33
+ model-ref parsing, load-arg building, addressability, pid liveness, and
34
+ fail-mode decisions.
35
+ - Reference configs under `examples/`.
36
+
37
+ ### Fixed
38
+
39
+ - Cross-process lock leak in the fire-and-forget eager-warm path: a one-shot
40
+ `opencode run` exiting mid-load could leave the mkdir lock held by a dead pid,
41
+ stalling the next worker up to ~18.5 min. `acquireLock` now breaks a contended
42
+ lock immediately when its holder pid is dead (or the pid file is absent past a
43
+ grace window), the release is synchronous, and a `process.on("exit")` handler
44
+ is a last-resort cleanup. Verified 9/9 against a live LM Studio fleet.
45
+
46
+ [Unreleased]: https://github.com/diegomarino/opencode-lmstudio-warm/compare/v0.1.0...HEAD
47
+ [0.1.0]: https://github.com/diegomarino/opencode-lmstudio-warm/releases/tag/v0.1.0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Diego Marino
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,301 @@
1
+ # opencode-lmstudio-warm
2
+
3
+ Deterministic model pre-warm for **opencode + LM Studio**.
4
+
5
+ ![Quick start: install the plugin, LM Studio starts cold, the first opencode run warms the model before the request leaves, and lms ps shows both models resident with no TTL](https://github.com/user-attachments/assets/f5522cb6-7967-4f47-a8c5-ca617a8d736a)
6
+
7
+ <sup>Scripted demo (`tools/quickstart/generate-cast.py`) — every output line captured verbatim from a real run; the cold-load wait is shortened, and its spinner visualizes the plugin's background `lms load` (opencode itself waits silently).</sup>
8
+
9
+ A dependency-free opencode plugin that **guarantees your LM Studio model is
10
+ loaded and addressable _before_ any request leaves opencode**.
11
+
12
+ If you point
13
+ opencode at LM Studio, it fixes three failures you have probably already met:
14
+
15
+ - **First request hangs** — the model is cold and JIT-loads while your request waits.
16
+ - **`"no model loaded"` errors** — JIT is off and nothing loads the model for you.
17
+ - **Mid-session breakage** — LM Studio's idle TTL evicted the model between two messages.
18
+
19
+ Per request, the plugin checks that the model is actually loaded and, when it
20
+ isn't, performs exactly one `lms load` (even across parallel sessions) before
21
+ letting the request through.
22
+
23
+ Verified against opencode **v1.17.10** and the local LM Studio + `lms` CLI on
24
+ macOS/Apple Silicon (see [`test/e2e/verify.sh`](./test/e2e/verify.sh), 9/9 passing).
25
+
26
+ ## Quick start
27
+
28
+ **1. Install and register the plugin** — one command; opencode resolves it from
29
+ npm and adds it to your config's `plugin` array:
30
+
31
+ ```bash
32
+ opencode plugin -g opencode-lmstudio-warm # global (~/.config/opencode) — every session on the machine
33
+ # or, for a single project's opencode.json:
34
+ opencode plugin opencode-lmstudio-warm
35
+ ```
36
+
37
+ **2. Point opencode at LM Studio** (skip if you already have an `lmstudio`
38
+ provider). In `~/.config/opencode/opencode.json`:
39
+
40
+ ```jsonc
41
+ {
42
+ "plugin": ["opencode-lmstudio-warm"],
43
+ "provider": {
44
+ "lmstudio": {
45
+ "npm": "@ai-sdk/openai-compatible",
46
+ "options": {
47
+ "baseURL": "http://127.0.0.1:1234/v1",
48
+ "apiKey": "{env:LM_API_TOKEN}",
49
+ "headerTimeout": 600000,
50
+ "chunkTimeout": 120000
51
+ }
52
+ }
53
+ }
54
+ }
55
+ ```
56
+
57
+ Then set your `model` / `small_model` to your LM Studio model keys. See
58
+ [`examples/opencode.json`](./examples/opencode.json) for a fuller starting point.
59
+
60
+ **3. Adjust LM Studio once** (App Settings → Developer): disable
61
+ **JIT model auto-unload TTL** and **unload previous JIT model on load**; keep
62
+ JIT itself on as a fallback. ([Why these matter →](#how-it-works))
63
+
64
+ That's it — from your next opencode session, the model is warm before the
65
+ first token is requested.
66
+
67
+ ## Install options
68
+
69
+ All three paths load the same plugin — pick the one that fits:
70
+
71
+ | Path | Best for |
72
+ |------|----------|
73
+ | [npm](#npm-recommended) (recommended) | Most users and fleets — version-pinned, one-line updates |
74
+ | [Single-file copy](#single-file-copy-offline-fleet-wide) | Offline machines |
75
+ | [Project-local](#project-local) | Hacking on the plugin itself |
76
+
77
+ ### npm (recommended)
78
+
79
+ The Quick start command above is all you need. Notes:
80
+
81
+ - You don't run `npm install` / `bun add` yourself, and there's no `npx` step —
82
+ opencode imports the module and auto-installs any plugin named in your config
83
+ at startup, so hand-adding `"opencode-lmstudio-warm"` to the `plugin` array
84
+ works too.
85
+ - Use `-f` to force a version bump.
86
+
87
+ **Scriptable setup** — for fleets or automation, this `jq` one-shot registers
88
+ the plugin *and* scaffolds the provider with recommended timeouts. It is
89
+ idempotent and non-destructive: keeps your existing plugins, provider, and
90
+ models, and never overwrites options you've set.
91
+
92
+ ```bash
93
+ CFG=~/.config/opencode/opencode.json # or ./opencode.json for a single project
94
+ [ -f "$CFG" ] || echo '{}' > "$CFG"
95
+ jq '
96
+ .plugin = ((.plugin // []) - ["opencode-lmstudio-warm"] + ["opencode-lmstudio-warm"])
97
+ | .provider.lmstudio.npm //= "@ai-sdk/openai-compatible"
98
+ | .provider.lmstudio.options.baseURL //= "http://127.0.0.1:1234/v1"
99
+ | .provider.lmstudio.options.apiKey //= "{env:LM_API_TOKEN}"
100
+ | .provider.lmstudio.options.headerTimeout //= 600000
101
+ | .provider.lmstudio.options.chunkTimeout //= 120000
102
+ ' "$CFG" > "$CFG.tmp" && mv "$CFG.tmp" "$CFG"
103
+ ```
104
+
105
+ ### Single-file copy (offline, fleet-wide)
106
+
107
+ ```bash
108
+ mkdir -p ~/.config/opencode/plugin
109
+ cp src/index.ts ~/.config/opencode/plugin/lmstudio-warm.ts
110
+ ```
111
+
112
+ Auto-discovered by every opencode session on the machine. (opencode's docs spell
113
+ this directory `plugins`; verified as `plugin/` — singular — on v1.17.10.)
114
+
115
+ ### Project-local
116
+
117
+ Scope the plugin to a single project by copying `src/index.ts` into that
118
+ project's `.opencode/plugin/lmstudio-warm.ts` — opencode auto-discovers it there
119
+ for that project only. (This repo's own E2E fixture uses exactly this mechanism;
120
+ see `test/e2e/`.)
121
+
122
+ Whichever path you pick, also apply the LM Studio GUI settings from
123
+ [Quick start](#quick-start) step 3 on every machine. The provider timeouts
124
+ (`headerTimeout` / `chunkTimeout`) are defense-in-depth and are already set by
125
+ the JSON/`jq` above.
126
+
127
+ ## Configuration
128
+
129
+ The plugin works with zero configuration. Optional tuning lives in
130
+ `~/.config/opencode/lmstudio-warm.json` (or inline as
131
+ `"plugin": [["opencode-lmstudio-warm", {...}]]`): `providers`,
132
+ `ttlSeconds`, `parallel` (size ≈ concurrent fleet width; overflow queues
133
+ server-side), `contextLength`,
134
+ `perModel: { "<key>": { parallel, ttlSeconds, contextLength } }`,
135
+ `verifyCacheMs`, `retryCooldownMs`, `failMode` (`hybrid` default: confirmed
136
+ failures fail the request with a clear error; ambiguous lock contention
137
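+ proceeds fail-open), `reconcileDuplicates`, `launchAppFallback`, `eager`, `logFile`, `lockDir`, `lmsPath`, `baseURL`, and the timeouts `loadTimeoutMs`, `serverStartTimeoutMs`, and `lockWaitTimeoutMs`.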
138
+
139
+ Log file (default; override with `logFile`): `~/.cache/opencode/lmstudio-warm.log`.
140
+
141
+ See `examples/lmstudio-warm.json` for a fleet-tuned starting point
142
+ (`cp examples/lmstudio-warm.json ~/.config/opencode/lmstudio-warm.json`).
143
+ `perModel` keys are LM Studio model keys — the exact string opencode sends as
144
+ the API `model` field. Sizing `parallel`: set it to the expected number of
145
+ concurrent workers hitting that model; each slot costs extra KV-cache memory,
146
+ and overflow requests queue server-side (latency, not failure), so
147
+ undersizing is safe and oversizing wastes VRAM. Titles/summaries on the small
148
+ model tolerate queueing; the main model is where fleet width matters.
149
+
150
+ ## Verify
151
+
152
+ A live, self-contained E2E fixture lives in [`test/e2e/`](./test/e2e/) — set two
153
+ real LM Studio model keys and run it:
154
+
155
+ ```bash
156
+ MAIN="your/main-model" SMALL="your-small-model" bun run e2e
157
+ # requires jq, lms, opencode + a running LM Studio; export LM_API_TOKEN for full E2E
158
+ ```
159
+
160
+ Covers: (a) cold spawn loads before the first request; (b) mid-session
161
+ eviction healed on resume (`opencode run -c`); (c) 3 parallel cold spawns →
162
+ exactly one `lms load`, no `:2` duplicates; (d) orphaned `:2`-only state is
163
+ reconciled back to an addressable instance. See
164
+ [`test/e2e/README.md`](./test/e2e/README.md) for setup and the placeholders to edit.
165
+
166
+ > ⚠️ It mutates live LM Studio state (unloads/loads models, spawns parallel
167
+ > sessions) — run it on a dev machine, not a busy fleet.
168
+
169
+ ## Uninstall / rollback
170
+
171
+ For the npm install path, remove `"opencode-lmstudio-warm"` from the `plugin`
172
+ array in `opencode.json`. For the file-copy paths:
173
+
174
+ ```bash
175
+ rm ~/.config/opencode/plugin/lmstudio-warm.ts # removes the gate everywhere
176
+ rm -f ~/.config/opencode/lmstudio-warm.json # optional tuning file
177
+ rm -rf ~/.cache/opencode/lmstudio-warm.lock # only if a stale lock lingers
178
+ ```
179
+
180
+ Models loaded by the plugin have no TTL, so after uninstalling they stay
181
+ resident until `lms unload <key>` or an LM Studio restart. The `opencode.json`
182
+ timeout options and the LM Studio GUI settings are independent of the plugin
183
+ and can stay.
184
+
185
+ ## How it works
186
+
187
+ ### The three layers
188
+
189
+ 1. **Plugin (primary, deterministic)** — `src/index.ts`.
190
+ Per request: verified-cache (30 s) → `lms ps --json` addressability check →
191
+ cross-process `mkdir` lock → double-checked re-check → orphan-duplicate
192
+ reconciliation → `lms load <key> -y` (no `--ttl` ⇒ resident indefinitely,
193
+ `ttlMs: null` verified) → post-load verification. Plus a background eager
194
+ warm of `model` + `small_model` at instance start (`config` hook).
195
+ 2. **LM Studio server settings (independent)** — in the GUI (App Settings →
196
+ Developer): disable **JIT model auto-unload TTL** (`jitModelTTL`, the 1 h
197
+ eviction that stalls long sessions) and **unload previous JIT model on load**
198
+ (`unloadPreviousJITModelOnLoad` — otherwise a JIT load of one model can
199
+ evict the other). Keys live in `~/.lmstudio/settings.json` under
200
+ `developer.*` (edit only while the app is closed). Keep JIT **on** as a
201
+ fallback; keep server autostart on.
202
+ 3. **opencode timeouts (defense-in-depth)** — v1.17.10 honors undocumented
203
+ provider options `timeout`, `headerTimeout`, `chunkTimeout`
204
+ (`provider.ts:resolveSDK`). Default is NO timeout at all (infinite hang
205
+ possible). `opencode.json` here sets `headerTimeout: 600000` (tolerates
206
+ queueing behind busy parallel slots) and `chunkTimeout: 120000` (converts a
207
+ wedged stream into a visible, bounded error).
208
+
209
+ ### Why a plugin is the right layer (design decision)
210
+
211
+ Investigated against the v1.17.10 source (tag clone), not docs:
212
+
213
+ - The `chat.params` hook is **awaited** (`yield* plugin.trigger("chat.params", ...)`
214
+ in `session/llm/request.ts`) before every request is built and sent, and it
215
+ fires for **every** stream — including `small: true` title/summary requests.
216
+ One hook deterministically gates BOTH pinned models, per request, which is
217
+ what heals mid-session eviction (an orchestrator pre-warm only helps at
218
+ spawn time).
219
+ - Plugins run in-process under Bun and can spawn `lms` (a blocking, exit-code
220
+ deterministic load barrier).
221
+ - The `event` hook is dispatched fire-and-forget (`void hook.event?.(...)`) —
222
+ it can NOT gate. The v2 `ctx.aisdk.sdk` custom-fetch API is **types-only** in
223
+ v1.17.10 (nothing in core imports it) — so the custom-fetch approach
224
+ considered earlier is ruled out for this release.
225
+ - A plugin dropped in `~/.config/opencode/plugin/` is auto-discovered by every
226
+ worker on the machine — one file distributes fleet-wide and also covers
227
+ manually launched sessions.
228
+
229
+ Tradeoff vs. an orchestrator pre-warm node: the plugin costs one
230
+ `lms ps --json` (~150 ms) per model per 30 s per process at steady state; the
231
+ orchestrator node is simpler but only covers spawn time and only sessions it
232
+ spawns. Keep the orchestrator node, if you add one, as belt-and-suspenders —
233
+ it is not required.
234
+
235
+ ## Known limitations / failure modes
236
+
237
+ - **30 s verified-cache window**: an external unload (GUI, crash) within 30 s
238
+ of a positive check can slip one request through; it errors visibly and the
239
+ next request heals. There is no error hook in v1.17.10 to invalidate the
240
+ cache on failure.
241
+ - **`lms ps` cannot signal "loading"** (measured: a loading instance shows
242
+ `status: "idle"` at ~200 ms into a 12.5 s load). A waiter can pass the gate
243
+ mid-load; LM Studio queues its request until weights are ready (verified) —
244
+ a short wait, not a failure.
245
+ - **External JIT loads race**: a non-gated client can still trigger JIT
246
+ duplicates/evictions. Mitigated by Layer 2 settings; gate all fleet clients.
247
+ - **`unloadPreviousJITModelOnLoad` scope for explicit loads is assumed exempt**
248
+ (evidence: explicit loads carry `ttlMs: null` vs JIT's TTL, so bookkeeping
249
+ differs). Confirm by JIT-loading a third model via API while both pinned
250
+ models are resident, then `lms ps`. Disabling the setting (Layer 2) makes
251
+ this moot.
252
+ - **LM Studio app fully closed**: `lms server start` + `open -ga "LM Studio"`
253
+ fallback is implemented but untested here (the app was running). Confirm:
254
+ quit LM Studio → run one worker → check the log.
255
+ - **Memory guardrails**: if LM Studio's guardrail refuses a load, the plugin
256
+ fails that request with a clear error and cools down 60 s (no load storm) —
257
+ it cannot free VRAM for you.
258
+ - **API auth**: the plugin itself never needs `LM_API_TOKEN` (lms + probe are
259
+ auth-independent); workers still need it for generation when auth is on.
260
+
261
+ ## Running under an orchestrator (e.g. ao-lite)
262
+
263
+ No orchestrator changes are required — workers inherit the plugin from
264
+ `~/.config/opencode/plugin/` and warm themselves. Two optional touches:
265
+ export `LM_API_TOKEN` in the worker environment (the plugin itself never needs
266
+ it), and if you want spawn-time belt-and-suspenders, a pre-warm node only needs:
267
+ `lms ps --json` guard → `lms load <key> -y` — the same logic, but remember it
268
+ cannot heal mid-session evictions; the plugin does.
269
+
270
+ ## Development
271
+
272
+ The plugin is a single file with **no runtime dependencies** (its only import,
273
+ `@opencode-ai/plugin`, is `import type` and erased at build time). The root
274
+ `package.json` pulls that type package and `@types/node` as devDependencies so
275
+ you can type-check locally:
276
+
277
+ ```bash
278
+ bun install
279
+ bun run typecheck # tsc --strict, 0 errors
280
+ bun run test # vitest unit tests for the pure logic (test/)
281
+ bun run check # typecheck + tests + shellcheck
282
+ bun run e2e # live E2E fixture (needs LM Studio; see test/e2e/)
283
+ ```
284
+
285
+ The pure, per-process-stateless logic (config merge, model-ref parsing, load-arg
286
+ building, addressability, pid liveness, fail-mode decisions) is exported from
287
+ `src/index.ts` and unit-tested under `test/`; the live system behavior is covered
288
+ by the E2E fixture under [`test/e2e/`](./test/e2e/).
289
+
290
+ Releases follow [SemVer](https://semver.org) and are cut by CI on `v*` tags
291
+ (see [`CHANGELOG.md`](./CHANGELOG.md)).
292
+
293
+ ## Disclaimer
294
+
295
+ Community plugin. Not affiliated with, endorsed by, or an official product of the
296
+ OpenCode or LM Studio teams. "opencode" and "LM Studio" are used only to indicate
297
+ compatibility.
298
+
299
+ ## License
300
+
301
+ [MIT](./LICENSE) © Diego Marino
@@ -0,0 +1,36 @@
1
+ # Examples
2
+
3
+ Reference configs for `opencode-lmstudio-warm`. Neither is enabled by copying
4
+ the repo — take the pieces you need into your own files and replace the
5
+ `your-*-model-key` placeholders with your real LM Studio model keys (the exact
6
+ strings opencode sends as the API `model` field).
7
+
8
+ ## `opencode.json` — wiring the plugin
9
+
10
+ A minimal consumer config: the `plugin` array entry plus the `lmstudio` provider
11
+ block (`baseURL`, `apiKey`, and the recommended `headerTimeout` / `chunkTimeout`).
12
+ Merge it into your own `opencode.json` — the repo
13
+ [README's "npm (recommended)" section](../README.md#npm-recommended) has an idempotent `jq` one-liner
14
+ that does this non-destructively, or run `opencode plugin opencode-lmstudio-warm`
15
+ to register the plugin and add only the provider block by hand.
16
+
17
+ Set `model` / `small_model` to your own model keys before use.
18
+
19
+ ## `lmstudio-warm.json` — tuning the plugin
20
+
21
+ A fleet-tuned starting point for the plugin's own options file. Copy it to
22
+ `~/.config/opencode/lmstudio-warm.json` (or pass the same object as plugin
23
+ options in `opencode.json`). Highlights:
24
+
25
+ - **`failMode: "hybrid"`** — confirmed failures (server down, load failed,
26
+ unreconcilable duplicates) fail the request with a clear error; ambiguous lock
27
+ contention proceeds fail-open so a plausibly-in-flight load can still serve it.
28
+ - **`perModel.<key>.parallel`** — size to the number of concurrent workers
29
+ hitting that model. Each slot costs extra KV-cache memory; overflow requests
30
+ queue server-side (latency, not failure), so undersizing is safe. The small
31
+ model tolerates queueing; the main model is where fleet width matters.
32
+ - **timeouts** (`loadTimeoutMs`, `serverStartTimeoutMs`, `lockWaitTimeoutMs`,
33
+ `verifyCacheMs`, `retryCooldownMs`) are tuned for a multi-worker fleet.
34
+
35
+ See the [README's Configuration section](../README.md#configuration) for the full
36
+ option list and defaults.
@@ -0,0 +1,16 @@
1
+ {
2
+ "providers": ["lmstudio"],
3
+ "failMode": "hybrid",
4
+ "eager": true,
5
+ "reconcileDuplicates": true,
6
+ "launchAppFallback": true,
7
+ "verifyCacheMs": 30000,
8
+ "retryCooldownMs": 60000,
9
+ "loadTimeoutMs": 900000,
10
+ "serverStartTimeoutMs": 90000,
11
+ "lockWaitTimeoutMs": 1200000,
12
+ "perModel": {
13
+ "your-main-model-key": { "parallel": 6 },
14
+ "your-small-model-key": { "parallel": 4 }
15
+ }
16
+ }
@@ -0,0 +1,23 @@
1
+ {
2
+ "$schema": "https://opencode.ai/config.json",
3
+ "//": "Consumer example: wire the plugin via npm. opencode auto-installs the package named in `plugin` at startup, and the plugin then warms the model before every request. Copy the `plugin` line and the `lmstudio` provider block into your own opencode.json, then set your model/small_model and LM_API_TOKEN.",
4
+ "plugin": ["opencode-lmstudio-warm"],
5
+ "model": "lmstudio/your-main-model-key",
6
+ "small_model": "lmstudio/your-small-model-key",
7
+ "provider": {
8
+ "lmstudio": {
9
+ "npm": "@ai-sdk/openai-compatible",
10
+ "name": "LM Studio (local)",
11
+ "options": {
12
+ "baseURL": "http://127.0.0.1:1234/v1",
13
+ "apiKey": "{env:LM_API_TOKEN}",
14
+ "headerTimeout": 600000,
15
+ "chunkTimeout": 120000
16
+ },
17
+ "models": {
18
+ "your-main-model-key": { "name": "Your main local model" },
19
+ "your-small-model-key": { "name": "Your small/title local model" }
20
+ }
21
+ }
22
+ }
23
+ }
package/package.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "name": "opencode-lmstudio-warm",
3
+ "version": "0.1.0",
4
+ "description": "Deterministic LM Studio model pre-warm gate for opencode — loads and keeps the target model resident before every request, healing cold starts and mid-session TTL evictions.",
5
+ "type": "module",
6
+ "main": "./src/index.ts",
7
+ "exports": {
8
+ ".": {
9
+ "types": "./src/index.ts",
10
+ "import": "./src/index.ts"
11
+ }
12
+ },
13
+ "files": [
14
+ "src",
15
+ "examples",
16
+ "README.md",
17
+ "LICENSE",
18
+ "CHANGELOG.md"
19
+ ],
20
+ "keywords": [
21
+ "opencode",
22
+ "opencode-plugin",
23
+ "lmstudio",
24
+ "lm-studio",
25
+ "llm",
26
+ "local-llm",
27
+ "model-loading",
28
+ "prewarm",
29
+ "warm",
30
+ "bun"
31
+ ],
32
+ "license": "MIT",
33
+ "author": "Diego Marino <diego@petalo.xyz>",
34
+ "repository": {
35
+ "type": "git",
36
+ "url": "git+https://github.com/diegomarino/opencode-lmstudio-warm.git"
37
+ },
38
+ "homepage": "https://github.com/diegomarino/opencode-lmstudio-warm#readme",
39
+ "bugs": {
40
+ "url": "https://github.com/diegomarino/opencode-lmstudio-warm/issues"
41
+ },
42
+ "engines": {
43
+ "bun": ">=1.0.0"
44
+ },
45
+ "scripts": {
46
+ "typecheck": "tsc --noEmit --strict --skipLibCheck --moduleResolution bundler --module esnext --target esnext --types node src/index.ts",
47
+ "test": "vitest run",
48
+ "test:watch": "vitest",
49
+ "e2e": "test/e2e/verify.sh",
50
+ "check": "bun run typecheck && bun run test && shellcheck --severity=warning test/e2e/verify.sh"
51
+ },
52
+ "devDependencies": {
53
+ "@opencode-ai/plugin": "1.17.10",
54
+ "@types/node": "^26.1.0",
55
+ "vitest": "^4.1.9"
56
+ }
57
+ }
package/src/index.ts ADDED
@@ -0,0 +1,559 @@
1
+ /**
2
+ * lmstudio-warm — deterministic LM Studio model pre-warm gate for opencode.
3
+ *
4
+ * Guarantees the target model is addressable in LM Studio BEFORE any LLM
5
+ * request leaves opencode, healing cold starts and mid-session TTL evictions
6
+ * for every model the session uses (main model AND small_model, which shares
7
+ * the same chat.params hook path).
8
+ *
9
+ * Verified against opencode v1.17.10 source:
10
+ * - `chat.params` is awaited before each request and fires for every stream,
11
+ * including small-model title/summary generation.
12
+ * - `model.api.id` is the exact string sent as the API `model` field.
13
+ * - Plugins run in-process (Bun) and may spawn child processes.
14
+ *
15
+ * Verified live against LM Studio (lms CLI):
16
+ * - `lms load <key> -y` blocks until ready, exits 0 only on success.
17
+ * - `lms load` is NOT idempotent: loading a resident key creates a duplicate
18
+ * instance suffixed `:2` — hence the ps-guard + cross-process lock below.
19
+ * - Omitting `--ttl` loads with ttlMs=null (resident until unloaded), and
20
+ * such instances are bookkept separately from JIT loads (which carry the
21
+ * server's JIT TTL).
22
+ * - `lms ps --json` lists a loading instance as status "idle" ~immediately
23
+ * (measured: listed at ~200ms into a 12.5s load). There is NO ps-visible
24
+ * "loading" state. This is benign for the gate: identifier presence means
25
+ * the instance is addressable and LM Studio QUEUES requests against it
26
+ * until weights are ready (verified live) — so a waiter passing the gate
27
+ * mid-load waits briefly server-side instead of erroring.
28
+ * - `lms ps` works even while the HTTP server is off, so the HTTP server is
29
+ * ensured independently (probe /models, else `lms server start`; any HTTP
30
+ * response — including 401 when API auth is enabled — means "listening").
31
+ *
32
+ * Config (all optional), merged in this order:
33
+ * defaults < ~/.config/opencode/lmstudio-warm.json < plugin options tuple
34
+ *
35
+ * The pure, per-process-stateless helpers are hoisted to module scope and
36
+ * exported (see the "Pure helpers" block) so they can be unit-tested directly;
37
+ * the plugin closure composes them with the live state and child processes.
38
+ */
39
+ import type { Plugin } from "@opencode-ai/plugin"
40
+ import { execFile } from "node:child_process"
41
+ import * as fs from "node:fs"
42
+ import * as fsp from "node:fs/promises"
43
+ import * as os from "node:os"
44
+ import * as path from "node:path"
45
+
46
+ export type PerModel = { ttlSeconds?: number; parallel?: number; contextLength?: number }
47
+
48
+ export type WarmOptions = { // Fully resolved plugin configuration; DEFAULTS supplies a value for every field.
49
+ /** Provider IDs to gate. Requests on other providers are ignored. */
50
+ providers: string[]
51
+ /** Path to the lms CLI: normally absolute, but DEFAULTS falls back to the
+ * bare command name "lms" when ~/.lmstudio/bin/lms does not exist. */
52
+ lmsPath: string
53
+ /** Fallback base URL if the provider config doesn't carry one. */
54
+ baseURL: string
55
+ /** --ttl passed to lms load. 0 = omit (resident until unloaded). */
56
+ ttlSeconds: number
57
+ /** --parallel passed to lms load. 0 = omit (LM Studio default, currently 4).
58
+ * Size to expected concurrent fleet width per model; requests beyond the
59
+ * slot count queue server-side (latency, not failure). */
60
+ parallel: number
61
+ /** --context-length passed to lms load. 0 = omit (model default). */
62
+ contextLength: number
63
+ /** Per-model-key overrides of ttlSeconds/parallel/contextLength. */
64
+ perModel: Record<string, PerModel>
65
+ /** How long a positive residency verdict is trusted before re-checking. */
66
+ verifyCacheMs: number
67
+ /** After a CONFIRMED load failure, don't retry the same key for this long
68
+ * (prevents a load storm when e.g. a memory guardrail keeps refusing). */
69
+ retryCooldownMs: number
70
+ /** Hard cap on a single lms load (cold load of a big model can take minutes). */
71
+ loadTimeoutMs: number
72
+ /** Hard cap on bringing the HTTP server up. */
73
+ serverStartTimeoutMs: number
74
+ /** Max time a process waits for another process's in-flight load. */
75
+ lockWaitTimeoutMs: number
76
+ /**
77
+ * What to do when the warm gate cannot ensure residency:
78
+ * - "hybrid" (default): CONFIRMED failures (server won't start, lms load
79
+ * failed, unreconcilable duplicates) fail the request with a clear error;
80
+ * ambiguous outcomes (lock contention timeout) proceed fail-open so a
81
+ * plausibly-in-flight load elsewhere can serve the request via queueing.
82
+ * - "open": never fail the request; log and proceed (JIT fallback).
83
+ * - "closed": any warm failure fails the request.
84
+ */
85
+ failMode: "open" | "closed" | "hybrid"
86
+ /** If only suffixed duplicate instances (key:2 …) exist and none is busy,
87
+ * unload them and load fresh so the bare key becomes addressable again. */
88
+ reconcileDuplicates: boolean
89
+ /** If the server can't be started (LM Studio app closed), try `open -ga "LM Studio"` once. */
90
+ launchAppFallback: boolean
91
+ /** Warm cfg.model + cfg.small_model in the background at instance start. */
92
+ eager: boolean
93
+ logFile: string // Path of the plugin's log file (default: ~/.cache/opencode/lmstudio-warm.log).
94
+ lockDir: string // Directory used as the cross-process mkdir lock (default: ~/.cache/opencode/lmstudio-warm.lock).
95
+ }
96
+
97
+ const HOME = os.homedir()
98
+
99
+ const DEFAULTS: WarmOptions = {
100
+ providers: ["lmstudio"],
101
+ lmsPath: fs.existsSync(path.join(HOME, ".lmstudio/bin/lms")) ? path.join(HOME, ".lmstudio/bin/lms") : "lms",
102
+ baseURL: "http://127.0.0.1:1234/v1",
103
+ ttlSeconds: 0,
104
+ parallel: 0,
105
+ contextLength: 0,
106
+ perModel: {},
107
+ verifyCacheMs: 30_000,
108
+ retryCooldownMs: 60_000,
109
+ loadTimeoutMs: 900_000,
110
+ serverStartTimeoutMs: 90_000,
111
+ lockWaitTimeoutMs: 1_200_000,
112
+ failMode: "hybrid",
113
+ reconcileDuplicates: true,
114
+ launchAppFallback: true,
115
+ eager: true,
116
+ logFile: path.join(HOME, ".cache/opencode/lmstudio-warm.log"),
117
+ lockDir: path.join(HOME, ".cache/opencode/lmstudio-warm.lock"),
118
+ }
119
+
120
/**
 * Read the optional user config file `~/.config/opencode/lmstudio-warm.json`.
 *
 * Best-effort by design: a missing, unreadable, or malformed file yields `{}`
 * so the caller falls back to defaults instead of throwing.
 *
 * @returns The parsed top-level JSON object, or `{}` when the file is absent,
 *   cannot be read or parsed, or its top-level value is not an object.
 */
function loadFileOptions(): Partial<WarmOptions> {
  try {
    const p = path.join(HOME, ".config/opencode/lmstudio-warm.json")
    if (!fs.existsSync(p)) return {}
    const parsed: unknown = JSON.parse(fs.readFileSync(p, "utf8"))
    // Valid JSON need not be an object. Spreading an array or a string into the
    // merged options would inject bogus "0", "1", … keys, so reject anything
    // that is not a plain object.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return {}
    return parsed as Partial<WarmOptions>
  } catch {
    // Unreadable file or invalid JSON: deliberately ignored (see above).
    return {}
  }
}
129
+
130
+ export type LmsInstance = {
131
+ modelKey?: string
132
+ identifier?: string
133
+ status?: string
134
+ ttlMs?: number | null
135
+ parallel?: number
136
+ queued?: number
137
+ }
138
+
139
+ /** Warm outcome. `confirmed` marks a definitive failure (vs. ambiguity). */
140
+ export type WarmResult = { ok: boolean; confirmed: boolean; reason: string }
141
+
142
+ const OK: WarmResult = { ok: true, confirmed: false, reason: "" }
143
+
144
+ // ─── Pure helpers (module scope, exported for unit tests) ───────────────────
145
+ // No per-process state — the plugin closure below composes these with the live
146
+ // caches, child processes, and lock directory.
147
+
148
/**
 * Merge config in precedence order: DEFAULTS < file options < plugin options.
 *
 * Entries whose value is `undefined` or `null` are treated as "not set" and
 * never override a lower-precedence value. Without this, an explicit
 * `{ failMode: undefined }` in the plugin options (or a `null` in the JSON
 * file) would erase the default and leave the field unusable.
 *
 * Also maps the legacy `failClosed` boolean onto `failMode` when the newer key
 * isn't set.
 *
 * @param fileOpts Options read from the user's config file.
 * @param pluginOpts Options passed in the plugin tuple; may be omitted or null.
 * @returns The complete option set with every field populated.
 */
export function resolveOptions(
  fileOpts: Partial<WarmOptions>,
  pluginOpts?: (Partial<WarmOptions> & { failClosed?: boolean }) | null,
): WarmOptions {
  type RawOptions = Partial<WarmOptions> & { failClosed?: boolean }
  // Keep only the entries that actually carry a value.
  const defined = (src: RawOptions | null | undefined): RawOptions =>
    Object.fromEntries(
      Object.entries(src ?? {}).filter(([, value]) => value !== undefined && value !== null),
    ) as RawOptions
  const raw: RawOptions = { ...defined(fileOpts), ...defined(pluginOpts) }
  // Legacy boolean from earlier revisions of this plugin.
  if (raw.failClosed !== undefined && raw.failMode === undefined) raw.failMode = raw.failClosed ? "closed" : "open"
  return { ...DEFAULTS, ...raw }
}
160
+
161
/**
 * Is `key` addressable, i.e. does some instance carry an identifier exactly
 * equal to it?
 *
 * opencode addresses models by the UNSUFFIXED key, while LM Studio routes the
 * API `model` field by instance identifier, so only an exact identifier match
 * counts. Per the original author's live verification: a still-loading
 * instance already shows up with status "idle" and LM Studio queues requests
 * against it until ready, so no separate "loading" state is checked here.
 *
 * @param instances Instances as reported by `lms ps`.
 * @param key       The model key opencode will send.
 */
export function addressable(instances: LmsInstance[], key: string): boolean {
  for (const instance of instances) {
    if (instance.identifier === key) return true
  }
  return false
}
170
+
171
/**
 * Split an opencode model ref ("provider/key…") on the FIRST slash, so a key
 * that itself contains slashes (e.g. "qwen/qwen3") is preserved intact.
 *
 * @param ref Candidate model ref; anything that is not a string is rejected.
 * @returns `{ providerID, key }`, or `null` when `ref` is not a string, has no
 *   slash, or has an empty provider ("/key") or an empty key ("provider/").
 *   Rejecting the empty parts keeps an empty model key from ever reaching the
 *   load command.
 */
export function parseModelRef(ref: unknown): { providerID: string; key: string } | null {
  if (typeof ref !== "string") return null
  const slash = ref.indexOf("/")
  // slash === -1: no separator; slash === 0: empty provider;
  // slash at the last index: empty key.
  if (slash <= 0 || slash === ref.length - 1) return null
  return { providerID: ref.slice(0, slash), key: ref.slice(slash + 1) }
}
178
+
179
/**
 * Assemble the argv for `lms load <key>`.
 *
 * Each tunable is taken from the per-model override when present, otherwise
 * from the top-level option. A flag is emitted only for a value greater than
 * 0, so 0 means "omit this flag".
 *
 * @param opts Resolved plugin options.
 * @param key  Model key to load.
 * @returns Arguments starting with `load <key> -y`, followed by the flags.
 */
export function loadArgs(opts: WarmOptions, key: string): string[] {
  const override = opts.perModel[key] ?? {}
  const flags: Array<[string, number]> = [
    ["--ttl", override.ttlSeconds ?? opts.ttlSeconds],
    ["--parallel", override.parallel ?? opts.parallel],
    ["--context-length", override.contextLength ?? opts.contextLength],
  ]
  const argv = ["load", key, "-y"]
  for (const [flag, value] of flags) {
    if (value > 0) argv.push(flag, String(value))
  }
  return argv
}
192
+
193
/**
 * Is the process with this pid alive?
 *
 * `kill(pid, 0)` sends no signal, it only probes: ESRCH means no such process
 * (dead); EPERM means the process exists but belongs to another user (alive).
 * Host-local only, which is fine because the lock directory is host-local too.
 *
 * Only positive integers are probed. A pid of 0 or below does not name a
 * single process (it addresses process groups), so probing it would report a
 * garbage pid as alive; such values, and non-integers, are reported as dead.
 *
 * @param pid Candidate process id.
 * @returns `true` when the process exists, `false` otherwise.
 */
export function pidAlive(pid: number): boolean {
  if (!Number.isInteger(pid) || pid <= 0) return false
  try {
    process.kill(pid, 0)
    return true
  } catch (err: unknown) {
    // Any failure other than EPERM (ESRCH in particular) means "not alive".
    return typeof err === "object" && err !== null && (err as { code?: unknown }).code === "EPERM"
  }
}
204
+
205
/**
 * Parse a lock pid-file's contents to a pid.
 *
 * The trimmed content must consist solely of decimal digits and denote a
 * positive integer. Anything else yields `null`: absent or blank content,
 * digits followed by garbage ("12abc"), signed values, and "0", none of
 * which can identify a lock holder.
 *
 * @param content Raw file content, or `null` when the file could not be read.
 * @returns The pid, or `null` if no valid pid is present.
 */
export function parseLockPid(content: string | null): number | null {
  if (content == null) return null
  const text = content.trim()
  if (!/^\d+$/.test(text)) return null
  const pid = Number(text)
  return Number.isSafeInteger(pid) && pid > 0 ? pid : null
}
211
+
212
/**
 * Decide whether opencode's request must be failed for a given warm outcome.
 *
 * - an ok result never fails the request;
 * - `closed` fails on any not-ok result;
 * - `hybrid` fails only on CONFIRMED failures;
 * - `open` (and any other value) never fails.
 *
 * @param failMode Configured failure policy.
 * @param result   Outcome of the warm attempt.
 */
export function shouldFailRequest(failMode: WarmOptions["failMode"], result: WarmResult): boolean {
  if (result.ok) return false
  switch (failMode) {
    case "closed":
      return true
    case "hybrid":
      return result.confirmed
    default:
      return false
  }
}
219
+
220
+ export const LMStudioWarm: Plugin = async (_input, pluginOptions) => {
221
// Effective configuration: file options overlaid with the options handed to
// the plugin, on top of the defaults.
const fileOptions = loadFileOptions()
const inlineOptions = pluginOptions as (Partial<WarmOptions> & { failClosed?: boolean }) | null
const opts = resolveOptions(fileOptions, inlineOptions)

// ---- state (per opencode process) ----
// Model key -> timestamp of the last time it was confirmed addressable.
const verifiedAt = new Map<string, number>()
// Negative cache: model key -> when and why the last confirmed failure occurred.
const failedAt = new Map<string, { at: number; reason: string }>()
// Model key -> warm attempt currently in progress.
const inflight = new Map<string, Promise<WarmResult>>()
// Timestamp of the last time the HTTP server was confirmed reachable.
let serverVerifiedAt = 0
// True only while THIS process holds the mkdir lock. The exit handler reads it
// to release a lock that a fire-and-forget eager warm may still hold when the
// process tears down (in that case the async finally never gets to run).
let holdingLock = false

// Make sure the log directory exists; failure is tolerated because logging is
// best-effort.
try {
  const logDir = path.dirname(opts.logFile)
  fs.mkdirSync(logDir, { recursive: true })
} catch {}
239
+
240
/**
 * Append one timestamped line, tagged with this process's pid, to the
 * configured log file. Never throws: a failed write is silently dropped.
 */
function log(msg: string) {
  try {
    const stamp = new Date().toISOString()
    const line = `${stamp} [pid ${process.pid}] ${msg}\n`
    fs.appendFileSync(opts.logFile, line)
  } catch {
    // Logging is best-effort.
  }
}
245
+
246
// Last-resort synchronous lock release. A one-shot `opencode run` can exit
// while a background eager warm still holds the lock. "exit" listeners can
// only do synchronous work, hence rmSync. The pid file is consulted first so a
// lock that another process legitimately re-acquired in the meantime is never
// deleted (TOCTOU), and nothing is ever thrown from the handler. SIGKILL
// cannot be caught: the dead-holder liveness check in acquireLock is the
// backstop for that case.
process.once("exit", () => {
  if (!holdingLock) return
  try {
    // null ⇒ pid file unreadable; "" ⇒ we created the directory but had not
    // written the pid yet. Both are treated as our own lock.
    let recorded: string | null
    try {
      recorded = fs.readFileSync(path.join(opts.lockDir, "pid"), "utf8").trim()
    } catch {
      recorded = null
    }
    const ours = recorded === null || recorded === "" || recorded === String(process.pid)
    if (ours) fs.rmSync(opts.lockDir, { recursive: true, force: true })
  } catch {}
})
265
+
266
/**
 * Run a command to completion and report the outcome without ever rejecting.
 *
 * @param cmd       Executable to run.
 * @param args      Arguments passed to it.
 * @param timeoutMs Timeout handed to `execFile`.
 * @returns `ok` is true when `execFile` reported no error; `timedOut` mirrors
 *   the error's `killed` flag; `stdout` / `stderr` are the captured output
 *   converted to strings.
 */
function run(
  cmd: string,
  args: string[],
  timeoutMs: number,
): Promise<{ ok: boolean; timedOut: boolean; stdout: string; stderr: string }> {
  const execOptions = { timeout: timeoutMs, maxBuffer: 16 * 1024 * 1024, env: process.env }
  return new Promise((resolve) => {
    execFile(cmd, args, execOptions, (err, stdout, stderr) => {
      const killed = Boolean(err?.killed)
      resolve({ ok: !err, timedOut: killed, stdout: String(stdout), stderr: String(stderr) })
    })
  })
}

/** Invoke the configured `lms` executable with the given arguments and timeout. */
const lms = (args: string[], timeoutMs: number) => run(opts.lmsPath, args, timeoutMs)
284
+
285
/**
 * List LM Studio instances via `lms ps --json`.
 *
 * @returns The parsed array, or `null` when the command fails, prints
 *   something that is not JSON, or prints JSON that is not an array. The two
 *   failure cases are logged with a truncated excerpt of the output.
 */
async function psInstances(): Promise<LmsInstance[] | null> {
  const { ok, stdout, stderr } = await lms(["ps", "--json"], 15_000)
  if (!ok) {
    log(`lms ps failed: ${stderr.trim().slice(0, 300)}`)
    return null
  }
  let payload: unknown
  try {
    payload = JSON.parse(stdout)
  } catch {
    log(`lms ps returned non-JSON: ${stdout.slice(0, 200)}`)
    return null
  }
  return Array.isArray(payload) ? payload : null
}
299
+
300
/**
 * Probe whether the HTTP server at `baseURL` is listening.
 *
 * "Alive" means the server answered at all: any HTTP response counts,
 * including 401/403 (LM Studio with API auth enabled rejects unauthenticated
 * probes). Only network-level failures (refused / timeout) mean it is down.
 *
 * @param baseURL API base URL; trailing slashes are stripped before `/models`
 *   is appended.
 * @returns `true` on any response, `false` when the request throws or exceeds
 *   the 2.5 s abort timeout.
 */
async function httpAlive(baseURL: string): Promise<boolean> {
  try {
    const res = await fetch(`${baseURL.replace(/\/+$/, "")}/models`, { signal: AbortSignal.timeout(2_500) })
    // The body is never read. Cancel it so the underlying connection is
    // released right away instead of lingering until garbage collection;
    // this probe may run once per second while waiting for the server.
    void res.body?.cancel().catch(() => {})
    return true
  } catch {
    return false
  }
}
311
+
312
/**
 * Probe the server repeatedly, one second apart, until it answers or the time
 * budget is spent.
 *
 * @param baseURL   API base URL to probe.
 * @param timeoutMs Budget in milliseconds; with a budget of 0 or less no probe
 *   is made at all.
 * @returns `true` as soon as a probe succeeds, `false` once the budget runs out.
 */
async function pollAlive(baseURL: string, timeoutMs: number): Promise<boolean> {
  const giveUpAt = Date.now() + timeoutMs
  for (;;) {
    if (Date.now() >= giveUpAt) return false
    if (await httpAlive(baseURL)) return true
    await new Promise((wake) => setTimeout(wake, 1_000))
  }
}
320
+
321
// The server check currently in progress, shared by all concurrent callers.
let serverInflight: Promise<boolean> | null = null

/**
 * Ensure the HTTP server is reachable, coalescing concurrent callers onto a
 * single check and skipping the check entirely while the last positive result
 * is younger than `verifyCacheMs`.
 *
 * NOTE(review): neither the cache timestamp nor the in-flight promise is keyed
 * by `baseURL`; confirm that a single server per process is the intended
 * assumption.
 */
function ensureServer(baseURL: string): Promise<boolean> {
  const recentlyVerified = Date.now() - serverVerifiedAt < opts.verifyCacheMs
  if (recentlyVerified) return Promise.resolve(true)
  if (serverInflight === null) {
    serverInflight = ensureServerImpl(baseURL).finally(() => {
      serverInflight = null
    })
  }
  return serverInflight
}
330
+
331
/**
 * Bring the HTTP server up if it is not already answering.
 *
 * Escalation order:
 *  1. probe once and return immediately if the server answers;
 *  2. run `lms server start` and poll for up to `serverStartTimeoutMs`;
 *  3. on macOS with `launchAppFallback` enabled, launch the LM Studio app,
 *     wait three seconds, run `lms server start` again and poll once more.
 *
 * Every success path records the verification time in `serverVerifiedAt`.
 *
 * @returns `true` once the server answers, `false` when every step failed.
 */
async function ensureServerImpl(baseURL: string): Promise<boolean> {
  const pause = (ms: number) => new Promise((wake) => setTimeout(wake, ms))
  // Stamp the cache; a defined suffix (possibly empty) also logs the "up" line.
  const confirmUp = (logSuffix?: string): true => {
    serverVerifiedAt = Date.now()
    if (logSuffix !== undefined) log(`HTTP server is up at ${baseURL}${logSuffix}`)
    return true
  }

  if (await httpAlive(baseURL)) return confirmUp()

  log(`HTTP server not reachable at ${baseURL} — running lms server start`)
  const started = await lms(["server", "start"], 30_000)
  if (!started.ok) log(`lms server start failed: ${started.stderr.trim().slice(0, 300)}`)
  if (await pollAlive(baseURL, opts.serverStartTimeoutMs)) return confirmUp("")

  const mayLaunchApp = opts.launchAppFallback && process.platform === "darwin"
  if (mayLaunchApp) {
    // Server still down: the LM Studio app itself may be closed.
    log(`server still down — trying: open -ga "LM Studio"`)
    await run("/usr/bin/open", ["-ga", "LM Studio"], 15_000)
    await pause(3_000)
    await lms(["server", "start"], 30_000)
    if (await pollAlive(baseURL, opts.serverStartTimeoutMs)) return confirmUp(" (after app launch)")
  }

  log(`HTTP server did not come up within budget`)
  return false
}
359
+
360
/**
 * Read the pid recorded inside the lock directory.
 *
 * @returns The holder's pid, or `null` when the pid file cannot be read or
 *   does not contain a pid.
 */
function lockHolderPid(): number | null {
  let raw: string | null
  try {
    raw = fs.readFileSync(path.join(opts.lockDir, "pid"), "utf8")
  } catch {
    raw = null
  }
  return parseLockPid(raw)
}
367
+
368
/**
 * Cross-process mutex via atomic mkdir: parallel opencode workers must not
 * race `lms load` (it is not idempotent).
 *
 * An existing lock may be broken when
 *  (1) it is older than `staleMs`: no live holder can run that long, because
 *      every command under the lock is killed at a hard timeout;
 *  (2) its recorded holder pid is dead (crash or abrupt exit before the
 *      finally released it, the observed eager-warm leak); or
 *  (3) the pid file is missing AND the directory has outlived a short grace
 *      period (a holder that crashed between mkdir and writeFile).
 * A fresh, pid-less lock is left alone: that is a live holder still
 * mid-acquisition.
 *
 * @returns A synchronous release function, or `null` when the lock could not
 *   be obtained within `lockWaitTimeoutMs` (ambiguous; the caller decides).
 * @throws Any mkdir error other than EEXIST.
 */
async function acquireLock(): Promise<(() => void) | null> {
  const deadline = Date.now() + opts.lockWaitTimeoutMs
  const staleMs = opts.loadTimeoutMs + opts.serverStartTimeoutMs + 120_000
  const pidGraceMs = 5_000
  // The locking mkdir below must stay non-recursive to be an atomic test-and-
  // set, so it fails with ENOENT on every attempt when the parent directory is
  // missing (e.g. a custom lockDir). Create the parent up front; if that fails
  // the mkdir below surfaces the real error.
  try {
    await fsp.mkdir(path.dirname(opts.lockDir), { recursive: true })
  } catch {}
  for (;;) {
    try {
      await fsp.mkdir(opts.lockDir, { recursive: false })
      holdingLock = true
      try {
        await fsp.writeFile(path.join(opts.lockDir, "pid"), String(process.pid))
      } catch {}
      // Synchronous release: rmSync + flag clear run with no await between them,
      // so a second in-process waiter (parallel eager warm) cannot observe a
      // removed dir with holdingLock still true, and the dir is gone even if
      // the process is mid-teardown when release fires.
      return () => {
        try {
          fs.rmSync(opts.lockDir, { recursive: true, force: true })
        } catch {}
        holdingLock = false
      }
    } catch (err: any) {
      if (err?.code !== "EEXIST") throw err
      try {
        const st = await fsp.stat(opts.lockDir)
        const age = Date.now() - st.mtimeMs
        const holder = lockHolderPid()
        let reason = ""
        if (age > staleMs) reason = `stale (age ${Math.round(age / 1000)}s)`
        else if (holder !== null && holder !== process.pid && !pidAlive(holder)) reason = `dead holder pid ${holder}`
        else if (holder === null && age > pidGraceMs) reason = `abandoned (no pid, age ${Math.round(age / 1000)}s)`
        if (reason) {
          log(`breaking lock: ${reason}`)
          await fsp.rm(opts.lockDir, { recursive: true, force: true })
          continue
        }
      } catch {} // lock vanished between mkdir and stat — retry
      if (Date.now() > deadline) return null // contention timeout — ambiguous, caller decides
      await new Promise((r) => setTimeout(r, 500))
    }
  }
}
418
+
419
/**
 * Make `key` addressable on the LM Studio server at `baseURL`.
 *
 * Steps: ensure the server is reachable; return early if the key is already
 * addressable; otherwise take the cross-process lock, re-check, reconcile
 * stray duplicates of the model, run `lms load`, and verify the result.
 *
 * @returns `OK` on success. Failures are `confirmed` except for a lock
 *   contention timeout, which is ambiguous.
 */
async function doWarm(key: string, baseURL: string): Promise<WarmResult> {
  const confirmedFailure = (reason: string): WarmResult => ({ ok: false, confirmed: true, reason })
  // Records the verification time when `list` shows the bare key as addressable.
  const stampIfAddressable = (list: LmsInstance[] | null): boolean => {
    if (list === null || !addressable(list, key)) return false
    verifiedAt.set(key, Date.now())
    return true
  }

  const serverUp = await ensureServer(baseURL)
  if (!serverUp) return confirmedFailure(`LM Studio HTTP server is not reachable at ${baseURL}`)

  // Fast path: no lock needed if already addressable.
  if (stampIfAddressable(await psInstances())) return OK

  const release = await acquireLock()
  if (release === null) {
    // Someone else has been loading for a long time (big model, slow disk).
    // Their instance may already be addressable and queueing — ambiguous.
    log(`lock contention timeout waiting to warm ${key} — proceeding (ambiguous)`)
    return { ok: false, confirmed: false, reason: "lock contention timeout" }
  }

  try {
    // Re-check under the lock: another process may have loaded it while we waited.
    const underLock = await psInstances()
    if (stampIfAddressable(underLock)) return OK

    // Orphaned duplicates: instances of this model exist (key:2 …) but none is
    // addressable by the bare key (e.g. the original was unloaded and a stray
    // duplicate survived). Loading again would only stack key:3, so idle
    // duplicates are unloaded first and the model is then loaded fresh.
    const sameModel = (underLock ?? []).filter((inst) => inst.modelKey === key)
    if (sameModel.length > 0) {
      const busy = sameModel.some((inst) => inst.status === "generating" || (inst.queued ?? 0) > 0)
      if (busy || !opts.reconcileDuplicates) {
        const ids = sameModel.map((inst) => inst.identifier).join(", ")
        log(`WARNING: only non-addressable instances of ${key} exist (${ids}); busy=${busy} — cannot warm`)
        return confirmedFailure(`only suffixed duplicates of ${key} are resident (${ids})`)
      }
      for (const { identifier } of sameModel) {
        if (!identifier) continue
        log(`reconciling: unloading duplicate instance ${identifier}`)
        const unloaded = await lms(["unload", identifier], 60_000)
        if (!unloaded.ok) log(`unload ${identifier} failed: ${unloaded.stderr.trim().slice(0, 200)}`)
      }
    }

    const argv = loadArgs(opts, key)
    log(`loading ${key} (${argv.join(" ")}) ...`)
    const startedAt = Date.now()
    const loaded = await lms(argv, opts.loadTimeoutMs)
    if (!loaded.ok) {
      const kind = loaded.timedOut ? "timeout" : "error"
      const detail = (loaded.stderr || loaded.stdout).trim().slice(0, 500)
      log(`lms load ${key} FAILED (${kind}) after ${Date.now() - startedAt}ms: ${detail}`)
      return confirmedFailure(`lms load failed (${kind}): ${detail.slice(0, 200)}`)
    }

    // A zero exit code is not trusted on its own: confirm via `lms ps`.
    if (stampIfAddressable(await psInstances())) {
      log(`loaded ${key} in ${Math.round((Date.now() - startedAt) / 1000)}s`)
      return OK
    }
    log(`lms load ${key} exited 0 but ps does not show identifier === key`)
    return confirmedFailure(`loaded but not addressable as "${key}"`)
  } finally {
    release()
  }
}
489
+
490
/**
 * Cached, de-duplicated entry point for warming `key`.
 *
 * - Resolves `OK` immediately while the key was verified less than
 *   `verifyCacheMs` ago.
 * - Resolves with the cached confirmed failure (reason suffixed with
 *   " (cooldown)") while it is younger than `retryCooldownMs`.
 * - Joins an attempt already in flight for the same key.
 * - Otherwise starts a new attempt. An unexpected error becomes an
 *   unconfirmed failure, the negative cache is updated from the outcome, and
 *   the in-flight entry is removed once the attempt settles.
 */
function warm(key: string, baseURL: string): Promise<WarmResult> {
  const lastVerified = verifiedAt.get(key) ?? 0
  if (Date.now() - lastVerified < opts.verifyCacheMs) return Promise.resolve(OK)

  const lastFailure = failedAt.get(key)
  if (lastFailure && Date.now() - lastFailure.at < opts.retryCooldownMs) {
    return Promise.resolve({ ok: false, confirmed: true, reason: `${lastFailure.reason} (cooldown)` })
  }

  const pending = inflight.get(key)
  if (pending) return pending

  const attempt = (async (): Promise<WarmResult> => {
    try {
      let outcome: WarmResult
      try {
        outcome = await doWarm(key, baseURL)
      } catch (err) {
        log(`warm(${key}) error: ${err instanceof Error ? err.message : String(err)}`)
        outcome = { ok: false, confirmed: false, reason: "internal error (see log)" }
      }
      // Only confirmed failures enter the negative cache; success clears it.
      if (outcome.ok) failedAt.delete(key)
      else if (outcome.confirmed) failedAt.set(key, { at: Date.now(), reason: outcome.reason })
      return outcome
    } finally {
      inflight.delete(key)
    }
  })()
  inflight.set(key, attempt)
  return attempt
}
512
+
513
log(
  `plugin loaded (providers=${opts.providers.join(",")} ttl=${opts.ttlSeconds || "none"} parallel=${opts.parallel || "default"} failMode=${opts.failMode})`,
)

// Use the provider's configured base URL when it is a string starting with
// "http"; otherwise fall back to the plugin's own baseURL option.
const pickBaseURL = (configured: unknown): string =>
  typeof configured === "string" && configured.startsWith("http") ? configured : opts.baseURL

return {
  // Fires once at instance start with the resolved config. Kicks off a
  // background warm of both pinned models. The warm is NOT awaited, so startup
  // isn't delayed; the chat.params gate below remains the deterministic barrier.
  config: async (cfg: any) => {
    if (!opts.eager) return
    const refs = [cfg?.model, cfg?.small_model]
    for (const ref of refs) {
      const target = parseModelRef(ref)
      if (target === null) continue
      if (!opts.providers.includes(target.providerID)) continue
      const baseURL = pickBaseURL(cfg?.provider?.[target.providerID]?.options?.baseURL)
      log(`eager warm queued for ${target.key}`)
      void warm(target.key, baseURL)
    }
  },

  // Awaited by opencode before EVERY LLM request (main and small model alike):
  // the deterministic pre-warm gate. Heals cold starts and TTL evictions.
  "chat.params": async (input: any) => {
    let key: string | undefined
    let result: WarmResult = OK
    try {
      const providerID: string | undefined = input?.provider?.info?.id ?? input?.model?.providerID
      if (!providerID || !opts.providers.includes(providerID)) return
      // model.api.id is the exact string opencode sends as the API `model`
      // field (== LM Studio model key for config-defined models).
      key = input?.model?.api?.id ?? input?.model?.id
      if (!key) return
      const baseURL = pickBaseURL(input?.provider?.options?.baseURL)
      result = await warm(key, baseURL)
    } catch (err) {
      log(`chat.params hook error: ${err instanceof Error ? err.message : String(err)}`)
      result = { ok: false, confirmed: false, reason: "hook error (see log)" }
    }
    if (result.ok) return
    if (!shouldFailRequest(opts.failMode, result)) {
      // Policy tolerates this failure: let the request through and record why.
      log(`warm(${key}) not ensured (${result.reason}) — proceeding fail-open`)
      return
    }
    throw new Error(`lmstudio-warm: cannot ensure model "${key}" is loaded — ${result.reason}. See ${opts.logFile}`)
  },
}
559
+ }