@fugood/buttress-server 2.25.0-beta.23 → 2.25.0-beta.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +238 -19
- package/lib/index.d.mts +117 -4
- package/lib/index.mjs +43 -43
- package/package.json +5 -3
- package/public/status.html +162 -1
package/README.md
CHANGED
|
@@ -100,48 +100,267 @@ The token claims `{ k: 'ba', w_id, st: 'ws', sid, jti, exp }` and any buttress-s
|
|
|
100
100
|
|
|
101
101
|
## Configuration
|
|
102
102
|
|
|
103
|
-
Configuration
|
|
104
|
-
- `--config` / `-c` flag with TOML file path
|
|
103
|
+
Configuration is loaded from a TOML file passed via `--config` / `-c`. Every top-level table is optional — missing sections fall back to defaults. See `config/sample.toml` for an end-to-end example.
|
|
105
104
|
|
|
106
|
-
###
|
|
105
|
+
### Top-level sections
|
|
106
|
+
|
|
107
|
+
| Section | Purpose |
|
|
108
|
+
| ------------------------- | -------------------------------------------------------------------------------------------------- |
|
|
109
|
+
| `[env]` | Environment variables exported into the process **only if not already set** |
|
|
110
|
+
| `[server]` | HTTP/RPC listener (port, log level, body limits) |
|
|
111
|
+
| `[runtime]` | Global defaults shared by every generator (most `[generators.model]` keys may live here too) |
|
|
112
|
+
| `[runtime.session_cache]` | KV-cache reuse store — see [Session State Cache](#session-state-cache) |
|
|
113
|
+
| `[autodiscover]` | LAN UDP / HTTP / mDNS discovery toggles |
|
|
114
|
+
| `[openai_compat]` | Enable `/oai-compat/v1/*` — see [Compatibility Endpoints](#compatibility-endpoints-experimental) |
|
|
115
|
+
| `[anthropic_messages]` | Enable `/anthropic-messages` — see [Compatibility Endpoints](#compatibility-endpoints-experimental)|
|
|
116
|
+
| `[[generators]]` | Array of generator instances — one entry per loaded model |
|
|
117
|
+
|
|
118
|
+
### `[env]`
|
|
107
119
|
|
|
108
120
|
```toml
|
|
109
|
-
# Environment variables (only set if not already defined in system)
|
|
110
121
|
[env]
|
|
111
|
-
|
|
122
|
+
HUGGINGFACE_TOKEN = "hf_xxx" # ggml backends read this; HF_TOKEN is not picked up automatically
|
|
112
123
|
CUDA_VISIBLE_DEVICES = "0"
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Values here are exported only when the variable isn't already set in the process — see [Environment Variable Priority](#environment-variable-priority). For HuggingFace auth across all backends, setting `huggingface_token = "hf_xxx"` under `[runtime]` works regardless of which environment variable name a given backend reads.
|
|
127
|
+
|
|
128
|
+
### `[server]`
|
|
129
|
+
|
|
130
|
+
| Key | Type | Default |
|
|
131
|
+
| ----------------- | -------------- | ---------------------------------------------------------------------- |
|
|
132
|
+
| `id` | string | `buttress-<machineId>` — stable id used for autodiscover / binding |
|
|
133
|
+
| `name` | string | `Buttress Server (<short id>)` — display name |
|
|
134
|
+
| `port` | number | `2080` (overridden by `--port`) |
|
|
135
|
+
| `log_level` | `"debug"`/`"info"`/`"warn"`/`"error"` | unset |
|
|
136
|
+
| `max_body_size` | string\|number | `"50MB"` — e.g. `"100MB"`, `"1GB"`, or raw bytes |
|
|
137
|
+
| `session_timeout` | string\|number | `60000` ms — accepts ms numbers or duration strings (`"30s"`) |
|
|
138
|
+
| `temp_file_dir` | string | `$TMPDIR/.buttress` |
|
|
139
|
+
|
|
140
|
+
### `[runtime]` — global generator defaults
|
|
141
|
+
|
|
142
|
+
Most ggml-llm `[generators.model]` keys can also live in `[runtime]` as defaults. Per-generator values win; otherwise the runtime default applies.
|
|
143
|
+
|
|
144
|
+
| Key | Type | Notes |
|
|
145
|
+
| ---------------------------- | ----------------------------- | ---------------------------------------------------------------------- |
|
|
146
|
+
| `cache_dir` | string | Model + metadata cache root (default `~/.buttress/models`) |
|
|
147
|
+
| `huggingface_token` | string | Falls back to `$HUGGINGFACE_TOKEN` |
|
|
148
|
+
| `http_headers` | table | Extra headers attached to HF / HTTP downloads |
|
|
149
|
+
| `context_release_delay_ms` | number | Idle time before unloading a context (default `10000`; `0` = immediate)|
|
|
150
|
+
| `prefer_variants` | string[] | Override variant probe order (ggml backends) |
|
|
151
|
+
| `n_threads` | number | CPU thread count |
|
|
152
|
+
| `n_ctx` | number | Context window (per-model value wins; auto-capped at training context) |
|
|
153
|
+
| `n_gpu_layers` | number\|`"auto"` | Layers offloaded to GPU (default `"auto"`) |
|
|
154
|
+
| `n_batch` / `n_ubatch` | number | Prompt batch / micro-batch size. **Note:** `n_batch` has a model-level default of `512` that shadows the runtime value unless `[generators.model] n_batch` is set explicitly. |
|
|
155
|
+
| `n_parallel` | number | Parallel sequences (default `4`) |
|
|
156
|
+
| `n_cpu_moe` | number | MoE expert layers offloaded to CPU |
|
|
157
|
+
| `flash_attn_type` | `"on"` / `"off"` / `"auto"` | Defaults to `"auto"` when a GPU backend is selected and to `"off"` on CPU. An explicit `"on"` / `"off"` / `"auto"` value overrides that default. |
|
|
158
|
+
| `cache_type_k`, `cache_type_v` | string | KV-cache dtype (`f16`, `f32`, `q8_0`, `q4_0`, …) |
|
|
159
|
+
| `kv_unified` | boolean | Use a unified KV cache across sequences |
|
|
160
|
+
| `swa_full` | boolean | Materialize full attention even for sliding-window layers |
|
|
161
|
+
| `ctx_shift` | boolean | Allow llama.cpp's rolling context shift |
|
|
162
|
+
| `use_mmap`, `use_mlock` | boolean | Memory-mapping / locking |
|
|
163
|
+
| `no_extra_bufts` | boolean | Disable extra compute buffer types |
|
|
164
|
+
| `cpu_mask`, `cpu_strict` | string / boolean | CPU affinity (advanced) |
|
|
165
|
+
| `devices` | string[] | Restrict to specific GGML devices |
|
|
166
|
+
| Speculative keys | various | `speculative`, `spec_type`, `spec_draft_n_max/n_min/p_min/p_split` |
|
|
167
|
+
|
|
168
|
+
### `[autodiscover]`
|
|
169
|
+
|
|
170
|
+
Set `autodiscover = true` for defaults, `false` (or omit) to disable, or a table for fine control:
|
|
113
171
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
172
|
+
```toml
|
|
173
|
+
[autodiscover]
|
|
174
|
+
udp.port = 8089
|
|
175
|
+
udp.announcements = { enabled = true, interval = 5000 }
|
|
176
|
+
udp.requests = { enabled = true, responseDelay = 100 }
|
|
177
|
+
http.enabled = true
|
|
178
|
+
http.path = "/buttress/info"
|
|
179
|
+
http.cors = true
|
|
180
|
+
# mdns.enabled = false # Bonjour/Avahi advertisement (optional)
|
|
181
|
+
```
|
|
117
182
|
|
|
118
|
-
[
|
|
119
|
-
cache_dir = "~/.buttress/models"
|
|
183
|
+
### `[[generators]]`
|
|
120
184
|
|
|
121
|
-
|
|
122
|
-
[runtime.session_cache]
|
|
123
|
-
enabled = true
|
|
124
|
-
max_size_bytes = "10GB" # Supports string (e.g., "10GB", "500MB") or number
|
|
125
|
-
max_entries = 1000
|
|
185
|
+
Every generator entry has a `type`, an optional `[generators.backend]` table, and a `[generators.model]` table:
|
|
126
186
|
|
|
127
|
-
|
|
187
|
+
```toml
|
|
188
|
+
[[generators]]
|
|
189
|
+
type = "ggml-llm" # or "ggml-stt" / "mlx-llm"
|
|
190
|
+
|
|
191
|
+
[generators.backend]
|
|
192
|
+
# (see per-type sections below)
|
|
193
|
+
|
|
194
|
+
[generators.model]
|
|
195
|
+
repo_id = "..."
|
|
196
|
+
# (see per-type sections below)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
#### Common `[generators.model]` keys
|
|
200
|
+
|
|
201
|
+
Shared by **all** generator types:
|
|
202
|
+
|
|
203
|
+
| Key | Type | Notes |
|
|
204
|
+
| ------------------------- | --------- | -------------------------------------------------------------------------------- |
|
|
205
|
+
| `repo_id` *(required)* | string | HuggingFace repo (`org/repo`) |
|
|
206
|
+
| `revision` | string | Default `"main"` |
|
|
207
|
+
| `download` | boolean | Pre-download at server startup (default `false`) |
|
|
208
|
+
|
|
209
|
+
Additional keys honored by **ggml-llm** and **ggml-stt** (mlx-llm gets quantization from the repo itself and does not use these):
|
|
210
|
+
|
|
211
|
+
| Key | Type | Notes |
|
|
212
|
+
| ------------------------- | --------- | -------------------------------------------------------------------------------- |
|
|
213
|
+
| `filename` | string | Pin a specific artifact in the repo |
|
|
214
|
+
| `url` | string | Direct download URL (skips manifest lookup) |
|
|
215
|
+
| `quantization` | string | Preferred quant tag — e.g. `q4_0`, `q8_0`, `mxfp4` |
|
|
216
|
+
| `preferred_quantizations` | string[] | Ordered fallback list when `quantization` doesn't match (alias: `quantizations`) |
|
|
217
|
+
| `allow_local_file` | boolean | Required to use `local_path` / `mmproj_local_path` |
|
|
218
|
+
| `local_path` | string | Use a local file as the load path. Repo metadata is still resolved from HF, so `repo_id` is still required. |
|
|
219
|
+
| `api_base`, `base_url` | string | Override HF API / blob hosts (mirrors / proxies) |
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
### `ggml-llm` (llama.cpp via `@fugood/llama.node`)
|
|
224
|
+
|
|
225
|
+
Loads a GGUF LLM. Runtime keys above can be overridden per-generator under `[generators.model]`; `[generators.backend]` only controls backend selection and resource planning.
|
|
226
|
+
|
|
227
|
+
**`[generators.backend]`**
|
|
228
|
+
|
|
229
|
+
| Key | Type | Default | Notes |
|
|
230
|
+
| --------------------- | -------- | --------------------------------------------- | ---------------------------------------------------------------- |
|
|
231
|
+
| `variant` | string | auto | Force `cuda` / `vulkan` / `snapdragon` / `default` |
|
|
232
|
+
| `variant_preference` | string[] | `["cuda","vulkan","snapdragon","default"]` | Probe order when `variant` is unset |
|
|
233
|
+
| `gpu_memory_fraction` | number | `0.85` | Max GPU fraction the hardware guardrails may plan against |
|
|
234
|
+
| `cpu_memory_fraction` | number | `0.5` | Max RAM fraction for CPU-side buffers |
|
|
235
|
+
|
|
236
|
+
**`[generators.model]`** — in addition to the common keys above:
|
|
237
|
+
|
|
238
|
+
| Key | Type | Notes |
|
|
239
|
+
| ----------------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------- |
|
|
240
|
+
| `n_ctx` | number | Context window. Auto-capped at the model's training context. |
|
|
241
|
+
| `n_gpu_layers` | number\|`"auto"` | Layers offloaded to GPU (default `"auto"`) |
|
|
242
|
+
| `n_batch` | number | Prompt batch size (default `512`) |
|
|
243
|
+
| `n_ubatch`, `n_threads`, `n_parallel`, `n_cpu_moe` | number | Same semantics as the `[runtime]` defaults |
|
|
244
|
+
| `flash_attn_type`, `cache_type_k`, `cache_type_v`, `kv_unified`, `swa_full`, `ctx_shift`, `use_mmap`, `use_mlock`, `no_extra_bufts`, `cpu_mask`, `cpu_strict`, `devices` | various | Per-model overrides for the `[runtime]` defaults |
|
|
245
|
+
|
|
246
|
+
**Multimodal (mtmd)** — auto-downloads the matching `mmproj-*.gguf` from the same repo and calls `initMultimodal`:
|
|
247
|
+
|
|
248
|
+
| Key | Type | Notes |
|
|
249
|
+
| ------------------------- | ------- | ------------------------------------------------------------------ |
|
|
250
|
+
| `enable_mtmd` | boolean | Default `false` |
|
|
251
|
+
| `mmproj_filename` | string | Pin a specific projector file |
|
|
252
|
+
| `mmproj_url` | string | Direct URL override |
|
|
253
|
+
| `mmproj_local_path` | string | Local projector (requires `allow_local_file = true`) |
|
|
254
|
+
| `mmproj_use_gpu` | boolean | `null` = auto (true when `n_gpu_layers > 0`) |
|
|
255
|
+
| `mmproj_image_min_tokens` | number | Min visual tokens (dynamic-resolution models; `-1` = unset) |
|
|
256
|
+
| `mmproj_image_max_tokens` | number | Max visual tokens (`-1` = unset) |
|
|
257
|
+
|
|
258
|
+
**Speculative decoding**
|
|
259
|
+
|
|
260
|
+
| Key | Type | Notes |
|
|
261
|
+
| -------------------- | ------ | -------------------------------------------------- |
|
|
262
|
+
| `speculative` | string | Draft model identifier |
|
|
263
|
+
| `spec_type` | string | Strategy (backend-defined) |
|
|
264
|
+
| `spec_draft_n_max` | int | Max drafted tokens per step |
|
|
265
|
+
| `spec_draft_n_min` | int | Min drafted tokens |
|
|
266
|
+
| `spec_draft_p_min` | number | Min acceptance probability |
|
|
267
|
+
| `spec_draft_p_split` | number | Split threshold |
|
|
268
|
+
|
|
269
|
+
**Example**
|
|
270
|
+
|
|
271
|
+
```toml
|
|
128
272
|
[[generators]]
|
|
129
273
|
type = "ggml-llm"
|
|
130
274
|
[generators.backend]
|
|
131
275
|
variant_preference = ["cuda", "vulkan", "default"]
|
|
276
|
+
gpu_memory_fraction = 0.95
|
|
132
277
|
[generators.model]
|
|
133
278
|
repo_id = "ggml-org/gpt-oss-20b-GGUF"
|
|
134
279
|
quantization = "mxfp4"
|
|
135
280
|
n_ctx = 12800
|
|
281
|
+
download = true
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
### `ggml-stt` (whisper.cpp via `@fugood/whisper.node`)
|
|
287
|
+
|
|
288
|
+
Loads a Whisper GGML model for speech-to-text.
|
|
289
|
+
|
|
290
|
+
**`[generators.backend]`**
|
|
136
291
|
|
|
137
|
-
|
|
292
|
+
| Key | Type | Default | Notes |
|
|
293
|
+
| --------------------- | -------- | ----------------------------- | ---------------------------------- |
|
|
294
|
+
| `variant` | string | auto | `cuda` / `vulkan` / `default` |
|
|
295
|
+
| `variant_preference` | string[] | `["cuda","vulkan","default"]` | Probe order |
|
|
296
|
+
| `gpu_memory_fraction` | number | `0.85` | |
|
|
297
|
+
| `cpu_memory_fraction` | number | `0.5` | |
|
|
298
|
+
|
|
299
|
+
**`[generators.model]`** — common keys plus:
|
|
300
|
+
|
|
301
|
+
| Key | Type | Default | Notes |
|
|
302
|
+
| ------------------------- | ----------------------------- | -------------------------------- | ---------------------------------------------------- |
|
|
303
|
+
| `repo_id` | string | `"BricksDisplay/whisper-ggml"` | Defaulted (unlike ggml-llm) |
|
|
304
|
+
| `preferred_quantizations` | string[] | `["q8_0", <no-quant>, "q5_1"]` | Default fallback chain |
|
|
305
|
+
| `use_gpu` | boolean | `true` | Set to `false` to force-disable the GPU even when one is available |
|
|
306
|
+
| `use_flash_attn` | `"on"` / `"off"` / `"auto"` / boolean | `"auto"` | `"auto"` enables flash-attn when GPU is in use. `true`/`false` are accepted as shortcuts for `"on"`/`"off"`. |
|
|
307
|
+
|
|
308
|
+
**Runtime extras** — under `[runtime]` for ggml-stt only:
|
|
309
|
+
|
|
310
|
+
| Key | Type | Notes |
|
|
311
|
+
| ------------- | ------ | ------------------------------------------- |
|
|
312
|
+
| `max_threads` | number | Caps the whisper.cpp thread count |
|
|
313
|
+
|
|
314
|
+
**Example**
|
|
315
|
+
|
|
316
|
+
```toml
|
|
138
317
|
[[generators]]
|
|
139
318
|
type = "ggml-stt"
|
|
140
319
|
[generators.backend]
|
|
141
|
-
variant_preference = ["
|
|
320
|
+
variant_preference = ["cuda", "vulkan", "default"]
|
|
142
321
|
[generators.model]
|
|
143
322
|
repo_id = "BricksDisplay/whisper-ggml"
|
|
144
|
-
filename = "ggml-
|
|
323
|
+
filename = "ggml-large-v3-turbo-q8_0.bin"
|
|
324
|
+
use_gpu = true
|
|
325
|
+
use_flash_attn = "on"
|
|
326
|
+
download = true
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
---
|
|
330
|
+
|
|
331
|
+
### `mlx-llm` (Apple Silicon, Python `mlx-lm` / `mlx-vlm` bridge)
|
|
332
|
+
|
|
333
|
+
Loads an MLX-format model on Apple Silicon. On first use, the backend creates a virtualenv at `{cache_dir}/mlx-env` and installs `mlx_lm_package`, `mlx_vlm_package`, plus `torch` and `torchvision` (required by some VLM processors). If an existing venv already has `mlx_vlm` and `torch` importable, the install step is skipped. There is no `[generators.backend]` section.
|
|
334
|
+
|
|
335
|
+
**`[generators.model]`** — common `repo_id` / `revision` / `download` plus:
|
|
336
|
+
|
|
337
|
+
| Key | Type | Default | Notes |
|
|
338
|
+
| ------------------ | ------------------- | --------- | ----------------------------------------------------------------------------- |
|
|
339
|
+
| `adapter_path` | string | — | Local LoRA adapter directory |
|
|
340
|
+
| `vlm` | `"auto"` / boolean | `"auto"` | Force VLM (`true`) vs text-only (`false`); `"auto"` infers from the repo |
|
|
341
|
+
| `tokenizer_config` | table | — | Forwarded to `mlx_lm.load(..., tokenizer_config=...)` |
|
|
342
|
+
| `model_config` | table | — | Forwarded to `mlx_lm.load(..., model_config=...)` |
|
|
343
|
+
|
|
344
|
+
`quantization`, `filename`, and `preferred_quantizations` are **not** used — the MLX repo itself determines the quantization.
|
|
345
|
+
|
|
346
|
+
**Runtime extras** — under `[runtime]` for mlx-llm:
|
|
347
|
+
|
|
348
|
+
| Key | Type | Default | Notes |
|
|
349
|
+
| ------------------- | ------ | ----------------------------- | -------------------------------------------------------------------- |
|
|
350
|
+
| `mlx_env_dir` | string | `{cache_dir}/mlx-env` | Location of the auto-managed Python venv |
|
|
351
|
+
| `mlx_lm_package` | string | `"mlx-lm==0.31.1"` | pip spec used when provisioning the venv |
|
|
352
|
+
| `mlx_vlm_package` | string | `"mlx-vlm==0.4.0"` | pip spec used when provisioning the venv |
|
|
353
|
+
| `session_cache.*` | table | enabled, `5GB`, 100 entries | Separate cache from ggml-llm (lives in `{cache_dir}/mlx-session-cache`) |
|
|
354
|
+
|
|
355
|
+
**Example**
|
|
356
|
+
|
|
357
|
+
```toml
|
|
358
|
+
[[generators]]
|
|
359
|
+
type = "mlx-llm"
|
|
360
|
+
[generators.model]
|
|
361
|
+
repo_id = "mlx-community/Qwen2.5-VL-3B-Instruct-4bit"
|
|
362
|
+
vlm = true
|
|
363
|
+
download = true
|
|
145
364
|
```
|
|
146
365
|
|
|
147
366
|
### Programmatic Usage
|
package/lib/index.d.mts
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
|
|
2
2
|
import { AnyElysia, Elysia } from "elysia";
|
|
3
3
|
import crypto from "node:crypto";
|
|
4
|
+
import * as node_stream_web0 from "node:stream/web";
|
|
4
5
|
import { ReadableStream } from "node:stream/web";
|
|
5
6
|
import { EventEmitter } from "node:events";
|
|
7
|
+
import { estimateOnnxRuntimeMemory as estimateRuntimeMemory } from "@fugood/buttress-hardware-guardrails";
|
|
6
8
|
|
|
7
9
|
//#region ../buttress-backend-core/lib/types/caps.d.ts
|
|
8
10
|
declare function getCapabilities(type: any, currentClientCapabilities?: any, options?: {}): Promise<any>;
|
|
@@ -101,8 +103,80 @@ declare function testGgmlSttCapabilities({
|
|
|
101
103
|
modelId: string | null;
|
|
102
104
|
defaultConfig: any | null;
|
|
103
105
|
}): Promise<void>;
|
|
106
|
+
//#endregion
|
|
107
|
+
//#region ../buttress-backend-core/lib/types/utils/onnx.d.ts
|
|
108
|
+
declare function resolveModelCacheDir(cacheDir: string, repoId: string, revision?: string): string;
|
|
109
|
+
declare function estimateOnnxModelSize({
|
|
110
|
+
repoId,
|
|
111
|
+
revision,
|
|
112
|
+
modelType,
|
|
113
|
+
dtype,
|
|
114
|
+
cacheDir,
|
|
115
|
+
baseUrl,
|
|
116
|
+
subfolder,
|
|
117
|
+
headers,
|
|
118
|
+
configJson
|
|
119
|
+
}: {
|
|
120
|
+
repoId: string;
|
|
121
|
+
revision?: string;
|
|
122
|
+
modelType?: string;
|
|
123
|
+
dtype?: string | Record<string, string>;
|
|
124
|
+
cacheDir?: string;
|
|
125
|
+
baseUrl?: string;
|
|
126
|
+
subfolder?: string;
|
|
127
|
+
headers?: Record<string, string>;
|
|
128
|
+
configJson?: Record<string, any>;
|
|
129
|
+
}): Promise<{
|
|
130
|
+
totalBytes: number;
|
|
131
|
+
files: Array<{
|
|
132
|
+
name: string;
|
|
133
|
+
dtype: string;
|
|
134
|
+
bytes: number;
|
|
135
|
+
}>;
|
|
136
|
+
warnings: string[];
|
|
137
|
+
source: "local" | "remote";
|
|
138
|
+
}>;
|
|
139
|
+
declare function resolveOnnxDownloadManifest({
|
|
140
|
+
repoId,
|
|
141
|
+
revision,
|
|
142
|
+
modelType,
|
|
143
|
+
dtype,
|
|
144
|
+
cacheDir,
|
|
145
|
+
baseUrl,
|
|
146
|
+
subfolder,
|
|
147
|
+
headers,
|
|
148
|
+
configJson
|
|
149
|
+
}: {
|
|
150
|
+
repoId: string;
|
|
151
|
+
revision?: string;
|
|
152
|
+
modelType?: string;
|
|
153
|
+
dtype?: string | Record<string, string>;
|
|
154
|
+
cacheDir: string;
|
|
155
|
+
baseUrl?: string;
|
|
156
|
+
subfolder?: string;
|
|
157
|
+
headers?: Record<string, string>;
|
|
158
|
+
configJson?: Record<string, any>;
|
|
159
|
+
}): Promise<{
|
|
160
|
+
modelDir: string;
|
|
161
|
+
files: Array<{
|
|
162
|
+
rfilename: string;
|
|
163
|
+
url: string;
|
|
164
|
+
localPath: string;
|
|
165
|
+
size: number;
|
|
166
|
+
}>;
|
|
167
|
+
config: Record<string, any> | null;
|
|
168
|
+
}>;
|
|
169
|
+
declare function startOnnxModelDownload(config: object, globalDownloadManager: object, options?: {
|
|
170
|
+
onProgress?: (p: number) => void;
|
|
171
|
+
onComplete?: (info: object) => void;
|
|
172
|
+
onError?: (err: Error) => void;
|
|
173
|
+
}): Promise<{
|
|
174
|
+
started: boolean;
|
|
175
|
+
localPath: string | null;
|
|
176
|
+
repoId: string | null;
|
|
177
|
+
}>;
|
|
104
178
|
declare namespace index_d_exports {
|
|
105
|
-
export { finalizeGenerator, generatorRegistry, getCapabilities, getModelIdentifier, ggmlLlm, ggmlStt, globalDownloadManager, mlxLlm, showModelsTable, showSttModelsTable, startGenerator, startModelDownload, status, testGgmlLlmCapabilities, testGgmlSttCapabilities };
|
|
179
|
+
export { estimateOnnxModelSize, estimateRuntimeMemory, finalizeGenerator, generatorRegistry, getCapabilities, getModelIdentifier, ggmlLlm, ggmlStt, globalDownloadManager, mlxLlm, onnxStt, onnxTts, resolveModelCacheDir, resolveOnnxDownloadManifest, showModelsTable, showSttModelsTable, startGenerator, startModelDownload, startOnnxModelDownload, status, testGgmlLlmCapabilities, testGgmlSttCapabilities };
|
|
106
180
|
}
|
|
107
181
|
declare function startGenerator(type: any, config: any): Promise<{
|
|
108
182
|
id: any;
|
|
@@ -132,8 +206,47 @@ declare namespace mlxLlm {
|
|
|
132
206
|
function applyChatTemplate(id: any, property: any): Promise<any>;
|
|
133
207
|
function releaseContext(id: any, property: any): Promise<any>;
|
|
134
208
|
}
|
|
209
|
+
declare namespace onnxStt {
|
|
210
|
+
function initContext(id: any, property: any): Promise<any>;
|
|
211
|
+
/**
|
|
212
|
+
* @returns {import('node:stream/web').ReadableStream}
|
|
213
|
+
*/
|
|
214
|
+
function transcribe(id: any, property: any): node_stream_web0.ReadableStream;
|
|
215
|
+
function transcribeData(id: any, property: any): Promise<any>;
|
|
216
|
+
function releaseContext(id: any, property: any): Promise<any>;
|
|
217
|
+
}
|
|
218
|
+
declare namespace onnxTts {
|
|
219
|
+
function initContext(id: any, property: any): Promise<any>;
|
|
220
|
+
function addSpeaker(id: any, speaker: any): Promise<any>;
|
|
221
|
+
function synthesize(id: any, property: any): Promise<any>;
|
|
222
|
+
function releaseContext(id: any, property: any): Promise<any>;
|
|
223
|
+
}
|
|
135
224
|
declare namespace status {
|
|
136
|
-
export function getFullStatus():
|
|
225
|
+
export function getFullStatus(): {
|
|
226
|
+
timestamp: string;
|
|
227
|
+
ggmlLlm: any;
|
|
228
|
+
ggmlStt: any;
|
|
229
|
+
mlxLlm: any;
|
|
230
|
+
onnxStt: any;
|
|
231
|
+
onnxTts: {
|
|
232
|
+
generators: {
|
|
233
|
+
id: any;
|
|
234
|
+
type: any;
|
|
235
|
+
refCount: any;
|
|
236
|
+
repoId: any;
|
|
237
|
+
dtype: any;
|
|
238
|
+
provider: any;
|
|
239
|
+
device: any;
|
|
240
|
+
modelBytes: any;
|
|
241
|
+
vocoderRepoId: any;
|
|
242
|
+
pipelines: any;
|
|
243
|
+
}[];
|
|
244
|
+
history: {
|
|
245
|
+
modelLoads: any[];
|
|
246
|
+
syntheses: any[];
|
|
247
|
+
};
|
|
248
|
+
};
|
|
249
|
+
};
|
|
137
250
|
export function getGgmlLlmStatus(): any;
|
|
138
251
|
export function getGgmlSttStatus(): any;
|
|
139
252
|
export function getMlxLlmStatus(): any;
|
|
@@ -184,7 +297,7 @@ declare namespace globalDownloadManager {
|
|
|
184
297
|
* The download will be tracked by the global download manager so that
|
|
185
298
|
* initContext can wait for it if needed.
|
|
186
299
|
*
|
|
187
|
-
* @param {string} type - The generator type ('ggml-llm'
|
|
300
|
+
* @param {string} type - The generator type ('ggml-llm', 'ggml-stt', etc.)
|
|
188
301
|
* @param {Object} config - The generator configuration
|
|
189
302
|
* @param {Object} options - Options for the download
|
|
190
303
|
* @param {function} options.onProgress - Progress callback (0-1)
|
|
@@ -223,7 +336,7 @@ type RuntimeConfig = {
|
|
|
223
336
|
huggingface_token?: string;
|
|
224
337
|
session_cache?: SessionCacheConfig;
|
|
225
338
|
} & Record<string, any>;
|
|
226
|
-
type GeneratorType = 'ggml-llm' | 'ggml-stt' | 'mlx-llm';
|
|
339
|
+
type GeneratorType = 'ggml-llm' | 'ggml-stt' | 'mlx-llm' | 'onnx-stt' | 'onnx-tts';
|
|
227
340
|
type GeneratorConfig = {
|
|
228
341
|
type: GeneratorType;
|
|
229
342
|
} & Record<string, any>;
|