@fugood/buttress-server-poc 2.23.0-beta.3 → 2.23.0-beta.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -1
- package/bin/start.mjs +177 -3
- package/config/sample.toml +63 -2
- package/lib/index.js +1 -1
- package/package.json +8 -5
- package/public/status.html +896 -0
package/README.md
CHANGED
@@ -40,6 +40,13 @@ log_level = "info"
 [runtime]
 cache_dir = "~/.buttress/models"
 
+# Session state cache for ggml-llm (saves KV cache to disk for prompt reuse)
+[runtime.session_cache]
+enabled = true
+max_size_bytes = "10GB" # Supports string (e.g., "10GB", "500MB") or number
+max_entries = 1000
+
+# GGML LLM generator
 [[generators]]
 type = "ggml-llm"
 [generators.backend]
@@ -48,6 +55,15 @@ variant_preference = ["cuda", "vulkan", "default"]
 repo_id = "ggml-org/gpt-oss-20b-GGUF"
 quantization = "mxfp4"
 n_ctx = 12800
+
+# GGML STT (Speech-to-Text) generator
+[[generators]]
+type = "ggml-stt"
+[generators.backend]
+variant_preference = ["coreml", "default"]
+[generators.model]
+repo_id = "BricksDisplay/whisper-ggml"
+filename = "ggml-small.bin"
 ```
 
 ### Programmatic Usage
@@ -103,4 +119,75 @@ Port can be configured via multiple sources (highest priority first):
 
 1. **Command-line flag**: `--port 3000`
 2. **Config file**: `[server] port = 2080`
-
+3. **Default**: `2080`
+
+## CLI Reference
+
+```
+bricks-buttress-poc v2.23.0-beta.22
+
+Buttress server for remote inference with GGML backends.
+
+Usage:
+  bricks-buttress-poc [options]
+
+Options:
+  -h, --help                 Show this help message
+  -v, --version              Show version number
+  -p, --port <port>          Port to listen on (default: 2080)
+  -c, --config <path|toml>   Path to TOML config file or inline TOML string
+
+Testing Options:
+  --test-caps <backend>      Test model capabilities (ggml-llm or ggml-stt)
+  --test-caps-model-id <id>  Model ID to test (used with --test-caps)
+  --test-models <ids>        Comma-separated list of model IDs to test
+  --test-models-default      Test default set of models
+
+Note: --test-models and --test-models-default output a markdown report
+  file (e.g., ggml-llm-model-capabilities-YYYY-MM-DD.md)
+
+Environment Variables:
+  BUTTRESS_PORT              Port to listen on (overridden by --port)
+  NODE_ENV                   Set to 'development' for dev mode
+
+Examples:
+  bricks-buttress-poc
+  bricks-buttress-poc --port 3000
+  bricks-buttress-poc --config ./config.toml
+  bricks-buttress-poc --test-caps ggml-llm --test-models-default
+  bricks-buttress-poc --test-caps ggml-stt --test-caps-model-id BricksDisplay/whisper-ggml:ggml-small.bin
+```
+
+## Session State Cache
+
+The server supports session state caching for ggml-llm generators, which saves KV cache state to disk after completions. This enables:
+
+- **Prompt reuse**: Same or similar prompts can reuse cached state, skipping prompt processing
+- **Multi-turn conversations**: Conversation history state is preserved across requests
+
+### Configuration
+
+```toml
+[runtime.session_cache]
+enabled = true          # Enable/disable session caching (default: true)
+max_size_bytes = "10GB" # Supports string (e.g., "10GB", "500MB") or number (default: 10GB)
+max_entries = 1000      # Max number of cached entries (default: 1000)
+```
+
+### How it works
+
+1. After a successful completion, the KV cache state is saved to disk
+2. On new completions, the server checks if any cached state matches the prompt prefix
+3. If a match is found, the cached state is loaded, skipping redundant prompt processing
+4. LRU eviction removes oldest entries when limits are exceeded
+
+### Cache location
+
+Cache files are stored in `{cache_dir}/.session-state-cache/`:
+- `cache-map.json` - Index of cached entries
+- `states/` - Binary state files
+- `temp/` - Temporary files (auto-cleaned after 1 hour)
+
+## Tips
+
+- macOS: Use `sudo sysctl iogpu.wired_limit_mb=<number>` to increase the GPU memory allocation limit. By default, only about 70% of system memory is available to the GPU. For example, on a machine with 128GB of memory, `sudo sysctl iogpu.wired_limit_mb=137438` raises the limit to 128GB. Run `sudo sysctl iogpu.wired_limit_mb=0` to restore the default.
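The new README section above describes the cache lookup as prefix matching plus LRU eviction, but the implementation itself is not part of this diff. As a rough illustration only, here is a minimal JavaScript sketch of that lookup/eviction logic; the class and field names are hypothetical and do not come from the package:

```js
// Hypothetical sketch of the prefix-match + LRU behavior the README describes.
// Nothing here is the package's actual API; it only mirrors steps 1-4 above.
class SessionStateCache {
  constructor({ maxEntries = 1000 } = {}) {
    this.maxEntries = maxEntries
    this.entries = new Map() // statePath -> { promptTokens, lastUsedAt }
  }

  // Steps 2-3: find the cached entry whose saved prompt is the longest
  // prefix of the new prompt, so processing can resume after that prefix.
  findBestPrefix(promptTokens) {
    let best = null
    for (const [statePath, entry] of this.entries) {
      const saved = entry.promptTokens
      if (saved.length > promptTokens.length) continue
      if (best && saved.length <= best.entry.promptTokens.length) continue
      if (saved.every((tok, i) => tok === promptTokens[i])) best = { statePath, entry }
    }
    if (best) best.entry.lastUsedAt = Date.now() // refresh recency for LRU
    return best
  }

  // Steps 1 and 4: record a saved state, then evict least recently used entries.
  save(statePath, promptTokens) {
    this.entries.set(statePath, { promptTokens, lastUsedAt: Date.now() })
    while (this.entries.size > this.maxEntries) {
      const [oldestPath] = [...this.entries.entries()].sort(
        (a, b) => a[1].lastUsedAt - b[1].lastUsedAt,
      )[0]
      this.entries.delete(oldestPath)
    }
  }
}
```

In the real package, `cache-map.json` presumably plays the role of this in-memory map, persisted under `{cache_dir}/.session-state-cache/` alongside the binary state files.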
package/bin/start.mjs
CHANGED
@@ -3,12 +3,61 @@ import fs from 'node:fs'
 import path from 'node:path'
 import os from 'node:os'
 import TOML from '@iarna/toml'
+import bytes from 'bytes'
+import { createRequire } from 'node:module'
 
 const serverModule =
   process.env.NODE_ENV === 'development'
     ? await import('../src/index.js')
     : await import('../lib/index.js')
-
+
+const require = createRequire(import.meta.url)
+const pkg = require('../package.json')
+
+// Handle --version/-v flag
+if (process.argv.includes('--version') || process.argv.includes('-v')) {
+  console.log(pkg.version)
+  process.exit(0)
+}
+
+// Handle --help/-h flag
+if (process.argv.includes('--help') || process.argv.includes('-h')) {
+  console.log(`
+bricks-buttress-poc v${pkg.version}
+
+Buttress server for remote inference with GGML backends.
+
+Usage:
+  bricks-buttress-poc [options]
+
+Options:
+  -h, --help                 Show this help message
+  -v, --version              Show version number
+  -p, --port <port>          Port to listen on (default: 2080)
+  -c, --config <path|toml>   Path to TOML config file or inline TOML string
+
+Testing Options:
+  --test-caps <backend>      Test model capabilities (ggml-llm or ggml-stt)
+  --test-caps-model-id <id>  Model ID to test (used with --test-caps)
+  --test-models <ids>        Comma-separated list of model IDs to test
+  --test-models-default      Test default set of models
+
+Note: --test-models and --test-models-default output a markdown report
+  file (e.g., ggml-llm-model-capabilities-YYYY-MM-DD.md)
+
+Environment Variables:
+  BUTTRESS_PORT              Port to listen on (overridden by --port)
+  NODE_ENV                   Set to 'development' for dev mode
+
+Examples:
+  bricks-buttress-poc
+  bricks-buttress-poc --port 3000
+  bricks-buttress-poc --config ./config.toml
+  bricks-buttress-poc --test-caps ggml-llm --test-models-default
+  bricks-buttress-poc --test-caps ggml-stt --test-caps-model-id BricksDisplay/whisper-ggml:ggml-small.bin
+`)
+  process.exit(0)
+}
 
 const portArgIndex = process.argv.findIndex((arg) => arg === '--port' || arg === '-p')
 const portValue = portArgIndex >= 0 ? Number(process.argv[portArgIndex + 1]) : undefined
@@ -53,6 +102,117 @@ if (configInput) {
   }
 }
 
+const {
+  testGgmlLlmCapabilities,
+  testGgmlSttCapabilities,
+  showModelsTable,
+  showSttModelsTable,
+  startServer,
+  checkAndNotifyUpdates,
+} = serverModule
+
+// Default models for --test-models-default
+const DEFAULT_TEST_GGML_LLM_MODELS = [
+  'ggml-org/gpt-oss-20b-GGUF',
+  'ggml-org/gpt-oss-120b-GGUF',
+  'unsloth/Nemotron-3-Nano-30B-A3B-GGUF',
+  'unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF',
+  'bartowski/Mistral-Nemo-Instruct-2407-GGUF',
+  'mistralai/Magistral-Small-2509-GGUF',
+  'mistralai/Ministral-3-14B-Reasoning-2512-GGUF',
+  'bartowski/mistralai_Devstral-Small-2-24B-Instruct-2512-GGUF',
+  'bartowski/mistralai_Devstral-2-123B-Instruct-2512-GGUF',
+  'ggml-org/gemma-3-12b-it-qat-GGUF',
+  'ggml-org/gemma-3-27b-it-qat-GGUF',
+  'unsloth/phi-4-GGUF',
+]
+
+const DEFAULT_TEST_GGML_STT_MODELS = [
+  'BricksDisplay/whisper-ggml:ggml-small.bin',
+  'BricksDisplay/whisper-ggml:ggml-small-q8_0.bin',
+  'BricksDisplay/whisper-ggml:ggml-medium.bin',
+  'BricksDisplay/whisper-ggml:ggml-medium-q8_0.bin',
+  'BricksDisplay/whisper-ggml:ggml-large-v3-turbo.bin',
+  'BricksDisplay/whisper-ggml:ggml-large-v3-turbo-q8_0.bin',
+  'BricksDisplay/whisper-ggml:ggml-large-v3.bin',
+]
+
+// Handle --test-caps flag
+const testCapsArgIndex = process.argv.findIndex((arg) => arg === '--test-caps')
+if (testCapsArgIndex >= 0) {
+  const backendType = process.argv[testCapsArgIndex + 1] || 'ggml-llm'
+  if (backendType !== 'ggml-llm' && backendType !== 'ggml-stt') {
+    console.error('Only ggml-llm and ggml-stt backends are supported for testing capabilities')
+    process.exit(1)
+  }
+
+  // Check for --test-models or --test-models-default
+  const testModelsArgIndex = process.argv.findIndex((arg) => arg === '--test-models')
+  const hasTestModelsDefault = process.argv.includes('--test-models-default')
+
+  if (backendType === 'ggml-stt') {
+    // STT backend
+    if (testModelsArgIndex >= 0) {
+      const modelIdsInput = process.argv[testModelsArgIndex + 1]
+      if (!modelIdsInput) {
+        console.error('Error: --test-models requires a comma-separated list of model IDs')
+        process.exit(1)
+      }
+      const modelIds = modelIdsInput.split(',').map((id) => id.trim())
+
+      await showSttModelsTable({
+        modelIds,
+        defaultConfig,
+      })
+    } else if (hasTestModelsDefault) {
+      await showSttModelsTable({
+        modelIds: DEFAULT_TEST_GGML_STT_MODELS,
+        defaultConfig,
+      })
+    } else {
+      // Single model test
+      const testCapsModelIdArgIndex = process.argv.findIndex(
+        (arg) => arg === '--test-caps-model-id',
+      )
+      const modelId =
+        testCapsModelIdArgIndex >= 0 ? process.argv[testCapsModelIdArgIndex + 1] : null
+
+      await testGgmlSttCapabilities({
+        modelId,
+        defaultConfig,
+      })
+    }
+  } else if (testModelsArgIndex >= 0) {
+    // LLM backend with custom models
+    const modelIdsInput = process.argv[testModelsArgIndex + 1]
+    if (!modelIdsInput) {
+      console.error('Error: --test-models requires a comma-separated list of model IDs')
+      process.exit(1)
+    }
+    const modelIds = modelIdsInput.split(',').map((id) => id.trim())
+
+    await showModelsTable({
+      modelIds,
+      defaultConfig,
+    })
+  } else if (hasTestModelsDefault) {
+    // LLM backend with default models
+    await showModelsTable({
+      modelIds: DEFAULT_TEST_GGML_LLM_MODELS,
+      defaultConfig,
+    })
+  } else {
+    // LLM backend single model test
+    const testCapsModelIdArgIndex = process.argv.findIndex((arg) => arg === '--test-caps-model-id')
+    const modelId = testCapsModelIdArgIndex >= 0 ? process.argv[testCapsModelIdArgIndex + 1] : null
+
+    await testGgmlLlmCapabilities({
+      modelId,
+      defaultConfig,
+    })
+  }
+}
+
 const getLocalIpAddress = () => {
   const networkInterfaces = os.networkInterfaces()
   const localIp = Object.values(networkInterfaces)
@@ -62,11 +222,17 @@ const getLocalIpAddress = () => {
 }
 
 const configPort = defaultConfig?.server?.port
+const configMaxBodySizeRaw = defaultConfig?.server?.max_body_size
+const configMaxBodySize =
+  typeof configMaxBodySizeRaw === 'string'
+    ? bytes.parse(configMaxBodySizeRaw)
+    : configMaxBodySizeRaw
 const finalPort = portValue || configPort || Number(process.env.BUTTRESS_PORT) || 2080
 
 startServer({
   port: finalPort,
   defaultConfig,
+  maxBodySize: configMaxBodySize,
 })
   .then(async ({ port }) => {
     const ip = getLocalIpAddress()
@@ -74,10 +240,18 @@ startServer({
     console.log('--------------------------------')
     await checkAndNotifyUpdates()
 
+    console.log()
+    console.log('Current supported Generators:')
+    console.log('- LLM (GGML)')
+    console.log('- STT (GGML)')
+    console.log()
     console.log(
-      'Please configure `Buttress (Remote Inference)` in the Generator
+      'Please configure `Buttress (Remote Inference)` in the Generator to connect to this server.',
     )
-    console.log(
+    console.log()
+    console.log(`- Use http://${ip}:${port}/trpc to connect to this server via LAN.`)
+    console.log(`- Visit http://${ip}:${port}/status to see status via LAN.`)
+    console.log()
   })
   .catch((error) => {
     console.error('Failed to start Buttress server POC:', error)
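A note on the new `max_body_size` handling above: `start.mjs` normalizes `server.max_body_size` with the `bytes` package before passing it to `startServer` as `maxBodySize`. A quick sketch of what that normalization does; note that `bytes.parse` treats units as powers of 1024 and returns `null` for unparseable strings:

```js
import bytes from 'bytes'

// Same guard as in start.mjs: strings are parsed, numbers pass through.
const normalizeSize = (v) => (typeof v === 'string' ? bytes.parse(v) : v)

console.log(normalizeSize('100MB')) // 104857600   (100 * 1024^2)
console.log(normalizeSize('10GB'))  // 10737418240 (10 * 1024^3)
console.log(normalizeSize(1048576)) // 1048576     (numbers pass through unchanged)
console.log(normalizeSize('oops'))  // null        (bytes.parse cannot parse it)
```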
package/config/sample.toml
CHANGED
@@ -8,18 +8,52 @@
 [server]
 port = 2080
 log_level = "info"
+# max_body_size = "100MB" # Supports string (e.g., "100MB", "1GB") or number in bytes
 
 [runtime]
 cache_dir = "./.buttress-cache"
+# huggingface_token = "hf_xx"
+
+# Global model params
+flash_attn_type = "on"
+cache_type_k = "q8_0"
+cache_type_v = "q8_0"
+
+# Session state cache for ggml-llm (saves KV cache to disk for prompt reuse)
+[runtime.session_cache]
+enabled = true
+max_size_bytes = "10GB" # Supports string (e.g., "10GB", "500MB") or number
+max_entries = 1000
 
 [[generators]]
 type = "ggml-llm"
 [generators.backend]
-variant_preference = ["cuda", "vulkan", "default"]
+variant_preference = ["cuda", "vulkan", "snapdragon", "default"]
+gpu_memory_fraction = 0.95
+cpu_memory_fraction = 0.95
 [generators.model]
 repo_id = "ggml-org/gpt-oss-20b-GGUF"
 quantization = "mxfp4"
-n_ctx = 12800
+# n_ctx = 12800 # Max: 131072
+
+[[generators]]
+type = "ggml-llm"
+[generators.backend]
+variant_preference = ["cuda", "vulkan", "snapdragon", "default"]
+gpu_memory_fraction = 0.95
+[generators.model]
+repo_id = "ggml-org/gpt-oss-120b-GGUF"
+quantization = "mxfp4"
+
+[[generators]]
+type = "ggml-llm"
+[generators.backend]
+variant_preference = ["default"]
+gpu_memory_fraction = 0.95
+[generators.model]
+repo_id = "bartowski/mistralai_Devstral-2-123B-Instruct-2512-GGUF"
+quantization = "q4_0"
+# n_ctx = 128000 # Max: 262144
 
 [[generators]]
 type = "ggml-llm"
@@ -28,3 +62,30 @@ variant_preference = ["default"]
 [generators.model]
 repo_id = "ggml-org/gemma-3-270m-qat-GGUF"
 quantization = "q4_0"
+
+[[generators]]
+type = "ggml-llm"
+[generators.backend]
+variant_preference = ["default"]
+[generators.model]
+repo_id = "ggml-org/gemma-3-12b-it-qat-GGUF"
+
+# Speech-to-Text (STT) generators
+[[generators]]
+type = "ggml-stt"
+[generators.backend]
+variant_preference = ["cuda", "vulkan", "default"]
+[generators.model]
+repo_id = "BricksDisplay/whisper-ggml"
+filename = "ggml-small-q8_0.bin"
+use_gpu = true
+
+[[generators]]
+type = "ggml-stt"
+[generators.backend]
+variant_preference = ["cuda", "vulkan", "default"]
+[generators.model]
+repo_id = "BricksDisplay/whisper-ggml"
+filename = "ggml-large-v3-turbo-q8_0.bin"
+use_gpu = true
+use_flash_attn = true