@fugood/buttress-server-poc 2.23.0-beta.3 → 2.23.0-beta.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -40,6 +40,13 @@ log_level = "info"
 [runtime]
 cache_dir = "~/.buttress/models"
 
+# Session state cache for ggml-llm (saves KV cache to disk for prompt reuse)
+[runtime.session_cache]
+enabled = true
+max_size_bytes = "10GB" # Supports string (e.g., "10GB", "500MB") or number
+max_entries = 1000
+
+# GGML LLM generator
 [[generators]]
 type = "ggml-llm"
 [generators.backend]
@@ -48,6 +55,15 @@ variant_preference = ["cuda", "vulkan", "default"]
 repo_id = "ggml-org/gpt-oss-20b-GGUF"
 quantization = "mxfp4"
 n_ctx = 12800
+
+# GGML STT (Speech-to-Text) generator
+[[generators]]
+type = "ggml-stt"
+[generators.backend]
+variant_preference = ["coreml", "default"]
+[generators.model]
+repo_id = "BricksDisplay/whisper-ggml"
+filename = "ggml-small.bin"
 ```
 
 ### Programmatic Usage
@@ -103,4 +119,75 @@ Port can be configured via multiple sources (highest priority first):
 
 1. **Command-line flag**: `--port 3000`
 2. **Config file**: `[server] port = 2080`
-4. **Default**: `2080`
+3. **Default**: `2080`
+
+## CLI Reference
+
+```
+bricks-buttress-poc v2.23.0-beta.22
+
+Buttress server for remote inference with GGML backends.
+
+Usage:
+  bricks-buttress-poc [options]
+
+Options:
+  -h, --help                Show this help message
+  -v, --version             Show version number
+  -p, --port <port>         Port to listen on (default: 2080)
+  -c, --config <path|toml>  Path to TOML config file or inline TOML string
+
+Testing Options:
+  --test-caps <backend>      Test model capabilities (ggml-llm or ggml-stt)
+  --test-caps-model-id <id>  Model ID to test (used with --test-caps)
+  --test-models <ids>        Comma-separated list of model IDs to test
+  --test-models-default      Test default set of models
+
+Note: --test-models and --test-models-default output a markdown report
+file (e.g., ggml-llm-model-capabilities-YYYY-MM-DD.md)
+
+Environment Variables:
+  BUTTRESS_PORT  Port to listen on (overridden by --port)
+  NODE_ENV       Set to 'development' for dev mode
+
+Examples:
+  bricks-buttress-poc
+  bricks-buttress-poc --port 3000
+  bricks-buttress-poc --config ./config.toml
+  bricks-buttress-poc --test-caps ggml-llm --test-models-default
+  bricks-buttress-poc --test-caps ggml-stt --test-caps-model-id BricksDisplay/whisper-ggml:ggml-small.bin
+```
+
+## Session State Cache
+
+The server supports session state caching for ggml-llm generators, which saves KV cache state to disk after completions. This enables:
+
+- **Prompt reuse**: Same or similar prompts can reuse cached state, skipping prompt processing
+- **Multi-turn conversations**: Conversation history state is preserved across requests
+
+### Configuration
+
+```toml
+[runtime.session_cache]
+enabled = true          # Enable/disable session caching (default: true)
+max_size_bytes = "10GB" # Supports string (e.g., "10GB", "500MB") or number (default: 10GB)
+max_entries = 1000      # Max number of cached entries (default: 1000)
+```
+
+### How it works
+
+1. After a successful completion, the KV cache state is saved to disk
+2. On new completions, the server checks if any cached state matches the prompt prefix
+3. If a match is found, the cached state is loaded, skipping redundant prompt processing
+4. LRU eviction removes oldest entries when limits are exceeded
+
+### Cache location
+
+Cache files are stored in `{cache_dir}/.session-state-cache/`:
+- `cache-map.json` - Index of cached entries
+- `states/` - Binary state files
+- `temp/` - Temporary files (auto-cleaned after 1 hour)
+
+## Tips
+
+- macOS: Use `sudo sysctl iogpu.wired_limit_mb=<number>` to increase the GPU memory allocation limit. By default the GPU can use only about 70% of system memory. For example, on hardware with 128GB of memory, `sudo sysctl iogpu.wired_limit_mb=137438` raises the limit to the full 128GB. Run `sudo sysctl iogpu.wired_limit_mb=0` to restore the default.
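
The prefix-reuse and LRU behavior documented in the new "Session State Cache" section can be modeled compactly. A minimal in-memory sketch, assuming token-level prefix comparison and an insertion-ordered `Map` as the LRU queue; `SessionCacheMap`, its method names, and the entry shape are hypothetical illustrations, not the package's actual API, which persists its index in `cache-map.json` under `{cache_dir}/.session-state-cache/`:

```js
// Hypothetical sketch of the documented lookup, not the package's real code.
class SessionCacheMap {
  constructor({ maxEntries = 1000, maxSizeBytes = 10 * 1024 ** 3 } = {}) {
    this.maxEntries = maxEntries
    this.maxSizeBytes = maxSizeBytes
    this.totalBytes = 0
    this.entries = new Map() // insertion order doubles as LRU order
  }

  // Step 1: after a completion, record where the KV state was saved.
  put(key, { tokens, sizeBytes, statePath }) {
    if (this.entries.has(key)) this.remove(key)
    this.entries.set(key, { tokens, sizeBytes, statePath })
    this.totalBytes += sizeBytes
    this.evictIfNeeded() // step 4: LRU eviction when limits are exceeded
  }

  // Steps 2-3: find a saved state whose tokens are a prefix of the new prompt,
  // preferring the longest match, so only the remaining tail needs processing.
  findBestPrefixMatch(promptTokens) {
    let best = null
    for (const [key, entry] of this.entries) {
      const n = entry.tokens.length
      if (
        n > (best?.entry.tokens.length ?? 0) &&
        n <= promptTokens.length &&
        entry.tokens.every((t, i) => t === promptTokens[i])
      ) {
        best = { key, entry }
      }
    }
    if (best) {
      this.entries.delete(best.key) // refresh LRU position on a hit
      this.entries.set(best.key, best.entry)
    }
    return best // caller would load best.entry.statePath, then decode the tail
  }

  remove(key) {
    const entry = this.entries.get(key)
    if (!entry) return
    this.totalBytes -= entry.sizeBytes
    this.entries.delete(key)
  }

  evictIfNeeded() {
    while (this.entries.size > this.maxEntries || this.totalBytes > this.maxSizeBytes) {
      this.remove(this.entries.keys().next().value) // oldest = least recently used
    }
  }
}
```
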
package/bin/start.mjs CHANGED
@@ -3,12 +3,61 @@ import fs from 'node:fs'
 import path from 'node:path'
 import os from 'node:os'
 import TOML from '@iarna/toml'
+import bytes from 'bytes'
+import { createRequire } from 'node:module'
 
 const serverModule =
   process.env.NODE_ENV === 'development'
     ? await import('../src/index.js')
     : await import('../lib/index.js')
-const { startServer, checkAndNotifyUpdates } = serverModule
+
+const require = createRequire(import.meta.url)
+const pkg = require('../package.json')
+
+// Handle --version/-v flag
+if (process.argv.includes('--version') || process.argv.includes('-v')) {
+  console.log(pkg.version)
+  process.exit(0)
+}
+
+// Handle --help/-h flag
+if (process.argv.includes('--help') || process.argv.includes('-h')) {
+  console.log(`
+bricks-buttress-poc v${pkg.version}
+
+Buttress server for remote inference with GGML backends.
+
+Usage:
+  bricks-buttress-poc [options]
+
+Options:
+  -h, --help                Show this help message
+  -v, --version             Show version number
+  -p, --port <port>         Port to listen on (default: 2080)
+  -c, --config <path|toml>  Path to TOML config file or inline TOML string
+
+Testing Options:
+  --test-caps <backend>      Test model capabilities (ggml-llm or ggml-stt)
+  --test-caps-model-id <id>  Model ID to test (used with --test-caps)
+  --test-models <ids>        Comma-separated list of model IDs to test
+  --test-models-default      Test default set of models
+
+Note: --test-models and --test-models-default output a markdown report
+file (e.g., ggml-llm-model-capabilities-YYYY-MM-DD.md)
+
+Environment Variables:
+  BUTTRESS_PORT  Port to listen on (overridden by --port)
+  NODE_ENV       Set to 'development' for dev mode
+
+Examples:
+  bricks-buttress-poc
+  bricks-buttress-poc --port 3000
+  bricks-buttress-poc --config ./config.toml
+  bricks-buttress-poc --test-caps ggml-llm --test-models-default
+  bricks-buttress-poc --test-caps ggml-stt --test-caps-model-id BricksDisplay/whisper-ggml:ggml-small.bin
+`)
+  process.exit(0)
+}
 
 const portArgIndex = process.argv.findIndex((arg) => arg === '--port' || arg === '-p')
 const portValue = portArgIndex >= 0 ? Number(process.argv[portArgIndex + 1]) : undefined
@@ -53,6 +102,117 @@ if (configInput) {
   }
 }
 
+const {
+  testGgmlLlmCapabilities,
+  testGgmlSttCapabilities,
+  showModelsTable,
+  showSttModelsTable,
+  startServer,
+  checkAndNotifyUpdates,
+} = serverModule
+
+// Default models for --test-models-default
+const DEFAULT_TEST_GGML_LLM_MODELS = [
+  'ggml-org/gpt-oss-20b-GGUF',
+  'ggml-org/gpt-oss-120b-GGUF',
+  'unsloth/Nemotron-3-Nano-30B-A3B-GGUF',
+  'unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF',
+  'bartowski/Mistral-Nemo-Instruct-2407-GGUF',
+  'mistralai/Magistral-Small-2509-GGUF',
+  'mistralai/Ministral-3-14B-Reasoning-2512-GGUF',
+  'bartowski/mistralai_Devstral-Small-2-24B-Instruct-2512-GGUF',
+  'bartowski/mistralai_Devstral-2-123B-Instruct-2512-GGUF',
+  'ggml-org/gemma-3-12b-it-qat-GGUF',
+  'ggml-org/gemma-3-27b-it-qat-GGUF',
+  'unsloth/phi-4-GGUF',
+]
+
+const DEFAULT_TEST_GGML_STT_MODELS = [
+  'BricksDisplay/whisper-ggml:ggml-small.bin',
+  'BricksDisplay/whisper-ggml:ggml-small-q8_0.bin',
+  'BricksDisplay/whisper-ggml:ggml-medium.bin',
+  'BricksDisplay/whisper-ggml:ggml-medium-q8_0.bin',
+  'BricksDisplay/whisper-ggml:ggml-large-v3-turbo.bin',
+  'BricksDisplay/whisper-ggml:ggml-large-v3-turbo-q8_0.bin',
+  'BricksDisplay/whisper-ggml:ggml-large-v3.bin',
+]
+
+// Handle --test-caps flag
+const testCapsArgIndex = process.argv.findIndex((arg) => arg === '--test-caps')
+if (testCapsArgIndex >= 0) {
+  const backendType = process.argv[testCapsArgIndex + 1] || 'ggml-llm'
+  if (backendType !== 'ggml-llm' && backendType !== 'ggml-stt') {
+    console.error('Only ggml-llm and ggml-stt backends are supported for testing capabilities')
+    process.exit(1)
+  }
+
+  // Check for --test-models or --test-models-default
+  const testModelsArgIndex = process.argv.findIndex((arg) => arg === '--test-models')
+  const hasTestModelsDefault = process.argv.includes('--test-models-default')
+
+  if (backendType === 'ggml-stt') {
+    // STT backend
+    if (testModelsArgIndex >= 0) {
+      const modelIdsInput = process.argv[testModelsArgIndex + 1]
+      if (!modelIdsInput) {
+        console.error('Error: --test-models requires a comma-separated list of model IDs')
+        process.exit(1)
+      }
+      const modelIds = modelIdsInput.split(',').map((id) => id.trim())
+
+      await showSttModelsTable({
+        modelIds,
+        defaultConfig,
+      })
+    } else if (hasTestModelsDefault) {
+      await showSttModelsTable({
+        modelIds: DEFAULT_TEST_GGML_STT_MODELS,
+        defaultConfig,
+      })
+    } else {
+      // Single model test
+      const testCapsModelIdArgIndex = process.argv.findIndex(
+        (arg) => arg === '--test-caps-model-id',
+      )
+      const modelId =
+        testCapsModelIdArgIndex >= 0 ? process.argv[testCapsModelIdArgIndex + 1] : null
+
+      await testGgmlSttCapabilities({
+        modelId,
+        defaultConfig,
+      })
+    }
+  } else if (testModelsArgIndex >= 0) {
+    // LLM backend with custom models
+    const modelIdsInput = process.argv[testModelsArgIndex + 1]
+    if (!modelIdsInput) {
+      console.error('Error: --test-models requires a comma-separated list of model IDs')
+      process.exit(1)
+    }
+    const modelIds = modelIdsInput.split(',').map((id) => id.trim())
+
+    await showModelsTable({
+      modelIds,
+      defaultConfig,
+    })
+  } else if (hasTestModelsDefault) {
+    // LLM backend with default models
+    await showModelsTable({
+      modelIds: DEFAULT_TEST_GGML_LLM_MODELS,
+      defaultConfig,
+    })
+  } else {
+    // LLM backend single model test
+    const testCapsModelIdArgIndex = process.argv.findIndex((arg) => arg === '--test-caps-model-id')
+    const modelId = testCapsModelIdArgIndex >= 0 ? process.argv[testCapsModelIdArgIndex + 1] : null
+
+    await testGgmlLlmCapabilities({
+      modelId,
+      defaultConfig,
+    })
+  }
+}
+
 const getLocalIpAddress = () => {
   const networkInterfaces = os.networkInterfaces()
   const localIp = Object.values(networkInterfaces)
@@ -62,11 +222,17 @@ const getLocalIpAddress = () => {
 }
 
 const configPort = defaultConfig?.server?.port
+const configMaxBodySizeRaw = defaultConfig?.server?.max_body_size
+const configMaxBodySize =
+  typeof configMaxBodySizeRaw === 'string'
+    ? bytes.parse(configMaxBodySizeRaw)
+    : configMaxBodySizeRaw
 const finalPort = portValue || configPort || Number(process.env.BUTTRESS_PORT) || 2080
 
 startServer({
   port: finalPort,
   defaultConfig,
+  maxBodySize: configMaxBodySize,
 })
   .then(async ({ port }) => {
     const ip = getLocalIpAddress()
@@ -74,10 +240,18 @@ startServer({
     console.log('--------------------------------')
     await checkAndNotifyUpdates()
 
+    console.log()
+    console.log('Current supported Generators:')
+    console.log('- LLM (GGML)')
+    console.log('- STT (GGML)')
+    console.log()
     console.log(
-      'Please configure `Buttress (Remote Inference)` in the Generator like LLM (GGML) to connect to this server.',
+      'Please configure `Buttress (Remote Inference)` in the Generator to connect to this server.',
     )
-    console.log(`Use http://${ip}:${port} (tRPC path: /trpc) to connect to this server via LAN.`)
+    console.log()
+    console.log(`- Use http://${ip}:${port}/trpc to connect to this server via LAN.`)
+    console.log(`- Visit http://${ip}:${port}/status to see status via LAN.`)
+    console.log()
   })
   .catch((error) => {
     console.error('Failed to start Buttress server POC:', error)
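
The new `max_body_size` option flows through the `typeof` check added to `bin/start.mjs` above: strings are parsed with `bytes.parse`, which uses binary (1024-based) units, while numbers pass through untouched. A small sketch of the expected values; `normalizeMaxBodySize` is a hypothetical helper that just restates that check:

```js
import bytes from 'bytes'

// Strings are parsed with binary (1024-based) units; numbers pass through
// unchanged, mirroring the typeof check in bin/start.mjs above.
const normalizeMaxBodySize = (raw) =>
  typeof raw === 'string' ? bytes.parse(raw) : raw

console.log(normalizeMaxBodySize('100MB'))   // 104857600 (100 * 1024 ** 2)
console.log(normalizeMaxBodySize('1GB'))     // 1073741824
console.log(normalizeMaxBodySize(104857600)) // 104857600 (already a number)
console.log(normalizeMaxBodySize(undefined)) // undefined (option not set)
```
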
@@ -8,18 +8,52 @@
 [server]
 port = 2080
 log_level = "info"
+# max_body_size = "100MB" # Supports string (e.g., "100MB", "1GB") or number in bytes
 
 [runtime]
 cache_dir = "./.buttress-cache"
+# huggingface_token = "hf_xx"
+
+# Global model params
+flash_attn_type = "on"
+cache_type_k = "q8_0"
+cache_type_v = "q8_0"
+
+# Session state cache for ggml-llm (saves KV cache to disk for prompt reuse)
+[runtime.session_cache]
+enabled = true
+max_size_bytes = "10GB" # Supports string (e.g., "10GB", "500MB") or number
+max_entries = 1000
 
 [[generators]]
 type = "ggml-llm"
 [generators.backend]
-variant_preference = ["cuda", "vulkan", "default"]
+variant_preference = ["cuda", "vulkan", "snapdragon", "default"]
+gpu_memory_fraction = 0.95
+cpu_memory_fraction = 0.95
 [generators.model]
 repo_id = "ggml-org/gpt-oss-20b-GGUF"
 quantization = "mxfp4"
-n_ctx = 12800
+# n_ctx = 12800 # Max: 131072
+
+[[generators]]
+type = "ggml-llm"
+[generators.backend]
+variant_preference = ["cuda", "vulkan", "snapdragon", "default"]
+gpu_memory_fraction = 0.95
+[generators.model]
+repo_id = "ggml-org/gpt-oss-120b-GGUF"
+quantization = "mxfp4"
+
+[[generators]]
+type = "ggml-llm"
+[generators.backend]
+variant_preference = ["default"]
+gpu_memory_fraction = 0.95
+[generators.model]
+repo_id = "bartowski/mistralai_Devstral-2-123B-Instruct-2512-GGUF"
+quantization = "q4_0"
+# n_ctx = 128000 # Max: 262144
 
 [[generators]]
 type = "ggml-llm"
@@ -28,3 +62,30 @@ variant_preference = ["default"]
 [generators.model]
 repo_id = "ggml-org/gemma-3-270m-qat-GGUF"
 quantization = "q4_0"
+
+[[generators]]
+type = "ggml-llm"
+[generators.backend]
+variant_preference = ["default"]
+[generators.model]
+repo_id = "ggml-org/gemma-3-12b-it-qat-GGUF"
+
+# Speech-to-Text (STT) generators
+[[generators]]
+type = "ggml-stt"
+[generators.backend]
+variant_preference = ["cuda", "vulkan", "default"]
+[generators.model]
+repo_id = "BricksDisplay/whisper-ggml"
+filename = "ggml-small-q8_0.bin"
+use_gpu = true
+
+[[generators]]
+type = "ggml-stt"
+[generators.backend]
+variant_preference = ["cuda", "vulkan", "default"]
+[generators.model]
+repo_id = "BricksDisplay/whisper-ggml"
+filename = "ggml-large-v3-turbo-q8_0.bin"
+use_gpu = true
+use_flash_attn = true
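
Pieced together from the `bin/start.mjs` changes above, a programmatic start might look like the following. This is a minimal sketch, assuming `startServer` is re-exported from the package entry point (as `bin/start.mjs` imports it from `../lib/index.js`) and that `defaultConfig` mirrors the parsed-TOML shape the launcher reads (`server.port`, `server.max_body_size`, `runtime`, `generators`):

```js
import bytes from 'bytes'
// Assumption: startServer is exported from the package entry point, as
// bin/start.mjs imports it from ../lib/index.js.
import { startServer } from '@fugood/buttress-server-poc'

// Plain-object equivalent of the example TOML config above.
const defaultConfig = {
  server: { port: 2080, log_level: 'info' },
  runtime: {
    cache_dir: './.buttress-cache',
    session_cache: { enabled: true, max_size_bytes: '10GB', max_entries: 1000 },
  },
  generators: [
    {
      type: 'ggml-llm',
      backend: { variant_preference: ['cuda', 'vulkan', 'default'] },
      model: { repo_id: 'ggml-org/gpt-oss-20b-GGUF', quantization: 'mxfp4' },
    },
  ],
}

const { port } = await startServer({
  port: 2080,
  defaultConfig,
  maxBodySize: bytes.parse('100MB'), // same normalization bin/start.mjs applies
})
console.log(`Buttress listening on port ${port} (tRPC path: /trpc)`)
```
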