lemonade-sdk 8.0.6__py3-none-any.whl → 8.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/common/inference_engines.py +62 -77
- lemonade/common/network.py +18 -1
- lemonade/common/system_info.py +61 -44
- lemonade/tools/llamacpp/bench.py +3 -1
- lemonade/tools/llamacpp/load.py +13 -4
- lemonade/tools/llamacpp/utils.py +229 -61
- lemonade/tools/oga/load.py +239 -112
- lemonade/tools/oga/utils.py +19 -7
- lemonade/tools/server/llamacpp.py +30 -53
- lemonade/tools/server/serve.py +64 -123
- lemonade/tools/server/static/styles.css +208 -6
- lemonade/tools/server/static/webapp.html +510 -71
- lemonade/tools/server/tray.py +4 -2
- lemonade/tools/server/utils/thread.py +2 -4
- lemonade/version.py +1 -1
- lemonade_install/install.py +90 -86
- {lemonade_sdk-8.0.6.dist-info → lemonade_sdk-8.1.1.dist-info}/METADATA +74 -24
- {lemonade_sdk-8.0.6.dist-info → lemonade_sdk-8.1.1.dist-info}/RECORD +27 -27
- lemonade_server/cli.py +79 -26
- lemonade_server/model_manager.py +4 -3
- lemonade_server/pydantic_models.py +1 -4
- lemonade_server/server_models.json +60 -11
- {lemonade_sdk-8.0.6.dist-info → lemonade_sdk-8.1.1.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.0.6.dist-info → lemonade_sdk-8.1.1.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.0.6.dist-info → lemonade_sdk-8.1.1.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.0.6.dist-info → lemonade_sdk-8.1.1.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.0.6.dist-info → lemonade_sdk-8.1.1.dist-info}/top_level.txt +0 -0
lemonade_server/cli.py
CHANGED
@@ -39,11 +39,19 @@ class ModelNotAvailableError(Exception):
     """


+class ModelLoadError(Exception):
+    """
+    The model failed to load on the server
+    """
+
+
 def serve(
     port: int = None,
     log_level: str = None,
     tray: bool = False,
     use_thread: bool = False,
+    llamacpp_backend: str = None,
+    ctx_size: int = None,
 ):
     """
     Execute the serve command
@@ -51,26 +59,33 @@ def serve(

     # Otherwise, start the server
     print("Starting Lemonade Server...")
-    from lemonade.tools.server.serve import
+    from lemonade.tools.server.serve import (
+        Server,
+        DEFAULT_PORT,
+        DEFAULT_LOG_LEVEL,
+        DEFAULT_LLAMACPP_BACKEND,
+        DEFAULT_CTX_SIZE,
+    )

     port = port if port is not None else DEFAULT_PORT
     log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
+    llamacpp_backend = (
+        llamacpp_backend if llamacpp_backend is not None else DEFAULT_LLAMACPP_BACKEND
+    )

-    #
-
+    # Use ctx_size if provided, otherwise use default
+    ctx_size = ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE

     # Start the server
-
-
-
-
-
-
+    server = Server(
+        port=port,
+        log_level=log_level,
+        ctx_size=ctx_size,
+        tray=tray,
+        llamacpp_backend=llamacpp_backend,
+    )
     if not use_thread:
-        server.run(
-            port=port,
-            **serve_kwargs,
-        )
+        server.run()
     else:
         from threading import Thread
         import time
@@ -78,8 +93,6 @@ def serve(
         # Start a background thread to run the server
         server_thread = Thread(
             target=server.run,
-            args=(port,),
-            kwargs=serve_kwargs,
             daemon=True,
         )
         server_thread.start()
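Taken together, these hunks change how the server is started: configuration that used to be passed to server.run() (via the old serve_kwargs) is now bound in the Server constructor, and run() takes no arguments. A minimal sketch of the new flow, assuming only the constructor keywords visible in this hunk; the values are illustrative, not the package defaults:

    from threading import Thread

    from lemonade.tools.server.serve import Server

    # All configuration is bound at construction time in 8.1.1
    server = Server(
        port=8000,                  # illustrative value
        log_level="info",
        ctx_size=4096,              # new in 8.1.1; 4096 is the default cited for llamacpp
        tray=False,
        llamacpp_backend="vulkan",  # new in 8.1.1; the CLI offers "vulkan" or "rocm"
    )

    # Foreground: run() now blocks with no arguments
    # server.run()

    # Background, mirroring the CLI's use_thread path
    server_thread = Thread(target=server.run, daemon=True)
    server_thread.start()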
@@ -243,7 +256,13 @@ def delete(model_names: List[str]):
         ModelManager().delete_model(model_name)


-def run(
+def run(
+    model_name: str,
+    port: int = None,
+    log_level: str = None,
+    llamacpp_backend: str = None,
+    ctx_size: int = None,
+):
     """
     Start the server if not running and open the webapp with the specified model
     """
@@ -254,7 +273,16 @@ def run(model_name: str):
     _, port = get_server_info()
     server_previously_running = port is not None
     if not server_previously_running:
-        port, server_thread = serve(
+        port, server_thread = serve(
+            port=port,
+            log_level=log_level,
+            tray=True,
+            use_thread=True,
+            llamacpp_backend=llamacpp_backend,
+            ctx_size=ctx_size,
+        )
+    else:
+        port = running_port

     # Pull model
     pull([model_name])
@@ -412,6 +440,29 @@ def list_models():
     print(tabulate(table_data, headers=headers, tablefmt="simple"))


+def _add_server_arguments(parser):
+    """Add common server arguments to a parser"""
+    parser.add_argument("--port", type=int, help="Port number to serve on")
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        help="Log level for the server",
+        choices=["critical", "error", "warning", "info", "debug", "trace"],
+        default="info",
+    )
+    parser.add_argument(
+        "--llamacpp",
+        type=str,
+        help=f"LlamaCpp backend to use",
+        choices=["vulkan", "rocm"],
+    )
+    parser.add_argument(
+        "--ctx-size",
+        type=int,
+        help="Context size for the model (default: 4096 for llamacpp, truncates prompts for other recipes)",
+    )
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Serve LLMs on CPU, GPU, and NPU.",
@@ -430,14 +481,7 @@ def main():

     # Serve command
     serve_parser = subparsers.add_parser("serve", help="Start server")
-    serve_parser
-    serve_parser.add_argument(
-        "--log-level",
-        type=str,
-        help="Log level for the server",
-        choices=["critical", "error", "warning", "info", "debug", "trace"],
-        default="info",
-    )
+    _add_server_arguments(serve_parser)
     if os.name == "nt":
         serve_parser.add_argument(
             "--no-tray",
@@ -513,6 +557,7 @@ def main():
         "model",
         help="Lemonade Server model name to run",
     )
+    _add_server_arguments(run_parser)

     args = parser.parse_args()

@@ -535,6 +580,8 @@ def main():
             port=args.port,
             log_level=args.log_level,
             tray=not args.no_tray,
+            llamacpp_backend=args.llamacpp,
+            ctx_size=args.ctx_size,
         )
     elif args.command == "status":
         status()
@@ -553,7 +600,13 @@ def main():
     elif args.command == "stop":
         stop()
     elif args.command == "run":
-        run(
+        run(
+            args.model,
+            port=args.port,
+            log_level=args.log_level,
+            llamacpp_backend=args.llamacpp,
+            ctx_size=args.ctx_size,
+        )
     elif args.command == "help" or not args.command:
         parser.print_help()
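Because _add_server_arguments is attached to both the serve and run subparsers, the two commands now accept the same server options (--port, --log-level, --llamacpp, --ctx-size). Assuming the installed console script is named lemonade-server, invocations along the lines of "lemonade-server serve --llamacpp vulkan --ctx-size 8192" or "lemonade-server run Qwen-2.5-7B-Instruct-Hybrid --ctx-size 8192" would exercise the new flags.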
lemonade_server/model_manager.py
CHANGED
@@ -7,6 +7,7 @@ from importlib.metadata import distributions
 from lemonade_server.pydantic_models import PullConfig
 from lemonade.cache import DEFAULT_CACHE_DIR
 from lemonade.tools.llamacpp.utils import parse_checkpoint, download_gguf
+from lemonade.common.network import custom_snapshot_download

 USER_MODELS_FILE = os.path.join(DEFAULT_CACHE_DIR, "user_models.json")

@@ -175,7 +176,7 @@ class ModelManager:
         if "gguf" in checkpoint_to_download.lower():
             download_gguf(gguf_model_config.checkpoint, gguf_model_config.mmproj)
         else:
-
+            custom_snapshot_download(checkpoint_to_download)

         # Register the model in user_models.json, creating that file if needed
         # We do this registration after the download so that we don't register
@@ -233,8 +234,8 @@ class ModelManager:

         try:
             # Get the local path using snapshot_download with local_files_only=True
-            snapshot_path =
-
+            snapshot_path = custom_snapshot_download(
+                base_checkpoint, local_files_only=True
             )

             # Navigate up to the model directory (parent of snapshots directory)
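The body of custom_snapshot_download lives in lemonade/common/network.py (+18 -1 in this release) and is not shown here. Judging only from the two call sites above, it is used like huggingface_hub's snapshot_download; the sketch below is a hypothetical, call-compatible wrapper, offered purely as an assumption about its shape:

    # Hypothetical sketch: the real lemonade.common.network.custom_snapshot_download
    # is not part of this diff. The call sites use it as
    #   custom_snapshot_download(checkpoint_to_download)
    #   custom_snapshot_download(base_checkpoint, local_files_only=True)
    # so a thin wrapper over huggingface_hub.snapshot_download would be compatible.
    from huggingface_hub import snapshot_download


    def custom_snapshot_download(repo_id, **kwargs):
        # Returns the local snapshot path; the caller walks up from it to find
        # the model directory.
        return snapshot_download(repo_id=repo_id, **kwargs)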
lemonade_server/pydantic_models.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import Optional, Union, List
+from typing import Optional, Union, List

 from pydantic import BaseModel

@@ -18,9 +18,6 @@ class LoadConfig(BaseModel):
     model_name: str
     checkpoint: Optional[str] = None
     recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
     # Indicates whether the model is a reasoning model, like DeepSeek
     reasoning: Optional[bool] = False
     # Indicates which Multimodal Projector (mmproj) file to use
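Dropping max_prompt_length removes the per-model prompt cap from LoadConfig; per the new --ctx-size help text in cli.py, prompt-length handling now appears to hinge on the server-wide context size instead. A minimal sketch of the trimmed model, keeping only the fields visible in this hunk (the real class defines more, such as the mmproj field referenced by the trailing comment):

    from typing import Optional

    from pydantic import BaseModel


    class LoadConfig(BaseModel):
        model_name: str
        checkpoint: Optional[str] = None
        recipe: Optional[str] = None
        # Indicates whether the model is a reasoning model, like DeepSeek
        reasoning: Optional[bool] = False


    # Example: the shape of an entry from server_models.json
    config = LoadConfig(
        model_name="Qwen-2.5-7B-Instruct-NPU",
        checkpoint="amd/Qwen2.5-7B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
        recipe="oga-npu",
    )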
lemonade_server/server_models.json
CHANGED

@@ -39,19 +39,16 @@
     "Llama-3.2-1B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 3000,
         "suggested": true
     },
     "Llama-3.2-3B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Phi-3-Mini-Instruct-Hybrid": {
         "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Phi-3.5-Mini-Instruct-Hybrid": {
@@ -62,13 +59,26 @@
     "Qwen-1.5-7B-Chat-Hybrid": {
         "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "
+        "suggested": true
+    },
+    "Qwen-2.5-7B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-7B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
+        "suggested": true
+    },
+    "Qwen-2.5-3B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-3B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
+        "suggested": true
+    },
+    "Qwen-2.5-1.5B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-1.5B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
         "suggested": true
     },
     "DeepSeek-R1-Distill-Llama-8B-Hybrid": {
         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true,
         "labels": ["reasoning"]
     },
@@ -76,25 +86,32 @@
         "checkpoint": "amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
         "max_prompt_length": 2000,
-        "suggested":
+        "suggested": false,
         "labels": ["reasoning"]
     },
     "Mistral-7B-v0.3-Instruct-Hybrid": {
         "checkpoint": "amd/Mistral-7B-Instruct-v0.3-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Llama-3.1-8B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.1-8B-Instruct-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Llama-xLAM-2-8b-fc-r-Hybrid": {
         "checkpoint": "amd/Llama-xLAM-2-8b-fc-r-awq-g128-int4-asym-bfp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "
+        "suggested": true
+    },
+    "Qwen-2.5-7B-Instruct-NPU": {
+        "checkpoint": "amd/Qwen2.5-7B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+        "recipe": "oga-npu",
+        "suggested": true
+    },
+    "Qwen-2.5-1.5B-Instruct-NPU": {
+        "checkpoint": "amd/Qwen2.5-1.5B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+        "recipe": "oga-npu",
         "suggested": true
     },
     "Llama-3.2-1B-Instruct-DirectML": {
@@ -169,6 +186,18 @@
         "suggested": true,
         "labels": ["reasoning"]
     },
+    "Qwen3-30B-A3B-Instruct-2507-GGUF": {
+        "checkpoint": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF:Qwen3-30B-A3B-Instruct-2507-Q4_0.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot"]
+    },
+    "Qwen3-Coder-30B-A3B-Instruct-GGUF": {
+        "checkpoint": "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["coding","hot"]
+    },
     "Gemma-3-4b-it-GGUF": {
         "checkpoint": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
         "mmproj": "mmproj-model-f16.gguf",
@@ -190,6 +219,13 @@
         "suggested": true,
         "labels": ["vision"]
     },
+    "Cogito-v2-llama-109B-MoE-GGUF": {
+        "checkpoint": "unsloth/cogito-v2-preview-llama-109B-MoE-GGUF:Q4_K_M",
+        "mmproj": "mmproj-F16.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["vision","hot"]
+    },
     "nomic-embed-text-v1-GGUF": {
         "checkpoint": "nomic-ai/nomic-embed-text-v1-GGUF:Q4_K_S",
         "recipe": "llamacpp",
@@ -217,12 +253,25 @@
     "Devstral-Small-2507-GGUF":{
         "checkpoint": "mistralai/Devstral-Small-2507_gguf:Q4_K_M",
         "recipe": "llamacpp",
-        "suggested": true
+        "suggested": true,
+        "labels": ["coding"]
     },
     "Qwen2.5-Coder-32B-Instruct-GGUF": {
         "checkpoint": "Qwen/Qwen2.5-Coder-32B-Instruct-GGUF:Q4_K_M",
         "recipe": "llamacpp",
         "suggested": true,
-        "labels": ["reasoning"]
+        "labels": ["reasoning", "coding"]
+    },
+    "gpt-oss-120b-GGUF": {
+        "checkpoint": "unsloth/gpt-oss-120b-GGUF:Q4_K_M",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot", "reasoning"]
+    },
+    "gpt-oss-20b-GGUF": {
+        "checkpoint": "unsloth/gpt-oss-20b-GGUF:Q4_K_M",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot", "reasoning"]
     }
 }
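All of the new GGUF entries use the repo:variant checkpoint convention (for example unsloth/gpt-oss-20b-GGUF:Q4_K_M), which model_manager.py routes through parse_checkpoint and download_gguf. The real parser lives in lemonade/tools/llamacpp/utils.py and is not shown in this diff; the sketch below only illustrates the naming convention and is an assumption, not the library's implementation:

    # Hypothetical helper illustrating the "repo:variant" convention used above;
    # the actual parse_checkpoint in lemonade.tools.llamacpp.utils may differ.
    def split_gguf_checkpoint(checkpoint):
        # "unsloth/gpt-oss-20b-GGUF:Q4_K_M" -> ("unsloth/gpt-oss-20b-GGUF", "Q4_K_M")
        repo_id, _, variant = checkpoint.partition(":")
        return repo_id, variant or None


    print(split_gguf_checkpoint("unsloth/gpt-oss-20b-GGUF:Q4_K_M"))
    # ('unsloth/gpt-oss-20b-GGUF', 'Q4_K_M')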