lemonade-sdk 8.0.6-py3-none-any.whl → 8.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk has been flagged as potentially problematic.
lemonade_server/cli.py CHANGED
@@ -39,11 +39,19 @@ class ModelNotAvailableError(Exception):
     """
 
 
+class ModelLoadError(Exception):
+    """
+    The model failed to load on the server
+    """
+
+
 def serve(
     port: int = None,
     log_level: str = None,
     tray: bool = False,
     use_thread: bool = False,
+    llamacpp_backend: str = None,
+    ctx_size: int = None,
 ):
     """
     Execute the serve command
@@ -51,26 +59,33 @@ def serve(
 
     # Otherwise, start the server
     print("Starting Lemonade Server...")
-    from lemonade.tools.server.serve import Server, DEFAULT_PORT, DEFAULT_LOG_LEVEL
+    from lemonade.tools.server.serve import (
+        Server,
+        DEFAULT_PORT,
+        DEFAULT_LOG_LEVEL,
+        DEFAULT_LLAMACPP_BACKEND,
+        DEFAULT_CTX_SIZE,
+    )
 
     port = port if port is not None else DEFAULT_PORT
     log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
+    llamacpp_backend = (
+        llamacpp_backend if llamacpp_backend is not None else DEFAULT_LLAMACPP_BACKEND
+    )
 
-    # Hidden environment variable to enable input truncation (experimental feature)
-    truncate_inputs = "LEMONADE_TRUNCATE_INPUTS" in os.environ
+    # Use ctx_size if provided, otherwise use default
+    ctx_size = ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE
 
     # Start the server
-    serve_kwargs = {
-        "log_level": log_level,
-        "truncate_inputs": truncate_inputs,
-        "tray": tray,
-    }
-    server = Server()
+    server = Server(
+        port=port,
+        log_level=log_level,
+        ctx_size=ctx_size,
+        tray=tray,
+        llamacpp_backend=llamacpp_backend,
+    )
     if not use_thread:
-        server.run(
-            port=port,
-            **serve_kwargs,
-        )
+        server.run()
     else:
         from threading import Thread
         import time
@@ -78,8 +93,6 @@ def serve(
         # Start a background thread to run the server
         server_thread = Thread(
             target=server.run,
-            args=(port,),
-            kwargs=serve_kwargs,
             daemon=True,
         )
         server_thread.start()
@@ -243,7 +256,13 @@ def delete(model_names: List[str]):
         ModelManager().delete_model(model_name)
 
 
-def run(model_name: str):
+def run(
+    model_name: str,
+    port: int = None,
+    log_level: str = None,
+    llamacpp_backend: str = None,
+    ctx_size: int = None,
+):
     """
     Start the server if not running and open the webapp with the specified model
     """
@@ -254,7 +273,16 @@ def run(model_name: str):
     _, port = get_server_info()
     server_previously_running = port is not None
     if not server_previously_running:
-        port, server_thread = serve(use_thread=True, tray=True, log_level="info")
+        port, server_thread = serve(
+            port=port,
+            log_level=log_level,
+            tray=True,
+            use_thread=True,
+            llamacpp_backend=llamacpp_backend,
+            ctx_size=ctx_size,
+        )
+    else:
+        port = running_port
 
     # Pull model
     pull([model_name])
@@ -412,6 +440,29 @@ def list_models():
     print(tabulate(table_data, headers=headers, tablefmt="simple"))
 
 
+def _add_server_arguments(parser):
+    """Add common server arguments to a parser"""
+    parser.add_argument("--port", type=int, help="Port number to serve on")
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        help="Log level for the server",
+        choices=["critical", "error", "warning", "info", "debug", "trace"],
+        default="info",
+    )
+    parser.add_argument(
+        "--llamacpp",
+        type=str,
+        help=f"LlamaCpp backend to use",
+        choices=["vulkan", "rocm"],
+    )
+    parser.add_argument(
+        "--ctx-size",
+        type=int,
+        help="Context size for the model (default: 4096 for llamacpp, truncates prompts for other recipes)",
+    )
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Serve LLMs on CPU, GPU, and NPU.",
@@ -430,14 +481,7 @@ def main():
 
     # Serve command
     serve_parser = subparsers.add_parser("serve", help="Start server")
-    serve_parser.add_argument("--port", type=int, help="Port number to serve on")
-    serve_parser.add_argument(
-        "--log-level",
-        type=str,
-        help="Log level for the server",
-        choices=["critical", "error", "warning", "info", "debug", "trace"],
-        default="info",
-    )
+    _add_server_arguments(serve_parser)
     if os.name == "nt":
         serve_parser.add_argument(
             "--no-tray",
@@ -513,6 +557,7 @@ def main():
         "model",
         help="Lemonade Server model name to run",
     )
+    _add_server_arguments(run_parser)
 
     args = parser.parse_args()
 
@@ -535,6 +580,8 @@ def main():
             port=args.port,
             log_level=args.log_level,
            tray=not args.no_tray,
+            llamacpp_backend=args.llamacpp,
+            ctx_size=args.ctx_size,
         )
     elif args.command == "status":
         status()
@@ -553,7 +600,13 @@ def main():
     elif args.command == "stop":
         stop()
     elif args.command == "run":
-        run(args.model)
+        run(
+            args.model,
+            port=args.port,
+            log_level=args.log_level,
+            llamacpp_backend=args.llamacpp,
+            ctx_size=args.ctx_size,
+        )
     elif args.command == "help" or not args.command:
         parser.print_help()
 
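Taken together, the cli.py changes add two shared server options, --llamacpp and --ctx-size, wired into both the serve and run subcommands via _add_server_arguments, and move configuration from Server.run() keyword arguments into the Server constructor. A minimal sketch of the resulting call path, assuming the imports shown in the diff; the wrapper function name is ours, not the package's:

# Illustrative sketch only: mirrors the post-change flow in lemonade_server/cli.py.
from lemonade.tools.server.serve import (
    Server,
    DEFAULT_PORT,
    DEFAULT_LOG_LEVEL,
    DEFAULT_LLAMACPP_BACKEND,
    DEFAULT_CTX_SIZE,
)

def start(port=None, log_level=None, llamacpp_backend=None, ctx_size=None, tray=False):
    # CLI flags fall back to the library defaults when not supplied.
    server = Server(
        port=port if port is not None else DEFAULT_PORT,
        log_level=log_level if log_level is not None else DEFAULT_LOG_LEVEL,
        ctx_size=ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE,
        tray=tray,
        llamacpp_backend=(
            llamacpp_backend
            if llamacpp_backend is not None
            else DEFAULT_LLAMACPP_BACKEND
        ),
    )
    server.run()  # run() no longer takes port/log_level; they live on the Server instance

On the command line this presumably corresponds to something like "lemonade-server serve --llamacpp vulkan --ctx-size 8192" (entry-point name assumed), with the same flags now also accepted by the run subcommand.
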
@@ -7,6 +7,7 @@ from importlib.metadata import distributions
 from lemonade_server.pydantic_models import PullConfig
 from lemonade.cache import DEFAULT_CACHE_DIR
 from lemonade.tools.llamacpp.utils import parse_checkpoint, download_gguf
+from lemonade.common.network import custom_snapshot_download
 
 USER_MODELS_FILE = os.path.join(DEFAULT_CACHE_DIR, "user_models.json")
 
@@ -175,7 +176,7 @@ class ModelManager:
         if "gguf" in checkpoint_to_download.lower():
             download_gguf(gguf_model_config.checkpoint, gguf_model_config.mmproj)
         else:
-            huggingface_hub.snapshot_download(repo_id=checkpoint_to_download)
+            custom_snapshot_download(checkpoint_to_download)
 
         # Register the model in user_models.json, creating that file if needed
         # We do this registration after the download so that we don't register
@@ -233,8 +234,8 @@ class ModelManager:
 
         try:
             # Get the local path using snapshot_download with local_files_only=True
-            snapshot_path = huggingface_hub.snapshot_download(
-                repo_id=base_checkpoint, local_files_only=True
+            snapshot_path = custom_snapshot_download(
+                base_checkpoint, local_files_only=True
             )
 
             # Navigate up to the model directory (parent of snapshots directory)
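The ModelManager hunks above swap direct huggingface_hub.snapshot_download calls for custom_snapshot_download, newly imported from lemonade.common.network. Only the call sites appear in the diff, not the helper itself; assuming it is a thin wrapper over the Hugging Face Hub API with the same return value, it might look roughly like this sketch (not the package's actual implementation):

# Hypothetical sketch of lemonade.common.network.custom_snapshot_download.
# Any extra behavior the real helper adds (custom endpoints, auth, retries)
# is not visible in the diff and is not shown here.
import huggingface_hub

def custom_snapshot_download(checkpoint: str, local_files_only: bool = False) -> str:
    # Treat the checkpoint string as the Hugging Face repo id and return the
    # local snapshot path, matching how the ModelManager code uses the result.
    return huggingface_hub.snapshot_download(
        repo_id=checkpoint,
        local_files_only=local_files_only,
    )
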
@@ -1,4 +1,4 @@
-from typing import Optional, Union, List, Any
+from typing import Optional, Union, List
 
 from pydantic import BaseModel
 
@@ -18,9 +18,6 @@ class LoadConfig(BaseModel):
     model_name: str
     checkpoint: Optional[str] = None
     recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
     # Indicates whether the model is a reasoning model, like DeepSeek
     reasoning: Optional[bool] = False
     # Indicates which Multimodal Projector (mmproj) file to use
@@ -39,19 +39,16 @@
     "Llama-3.2-1B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 3000,
         "suggested": true
     },
     "Llama-3.2-3B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Phi-3-Mini-Instruct-Hybrid": {
         "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Phi-3.5-Mini-Instruct-Hybrid": {
@@ -62,13 +59,26 @@
     "Qwen-1.5-7B-Chat-Hybrid": {
         "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 3000,
+        "suggested": true
+    },
+    "Qwen-2.5-7B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-7B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
+        "suggested": true
+    },
+    "Qwen-2.5-3B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-3B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
+        "suggested": true
+    },
+    "Qwen-2.5-1.5B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-1.5B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
         "suggested": true
     },
     "DeepSeek-R1-Distill-Llama-8B-Hybrid": {
         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true,
         "labels": ["reasoning"]
     },
@@ -76,25 +86,32 @@
         "checkpoint": "amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
         "max_prompt_length": 2000,
-        "suggested": true,
+        "suggested": false,
         "labels": ["reasoning"]
     },
     "Mistral-7B-v0.3-Instruct-Hybrid": {
         "checkpoint": "amd/Mistral-7B-Instruct-v0.3-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Llama-3.1-8B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.1-8B-Instruct-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Llama-xLAM-2-8b-fc-r-Hybrid": {
         "checkpoint": "amd/Llama-xLAM-2-8b-fc-r-awq-g128-int4-asym-bfp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
+        "suggested": true
+    },
+    "Qwen-2.5-7B-Instruct-NPU": {
+        "checkpoint": "amd/Qwen2.5-7B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+        "recipe": "oga-npu",
+        "suggested": true
+    },
+    "Qwen-2.5-1.5B-Instruct-NPU": {
+        "checkpoint": "amd/Qwen2.5-1.5B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+        "recipe": "oga-npu",
         "suggested": true
     },
     "Llama-3.2-1B-Instruct-DirectML": {
@@ -169,6 +186,18 @@
         "suggested": true,
         "labels": ["reasoning"]
     },
+    "Qwen3-30B-A3B-Instruct-2507-GGUF": {
+        "checkpoint": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF:Qwen3-30B-A3B-Instruct-2507-Q4_0.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot"]
+    },
+    "Qwen3-Coder-30B-A3B-Instruct-GGUF": {
+        "checkpoint": "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["coding","hot"]
+    },
     "Gemma-3-4b-it-GGUF": {
         "checkpoint": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
         "mmproj": "mmproj-model-f16.gguf",
@@ -190,6 +219,13 @@
         "suggested": true,
         "labels": ["vision"]
     },
+    "Cogito-v2-llama-109B-MoE-GGUF": {
+        "checkpoint": "unsloth/cogito-v2-preview-llama-109B-MoE-GGUF:Q4_K_M",
+        "mmproj": "mmproj-F16.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["vision","hot"]
+    },
     "nomic-embed-text-v1-GGUF": {
         "checkpoint": "nomic-ai/nomic-embed-text-v1-GGUF:Q4_K_S",
         "recipe": "llamacpp",
@@ -217,12 +253,25 @@
     "Devstral-Small-2507-GGUF":{
         "checkpoint": "mistralai/Devstral-Small-2507_gguf:Q4_K_M",
         "recipe": "llamacpp",
-        "suggested": true
+        "suggested": true,
+        "labels": ["coding"]
     },
     "Qwen2.5-Coder-32B-Instruct-GGUF": {
         "checkpoint": "Qwen/Qwen2.5-Coder-32B-Instruct-GGUF:Q4_K_M",
         "recipe": "llamacpp",
         "suggested": true,
-        "labels": ["reasoning"]
+        "labels": ["reasoning", "coding"]
+    },
+    "gpt-oss-120b-GGUF": {
+        "checkpoint": "unsloth/gpt-oss-120b-GGUF:Q4_K_M",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot", "reasoning"]
+    },
+    "gpt-oss-20b-GGUF": {
+        "checkpoint": "unsloth/gpt-oss-20b-GGUF:Q4_K_M",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot", "reasoning"]
     }
 }
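With max_prompt_length removed from LoadConfig and from most registry entries, per-model prompt caps are no longer part of the model metadata; prompt length is instead governed by the new server-wide --ctx-size option. A small sketch of how a registry entry maps onto the trimmed LoadConfig, with field values copied from the diff above and the extra-field behavior left to the model's pydantic configuration:

# Sketch: validating a registry entry against the trimmed LoadConfig.
from lemonade_server.pydantic_models import LoadConfig

cfg = LoadConfig(
    model_name="Qwen3-Coder-30B-A3B-Instruct-GGUF",
    checkpoint="unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf",
    recipe="llamacpp",
)
# max_prompt_length is no longer a declared field, so a leftover value in an
# entry would be ignored or rejected depending on the model's extra-field
# policy; context limits now come from the server's ctx_size setting.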