lemonade-sdk 8.0.4__py3-none-any.whl → 8.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic. Click here for more details.
- lemonade/api.py +50 -0
- lemonade/cache.py +3 -1
- lemonade/common/inference_engines.py +415 -0
- lemonade/common/system_info.py +493 -47
- lemonade/tools/adapter.py +6 -0
- lemonade/tools/huggingface/utils.py +6 -5
- lemonade/tools/llamacpp/bench.py +26 -46
- lemonade/tools/llamacpp/load.py +104 -196
- lemonade/tools/llamacpp/utils.py +612 -0
- lemonade/tools/management_tools.py +53 -7
- lemonade/tools/oga/bench.py +5 -6
- lemonade/tools/oga/utils.py +8 -2
- lemonade/tools/prompt.py +17 -25
- lemonade/tools/report/table.py +12 -9
- lemonade/tools/server/llamacpp.py +80 -92
- lemonade/tools/server/serve.py +32 -0
- lemonade/tools/server/static/styles.css +137 -58
- lemonade/tools/server/static/webapp.html +34 -8
- lemonade/tools/server/tray.py +7 -0
- lemonade/version.py +1 -1
- lemonade_sdk-8.0.6.dist-info/METADATA +295 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/RECORD +30 -28
- lemonade_server/cli.py +168 -22
- lemonade_server/model_manager.py +4 -148
- lemonade_server/server_models.json +11 -0
- lemonade_sdk-8.0.4.dist-info/METADATA +0 -176
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/top_level.txt +0 -0
lemonade_server/cli.py
CHANGED
|
@@ -27,43 +27,75 @@ class DeleteError(Exception):
|
|
|
27
27
|
"""
|
|
28
28
|
|
|
29
29
|
|
|
30
|
+
class ServerTimeoutError(Exception):
    """
    The server failed to start within the timeout period
    """

    # NOTE(review): nothing in the hunks shown for this module raises this
    # exception yet - confirm whether serve() is meant to raise it when its
    # startup wait expires.
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ModelNotAvailableError(Exception):
    """
    The specified model is not available on the server
    """

    # NOTE(review): no code in the hunks shown for this module raises this
    # exception - confirm where it is used.
|
|
40
|
+
|
|
41
|
+
|
|
30
42
|
def serve(
|
|
31
|
-
port: int,
|
|
43
|
+
port: int = None,
|
|
32
44
|
log_level: str = None,
|
|
33
45
|
tray: bool = False,
|
|
46
|
+
use_thread: bool = False,
|
|
34
47
|
):
|
|
35
48
|
"""
|
|
36
49
|
Execute the serve command
|
|
37
50
|
"""
|
|
38
51
|
|
|
39
|
-
# Check if Lemonade Server is already running
|
|
40
|
-
_, running_port = get_server_info()
|
|
41
|
-
if running_port is not None:
|
|
42
|
-
print(
|
|
43
|
-
(
|
|
44
|
-
f"Lemonade Server is already running on port {running_port}\n"
|
|
45
|
-
"Please stop the existing server before starting a new instance."
|
|
46
|
-
),
|
|
47
|
-
)
|
|
48
|
-
sys.exit(ExitCodes.SERVER_ALREADY_RUNNING)
|
|
49
|
-
|
|
50
52
|
# Otherwise, start the server
|
|
51
53
|
print("Starting Lemonade Server...")
|
|
52
54
|
from lemonade.tools.server.serve import Server, DEFAULT_PORT, DEFAULT_LOG_LEVEL
|
|
53
55
|
|
|
54
|
-
server = Server()
|
|
55
56
|
port = port if port is not None else DEFAULT_PORT
|
|
56
57
|
log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
|
|
57
58
|
|
|
58
59
|
# Hidden environment variable to enable input truncation (experimental feature)
|
|
59
60
|
truncate_inputs = "LEMONADE_TRUNCATE_INPUTS" in os.environ
|
|
60
61
|
|
|
61
|
-
server.run(
|
|
62
|
-
port=port,
|
|
63
|
-
log_level=log_level,
|
|
64
|
-
truncate_inputs=truncate_inputs,
|
|
65
|
-
tray=tray,
|
|
66
|
-
)
|
|
62
|
+
# Start the server
|
|
63
|
+
serve_kwargs = {
|
|
64
|
+
"log_level": log_level,
|
|
65
|
+
"truncate_inputs": truncate_inputs,
|
|
66
|
+
"tray": tray,
|
|
67
|
+
}
|
|
68
|
+
server = Server()
|
|
69
|
+
if not use_thread:
|
|
70
|
+
server.run(
|
|
71
|
+
port=port,
|
|
72
|
+
**serve_kwargs,
|
|
73
|
+
)
|
|
74
|
+
else:
|
|
75
|
+
from threading import Thread
|
|
76
|
+
import time
|
|
77
|
+
|
|
78
|
+
# Start a background thread to run the server
|
|
79
|
+
server_thread = Thread(
|
|
80
|
+
target=server.run,
|
|
81
|
+
args=(port,),
|
|
82
|
+
kwargs=serve_kwargs,
|
|
83
|
+
daemon=True,
|
|
84
|
+
)
|
|
85
|
+
server_thread.start()
|
|
86
|
+
|
|
87
|
+
# Wait for the server to be ready
|
|
88
|
+
max_wait_time = 30
|
|
89
|
+
wait_interval = 0.5
|
|
90
|
+
waited = 0
|
|
91
|
+
while waited < max_wait_time:
|
|
92
|
+
time.sleep(wait_interval)
|
|
93
|
+
_, running_port = get_server_info()
|
|
94
|
+
if running_port is not None:
|
|
95
|
+
break
|
|
96
|
+
waited += wait_interval
|
|
97
|
+
|
|
98
|
+
return port, server_thread
|
|
67
99
|
|
|
68
100
|
|
|
69
101
|
def stop():
|
|
@@ -161,9 +193,8 @@ def pull(
|
|
|
161
193
|
if pull_response.status_code != 200:
|
|
162
194
|
raise PullError(
|
|
163
195
|
f"Failed to install {model_name}. Check the "
|
|
164
|
-
"Lemonade Server log for more information.
|
|
165
|
-
"
|
|
166
|
-
"https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
|
|
196
|
+
"Lemonade Server log for more information. You can list "
|
|
197
|
+
"supported models with `lemonade-server list`"
|
|
167
198
|
)
|
|
168
199
|
else:
|
|
169
200
|
from lemonade_server.model_manager import ModelManager
|
|
@@ -212,6 +243,53 @@ def delete(model_names: List[str]):
|
|
|
212
243
|
ModelManager().delete_model(model_name)
|
|
213
244
|
|
|
214
245
|
|
|
246
|
+
def run(model_name: str):
    """
    Start the server if not running and open the webapp with the specified model

    The model is pulled and loaded before the browser is opened. If this call
    had to start the server itself, it then blocks until that server thread
    exits; if a server was already running it returns right after opening the
    browser.
    """
    import time
    import webbrowser
    from urllib.parse import quote

    # Start the server if not running
    _, port = get_server_info()
    server_previously_running = port is not None
    if not server_previously_running:
        port, server_thread = serve(use_thread=True, tray=True, log_level="info")

    # Pull model
    pull([model_name])

    # Load model
    load(model_name, port)

    # Open the webapp with the specified model. The name is percent-encoded so
    # characters such as "&", "#" or spaces cannot corrupt the query string;
    # names made only of letters, digits and "_.-~" are left unchanged.
    encoded_name = quote(model_name, safe="")
    url = f"http://localhost:{port}/?model={encoded_name}#llm-chat"
    print(f"You can now chat with {model_name} at {url}")
    webbrowser.open(url)

    # Keep the server running if we started it: serve(use_thread=True) runs the
    # server on a daemon thread, so the process must stay alive for it.
    if not server_previously_running:
        while server_thread.is_alive():
            time.sleep(0.5)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def load(model_name: str, port: int):
    """
    Load a model using the endpoint

    Sends a POST with the JSON body {"model_name": model_name} to
    http://localhost:<port>/api/v1/load and raises ModelLoadError when the
    response status code is anything other than 200.
    """
    import requests

    base_url = f"http://localhost:{port}/api/v1"

    # Load the model
    load_response = requests.post(f"{base_url}/load", json={"model_name": model_name})
    if load_response.status_code != 200:
        # NOTE(review): ModelLoadError is not defined in any hunk shown for this
        # module - confirm it exists in the unchanged part of the file.
        raise ModelLoadError(
            f"Failed to load {model_name}. Check the "
            "Lemonade Server log for more information."
        )
|
|
291
|
+
|
|
292
|
+
|
|
215
293
|
def version():
|
|
216
294
|
"""
|
|
217
295
|
Print the version number
|
|
@@ -294,6 +372,46 @@ def get_server_info() -> Tuple[int | None, int | None]:
|
|
|
294
372
|
return None, None
|
|
295
373
|
|
|
296
374
|
|
|
375
|
+
def list_models():
    """
    List recommended models and their download status

    Prints a three-column table (model name, downloaded yes/no, labels) for
    every supported model whose "suggested" entry is truthy. Downloaded models
    are listed first; within each group rows are ordered by case-insensitive
    model name.
    """
    from tabulate import tabulate
    from lemonade_server.model_manager import ModelManager

    model_manager = ModelManager()

    # Get all supported models and downloaded models
    supported_models = model_manager.supported_models
    downloaded_models = model_manager.downloaded_models

    # Build one table row per recommended ("suggested") model
    table_data = []
    for model_name, model_info in supported_models.items():
        if not model_info.get("suggested", False):
            continue

        downloaded_status = "Yes" if model_name in downloaded_models else "No"

        # A model without labels is shown with a "-" placeholder
        labels = model_info.get("labels", [])
        model_type = ", ".join(labels) if labels else "-"

        table_data.append([model_name, downloaded_status, model_type])

    # Downloaded models first (False sorts before True), then by model name
    table_data.sort(key=lambda row: (row[1] == "No", row[0].lower()))

    # Display table
    headers = ["Model Name", "Downloaded", "Details"]
    print(tabulate(table_data, headers=headers, tablefmt="simple"))
|
|
413
|
+
|
|
414
|
+
|
|
297
415
|
def main():
|
|
298
416
|
parser = argparse.ArgumentParser(
|
|
299
417
|
description="Serve LLMs on CPU, GPU, and NPU.",
|
|
@@ -333,6 +451,11 @@ def main():
|
|
|
333
451
|
# Stop command
|
|
334
452
|
stop_parser = subparsers.add_parser("stop", help="Stop the server")
|
|
335
453
|
|
|
454
|
+
# List command
|
|
455
|
+
list_parser = subparsers.add_parser(
|
|
456
|
+
"list", help="List recommended models and their download status"
|
|
457
|
+
)
|
|
458
|
+
|
|
336
459
|
# Pull command
|
|
337
460
|
pull_parser = subparsers.add_parser(
|
|
338
461
|
"pull",
|
|
@@ -381,6 +504,16 @@ def main():
|
|
|
381
504
|
nargs="+",
|
|
382
505
|
)
|
|
383
506
|
|
|
507
|
+
# Run command
|
|
508
|
+
run_parser = subparsers.add_parser(
|
|
509
|
+
"run",
|
|
510
|
+
help="Chat with specified model (starts server if needed)",
|
|
511
|
+
)
|
|
512
|
+
run_parser.add_argument(
|
|
513
|
+
"model",
|
|
514
|
+
help="Lemonade Server model name to run",
|
|
515
|
+
)
|
|
516
|
+
|
|
384
517
|
args = parser.parse_args()
|
|
385
518
|
|
|
386
519
|
if os.name != "nt":
|
|
@@ -389,6 +522,15 @@ def main():
|
|
|
389
522
|
if args.version:
|
|
390
523
|
version()
|
|
391
524
|
elif args.command == "serve":
|
|
525
|
+
_, running_port = get_server_info()
|
|
526
|
+
if running_port is not None:
|
|
527
|
+
print(
|
|
528
|
+
(
|
|
529
|
+
f"Lemonade Server is already running on port {running_port}\n"
|
|
530
|
+
"Please stop the existing server before starting a new instance."
|
|
531
|
+
),
|
|
532
|
+
)
|
|
533
|
+
sys.exit(ExitCodes.SERVER_ALREADY_RUNNING)
|
|
392
534
|
serve(
|
|
393
535
|
port=args.port,
|
|
394
536
|
log_level=args.log_level,
|
|
@@ -396,6 +538,8 @@ def main():
|
|
|
396
538
|
)
|
|
397
539
|
elif args.command == "status":
|
|
398
540
|
status()
|
|
541
|
+
elif args.command == "list":
|
|
542
|
+
list_models()
|
|
399
543
|
elif args.command == "pull":
|
|
400
544
|
pull(
|
|
401
545
|
args.model,
|
|
@@ -408,6 +552,8 @@ def main():
|
|
|
408
552
|
delete(args.model)
|
|
409
553
|
elif args.command == "stop":
|
|
410
554
|
stop()
|
|
555
|
+
elif args.command == "run":
|
|
556
|
+
run(args.model)
|
|
411
557
|
elif args.command == "help" or not args.command:
|
|
412
558
|
parser.print_help()
|
|
413
559
|
|
lemonade_server/model_manager.py
CHANGED
|
@@ -6,31 +6,13 @@ import huggingface_hub
|
|
|
6
6
|
from importlib.metadata import distributions
|
|
7
7
|
from lemonade_server.pydantic_models import PullConfig
|
|
8
8
|
from lemonade.cache import DEFAULT_CACHE_DIR
|
|
9
|
+
from lemonade.tools.llamacpp.utils import parse_checkpoint, download_gguf
|
|
9
10
|
|
|
10
11
|
USER_MODELS_FILE = os.path.join(DEFAULT_CACHE_DIR, "user_models.json")
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class ModelManager:
|
|
14
15
|
|
|
15
|
-
@staticmethod
|
|
16
|
-
def parse_checkpoint(checkpoint: str) -> tuple[str, str | None]:
|
|
17
|
-
"""
|
|
18
|
-
Parse a checkpoint string that may contain a variant separated by a colon.
|
|
19
|
-
|
|
20
|
-
For GGUF models, the format is "repository:variant" (e.g., "unsloth/Qwen3-0.6B-GGUF:Q4_0").
|
|
21
|
-
For other models, there is no variant.
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
checkpoint: The checkpoint string, potentially with variant
|
|
25
|
-
|
|
26
|
-
Returns:
|
|
27
|
-
tuple: (base_checkpoint, variant) where variant is None if no colon is present
|
|
28
|
-
"""
|
|
29
|
-
if ":" in checkpoint:
|
|
30
|
-
base_checkpoint, variant = checkpoint.split(":", 1)
|
|
31
|
-
return base_checkpoint, variant
|
|
32
|
-
return checkpoint, None
|
|
33
|
-
|
|
34
16
|
@property
|
|
35
17
|
def supported_models(self) -> dict:
|
|
36
18
|
"""
|
|
@@ -98,7 +80,7 @@ class ModelManager:
|
|
|
98
80
|
downloaded_models = {}
|
|
99
81
|
downloaded_checkpoints = self.downloaded_hf_checkpoints
|
|
100
82
|
for model in self.supported_models:
|
|
101
|
-
base_checkpoint = self.parse_checkpoint(
|
|
83
|
+
base_checkpoint = parse_checkpoint(
|
|
102
84
|
self.supported_models[model]["checkpoint"]
|
|
103
85
|
)[0]
|
|
104
86
|
if base_checkpoint in downloaded_checkpoints:
|
|
@@ -113,132 +95,6 @@ class ModelManager:
|
|
|
113
95
|
"""
|
|
114
96
|
return self.filter_models_by_backend(self.downloaded_models)
|
|
115
97
|
|
|
116
|
-
def identify_gguf_models(
|
|
117
|
-
self, checkpoint: str, variant: str, mmproj: str
|
|
118
|
-
) -> tuple[dict, list[str]]:
|
|
119
|
-
"""
|
|
120
|
-
Identifies the GGUF model files in the repository that match the variant.
|
|
121
|
-
"""
|
|
122
|
-
|
|
123
|
-
hint = """
|
|
124
|
-
The CHECKPOINT:VARIANT scheme is used to specify model files in Hugging Face repositories.
|
|
125
|
-
|
|
126
|
-
The VARIANT format can be one of several types:
|
|
127
|
-
1. Full filename: exact file to download
|
|
128
|
-
2. None/empty: gets the first .gguf file in the repository (excludes mmproj files)
|
|
129
|
-
3. Quantization variant: find a single file ending with the variant name (case insensitive)
|
|
130
|
-
4. Folder name: downloads all .gguf files in the folder that matches the variant name (case insensitive)
|
|
131
|
-
|
|
132
|
-
Examples:
|
|
133
|
-
- "unsloth/Qwen3-8B-GGUF:qwen3.gguf" -> downloads "qwen3.gguf"
|
|
134
|
-
- "unsloth/Qwen3-30B-A3B-GGUF" -> downloads "Qwen3-30B-A3B-GGUF.gguf"
|
|
135
|
-
- "unsloth/Qwen3-8B-GGUF:Q4_1" -> downloads "Qwen3-8B-GGUF-Q4_1.gguf"
|
|
136
|
-
- "unsloth/Qwen3-30B-A3B-GGUF:Q4_0" -> downloads all files in "Q4_0/" folder
|
|
137
|
-
"""
|
|
138
|
-
|
|
139
|
-
repo_files = huggingface_hub.list_repo_files(checkpoint)
|
|
140
|
-
sharded_files = []
|
|
141
|
-
|
|
142
|
-
# (case 1) If variant ends in .gguf, use it directly
|
|
143
|
-
if variant and variant.endswith(".gguf"):
|
|
144
|
-
variant_name = variant
|
|
145
|
-
if variant_name not in repo_files:
|
|
146
|
-
raise ValueError(
|
|
147
|
-
f"File {variant} not found in Hugging Face repository {checkpoint}. {hint}"
|
|
148
|
-
)
|
|
149
|
-
# (case 2) If no variant is provided, get the first .gguf file in the repository
|
|
150
|
-
elif variant is None:
|
|
151
|
-
all_variants = [
|
|
152
|
-
f for f in repo_files if f.endswith(".gguf") and "mmproj" not in f
|
|
153
|
-
]
|
|
154
|
-
if len(all_variants) == 0:
|
|
155
|
-
raise ValueError(
|
|
156
|
-
f"No .gguf files found in Hugging Face repository {checkpoint}. {hint}"
|
|
157
|
-
)
|
|
158
|
-
variant_name = all_variants[0]
|
|
159
|
-
else:
|
|
160
|
-
# (case 3) Find a single file ending with the variant name (case insensitive)
|
|
161
|
-
end_with_variant = [
|
|
162
|
-
f
|
|
163
|
-
for f in repo_files
|
|
164
|
-
if f.lower().endswith(f"{variant}.gguf".lower())
|
|
165
|
-
and "mmproj" not in f.lower()
|
|
166
|
-
]
|
|
167
|
-
if len(end_with_variant) == 1:
|
|
168
|
-
variant_name = end_with_variant[0]
|
|
169
|
-
elif len(end_with_variant) > 1:
|
|
170
|
-
raise ValueError(
|
|
171
|
-
f"Multiple .gguf files found for variant {variant}, but only one is allowed. {hint}"
|
|
172
|
-
)
|
|
173
|
-
# (case 4) Check whether the variant corresponds to a folder with sharded files (case insensitive)
|
|
174
|
-
else:
|
|
175
|
-
sharded_files = [
|
|
176
|
-
f
|
|
177
|
-
for f in repo_files
|
|
178
|
-
if f.endswith(".gguf")
|
|
179
|
-
and f.lower().startswith(f"{variant}/".lower())
|
|
180
|
-
]
|
|
181
|
-
|
|
182
|
-
if not sharded_files:
|
|
183
|
-
raise ValueError(
|
|
184
|
-
f"No .gguf files found for variant {variant}. {hint}"
|
|
185
|
-
)
|
|
186
|
-
|
|
187
|
-
# Sort to ensure consistent ordering
|
|
188
|
-
sharded_files.sort()
|
|
189
|
-
|
|
190
|
-
# Use first file as primary (this is how llamacpp handles it)
|
|
191
|
-
variant_name = sharded_files[0]
|
|
192
|
-
|
|
193
|
-
core_files = {"variant": variant_name}
|
|
194
|
-
|
|
195
|
-
# If there is a mmproj file, add it to the patterns
|
|
196
|
-
if mmproj:
|
|
197
|
-
if mmproj not in repo_files:
|
|
198
|
-
raise ValueError(
|
|
199
|
-
f"The provided mmproj file {mmproj} was not found in {checkpoint}."
|
|
200
|
-
)
|
|
201
|
-
core_files["mmproj"] = mmproj
|
|
202
|
-
|
|
203
|
-
return core_files, sharded_files
|
|
204
|
-
|
|
205
|
-
def download_gguf(self, model_config: PullConfig) -> dict:
|
|
206
|
-
"""
|
|
207
|
-
Downloads the GGUF file for the given model configuration.
|
|
208
|
-
|
|
209
|
-
For sharded models, if the variant points to a folder (e.g. Q4_0), all files in that folder
|
|
210
|
-
will be downloaded but only the first file will be returned for loading.
|
|
211
|
-
"""
|
|
212
|
-
|
|
213
|
-
# This code handles all cases by constructing the appropriate filename or pattern
|
|
214
|
-
checkpoint, variant = self.parse_checkpoint(model_config.checkpoint)
|
|
215
|
-
|
|
216
|
-
# Identify the GGUF model files in the repository that match the variant
|
|
217
|
-
core_files, sharded_files = self.identify_gguf_models(
|
|
218
|
-
checkpoint, variant, model_config.mmproj
|
|
219
|
-
)
|
|
220
|
-
|
|
221
|
-
# Download the files
|
|
222
|
-
snapshot_folder = huggingface_hub.snapshot_download(
|
|
223
|
-
repo_id=checkpoint,
|
|
224
|
-
allow_patterns=list(core_files.values()) + sharded_files,
|
|
225
|
-
)
|
|
226
|
-
|
|
227
|
-
# Ensure we downloaded all expected files
|
|
228
|
-
for file in list(core_files.values()) + sharded_files:
|
|
229
|
-
expected_path = os.path.join(snapshot_folder, file)
|
|
230
|
-
if not os.path.exists(expected_path):
|
|
231
|
-
raise ValueError(
|
|
232
|
-
f"Hugging Face snapshot download for {model_config.checkpoint} "
|
|
233
|
-
f"expected file {file} not found at {expected_path}"
|
|
234
|
-
)
|
|
235
|
-
|
|
236
|
-
# Return a dict of the full path of the core GGUF files
|
|
237
|
-
return {
|
|
238
|
-
file_name: os.path.join(snapshot_folder, file_path)
|
|
239
|
-
for file_name, file_path in core_files.items()
|
|
240
|
-
}
|
|
241
|
-
|
|
242
98
|
def download_models(
|
|
243
99
|
self,
|
|
244
100
|
models: list[str],
|
|
@@ -317,7 +173,7 @@ class ModelManager:
|
|
|
317
173
|
print(f"Downloading {model} ({checkpoint_to_download})")
|
|
318
174
|
|
|
319
175
|
if "gguf" in checkpoint_to_download.lower():
|
|
320
|
-
self.download_gguf(gguf_model_config)
|
|
176
|
+
download_gguf(gguf_model_config.checkpoint, gguf_model_config.mmproj)
|
|
321
177
|
else:
|
|
322
178
|
huggingface_hub.snapshot_download(repo_id=checkpoint_to_download)
|
|
323
179
|
|
|
@@ -373,7 +229,7 @@ class ModelManager:
|
|
|
373
229
|
print(f"Deleting {model_name} ({checkpoint})")
|
|
374
230
|
|
|
375
231
|
# Handle GGUF models that have the format "checkpoint:variant"
|
|
376
|
-
base_checkpoint = self.parse_checkpoint(checkpoint)[0]
|
|
232
|
+
base_checkpoint = parse_checkpoint(checkpoint)[0]
|
|
377
233
|
|
|
378
234
|
try:
|
|
379
235
|
# Get the local path using snapshot_download with local_files_only=True
|
|
lemonade_server/server_models.json
CHANGED
|
@@ -213,5 +213,16 @@
|
|
|
213
213
|
"recipe": "llamacpp",
|
|
214
214
|
"suggested": false,
|
|
215
215
|
"labels": ["reranking"]
|
|
216
|
+
},
|
|
217
|
+
"Devstral-Small-2507-GGUF":{
|
|
218
|
+
"checkpoint": "mistralai/Devstral-Small-2507_gguf:Q4_K_M",
|
|
219
|
+
"recipe": "llamacpp",
|
|
220
|
+
"suggested": true
|
|
221
|
+
},
|
|
222
|
+
"Qwen2.5-Coder-32B-Instruct-GGUF": {
|
|
223
|
+
"checkpoint": "Qwen/Qwen2.5-Coder-32B-Instruct-GGUF:Q4_K_M",
|
|
224
|
+
"recipe": "llamacpp",
|
|
225
|
+
"suggested": true,
|
|
226
|
+
"labels": ["reasoning"]
|
|
216
227
|
}
|
|
217
228
|
}
|
|
lemonade_sdk-8.0.4.dist-info/METADATA
|
@@ -1,176 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: lemonade-sdk
|
|
3
|
-
Version: 8.0.4
|
|
4
|
-
Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
|
|
5
|
-
Author-email: lemonade@amd.com
|
|
6
|
-
Requires-Python: >=3.10, <3.12
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
License-File: NOTICE.md
|
|
10
|
-
Requires-Dist: invoke>=2.0.0
|
|
11
|
-
Requires-Dist: onnx<1.18.0,>=1.11.0
|
|
12
|
-
Requires-Dist: pyyaml>=5.4
|
|
13
|
-
Requires-Dist: typeguard>=2.3.13
|
|
14
|
-
Requires-Dist: packaging>=20.9
|
|
15
|
-
Requires-Dist: numpy<2.0.0
|
|
16
|
-
Requires-Dist: fasteners
|
|
17
|
-
Requires-Dist: GitPython>=3.1.40
|
|
18
|
-
Requires-Dist: psutil>=6.1.1
|
|
19
|
-
Requires-Dist: wmi
|
|
20
|
-
Requires-Dist: py-cpuinfo
|
|
21
|
-
Requires-Dist: pytz
|
|
22
|
-
Requires-Dist: zstandard
|
|
23
|
-
Requires-Dist: fastapi
|
|
24
|
-
Requires-Dist: uvicorn[standard]
|
|
25
|
-
Requires-Dist: openai>=1.81.0
|
|
26
|
-
Requires-Dist: transformers<=4.51.3
|
|
27
|
-
Requires-Dist: jinja2
|
|
28
|
-
Requires-Dist: tabulate
|
|
29
|
-
Requires-Dist: sentencepiece
|
|
30
|
-
Requires-Dist: huggingface-hub==0.33.0
|
|
31
|
-
Provides-Extra: oga-hybrid
|
|
32
|
-
Requires-Dist: onnx==1.16.1; extra == "oga-hybrid"
|
|
33
|
-
Requires-Dist: numpy==1.26.4; extra == "oga-hybrid"
|
|
34
|
-
Requires-Dist: protobuf>=6.30.1; extra == "oga-hybrid"
|
|
35
|
-
Provides-Extra: oga-cpu
|
|
36
|
-
Requires-Dist: onnxruntime-genai==0.8.2; extra == "oga-cpu"
|
|
37
|
-
Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
|
|
38
|
-
Provides-Extra: dev
|
|
39
|
-
Requires-Dist: torch>=2.6.0; extra == "dev"
|
|
40
|
-
Requires-Dist: accelerate; extra == "dev"
|
|
41
|
-
Requires-Dist: datasets; extra == "dev"
|
|
42
|
-
Requires-Dist: pandas>=1.5.3; extra == "dev"
|
|
43
|
-
Requires-Dist: matplotlib; extra == "dev"
|
|
44
|
-
Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
|
|
45
|
-
Requires-Dist: lm-eval[api]; extra == "dev"
|
|
46
|
-
Provides-Extra: oga-hybrid-minimal
|
|
47
|
-
Requires-Dist: lemonade-sdk[oga-hybrid]; extra == "oga-hybrid-minimal"
|
|
48
|
-
Provides-Extra: oga-cpu-minimal
|
|
49
|
-
Requires-Dist: lemonade-sdk[oga-cpu]; extra == "oga-cpu-minimal"
|
|
50
|
-
Provides-Extra: llm
|
|
51
|
-
Requires-Dist: lemonade-sdk[dev]; extra == "llm"
|
|
52
|
-
Provides-Extra: llm-oga-cpu
|
|
53
|
-
Requires-Dist: lemonade-sdk[dev,oga-cpu]; extra == "llm-oga-cpu"
|
|
54
|
-
Provides-Extra: llm-oga-igpu
|
|
55
|
-
Requires-Dist: onnxruntime-genai-directml==0.6.0; extra == "llm-oga-igpu"
|
|
56
|
-
Requires-Dist: onnxruntime-directml<1.22.0,>=1.19.0; extra == "llm-oga-igpu"
|
|
57
|
-
Requires-Dist: transformers<4.45.0; extra == "llm-oga-igpu"
|
|
58
|
-
Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-igpu"
|
|
59
|
-
Provides-Extra: llm-oga-cuda
|
|
60
|
-
Requires-Dist: onnxruntime-genai-cuda==0.8.2; extra == "llm-oga-cuda"
|
|
61
|
-
Requires-Dist: onnxruntime-gpu>=1.22.0; extra == "llm-oga-cuda"
|
|
62
|
-
Requires-Dist: transformers<=4.51.3; extra == "llm-oga-cuda"
|
|
63
|
-
Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-cuda"
|
|
64
|
-
Provides-Extra: llm-oga-npu
|
|
65
|
-
Requires-Dist: onnx==1.16.0; extra == "llm-oga-npu"
|
|
66
|
-
Requires-Dist: onnxruntime==1.18.0; extra == "llm-oga-npu"
|
|
67
|
-
Requires-Dist: numpy==1.26.4; extra == "llm-oga-npu"
|
|
68
|
-
Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-npu"
|
|
69
|
-
Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-npu"
|
|
70
|
-
Provides-Extra: llm-oga-hybrid
|
|
71
|
-
Requires-Dist: lemonade-sdk[dev,oga-hybrid]; extra == "llm-oga-hybrid"
|
|
72
|
-
Provides-Extra: llm-oga-unified
|
|
73
|
-
Requires-Dist: lemonade-sdk[llm-oga-hybrid]; extra == "llm-oga-unified"
|
|
74
|
-
Dynamic: author-email
|
|
75
|
-
Dynamic: description
|
|
76
|
-
Dynamic: description-content-type
|
|
77
|
-
Dynamic: license-file
|
|
78
|
-
Dynamic: provides-extra
|
|
79
|
-
Dynamic: requires-dist
|
|
80
|
-
Dynamic: requires-python
|
|
81
|
-
Dynamic: summary
|
|
82
|
-
|
|
83
|
-
[](https://github.com/lemonade-sdk/lemonade/tree/main/test "Check out our tests")
|
|
84
|
-
[](docs/README.md#installation "Check out our instructions")
|
|
85
|
-
[](docs/README.md#installation "Check out our instructions")
|
|
86
|
-
|
|
87
|
-
## 🍋 Lemonade SDK: Quickly serve, benchmark and deploy LLMs
|
|
88
|
-
|
|
89
|
-
The [Lemonade SDK](./docs/README.md) makes it easy to run Large Language Models (LLMs) on your PC. Our focus is using the best tools, such as neural processing units (NPUs) and Vulkan GPU acceleration, to maximize LLM speed and responsiveness.
|
|
90
|
-
|
|
91
|
-
<div align="center">
|
|
92
|
-
<img src="https://download.amd.com/images/lemonade_640x480_1.gif" alt="Lemonade Demo" title="Lemonade in Action">
|
|
93
|
-
</div>
|
|
94
|
-
|
|
95
|
-
### Features
|
|
96
|
-
|
|
97
|
-
The [Lemonade SDK](./docs/README.md) is comprised of the following:
|
|
98
|
-
|
|
99
|
-
- 🌐 **[Lemonade Server](https://lemonade-server.ai/docs)**: A local LLM server for running ONNX and GGUF models using the OpenAI API standard. Install and enable your applications with NPU and GPU acceleration in minutes.
|
|
100
|
-
- 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
|
|
101
|
-
- 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
|
|
102
|
-
- Prompting with templates.
|
|
103
|
-
- Measuring accuracy with a variety of tests.
|
|
104
|
-
- Benchmarking to get the time-to-first-token and tokens per second.
|
|
105
|
-
- Profiling the memory utilization.
|
|
106
|
-
|
|
107
|
-
### [Click here to get started with Lemonade.](./docs/README.md)
|
|
108
|
-
|
|
109
|
-
### Supported Configurations
|
|
110
|
-
|
|
111
|
-
Maximum LLM performance requires the right hardware accelerator with the right inference engine for your scenario. Lemonade supports the following configurations, while also making it easy to switch between them at runtime.
|
|
112
|
-
|
|
113
|
-
<table border="1" cellpadding="6" cellspacing="0">
|
|
114
|
-
<thead>
|
|
115
|
-
<tr>
|
|
116
|
-
<th rowspan="2">Hardware</th>
|
|
117
|
-
<th colspan="3" align="center">🛠️ Engine Support</th>
|
|
118
|
-
<th colspan="2" align="center">🖥️ OS (x86/x64)</th>
|
|
119
|
-
</tr>
|
|
120
|
-
<tr>
|
|
121
|
-
<th align="center">OGA</th>
|
|
122
|
-
<th align="center">llamacpp</th>
|
|
123
|
-
<th align="center">HF</th>
|
|
124
|
-
<th align="center">Windows</th>
|
|
125
|
-
<th align="center">Linux</th>
|
|
126
|
-
</tr>
|
|
127
|
-
</thead>
|
|
128
|
-
<tbody>
|
|
129
|
-
<tr>
|
|
130
|
-
<td>🧠 CPU</td>
|
|
131
|
-
<td align="center">All platforms</td>
|
|
132
|
-
<td align="center">All platforms</td>
|
|
133
|
-
<td align="center">All platforms</td>
|
|
134
|
-
<td align="center">✅</td>
|
|
135
|
-
<td align="center">✅</td>
|
|
136
|
-
</tr>
|
|
137
|
-
<tr>
|
|
138
|
-
<td>🎮 GPU</td>
|
|
139
|
-
<td align="center">—</td>
|
|
140
|
-
<td align="center">Vulkan: All platforms<br><small>Focus:<br/>Ryzen™ AI 7000/8000/300<br/>Radeon™ 7000/9000</small></td>
|
|
141
|
-
<td align="center">—</td>
|
|
142
|
-
<td align="center">✅</td>
|
|
143
|
-
<td align="center">✅</td>
|
|
144
|
-
</tr>
|
|
145
|
-
<tr>
|
|
146
|
-
<td>🤖 NPU</td>
|
|
147
|
-
<td align="center">AMD Ryzen™ AI 300 series</td>
|
|
148
|
-
<td align="center">—</td>
|
|
149
|
-
<td align="center">—</td>
|
|
150
|
-
<td align="center">✅</td>
|
|
151
|
-
<td align="center">—</td>
|
|
152
|
-
</tr>
|
|
153
|
-
</tbody>
|
|
154
|
-
</table>
|
|
155
|
-
|
|
156
|
-
To learn more about the supported hardware and software, visit the documentation [here](./docs/README.md#software-and-hardware-overview).
|
|
157
|
-
|
|
158
|
-
## Integrate Lemonade Server with Your Application
|
|
159
|
-
|
|
160
|
-
Lemonade Server enables languages including Python, C++, Java, C#, Node.js, Go, Ruby, Rust, and PHP. For the full list and integration details, see [docs/server/README.md](./docs/server/README.md).
|
|
161
|
-
|
|
162
|
-
## Contributing
|
|
163
|
-
|
|
164
|
-
We are actively seeking collaborators from across the industry. If you would like to contribute to this project, please check out our [contribution guide](./docs/contribute.md).
|
|
165
|
-
|
|
166
|
-
## Maintainers
|
|
167
|
-
|
|
168
|
-
This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues) or email [lemonade@amd.com](mailto:lemonade@amd.com).
|
|
169
|
-
|
|
170
|
-
## License
|
|
171
|
-
|
|
172
|
-
This project is licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE). Portions of the project are licensed as described in [NOTICE.md](./NOTICE.md).
|
|
173
|
-
|
|
174
|
-
<!--This file was originally licensed under Apache 2.0. It has been modified.
|
|
175
|
-
Modifications Copyright (c) 2025 AMD-->
|
|
176
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|