lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/api.py +3 -3
- lemonade/cli.py +11 -17
- lemonade/common/build.py +0 -47
- lemonade/common/network.py +50 -0
- lemonade/common/status.py +2 -21
- lemonade/common/system_info.py +19 -4
- lemonade/profilers/memory_tracker.py +3 -1
- lemonade/tools/accuracy.py +3 -4
- lemonade/tools/adapter.py +1 -2
- lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
- lemonade/tools/humaneval.py +9 -3
- lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
- lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
- lemonade/tools/mmlu.py +7 -15
- lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
- lemonade/tools/oga/utils.py +423 -0
- lemonade/tools/perplexity.py +4 -3
- lemonade/tools/prompt.py +2 -1
- lemonade/tools/quark/quark_load.py +2 -1
- lemonade/tools/quark/quark_quantize.py +5 -5
- lemonade/tools/report/table.py +3 -3
- lemonade/tools/server/llamacpp.py +154 -29
- lemonade/tools/server/serve.py +169 -146
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/styles.css +568 -0
- lemonade/tools/server/static/webapp.html +439 -0
- lemonade/tools/server/tray.py +458 -0
- lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
- lemonade/tools/server/utils/system_tray.py +395 -0
- lemonade/tools/server/{instructions.py → webapp.py} +4 -10
- lemonade/version.py +1 -1
- lemonade_install/install.py +46 -28
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
- lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
- lemonade_server/cli.py +182 -27
- lemonade_server/model_manager.py +192 -20
- lemonade_server/pydantic_models.py +9 -4
- lemonade_server/server_models.json +5 -3
- lemonade/common/analyze_model.py +0 -26
- lemonade/common/labels.py +0 -61
- lemonade/common/onnx_helpers.py +0 -176
- lemonade/common/plugins.py +0 -10
- lemonade/common/tensor_helpers.py +0 -83
- lemonade/tools/server/static/instructions.html +0 -262
- lemonade_sdk-7.0.4.dist-info/RECORD +0 -69
- /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
- /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
- /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
The hunks below are from lemonade/tools/server/llamacpp.py (+154 -29).

@@ -6,6 +6,8 @@ import subprocess
 import zipfile
 import re
 import threading
+import platform
+import shutil

 import requests
 from tabulate import tabulate
@@ -14,21 +16,83 @@ from fastapi.responses import StreamingResponse

 from openai import OpenAI

-from lemonade_server.pydantic_models import ChatCompletionRequest
+from lemonade_server.pydantic_models import ChatCompletionRequest, PullConfig
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.
+from lemonade.tools.server.utils.port import find_free_port

-LLAMA_VERSION = "
+LLAMA_VERSION = "b5699"

-LLAMA_SERVER_EXE_DIR = os.path.join(
-    os.path.dirname(sys.executable),
-    "llama_server",
-)

-
-
-
-
+def get_llama_server_paths():
+    """
+    Get platform-specific paths for llama server directory and executable
+    """
+    base_dir = os.path.join(os.path.dirname(sys.executable), "llama_server")
+
+    if platform.system().lower() == "windows":
+        return base_dir, os.path.join(base_dir, "llama-server.exe")
+    else:  # Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+        build_bin_path = os.path.join(base_dir, "build", "bin", "llama-server")
+        if os.path.exists(build_bin_path):
+            return base_dir, build_bin_path
+        else:
+            # Fallback to root directory
+            return base_dir, os.path.join(base_dir, "llama-server")
+
+
+def get_binary_url_and_filename(version):
+    """
+    Get the appropriate binary URL and filename based on platform
+    """
+    system = platform.system().lower()
+
+    if system == "windows":
+        filename = f"llama-{version}-bin-win-vulkan-x64.zip"
+    elif system == "linux":
+        filename = f"llama-{version}-bin-ubuntu-vulkan-x64.zip"
+    else:
+        raise NotImplementedError(
+            f"Platform {system} not supported for llamacpp. Supported: Windows, Ubuntu Linux"
+        )
+
+    url = (
+        f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
+    )
+    return url, filename
+
+
+def validate_platform_support():
+    """
+    Validate platform support before attempting download
+    """
+    system = platform.system().lower()
+
+    if system not in ["windows", "linux"]:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=(
+                f"Platform {system} not supported for llamacpp. "
+                "Supported: Windows, Ubuntu Linux"
+            ),
+        )
+
+    if system == "linux":
+        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
+        try:
+            with open("/etc/os-release", "r", encoding="utf-8") as f:
+                os_info = f.read().lower()
+                if "ubuntu" not in os_info and "debian" not in os_info:
+                    logging.warning(
+                        "llamacpp binaries are built for Ubuntu. "
+                        "Compatibility with other Linux distributions is not guaranteed."
+                    )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logging.warning(
+                "Could not determine Linux distribution (%s). "
+                "llamacpp binaries are built for Ubuntu.",
+                str(e),
+            )


 class LlamaTelemetry:
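The three new module-level helpers are deterministic given the host platform, so their behavior is easy to check in isolation. A minimal usage sketch, assuming lemonade-sdk 8.0.0 is installed and importing from this file (lemonade/tools/server/llamacpp.py); the comments describe the values the code above would produce, not captured output:

```python
# Usage sketch for the new platform helpers (assumes lemonade-sdk 8.0.0).
from lemonade.tools.server.llamacpp import (
    LLAMA_VERSION,
    get_binary_url_and_filename,
    get_llama_server_paths,
)

url, filename = get_binary_url_and_filename(LLAMA_VERSION)
# filename: llama-b5699-bin-win-vulkan-x64.zip on Windows,
#           llama-b5699-bin-ubuntu-vulkan-x64.zip on Linux
# url:      https://github.com/ggml-org/llama.cpp/releases/download/b5699/<filename>
print(url, filename)

base_dir, exe_path = get_llama_server_paths()
# base_dir: <python executable dir>/llama_server
# exe_path: llama-server.exe on Windows; build/bin/llama-server (or the
#           directory root as a fallback) on Linux
print(base_dir, exe_path)
```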
@@ -66,10 +130,21 @@ class LlamaTelemetry:
         Parse telemetry data from llama server output lines.
         """

+        # Parse Vulkan device detection
+        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+        if vulkan_match:
+            device_count = int(vulkan_match.group(1))
+            if device_count > 0:
+                logging.info(
+                    f"GPU acceleration active: {device_count} Vulkan device(s) "
+                    "detected by llama-server"
+                )
+            return
+
         # Parse prompt evaluation line
         prompt_match = re.search(
-
-            r"
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if prompt_match:
@@ -83,7 +158,8 @@ class LlamaTelemetry:

         # Parse generation evaluation line
         eval_match = re.search(
-            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if eval_match:
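The telemetry parser now recognizes three kinds of llama-server output: the Vulkan device report and the prompt/generation timing summaries. A standalone check of the regexes with synthetic lines shaped to match the patterns above (the sample strings are illustrative, not real server output):

```python
import re

# Synthetic lines shaped to match the regexes added in the hunks above.
vulkan_line = "ggml_vulkan: Found 1 Vulkan devices:"
prompt_line = (
    "prompt eval time =     123.45 ms /    32 tokens "
    "(    3.86 ms per token,   259.21 tokens per second)"
)
eval_line = (
    "eval time =    2100.00 ms /   128 tokens "
    "(   16.41 ms per token,    60.95 tokens per second)"
)

# Vulkan device detection: group(1) is the device count.
vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", vulkan_line)
print(int(vulkan_match.group(1)))  # 1 -> "GPU acceleration active" gets logged

# Timing summaries: groups are (total ms, token count, tokens per second).
prompt_pattern = (
    r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
    r"([\d.]+)\s*tokens per second"
)
eval_pattern = (
    r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
    r"([\d.]+)\s*tokens per second"
)
print(re.search(prompt_pattern, prompt_line).groups())  # ('123.45', '32', '259.21')
print(re.search(eval_pattern, eval_line).groups())      # ('2100.00', '128', '60.95')
```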
@@ -169,8 +245,11 @@ def _launch_llama_subprocess(
     Launch llama server subprocess with GPU or CPU configuration
     """

+    # Get the current executable path (handles both Windows and Ubuntu structures)
+    _, exe_path = get_llama_server_paths()
+
     # Build the base command
-    base_command = [
+    base_command = [exe_path, "-m", snapshot_files["variant"]]
     if "mmproj" in snapshot_files:
         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
     if not use_gpu:
@@ -183,13 +262,33 @@ def _launch_llama_subprocess(
     # Add port and jinja to enable tool use
     base_command.extend(["--port", str(telemetry.port), "--jinja"])

+    # Use legacy reasoning formatting, since not all apps support the new
+    # reasoning_content field
+    base_command.extend(["--reasoning-format", "none"])
+
     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
     command = base_command + ["-ngl", ngl_value]

+    # Set up environment with library path for Linux
+    env = os.environ.copy()
+    if platform.system().lower() == "linux":
+        lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+        current_ld_path = env.get("LD_LIBRARY_PATH", "")
+        if current_ld_path:
+            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+        else:
+            env["LD_LIBRARY_PATH"] = lib_dir
+        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
     # Start subprocess with output capture
     process = subprocess.Popen(
-        command,
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
     )

     # Start background thread to log subprocess output
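Putting this hunk together with the previous one: the subprocess is launched with the assembled flag list, and on Linux the executable's own directory is prepended to LD_LIBRARY_PATH so the bundled shared libraries resolve. A condensed sketch of that launch pattern, with hypothetical paths standing in for the real executable and GGUF locations:

```python
import os
import platform
import subprocess

exe_path = "/opt/venv/llama_server/build/bin/llama-server"  # hypothetical install location
gguf_path = "/models/example/model-Q4_K_M.gguf"             # hypothetical model file

command = [
    exe_path, "-m", gguf_path,
    "--port", "8081",                # free port picked by the caller
    "--jinja",                       # enable tool use
    "--reasoning-format", "none",    # keep legacy reasoning formatting
    "-ngl", "99",                    # 99 = offload layers to GPU, 0 = CPU-only
]

# On Linux, make the bundled shared libraries next to the binary loadable.
env = os.environ.copy()
if platform.system().lower() == "linux":
    lib_dir = os.path.dirname(exe_path)
    env["LD_LIBRARY_PATH"] = (
        f"{lib_dir}:{env['LD_LIBRARY_PATH']}" if env.get("LD_LIBRARY_PATH") else lib_dir
    )

process = subprocess.Popen(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # merge stderr so one reader sees all output
    text=True,
    bufsize=1,                 # line-buffered, for the telemetry line parser
    env=env,
)
```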
@@ -203,15 +302,30 @@ def _launch_llama_subprocess(
     return process


-def server_load(model_config:
+def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
+
+    # Validate platform support before proceeding
+    validate_platform_support()
+
+    # Get platform-specific paths at runtime
+    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
+
+    # Check whether the llamacpp install needs an upgrade
+    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
+    if os.path.exists(version_txt_path):
+        with open(version_txt_path, "r", encoding="utf-8") as f:
+            llamacpp_installed_version = f.read()
+
+        if llamacpp_installed_version != LLAMA_VERSION:
+            # Remove the existing install, which will trigger a new install
+            # in the next code block
+            shutil.rmtree(llama_server_exe_dir)
+
     # Download llama.cpp server if it isn't already available
-    if not os.path.exists(
+    if not os.path.exists(llama_server_exe_dir):
         # Download llama.cpp server zip
-
-
-        llama_zip_path = os.path.join(
-            os.path.dirname(sys.executable), "llama-server.zip"
-        )
+        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
+        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
         logging.info(f"Downloading llama.cpp server from {llama_zip_url}")

         with requests.get(llama_zip_url, stream=True) as r:
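The upgrade check added at the top of server_load is a simple version stamp: version.txt is compared against the pinned LLAMA_VERSION, and on mismatch the install directory is removed so the download branch runs again. The same logic in isolation, with the directory layout as defined by the helpers above:

```python
import os
import shutil
import sys

LLAMA_VERSION = "b5699"  # pinned llama.cpp release tag from this diff

base_dir = os.path.join(os.path.dirname(sys.executable), "llama_server")
version_txt_path = os.path.join(base_dir, "version.txt")

# If a different build is present, remove it so the download logic below
# treats llama.cpp as not-yet-installed and fetches the pinned version.
if os.path.exists(version_txt_path):
    with open(version_txt_path, "r", encoding="utf-8") as f:
        installed = f.read()
    if installed != LLAMA_VERSION:
        shutil.rmtree(base_dir)
```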
@@ -221,12 +335,23 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
                 f.write(chunk)

         # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {
+        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(
+            zip_ref.extractall(llama_server_exe_dir)
+
+        # Make executable on Linux - need to update paths after extraction
+        if platform.system().lower() == "linux":
+            # Re-get the paths since extraction might have changed the directory structure
+            _, updated_exe_path = get_llama_server_paths()
+            if os.path.exists(updated_exe_path):
+                os.chmod(updated_exe_path, 0o755)
+                logging.info(f"Set executable permissions for {updated_exe_path}")
+            else:
+                logging.warning(
+                    f"Could not find llama-server executable at {updated_exe_path}"
+                )

         # Save version.txt
-        version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
         with open(version_txt_path, "w", encoding="utf-8") as vf:
             vf.write(LLAMA_VERSION)

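The install branch streams the release zip next to the Python executable, extracts it into the llama_server directory, restores the executable bit on Linux, and writes the version stamp. A sketch of that sequence, wrapped in a hypothetical install_llama_server() helper for readability; the chunked download loop is assumed, since only the f.write(chunk) line is visible in this hunk:

```python
import os
import platform
import zipfile

import requests


def install_llama_server(url, zip_path, dest_dir, exe_path, version):
    # Stream the release asset to disk in chunks (loop shape assumed).
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Unpack into the llama_server directory.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(dest_dir)

    # Zip archives don't reliably preserve the executable bit on Linux.
    if platform.system().lower() == "linux" and os.path.exists(exe_path):
        os.chmod(exe_path, 0o755)

    # Stamp the install so the upgrade check above can compare versions later.
    with open(os.path.join(dest_dir, "version.txt"), "w", encoding="utf-8") as vf:
        vf.write(version)
```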
@@ -239,7 +364,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     logging.debug(f"GGUF file paths: {snapshot_files}")

     # Start the llama-serve.exe process
-    logging.debug(f"Using llama_server for GGUF model: {
+    logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")

     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
@@ -255,7 +380,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
         logging.warning(
-            f"Loading {
+            f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
         )

         llama_server_process = _launch_llama_subprocess(
@@ -271,7 +396,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     if llama_server_process.poll():
         raise HTTPException(
             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {
+            detail=f"Failed to load {model_config.model_name} with llama.cpp",
         )

     return llama_server_process
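Load-time fallback is driven entirely by Popen.poll(): a non-None exit code right after the GPU attempt triggers a CPU-only retry, and a second failure is surfaced as an HTTP 422. A condensed sketch of that control flow; the launch callable stands in for _launch_llama_subprocess, whose full signature is abbreviated in this diff:

```python
from fastapi import HTTPException, status


def load_with_fallback(launch, model_name):
    # `launch` stands in for _launch_llama_subprocess and must return the
    # subprocess.Popen object for the llama-server process.
    process = launch(use_gpu=True)  # try GPU first (-ngl 99)

    # poll() is None while the server is still running; an exit code here
    # means the GPU attempt died, so retry CPU-only (-ngl 0).
    if process.poll():
        process = launch(use_gpu=False)

    # A second failure is reported to the caller as HTTP 422.
    if process.poll():
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Failed to load {model_name} with llama.cpp",
        )
    return process
```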