lemonade-sdk 7.0.3__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/api.py +3 -3
- lemonade/cli.py +11 -17
- lemonade/common/build.py +0 -47
- lemonade/common/network.py +50 -0
- lemonade/common/status.py +2 -21
- lemonade/common/system_info.py +19 -4
- lemonade/profilers/memory_tracker.py +3 -1
- lemonade/tools/accuracy.py +3 -4
- lemonade/tools/adapter.py +1 -2
- lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
- lemonade/tools/humaneval.py +9 -3
- lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
- lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
- lemonade/tools/mmlu.py +7 -15
- lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
- lemonade/tools/oga/utils.py +423 -0
- lemonade/tools/perplexity.py +4 -3
- lemonade/tools/prompt.py +2 -1
- lemonade/tools/quark/quark_load.py +2 -1
- lemonade/tools/quark/quark_quantize.py +5 -5
- lemonade/tools/report/table.py +3 -3
- lemonade/tools/server/llamacpp.py +159 -34
- lemonade/tools/server/serve.py +169 -147
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/styles.css +568 -0
- lemonade/tools/server/static/webapp.html +439 -0
- lemonade/tools/server/tray.py +458 -0
- lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
- lemonade/tools/server/utils/system_tray.py +395 -0
- lemonade/tools/server/{instructions.py → webapp.py} +4 -10
- lemonade/version.py +1 -1
- lemonade_install/install.py +46 -28
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
- lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
- lemonade_server/cli.py +182 -27
- lemonade_server/model_manager.py +192 -20
- lemonade_server/pydantic_models.py +9 -4
- lemonade_server/server_models.json +5 -3
- lemonade/common/analyze_model.py +0 -26
- lemonade/common/labels.py +0 -61
- lemonade/common/onnx_helpers.py +0 -176
- lemonade/common/plugins.py +0 -10
- lemonade/common/tensor_helpers.py +0 -83
- lemonade/tools/server/static/instructions.html +0 -262
- lemonade_sdk-7.0.3.dist-info/RECORD +0 -69
- /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
- /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
- /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
lemonade/tools/server/llamacpp.py
@@ -6,6 +6,8 @@ import subprocess
 import zipfile
 import re
 import threading
+import platform
+import shutil
 
 import requests
 from tabulate import tabulate
@@ -14,21 +16,83 @@ from fastapi.responses import StreamingResponse
 
 from openai import OpenAI
 
-from lemonade_server.pydantic_models import ChatCompletionRequest
+from lemonade_server.pydantic_models import ChatCompletionRequest, PullConfig
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.
+from lemonade.tools.server.utils.port import find_free_port
 
-LLAMA_VERSION = "
+LLAMA_VERSION = "b5699"
 
-LLAMA_SERVER_EXE_DIR = os.path.join(
-    os.path.dirname(sys.executable),
-    "llama_server",
-)
 
-
-
-
-
+def get_llama_server_paths():
+    """
+    Get platform-specific paths for llama server directory and executable
+    """
+    base_dir = os.path.join(os.path.dirname(sys.executable), "llama_server")
+
+    if platform.system().lower() == "windows":
+        return base_dir, os.path.join(base_dir, "llama-server.exe")
+    else:  # Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+        build_bin_path = os.path.join(base_dir, "build", "bin", "llama-server")
+        if os.path.exists(build_bin_path):
+            return base_dir, build_bin_path
+        else:
+            # Fallback to root directory
+            return base_dir, os.path.join(base_dir, "llama-server")
+
+
+def get_binary_url_and_filename(version):
+    """
+    Get the appropriate binary URL and filename based on platform
+    """
+    system = platform.system().lower()
+
+    if system == "windows":
+        filename = f"llama-{version}-bin-win-vulkan-x64.zip"
+    elif system == "linux":
+        filename = f"llama-{version}-bin-ubuntu-vulkan-x64.zip"
+    else:
+        raise NotImplementedError(
+            f"Platform {system} not supported for llamacpp. Supported: Windows, Ubuntu Linux"
+        )
+
+    url = (
+        f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
+    )
+    return url, filename
+
+
+def validate_platform_support():
+    """
+    Validate platform support before attempting download
+    """
+    system = platform.system().lower()
+
+    if system not in ["windows", "linux"]:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=(
+                f"Platform {system} not supported for llamacpp. "
+                "Supported: Windows, Ubuntu Linux"
+            ),
+        )
+
+    if system == "linux":
+        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
+        try:
+            with open("/etc/os-release", "r", encoding="utf-8") as f:
+                os_info = f.read().lower()
+                if "ubuntu" not in os_info and "debian" not in os_info:
+                    logging.warning(
+                        "llamacpp binaries are built for Ubuntu. "
+                        "Compatibility with other Linux distributions is not guaranteed."
+                    )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logging.warning(
+                "Could not determine Linux distribution (%s). "
+                "llamacpp binaries are built for Ubuntu.",
+                str(e),
+            )
 
 
 class LlamaTelemetry:
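The 8.0.0 release pins LLAMA_VERSION to "b5699" and selects a platform-specific Vulkan build of llama.cpp at install time. As a rough illustration of the URL scheme the new get_binary_url_and_filename() helper produces (a sketch only; it just mirrors the filenames in the hunk above):

    # Sketch: the two release assets the helper can select, following the
    # llama.cpp GitHub release layout shown in the diff above.
    version = "b5699"
    for filename in (
        f"llama-{version}-bin-win-vulkan-x64.zip",      # Windows
        f"llama-{version}-bin-ubuntu-vulkan-x64.zip",   # Ubuntu Linux
    ):
        print(f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}")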
@@ -66,10 +130,21 @@ class LlamaTelemetry:
         Parse telemetry data from llama server output lines.
         """
 
+        # Parse Vulkan device detection
+        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+        if vulkan_match:
+            device_count = int(vulkan_match.group(1))
+            if device_count > 0:
+                logging.info(
+                    f"GPU acceleration active: {device_count} Vulkan device(s) "
+                    "detected by llama-server"
+                )
+            return
+
         # Parse prompt evaluation line
         prompt_match = re.search(
-
-            r"
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if prompt_match:
@@ -83,7 +158,8 @@ class LlamaTelemetry:
 
         # Parse generation evaluation line
         eval_match = re.search(
-            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if eval_match:
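Both telemetry patterns now split the regex across two raw-string literals, so the full expression ends in .*?([\d.]+)\s*tokens per second. A minimal sketch of how the prompt pattern extracts its numbers; the sample line below is fabricated to match the pattern, and real llama-server timing output may differ in spacing:

    import re

    # Fabricated example line shaped like llama-server timing output
    line = (
        "prompt eval time =     91.23 ms /    12 tokens "
        "(    7.60 ms per token,   131.54 tokens per second)"
    )
    match = re.search(
        r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
        r"([\d.]+)\s*tokens per second",
        line,
    )
    if match:
        time_ms, n_tokens, tokens_per_second = match.groups()
        print(time_ms, n_tokens, tokens_per_second)  # 91.23 12 131.54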
@@ -145,16 +221,14 @@ def _log_subprocess_output(
             break
 
 
-def _wait_for_load(
-    llama_server_process: subprocess.Popen, port: int, fail_message: str
-):
+def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
     status_code = None
     while not llama_server_process.poll() and status_code != 200:
         health_url = f"http://localhost:{port}/health"
         try:
             health_response = requests.get(health_url)
         except requests.exceptions.ConnectionError:
-            logging.
+            logging.debug("Not able to connect to llama-server yet, will retry")
         else:
             status_code = health_response.status_code
             logging.debug(
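The simplified _wait_for_load() drops the fail_message parameter (the warning now lives at the call site, see the server_load() hunks below) and keeps polling the server's /health endpoint until it answers 200 or the subprocess exits. A standalone sketch of that readiness loop, assuming a generic Popen handle and port; the sleep is added here only to avoid a busy loop and is not part of the original:

    import logging
    import time
    import requests

    def wait_for_health(process, port, retry_delay_s=1.0):
        """Poll http://localhost:{port}/health until it returns 200 or the process exits."""
        while process.poll() is None:
            try:
                if requests.get(f"http://localhost:{port}/health").status_code == 200:
                    return True
            except requests.exceptions.ConnectionError:
                logging.debug("Not able to connect to llama-server yet, will retry")
            time.sleep(retry_delay_s)
        return False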
@@ -171,8 +245,11 @@ def _launch_llama_subprocess(
     Launch llama server subprocess with GPU or CPU configuration
     """
 
+    # Get the current executable path (handles both Windows and Ubuntu structures)
+    _, exe_path = get_llama_server_paths()
+
     # Build the base command
-    base_command = [
+    base_command = [exe_path, "-m", snapshot_files["variant"]]
     if "mmproj" in snapshot_files:
         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
     if not use_gpu:
@@ -185,13 +262,33 @@ def _launch_llama_subprocess(
     # Add port and jinja to enable tool use
     base_command.extend(["--port", str(telemetry.port), "--jinja"])
 
+    # Use legacy reasoning formatting, since not all apps support the new
+    # reasoning_content field
+    base_command.extend(["--reasoning-format", "none"])
+
     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
     command = base_command + ["-ngl", ngl_value]
 
+    # Set up environment with library path for Linux
+    env = os.environ.copy()
+    if platform.system().lower() == "linux":
+        lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+        current_ld_path = env.get("LD_LIBRARY_PATH", "")
+        if current_ld_path:
+            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+        else:
+            env["LD_LIBRARY_PATH"] = lib_dir
+        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
     # Start subprocess with output capture
     process = subprocess.Popen(
-        command,
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
     )
 
     # Start background thread to log subprocess output
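Taken together, the GPU and CPU attempts differ only in the -ngl value, every launch now also passes --reasoning-format none alongside the existing --port and --jinja flags, and on Linux the subprocess inherits an LD_LIBRARY_PATH pointing at the directory of the extracted binary. A rough sketch of the resulting command line, using placeholder paths (not real files) and omitting the CPU-only flags that the hunk does not show:

    # Placeholder paths for illustration only
    exe_path = "/opt/venv/llama_server/build/bin/llama-server"
    snapshot_files = {"variant": "/models/example.gguf"}

    def build_command(use_gpu, port=8080):
        command = [exe_path, "-m", snapshot_files["variant"]]
        command += ["--port", str(port), "--jinja"]
        command += ["--reasoning-format", "none"]      # legacy reasoning formatting
        command += ["-ngl", "99" if use_gpu else "0"]  # 99 layers on GPU, 0 on CPU
        return command

    print(build_command(use_gpu=True))
    print(build_command(use_gpu=False))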
@@ -205,15 +302,30 @@ def _launch_llama_subprocess(
     return process
 
 
-def server_load(model_config:
+def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
+
+    # Validate platform support before proceeding
+    validate_platform_support()
+
+    # Get platform-specific paths at runtime
+    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
+
+    # Check whether the llamacpp install needs an upgrade
+    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
+    if os.path.exists(version_txt_path):
+        with open(version_txt_path, "r", encoding="utf-8") as f:
+            llamacpp_installed_version = f.read()
+
+        if llamacpp_installed_version != LLAMA_VERSION:
+            # Remove the existing install, which will trigger a new install
+            # in the next code block
+            shutil.rmtree(llama_server_exe_dir)
+
     # Download llama.cpp server if it isn't already available
-    if not os.path.exists(
+    if not os.path.exists(llama_server_exe_dir):
         # Download llama.cpp server zip
-
-
-        llama_zip_path = os.path.join(
-            os.path.dirname(sys.executable), "llama-server.zip"
-        )
+        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
+        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
         logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
 
         with requests.get(llama_zip_url, stream=True) as r:
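server_load() now records the llama.cpp version it extracted in a version.txt file (written later in this hunk sequence) and removes the whole install directory when the recorded version no longer matches LLAMA_VERSION, so the download block runs again. A condensed sketch of that upgrade check, with an illustrative helper name and paths:

    import os
    import shutil

    def needs_fresh_install(install_dir, expected_version):
        """Return True when the llama-server binaries should be (re)downloaded."""
        version_txt = os.path.join(install_dir, "version.txt")
        if os.path.exists(version_txt):
            with open(version_txt, "r", encoding="utf-8") as f:
                if f.read() != expected_version:
                    # Stale install: remove it so the download runs again
                    shutil.rmtree(install_dir)
        return not os.path.exists(install_dir)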
@@ -223,12 +335,23 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
                     f.write(chunk)
 
         # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {
+        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(
+            zip_ref.extractall(llama_server_exe_dir)
+
+        # Make executable on Linux - need to update paths after extraction
+        if platform.system().lower() == "linux":
+            # Re-get the paths since extraction might have changed the directory structure
+            _, updated_exe_path = get_llama_server_paths()
+            if os.path.exists(updated_exe_path):
+                os.chmod(updated_exe_path, 0o755)
+                logging.info(f"Set executable permissions for {updated_exe_path}")
+            else:
+                logging.warning(
+                    f"Could not find llama-server executable at {updated_exe_path}"
+                )
 
         # Save version.txt
-        version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
         with open(version_txt_path, "w", encoding="utf-8") as vf:
             vf.write(LLAMA_VERSION)
 
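Python's zipfile.extractall() does not restore Unix permissions, which is why the new Linux branch explicitly chmods the extracted binary. A tiny sketch of that step with a placeholder path:

    import os

    exe = "/opt/venv/llama_server/build/bin/llama-server"  # placeholder path
    if os.path.exists(exe):
        os.chmod(exe, 0o755)  # rwxr-xr-x, matching the 0o755 used in the diff above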
@@ -241,7 +364,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
     # Start the llama-serve.exe process
-    logging.debug(f"Using llama_server for GGUF model: {
+    logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")
 
     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
@@ -252,11 +375,14 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     _wait_for_load(
         llama_server_process,
         telemetry.port,
-        f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
     )
 
     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
+        logging.warning(
+            f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+        )
+
         llama_server_process = _launch_llama_subprocess(
             snapshot_files, use_gpu=False, telemetry=telemetry
         )
@@ -265,13 +391,12 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
         _wait_for_load(
             llama_server_process,
             telemetry.port,
-            f"Loading {model_reference} on CPU didn't work",
         )
 
     if llama_server_process.poll():
         raise HTTPException(
             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {
+            detail=f"Failed to load {model_config.model_name} with llama.cpp",
         )
 
     return llama_server_process