lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Files changed (55)
  1. lemonade/api.py +3 -3
  2. lemonade/cli.py +11 -17
  3. lemonade/common/build.py +0 -47
  4. lemonade/common/network.py +50 -0
  5. lemonade/common/status.py +2 -21
  6. lemonade/common/system_info.py +19 -4
  7. lemonade/profilers/memory_tracker.py +3 -1
  8. lemonade/tools/accuracy.py +3 -4
  9. lemonade/tools/adapter.py +1 -2
  10. lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
  11. lemonade/tools/huggingface/load.py +235 -0
  12. lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
  13. lemonade/tools/humaneval.py +9 -3
  14. lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
  15. lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
  16. lemonade/tools/mmlu.py +7 -15
  17. lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
  18. lemonade/tools/oga/utils.py +423 -0
  19. lemonade/tools/perplexity.py +4 -3
  20. lemonade/tools/prompt.py +2 -1
  21. lemonade/tools/quark/quark_load.py +2 -1
  22. lemonade/tools/quark/quark_quantize.py +5 -5
  23. lemonade/tools/report/table.py +3 -3
  24. lemonade/tools/server/llamacpp.py +154 -29
  25. lemonade/tools/server/serve.py +169 -146
  26. lemonade/tools/server/static/favicon.ico +0 -0
  27. lemonade/tools/server/static/styles.css +568 -0
  28. lemonade/tools/server/static/webapp.html +439 -0
  29. lemonade/tools/server/tray.py +458 -0
  30. lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
  31. lemonade/tools/server/utils/system_tray.py +395 -0
  32. lemonade/tools/server/{instructions.py → webapp.py} +4 -10
  33. lemonade/version.py +1 -1
  34. lemonade_install/install.py +46 -28
  35. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
  36. lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
  37. lemonade_server/cli.py +182 -27
  38. lemonade_server/model_manager.py +192 -20
  39. lemonade_server/pydantic_models.py +9 -4
  40. lemonade_server/server_models.json +5 -3
  41. lemonade/common/analyze_model.py +0 -26
  42. lemonade/common/labels.py +0 -61
  43. lemonade/common/onnx_helpers.py +0 -176
  44. lemonade/common/plugins.py +0 -10
  45. lemonade/common/tensor_helpers.py +0 -83
  46. lemonade/tools/server/static/instructions.html +0 -262
  47. lemonade_sdk-7.0.4.dist-info/RECORD +0 -69
  48. /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
  49. /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
  50. /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
  51. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
  52. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
  53. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
  54. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
  55. {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
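Several modules move into subpackages in 8.0.0 (for example, lemonade/tools/server/port_utils.py becomes lemonade/tools/server/utils/port.py, and the ort_genai tools move under lemonade/tools/oga/), so downstream code has to import from the new paths. A minimal sketch of the adjustment, assuming the public symbols keep their names after the moves; only the port helper rename is confirmed by the hunks below:

# Sketch of the import-path change under the 8.0.0 layout.
# 7.0.4:
#   from lemonade.tools.server.port_utils import find_free_port
# 8.0.0 (matches the import change in the llamacpp.py diff below):
from lemonade.tools.server.utils.port import find_free_port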
lemonade/tools/server/llamacpp.py

@@ -6,6 +6,8 @@ import subprocess
 import zipfile
 import re
 import threading
+import platform
+import shutil
 
 import requests
 from tabulate import tabulate
@@ -14,21 +16,83 @@ from fastapi.responses import StreamingResponse
 
 from openai import OpenAI
 
-from lemonade_server.pydantic_models import ChatCompletionRequest
+from lemonade_server.pydantic_models import ChatCompletionRequest, PullConfig
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.port_utils import find_free_port
+from lemonade.tools.server.utils.port import find_free_port
 
-LLAMA_VERSION = "b5543"
+LLAMA_VERSION = "b5699"
 
-LLAMA_SERVER_EXE_DIR = os.path.join(
-    os.path.dirname(sys.executable),
-    "llama_server",
-)
 
-LLAMA_SERVER_EXE_PATH = os.path.join(
-    LLAMA_SERVER_EXE_DIR,
-    "llama-server.exe",
-)
+def get_llama_server_paths():
+    """
+    Get platform-specific paths for llama server directory and executable
+    """
+    base_dir = os.path.join(os.path.dirname(sys.executable), "llama_server")
+
+    if platform.system().lower() == "windows":
+        return base_dir, os.path.join(base_dir, "llama-server.exe")
+    else:  # Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory (current Ubuntu structure)
+        build_bin_path = os.path.join(base_dir, "build", "bin", "llama-server")
+        if os.path.exists(build_bin_path):
+            return base_dir, build_bin_path
+        else:
+            # Fallback to root directory
+            return base_dir, os.path.join(base_dir, "llama-server")
+
+
+def get_binary_url_and_filename(version):
+    """
+    Get the appropriate binary URL and filename based on platform
+    """
+    system = platform.system().lower()
+
+    if system == "windows":
+        filename = f"llama-{version}-bin-win-vulkan-x64.zip"
+    elif system == "linux":
+        filename = f"llama-{version}-bin-ubuntu-vulkan-x64.zip"
+    else:
+        raise NotImplementedError(
+            f"Platform {system} not supported for llamacpp. Supported: Windows, Ubuntu Linux"
+        )
+
+    url = (
+        f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
+    )
+    return url, filename
+
+
+def validate_platform_support():
+    """
+    Validate platform support before attempting download
+    """
+    system = platform.system().lower()
+
+    if system not in ["windows", "linux"]:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=(
+                f"Platform {system} not supported for llamacpp. "
+                "Supported: Windows, Ubuntu Linux"
+            ),
+        )
+
+    if system == "linux":
+        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
+        try:
+            with open("/etc/os-release", "r", encoding="utf-8") as f:
+                os_info = f.read().lower()
+                if "ubuntu" not in os_info and "debian" not in os_info:
+                    logging.warning(
+                        "llamacpp binaries are built for Ubuntu. "
+                        "Compatibility with other Linux distributions is not guaranteed."
+                    )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logging.warning(
+                "Could not determine Linux distribution (%s). "
+                "llamacpp binaries are built for Ubuntu.",
+                str(e),
+            )
 
 
 class LlamaTelemetry:
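Taken together, these helpers replace the old Windows-only module constants (LLAMA_SERVER_EXE_DIR / LLAMA_SERVER_EXE_PATH). A minimal usage sketch, assuming the signatures exactly as added above; the filenames follow from the templates and the pinned LLAMA_VERSION = "b5699":

# Sketch only: how the new helpers are meant to be combined.
validate_platform_support()  # raises HTTPException on unsupported platforms
exe_dir, exe_path = get_llama_server_paths()
# exe_path resolves to llama_server\llama-server.exe on Windows, or
# llama_server/build/bin/llama-server (falling back to llama_server/llama-server) on Ubuntu
url, filename = get_binary_url_and_filename(LLAMA_VERSION)
# With LLAMA_VERSION = "b5699" this selects llama-b5699-bin-ubuntu-vulkan-x64.zip on Linux
# and llama-b5699-bin-win-vulkan-x64.zip on Windows, plus the matching ggml-org/llama.cpp release URL.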
@@ -66,10 +130,21 @@ class LlamaTelemetry:
         Parse telemetry data from llama server output lines.
         """
 
+        # Parse Vulkan device detection
+        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+        if vulkan_match:
+            device_count = int(vulkan_match.group(1))
+            if device_count > 0:
+                logging.info(
+                    f"GPU acceleration active: {device_count} Vulkan device(s) "
+                    "detected by llama-server"
+                )
+            return
+
         # Parse prompt evaluation line
         prompt_match = re.search(
-            # pylint: disable=C0301
-            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if prompt_match:
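For reference, the new Vulkan branch only needs the device count from the ggml_vulkan banner. The line below is synthetic, constructed to match the pattern above rather than copied from real llama-server output:

import re

# Synthetic example line (not captured output), shaped like the banner the regex targets
line = "ggml_vulkan: Found 1 Vulkan devices:"
match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
if match:
    print(int(match.group(1)))  # -> 1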
@@ -83,7 +158,8 @@ class LlamaTelemetry:
 
         # Parse generation evaluation line
         eval_match = re.search(
-            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if eval_match:
@@ -169,8 +245,11 @@ def _launch_llama_subprocess(
     Launch llama server subprocess with GPU or CPU configuration
     """
 
+    # Get the current executable path (handles both Windows and Ubuntu structures)
+    _, exe_path = get_llama_server_paths()
+
     # Build the base command
-    base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+    base_command = [exe_path, "-m", snapshot_files["variant"]]
     if "mmproj" in snapshot_files:
         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
     if not use_gpu:
@@ -183,13 +262,33 @@ def _launch_llama_subprocess(
     # Add port and jinja to enable tool use
     base_command.extend(["--port", str(telemetry.port), "--jinja"])
 
+    # Use legacy reasoning formatting, since not all apps support the new
+    # reasoning_content field
+    base_command.extend(["--reasoning-format", "none"])
+
     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
     command = base_command + ["-ngl", ngl_value]
 
+    # Set up environment with library path for Linux
+    env = os.environ.copy()
+    if platform.system().lower() == "linux":
+        lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+        current_ld_path = env.get("LD_LIBRARY_PATH", "")
+        if current_ld_path:
+            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+        else:
+            env["LD_LIBRARY_PATH"] = lib_dir
+        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
     # Start subprocess with output capture
     process = subprocess.Popen(
-        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
     )
 
     # Start background thread to log subprocess output
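On Linux, the executable's own directory is prepended to LD_LIBRARY_PATH so the shared libraries shipped alongside llama-server in the release zip are found at launch time. A standalone sketch of the same prepend logic, using a hypothetical /opt/llama_server path purely for illustration:

import os

def with_lib_path(env: dict, lib_dir: str) -> dict:
    # Prepend lib_dir to LD_LIBRARY_PATH, keeping any existing value (mirrors the diff above)
    env = dict(env)
    current = env.get("LD_LIBRARY_PATH", "")
    env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current}" if current else lib_dir
    return env

child_env = with_lib_path(os.environ.copy(), "/opt/llama_server")  # hypothetical directory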
@@ -203,15 +302,30 @@ def _launch_llama_subprocess(
     return process
 
 
-def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
+def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
+
+    # Validate platform support before proceeding
+    validate_platform_support()
+
+    # Get platform-specific paths at runtime
+    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
+
+    # Check whether the llamacpp install needs an upgrade
+    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
+    if os.path.exists(version_txt_path):
+        with open(version_txt_path, "r", encoding="utf-8") as f:
+            llamacpp_installed_version = f.read()
+
+        if llamacpp_installed_version != LLAMA_VERSION:
+            # Remove the existing install, which will trigger a new install
+            # in the next code block
+            shutil.rmtree(llama_server_exe_dir)
+
     # Download llama.cpp server if it isn't already available
-    if not os.path.exists(LLAMA_SERVER_EXE_DIR):
+    if not os.path.exists(llama_server_exe_dir):
         # Download llama.cpp server zip
-        # pylint: disable=C0301
-        llama_zip_url = f"https://github.com/ggml-org/llama.cpp/releases/download/{LLAMA_VERSION}/llama-{LLAMA_VERSION}-bin-win-vulkan-x64.zip"
-        llama_zip_path = os.path.join(
-            os.path.dirname(sys.executable), "llama-server.zip"
-        )
+        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
+        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
 
         with requests.get(llama_zip_url, stream=True) as r:
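The hunk above puts a version gate in front of the existing download branch: version.txt is compared against LLAMA_VERSION and a stale install is deleted so the download runs again; the next hunk extracts the zip, sets execute permissions on Linux, and records the installed version. A condensed sketch of that flow, assuming the helpers and the LLAMA_VERSION constant defined earlier in this file (ensure_llama_server is a hypothetical name, not part of the diff):

import os
import shutil

def ensure_llama_server():
    # Condensed restatement of the upgrade/install gate shown in these hunks (sketch only)
    exe_dir, exe_path = get_llama_server_paths()
    version_txt = os.path.join(exe_dir, "version.txt")

    # A stale version.txt forces the download branch to run again
    if os.path.exists(version_txt):
        with open(version_txt, "r", encoding="utf-8") as f:
            if f.read() != LLAMA_VERSION:
                shutil.rmtree(exe_dir)

    if not os.path.exists(exe_dir):
        # download + extract + chmod on Linux, then record the installed version
        ...

    return exe_path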
@@ -221,12 +335,23 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
                     f.write(chunk)
 
         # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {LLAMA_SERVER_EXE_DIR}")
+        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(LLAMA_SERVER_EXE_DIR)
+            zip_ref.extractall(llama_server_exe_dir)
+
+        # Make executable on Linux - need to update paths after extraction
+        if platform.system().lower() == "linux":
+            # Re-get the paths since extraction might have changed the directory structure
+            _, updated_exe_path = get_llama_server_paths()
+            if os.path.exists(updated_exe_path):
+                os.chmod(updated_exe_path, 0o755)
+                logging.info(f"Set executable permissions for {updated_exe_path}")
+            else:
+                logging.warning(
+                    f"Could not find llama-server executable at {updated_exe_path}"
+                )
 
         # Save version.txt
-        version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
         with open(version_txt_path, "w", encoding="utf-8") as vf:
             vf.write(LLAMA_VERSION)
 
@@ -239,7 +364,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
     # Start the llama-serve.exe process
-    logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
+    logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")
 
     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
@@ -255,7 +380,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
         logging.warning(
-            f"Loading {model_reference} on GPU didn't work, re-attempting on CPU"
+            f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
        )
 
         llama_server_process = _launch_llama_subprocess(
@@ -271,7 +396,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     if llama_server_process.poll():
         raise HTTPException(
             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {model_reference} with llama.cpp",
+            detail=f"Failed to load {model_config.model_name} with llama.cpp",
         )
 
     return llama_server_process