lemonade-sdk 7.0.3__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (55)
  1. lemonade/api.py +3 -3
  2. lemonade/cli.py +11 -17
  3. lemonade/common/build.py +0 -47
  4. lemonade/common/network.py +50 -0
  5. lemonade/common/status.py +2 -21
  6. lemonade/common/system_info.py +19 -4
  7. lemonade/profilers/memory_tracker.py +3 -1
  8. lemonade/tools/accuracy.py +3 -4
  9. lemonade/tools/adapter.py +1 -2
  10. lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
  11. lemonade/tools/huggingface/load.py +235 -0
  12. lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
  13. lemonade/tools/humaneval.py +9 -3
  14. lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
  15. lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
  16. lemonade/tools/mmlu.py +7 -15
  17. lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
  18. lemonade/tools/oga/utils.py +423 -0
  19. lemonade/tools/perplexity.py +4 -3
  20. lemonade/tools/prompt.py +2 -1
  21. lemonade/tools/quark/quark_load.py +2 -1
  22. lemonade/tools/quark/quark_quantize.py +5 -5
  23. lemonade/tools/report/table.py +3 -3
  24. lemonade/tools/server/llamacpp.py +159 -34
  25. lemonade/tools/server/serve.py +169 -147
  26. lemonade/tools/server/static/favicon.ico +0 -0
  27. lemonade/tools/server/static/styles.css +568 -0
  28. lemonade/tools/server/static/webapp.html +439 -0
  29. lemonade/tools/server/tray.py +458 -0
  30. lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
  31. lemonade/tools/server/utils/system_tray.py +395 -0
  32. lemonade/tools/server/{instructions.py → webapp.py} +4 -10
  33. lemonade/version.py +1 -1
  34. lemonade_install/install.py +46 -28
  35. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
  36. lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
  37. lemonade_server/cli.py +182 -27
  38. lemonade_server/model_manager.py +192 -20
  39. lemonade_server/pydantic_models.py +9 -4
  40. lemonade_server/server_models.json +5 -3
  41. lemonade/common/analyze_model.py +0 -26
  42. lemonade/common/labels.py +0 -61
  43. lemonade/common/onnx_helpers.py +0 -176
  44. lemonade/common/plugins.py +0 -10
  45. lemonade/common/tensor_helpers.py +0 -83
  46. lemonade/tools/server/static/instructions.html +0 -262
  47. lemonade_sdk-7.0.3.dist-info/RECORD +0 -69
  48. /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
  49. /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
  50. /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
  51. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
  52. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
  53. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
  54. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
  55. {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,8 @@ import subprocess
 import zipfile
 import re
 import threading
+import platform
+import shutil

 import requests
 from tabulate import tabulate
@@ -14,21 +16,83 @@ from fastapi.responses import StreamingResponse

 from openai import OpenAI

-from lemonade_server.pydantic_models import ChatCompletionRequest
+from lemonade_server.pydantic_models import ChatCompletionRequest, PullConfig
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.port_utils import find_free_port
+from lemonade.tools.server.utils.port import find_free_port

-LLAMA_VERSION = "b5543"
+LLAMA_VERSION = "b5699"

-LLAMA_SERVER_EXE_DIR = os.path.join(
-    os.path.dirname(sys.executable),
-    "llama_server",
-)

-LLAMA_SERVER_EXE_PATH = os.path.join(
-    LLAMA_SERVER_EXE_DIR,
-    "llama-server.exe",
-)
+def get_llama_server_paths():
+    """
+    Get platform-specific paths for llama server directory and executable
+    """
+    base_dir = os.path.join(os.path.dirname(sys.executable), "llama_server")
+
+    if platform.system().lower() == "windows":
+        return base_dir, os.path.join(base_dir, "llama-server.exe")
+    else:  # Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+        build_bin_path = os.path.join(base_dir, "build", "bin", "llama-server")
+        if os.path.exists(build_bin_path):
+            return base_dir, build_bin_path
+        else:
+            # Fallback to root directory
+            return base_dir, os.path.join(base_dir, "llama-server")
+
+
+def get_binary_url_and_filename(version):
+    """
+    Get the appropriate binary URL and filename based on platform
+    """
+    system = platform.system().lower()
+
+    if system == "windows":
+        filename = f"llama-{version}-bin-win-vulkan-x64.zip"
+    elif system == "linux":
+        filename = f"llama-{version}-bin-ubuntu-vulkan-x64.zip"
+    else:
+        raise NotImplementedError(
+            f"Platform {system} not supported for llamacpp. Supported: Windows, Ubuntu Linux"
+        )
+
+    url = (
+        f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
+    )
+    return url, filename
+
+
+def validate_platform_support():
+    """
+    Validate platform support before attempting download
+    """
+    system = platform.system().lower()
+
+    if system not in ["windows", "linux"]:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=(
+                f"Platform {system} not supported for llamacpp. "
+                "Supported: Windows, Ubuntu Linux"
+            ),
+        )
+
+    if system == "linux":
+        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
+        try:
+            with open("/etc/os-release", "r", encoding="utf-8") as f:
+                os_info = f.read().lower()
+                if "ubuntu" not in os_info and "debian" not in os_info:
+                    logging.warning(
+                        "llamacpp binaries are built for Ubuntu. "
+                        "Compatibility with other Linux distributions is not guaranteed."
+                    )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logging.warning(
+                "Could not determine Linux distribution (%s). "
+                "llamacpp binaries are built for Ubuntu.",
+                str(e),
+            )


 class LlamaTelemetry:
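
Not part of the diff: a minimal usage sketch of the new platform helpers added above, importing from the lemonade/tools/server/llamacpp.py module shown in the file list. The values in the comments follow directly from the code in this hunk.

from lemonade.tools.server.llamacpp import get_binary_url_and_filename

# The release asset name depends only on platform.system(); values below mirror the hunk.
url, filename = get_binary_url_and_filename("b5699")
# Windows: filename == "llama-b5699-bin-win-vulkan-x64.zip"
# Linux:   filename == "llama-b5699-bin-ubuntu-vulkan-x64.zip"
# url == f"https://github.com/ggml-org/llama.cpp/releases/download/b5699/{filename}"
print(url)
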
@@ -66,10 +130,21 @@ class LlamaTelemetry:
         Parse telemetry data from llama server output lines.
         """

+        # Parse Vulkan device detection
+        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+        if vulkan_match:
+            device_count = int(vulkan_match.group(1))
+            if device_count > 0:
+                logging.info(
+                    f"GPU acceleration active: {device_count} Vulkan device(s) "
+                    "detected by llama-server"
+                )
+                return
+
         # Parse prompt evaluation line
         prompt_match = re.search(
-            # pylint: disable=C0301
-            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if prompt_match:
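
Not part of the diff: a sketch of the kind of llama-server timing line the prompt-eval regex above is written to capture. The sample string is illustrative, not taken from a real run.

import re

sample = (
    "prompt eval time =     94.25 ms /    12 tokens "
    "(    7.85 ms per token,   127.32 tokens per second)"
)
match = re.search(
    r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
    r"([\d.]+)\s*tokens per second",
    sample,
)
if match:
    eval_ms, tokens, tps = match.groups()  # "94.25", "12", "127.32"
    print(eval_ms, tokens, tps)
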
@@ -83,7 +158,8 @@ class LlamaTelemetry:

         # Parse generation evaluation line
         eval_match = re.search(
-            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?([\d.]+)\s*tokens per second",
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if eval_match:
@@ -145,16 +221,14 @@ def _log_subprocess_output(
             break


-def _wait_for_load(
-    llama_server_process: subprocess.Popen, port: int, fail_message: str
-):
+def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
     status_code = None
     while not llama_server_process.poll() and status_code != 200:
         health_url = f"http://localhost:{port}/health"
         try:
             health_response = requests.get(health_url)
         except requests.exceptions.ConnectionError:
-            logging.warning(fail_message)
+            logging.debug("Not able to connect to llama-server yet, will retry")
         else:
             status_code = health_response.status_code
             logging.debug(
@@ -171,8 +245,11 @@ def _launch_llama_subprocess(
     Launch llama server subprocess with GPU or CPU configuration
     """

+    # Get the current executable path (handles both Windows and Ubuntu structures)
+    _, exe_path = get_llama_server_paths()
+
     # Build the base command
-    base_command = [LLAMA_SERVER_EXE_PATH, "-m", snapshot_files["variant"]]
+    base_command = [exe_path, "-m", snapshot_files["variant"]]
     if "mmproj" in snapshot_files:
         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
     if not use_gpu:
@@ -185,13 +262,33 @@ def _launch_llama_subprocess(
     # Add port and jinja to enable tool use
     base_command.extend(["--port", str(telemetry.port), "--jinja"])

+    # Use legacy reasoning formatting, since not all apps support the new
+    # reasoning_content field
+    base_command.extend(["--reasoning-format", "none"])
+
     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
     command = base_command + ["-ngl", ngl_value]

+    # Set up environment with library path for Linux
+    env = os.environ.copy()
+    if platform.system().lower() == "linux":
+        lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+        current_ld_path = env.get("LD_LIBRARY_PATH", "")
+        if current_ld_path:
+            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+        else:
+            env["LD_LIBRARY_PATH"] = lib_dir
+        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
     # Start subprocess with output capture
     process = subprocess.Popen(
-        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
     )

     # Start background thread to log subprocess output
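
Not part of the diff: roughly the llama-server command line the GPU attempt assembles after this change. Paths and port are placeholders, not values from the diff, and the optional --mmproj flag is omitted.

# Placeholder paths/port; flags match those added or kept in this function.
command = [
    "/opt/python/llama_server/build/bin/llama-server",  # exe_path from get_llama_server_paths()
    "-m", "/path/to/model.gguf",                        # snapshot_files["variant"]
    "--port", "8123",                                   # telemetry.port
    "--jinja",                                          # enable tool use
    "--reasoning-format", "none",                       # legacy reasoning formatting
    "-ngl", "99",                                       # all layers offloaded on the GPU attempt
]
print(" ".join(command))
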
@@ -205,15 +302,30 @@ def _launch_llama_subprocess(
     return process


-def server_load(model_config: dict, model_reference: str, telemetry: LlamaTelemetry):
+def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
+
+    # Validate platform support before proceeding
+    validate_platform_support()
+
+    # Get platform-specific paths at runtime
+    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
+
+    # Check whether the llamacpp install needs an upgrade
+    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
+    if os.path.exists(version_txt_path):
+        with open(version_txt_path, "r", encoding="utf-8") as f:
+            llamacpp_installed_version = f.read()
+
+        if llamacpp_installed_version != LLAMA_VERSION:
+            # Remove the existing install, which will trigger a new install
+            # in the next code block
+            shutil.rmtree(llama_server_exe_dir)
+
     # Download llama.cpp server if it isn't already available
-    if not os.path.exists(LLAMA_SERVER_EXE_DIR):
+    if not os.path.exists(llama_server_exe_dir):
         # Download llama.cpp server zip
-        # pylint: disable=C0301
-        llama_zip_url = f"https://github.com/ggml-org/llama.cpp/releases/download/{LLAMA_VERSION}/llama-{LLAMA_VERSION}-bin-win-vulkan-x64.zip"
-        llama_zip_path = os.path.join(
-            os.path.dirname(sys.executable), "llama-server.zip"
-        )
+        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
+        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
         logging.info(f"Downloading llama.cpp server from {llama_zip_url}")

         with requests.get(llama_zip_url, stream=True) as r:
@@ -223,12 +335,23 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
                    f.write(chunk)

         # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {LLAMA_SERVER_EXE_DIR}")
+        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(LLAMA_SERVER_EXE_DIR)
+            zip_ref.extractall(llama_server_exe_dir)
+
+        # Make executable on Linux - need to update paths after extraction
+        if platform.system().lower() == "linux":
+            # Re-get the paths since extraction might have changed the directory structure
+            _, updated_exe_path = get_llama_server_paths()
+            if os.path.exists(updated_exe_path):
+                os.chmod(updated_exe_path, 0o755)
+                logging.info(f"Set executable permissions for {updated_exe_path}")
+            else:
+                logging.warning(
+                    f"Could not find llama-server executable at {updated_exe_path}"
+                )

         # Save version.txt
-        version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
         with open(version_txt_path, "w", encoding="utf-8") as vf:
             vf.write(LLAMA_VERSION)

@@ -241,7 +364,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     logging.debug(f"GGUF file paths: {snapshot_files}")

     # Start the llama-serve.exe process
-    logging.debug(f"Using llama_server for GGUF model: {LLAMA_SERVER_EXE_PATH}")
+    logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")

     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
@@ -252,11 +375,14 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     _wait_for_load(
         llama_server_process,
         telemetry.port,
-        f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
     )

     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
+        logging.warning(
+            f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+        )
+
         llama_server_process = _launch_llama_subprocess(
             snapshot_files, use_gpu=False, telemetry=telemetry
         )
@@ -265,13 +391,12 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     _wait_for_load(
         llama_server_process,
         telemetry.port,
-        f"Loading {model_reference} on CPU didn't work",
     )

     if llama_server_process.poll():
         raise HTTPException(
             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {model_reference} with llama.cpp",
+            detail=f"Failed to load {model_config.model_name} with llama.cpp",
         )

     return llama_server_process
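
Not part of the diff: once server_load has llama-server running and Lemonade is proxying it, a client can exercise the model through the OpenAI-compatible endpoint. A minimal sketch, assuming the Lemonade server listens on localhost:8000 with an /api/v1 prefix and that the GGUF model named here has been pulled; the address, prefix, and model name are assumptions, not taken from this diff.

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/api/v1",  # assumed Lemonade server address and prefix
    api_key="lemonade",  # local server; the key is typically ignored
)
completion = client.chat.completions.create(
    model="Qwen2.5-0.5B-Instruct-GGUF",  # hypothetical model name for illustration
    messages=[{"role": "user", "content": "Say hello in five words."}],
)
print(completion.choices[0].message.content)
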