lemonade-sdk 7.0.3__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/api.py +3 -3
- lemonade/cli.py +11 -17
- lemonade/common/build.py +0 -47
- lemonade/common/network.py +50 -0
- lemonade/common/status.py +2 -21
- lemonade/common/system_info.py +19 -4
- lemonade/profilers/memory_tracker.py +3 -1
- lemonade/tools/accuracy.py +3 -4
- lemonade/tools/adapter.py +1 -2
- lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
- lemonade/tools/humaneval.py +9 -3
- lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
- lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
- lemonade/tools/mmlu.py +7 -15
- lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
- lemonade/tools/oga/utils.py +423 -0
- lemonade/tools/perplexity.py +4 -3
- lemonade/tools/prompt.py +2 -1
- lemonade/tools/quark/quark_load.py +2 -1
- lemonade/tools/quark/quark_quantize.py +5 -5
- lemonade/tools/report/table.py +3 -3
- lemonade/tools/server/llamacpp.py +159 -34
- lemonade/tools/server/serve.py +169 -147
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/styles.css +568 -0
- lemonade/tools/server/static/webapp.html +439 -0
- lemonade/tools/server/tray.py +458 -0
- lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
- lemonade/tools/server/utils/system_tray.py +395 -0
- lemonade/tools/server/{instructions.py → webapp.py} +4 -10
- lemonade/version.py +1 -1
- lemonade_install/install.py +46 -28
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
- lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
- lemonade_server/cli.py +182 -27
- lemonade_server/model_manager.py +192 -20
- lemonade_server/pydantic_models.py +9 -4
- lemonade_server/server_models.json +5 -3
- lemonade/common/analyze_model.py +0 -26
- lemonade/common/labels.py +0 -61
- lemonade/common/onnx_helpers.py +0 -176
- lemonade/common/plugins.py +0 -10
- lemonade/common/tensor_helpers.py +0 -83
- lemonade/tools/server/static/instructions.html +0 -262
- lemonade_sdk-7.0.3.dist-info/RECORD +0 -69
- /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
- /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
- /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.3.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
lemonade/tools/server/llamacpp.py
@@ -6,6 +6,8 @@ import subprocess
 import zipfile
 import re
 import threading
+import platform
+import shutil
 
 import requests
 from tabulate import tabulate
@@ -14,21 +16,83 @@ from fastapi.responses import StreamingResponse
 
 from openai import OpenAI
 
-from lemonade_server.pydantic_models import ChatCompletionRequest
+from lemonade_server.pydantic_models import ChatCompletionRequest, PullConfig
 from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.
+from lemonade.tools.server.utils.port import find_free_port
 
-LLAMA_VERSION = "
+LLAMA_VERSION = "b5699"
 
-LLAMA_SERVER_EXE_DIR = os.path.join(
-    os.path.dirname(sys.executable),
-    "llama_server",
-)
 
-
-
-
-
+def get_llama_server_paths():
+    """
+    Get platform-specific paths for llama server directory and executable
+    """
+    base_dir = os.path.join(os.path.dirname(sys.executable), "llama_server")
+
+    if platform.system().lower() == "windows":
+        return base_dir, os.path.join(base_dir, "llama-server.exe")
+    else:  # Linux/Ubuntu
+        # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+        build_bin_path = os.path.join(base_dir, "build", "bin", "llama-server")
+        if os.path.exists(build_bin_path):
+            return base_dir, build_bin_path
+        else:
+            # Fallback to root directory
+            return base_dir, os.path.join(base_dir, "llama-server")
+
+
+def get_binary_url_and_filename(version):
+    """
+    Get the appropriate binary URL and filename based on platform
+    """
+    system = platform.system().lower()
+
+    if system == "windows":
+        filename = f"llama-{version}-bin-win-vulkan-x64.zip"
+    elif system == "linux":
+        filename = f"llama-{version}-bin-ubuntu-vulkan-x64.zip"
+    else:
+        raise NotImplementedError(
+            f"Platform {system} not supported for llamacpp. Supported: Windows, Ubuntu Linux"
+        )
+
+    url = (
+        f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
+    )
+    return url, filename
+
+
+def validate_platform_support():
+    """
+    Validate platform support before attempting download
+    """
+    system = platform.system().lower()
+
+    if system not in ["windows", "linux"]:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=(
+                f"Platform {system} not supported for llamacpp. "
+                "Supported: Windows, Ubuntu Linux"
+            ),
+        )
+
+    if system == "linux":
+        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
+        try:
+            with open("/etc/os-release", "r", encoding="utf-8") as f:
+                os_info = f.read().lower()
+                if "ubuntu" not in os_info and "debian" not in os_info:
+                    logging.warning(
+                        "llamacpp binaries are built for Ubuntu. "
+                        "Compatibility with other Linux distributions is not guaranteed."
+                    )
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            logging.warning(
+                "Could not determine Linux distribution (%s). "
+                "llamacpp binaries are built for Ubuntu.",
+                str(e),
+            )
 
 
 class LlamaTelemetry:
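The 8.0.0 release pins LLAMA_VERSION to "b5699" and selects a platform-specific Vulkan build of llama.cpp at install time. As a rough illustration of the URL scheme the new get_binary_url_and_filename() helper produces (a sketch only; it just mirrors the filenames in the hunk above):

    # Sketch: the two release assets the helper can select, following the
    # llama.cpp GitHub release layout shown in the diff above.
    version = "b5699"
    for filename in (
        f"llama-{version}-bin-win-vulkan-x64.zip",      # Windows
        f"llama-{version}-bin-ubuntu-vulkan-x64.zip",   # Ubuntu Linux
    ):
        print(f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}")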
@@ -66,10 +130,21 @@ class LlamaTelemetry:
         Parse telemetry data from llama server output lines.
         """
 
+        # Parse Vulkan device detection
+        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
+        if vulkan_match:
+            device_count = int(vulkan_match.group(1))
+            if device_count > 0:
+                logging.info(
+                    f"GPU acceleration active: {device_count} Vulkan device(s) "
+                    "detected by llama-server"
+                )
+            return
+
         # Parse prompt evaluation line
         prompt_match = re.search(
-
-            r"
+            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if prompt_match:
@@ -83,7 +158,8 @@ class LlamaTelemetry:
 
         # Parse generation evaluation line
         eval_match = re.search(
-            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?
+            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
+            r"([\d.]+)\s*tokens per second",
             line,
         )
         if eval_match:
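Both telemetry patterns now split the regex across two raw-string literals, so the full expression ends in .*?([\d.]+)\s*tokens per second. A minimal sketch of how the prompt pattern extracts its numbers; the sample line below is fabricated to match the pattern, and real llama-server timing output may differ in spacing:

    import re

    # Fabricated example line shaped like llama-server timing output
    line = (
        "prompt eval time =     91.23 ms /    12 tokens "
        "(    7.60 ms per token,   131.54 tokens per second)"
    )
    match = re.search(
        r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
        r"([\d.]+)\s*tokens per second",
        line,
    )
    if match:
        time_ms, n_tokens, tokens_per_second = match.groups()
        print(time_ms, n_tokens, tokens_per_second)  # 91.23 12 131.54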
@@ -145,16 +221,14 @@ def _log_subprocess_output(
             break
 
 
-def _wait_for_load(
-    llama_server_process: subprocess.Popen, port: int, fail_message: str
-):
+def _wait_for_load(llama_server_process: subprocess.Popen, port: int):
     status_code = None
     while not llama_server_process.poll() and status_code != 200:
         health_url = f"http://localhost:{port}/health"
         try:
             health_response = requests.get(health_url)
         except requests.exceptions.ConnectionError:
-            logging.
+            logging.debug("Not able to connect to llama-server yet, will retry")
         else:
             status_code = health_response.status_code
             logging.debug(
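The simplified _wait_for_load() drops the fail_message parameter (the warning now lives at the call site, see the server_load() hunks below) and keeps polling the server's /health endpoint until it answers 200 or the subprocess exits. A standalone sketch of that readiness loop, assuming a generic Popen handle and port; the sleep is added here only to avoid a busy loop and is not part of the original:

    import logging
    import time
    import requests

    def wait_for_health(process, port, retry_delay_s=1.0):
        """Poll http://localhost:{port}/health until it returns 200 or the process exits."""
        while process.poll() is None:
            try:
                if requests.get(f"http://localhost:{port}/health").status_code == 200:
                    return True
            except requests.exceptions.ConnectionError:
                logging.debug("Not able to connect to llama-server yet, will retry")
            time.sleep(retry_delay_s)
        return False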
@@ -171,8 +245,11 @@ def _launch_llama_subprocess(
     Launch llama server subprocess with GPU or CPU configuration
     """
 
+    # Get the current executable path (handles both Windows and Ubuntu structures)
+    _, exe_path = get_llama_server_paths()
+
     # Build the base command
-    base_command = [
+    base_command = [exe_path, "-m", snapshot_files["variant"]]
     if "mmproj" in snapshot_files:
         base_command.extend(["--mmproj", snapshot_files["mmproj"]])
     if not use_gpu:
@@ -185,13 +262,33 @@ def _launch_llama_subprocess(
     # Add port and jinja to enable tool use
     base_command.extend(["--port", str(telemetry.port), "--jinja"])
 
+    # Use legacy reasoning formatting, since not all apps support the new
+    # reasoning_content field
+    base_command.extend(["--reasoning-format", "none"])
+
     # Configure GPU layers: 99 for GPU, 0 for CPU-only
     ngl_value = "99" if use_gpu else "0"
     command = base_command + ["-ngl", ngl_value]
 
+    # Set up environment with library path for Linux
+    env = os.environ.copy()
+    if platform.system().lower() == "linux":
+        lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
+        current_ld_path = env.get("LD_LIBRARY_PATH", "")
+        if current_ld_path:
+            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+        else:
+            env["LD_LIBRARY_PATH"] = lib_dir
+        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+
     # Start subprocess with output capture
     process = subprocess.Popen(
-        command,
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        env=env,
     )
 
     # Start background thread to log subprocess output
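Taken together, the GPU and CPU attempts differ only in the -ngl value, every launch now also passes --reasoning-format none alongside the existing --port and --jinja flags, and on Linux the subprocess inherits an LD_LIBRARY_PATH pointing at the directory of the extracted binary. A rough sketch of the resulting command line, using placeholder paths (not real files) and omitting the CPU-only flags that the hunk does not show:

    # Placeholder paths for illustration only
    exe_path = "/opt/venv/llama_server/build/bin/llama-server"
    snapshot_files = {"variant": "/models/example.gguf"}

    def build_command(use_gpu, port=8080):
        command = [exe_path, "-m", snapshot_files["variant"]]
        command += ["--port", str(port), "--jinja"]
        command += ["--reasoning-format", "none"]      # legacy reasoning formatting
        command += ["-ngl", "99" if use_gpu else "0"]  # 99 layers on GPU, 0 on CPU
        return command

    print(build_command(use_gpu=True))
    print(build_command(use_gpu=False))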
@@ -205,15 +302,30 @@ def _launch_llama_subprocess(
     return process
 
 
-def server_load(model_config:
+def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
+
+    # Validate platform support before proceeding
+    validate_platform_support()
+
+    # Get platform-specific paths at runtime
+    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
+
+    # Check whether the llamacpp install needs an upgrade
+    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
+    if os.path.exists(version_txt_path):
+        with open(version_txt_path, "r", encoding="utf-8") as f:
+            llamacpp_installed_version = f.read()
+
+        if llamacpp_installed_version != LLAMA_VERSION:
+            # Remove the existing install, which will trigger a new install
+            # in the next code block
+            shutil.rmtree(llama_server_exe_dir)
+
     # Download llama.cpp server if it isn't already available
-    if not os.path.exists(
+    if not os.path.exists(llama_server_exe_dir):
         # Download llama.cpp server zip
-
-
-        llama_zip_path = os.path.join(
-            os.path.dirname(sys.executable), "llama-server.zip"
-        )
+        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
+        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
         logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
 
         with requests.get(llama_zip_url, stream=True) as r:
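server_load() now records the llama.cpp version it extracted in a version.txt file (written later in this hunk sequence) and removes the whole install directory when the recorded version no longer matches LLAMA_VERSION, so the download block runs again. A condensed sketch of that upgrade check, with an illustrative helper name and paths:

    import os
    import shutil

    def needs_fresh_install(install_dir, expected_version):
        """Return True when the llama-server binaries should be (re)downloaded."""
        version_txt = os.path.join(install_dir, "version.txt")
        if os.path.exists(version_txt):
            with open(version_txt, "r", encoding="utf-8") as f:
                if f.read() != expected_version:
                    # Stale install: remove it so the download runs again
                    shutil.rmtree(install_dir)
        return not os.path.exists(install_dir)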
@@ -223,12 +335,23 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
                     f.write(chunk)
 
         # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {
+        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
         with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(
+            zip_ref.extractall(llama_server_exe_dir)
+
+        # Make executable on Linux - need to update paths after extraction
+        if platform.system().lower() == "linux":
+            # Re-get the paths since extraction might have changed the directory structure
+            _, updated_exe_path = get_llama_server_paths()
+            if os.path.exists(updated_exe_path):
+                os.chmod(updated_exe_path, 0o755)
+                logging.info(f"Set executable permissions for {updated_exe_path}")
+            else:
+                logging.warning(
+                    f"Could not find llama-server executable at {updated_exe_path}"
+                )
 
         # Save version.txt
-        version_txt_path = os.path.join(LLAMA_SERVER_EXE_DIR, "version.txt")
         with open(version_txt_path, "w", encoding="utf-8") as vf:
             vf.write(LLAMA_VERSION)
 
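Python's zipfile.extractall() does not restore Unix permissions, which is why the new Linux branch explicitly chmods the extracted binary. A tiny sketch of that step with a placeholder path:

    import os

    exe = "/opt/venv/llama_server/build/bin/llama-server"  # placeholder path
    if os.path.exists(exe):
        os.chmod(exe, 0o755)  # rwxr-xr-x, matching the 0o755 used in the diff above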
@@ -241,7 +364,7 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
     # Start the llama-serve.exe process
-    logging.debug(f"Using llama_server for GGUF model: {
+    logging.debug(f"Using llama_server for GGUF model: {llama_server_exe_path}")
 
     # Attempt loading on GPU first
     llama_server_process = _launch_llama_subprocess(
@@ -252,11 +375,14 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
     _wait_for_load(
         llama_server_process,
         telemetry.port,
-        f"Loading {model_reference} on GPU didn't work, re-attempting on CPU",
     )
 
     # If loading on GPU failed, try loading on CPU
     if llama_server_process.poll():
+        logging.warning(
+            f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
+        )
+
         llama_server_process = _launch_llama_subprocess(
             snapshot_files, use_gpu=False, telemetry=telemetry
         )
@@ -265,13 +391,12 @@ def server_load(model_config: dict, model_reference: str, telemetry: LlamaTeleme
         _wait_for_load(
             llama_server_process,
             telemetry.port,
-            f"Loading {model_reference} on CPU didn't work",
         )
 
     if llama_server_process.poll():
         raise HTTPException(
             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {
+            detail=f"Failed to load {model_config.model_name} with llama.cpp",
         )
 
     return llama_server_process