lemonade-sdk 8.1.10__py3-none-any.whl → 8.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk has been flagged as possibly problematic by the registry scanner.
Files changed (37)
  1. lemonade/cache.py +6 -1
  2. lemonade/common/status.py +4 -4
  3. lemonade/tools/bench.py +22 -1
  4. lemonade/tools/flm/__init__.py +1 -0
  5. lemonade/tools/flm/utils.py +255 -0
  6. lemonade/tools/llamacpp/bench.py +111 -23
  7. lemonade/tools/llamacpp/load.py +20 -1
  8. lemonade/tools/llamacpp/utils.py +210 -17
  9. lemonade/tools/oga/bench.py +0 -26
  10. lemonade/tools/report/table.py +6 -0
  11. lemonade/tools/server/flm.py +133 -0
  12. lemonade/tools/server/llamacpp.py +23 -5
  13. lemonade/tools/server/serve.py +260 -135
  14. lemonade/tools/server/static/js/chat.js +165 -82
  15. lemonade/tools/server/static/js/models.js +87 -54
  16. lemonade/tools/server/static/js/shared.js +9 -6
  17. lemonade/tools/server/static/logs.html +57 -0
  18. lemonade/tools/server/static/styles.css +159 -8
  19. lemonade/tools/server/static/webapp.html +28 -10
  20. lemonade/tools/server/tray.py +94 -38
  21. lemonade/tools/server/utils/macos_tray.py +226 -0
  22. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  23. lemonade/tools/server/webapp.py +4 -1
  24. lemonade/tools/server/wrapped_server.py +91 -25
  25. lemonade/version.py +1 -1
  26. lemonade_install/install.py +25 -2
  27. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/METADATA +10 -6
  28. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/RECORD +37 -32
  29. lemonade_server/cli.py +103 -14
  30. lemonade_server/model_manager.py +186 -45
  31. lemonade_server/pydantic_models.py +25 -1
  32. lemonade_server/server_models.json +175 -62
  33. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/WHEEL +0 -0
  34. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/entry_points.txt +0 -0
  35. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/LICENSE +0 -0
  36. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/NOTICE.md +0 -0
  37. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/top_level.txt +0 -0
lemonade/tools/llamacpp/utils.py

@@ -7,6 +7,7 @@ import zipfile
  from typing import Optional
  import subprocess
  import requests
+ import lemonade.common.build as build
  import lemonade.common.printing as printing
  from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
 
@@ -14,8 +15,9 @@ from lemonade.common.system_info import get_system_info
 
  from dotenv import set_key, load_dotenv
 
- LLAMA_VERSION_VULKAN = "b6431"
- LLAMA_VERSION_ROCM = "b1057"
+ LLAMA_VERSION_VULKAN = "b6510"
+ LLAMA_VERSION_ROCM = "b1066"
+ LLAMA_VERSION_METAL = "b6510"
 
 
  def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +128,12 @@ def get_llama_version(backend: str) -> str:
          return LLAMA_VERSION_ROCM
      elif backend == "vulkan":
          return LLAMA_VERSION_VULKAN
+     elif backend == "metal":
+         return LLAMA_VERSION_METAL
      else:
-         raise ValueError(f"Unsupported backend: {backend}")
+         raise ValueError(
+             f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+         )
 
 
  def get_llama_folder_path(backend: str):
@@ -142,10 +148,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
      Get path to platform-specific llama-server executable
      """
      base_dir = get_llama_folder_path(backend)
-     if platform.system().lower() == "windows":
+     system = platform.system().lower()
+
+     if system == "windows":
          return os.path.join(base_dir, f"{exe_name}.exe")
-     else:  # Linux/Ubuntu
-         # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+     else:  # Darwin/Linux/Ubuntu
+         # Check if executable exists in build/bin subdirectory
          build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
          if os.path.exists(build_bin_path):
              return build_bin_path
@@ -168,6 +176,13 @@ def get_llama_cli_exe_path(backend: str):
      return get_llama_exe_path("llama-cli", backend)
 
 
+ def get_llama_bench_exe_path(backend: str):
+     """
+     Get path to platform-specific llama-bench executable
+     """
+     return get_llama_exe_path("llama-bench", backend)
+
+
  def get_version_txt_path(backend: str):
      """
      Get path to text file that contains version information
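
For orientation, the new llama-bench helper resolves through the same platform branch as get_llama_exe_path() above: llama-bench.exe on Windows, otherwise the build/bin layout. A minimal sketch (base_dir stands in for get_llama_folder_path(backend); the non-Windows fallback path is an assumption, since that branch is cut off in the hunk above):

    import os
    import platform

    def sketch_llama_bench_path(base_dir: str) -> str:
        # Windows release archives ship a top-level llama-bench.exe
        if platform.system().lower() == "windows":
            return os.path.join(base_dir, "llama-bench.exe")
        # Darwin/Linux release zips usually place binaries under build/bin
        build_bin = os.path.join(base_dir, "build", "bin", "llama-bench")
        # Fallback to a top-level binary if build/bin is absent (assumed)
        return build_bin if os.path.exists(build_bin) else os.path.join(base_dir, "llama-bench")
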
@@ -223,8 +238,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
          raise NotImplementedError(
              f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
          )
+
+     elif backend == "metal":
+         # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+         repo = "ggml-org/llama.cpp"
+         version = LLAMA_VERSION_METAL
+         if system == "darwin":
+             if platform.machine().lower() in ["arm64", "aarch64"]:
+                 filename = f"llama-{version}-bin-macos-arm64.zip"
+             else:
+                 raise NotImplementedError(
+                     "Metal backend only supports Apple Silicon (ARM64) processors"
+                 )
+         else:
+             raise NotImplementedError(
+                 f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+             )
      else:
-         supported_backends = ["vulkan", "rocm"]
+         supported_backends = ["vulkan", "rocm", "metal"]
          raise NotImplementedError(
              f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
          )
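
As a quick sanity check on the new Metal branch: with LLAMA_VERSION_METAL = "b6510", the asset name resolves to llama-b6510-bin-macos-arm64.zip. The hunk does not show how repo, version, and filename are joined into a download URL; a hedged sketch assuming the standard GitHub release-asset layout (an assumption, not taken from this diff) would be:

    repo = "ggml-org/llama.cpp"
    version = "b6510"  # LLAMA_VERSION_METAL
    filename = f"llama-{version}-bin-macos-arm64.zip"

    # Assumed URL shape for a GitHub release asset; the real assembly lives in the
    # portion of get_binary_url_and_filename() not shown in this hunk.
    url = f"https://github.com/{repo}/releases/download/{version}/{filename}"
    # -> https://github.com/ggml-org/llama.cpp/releases/download/b6510/llama-b6510-bin-macos-arm64.zip
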
@@ -239,10 +270,10 @@ def validate_platform_support():
      """
      system = platform.system().lower()
 
-     if system not in ["windows", "linux"]:
+     if system not in ["windows", "linux", "darwin"]:
          raise NotImplementedError(
              f"Platform {system} not supported for llamacpp. "
-             "Supported: Windows, Ubuntu Linux"
+             "Supported: Windows, Ubuntu Linux, macOS"
          )
 
      if system == "linux":
@@ -341,6 +372,29 @@ def install_llamacpp(backend):
      if filename.endswith(".zip"):
          with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
              zip_ref.extractall(llama_server_exe_dir)
+
+         # On Unix-like systems (macOS/Linux), make executables executable
+         if platform.system().lower() in ["darwin", "linux"]:
+             import stat
+
+             # Find and make executable files executable
+             for root, dirs, files in os.walk(llama_server_exe_dir):
+                 for file in files:
+                     file_path = os.path.join(root, file)
+                     # Make files in bin/ directories executable
+                     if "bin" in root.split(os.sep) or file in [
+                         "llama-server",
+                         "llama-simple",
+                     ]:
+                         try:
+                             current_permissions = os.stat(file_path).st_mode
+                             os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                             logging.debug(f"Made {file_path} executable")
+                         except Exception as e:
+                             raise RuntimeError(
+                                 f"Failed to make {file_path} executable. This will prevent "
+                                 f"llama-server from starting. Error: {e}"
+                             )
      else:
          raise NotImplementedError(f"Unsupported archive format: {filename}")
 
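
One detail worth noting about the permission fix above: stat.S_IEXEC sets only the owner's execute bit, which is enough here because the user who installs the binaries is the one who runs them. A variant that mirrors chmod +x by also granting group/other execute, shown purely as a sketch and not as code from this release:

    import os
    import stat

    def make_executable(path: str) -> None:
        """Add execute permission for user, group, and other, keeping existing bits."""
        mode = os.stat(path).st_mode
        os.chmod(path, mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
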
@@ -360,6 +414,7 @@ def install_llamacpp(backend):
      exe_paths = [
          (get_llama_server_exe_path(backend), "llama-server"),
          (get_llama_cli_exe_path(backend), "llama-cli"),
+         (get_llama_bench_exe_path(backend), "llama-bench"),
      ]
 
      for exe_path, exe_name in exe_paths:
@@ -653,8 +708,10 @@ class LlamaCppAdapter(ModelAdapter):
          context_size,
          threads,
          executable,
+         bench_executable,
          reasoning=False,
          lib_dir=None,
+         state=None,
      ):
          super().__init__()
 
@@ -666,8 +723,10 @@ class LlamaCppAdapter(ModelAdapter):
          self.context_size = context_size
          self.threads = threads
          self.executable = os.path.normpath(executable)
+         self.bench_executable = os.path.normpath(bench_executable)
          self.reasoning = reasoning
          self.lib_dir = lib_dir
+         self.state = state
 
      def generate(
          self,
@@ -708,32 +767,54 @@ class LlamaCppAdapter(ModelAdapter):
              self.executable,
              "-m",
              self.model,
-             "--ctx-size",
+             "--ctx-size",  # size of the prompt context, 0 = loaded from model
              str(self.context_size),
-             "-n",
+             "-n",  # number of tokens to predict, -1 = infinity, =2 - until context filled
              str(n_predict),
-             "-t",
+             "-t",  # number of threads to use during generation
              str(self.threads),
              "-p",
              prompt,
+             "-b",  # logical maximum batch size
+             "1",
+             "-ub",  # physical maximum batch size
+             "1",
              "--temp",
              str(temperature),
              "--top-p",
              str(top_p),
              "--top-k",
              str(top_k),
-             "-e",
-             "-no-cnv",
-             "--reasoning-format",
+             "-e",  # process escape sequences
+             "--no-conversation",  # disable conversation mode
+             "--reasoning-format",  # leaves thoughts unparsed in message content
              "none",
          ]
 
+         # If prompt exceeds 500 characters, then use a file
+         if len(prompt) < 500:
+             cmd += ["-p", prompt]
+         else:
+             # Create prompt file in cache directory
+             prompt_file = os.path.join(
+                 build.output_dir(self.state.cache_dir, self.state.build_name),
+                 "prompt.txt",
+             )
+             with open(prompt_file, "w", encoding="utf-8") as file:
+                 file.write(prompt)
+             cmd += ["-f", prompt_file]
+
          # Configure GPU layers: 99 for GPU, 0 for CPU-only
          ngl_value = "99" if self.device == "igpu" else "0"
          cmd = cmd + ["-ngl", ngl_value]
 
          cmd = [str(m) for m in cmd]
 
+         # save llama-cli command
+         self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+             " ".join(cmd)
+         ]
+
          try:
              # Set up environment with library path for Linux
              env = os.environ.copy()
@@ -763,6 +844,15 @@ class LlamaCppAdapter(ModelAdapter):
              )
 
              raw_output, stderr = process.communicate(timeout=600)
+
+             # save llama-cli command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_cli_stderr = getattr(
+                 self.state, "llama_cli_stderr", []
+             ) + [
+                 [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+             ]
+
              if process.returncode != 0:
                  error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                  error_msg += f"Command: {' '.join(cmd)}\n"
@@ -827,7 +917,108 @@ class LlamaCppAdapter(ModelAdapter):
              return [output_text]
 
          except Exception as e:
-             error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
+             error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
              error_msg += f"Command: {' '.join(cmd)}"
              raise Exception(error_msg)
+
+     def benchmark(self, prompts, iterations, output_tokens):
+         """
+         Runs the llama-bench.exe tool to measure TTFT and TPS
+         """
+         cmd = [
+             self.bench_executable,
+             "-m",
+             self.model,
+             "-r",
+             iterations,
+             "-p",
+             ",".join([str(p) for p in prompts]),
+             "-n",
+             output_tokens,
+             "-t",
+             self.threads if self.threads > 0 else 16,
+             "-b",
+             1,
+             "-ub",
+             1,
+         ]
+         cmd = [str(m) for m in cmd]
+
+         # save llama-bench command
+         self.state.llama_bench_cmd = " ".join(cmd)
+
+         try:
+             # Set up environment with library path for Linux
+             env = os.environ.copy()
+
+             # Load environment variables from .env file in the executable directory
+             exe_dir = os.path.dirname(self.executable)
+             env_file_path = os.path.join(exe_dir, ".env")
+             if os.path.exists(env_file_path):
+                 load_dotenv(env_file_path, override=True)
+                 env.update(os.environ)
+
+             if self.lib_dir and os.name != "nt":  # Not Windows
+                 current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                 if current_ld_path:
+                     env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                 else:
+                     env["LD_LIBRARY_PATH"] = self.lib_dir
+
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 universal_newlines=True,
+                 encoding="utf-8",
+                 errors="replace",
+                 env=env,
+             )
+
+             raw_output, stderr = process.communicate(timeout=600)
+
+             # save llama-bench command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_bench_standard_output = raw_output.splitlines()
+
+             if process.returncode != 0:
+                 error_msg = (
+                     f"llama-bench.exe failed with return code {process.returncode}.\n"
+                 )
+                 error_msg += f"Command: {' '.join(cmd)}\n"
+                 error_msg += f"Error output:\n{stderr}\n"
+                 error_msg += f"Standard output:\n{raw_output}"
+                 raise Exception(error_msg)
+
+             if raw_output is None:
+                 raise Exception("No output received from llama-bench.exe process")
+
+             # Parse information from llama-bench.exe output
+             prompt_lengths = []
+             pp_tps = []
+             pp_tps_sd = []
+             tg_tps = None
+             tg_tps_sd = None
+
+             for line in self.state.llama_bench_standard_output:
+                 # Parse TPS information
+                 for p in prompts:
+                     if f"pp{p:d}" in line:
+                         parts = line.split("|")
+                         timings = parts[-2].strip().split(" ")
+                         prompt_lengths.append(p)
+                         pp_tps.append(float(timings[0]))
+                         pp_tps_sd.append(float(timings[-1]))
+                 if f"tg{output_tokens:d}" in line:
+                     parts = line.split("|")
+                     timings = parts[-2].strip().split(" ")
+                     tg_tps = float(timings[0])
+                     tg_tps_sd = float(timings[-1])
+
+             return prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd
+
+         except Exception as e:
+             error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
              error_msg += f"Command: {' '.join(cmd)}"
              raise Exception(error_msg)
 
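
The parser above relies on llama-bench emitting pipe-separated table rows in which the test column contains tokens such as pp256 or tg32 and the second-to-last column holds mean and standard-deviation tokens-per-second. The row below is fabricated to match what the parsing code expects (it is not captured llama-bench output); it only illustrates how the split logic pulls out the two numbers:

    # Made-up row shaped the way LlamaCppAdapter.benchmark() parses it; real
    # llama-bench columns and values differ by version and hardware.
    line = "| model.gguf | 4.1 GiB | 7B | Vulkan | 99 | pp256 | 123.45 ± 2.67 |"

    parts = line.split("|")
    timings = parts[-2].strip().split(" ")  # ["123.45", "±", "2.67"]
    mean_tps = float(timings[0])     # 123.45
    stddev_tps = float(timings[-1])  # 2.67
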
@@ -857,7 +1048,9 @@ def get_hip_devices():
      try:
          libhip = ctypes.CDLL(matching_files[0])
      except OSError:
-         raise RuntimeError(f"Could not load HIP runtime library from {path}")
+         raise RuntimeError(
+             f"Could not load HIP runtime library from {matching_files[0]}"
+         )
 
      # Setup function signatures
      hipError_t = c_int
lemonade/tools/oga/bench.py

@@ -2,7 +2,6 @@ import argparse
  import statistics
  from statistics import StatisticsError
  from lemonade.state import State
- from lemonade.cache import Keys
  from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
  from lemonade.tools.bench import Bench
 
@@ -20,16 +19,6 @@ class OgaBench(Bench):
 
      unique_name = "oga-bench"
 
-     def __init__(self):
-         super().__init__()
-
-         # Additional statistics generated by this bench tool
-         self.status_stats.insert(
-             self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
-             Keys.STD_DEV_TOKENS_PER_SECOND,
-         )
-         self.std_dev_token_generation_tokens_per_second_list = []
-
      @staticmethod
      def parser(add_help: bool = True) -> argparse.ArgumentParser:
          parser = __class__.helpful_parser(
@@ -121,21 +110,6 @@ class OgaBench(Bench):
              # Less than 2 measurements
              self.std_dev_token_generation_tokens_per_second_list.append(None)
 
-     def save_stats(self, state):
-         super().save_stats(state)
-
-         # Save additional statistics
-         if not all(
-             element is None
-             for element in self.std_dev_token_generation_tokens_per_second_list
-         ):
-             state.save_stat(
-                 Keys.STD_DEV_TOKENS_PER_SECOND,
-                 self.get_item_or_list(
-                     self.std_dev_token_generation_tokens_per_second_list
-                 ),
-             )
-
 
  # This file was originally licensed under Apache 2.0. It has been modified.
  # Modifications Copyright (c) 2025 AMD
lemonade/tools/report/table.py

@@ -581,6 +581,12 @@ class LemonadePerfTable(Table):
              Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
              ".2f",
          ),
+         StatWithSD(
+             _wrap("Prefill Tokens per Second", 8),
+             Keys.PREFILL_TOKENS_PER_SECOND,
+             Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+             ".2f",
+         ),
          StatWithSD(
              _wrap("Tokens per Second", 8),
              Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
lemonade/tools/server/flm.py (new file)

@@ -0,0 +1,133 @@
+ import os
+ import logging
+ import subprocess
+ import time
+ import threading
+
+ import requests
+
+ from lemonade_server.pydantic_models import (
+     PullConfig,
+     ChatCompletionRequest,
+ )
+
+ from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer
+ from lemonade.tools.flm.utils import install_flm, download_flm_model
+
+
+ class FlmTelemetry(WrappedServerTelemetry):
+     """
+     Manages telemetry data collection and display for FLM server.
+     """
+
+     def parse_telemetry_line(self, line: str):
+         """
+         Parse telemetry data from FLM server output lines.
+
+         Note: as of FLM 0.9.10, no telemetry data is provided by the server CLI.
+         This function is required to be implemented, so we leave it empty
+         as a placeholder for now.
+         """
+
+         return
+
+
+ class FlmServer(WrappedServer):
+     """
+     Routes OpenAI API requests to an FLM server instance and returns the result
+     back to Lemonade Server.
+     """
+
+     def __init__(self):
+         self.flm_model_name = None
+         super().__init__(server_name="flm-server", telemetry=FlmTelemetry())
+
+     def address(self):
+         return f"http://localhost:{self.port}/v1"
+
+     def install_server(self):
+         """
+         Check if FLM is installed and at minimum version.
+         If not, download and run the GUI installer, then wait for completion.
+         """
+         install_flm()
+
+     def download_model(
+         self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+     ) -> dict:
+         download_flm_model(config_checkpoint, config_mmproj, do_not_upgrade)
+
+     def _launch_server_subprocess(
+         self,
+         model_config: PullConfig,
+         snapshot_files: dict,
+         ctx_size: int,
+         supports_embeddings: bool = False,
+         supports_reranking: bool = False,
+     ):
+
+         self._choose_port()
+
+         # Keep track of the FLM model name so that we can use it later
+         self.flm_model_name = model_config.checkpoint
+
+         command = [
+             "flm",
+             "serve",
+             f"{self.flm_model_name}",
+             "--ctx-len",
+             str(ctx_size),
+             "--port",
+             str(self.port),
+         ]
+
+         # Set up environment with library path for Linux
+         env = os.environ.copy()
+
+         self.process = subprocess.Popen(
+             command,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             encoding="utf-8",
+             errors="replace",
+             bufsize=1,
+             env=env,
+         )
+
+         # Start background thread to log subprocess output
+         threading.Thread(
+             target=self._log_subprocess_output,
+             args=("FLM SERVER",),
+             daemon=True,
+         ).start()
+
+     def _wait_for_load(self):
+         """
+         FLM doesn't seem to have a health API, so we'll use the "list local models"
+         API to check if the server is up.
+         """
+         status_code = None
+         while not self.process.poll() and status_code != 200:
+             health_url = f"http://localhost:{self.port}/api/tags"
+             try:
+                 health_response = requests.get(health_url)
+             except requests.exceptions.ConnectionError:
+                 logging.debug(
+                     "Not able to connect to %s yet, will retry", self.server_name
+                 )
+             else:
+                 status_code = health_response.status_code
+                 logging.debug(
+                     "Testing %s readiness (will retry until ready), result: %s",
+                     self.server_name,
+                     health_response.json(),
+                 )
+             time.sleep(1)
+
+     def chat_completion(self, chat_completion_request: ChatCompletionRequest):
+         # FLM requires the correct model name to be in the request
+         # (whereas llama-server ignores the model name field in the request)
+         chat_completion_request.model = self.flm_model_name
+
+         return super().chat_completion(chat_completion_request)
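
Because FlmServer exposes an OpenAI-style base URL through address(), the forwarding path can be pictured as a plain HTTP request against that endpoint. The sketch below is illustrative only; the port, model name, and the /chat/completions route are assumptions rather than values taken from this diff:

    import requests

    # Hypothetical values; in practice FlmServer picks the port itself and
    # overwrites the model field with the pulled FLM checkpoint name.
    base_url = "http://localhost:11534/v1"
    payload = {
        "model": "example-flm-checkpoint",
        "messages": [{"role": "user", "content": "Hello!"}],
    }

    response = requests.post(f"{base_url}/chat/completions", json=payload, timeout=60)
    print(response.json()["choices"][0]["message"]["content"])
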
lemonade/tools/server/llamacpp.py

@@ -88,9 +88,8 @@ class LlamaTelemetry(WrappedServerTelemetry):
 
  class LlamaServer(WrappedServer):
      def __init__(self, backend: str):
-         self.telemetry = LlamaTelemetry()
          self.backend = backend
-         super().__init__(server_name="llama-server", telemetry=self.telemetry)
+         super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())
 
      def install_server(self, backend=None):
          """
@@ -157,13 +156,23 @@ class LlamaServer(WrappedServer):
 
          # Find a port, and save it in the telemetry object for future reference
          # by other functions
-         self.choose_port()
+         self._choose_port()
 
          # Add port and jinja to enable tool use
          base_command.extend(["--port", str(self.port), "--jinja"])
 
          # Enable context shift and avoid attention sink issues by preserving the initial tokens
-         base_command.extend(["--context-shift", "--keep", "16"])
+         # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+         # Only add context-shift for backends that support it
+         context_shift_supported_backends = ["vulkan", "rocm"]
+         if self.backend in context_shift_supported_backends:
+             base_command.extend(["--context-shift", "--keep", "16"])
+         else:
+             # For backends that don't support context-shift (e.g., Metal), just use keep
+             base_command.extend(["--keep", "16"])
+             logging.debug(
+                 f"Skipped --context-shift for backend: {self.backend} (not supported)"
+             )
 
          # Use legacy reasoning formatting, since not all apps support the new
          # reasoning_content field
@@ -192,7 +201,8 @@ class LlamaServer(WrappedServer):
              env.update(os.environ)
              logging.debug(f"Loaded environment variables from {env_file_path}")
 
-         if platform.system().lower() == "linux":
+         system = platform.system().lower()
+         if system == "linux":
              lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
              current_ld_path = env.get("LD_LIBRARY_PATH", "")
              if current_ld_path:
@@ -200,6 +210,14 @@ class LlamaServer(WrappedServer):
              else:
                  env["LD_LIBRARY_PATH"] = lib_dir
              logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+         elif system == "darwin":
+             lib_dir = os.path.dirname(exe_path)
+             current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+             if current_dyld_path:
+                 env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+             else:
+                 env["DYLD_LIBRARY_PATH"] = lib_dir
+             logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
 
          # Start subprocess with output capture
          self.process = subprocess.Popen(
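
The Linux and macOS branches above differ only in which loader variable they prepend to (LD_LIBRARY_PATH versus DYLD_LIBRARY_PATH). A refactoring sketch of that shared pattern, offered as an illustration rather than code from this release:

    import os
    import platform

    def prepend_library_path(env: dict, lib_dir: str) -> None:
        """Prepend lib_dir to the dynamic-loader search path for the current OS."""
        var = (
            "DYLD_LIBRARY_PATH"
            if platform.system().lower() == "darwin"
            else "LD_LIBRARY_PATH"
        )
        current = env.get(var, "")
        env[var] = f"{lib_dir}:{current}" if current else lib_dir

    # Usage sketch with a hypothetical directory:
    env = os.environ.copy()
    prepend_library_path(env, "/path/to/llama/libs")
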