lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lemonade-sdk might be problematic.

Files changed (53)
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
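
The hunks below are from lemonade/tools/llamacpp/utils.py (file 16 in the list above).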
@@ -3,19 +3,22 @@ import os
  import platform
  import shutil
  import sys
+ import threading
+ import time
  import zipfile
  from typing import Optional
+ import psutil
  import subprocess
  import requests
+ import lemonade.common.build as build
  import lemonade.common.printing as printing
  from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
-
  from lemonade.common.system_info import get_system_info
-
  from dotenv import set_key, load_dotenv

- LLAMA_VERSION_VULKAN = "b6097"
- LLAMA_VERSION_ROCM = "b1021"
+ LLAMA_VERSION_VULKAN = "b6510"
+ LLAMA_VERSION_ROCM = "b1066"
+ LLAMA_VERSION_METAL = "b6510"


  def identify_rocm_arch_from_name(device_name: str) -> str | None:
@@ -126,8 +129,12 @@ def get_llama_version(backend: str) -> str:
          return LLAMA_VERSION_ROCM
      elif backend == "vulkan":
          return LLAMA_VERSION_VULKAN
+     elif backend == "metal":
+         return LLAMA_VERSION_METAL
      else:
-         raise ValueError(f"Unsupported backend: {backend}")
+         raise ValueError(
+             f"Unsupported backend: {backend}. Supported: vulkan, rocm, metal"
+         )


  def get_llama_folder_path(backend: str):
@@ -142,10 +149,12 @@ def get_llama_exe_path(exe_name: str, backend: str):
      Get path to platform-specific llama-server executable
      """
      base_dir = get_llama_folder_path(backend)
-     if platform.system().lower() == "windows":
+     system = platform.system().lower()
+
+     if system == "windows":
          return os.path.join(base_dir, f"{exe_name}.exe")
-     else:  # Linux/Ubuntu
-         # Check if executable exists in build/bin subdirectory (Current Ubuntu structure)
+     else:  # Darwin/Linux/Ubuntu
+         # Check if executable exists in build/bin subdirectory
          build_bin_path = os.path.join(base_dir, "build", "bin", exe_name)
          if os.path.exists(build_bin_path):
              return build_bin_path
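
For orientation, a minimal sketch of the resolution logic in this hunk. base_dir stands in for get_llama_folder_path(backend); the final fallback when build/bin is missing sits outside this hunk, so that line is an assumption.

    import os
    import platform

    def sketch_llama_exe_path(base_dir: str, exe_name: str) -> str:
        # Windows releases ship flat .exe files; macOS/Linux archives use build/bin/.
        if platform.system().lower() == "windows":
            return os.path.join(base_dir, f"{exe_name}.exe")
        candidate = os.path.join(base_dir, "build", "bin", exe_name)
        # Assumed fallback; the real function continues past this hunk.
        return candidate if os.path.exists(candidate) else os.path.join(base_dir, exe_name)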
@@ -168,6 +177,13 @@ def get_llama_cli_exe_path(backend: str):
      return get_llama_exe_path("llama-cli", backend)


+ def get_llama_bench_exe_path(backend: str):
+     """
+     Get path to platform-specific llama-bench executable
+     """
+     return get_llama_exe_path("llama-bench", backend)
+
+
  def get_version_txt_path(backend: str):
      """
      Get path to text file that contains version information
@@ -223,8 +239,24 @@ def get_binary_url_and_filename(backend: str, target_arch: str = None):
              raise NotImplementedError(
                  f"Platform {system} not supported for Vulkan llamacpp. Supported: Windows, Ubuntu Linux"
              )
+
+     elif backend == "metal":
+         # Metal support for macOS Apple Silicon from ggml-org/llama.cpp
+         repo = "ggml-org/llama.cpp"
+         version = LLAMA_VERSION_METAL
+         if system == "darwin":
+             if platform.machine().lower() in ["arm64", "aarch64"]:
+                 filename = f"llama-{version}-bin-macos-arm64.zip"
+             else:
+                 raise NotImplementedError(
+                     "Metal backend only supports Apple Silicon (ARM64) processors"
+                 )
+         else:
+             raise NotImplementedError(
+                 f"Platform {system} not supported for Metal llamacpp. Metal is only supported on macOS"
+             )
      else:
-         supported_backends = ["vulkan", "rocm"]
+         supported_backends = ["vulkan", "rocm", "metal"]
          raise NotImplementedError(
              f"Unsupported backend: {backend}. Supported backends: {supported_backends}"
          )
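
As a worked example of the new metal branch, on an Apple Silicon Mac the asset name resolves as below. The GitHub release URL join happens outside this hunk, so that line is an assumption.

    # Illustrative only: how the metal branch composes its asset name.
    version = "b6510"  # LLAMA_VERSION_METAL
    filename = f"llama-{version}-bin-macos-arm64.zip"
    # Assumed URL layout for a ggml-org/llama.cpp release asset:
    url = f"https://github.com/ggml-org/llama.cpp/releases/download/{version}/{filename}"
    print(filename)  # llama-b6510-bin-macos-arm64.zip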
@@ -239,10 +271,10 @@ def validate_platform_support():
      """
      system = platform.system().lower()

-     if system not in ["windows", "linux"]:
+     if system not in ["windows", "linux", "darwin"]:
          raise NotImplementedError(
              f"Platform {system} not supported for llamacpp. "
-             "Supported: Windows, Ubuntu Linux"
+             "Supported: Windows, Ubuntu Linux, macOS"
          )

      if system == "linux":
@@ -341,12 +373,39 @@ def install_llamacpp(backend):
      if filename.endswith(".zip"):
          with zipfile.ZipFile(llama_archive_path, "r") as zip_ref:
              zip_ref.extractall(llama_server_exe_dir)
+
+         # On Unix-like systems (macOS/Linux), make executables executable
+         if platform.system().lower() in ["darwin", "linux"]:
+             import stat
+
+             # Find and make executable files executable
+             for root, _, files in os.walk(llama_server_exe_dir):
+                 for file in files:
+                     file_path = os.path.join(root, file)
+                     # Make files in bin/ directories executable
+                     if "bin" in root.split(os.sep) or file in [
+                         "llama-server",
+                         "llama-simple",
+                     ]:
+                         try:
+                             current_permissions = os.stat(file_path).st_mode
+                             os.chmod(file_path, current_permissions | stat.S_IEXEC)
+                             logging.debug(f"Made {file_path} executable")
+                         except Exception as e:
+                             raise RuntimeError(
+                                 f"Failed to make {file_path} executable. This will prevent "
+                                 f"llama-server from starting. Error: {e}"
+                             )
      else:
          raise NotImplementedError(f"Unsupported archive format: {filename}")

      # Identify and set HIP ID
      if backend == "rocm":
-         hip_id = identify_hip_id()
+         try:
+             hip_id = identify_hip_id()
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             hip_id = 0
+             logging.warning(f"Error identifying HIP ID: {e}. Falling back to 0.")
          env_file_path = os.path.join(llama_server_exe_dir, ".env")
          set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))

@@ -356,6 +415,7 @@ def install_llamacpp(backend):
      exe_paths = [
          (get_llama_server_exe_path(backend), "llama-server"),
          (get_llama_cli_exe_path(backend), "llama-cli"),
+         (get_llama_bench_exe_path(backend), "llama-bench"),
      ]

      for exe_path, exe_name in exe_paths:
@@ -496,7 +556,7 @@ def get_local_checkpoint_path(base_checkpoint, variant):


  def identify_gguf_models(
-     checkpoint: str, variant: str, mmproj: str
+     checkpoint: str, variant: Optional[str], mmproj: str
  ) -> tuple[dict, list[str]]:
      """
      Identifies the GGUF model files in the repository that match the variant.
@@ -506,12 +566,14 @@ def identify_gguf_models(
      The CHECKPOINT:VARIANT scheme is used to specify model files in Hugging Face repositories.

      The VARIANT format can be one of several types:
+     0. wildcard (*): download all .gguf files in the repo
      1. Full filename: exact file to download
      2. None/empty: gets the first .gguf file in the repository (excludes mmproj files)
      3. Quantization variant: find a single file ending with the variant name (case insensitive)
      4. Folder name: downloads all .gguf files in the folder that matches the variant name (case insensitive)

      Examples:
+     - "ggml-org/gpt-oss-120b-GGUF:*" -> downloads all .gguf files in repo
      - "unsloth/Qwen3-8B-GGUF:qwen3.gguf" -> downloads "qwen3.gguf"
      - "unsloth/Qwen3-30B-A3B-GGUF" -> downloads "Qwen3-30B-A3B-GGUF.gguf"
      - "unsloth/Qwen3-8B-GGUF:Q4_1" -> downloads "Qwen3-8B-GGUF-Q4_1.gguf"
@@ -523,8 +585,18 @@ def identify_gguf_models(
      repo_files = list_repo_files(checkpoint)
      sharded_files = []

+     # (case 0) Wildcard, download everything
+     if variant and variant == "*":
+         sharded_files = [f for f in repo_files if f.endswith(".gguf")]
+
+         # Sort to ensure consistent ordering
+         sharded_files.sort()
+
+         # Use first file as primary (this is how llamacpp handles it)
+         variant_name = sharded_files[0]
+
      # (case 1) If variant ends in .gguf, use it directly
-     if variant and variant.endswith(".gguf"):
+     elif variant and variant.endswith(".gguf"):
          variant_name = variant
          if variant_name not in repo_files:
              raise ValueError(
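
A small sketch of the new wildcard case, using made-up shard names; repo_files stands in for the list returned by list_repo_files(checkpoint).

    # Hypothetical listing for a "some-org/some-model-GGUF:*" checkpoint.
    repo_files = [
        "model-Q4_K_M-00002-of-00002.gguf",
        "model-Q4_K_M-00001-of-00002.gguf",
        "README.md",
    ]
    sharded_files = sorted(f for f in repo_files if f.endswith(".gguf"))
    variant_name = sharded_files[0]  # first shard is handed to llama.cpp, which locates the rest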
@@ -585,15 +657,91 @@ def identify_gguf_models(
      return core_files, sharded_files


- def download_gguf(config_checkpoint, config_mmproj=None, do_not_upgrade=False) -> dict:
+ def resolve_local_gguf_model(
+     checkpoint: str, variant: str, config_mmproj: str = None
+ ) -> dict | None:
+     """
+     Attempts to resolve a GGUF model from the local HuggingFace cache.
      """
-     Downloads the GGUF file for the given model configuration.
+     from huggingface_hub.constants import HF_HUB_CACHE
+
+     # Convert checkpoint to cache directory format
+     if checkpoint.startswith("models--"):
+         model_cache_dir = os.path.join(HF_HUB_CACHE, checkpoint)
+     else:
+         # This is a HuggingFace repo - convert to cache directory format
+         repo_cache_name = checkpoint.replace("/", "--")
+         model_cache_dir = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+
+     # Check if the cache directory exists
+     if not os.path.exists(model_cache_dir):
+         return None

-     For sharded models, if the variant points to a folder (e.g. Q4_0), all files in that folder
-     will be downloaded but only the first file will be returned for loading.
+     gguf_file_found = None
+
+     # If variant is specified, look for that specific file
+     if variant:
+         search_term = variant if variant.endswith(".gguf") else f"{variant}.gguf"
+
+         for root, _, files in os.walk(model_cache_dir):
+             if search_term in files:
+                 gguf_file_found = os.path.join(root, search_term)
+                 break
+
+     # If no variant or variant not found, find any .gguf file (excluding mmproj)
+     if not gguf_file_found:
+         for root, _, files in os.walk(model_cache_dir):
+             gguf_files = [
+                 f for f in files if f.endswith(".gguf") and "mmproj" not in f.lower()
+             ]
+             if gguf_files:
+                 gguf_file_found = os.path.join(root, gguf_files[0])
+                 break
+
+     # If no GGUF file found, model is not in cache
+     if not gguf_file_found:
+         return None
+
+     # Build result dictionary
+     result = {"variant": gguf_file_found}
+
+     # Search for mmproj file if provided
+     if config_mmproj:
+         for root, _, files in os.walk(model_cache_dir):
+             if config_mmproj in files:
+                 result["mmproj"] = os.path.join(root, config_mmproj)
+                 break
+
+     logging.info(f"Resolved local GGUF model: {result}")
+     return result
+
+
+ def download_gguf(
+     config_checkpoint: str, config_mmproj=None, do_not_upgrade: bool = False
+ ) -> dict:
      """
+     Downloads the GGUF file for the given model configuration from HuggingFace.
+
+     This function downloads models from the internet. It does NOT check the local cache first.
+     Callers should use resolve_local_gguf_model() if they want to check for existing models first.
+
+     Args:
+         config_checkpoint: Checkpoint identifier (file path or HF repo with variant)
+         config_mmproj: Optional mmproj file to also download
+         do_not_upgrade: If True, use local cache only without attempting to download updates

-     # This code handles all cases by constructing the appropriate filename or pattern
+     Returns:
+         Dictionary with "variant" (and optionally "mmproj") file paths
+     """
+     # Handle direct file path case - if the checkpoint is an actual file on disk
+     if os.path.exists(config_checkpoint):
+         result = {"variant": config_checkpoint}
+         if config_mmproj:
+             result["mmproj"] = config_mmproj
+         return result
+
+     # Parse checkpoint to extract base and variant
+     # Checkpoint format: repo_name:variant (e.g., "unsloth/Qwen3-0.6B-GGUF:Q4_0")
      checkpoint, variant = parse_checkpoint(config_checkpoint)

      # Identify the GGUF model files in the repository that match the variant
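
For reference, a minimal sketch of the CHECKPOINT:VARIANT split that download_gguf() relies on. parse_checkpoint() lives elsewhere in the package, so this stand-in only covers the plain colon-separated case.

    # Stand-in for parse_checkpoint() on the common "repo:variant" form.
    config_checkpoint = "unsloth/Qwen3-0.6B-GGUF:Q4_0"  # example from the docstring above
    checkpoint, sep, variant = config_checkpoint.partition(":")
    if not sep:
        variant = None  # no variant given; identify_gguf_models() then picks the first .gguf
    # checkpoint == "unsloth/Qwen3-0.6B-GGUF", variant == "Q4_0"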
@@ -624,6 +772,37 @@ def download_gguf(config_checkpoint, config_mmproj=None, do_not_upgrade=False) -
      }


+ # Function to read a stream (stdout or stderr) into a list
+ def stream_reader(stream, output_list):
+     for line in iter(stream.readline, b""):
+         decoded_line = line.decode().rstrip()
+         output_list.append(decoded_line)
+     stream.close()
+
+
+ def monitor_process_memory(pid, memory_data, interval=0.5):
+     """Monitor memory usage of a process in a separate thread."""
+
+     try:
+         is_windows = platform.system() == "Windows"
+         if is_windows:
+             # We can only collect peak_wset in Windows
+             process = psutil.Process(pid)
+             while process.is_running():
+                 try:
+                     mem_info = process.memory_info()
+                     peak_wset = mem_info.peak_wset
+                     if peak_wset is not None:
+                         memory_data["peak_wset"] = peak_wset
+                 except psutil.NoSuchProcess:
+                     break
+                 time.sleep(interval)
+     except Exception as e:
+         print(f"Error monitoring process: {e}")
+
+     return memory_data
+
+
  class LlamaCppTokenizerAdapter(PassthroughTokenizer):
      pass

@@ -637,8 +816,10 @@ class LlamaCppAdapter(ModelAdapter):
          context_size,
          threads,
          executable,
+         bench_executable,
          reasoning=False,
          lib_dir=None,
+         state=None,
      ):
          super().__init__()

@@ -650,8 +831,10 @@ class LlamaCppAdapter(ModelAdapter):
          self.context_size = context_size
          self.threads = threads
          self.executable = os.path.normpath(executable)
+         self.bench_executable = os.path.normpath(bench_executable)
          self.reasoning = reasoning
          self.lib_dir = lib_dir
+         self.state = state

      def generate(
          self,
@@ -661,6 +844,7 @@ class LlamaCppAdapter(ModelAdapter):
          top_p: float = 0.95,
          top_k: int = 40,
          return_raw: bool = False,
+         save_max_memory_used: bool = False,
          **kwargs,  # pylint: disable=unused-argument
      ):
          """
@@ -692,32 +876,54 @@ class LlamaCppAdapter(ModelAdapter):
              self.executable,
              "-m",
              self.model,
-             "--ctx-size",
+             "--ctx-size",  # size of the prompt context, 0 = loaded from model
              str(self.context_size),
-             "-n",
+             "-n",  # number of tokens to predict, -1 = infinity, =2 - until context filled
              str(n_predict),
-             "-t",
+             "-t",  # number of threads to use during generation
              str(self.threads),
              "-p",
              prompt,
+             "-b",  # logical maximum batch size
+             "1",
+             "-ub",  # physical maximum batch size
+             "1",
              "--temp",
              str(temperature),
              "--top-p",
              str(top_p),
              "--top-k",
              str(top_k),
-             "-e",
-             "-no-cnv",
-             "--reasoning-format",
+             "-e",  # process escape sequences
+             "--no-conversation",  # disable conversation mode
+             "--reasoning-format",  # leaves thoughts unparsed in message content
              "none",
          ]

+         # If prompt exceeds 500 characters, then use a file
+         if len(prompt) < 500:
+             cmd += ["-p", prompt]
+         else:
+             # Create prompt file in cache directory
+             prompt_file = os.path.join(
+                 build.output_dir(self.state.cache_dir, self.state.build_name),
+                 "prompt.txt",
+             )
+             with open(prompt_file, "w", encoding="utf-8") as file:
+                 file.write(prompt)
+             cmd += ["-f", prompt_file]
+
          # Configure GPU layers: 99 for GPU, 0 for CPU-only
          ngl_value = "99" if self.device == "igpu" else "0"
          cmd = cmd + ["-ngl", ngl_value]

          cmd = [str(m) for m in cmd]

+         # save llama-cli command
+         self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+             " ".join(cmd)
+         ]
+
          try:
              # Set up environment with library path for Linux
              env = os.environ.copy()
@@ -746,15 +952,35 @@ class LlamaCppAdapter(ModelAdapter):
                  env=env,
              )

-             raw_output, stderr = process.communicate(timeout=600)
+             # Start memory monitoring in a separate thread
+             if save_max_memory_used:
+                 memory_data = {}
+                 monitor_thread = threading.Thread(
+                     target=monitor_process_memory,
+                     args=(process.pid, memory_data),
+                     daemon=True,
+                 )
+                 monitor_thread.start()
+
+             # Communicate with the subprocess
+             stdout, stderr = process.communicate(timeout=600)
+
+             # save llama-cli command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_cli_stderr = getattr(
+                 self.state, "llama_cli_stderr", []
+             ) + [
+                 [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+             ]
+
              if process.returncode != 0:
                  error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                  error_msg += f"Command: {' '.join(cmd)}\n"
                  error_msg += f"Error output:\n{stderr}\n"
-                 error_msg += f"Standard output:\n{raw_output}"
+                 error_msg += f"Standard output:\n{stdout}"
                  raise Exception(error_msg)

-             if raw_output is None:
+             if stdout is None:
                  raise Exception("No output received from llama.cpp process")

              # Parse information from llama.cpp output
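
A short sketch of the stderr filter above. The sample lines are illustrative, not captured llama.cpp output; the point is that only lines prefixed with llama_perf_ are kept in state.

    stderr = "\n".join([
        "build: 6510 (abcdef0) with clang",                        # dropped
        "llama_perf_sampler_print:    sampling time = 1.23 ms",    # kept
        "llama_perf_context_print:       total time = 456.78 ms",  # kept
    ])
    perf_lines = [line for line in stderr.splitlines() if line.startswith("llama_perf_")]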
@@ -785,14 +1011,19 @@ class LlamaCppAdapter(ModelAdapter):
                  else 0
              )

+             # Wait for monitor thread to finish and write peak_wset
+             if save_max_memory_used:
+                 monitor_thread.join(timeout=2)
+                 self.peak_wset = memory_data.get("peak_wset", None)
+
              if return_raw:
-                 return [raw_output, stderr]
+                 return [stdout, stderr]

              # Find where the prompt ends and the generated text begins
              prompt_found = False
              output_text = ""
              prompt_first_line = prompt.split("\n")[0]
-             for line in raw_output.splitlines():
+             for line in stdout.splitlines():
                  if prompt_first_line in line:
                      prompt_found = True
                  if prompt_found:
@@ -803,7 +1034,7 @@ class LlamaCppAdapter(ModelAdapter):
                  raise Exception(
                      f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
                      "This usually means the model failed to process the prompt correctly.\n"
-                     f"Raw output:\n{raw_output}\n"
+                     f"Raw output:\n{stdout}\n"
                      f"Stderr:\n{stderr}"
                  )

@@ -811,10 +1042,137 @@ class LlamaCppAdapter(ModelAdapter):
              return [output_text]

          except Exception as e:
-             error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
+             error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
              error_msg += f"Command: {' '.join(cmd)}"
              raise Exception(error_msg)

+     def benchmark(self, prompt, iterations, output_tokens):
+         """
+         Runs the llama-bench.exe tool to measure TTFT and TPS
+         """
+         cmd = [
+             self.bench_executable,
+             "-m",
+             self.model,
+             "-r",
+             iterations,
+             "-p",
+             str(prompt),
+             "-n",
+             output_tokens,
+             "-t",
+             self.threads if self.threads > 0 else 16,
+             "-b",
+             1,
+             "-ub",
+             1,
+         ]
+         ngl_value = "99" if self.device == "igpu" else "0"
+         cmd = cmd + ["-ngl", ngl_value]
+         cmd = [str(m) for m in cmd]
+
+         # save llama-bench command
+         self.state.llama_bench_cmd = " ".join(cmd)
+
+         try:
+             # Set up environment with library path for Linux
+             env = os.environ.copy()
+
+             # Load environment variables from .env file in the executable directory
+             exe_dir = os.path.dirname(self.executable)
+             env_file_path = os.path.join(exe_dir, ".env")
+             if os.path.exists(env_file_path):
+                 load_dotenv(env_file_path, override=True)
+                 env.update(os.environ)
+
+             if self.lib_dir and os.name != "nt":  # Not Windows
+                 current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                 if current_ld_path:
+                     env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                 else:
+                     env["LD_LIBRARY_PATH"] = self.lib_dir
+
+             process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 universal_newlines=True,
+                 encoding="utf-8",
+                 errors="replace",
+                 env=env,
+             )
+
+             # Start memory monitoring in a separate thread
+             save_max_memory_used = platform.system() == "Windows"
+             if save_max_memory_used:
+                 memory_data = {}
+                 monitor_thread = threading.Thread(
+                     target=monitor_process_memory,
+                     args=(process.pid, memory_data),
+                     daemon=True,
+                 )
+                 monitor_thread.start()
+
+             # Communicate with the subprocess
+             stdout, stderr = process.communicate(timeout=600)
+
+             # save llama-bench command output with performance info to state
+             # (can be viewed in state.yaml file in cache)
+             self.state.llama_bench_standard_output = stdout.splitlines()
+
+             if process.returncode != 0:
+                 error_msg = (
+                     f"llama-bench.exe failed with return code {process.returncode}.\n"
+                 )
+                 error_msg += f"Command: {' '.join(cmd)}\n"
+                 error_msg += f"Error output:\n{stderr}\n"
+                 error_msg += f"Standard output:\n{stdout}"
+                 raise Exception(error_msg)
+
+             if stdout is None:
+                 error_msg = "No output received from llama-bench.exe process\n"
+                 error_msg += f"Error output:\n{stderr}\n"
+                 error_msg += f"Standard output:\n{stdout}"
+                 raise Exception(error_msg)
+
+             # Parse information from llama-bench.exe output
+             prompt_length = None
+             pp_tps = None
+             pp_tps_sd = None
+             tg_tps = None
+             tg_tps_sd = None
+
+             for line in stdout.splitlines():
+                 # Parse TPS information
+                 if f"pp{prompt:d}" in line:
+                     parts = line.split("|")
+                     timings = parts[-2].strip().split(" ")
+                     prompt_length = prompt
+                     pp_tps = float(timings[0])
+                     pp_tps_sd = float(timings[-1])
+                 if f"tg{output_tokens:d}" in line:
+                     parts = line.split("|")
+                     timings = parts[-2].strip().split(" ")
+                     tg_tps = float(timings[0])
+                     tg_tps_sd = float(timings[-1])
+
+         except Exception as e:
+             error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
+             error_msg += f"Command: {' '.join(cmd)}"
+             raise Exception(error_msg)
+
+         # Determine max memory used
+         if save_max_memory_used:
+             # Wait for monitor thread to finish
+             monitor_thread.join(timeout=2)
+
+             # Track memory usage concurrently
+             peak_wset = memory_data.get("peak_wset", None)
+         else:
+             peak_wset = None
+
+         return prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset
+

  def get_hip_devices():
      """Get list of HIP devices with their IDs and names."""
@@ -841,7 +1199,9 @@ def get_hip_devices():
      try:
          libhip = ctypes.CDLL(matching_files[0])
      except OSError:
-         raise RuntimeError(f"Could not load HIP runtime library from {path}")
+         raise RuntimeError(
+             f"Could not load HIP runtime library from {matching_files[0]}"
+         )

      # Setup function signatures
      hipError_t = c_int