lemonade-sdk 8.0.6__py3-none-any.whl → 8.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -2,7 +2,6 @@ import os
 import sys
 import importlib.util
 import importlib.metadata
-import platform
 import subprocess
 from abc import ABC, abstractmethod
 from typing import Dict, Optional
@@ -19,7 +18,9 @@ class InferenceEngineDetector:
         self.llamacpp_detector = LlamaCppDetector()
         self.transformers_detector = TransformersDetector()

-    def detect_engines_for_device(self, device_type: str) -> Dict[str, Dict]:
+    def detect_engines_for_device(
+        self, device_type: str, device_name: str
+    ) -> Dict[str, Dict]:
         """
         Detect all available inference engines for a specific device type.

@@ -36,10 +37,19 @@ class InferenceEngineDetector:
         if oga_info:
             engines["oga"] = oga_info

-        # Detect llama.cpp availability
-        llamacpp_info = self.llamacpp_detector.detect_for_device(device_type)
+        # Detect llama.cpp vulkan availability
+        llamacpp_info = self.llamacpp_detector.detect_for_device(
+            device_type, device_name, "vulkan"
+        )
+        if llamacpp_info:
+            engines["llamacpp-vulkan"] = llamacpp_info
+
+        # Detect llama.cpp rocm availability
+        llamacpp_info = self.llamacpp_detector.detect_for_device(
+            device_type, device_name, "rocm"
+        )
         if llamacpp_info:
-            engines["llamacpp"] = llamacpp_info
+            engines["llamacpp-rocm"] = llamacpp_info

         # Detect Transformers availability
         transformers_info = self.transformers_detector.detect_for_device(device_type)
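
As a minimal sketch of the new behavior (the device name and values below are illustrative, not taken from the package), each device entry can now report the two llama.cpp backends side by side:

    detector = InferenceEngineDetector()
    engines = detector.detect_engines_for_device("amd_dgpu", "AMD Radeon RX 7900 XTX")
    # Plausible shape, based on the keys added above:
    # {
    #     "oga": {...},
    #     "llamacpp-vulkan": {"available": True, "version": "...", "backend": "vulkan"},
    #     "llamacpp-rocm": {"available": False, "error": "rocm binaries not installed"},
    #     ...
    # }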
@@ -206,57 +216,40 @@ class LlamaCppDetector(BaseEngineDetector):
     Detector for llama.cpp.
     """

-    def detect_for_device(self, device_type: str) -> Optional[Dict]:
+    def detect_for_device(
+        self, device_type: str, device_name: str, backend: str
+    ) -> Optional[Dict]:
         """
         Detect llama.cpp availability for specific device.
         """
         try:
-            # Map device types to llama.cpp backends
-            device_backend_map = {
-                "cpu": "cpu",
-                "amd_igpu": "vulkan",
-                "amd_dgpu": "vulkan",
-            }

-            if device_type not in device_backend_map:
+            if device_type not in ["cpu", "amd_igpu", "amd_dgpu"]:
                 return None

-            backend = device_backend_map[device_type]
-            is_installed = self.is_installed()
-
-            # Check requirements based on backend
-            if backend == "vulkan":
-                vulkan_available = self._check_vulkan_support()
-                if not vulkan_available:
-                    return {"available": False, "error": "Vulkan not available"}
-
-                # Vulkan is available
-                if is_installed:
-                    result = {
-                        "available": True,
-                        "version": self._get_llamacpp_version(),
-                        "backend": backend,
-                    }
-                    return result
-                else:
-                    return {
-                        "available": False,
-                        "error": "llama.cpp binaries not installed",
-                    }
-            else:
-                # CPU backend
-                if is_installed:
-                    result = {
-                        "available": True,
-                        "version": self._get_llamacpp_version(),
-                        "backend": backend,
-                    }
-                    return result
-                else:
-                    return {
-                        "available": False,
-                        "error": "llama.cpp binaries not installed",
-                    }
+            # Check if the device is supported by the backend
+            if device_type == "cpu":
+                device_supported = True
+            elif device_type == "amd_igpu" or device_type == "amd_dgpu":
+                if backend == "vulkan":
+                    device_supported = self._check_vulkan_support()
+                elif backend == "rocm":
+                    device_supported = self._check_rocm_support(device_name.lower())
+            if not device_supported:
+                return {"available": False, "error": f"{backend} not available"}
+
+            is_installed = self.is_installed(backend)
+            if not is_installed:
+                return {
+                    "available": False,
+                    "error": f"{backend} binaries not installed",
+                }
+
+            return {
+                "available": True,
+                "version": self._get_llamacpp_version(backend),
+                "backend": backend,
+            }

         except (ImportError, OSError, subprocess.SubprocessError) as e:
             return {
@@ -264,35 +257,17 @@ class LlamaCppDetector(BaseEngineDetector):
                 "error": f"llama.cpp detection failed: {str(e)}",
             }

-    def is_installed(self) -> bool:
+    def is_installed(self, backend: str) -> bool:
         """
-        Check if llama.cpp binaries are available.
+        Check if llama.cpp binaries are available for any backend.
         """
+        from lemonade.tools.llamacpp.utils import get_llama_server_exe_path

-        # Check lemonade-managed binary locations
         try:
-
-            # Check lemonade server directory
-            server_base_dir = os.path.join(
-                os.path.dirname(sys.executable), "llama_server"
-            )
-
-            if platform.system().lower() == "windows":
-                server_exe_path = os.path.join(server_base_dir, "llama-server.exe")
-            else:
-                # Check both build/bin and root directory locations
-                build_bin_path = os.path.join(
-                    server_base_dir, "build", "bin", "llama-server"
-                )
-                root_path = os.path.join(server_base_dir, "llama-server")
-                server_exe_path = (
-                    build_bin_path if os.path.exists(build_bin_path) else root_path
-                )
-
+            server_exe_path = get_llama_server_exe_path(backend)
             if os.path.exists(server_exe_path):
                 return True
-
-        except (ImportError, OSError):
+        except (ImportError, OSError, ValueError):
             pass

         return False
@@ -334,13 +309,22 @@ class LlamaCppDetector(BaseEngineDetector):
         except OSError:
             return False

-    def _get_llamacpp_version(self) -> str:
+    def _check_rocm_support(self, device_name: str) -> bool:
+        """
+        Check if ROCM is available for GPU acceleration.
+        """
+        from lemonade.tools.llamacpp.utils import identify_rocm_arch_from_name
+
+        return identify_rocm_arch_from_name(device_name) is not None
+
+    def _get_llamacpp_version(self, backend: str) -> str:
         """
-        Get llama.cpp version from lemonade's managed installation.
+        Get llama.cpp version from lemonade's managed installation for specific backend.
         """
         try:
+            # Use backend-specific path - same logic as get_llama_folder_path in utils.py
             server_base_dir = os.path.join(
-                os.path.dirname(sys.executable), "llama_server"
+                os.path.dirname(sys.executable), backend, "llama_server"
             )
             version_file = os.path.join(server_base_dir, "version.txt")

@@ -401,15 +385,16 @@ class TransformersDetector(BaseEngineDetector):
         )


-def detect_inference_engines(device_type: str) -> Dict[str, Dict]:
+def detect_inference_engines(device_type: str, device_name: str) -> Dict[str, Dict]:
     """
     Helper function to detect inference engines for a device type.

     Args:
         device_type: "cpu", "amd_igpu", "amd_dgpu", or "npu"
+        device_name: device name

     Returns:
         dict: Engine availability information.
     """
     detector = InferenceEngineDetector()
-    return detector.detect_engines_for_device(device_type)
+    return detector.detect_engines_for_device(device_type, device_name)
@@ -1,7 +1,7 @@
 import os
 from typing import Optional
 import socket
-from huggingface_hub import model_info
+from huggingface_hub import model_info, snapshot_download


 def is_offline():
@@ -48,3 +48,20 @@ def get_base_model(checkpoint: str) -> Optional[str]:
     except Exception:  # pylint: disable=broad-except
         pass
     return None
+
+
+def custom_snapshot_download(repo_id, **kwargs):
+    """
+    Custom snapshot download with retry logic for Windows symlink privilege errors.
+    """
+    for attempt in range(2):
+        try:
+            return snapshot_download(repo_id=repo_id, **kwargs)
+        except OSError as e:
+            if (
+                hasattr(e, "winerror")
+                and e.winerror == 1314  # pylint: disable=no-member
+                and attempt < 1
+            ):
+                continue
+            raise
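
A minimal usage sketch for the new retry helper; the repo id and kwargs below are illustrative, and kwargs are forwarded unchanged to huggingface_hub.snapshot_download:

    # Retries once if Windows raises WinError 1314 ("A required privilege is
    # not held by the client"), which can occur when symlinks cannot be created.
    model_dir = custom_snapshot_download(
        "Qwen/Qwen2.5-0.5B-Instruct-GGUF",  # illustrative repo id
        local_dir="./models",
    )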
@@ -47,11 +47,10 @@ class SystemInfo(ABC):
         Returns:
             dict: Device information.
         """
-
         device_dict = {
             "cpu": self.get_cpu_device(),
-            "amd_igpu": self.get_amd_igpu_device(),
-            "amd_dgpu": self.get_amd_dgpu_devices(),
+            "amd_igpu": self.get_amd_igpu_device(include_inference_engines=True),
+            "amd_dgpu": self.get_amd_dgpu_devices(include_inference_engines=True),
             "npu": self.get_npu_device(),
         }
         return device_dict
@@ -66,7 +65,7 @@ class SystemInfo(ABC):
         """

     @abstractmethod
-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information.

@@ -75,7 +74,7 @@ class SystemInfo(ABC):
         """

     @abstractmethod
-    def get_amd_dgpu_devices(self) -> list:
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information.

@@ -143,8 +142,9 @@ class WindowsSystemInfo(SystemInfo):
             processors = self.connection.Win32_Processor()
             if processors:
                 processor = processors[0]
+                cpu_name = processor.Name.strip()
                 cpu_info = {
-                    "name": processor.Name.strip(),
+                    "name": cpu_name,
                     "cores": processor.NumberOfCores,
                     "threads": processor.NumberOfLogicalProcessors,
                     "max_clock_speed_mhz": processor.MaxClockSpeed,
@@ -152,7 +152,9 @@ class WindowsSystemInfo(SystemInfo):
                 }

                 # Add inference engine detection
-                cpu_info["inference_engines"] = self._detect_inference_engines("cpu")
+                cpu_info["inference_engines"] = self._detect_inference_engines(
+                    "cpu", cpu_name
+                )
                 return cpu_info

         except Exception as e:  # pylint: disable=broad-except
@@ -160,7 +162,7 @@ class WindowsSystemInfo(SystemInfo):

         return {"available": False, "error": "No CPU information found"}

-    def _detect_amd_gpus(self, gpu_type: str):
+    def _detect_amd_gpus(self, gpu_type: str, include_inference_engines: bool = False):
         """
         Shared AMD GPU detection logic for both integrated and discrete GPUs.
         Uses keyword-based classification for simplicity and reliability.
@@ -194,23 +196,25 @@ class WindowsSystemInfo(SystemInfo):
                    gpu_type == "discrete" and not is_integrated
                ):

-                   driver_version = self.get_driver_version(
-                       "AMD-OpenCL User Mode Driver"
-                   )
-
                    device_type = "amd_igpu" if is_integrated else "amd_dgpu"
                    gpu_info = {
                        "name": controller.Name,
-                       "driver_version": (
-                           driver_version if driver_version else "Unknown"
-                       ),
                        "available": True,
                    }

-                   # Add inference engine detection
-                   gpu_info["inference_engines"] = self._detect_inference_engines(
-                       device_type
+                   driver_version = self.get_driver_version(
+                       "AMD-OpenCL User Mode Driver"
+                   )
+                   gpu_info["driver_version"] = (
+                       driver_version if driver_version else "Unknown"
                    )
+
+                   if include_inference_engines:
+                       gpu_info["inference_engines"] = (
+                           self._detect_inference_engines(
+                               device_type, controller.Name
+                           )
+                       )
                    gpu_devices.append(gpu_info)

                except Exception as e:  # pylint: disable=broad-except
@@ -219,32 +223,36 @@ class WindowsSystemInfo(SystemInfo):

         return gpu_devices

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information using keyword-based classification.

         Returns:
             dict: AMD iGPU device information.
         """
-        igpu_devices = self._detect_amd_gpus("integrated")
+        igpu_devices = self._detect_amd_gpus(
+            "integrated", include_inference_engines=include_inference_engines
+        )
         return (
             igpu_devices[0]
             if igpu_devices
             else {"available": False, "error": "No AMD integrated GPU found"}
         )

-    def get_amd_dgpu_devices(self):
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False):
         """
         Retrieves AMD discrete GPU device information using keyword-based classification.

         Returns:
             list: List of AMD dGPU device information.
         """
-        dgpu_devices = self._detect_amd_gpus("discrete")
+        dgpu_devices = self._detect_amd_gpus(
+            "discrete", include_inference_engines=include_inference_engines
+        )
         return (
             dgpu_devices
             if dgpu_devices
-            else {"available": False, "error": "No AMD discrete GPU found"}
+            else [{"available": False, "error": "No AMD discrete GPU found"}]
         )

     def get_npu_device(self) -> dict:
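
Because the no-GPU fallback is now a list, callers can iterate the dGPU result uniformly. A minimal sketch, assuming a WindowsSystemInfo instance named system_info:

    for gpu in system_info.get_amd_dgpu_devices(include_inference_engines=True):
        if gpu.get("available"):
            print(gpu["name"], list(gpu.get("inference_engines", {})))
        else:
            print("No AMD dGPU:", gpu.get("error"))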
@@ -267,7 +275,9 @@ class WindowsSystemInfo(SystemInfo):
                 }

                 # Add inference engine detection
-                npu_info["inference_engines"] = self._detect_inference_engines("npu")
+                npu_info["inference_engines"] = self._detect_inference_engines(
+                    "npu", "AMD NPU"
+                )
                 return npu_info
         except Exception as e:  # pylint: disable=broad-except
             return {"available": False, "error": f"NPU detection failed: {e}"}
@@ -438,12 +448,13 @@ class WindowsSystemInfo(SystemInfo):
         info_dict["Windows Power Setting"] = self.get_windows_power_setting()
         return info_dict

-    def _detect_inference_engines(self, device_type: str) -> dict:
+    def _detect_inference_engines(self, device_type: str, device_name: str) -> dict:
         """
         Detect available inference engines for a specific device type.

         Args:
             device_type: Device type ("cpu", "amd_igpu", "amd_dgpu", "npu")
+            device_name: Device name

         Returns:
             dict: Available inference engines and their information.
@@ -451,7 +462,7 @@ class WindowsSystemInfo(SystemInfo):
         try:
             from .inference_engines import detect_inference_engines

-            return detect_inference_engines(device_type)
+            return detect_inference_engines(device_type, device_name)
         except Exception as e:  # pylint: disable=broad-except
             return {"error": f"Inference engine detection failed: {str(e)}"}

@@ -467,13 +478,13 @@ class WSLSystemInfo(SystemInfo):
         """
         return {"available": False, "error": "Device detection not supported in WSL"}

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information in WSL environment.
         """
         return {"available": False, "error": "GPU detection not supported in WSL"}

-    def get_amd_dgpu_devices(self) -> list:
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information in WSL environment.
         """
@@ -556,6 +567,7 @@ class LinuxSystemInfo(SystemInfo):
                     cpu_data["architecture"] = line.split(":")[1].strip()

             if "name" in cpu_data:
+                cpu_name = cpu_data.get("name", "Unknown")
                 cpu_info = {
                     "name": cpu_data.get("name", "Unknown"),
                     "cores": cpu_data.get("cores", "Unknown"),
@@ -565,14 +577,16 @@ class LinuxSystemInfo(SystemInfo):
                 }

                 # Add inference engine detection
-                cpu_info["inference_engines"] = self._detect_inference_engines("cpu")
+                cpu_info["inference_engines"] = self._detect_inference_engines(
+                    "cpu", cpu_name
+                )
                 return cpu_info
         except Exception as e:  # pylint: disable=broad-except
             return {"available": False, "error": f"CPU detection failed: {e}"}

         return {"available": False, "error": "No CPU information found"}

-    def _detect_amd_gpus(self, gpu_type: str):
+    def _detect_amd_gpus(self, gpu_type: str, include_inference_engines: bool = False):
         """
         Shared AMD GPU detection logic for both integrated and discrete GPUs.
         Uses keyword-based classification for simplicity and reliability.
@@ -611,11 +625,10 @@ class LinuxSystemInfo(SystemInfo):
                         "name": device_name,
                         "available": True,
                     }
-
-                    # Add inference engine detection
-                    gpu_info["inference_engines"] = self._detect_inference_engines(
-                        device_type
-                    )
+                    if include_inference_engines:
+                        gpu_info["inference_engines"] = (
+                            self._detect_inference_engines(device_type, device_name)
+                        )
                     gpu_devices.append(gpu_info)

             except Exception as e:  # pylint: disable=broad-except
@@ -624,32 +637,36 @@ class LinuxSystemInfo(SystemInfo):

         return gpu_devices

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information using keyword-based classification.

         Returns:
             dict: AMD iGPU device information.
         """
-        igpu_devices = self._detect_amd_gpus("integrated")
+        igpu_devices = self._detect_amd_gpus(
+            "integrated", include_inference_engines=include_inference_engines
+        )
         return (
             igpu_devices[0]
             if igpu_devices
             else {"available": False, "error": "No AMD integrated GPU found"}
         )

-    def get_amd_dgpu_devices(self):
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information using keyword-based classification.

         Returns:
             list: List of AMD dGPU device information.
         """
-        dgpu_devices = self._detect_amd_gpus("discrete")
+        dgpu_devices = self._detect_amd_gpus(
+            "discrete", include_inference_engines=include_inference_engines
+        )
         return (
             dgpu_devices
             if dgpu_devices
-            else {"available": False, "error": "No AMD discrete GPU found"}
+            else [{"available": False, "error": "No AMD discrete GPU found"}]
         )

     def get_npu_device(self) -> dict:
@@ -741,7 +758,7 @@ class LinuxSystemInfo(SystemInfo):
         info_dict["Physical Memory"] = self.get_physical_memory()
         return info_dict

-    def _detect_inference_engines(self, device_type: str) -> dict:
+    def _detect_inference_engines(self, device_type: str, device_name: str) -> dict:
         """
         Detect available inference engines for a specific device type.

@@ -752,7 +769,7 @@ class LinuxSystemInfo(SystemInfo):
             dict: Available inference engines and their information.
         """
         try:
-            return detect_inference_engines(device_type)
+            return detect_inference_engines(device_type, device_name)
         except Exception as e:  # pylint: disable=broad-except
             return {"error": f"Inference engine detection failed: {str(e)}"}

@@ -771,7 +788,7 @@ class UnsupportedOSSystemInfo(SystemInfo):
             "error": "Device detection not supported on this operating system",
         }

-    def get_amd_igpu_device(self) -> dict:
+    def get_amd_igpu_device(self, include_inference_engines: bool = False) -> dict:
         """
         Retrieves AMD integrated GPU device information for unsupported OS.
         """
@@ -780,7 +797,7 @@ class UnsupportedOSSystemInfo(SystemInfo):
             "error": "Device detection not supported on this operating system",
         }

-    def get_amd_dgpu_devices(self) -> list:
+    def get_amd_dgpu_devices(self, include_inference_engines: bool = False) -> list:
         """
         Retrieves AMD discrete GPU device information for unsupported OS.
         """
@@ -68,7 +68,9 @@ class LlamaCppBench(Bench):
         # and error handling
         model.time_to_first_token = None
         model.tokens_per_second = None
-        raw_output, stderr = model.generate(prompt, return_raw=True)
+        raw_output, stderr = model.generate(
+            prompt, max_new_tokens=output_tokens, return_raw=True
+        )

         if model.time_to_first_token is None or model.tokens_per_second is None:
             error_msg = (
@@ -65,6 +65,13 @@ class LoadLlamaCpp(FirstTool):
             help="Set this flag to indicate the model is a reasoning model",
         )

+        parser.add_argument(
+            "--backend",
+            choices=["vulkan", "rocm"],
+            default="vulkan",
+            help="Backend to use for llama.cpp (default: vulkan)",
+        )
+
         return parser

     def run(
@@ -76,6 +83,7 @@ class LoadLlamaCpp(FirstTool):
         threads: int = 1,
         output_tokens: int = 512,
         reasoning: bool = False,
+        backend: str = "vulkan",
     ) -> State:
         """
         Load a llama.cpp model
@@ -93,8 +101,7 @@ class LoadLlamaCpp(FirstTool):
             LlamaCppAdapter,
         )

-        # Validate and install llama.cpp, if needed
-        install_llamacpp()
+        install_llamacpp(backend)

         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
@@ -153,7 +160,7 @@ class LoadLlamaCpp(FirstTool):
            full_model_path = snapshot_files["variant"]
            model_to_use = os.path.basename(full_model_path)

-           llama_cli_exe_path = get_llama_cli_exe_path()
+           llama_cli_exe_path = get_llama_cli_exe_path(backend)
            printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")

            # Get the directory containing the executable for shared libraries
@@ -175,7 +182,9 @@ class LoadLlamaCpp(FirstTool):

         # Save initial stats
         state.save_stat(Keys.DEVICE, device)
-        state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
+        state.save_stat(
+            Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
+        )

         status.add_to_state(state=state, name=input, model=model_to_use)
         return state
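
Taken together, the new backend argument threads from the CLI flag through installation, binary lookup, and version reporting. A minimal sketch of that flow, assuming these helpers are importable from lemonade.tools.llamacpp.utils as in the surrounding code:

    backend = "rocm"  # or "vulkan" (the default)
    install_llamacpp(backend)                       # install the backend-specific build if needed
    cli_path = get_llama_cli_exe_path(backend)      # backend-specific llama-cli binary
    version = get_llama_installed_version(backend)  # version of the installed backend build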