lemonade-sdk 8.1.5__tar.gz → 8.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of lemonade-sdk has been flagged as potentially problematic.
- {lemonade_sdk-8.1.5/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.6}/PKG-INFO +1 -1
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/utils.py +5 -1
- lemonade_sdk-8.1.6/src/lemonade/tools/server/llamacpp.py +255 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/serve.py +15 -22
- lemonade_sdk-8.1.6/src/lemonade/tools/server/wrapped_server.py +485 -0
- lemonade_sdk-8.1.6/src/lemonade/version.py +1 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6/src/lemonade_sdk.egg-info}/PKG-INFO +1 -1
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/cli.py +18 -9
- lemonade_sdk-8.1.6/src/lemonade_server/model_manager.py +455 -0
- lemonade_sdk-8.1.5/src/lemonade/tools/server/llamacpp.py +0 -653
- lemonade_sdk-8.1.5/src/lemonade/version.py +0 -1
- lemonade_sdk-8.1.5/src/lemonade_server/model_manager.py +0 -274
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/LICENSE +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/NOTICE.md +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/README.md +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/pyproject.toml +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/setup.cfg +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/setup.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/cache.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/inference_engines.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/network.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/status.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/huggingface/utils.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/llamacpp/load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/bench.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/oga/utils.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/prompt.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/quark_load.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/quark/quark_quantize.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/report/table.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/chat.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/models.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/js/shared.js +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/styles.css +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/static/webapp.html +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/tray.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/port.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/system_tray.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_install/install.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/requires.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/pydantic_models.py +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/server_models.json +0 -0
- {lemonade_sdk-8.1.5 → lemonade_sdk-8.1.6}/src/lemonade_server/settings.py +0 -0
src/lemonade/tools/llamacpp/utils.py (+5 -1):

```diff
@@ -346,7 +346,11 @@ def install_llamacpp(backend):
 
     # Identify and set HIP ID
     if backend == "rocm":
-        hip_id = identify_hip_id()
+        try:
+            hip_id = identify_hip_id()
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            hip_id = 0
+            logging.warning(f"Error identifying HIP ID: {e}. Falling back to 0.")
         env_file_path = os.path.join(llama_server_exe_dir, ".env")
         set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))
 
```
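The `HIP_VISIBLE_DEVICES` value written to the `.env` file here is read back at launch time by the new `LlamaServer` class in the next file, via `load_dotenv`. Below is a minimal sketch of that round trip using python-dotenv; the install directory is a hypothetical stand-in, not a path from the package.

```python
# Minimal sketch of the .env round trip; paths are illustrative, not from the package.
import os
import tempfile

from dotenv import load_dotenv, set_key

# Hypothetical stand-in for the llama-server install directory.
exe_dir = tempfile.mkdtemp()
env_file_path = os.path.join(exe_dir, ".env")
open(env_file_path, "a", encoding="utf-8").close()  # ensure the file exists

# Install time (utils.py side): pin llama-server to one ROCm device, with 0 as the fallback.
set_key(env_file_path, "HIP_VISIBLE_DEVICES", "0")

# Launch time (llamacpp.py side): merge the .env values into the subprocess environment.
load_dotenv(env_file_path, override=True)
env = os.environ.copy()
assert env["HIP_VISIBLE_DEVICES"] == "0"
```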
src/lemonade/tools/server/llamacpp.py (new file, +255 lines):

```python
import os
import logging
import subprocess
import re
import threading
import platform

from dotenv import load_dotenv

from lemonade_server.pydantic_models import (
    PullConfig,
)
from lemonade.tools.llamacpp.utils import (
    get_llama_server_exe_path,
    install_llamacpp,
    download_gguf,
)
from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer


class LlamaTelemetry(WrappedServerTelemetry):
    """
    Manages telemetry data collection and display for llama server.
    """

    def parse_telemetry_line(self, line: str):
        """
        Parse telemetry data from llama server output lines.
        """

        # Parse Vulkan device detection
        vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
        if vulkan_match:
            device_count = int(vulkan_match.group(1))
            if device_count > 0:
                logging.info(
                    f"GPU acceleration active: {device_count} device(s) "
                    "detected by llama-server"
                )
            return

        # Parse prompt evaluation line
        prompt_match = re.search(
            r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
            r"([\d.]+)\s*tokens per second",
            line,
        )
        if prompt_match:
            prompt_time_ms = float(prompt_match.group(1))
            input_tokens = int(prompt_match.group(2))

            self.prompt_eval_time = prompt_time_ms / 1000.0
            self.input_tokens = input_tokens
            self.time_to_first_token = prompt_time_ms / 1000.0
            return

        # Parse generation evaluation line
        eval_match = re.search(
            r"eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
            r"([\d.]+)\s*tokens per second",
            line,
        )
        if eval_match:
            eval_time_ms = float(eval_match.group(1))
            output_tokens = int(eval_match.group(2))
            tokens_per_second = float(eval_match.group(3))

            self.eval_time = eval_time_ms / 1000.0
            self.output_tokens = output_tokens
            self.tokens_per_second = tokens_per_second
            return


class LlamaServer(WrappedServer):
    def __init__(self, backend: str):
        self.telemetry = LlamaTelemetry()
        self.backend = backend
        super().__init__(server_name="llama-server", telemetry=self.telemetry)

    def install_server(self, backend=None):
        """
        Install the wrapped server
        """
        install_llamacpp(self.backend)

    def download_model(
        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
    ) -> dict:
        """
        Download a model for the wrapper server
        """
        return download_gguf(
            config_checkpoint=config_checkpoint,
            config_mmproj=config_mmproj,
            do_not_upgrade=do_not_upgrade,
        )

    def _launch_device_backend_subprocess(
        self,
        snapshot_files: dict,
        use_gpu: bool,
        ctx_size: int,
        supports_embeddings: bool = False,
        supports_reranking: bool = False,
    ) -> subprocess.Popen:
        """
        Launch llama server subprocess with appropriate configuration.

        Args:
            snapshot_files: Dictionary of model files to load
            use_gpu: Whether to use GPU acceleration
            telemetry: Telemetry object for tracking performance metrics
            backend: Backend to use (e.g., 'vulkan', 'rocm')
            supports_embeddings: Whether the model supports embeddings
            supports_reranking: Whether the model supports reranking

        Returns:
            Subprocess handle for the llama server
        """

        # Get the current executable path (handles both Windows and Ubuntu structures)
        exe_path = get_llama_server_exe_path(self.backend)

        # Build the base command
        base_command = [
            exe_path,
            "-m",
            snapshot_files["variant"],
            "--ctx-size",
            str(ctx_size),
        ]

        # Lock random seed for deterministic behavior in CI
        if os.environ.get("LEMONADE_CI_MODE"):
            base_command.extend(["--seed", "42"])
            logging.info(f"Seed applied to base command: {base_command}")

        if "mmproj" in snapshot_files:
            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
            if not use_gpu:
                base_command.extend(["--no-mmproj-offload"])

        # Find a port, and save it in the telemetry object for future reference
        # by other functions
        self.choose_port()

        # Add port and jinja to enable tool use
        base_command.extend(["--port", str(self.port), "--jinja"])

        # Disable jinja for gpt-oss-120b on Vulkan
        if (
            self.backend == "vulkan"
            and "gpt-oss-120b" in snapshot_files["variant"].lower()
        ):
            base_command.remove("--jinja")
            logging.warning(
                "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
                "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
                "The model cannot use tools. If needed, use the ROCm backend instead."
            )

        # Use legacy reasoning formatting, since not all apps support the new
        # reasoning_content field
        base_command.extend(["--reasoning-format", "none"])

        # Add embeddings support if the model supports it
        if supports_embeddings:
            base_command.append("--embeddings")

        # Add reranking support if the model supports it
        if supports_reranking:
            base_command.append("--reranking")

        # Configure GPU layers: 99 for GPU, 0 for CPU-only
        ngl_value = "99" if use_gpu else "0"
        command = base_command + ["-ngl", ngl_value]

        # Set up environment with library path for Linux
        env = os.environ.copy()

        # Load environment variables from .env file in the executable directory
        exe_dir = os.path.dirname(exe_path)
        env_file_path = os.path.join(exe_dir, ".env")
        if os.path.exists(env_file_path):
            load_dotenv(env_file_path, override=True)
            env.update(os.environ)
            logging.debug(f"Loaded environment variables from {env_file_path}")

        if platform.system().lower() == "linux":
            lib_dir = os.path.dirname(exe_path)  # Same directory as the executable
            current_ld_path = env.get("LD_LIBRARY_PATH", "")
            if current_ld_path:
                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
            else:
                env["LD_LIBRARY_PATH"] = lib_dir
            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")

        # Start subprocess with output capture
        self.process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding="utf-8",
            errors="replace",
            bufsize=1,
            env=env,
        )

        # Start background thread to log subprocess output
        device_type = "GPU" if use_gpu else "CPU"
        threading.Thread(
            target=self._log_subprocess_output,
            args=(f"LLAMA SERVER {device_type}",),
            daemon=True,
        ).start()

    def _launch_server_subprocess(
        self,
        model_config: PullConfig,
        snapshot_files: dict,
        ctx_size: int,
        supports_embeddings: bool = False,
        supports_reranking: bool = False,
    ):

        # Attempt loading on GPU first
        self._launch_device_backend_subprocess(
            snapshot_files,
            use_gpu=True,
            ctx_size=ctx_size,
            supports_embeddings=supports_embeddings,
            supports_reranking=supports_reranking,
        )

        # Check the /health endpoint until GPU server is ready
        self._wait_for_load()

        # If loading on GPU failed, try loading on CPU
        if self.process.poll():
            logging.warning(
                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
            )

            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
                # Used for testing, when the test should fail if GPU didn't work
                raise Exception("llamacpp GPU loading failed")

            self._launch_device_backend_subprocess(
                snapshot_files,
                use_gpu=False,
                ctx_size=ctx_size,
                supports_embeddings=supports_embeddings,
                supports_reranking=supports_reranking,
            )
```
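To illustrate the telemetry parsing in `LlamaTelemetry.parse_telemetry_line`, here is the prompt-eval regex from the file above applied to a llama-server-style timing line. The sample log line is an assumed example for demonstration; only the regex comes from the diff.

```python
# Assumed sample of llama-server timing output; only the regex below comes from the diff.
import re

line = (
    "prompt eval time =      94.33 ms /    10 tokens "
    "(    9.43 ms per token,   106.01 tokens per second)"
)

prompt_match = re.search(
    r"prompt eval time\s*=\s*([\d.]+)\s*ms\s*/\s*(\d+)\s*tokens.*?"
    r"([\d.]+)\s*tokens per second",
    line,
)
if prompt_match:
    time_to_first_token = float(prompt_match.group(1)) / 1000.0  # seconds
    input_tokens = int(prompt_match.group(2))
    print(time_to_first_token, input_tokens)  # 0.09433 10
```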
src/lemonade/tools/server/serve.py (+15 -22):

```diff
@@ -9,7 +9,6 @@ import tempfile
 import traceback
 from typing import Optional, Union
 import json
-import subprocess
 from pathlib import Path
 
 from fastapi import FastAPI, HTTPException, status, Request
@@ -47,7 +46,8 @@ from openai.types.responses import (
 )
 
 import lemonade.api as lemonade_api
-
+from lemonade.tools.server.wrapped_server import WrappedServer
+from lemonade.tools.server.llamacpp import LlamaServer
 from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
 from lemonade.tools.server.webapp import get_webapp_html
 from lemonade.tools.server.utils.port import lifespan
@@ -232,11 +232,8 @@ class Server:
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()
 
-        # Subprocess handle for llama_server.exe
-        self.
-
-        # Telemetry instance for llama server
-        self.llama_telemetry = llamacpp.LlamaTelemetry()
+        # Subprocess handle for wrapped instance of llama_server.exe, etc.
+        self.wrapped_server: WrappedServer = None
 
     def setup_routes(self, api_prefixes: list[str]):
         for prefix in api_prefixes:
@@ -521,7 +518,7 @@ class Server:
         await self.load_llm(lc)
 
         if self.llm_loaded.recipe == "llamacpp":
-            return
+            return self.wrapped_server.completion(completion_request)
 
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
@@ -656,9 +653,7 @@ class Server:
         await self.load_llm(lc)
 
         if self.llm_loaded.recipe == "llamacpp":
-            return
-                chat_completion_request, self.llama_telemetry
-            )
+            return self.wrapped_server.chat_completion(chat_completion_request)
 
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
@@ -861,7 +856,7 @@ class Server:
 
         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.embeddings(embeddings_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has embeddings label
                 model_info = ModelManager().supported_models.get(
@@ -884,7 +879,7 @@ class Server:
 
     async def reranking(self, reranking_request: RerankingRequest):
         """
-        Rerank documents based on their relevance to a query
+        Rerank documents based on their relevance to a query.
         """
         # Initialize load config from reranking request
         lc = LoadConfig(model_name=reranking_request.model)
@@ -894,7 +889,7 @@ class Server:
 
         if self.llm_loaded.recipe == "llamacpp":
             try:
-                return
+                return self.wrapped_server.reranking(reranking_request)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 # Check if model has reranking label
                 model_info = ModelManager().supported_models.get(
@@ -1287,7 +1282,7 @@ class Server:
         """
         # If using llama server, get telemetry from the telemetry instance
        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
-            return self.
+            return self.wrapped_server.telemetry.get_telemetry_data()
 
         # For built-in server, use the existing telemetry
         return {
@@ -1466,9 +1461,9 @@ class Server:
         ):
             if (
                 self.llm_loaded.recipe == "llamacpp"
-                and self.
+                and self.wrapped_server.process.poll()
             ):
-                #
+                # wrapped server process has gone away for some reason, so we should
                 # proceed with loading to get it back
                 pass
             else:
@@ -1484,12 +1479,10 @@ class Server:
         logging.info(f"Loading llm: {config.model_name}")
         try:
             if config_to_use.recipe == "llamacpp":
-                self.
+                self.wrapped_server = LlamaServer(self.llamacpp_backend)
+                self.wrapped_server.load(
                     model_config=config_to_use,
-                    telemetry=self.llama_telemetry,
-                    backend=self.llamacpp_backend,
                     ctx_size=self.ctx_size,
-                    # Models should only upgrade when using the pull endpoint
                     do_not_upgrade=True,
                 )
 
@@ -1530,7 +1523,7 @@ class Server:
         await self._generate_semaphore.acquire()
 
         if self.llm_loaded.recipe == "llamacpp":
-            self.
+            self.wrapped_server.process.terminate()
 
         self.llm_loaded = None
         self.tokenizer = None
```

(Several removed lines above appear truncated, e.g. `self.`, `return`, because their content was cut off in the published diff view; they are reproduced as shown.)
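The new `wrapped_server.py` (+485 lines) is not included in the hunks above, so the exact `WrappedServer` API is not visible here. Based purely on the call sites in `serve.py` and the overrides in `LlamaServer`, its surface looks roughly like the sketch below; treat it as an inference, not the actual implementation.

```python
# Inferred interface sketch only: reconstructed from call sites in serve.py and
# llamacpp.py above, not from the actual wrapped_server.py added in 8.1.6.
import subprocess
from abc import ABC, abstractmethod
from typing import Optional


class WrappedServerTelemetry(ABC):
    """Collects telemetry by parsing the wrapped server's log output."""

    @abstractmethod
    def parse_telemetry_line(self, line: str) -> None: ...

    def get_telemetry_data(self) -> dict:
        # Consumed by Server.get_telemetry via wrapped_server.telemetry
        return {}


class WrappedServer(ABC):
    """Manages a subprocess that serves an OpenAI-compatible HTTP API."""

    def __init__(self, server_name: str, telemetry: WrappedServerTelemetry):
        self.server_name = server_name
        self.telemetry = telemetry
        self.process: Optional[subprocess.Popen] = None  # polled/terminated by serve.py
        self.port: Optional[int] = None  # set by choose_port()

    # Implemented by subclasses such as LlamaServer:
    @abstractmethod
    def install_server(self, backend=None) -> None: ...

    @abstractmethod
    def download_model(
        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
    ) -> dict: ...

    @abstractmethod
    def _launch_server_subprocess(
        self, model_config, snapshot_files, ctx_size, **kwargs
    ) -> None: ...

    # Called by serve.py: install, download, launch, then proxy requests.
    def load(self, model_config, ctx_size, do_not_upgrade=True) -> None: ...
    def completion(self, completion_request): ...
    def chat_completion(self, chat_completion_request): ...
    def embeddings(self, embeddings_request): ...
    def reranking(self, reranking_request): ...
    def choose_port(self) -> None: ...
    def _wait_for_load(self) -> None: ...
    def _log_subprocess_output(self, prefix: str) -> None: ...
```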