lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/cli.py +47 -5
- lemonade/common/inference_engines.py +13 -4
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +544 -1
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +303 -0
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +393 -33
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +60 -121
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +220 -553
- lemonade/tools/server/serve.py +684 -168
- lemonade/tools/server/static/js/chat.js +666 -342
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +597 -73
- lemonade/tools/server/static/js/shared.js +79 -14
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +491 -66
- lemonade/tools/server/static/webapp.html +83 -31
- lemonade/tools/server/tray.py +158 -38
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
- lemonade/tools/server/webapp.py +4 -1
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/version.py +1 -1
- lemonade_install/install.py +54 -611
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
- lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
- lemonade_server/cli.py +145 -37
- lemonade_server/model_manager.py +521 -37
- lemonade_server/pydantic_models.py +28 -1
- lemonade_server/server_models.json +246 -92
- lemonade_server/settings.py +39 -39
- lemonade/tools/quark/__init__.py +0 -0
- lemonade/tools/quark/quark_load.py +0 -173
- lemonade/tools/quark/quark_quantize.py +0 -439
- lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
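The headline change in this release is a rework of the llama.cpp server integration. As the lemonade/tools/server/llamacpp.py diff below shows, the 8.1.4 module-level helpers (_separate_openai_params, _launch_llama_subprocess, server_load, chat_completion, embeddings, reranking) are removed, and the module now defines a LlamaServer class built on the new lemonade/tools/server/wrapped_server.py, with LlamaTelemetry subclassing WrappedServerTelemetry. A minimal usage sketch, assuming only the interface visible in the diff; the real call sites live in serve.py and wrapped_server.py (not shown here), and the checkpoint string is made up:

# Hypothetical driver code, for illustration only: the class and method names come
# from the diff below, but this call sequence and the checkpoint string are assumptions.
from lemonade.tools.server.llamacpp import LlamaServer

server = LlamaServer(backend="vulkan")   # backend string, e.g. "vulkan" or "rocm"
server.install_server()                  # wraps install_llamacpp(self.backend)
snapshot_files = server.download_model(
    "unsloth/Qwen3-0.6B-GGUF:Q4_0"       # checks the local GGUF cache first, then download_gguf()
)
# serve.py would then call server._launch_server_subprocess(model_config, snapshot_files,
# ctx_size, ...), which attempts a GPU launch and falls back to CPU if the process exits.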
lemonade/tools/server/llamacpp.py

@@ -1,148 +1,54 @@
 import os
 import logging
-import time
 import subprocess
 import re
 import threading
 import platform

-import requests
-from tabulate import tabulate
 from dotenv import load_dotenv
 from fastapi import HTTPException, status
-from fastapi.responses import StreamingResponse
-
-from openai import OpenAI

 from lemonade_server.pydantic_models import (
-    ChatCompletionRequest,
-    CompletionRequest,
     PullConfig,
-    EmbeddingsRequest,
-    RerankingRequest,
 )
-from lemonade_server.model_manager import ModelManager
-from lemonade.tools.server.utils.port import find_free_port
 from lemonade.tools.llamacpp.utils import (
     get_llama_server_exe_path,
     install_llamacpp,
     download_gguf,
+    resolve_local_gguf_model,
+    parse_checkpoint,
 )
+from lemonade.tools.server.wrapped_server import WrappedServerTelemetry, WrappedServer

-
-
-
-
-
-    Args:
-        port: The port number the llamacpp server is running on
-
-    Returns:
-        The base URL for the llamacpp server
-    """
-    return f"http://127.0.0.1:{port}/v1"
+# Embedding model batch configuration set to 8192 as default
+EMBEDDING_CTX_SIZE = 8192
+EMBEDDING_BATCH_SIZE = 8192
+EMBEDDING_UBATCH_SIZE = 8192


-
-    """
-    Separate standard OpenAI parameters from custom llama.cpp parameters.
-
-    Args:
-        request_dict: Dictionary of all request parameters
-        endpoint_type: Type of endpoint ("chat" or "completion")
-
-    Returns:
-        Dictionary with parameters properly separated for OpenAI client
-    """
-    openai_client_params = {}
-    extra_params = {}
-
-    # Common OpenAI parameters for both endpoint types
-    common_params = {
-        "model",
-        "frequency_penalty",
-        "logit_bias",
-        "logprobs",
-        "max_tokens",
-        "n",
-        "presence_penalty",
-        "seed",
-        "stop",
-        "stream",
-        "temperature",
-        "top_p",
-        "user",
-    }
-
-    # Standard OpenAI parameters by endpoint type
-    if endpoint_type == "chat":
-        chat_specific_params = {
-            "messages",
-            "top_logprobs",
-            "response_format",
-            "service_tier",
-            "stream_options",
-            "tools",
-            "tool_choice",
-            "parallel_tool_calls",
-        }
-        openai_params = common_params | chat_specific_params
-    else: # completion
-        completion_specific_params = {
-            "prompt",
-            "best_of",
-            "echo",
-            "suffix",
-        }
-        openai_params = common_params | completion_specific_params
-
-    for key, value in request_dict.items():
-        if key in openai_params:
-            openai_client_params[key] = value
-        else:
-            extra_params[key] = value
-
-    # If there are custom parameters, use extra_body to pass them through
-    if extra_params:
-        openai_client_params["extra_body"] = extra_params
-
-    return openai_client_params
-
-
-class LlamaTelemetry:
+class LlamaTelemetry(WrappedServerTelemetry):
     """
     Manages telemetry data collection and display for llama server.
     """

-    def __init__(self):
-        self.input_tokens = None
-        self.output_tokens = None
-        self.time_to_first_token = None
-        self.tokens_per_second = None
-        self.prompt_eval_time = None
-        self.eval_time = None
-        self.port = None
-
-    def choose_port(self):
+    def parse_telemetry_line(self, line: str):
         """
-
-        search for an empty port
+        Parse telemetry data from llama server output lines.
         """

-
-
-
-
+        if "vk::PhysicalDevice::createDevice: ErrorExtensionNotPresent" in line:
+            msg = (
+                "Your AMD GPU driver version is not compatible with this software.\n"
+                "Please update and try again: "
+                "https://www.amd.com/en/support/download/drivers.html"
+            )
             logging.error(msg)
             raise HTTPException(
                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                 detail=msg,
             )
-
-
-        """
-        Parse telemetry data from llama server output lines.
-        """
+        elif "error" in line.lower():
+            logging.error(line)

         # Parse Vulkan device detection
         vulkan_match = re.search(r"ggml_vulkan: Found (\d+) Vulkan devices?:", line)
@@ -186,468 +92,229 @@ class LlamaTelemetry:
             self.tokens_per_second = tokens_per_second
             return

-    def get_telemetry_data(self):
-        return {
-            "input_tokens": self.input_tokens,
-            "output_tokens": self.output_tokens,
-            "time_to_first_token": self.time_to_first_token,
-            "tokens_per_second": self.tokens_per_second,
-            "decode_token_times": None,
-        }
-
-    def show_telemetry(self):
-        # Check if debug logging is enabled
-        if not logging.getLogger().isEnabledFor(logging.DEBUG):
-            return

-
-
-
-
-            ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
-            ["TPS", f"{self.tokens_per_second:.2f}"],
-        ]
+class LlamaServer(WrappedServer):
+    def __init__(self, backend: str):
+        self.backend = backend
+        super().__init__(server_name="llama-server", telemetry=LlamaTelemetry())

-
-
-
+    def install_server(self, backend=None):
+        """
+        Install the wrapped server
+        """
+        install_llamacpp(self.backend)

-
-
+    def download_model(
+        self, config_checkpoint, config_mmproj=None, do_not_upgrade=False
+    ) -> dict:
+        """
+        Download a model for the wrapper server.
+        First checks local cache, then downloads from internet if needed.
+        """
+        # If it's a direct file path, just return it
+
+        if os.path.exists(config_checkpoint):
+            result = {"variant": config_checkpoint}
+            if config_mmproj:
+                result["mmproj"] = config_mmproj
+            return result
+
+        # Try to resolve from local cache first to avoid unnecessary downloads
+        checkpoint, variant = parse_checkpoint(config_checkpoint)
+        local_result = resolve_local_gguf_model(checkpoint, variant, config_mmproj)
+
+        if local_result:
+            return local_result
+
+        # Not found locally - download from internet
+        return download_gguf(
+            config_checkpoint=config_checkpoint,
+            config_mmproj=config_mmproj,
+            do_not_upgrade=do_not_upgrade,
+        )

+    def _launch_device_backend_subprocess(
+        self,
+        snapshot_files: dict,
+        use_gpu: bool,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ) -> subprocess.Popen:
+        """
+        Launch llama server subprocess with appropriate configuration.
+
+        Args:
+            snapshot_files: Dictionary of model files to load
+            use_gpu: Whether to use GPU acceleration
+            telemetry: Telemetry object for tracking performance metrics
+            backend: Backend to use (e.g., 'vulkan', 'rocm')
+            supports_embeddings: Whether the model supports embeddings
+            supports_reranking: Whether the model supports reranking
+
+        Returns:
+            Subprocess handle for the llama server
+        """

-
-
-
-
-
-
+        # Get the current executable path (handles both Windows and Ubuntu structures)
+        exe_path = get_llama_server_exe_path(self.backend)
+
+        # For embedding models, use a larger context size to support longer individual
+        # strings. Embedding requests can include multiple strings in a batch, and each
+        # string needs to fit within the context window.
+        if supports_embeddings and ctx_size < EMBEDDING_CTX_SIZE:
+            ctx_size = EMBEDDING_CTX_SIZE
+
+        # Build the base command
+        base_command = [
+            exe_path,
+            "-m",
+            snapshot_files["variant"],
+            "--ctx-size",
+            str(ctx_size),
+        ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        except requests.exceptions.ConnectionError:
-            logging.debug("Not able to connect to llama-server yet, will retry")
+        # Lock random seed for deterministic behavior in CI
+        if os.environ.get("LEMONADE_CI_MODE"):
+            base_command.extend(["--seed", "42"])
+            logging.info(f"Seed applied to base command: {base_command}")
+
+        if "mmproj" in snapshot_files:
+            base_command.extend(["--mmproj", snapshot_files["mmproj"]])
+            if not use_gpu:
+                base_command.extend(["--no-mmproj-offload"])
+
+        # Find a port, and save it in the telemetry object for future reference
+        # by other functions
+        self._choose_port()
+
+        # Add port and jinja to enable tool use
+        base_command.extend(["--port", str(self.port), "--jinja"])
+
+        # Enable context shift and avoid attention sink issues by preserving the initial tokens
+        # Note: --context-shift is not supported on all backends (e.g., Metal on macOS)
+        # Only add context-shift for backends that support it
+        context_shift_supported_backends = ["vulkan", "rocm"]
+        if self.backend in context_shift_supported_backends:
+            base_command.extend(["--context-shift", "--keep", "16"])
         else:
-
+            # For backends that don't support context-shift (e.g., Metal), just use keep
+            base_command.extend(["--keep", "16"])
             logging.debug(
-                "
-                f"result: {health_response.json()}"
+                f"Skipped --context-shift for backend: {self.backend} (not supported)"
             )
-        time.sleep(1)
-
-
-def _launch_llama_subprocess(
-    snapshot_files: dict,
-    use_gpu: bool,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    supports_embeddings: bool = False,
-    supports_reranking: bool = False,
-) -> subprocess.Popen:
-    """
-    Launch llama server subprocess with appropriate configuration.
-
-    Args:
-        snapshot_files: Dictionary of model files to load
-        use_gpu: Whether to use GPU acceleration
-        telemetry: Telemetry object for tracking performance metrics
-        backend: Backend to use (e.g., 'vulkan', 'rocm')
-        supports_embeddings: Whether the model supports embeddings
-        supports_reranking: Whether the model supports reranking
-
-    Returns:
-        Subprocess handle for the llama server
-    """
-
-    # Get the current executable path (handles both Windows and Ubuntu structures)
-    exe_path = get_llama_server_exe_path(backend)
-
-    # Build the base command
-    base_command = [
-        exe_path,
-        "-m",
-        snapshot_files["variant"],
-        "--ctx-size",
-        str(ctx_size),
-    ]
-
-    # Lock random seed for deterministic behavior in CI
-    if os.environ.get("LEMONADE_CI_MODE"):
-        base_command.extend(["--seed", "42"])
-
-    if "mmproj" in snapshot_files:
-        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
-        if not use_gpu:
-            base_command.extend(["--no-mmproj-offload"])
-
-    # Find a port, and save it in the telemetry object for future reference
-    # by other functions
-    telemetry.choose_port()
-
-    # Add port and jinja to enable tool use
-    base_command.extend(["--port", str(telemetry.port), "--jinja"])
-
-    # Disable jinja for gpt-oss-120b on Vulkan
-    if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
-        base_command.remove("--jinja")
-        logging.warning(
-            "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
-            "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
-            "The model cannot use tools. If needed, use the ROCm backend instead."
-        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Load environment variables from .env file in the executable directory
-    exe_dir = os.path.dirname(exe_path)
-    env_file_path = os.path.join(exe_dir, ".env")
-    if os.path.exists(env_file_path):
-        load_dotenv(env_file_path, override=True)
-        env.update(os.environ)
-        logging.debug(f"Loaded environment variables from {env_file_path}")
-
-    if platform.system().lower() == "linux":
-        lib_dir = os.path.dirname(exe_path) # Same directory as the executable
-        current_ld_path = env.get("LD_LIBRARY_PATH", "")
-        if current_ld_path:
-            env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
-        else:
-            env["LD_LIBRARY_PATH"] = lib_dir
-        logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
-
-    # Start subprocess with output capture
-    process = subprocess.Popen(
-        command,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-        bufsize=1,
-        env=env,
-    )
-
-    # Start background thread to log subprocess output
-    device_type = "GPU" if use_gpu else "CPU"
-    threading.Thread(
-        target=_log_subprocess_output,
-        args=(process, f"LLAMA SERVER {device_type}", telemetry),
-        daemon=True,
-    ).start()
-
-    return process
-
-
-def server_load(
-    model_config: PullConfig,
-    telemetry: LlamaTelemetry,
-    backend: str,
-    ctx_size: int,
-    do_not_upgrade: bool = False,
-):
-    # Install and/or update llama.cpp if needed
-    try:
-        install_llamacpp(backend)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
-        )
+        # Use legacy reasoning formatting, since not all apps support the new
+        # reasoning_content field
+        base_command.extend(["--reasoning-format", "auto"])
+
+        # Add embeddings support if the model supports it
+        if supports_embeddings:
+            # For embedding models, set batch sizes to handle multiple documents in a single request
+            # batch-size: logical batch size (total tokens across all sequences)
+            # ubatch-size: physical batch size (tokens processed in a single forward pass)
+            base_command.extend(
+                [
+                    "--embeddings",
+                    "--batch-size",
+                    str(EMBEDDING_BATCH_SIZE),
+                    "--ubatch-size",
+                    str(EMBEDDING_UBATCH_SIZE),
                ]
+            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Add reranking support if the model supports it
+        if supports_reranking:
+            base_command.append("--reranking")
+
+        # Configure GPU layers: 99 for GPU, 0 for CPU-only
+        ngl_value = "99" if use_gpu else "0"
+        command = base_command + ["-ngl", ngl_value]
+
+        # Set up environment with library path for Linux
+        env = os.environ.copy()
+
+        # Load environment variables from .env file in the executable directory
+        exe_dir = os.path.dirname(exe_path)
+        env_file_path = os.path.join(exe_dir, ".env")
+        if os.path.exists(env_file_path):
+            load_dotenv(env_file_path, override=False)
+            env.update(os.environ)
+            logging.debug(f"Loaded environment variables from {env_file_path}")
+
+        system = platform.system().lower()
+        if system == "linux":
+            lib_dir = os.path.dirname(exe_path) # Same directory as the executable
+            current_ld_path = env.get("LD_LIBRARY_PATH", "")
+            if current_ld_path:
+                env["LD_LIBRARY_PATH"] = f"{lib_dir}:{current_ld_path}"
+            else:
+                env["LD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set LD_LIBRARY_PATH to {env['LD_LIBRARY_PATH']}")
+        elif system == "darwin":
+            lib_dir = os.path.dirname(exe_path)
+            current_dyld_path = env.get("DYLD_LIBRARY_PATH", "")
+            if current_dyld_path:
+                env["DYLD_LIBRARY_PATH"] = f"{lib_dir}:{current_dyld_path}"
+            else:
+                env["DYLD_LIBRARY_PATH"] = lib_dir
+            logging.debug(f"Set DYLD_LIBRARY_PATH to {env['DYLD_LIBRARY_PATH']}")
+
+        # Start subprocess with output capture
+        self.process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            bufsize=1,
+            env=env,
         )

-
-
-
-
-
+        # Start background thread to log subprocess output
+        device_type = "GPU" if use_gpu else "CPU"
+        threading.Thread(
+            target=self._log_subprocess_output,
+            args=(f"LLAMA SERVER {device_type}",),
+            daemon=True,
+        ).start()
+
+    def _launch_server_subprocess(
+        self,
+        model_config: PullConfig,
+        snapshot_files: dict,
+        ctx_size: int,
+        supports_embeddings: bool = False,
+        supports_reranking: bool = False,
+    ):
+
+        # Attempt loading on GPU first
+        self._launch_device_backend_subprocess(
             snapshot_files,
-            use_gpu=
-            telemetry=telemetry,
-            backend=backend,
+            use_gpu=True,
             ctx_size=ctx_size,
             supports_embeddings=supports_embeddings,
             supports_reranking=supports_reranking,
         )

-    # Check the /health endpoint until
-    _wait_for_load(
-        llama_server_process,
-        telemetry.port,
-    )
-
-    if llama_server_process.poll():
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=f"Failed to load {model_config.model_name} with llama.cpp",
-        )
-
-    return llama_server_process
-
-
-def chat_completion(
-    chat_completion_request: ChatCompletionRequest, telemetry: LlamaTelemetry
-):
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = chat_completion_request.model_dump(
-        exclude_unset=True, exclude_none=True
-    )
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "chat")
-
-    # Check if streaming is requested
-    if chat_completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.chat.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e: # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.chat.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
+        # Check the /health endpoint until GPU server is ready
+        self._wait_for_load()

-
-
-
-
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Chat completion error: {str(e)}",
+        # If loading on GPU failed, try loading on CPU
+        if self.process.poll():
+            logging.warning(
+                f"Loading {model_config.model_name} on GPU didn't work, re-attempting on CPU"
            )

+            if os.environ.get("LEMONADE_LLAMACPP_NO_FALLBACK"):
+                # Used for testing, when the test should fail if GPU didn't work
+                raise Exception("llamacpp GPU loading failed")

-
-
-
-
-
-
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Completion response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    # Separate standard OpenAI parameters from custom llama.cpp parameters
-    openai_client_params = _separate_openai_params(request_dict, "completion")
-
-    # Check if streaming is requested
-    if completion_request.stream:
-
-        def event_stream():
-            try:
-                # Enable streaming
-                # pylint: disable=missing-kwoa
-                for chunk in client.completions.create(**openai_client_params):
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                yield "data: [DONE]\n\n"
-
-                # Show telemetry after completion
-                telemetry.show_telemetry()
-
-            except Exception as e: # pylint: disable=broad-exception-caught
-                yield f'data: {{"error": "{str(e)}"}}\n\n'
-
-        return StreamingResponse(
-            event_stream(),
-            media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-            },
-        )
-    else:
-        # Non-streaming response
-        try:
-            # Disable streaming for non-streaming requests
-            # pylint: disable=missing-kwoa
-            response = client.completions.create(**openai_client_params)
-
-            # Show telemetry after completion
-            telemetry.show_telemetry()
-
-            return response
-
-        except Exception as e: # pylint: disable=broad-exception-caught
-            logging.error("Error during completion: %s", str(e))
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=f"Completion error: {str(e)}",
+            self._launch_device_backend_subprocess(
+                snapshot_files,
+                use_gpu=False,
+                ctx_size=ctx_size,
+                supports_embeddings=supports_embeddings,
+                supports_reranking=supports_reranking,
             )
-
-
-def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
-    """
-    Generate embeddings using the llamacpp server.
-
-    Args:
-        embeddings_request: The embeddings request containing input text/tokens
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Embeddings response from the llamacpp server
-    """
-    base_url = llamacpp_address(telemetry.port)
-    client = OpenAI(
-        base_url=base_url,
-        api_key="lemonade",
-    )
-
-    # Convert Pydantic model to dict and remove unset/null values
-    request_dict = embeddings_request.model_dump(exclude_unset=True, exclude_none=True)
-
-    try:
-        # Call the embeddings endpoint
-        response = client.embeddings.create(**request_dict)
-        return response
-
-    except Exception as e: # pylint: disable=broad-exception-caught
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Embeddings error: {str(e)}",
-        )
-
-
-def reranking(reranking_request: RerankingRequest, telemetry: LlamaTelemetry):
-    """
-    Rerank documents based on their relevance to a query using the llamacpp server.
-
-    Args:
-        reranking_request: The reranking request containing query and documents
-        telemetry: Telemetry object containing the server port
-
-    Returns:
-        Reranking response from the llamacpp server containing ranked documents and scores
-    """
-    base_url = llamacpp_address(telemetry.port)
-
-    try:
-        # Convert Pydantic model to dict and exclude unset/null values
-        request_dict = reranking_request.model_dump(
-            exclude_unset=True, exclude_none=True
-        )
-
-        # Call the reranking endpoint directly since it's not supported by the OpenAI API
-        response = requests.post(
-            f"{base_url}/rerank",
-            json=request_dict,
-        )
-        response.raise_for_status()
-        return response.json()
-
-    except Exception as e:
-        logging.error("Error during reranking: %s", str(e))
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Reranking error: {str(e)}",
-        ) from e