lemonade-sdk 8.1.1__py3-none-any.whl → 8.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic.

@@ -5,7 +5,6 @@ import importlib.metadata
  import subprocess
  from abc import ABC, abstractmethod
  from typing import Dict, Optional
- import transformers


  class InferenceEngineDetector:
@@ -352,6 +351,7 @@ class TransformersDetector(BaseEngineDetector):

  try:
  import torch
+ import transformers

  if device_type == "cpu":
  result = {
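
Together, these two hunks turn transformers into a lazy dependency: the module-level import is removed and the import now happens inside the detection try-block, so the detector module can be imported without transformers installed. A minimal sketch of the pattern, with the function name invented for illustration:

```python
# Hypothetical sketch of deferring an optional dependency until it is needed.
def detect_transformers_version():
    try:
        import torch          # noqa: F401 - only needed inside this probe
        import transformers   # deferred so the module imports without it
    except ImportError:
        return None           # engine unavailable; caller can fall back
    return transformers.__version__
```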
@@ -57,7 +57,7 @@ def identify_rocm_arch_from_name(device_name: str) -> str | None:
  return None


- def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
+ def identify_rocm_arch() -> str:
  """
  Identify the appropriate ROCm target architecture based on the device info
  Returns tuple of (architecture, gpu_type) where gpu_type is 'igpu' or 'dgpu'
@@ -68,21 +68,54 @@ def identify_rocm_arch_and_hip_id() -> tuple[str, str]:
  amd_igpu = system_info.get_amd_igpu_device()
  amd_dgpu = system_info.get_amd_dgpu_devices()
  target_arch = None
- gpu_count = 0
  for gpu in [amd_igpu] + amd_dgpu:
  if gpu.get("available") and gpu.get("name"):
- gpu_count += 1
  target_arch = identify_rocm_arch_from_name(gpu["name"].lower())
  if target_arch:
  break

- # Get HIP ID based on the number of GPUs available
- # Here, we assume that the iGPU will always show up before the dGPUs (if available)
- # We also assume that selecting the dGPU is preferred over the iGPU
- # Multiple GPUs are not supported at the moment
- hip_id = str(gpu_count - 1)
+ return target_arch

- return target_arch, hip_id
+
+ def identify_hip_id() -> str:
+ """
+ Identify the HIP ID
+ """
+ # Get HIP devices
+ hip_devices = get_hip_devices()
+ logging.debug(f"HIP devices found: {hip_devices}")
+ if len(hip_devices) == 0:
+ raise ValueError("No HIP devices found when identifying HIP ID")
+
+ # Identify HIP devices that are compatible with our ROCm builds
+ rocm_devices = []
+ for device in hip_devices:
+ device_id, device_name = device
+ if identify_rocm_arch_from_name(device_name):
+ rocm_devices.append([device_id, device_name])
+ logging.debug(f"ROCm devices found: {rocm_devices}")
+
+ # If no ROCm devices are found, use the last HIP device
+ # This might be needed in some scenarios where HIP reports generic device names
+ # Example: "AMD Radeon Graphics" for STX Halo iGPU on Ubuntu 24.04
+ if len(rocm_devices) == 0:
+ rocm_devices = [hip_devices[-1]]
+ logging.warning(
+ "No ROCm devices found when identifying HIP ID. "
+ f"Falling back to the following device: {rocm_devices[0]}"
+ )
+ elif len(rocm_devices) > 1:
+ logging.warning(
+ f"Multiple ROCm devices found when identifying HIP ID: {rocm_devices}"
+ "The last device will be used."
+ )
+
+ # Select the last device
+ device_selected = rocm_devices[-1]
+ logging.debug(f"Selected ROCm device: {device_selected}")
+
+ # Return the device ID
+ return device_selected[0]


  def get_llama_version(backend: str) -> str:
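
The refactor splits the old identify_rocm_arch_and_hip_id() in two: identify_rocm_arch() still maps GPU names to a ROCm build target, while identify_hip_id() asks the HIP runtime which devices actually exist and picks one. A hedged sketch of the selection rule, with the helper below invented for illustration:

```python
# Sketch only: reproduces the "prefer the last supported device, otherwise
# fall back to the last HIP device" rule used by identify_hip_id().
def pick_hip_device(hip_devices, is_supported):
    """hip_devices is a list of [device_id, device_name] pairs."""
    if not hip_devices:
        raise ValueError("No HIP devices found")
    supported = [d for d in hip_devices if is_supported(d[1])]
    # Generic names (e.g. "AMD Radeon Graphics") may not match any known
    # architecture, in which case the last reported device is used anyway.
    chosen = (supported or hip_devices)[-1]
    return chosen[0]

# Example: iGPU at index 0, dGPU at index 1 -> index 1 is selected
devices = [[0, "AMD Radeon 780M Graphics"], [1, "AMD Radeon RX 7900 XTX"]]
print(pick_hip_device(devices, lambda name: "rx 7900" in name.lower()))  # -> 1
```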
@@ -277,7 +310,7 @@ def install_llamacpp(backend):
  target_arch = None
  if backend == "rocm":
  # Identify the target architecture
- target_arch, hip_id = identify_rocm_arch_and_hip_id()
+ target_arch = identify_rocm_arch()
  if not target_arch:
  system = platform.system().lower()
  if system == "linux":
@@ -293,10 +326,6 @@ def install_llamacpp(backend):
  f"for supported configurations. {hint}"
  )

- # Set HIP_VISIBLE_DEVICES=0 for igpu, =1 for dgpu
- env_file_path = os.path.join(llama_server_exe_dir, ".env")
- set_key(env_file_path, "HIP_VISIBLE_DEVICES", hip_id)
-
  # Direct download for Vulkan/ROCm
  llama_archive_url, filename = get_binary_url_and_filename(backend, target_arch)
  llama_archive_path = os.path.join(llama_server_exe_dir, filename)
@@ -315,6 +344,12 @@ def install_llamacpp(backend):
  else:
  raise NotImplementedError(f"Unsupported archive format: {filename}")

+ # Identify and set HIP ID
+ if backend == "rocm":
+ hip_id = identify_hip_id()
+ env_file_path = os.path.join(llama_server_exe_dir, ".env")
+ set_key(env_file_path, "HIP_VISIBLE_DEVICES", str(hip_id))
+
  # Make executable on Linux - need to update paths after extraction
  if platform.system().lower() == "linux":
  # Re-get the paths since extraction might have changed the directory structure
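
The HIP_VISIBLE_DEVICES pin that the previous hunk deleted comes back here, but only after the ROCm archive is extracted, because identify_hip_id() needs the HIP runtime that ships inside it. A hedged sketch of the .env handoff, assuming python-dotenv is the library behind set_key (paths and the device id are placeholders):

```python
# Illustration only: pin a GPU for llama-server via a .env file.
import os
import tempfile
from dotenv import set_key, dotenv_values

install_dir = tempfile.mkdtemp()               # stand-in for the llama-server dir
env_file = os.path.join(install_dir, ".env")
open(env_file, "a").close()                    # ensure the file exists

set_key(env_file, "HIP_VISIBLE_DEVICES", "1")  # "1" = the device chosen above

# A launcher can later merge these values into the subprocess environment
env = {**os.environ, **dotenv_values(env_file)}
print(env["HIP_VISIBLE_DEVICES"])              # -> "1"
```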
@@ -778,3 +813,68 @@ class LlamaCppAdapter(ModelAdapter):
  error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
  error_msg += f"Command: {' '.join(cmd)}"
  raise Exception(error_msg)
+
+
+ def get_hip_devices():
+ """Get list of HIP devices with their IDs and names."""
+ import ctypes
+ import sys
+ import os
+ import glob
+ from ctypes import c_int, POINTER
+ from ctypes.util import find_library
+
+ # Get llama.cpp path
+ rocm_path = get_llama_folder_path("rocm")
+
+ # Load HIP library
+ hip_library_pattern = (
+ "amdhip64*.dll" if sys.platform.startswith("win") else "libamdhip64*.so"
+ )
+ search_pattern = os.path.join(rocm_path, hip_library_pattern)
+ matching_files = glob.glob(search_pattern)
+ if not matching_files:
+ raise RuntimeError(
+ f"Could not find HIP runtime library matching pattern: {search_pattern}"
+ )
+ try:
+ libhip = ctypes.CDLL(matching_files[0])
+ except OSError:
+ raise RuntimeError(f"Could not load HIP runtime library from {path}")
+
+ # Setup function signatures
+ hipError_t = c_int
+ hipDeviceProp_t = ctypes.c_char * 2048
+ libhip.hipGetDeviceCount.restype = hipError_t
+ libhip.hipGetDeviceCount.argtypes = [POINTER(c_int)]
+ libhip.hipGetDeviceProperties.restype = hipError_t
+ libhip.hipGetDeviceProperties.argtypes = [POINTER(hipDeviceProp_t), c_int]
+ libhip.hipGetErrorString.restype = ctypes.c_char_p
+ libhip.hipGetErrorString.argtypes = [hipError_t]
+
+ # Get device count
+ device_count = c_int()
+ err = libhip.hipGetDeviceCount(ctypes.byref(device_count))
+ if err != 0:
+ logging.error(
+ "hipGetDeviceCount failed:", libhip.hipGetErrorString(err).decode()
+ )
+ return []
+
+ # Get device properties
+ devices = []
+ for i in range(device_count.value):
+ prop = hipDeviceProp_t()
+ err = libhip.hipGetDeviceProperties(ctypes.byref(prop), i)
+ if err != 0:
+ logging.error(
+ f"hipGetDeviceProperties failed for device {i}:",
+ libhip.hipGetErrorString(err).decode(),
+ )
+ continue
+
+ # Extract device name from HIP device properties
+ device_name = ctypes.string_at(prop, 256).decode("utf-8").rstrip("\x00")
+ devices.append([i, device_name])
+
+ return devices
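
get_hip_devices() binds directly to the HIP runtime shipped with the ROCm llama.cpp build (amdhip64*.dll / libamdhip64*.so). For comparison, a system-wide ROCm install can be probed through ctypes.util.find_library, which the code above imports but does not use; a hedged sketch:

```python
# Hypothetical alternative: probe a system-installed HIP runtime instead of the
# bundled one. Requires ROCm to be installed; otherwise this raises RuntimeError.
import ctypes
from ctypes import POINTER, byref, c_int
from ctypes.util import find_library

libname = find_library("amdhip64")            # e.g. libamdhip64.so from /opt/rocm
if libname is None:
    raise RuntimeError("No system HIP runtime found")

libhip = ctypes.CDLL(libname)
libhip.hipGetDeviceCount.restype = c_int
libhip.hipGetDeviceCount.argtypes = [POINTER(c_int)]

count = c_int()
if libhip.hipGetDeviceCount(byref(count)) == 0:   # 0 == hipSuccess
    print(f"HIP reports {count.value} device(s)")
```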
@@ -109,7 +109,7 @@ class Cache(ManagementTool):
  # pylint: disable=pointless-statement,f-string-without-interpolation
  f"""
  A set of functions for managing the lemonade build cache. The default
- cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be
+ cache location is {lemonade_cache.DEFAULT_CACHE_DIR}, and can also be
  selected with
  the global --cache-dir option or the LEMONADE_CACHE_DIR environment variable.

@@ -100,9 +100,10 @@ class OrtGenaiModel(ModelAdapter):
  max_new_tokens=512,
  min_new_tokens=0,
  do_sample=True,
- top_k=50,
- top_p=1.0,
- temperature=0.7,
+ top_k=None,
+ top_p=None,
+ temperature=None,
+ repeat_penalty=None,
  streamer: OrtGenaiStreamer = None,
  pad_token_id=None,
  stopping_criteria=None,
@@ -154,38 +155,58 @@ class OrtGenaiModel(ModelAdapter):
  if random_seed is None:
  random_seed = -1 # In og.Generator, -1 = seed with random device

+ # Get search config if available, otherwise use empty dict
+ # Thanks to the empty dict, if the model doesn't have a built-in search
+ # config, the .get() calls will all just use the default values
+ search_config = {}
  if self.config and "search" in self.config:
  search_config = self.config["search"]
- params.set_search_options(
- do_sample=search_config.get("do_sample", do_sample),
- top_k=search_config.get("top_k", top_k),
- top_p=search_config.get("top_p", top_p),
- temperature=search_config.get("temperature", temperature),
- max_length=max_length_to_use,
- min_length=min_length,
- early_stopping=search_config.get("early_stopping", False),
- length_penalty=search_config.get("length_penalty", 1.0),
- num_beams=search_config.get("num_beams", 1),
- num_return_sequences=search_config.get("num_return_sequences", 1),
- repetition_penalty=search_config.get("repetition_penalty", 1.0),
- past_present_share_buffer=search_config.get(
- "past_present_share_buffer", True
- ),
- random_seed=random_seed,
- # Not currently supported by OGA
- # diversity_penalty=search_config.get('diversity_penalty', 0.0),
- # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
- )
- else:
- params.set_search_options(
- do_sample=do_sample,
- top_k=top_k,
- top_p=top_p,
- temperature=temperature,
- max_length=max_length_to_use,
- min_length=min_length,
- random_seed=random_seed,
- )
+
+ # Apply parameter hierarchy: user provided > search config > defaults
+ default_top_k = 50
+ default_top_p = 1.0
+ default_temperature = 0.7
+ default_repetition_penalty = 1.0
+
+ top_k_to_use = (
+ top_k if top_k is not None else search_config.get("top_k", default_top_k)
+ )
+ top_p_to_use = (
+ top_p if top_p is not None else search_config.get("top_p", default_top_p)
+ )
+ temperature_to_use = (
+ temperature
+ if temperature is not None
+ else search_config.get("temperature", default_temperature)
+ )
+ # Map the llamacpp name, `repeat_penalty`, to the OGA name, `repetition_penalty`
+ repetition_penalty_to_use = (
+ repeat_penalty
+ if repeat_penalty is not None
+ else search_config.get("repetition_penalty", default_repetition_penalty)
+ )
+
+ # Set search options once with all parameters
+ params.set_search_options(
+ do_sample=search_config.get("do_sample", do_sample),
+ top_k=top_k_to_use,
+ top_p=top_p_to_use,
+ temperature=temperature_to_use,
+ repetition_penalty=repetition_penalty_to_use,
+ max_length=max_length_to_use,
+ min_length=min_length,
+ early_stopping=search_config.get("early_stopping", False),
+ length_penalty=search_config.get("length_penalty", 1.0),
+ num_beams=search_config.get("num_beams", 1),
+ num_return_sequences=search_config.get("num_return_sequences", 1),
+ past_present_share_buffer=search_config.get(
+ "past_present_share_buffer", True
+ ),
+ random_seed=random_seed,
+ # Not currently supported by OGA
+ # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+ # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+ )
  params.try_graph_capture_with_max_batch_size(1)

  generator = og.Generator(self.model, params)
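
Switching the generate() defaults to None lets the adapter distinguish "caller did not set this" from "caller asked for the old default", so values now resolve as: explicit argument, then the model's built-in search config, then a hard-coded fallback. A small sketch of that order, with the helper name invented for illustration:

```python
# Hypothetical helper mirroring the "argument > search config > default" rule.
def resolve(user_value, search_config, key, default):
    if user_value is not None:
        return user_value
    return search_config.get(key, default)

search_config = {"temperature": 0.6}                     # e.g. from the model
print(resolve(None, search_config, "temperature", 0.7))  # 0.6 (model config)
print(resolve(0.2, search_config, "temperature", 0.7))   # 0.2 (caller wins)
print(resolve(None, {}, "temperature", 0.7))             # 0.7 (fallback)
```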
@@ -43,6 +43,72 @@ def llamacpp_address(port: int) -> str:
  return f"http://127.0.0.1:{port}/v1"


+ def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
+ """
+ Separate standard OpenAI parameters from custom llama.cpp parameters.
+
+ Args:
+ request_dict: Dictionary of all request parameters
+ endpoint_type: Type of endpoint ("chat" or "completion")
+
+ Returns:
+ Dictionary with parameters properly separated for OpenAI client
+ """
+ openai_client_params = {}
+ extra_params = {}
+
+ # Common OpenAI parameters for both endpoint types
+ common_params = {
+ "model",
+ "frequency_penalty",
+ "logit_bias",
+ "logprobs",
+ "max_tokens",
+ "n",
+ "presence_penalty",
+ "seed",
+ "stop",
+ "stream",
+ "temperature",
+ "top_p",
+ "user",
+ }
+
+ # Standard OpenAI parameters by endpoint type
+ if endpoint_type == "chat":
+ chat_specific_params = {
+ "messages",
+ "top_logprobs",
+ "response_format",
+ "service_tier",
+ "stream_options",
+ "tools",
+ "tool_choice",
+ "parallel_tool_calls",
+ }
+ openai_params = common_params | chat_specific_params
+ else: # completion
+ completion_specific_params = {
+ "prompt",
+ "best_of",
+ "echo",
+ "suffix",
+ }
+ openai_params = common_params | completion_specific_params
+
+ for key, value in request_dict.items():
+ if key in openai_params:
+ openai_client_params[key] = value
+ else:
+ extra_params[key] = value
+
+ # If there are custom parameters, use extra_body to pass them through
+ if extra_params:
+ openai_client_params["extra_body"] = extra_params
+
+ return openai_client_params
+
+
  class LlamaTelemetry:
  """
  Manages telemetry data collection and display for llama server.
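
The split matters because the openai Python client rejects unknown keyword arguments, while anything placed in extra_body is merged into the request JSON as-is; that is how llama.cpp-only knobs such as top_k or repeat_penalty reach llama-server. A hedged usage sketch (base URL and model name are placeholders):

```python
# Illustration of extra_body pass-through with the openai client.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="none")  # placeholder

response = client.chat.completions.create(
    model="some-gguf-model",                           # placeholder model name
    messages=[{"role": "user", "content": "hi"}],
    temperature=0.7,                                   # standard OpenAI parameter
    extra_body={"top_k": 40, "repeat_penalty": 1.1},   # llama.cpp-specific knobs
)
print(response.choices[0].message.content)
```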
@@ -226,6 +292,11 @@ def _launch_llama_subprocess(
  "--ctx-size",
  str(ctx_size),
  ]
+
+ # Lock random seed for deterministic behavior in CI
+ if os.environ.get("LEMONADE_CI_MODE"):
+ base_command.extend(["--seed", "42"])
+
  if "mmproj" in snapshot_files:
  base_command.extend(["--mmproj", snapshot_files["mmproj"]])
  if not use_gpu:
@@ -238,6 +309,15 @@ def _launch_llama_subprocess(
  # Add port and jinja to enable tool use
  base_command.extend(["--port", str(telemetry.port), "--jinja"])

+ # Disable jinja for gpt-oss-120b on Vulkan
+ if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
+ base_command.remove("--jinja")
+ logging.warning(
+ "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+ "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+ "The model cannot use tools. If needed, use the ROCm backend instead."
+ )
+
  # Use legacy reasoning formatting, since not all apps support the new
  # reasoning_content field
  base_command.extend(["--reasoning-format", "none"])
@@ -384,13 +464,17 @@ def chat_completion(
  exclude_unset=True, exclude_none=True
  )

+ # Separate standard OpenAI parameters from custom llama.cpp parameters
+ openai_client_params = _separate_openai_params(request_dict, "chat")
+
  # Check if streaming is requested
  if chat_completion_request.stream:

  def event_stream():
  try:
  # Enable streaming
- for chunk in client.chat.completions.create(**request_dict):
+ # pylint: disable=missing-kwoa
+ for chunk in client.chat.completions.create(**openai_client_params):
  yield f"data: {chunk.model_dump_json()}\n\n"
  yield "data: [DONE]\n\n"

@@ -412,7 +496,8 @@ def chat_completion(
  # Non-streaming response
  try:
  # Disable streaming for non-streaming requests
- response = client.chat.completions.create(**request_dict)
+ # pylint: disable=missing-kwoa
+ response = client.chat.completions.create(**openai_client_params)

  # Show telemetry after completion
  telemetry.show_telemetry()
@@ -420,6 +505,7 @@ def chat_completion(
  return response

  except Exception as e: # pylint: disable=broad-exception-caught
+ logging.error("Error during chat completion: %s", str(e))
  raise HTTPException(
  status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  detail=f"Chat completion error: {str(e)}",
@@ -446,13 +532,17 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
  # Convert Pydantic model to dict and remove unset/null values
  request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)

+ # Separate standard OpenAI parameters from custom llama.cpp parameters
+ openai_client_params = _separate_openai_params(request_dict, "completion")
+
  # Check if streaming is requested
  if completion_request.stream:

  def event_stream():
  try:
  # Enable streaming
- for chunk in client.completions.create(**request_dict):
+ # pylint: disable=missing-kwoa
+ for chunk in client.completions.create(**openai_client_params):
  yield f"data: {chunk.model_dump_json()}\n\n"
  yield "data: [DONE]\n\n"

@@ -474,7 +564,8 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
  # Non-streaming response
  try:
  # Disable streaming for non-streaming requests
- response = client.completions.create(**request_dict)
+ # pylint: disable=missing-kwoa
+ response = client.completions.create(**openai_client_params)

  # Show telemetry after completion
  telemetry.show_telemetry()
@@ -482,6 +573,7 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
  return response

  except Exception as e: # pylint: disable=broad-exception-caught
+ logging.error("Error during completion: %s", str(e))
  raise HTTPException(
  status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  detail=f"Completion error: {str(e)}",
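
Both streaming branches above re-emit each chunk with server-sent-events framing (`data: <json>`, blank line, terminated by `data: [DONE]`). A hedged client-side sketch of consuming that stream; the URL, payload, and chunk field names are assumptions based on the OpenAI-compatible schema:

```python
# Illustration only: read the SSE stream produced by the endpoints above.
import json
import requests

payload = {
    "model": "some-model",                                  # placeholder
    "messages": [{"role": "user", "content": "hi"}],
    "stream": True,
}
with requests.post("http://localhost:8000/api/v1/chat/completions",  # assumed path
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        print(chunk["choices"][0]["delta"].get("content", ""), end="")
```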
@@ -54,7 +54,11 @@ from lemonade.tools.server.utils.port import lifespan

  from lemonade_server.model_manager import ModelManager
  from lemonade_server.pydantic_models import (
- DEFAULT_MAX_NEW_TOKENS,
+ DEFAULT_PORT,
+ DEFAULT_HOST,
+ DEFAULT_LOG_LEVEL,
+ DEFAULT_LLAMACPP_BACKEND,
+ DEFAULT_CTX_SIZE,
  LoadConfig,
  CompletionRequest,
  ChatCompletionRequest,
@@ -65,18 +69,16 @@ from lemonade_server.pydantic_models import (
  DeleteConfig,
  )

+ # Set to a high number to allow for interesting experiences in real apps
+ # Tests should use the max_new_tokens argument to set a lower value
+ DEFAULT_MAX_NEW_TOKENS = 1500
+
  # Only import tray on Windows
  if platform.system() == "Windows":
  # pylint: disable=ungrouped-imports
  from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


- DEFAULT_PORT = 8000
- DEFAULT_LOG_LEVEL = "info"
- DEFAULT_LLAMACPP_BACKEND = "vulkan"
- DEFAULT_CTX_SIZE = 4096
-
-
  class ServerModel(Model):
  """
  An extension of OpenAI's Model class that adds
@@ -150,6 +152,7 @@ class Server:
  def __init__(
  self,
  port: int = DEFAULT_PORT,
+ host: str = DEFAULT_HOST,
  log_level: str = DEFAULT_LOG_LEVEL,
  ctx_size: int = DEFAULT_CTX_SIZE,
  tray: bool = False,
@@ -160,6 +163,7 @@ class Server:

  # Save args as members
  self.port = port
+ self.host = host
  self.log_level = log_level
  self.ctx_size = ctx_size
  self.tray = tray
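
With DEFAULT_HOST imported from pydantic_models and stored on the server alongside the port, the bind address becomes configurable rather than hard-coded to localhost (the uvicorn.run change further below uses it). A hedged sketch of what the call amounts to; the app and values here are placeholders:

```python
# Illustration only: bind a FastAPI app to a configurable host and port.
import uvicorn
from fastapi import FastAPI

app = FastAPI()

if __name__ == "__main__":
    host = "0.0.0.0"   # e.g. expose on the LAN; "localhost" keeps it local-only
    port = 8000
    uvicorn.run(app, host=host, port=port, log_level="info")
```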
@@ -255,6 +259,47 @@ class Server:
  self.app.post(f"{prefix}/reranking")(self.reranking)
  self.app.post(f"{prefix}/rerank")(self.reranking)

+ def _log_request_parameters(self, request, endpoint_name: str):
+ """
+ Log request parameters excluding content fields like messages, prompt, or input.
+
+ Args:
+ request: Any request object (CompletionRequest, ChatCompletionRequest, etc.)
+ endpoint_name: Name of the endpoint for logging context
+ """
+ if not logging.getLogger().isEnabledFor(logging.DEBUG):
+ return
+
+ # Fields to exclude from logging (content fields)
+ excluded_fields = {"messages", "prompt", "input"}
+
+ # Get all attributes from the request object
+ request_params = {}
+ if hasattr(request, "__dict__"):
+ # For pydantic models, get the dict representation
+ if hasattr(request, "model_dump"):
+ all_params = request.model_dump()
+ elif hasattr(request, "dict"):
+ all_params = request.dict()
+ else:
+ all_params = request.__dict__
+
+ # Filter out excluded fields and add special handling for certain fields
+ for key, value in all_params.items():
+ if key not in excluded_fields:
+ # Special handling for tools field - show count instead of full content
+ if key == "tools" and value is not None:
+ request_params[key] = (
+ f"{len(value)} tools" if isinstance(value, list) else value
+ )
+ # Special handling for input type in responses
+ elif key == "input" and hasattr(request, "input"):
+ request_params["input_type"] = type(value).__name__
+ else:
+ request_params[key] = value
+
+ logging.debug(f"{endpoint_name} request parameters: {request_params}")
+
  def _setup_server_common(
  self,
  tray: bool = False,
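
The helper only does work when the root logger is at DEBUG level, and it drops content-bearing fields so prompts and chat history never end up in the log. A minimal sketch of the same filtering idea against a plain pydantic model (the model and its fields are invented):

```python
# Hypothetical illustration of logging request metadata without content fields.
import logging
from pydantic import BaseModel

class FakeChatRequest(BaseModel):   # invented stand-in for the real request model
    model: str
    messages: list
    temperature: float | None = None

logging.basicConfig(level=logging.DEBUG)

req = FakeChatRequest(
    model="m",
    messages=[{"role": "user", "content": "secret prompt"}],
    temperature=0.2,
)
safe = {k: v for k, v in req.model_dump().items()
        if k not in {"messages", "prompt", "input"}}
logging.debug("Chat completions request parameters: %s", safe)
```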
@@ -332,6 +377,9 @@ class Server:
  # Let the app know what port it's running on, so
  # that the lifespan can access it
  self.app.port = self.port
+ # FastAPI already has a `host` function and we cannot use `_host` as
+ # PyLint will believe its private
+ self.app.host_ = self.host

  def run(self):
  # Common setup
@@ -340,9 +388,7 @@ class Server:
  tray=self.tray,
  )

- uvicorn.run(
- self.app, host="localhost", port=self.port, log_level=self.log_level
- )
+ uvicorn.run(self.app, host=self.host, port=self.port, log_level=self.log_level)

  def run_in_thread(self, host: str = "localhost"):
  """
@@ -431,6 +477,9 @@ class Server:

  lc = self.initialize_load_config(completion_request)

+ # Log request parameters (excluding message content for brevity)
+ self._log_request_parameters(completion_request, "Completions")
+
  # Load the model if it's different from the currently loaded one
  await self.load_llm(lc)

@@ -452,6 +501,9 @@ class Server:
  "message": text,
  "stop": completion_request.stop,
  "temperature": completion_request.temperature,
+ "repeat_penalty": completion_request.repeat_penalty,
+ "top_k": completion_request.top_k,
+ "top_p": completion_request.top_p,
  "max_new_tokens": completion_request.max_tokens,
  }

@@ -560,6 +612,9 @@ class Server:

  lc = self.initialize_load_config(chat_completion_request)

+ # Log request parameters (excluding message history for brevity)
+ self._log_request_parameters(chat_completion_request, "Chat completions")
+
  # Load the model if it's different from the currently loaded one
  await self.load_llm(lc)

@@ -604,6 +659,9 @@ class Server:
  "message": text,
  "stop": chat_completion_request.stop,
  "temperature": chat_completion_request.temperature,
+ "repeat_penalty": chat_completion_request.repeat_penalty,
+ "top_k": chat_completion_request.top_k,
+ "top_p": chat_completion_request.top_p,
  "max_new_tokens": max_new_tokens,
  }

@@ -852,6 +910,9 @@ class Server:

  lc = self.initialize_load_config(responses_request)

+ # Log request parameters (excluding message history for brevity)
+ self._log_request_parameters(responses_request, "Responses")
+
  # Load the model if it's different from the currently loaded one
  await self.load_llm(lc)

@@ -873,6 +934,9 @@ class Server:
  generation_args = {
  "message": text,
  "temperature": responses_request.temperature,
+ "repeat_penalty": responses_request.repeat_penalty,
+ "top_k": responses_request.top_k,
+ "top_p": responses_request.top_p,
  "max_new_tokens": responses_request.max_output_tokens,
  }

@@ -1002,6 +1066,9 @@ class Server:
  stop: list[str] | str | None = None,
  max_new_tokens: int | None = None,
  temperature: float | None = None,
+ repeat_penalty: float | None = None,
+ top_k: int | None = None,
+ top_p: float | None = None,
  ):
  """
  Core streaming completion logic, separated from response handling.
@@ -1084,6 +1151,9 @@ class Server:
  "pad_token_id": tokenizer.eos_token_id,
  "stopping_criteria": stopping_criteria,
  "temperature": temperature,
+ "repeat_penalty": repeat_penalty,
+ "top_k": top_k,
+ "top_p": top_p,
  }

  # Initialize performance variables