lemonade-sdk 8.1.2__py3-none-any.whl → 8.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/tools/oga/utils.py +54 -33
- lemonade/tools/server/llamacpp.py +96 -4
- lemonade/tools/server/serve.py +74 -8
- lemonade/tools/server/static/js/chat.js +735 -0
- lemonade/tools/server/static/js/model-settings.js +162 -0
- lemonade/tools/server/static/js/models.js +865 -0
- lemonade/tools/server/static/js/shared.js +491 -0
- lemonade/tools/server/static/styles.css +652 -26
- lemonade/tools/server/static/webapp.html +145 -1092
- lemonade/tools/server/utils/port.py +3 -2
- lemonade/version.py +1 -1
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/METADATA +7 -6
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/RECORD +21 -17
- lemonade_server/cli.py +31 -17
- lemonade_server/pydantic_models.py +15 -3
- lemonade_server/server_models.json +9 -3
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.2.dist-info → lemonade_sdk-8.1.3.dist-info}/top_level.txt +0 -0
lemonade/tools/oga/utils.py
CHANGED
@@ -100,9 +100,10 @@ class OrtGenaiModel(ModelAdapter):
         max_new_tokens=512,
         min_new_tokens=0,
         do_sample=True,
-        top_k=
-        top_p=
-        temperature=
+        top_k=None,
+        top_p=None,
+        temperature=None,
+        repeat_penalty=None,
         streamer: OrtGenaiStreamer = None,
         pad_token_id=None,
         stopping_criteria=None,
@@ -154,38 +155,58 @@ class OrtGenaiModel(ModelAdapter):
         if random_seed is None:
            random_seed = -1  # In og.Generator, -1 = seed with random device

+        # Get search config if available, otherwise use empty dict
+        # Thanks to the empty dict, if the model doesn't have a built-in search
+        # config, the .get() calls will all just use the default values
+        search_config = {}
         if self.config and "search" in self.config:
             search_config = self.config["search"]
-        [old lines 159-188 removed; their contents are collapsed in the diff view]
+
+        # Apply parameter hierarchy: user provided > search config > defaults
+        default_top_k = 50
+        default_top_p = 1.0
+        default_temperature = 0.7
+        default_repetition_penalty = 1.0
+
+        top_k_to_use = (
+            top_k if top_k is not None else search_config.get("top_k", default_top_k)
+        )
+        top_p_to_use = (
+            top_p if top_p is not None else search_config.get("top_p", default_top_p)
+        )
+        temperature_to_use = (
+            temperature
+            if temperature is not None
+            else search_config.get("temperature", default_temperature)
+        )
+        # Map the llamacpp name, `repeat_penalty`, to the OGA name, `repetition_penalty`
+        repetition_penalty_to_use = (
+            repeat_penalty
+            if repeat_penalty is not None
+            else search_config.get("repetition_penalty", default_repetition_penalty)
+        )
+
+        # Set search options once with all parameters
+        params.set_search_options(
+            do_sample=search_config.get("do_sample", do_sample),
+            top_k=top_k_to_use,
+            top_p=top_p_to_use,
+            temperature=temperature_to_use,
+            repetition_penalty=repetition_penalty_to_use,
+            max_length=max_length_to_use,
+            min_length=min_length,
+            early_stopping=search_config.get("early_stopping", False),
+            length_penalty=search_config.get("length_penalty", 1.0),
+            num_beams=search_config.get("num_beams", 1),
+            num_return_sequences=search_config.get("num_return_sequences", 1),
+            past_present_share_buffer=search_config.get(
+                "past_present_share_buffer", True
+            ),
+            random_seed=random_seed,
+            # Not currently supported by OGA
+            # diversity_penalty=search_config.get('diversity_penalty', 0.0),
+            # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),
+        )
         params.try_graph_capture_with_max_batch_size(1)

         generator = og.Generator(self.model, params)
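For context, the new generate() signature defaults top_k, top_p, temperature, and repeat_penalty to None, then resolves each one as: caller-provided value, else the model's built-in search config, else a hard-coded default. A minimal sketch of that resolution order follows; the resolve() helper and the example values are illustrative, not part of the SDK.

# Illustrative sketch of the "user provided > search config > default" hierarchy.
def resolve(user_value, search_config, key, default):
    # Explicit caller arguments win; otherwise fall back to the model's
    # built-in search config, and finally to the hard-coded default.
    return user_value if user_value is not None else search_config.get(key, default)

search_config = {"temperature": 0.6}  # pretend this came from the model's config
print(resolve(None, search_config, "temperature", 0.7))  # 0.6 -> from search config
print(resolve(0.2, search_config, "temperature", 0.7))   # 0.2 -> caller override
print(resolve(None, search_config, "top_k", 50))         # 50  -> hard-coded default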
lemonade/tools/server/llamacpp.py
CHANGED

@@ -43,6 +43,72 @@ def llamacpp_address(port: int) -> str:
     return f"http://127.0.0.1:{port}/v1"


+def _separate_openai_params(request_dict: dict, endpoint_type: str = "chat") -> dict:
+    """
+    Separate standard OpenAI parameters from custom llama.cpp parameters.
+
+    Args:
+        request_dict: Dictionary of all request parameters
+        endpoint_type: Type of endpoint ("chat" or "completion")
+
+    Returns:
+        Dictionary with parameters properly separated for OpenAI client
+    """
+    openai_client_params = {}
+    extra_params = {}
+
+    # Common OpenAI parameters for both endpoint types
+    common_params = {
+        "model",
+        "frequency_penalty",
+        "logit_bias",
+        "logprobs",
+        "max_tokens",
+        "n",
+        "presence_penalty",
+        "seed",
+        "stop",
+        "stream",
+        "temperature",
+        "top_p",
+        "user",
+    }
+
+    # Standard OpenAI parameters by endpoint type
+    if endpoint_type == "chat":
+        chat_specific_params = {
+            "messages",
+            "top_logprobs",
+            "response_format",
+            "service_tier",
+            "stream_options",
+            "tools",
+            "tool_choice",
+            "parallel_tool_calls",
+        }
+        openai_params = common_params | chat_specific_params
+    else:  # completion
+        completion_specific_params = {
+            "prompt",
+            "best_of",
+            "echo",
+            "suffix",
+        }
+        openai_params = common_params | completion_specific_params
+
+    for key, value in request_dict.items():
+        if key in openai_params:
+            openai_client_params[key] = value
+        else:
+            extra_params[key] = value
+
+    # If there are custom parameters, use extra_body to pass them through
+    if extra_params:
+        openai_client_params["extra_body"] = extra_params
+
+    return openai_client_params
+
+
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
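The helper splits a request dict into arguments the OpenAI client accepts directly and everything else, which is tucked under extra_body so llama.cpp-specific knobs still reach llama-server. A hypothetical input/output pair, assuming the function behaves as written above; the model name and parameter values are placeholders.

# Hypothetical request: "top_k" and "min_p" are not standard OpenAI chat
# parameters, so they should end up under "extra_body".
request_dict = {
    "model": "Llama-3.2-1B-Instruct-GGUF",
    "messages": [{"role": "user", "content": "hi"}],
    "temperature": 0.8,
    "top_k": 40,
    "min_p": 0.05,
}

params = _separate_openai_params(request_dict, "chat")
# params == {
#     "model": "...", "messages": [...], "temperature": 0.8,
#     "extra_body": {"top_k": 40, "min_p": 0.05},
# }
# client.chat.completions.create(**params) then forwards extra_body fields
# verbatim to llama-server's OpenAI-compatible endpoint.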
@@ -226,6 +292,11 @@ def _launch_llama_subprocess(
        "--ctx-size",
        str(ctx_size),
    ]
+
+    # Lock random seed for deterministic behavior in CI
+    if os.environ.get("LEMONADE_CI_MODE"):
+        base_command.extend(["--seed", "42"])
+
    if "mmproj" in snapshot_files:
        base_command.extend(["--mmproj", snapshot_files["mmproj"]])
    if not use_gpu:
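This block pins llama-server's sampling seed whenever the LEMONADE_CI_MODE environment variable is set, making CI runs reproducible. A small sketch of the same gate in isolation; the env-var name comes from the diff, while the surrounding command list is a stand-in.

import os

# Stand-in command list; in the real code this is llama-server's base_command.
cmd = ["llama-server", "--ctx-size", "4096"]

# Any non-empty value enables CI mode; unset or "" leaves the seed random.
if os.environ.get("LEMONADE_CI_MODE"):
    cmd.extend(["--seed", "42"])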
@@ -238,6 +309,15 @@ def _launch_llama_subprocess(
    # Add port and jinja to enable tool use
    base_command.extend(["--port", str(telemetry.port), "--jinja"])

+    # Disable jinja for gpt-oss-120b on Vulkan
+    if backend == "vulkan" and "gpt-oss-120b" in snapshot_files["variant"].lower():
+        base_command.remove("--jinja")
+        logging.warning(
+            "Jinja is disabled for gpt-oss-120b on Vulkan due to a llama.cpp bug "
+            "(see https://github.com/ggml-org/llama.cpp/issues/15274). "
+            "The model cannot use tools. If needed, use the ROCm backend instead."
+        )
+
    # Use legacy reasoning formatting, since not all apps support the new
    # reasoning_content field
    base_command.extend(["--reasoning-format", "none"])
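Because base_command is a plain Python list, the guard simply removes the previously appended "--jinja" flag when the Vulkan backend is paired with a gpt-oss-120b variant. A toy illustration of that list surgery; the variant string shown is hypothetical.

import logging

base_command = ["llama-server", "--port", "8000", "--jinja"]
backend = "vulkan"
variant = "GPT-OSS-120B-GGUF"  # hypothetical snapshot variant name

if backend == "vulkan" and "gpt-oss-120b" in variant.lower():
    base_command.remove("--jinja")  # list.remove drops the first matching element
    logging.warning("Jinja disabled for gpt-oss-120b on Vulkan; tool calling unavailable.")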
@@ -384,13 +464,17 @@ def chat_completion(
        exclude_unset=True, exclude_none=True
    )

+    # Separate standard OpenAI parameters from custom llama.cpp parameters
+    openai_client_params = _separate_openai_params(request_dict, "chat")
+
    # Check if streaming is requested
    if chat_completion_request.stream:

        def event_stream():
            try:
                # Enable streaming
-                [removed line; content collapsed in the diff view]
+                # pylint: disable=missing-kwoa
+                for chunk in client.chat.completions.create(**openai_client_params):
                    yield f"data: {chunk.model_dump_json()}\n\n"
                yield "data: [DONE]\n\n"

@@ -412,7 +496,8 @@ def chat_completion(
    # Non-streaming response
    try:
        # Disable streaming for non-streaming requests
-        [removed line; content collapsed in the diff view]
+        # pylint: disable=missing-kwoa
+        response = client.chat.completions.create(**openai_client_params)

        # Show telemetry after completion
        telemetry.show_telemetry()

@@ -420,6 +505,7 @@ def chat_completion(
        return response

    except Exception as e:  # pylint: disable=broad-exception-caught
+        logging.error("Error during chat completion: %s", str(e))
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Chat completion error: {str(e)}",
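Both streaming paths emit Server-Sent Events lines of the form "data: {json}" terminated by "data: [DONE]", matching OpenAI's wire format. A hedged client-side sketch of consuming that stream with the openai Python package; the base_url, port, and model name are placeholders, not values confirmed by this diff.

from openai import OpenAI

# Placeholder endpoint and model; adjust to your local Lemonade Server setup.
client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="-")

stream = client.chat.completions.create(
    model="Llama-3.2-1B-Instruct-GGUF",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)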
@@ -446,13 +532,17 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
    # Convert Pydantic model to dict and remove unset/null values
    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)

+    # Separate standard OpenAI parameters from custom llama.cpp parameters
+    openai_client_params = _separate_openai_params(request_dict, "completion")
+
    # Check if streaming is requested
    if completion_request.stream:

        def event_stream():
            try:
                # Enable streaming
-                [removed line; content collapsed in the diff view]
+                # pylint: disable=missing-kwoa
+                for chunk in client.completions.create(**openai_client_params):
                    yield f"data: {chunk.model_dump_json()}\n\n"
                yield "data: [DONE]\n\n"

@@ -474,7 +564,8 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
    # Non-streaming response
    try:
        # Disable streaming for non-streaming requests
-        [removed line; content collapsed in the diff view]
+        # pylint: disable=missing-kwoa
+        response = client.completions.create(**openai_client_params)

        # Show telemetry after completion
        telemetry.show_telemetry()

@@ -482,6 +573,7 @@ def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry)
        return response

    except Exception as e:  # pylint: disable=broad-exception-caught
+        logging.error("Error during completion: %s", str(e))
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Completion error: {str(e)}",
lemonade/tools/server/serve.py
CHANGED
@@ -54,7 +54,11 @@ from lemonade.tools.server.utils.port import lifespan

 from lemonade_server.model_manager import ModelManager
 from lemonade_server.pydantic_models import (
-    [removed line; content collapsed in the diff view]
+    DEFAULT_PORT,
+    DEFAULT_HOST,
+    DEFAULT_LOG_LEVEL,
+    DEFAULT_LLAMACPP_BACKEND,
+    DEFAULT_CTX_SIZE,
     LoadConfig,
     CompletionRequest,
     ChatCompletionRequest,

@@ -65,19 +69,16 @@ from lemonade_server.pydantic_models import (
     DeleteConfig,
 )

+# Set to a high number to allow for interesting experiences in real apps
+# Tests should use the max_new_tokens argument to set a lower value
+DEFAULT_MAX_NEW_TOKENS = 1500
+
 # Only import tray on Windows
 if platform.system() == "Windows":
     # pylint: disable=ungrouped-imports
     from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


-DEFAULT_PORT = 8000
-DEFAULT_HOST = "localhost"
-DEFAULT_LOG_LEVEL = "info"
-DEFAULT_LLAMACPP_BACKEND = "vulkan"
-DEFAULT_CTX_SIZE = 4096
-
-
 class ServerModel(Model):
     """
     An extension of OpenAI's Model class that adds
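With these constants moved out of serve.py, the server and the CLI can share one source of truth in lemonade_server.pydantic_models. A minimal sketch of the intended usage; the helper function is illustrative only, and the values shown earlier (8000, "localhost", 4096) are the old serve.py defaults, which may or may not match the new module.

# Shared defaults now live in lemonade_server.pydantic_models (per the import hunk above).
from lemonade_server.pydantic_models import DEFAULT_PORT, DEFAULT_HOST, DEFAULT_CTX_SIZE

def describe_defaults() -> str:
    # Helper name is illustrative, not part of the SDK.
    return f"serving on {DEFAULT_HOST}:{DEFAULT_PORT} with ctx size {DEFAULT_CTX_SIZE}"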
@@ -258,6 +259,47 @@ class Server:
        self.app.post(f"{prefix}/reranking")(self.reranking)
        self.app.post(f"{prefix}/rerank")(self.reranking)

+    def _log_request_parameters(self, request, endpoint_name: str):
+        """
+        Log request parameters excluding content fields like messages, prompt, or input.
+
+        Args:
+            request: Any request object (CompletionRequest, ChatCompletionRequest, etc.)
+            endpoint_name: Name of the endpoint for logging context
+        """
+        if not logging.getLogger().isEnabledFor(logging.DEBUG):
+            return
+
+        # Fields to exclude from logging (content fields)
+        excluded_fields = {"messages", "prompt", "input"}
+
+        # Get all attributes from the request object
+        request_params = {}
+        if hasattr(request, "__dict__"):
+            # For pydantic models, get the dict representation
+            if hasattr(request, "model_dump"):
+                all_params = request.model_dump()
+            elif hasattr(request, "dict"):
+                all_params = request.dict()
+            else:
+                all_params = request.__dict__
+
+            # Filter out excluded fields and add special handling for certain fields
+            for key, value in all_params.items():
+                if key not in excluded_fields:
+                    # Special handling for tools field - show count instead of full content
+                    if key == "tools" and value is not None:
+                        request_params[key] = (
+                            f"{len(value)} tools" if isinstance(value, list) else value
+                        )
+                    # Special handling for input type in responses
+                    elif key == "input" and hasattr(request, "input"):
+                        request_params["input_type"] = type(value).__name__
+                    else:
+                        request_params[key] = value
+
+        logging.debug(f"{endpoint_name} request parameters: {request_params}")
+
    def _setup_server_common(
        self,
        tray: bool = False,
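_log_request_parameters only does work when the root logger is at DEBUG, and it drops the bulky content fields (messages, prompt, input) while summarizing tools as a count. A hedged sketch of what that filtering produces for a typical chat request; the stand-in request object and the logged text are illustrative.

import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.DEBUG)

# Stand-in request object; the real code receives a pydantic ChatCompletionRequest.
request = SimpleNamespace(
    messages=[{"role": "user", "content": "long prompt ..."}],
    temperature=0.7,
    tools=[{"type": "function"}, {"type": "function"}],
)

# Mirrors the filtering above: content fields dropped, tools shown as a count.
params = {
    k: (f"{len(v)} tools" if k == "tools" and isinstance(v, list) else v)
    for k, v in vars(request).items()
    if k not in {"messages", "prompt", "input"}
}
logging.debug(f"Chat completions request parameters: {params}")
# -> DEBUG:root:Chat completions request parameters: {'temperature': 0.7, 'tools': '2 tools'}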
@@ -435,6 +477,9 @@ class Server:

        lc = self.initialize_load_config(completion_request)

+        # Log request parameters (excluding message content for brevity)
+        self._log_request_parameters(completion_request, "Completions")
+
        # Load the model if it's different from the currently loaded one
        await self.load_llm(lc)

@@ -456,6 +501,9 @@ class Server:
            "message": text,
            "stop": completion_request.stop,
            "temperature": completion_request.temperature,
+            "repeat_penalty": completion_request.repeat_penalty,
+            "top_k": completion_request.top_k,
+            "top_p": completion_request.top_p,
            "max_new_tokens": completion_request.max_tokens,
        }

@@ -564,6 +612,9 @@ class Server:

        lc = self.initialize_load_config(chat_completion_request)

+        # Log request parameters (excluding message history for brevity)
+        self._log_request_parameters(chat_completion_request, "Chat completions")
+
        # Load the model if it's different from the currently loaded one
        await self.load_llm(lc)

@@ -608,6 +659,9 @@ class Server:
            "message": text,
            "stop": chat_completion_request.stop,
            "temperature": chat_completion_request.temperature,
+            "repeat_penalty": chat_completion_request.repeat_penalty,
+            "top_k": chat_completion_request.top_k,
+            "top_p": chat_completion_request.top_p,
            "max_new_tokens": max_new_tokens,
        }

@@ -856,6 +910,9 @@ class Server:

        lc = self.initialize_load_config(responses_request)

+        # Log request parameters (excluding message history for brevity)
+        self._log_request_parameters(responses_request, "Responses")
+
        # Load the model if it's different from the currently loaded one
        await self.load_llm(lc)

@@ -877,6 +934,9 @@ class Server:
        generation_args = {
            "message": text,
            "temperature": responses_request.temperature,
+            "repeat_penalty": responses_request.repeat_penalty,
+            "top_k": responses_request.top_k,
+            "top_p": responses_request.top_p,
            "max_new_tokens": responses_request.max_output_tokens,
        }

@@ -1006,6 +1066,9 @@ class Server:
        stop: list[str] | str | None = None,
        max_new_tokens: int | None = None,
        temperature: float | None = None,
+        repeat_penalty: float | None = None,
+        top_k: int | None = None,
+        top_p: float | None = None,
    ):
        """
        Core streaming completion logic, separated from response handling.

@@ -1088,6 +1151,9 @@ class Server:
            "pad_token_id": tokenizer.eos_token_id,
            "stopping_criteria": stopping_criteria,
            "temperature": temperature,
+            "repeat_penalty": repeat_penalty,
+            "top_k": top_k,
+            "top_p": top_p,
        }

        # Initialize performance variables