lemonade-sdk 8.1.4__py3-none-any.whl → 8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lemonade-sdk has been flagged as a potentially problematic release.

Files changed (53)
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
@@ -7,15 +7,16 @@ import logging
  import platform
  import tempfile
  import traceback
- from typing import Optional, Union
+ from typing import Optional, Union, List
  import json
- import subprocess
  from pathlib import Path
-
- from fastapi import FastAPI, HTTPException, status, Request
+ import os
+ import shutil
+ from fastapi import FastAPI, HTTPException, status, Request, WebSocket, Form, UploadFile
  from fastapi.responses import StreamingResponse
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.staticfiles import StaticFiles
+ from starlette.websockets import WebSocketDisconnect, WebSocketState
  import uvicorn
  from uvicorn.config import Config
  from uvicorn.server import Server as UvicornServer
@@ -47,7 +48,9 @@ from openai.types.responses import (
  )

  import lemonade.api as lemonade_api
- import lemonade.tools.server.llamacpp as llamacpp
+ from lemonade.tools.server.wrapped_server import WrappedServer
+ from lemonade.tools.server.llamacpp import LlamaServer
+ from lemonade.tools.server.flm import FlmServer
  from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
  from lemonade.tools.server.webapp import get_webapp_html
  from lemonade.tools.server.utils.port import lifespan
@@ -75,12 +78,83 @@ from lemonade_server.settings import save_setting
  # Tests should use the max_new_tokens argument to set a lower value
  DEFAULT_MAX_NEW_TOKENS = 1500

- # Only import tray on Windows
- if platform.system() == "Windows":
+ if platform.system() in ["Windows", "Darwin"]:
      # pylint: disable=ungrouped-imports
      from lemonade.tools.server.tray import LemonadeTray, OutputDuplicator


+ class ServerLogFilter(logging.Filter):
+     def __init__(self, server):
+         super().__init__()
+         self.server = server
+         self.noisy_paths = {
+             "/api/v1/health",
+             "/api/v0/health",
+             "/api/v1/models",
+             "/api/v0/models",
+         }
+
+     def filter(self, record: logging.LogRecord) -> bool:
+         msg = record.getMessage()
+
+         # Filter out websocket logs
+         if "> TEXT" in msg:
+             return False
+
+         # Filter out noisy HTTP routes if debug logs are OFF
+         if not self.server.debug_logging_enabled:
+             if any(path in msg for path in self.noisy_paths):
+                 return False
+
+         # Otherwise, allow the log
+         return True
+
+
+ async def log_streamer(websocket: WebSocket, path: str, interval: float = 1.0):
+     logger = logging.getLogger()
+     await websocket.accept()
+     try:
+         with open(path, "r", encoding="utf-8") as f:
+             f.seek(0)  # start at the beginning of the file
+             while True:
+                 # Try reading a line
+                 line = f.readline()
+                 if not line:
+                     await asyncio.sleep(interval)
+                     continue
+
+                 # Send defensively: if disconnected, bail out
+                 if websocket.application_state != WebSocketState.CONNECTED:
+                     # Server-side state says we're not connected anymore
+                     break
+
+                 try:
+                     await websocket.send_text(line)
+                 except WebSocketDisconnect:
+                     # Client closed — normal path out
+                     break
+                 except RuntimeError as re:
+                     # Starlette will raise this if a close has already been sent
+                     logger.debug("RuntimeError during send: %s", re)
+                     break
+
+     except WebSocketDisconnect:
+         # Client closed the socket; do not try to send or close again
+         pass
+     except Exception as e:  # pylint: disable=broad-except
+         # Log server-side; do not attempt to send error over a possibly closed socket
+         logger.exception("Error in log_streamer: %s", e)
+     finally:
+         # Only close if Starlette still thinks we're connected.
+         # This prevents "Cannot call send once a close message has been sent."
+         try:
+             if websocket.application_state == WebSocketState.CONNECTED:
+                 await websocket.close()
+         except Exception:  # pylint: disable=broad-except
+             # If close itself races, swallow — we're shutting down anyway.
+             pass
+
+
  class ServerModel(Model):
      """
      An extension of OpenAI's Model class that adds
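
The log_streamer coroutine above is what backs the new /logs/ws WebSocket route registered later in this diff. As a rough illustration only (not part of the package), a client could tail the stream as sketched below; the localhost:8000 address and the third-party websockets package are assumptions:

import asyncio
import websockets  # assumed third-party client library


async def tail_server_logs():
    # /api/v1/logs/ws is the route wired up in setup_routes() further down;
    # localhost:8000 is an assumed default address
    async with websockets.connect("ws://localhost:8000/api/v1/logs/ws") as ws:
        async for line in ws:  # the server sends one log line per text message
            print(line, end="")


asyncio.run(tail_server_logs())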
@@ -133,6 +207,21 @@ class StopOnEvent:
          return self.stop_event.is_set()


+ class NoCacheStaticFiles(StaticFiles):
+     """Custom StaticFiles class with no-cache headers"""
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def file_response(self, *args, **kwargs) -> Response:
+         response = super().file_response(*args, **kwargs)
+         # Add no-cache headers for all static files
+         response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
+         response.headers["Pragma"] = "no-cache"
+         response.headers["Expires"] = "0"
+         return response
+
+
  class Server:
      """
      Open a web server that apps can use to communicate with the LLM.
@@ -149,6 +238,7 @@ class Server:
      - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
      - /api/v1/responses: responses API using HTTP chunked transfer encoding.
      - /api/v1/models: list all available models.
+     - /api/v1/models/{model_id}: retrieve a specific model by ID.
      """

      def __init__(
@@ -188,6 +278,12 @@ class Server:
              allow_headers=["*"], # Allows all headers
          )

+         # Set up debug middleware if debug logging is enabled
+         # This must be done during app initialization, not at runtime
+         self.debug_logging_enabled = log_level == "debug"
+         if self.debug_logging_enabled:
+             self.setup_middleware_timer()
+
          # Set up custom routes
          self.setup_routes(["/api/v0", "/api/v1"])

@@ -198,7 +294,7 @@ class Server:
          # as the Web App
          static_dir = Path(__file__).parent / "static"
          self.app.mount(
-             "/static", StaticFiles(directory=static_dir), name="static_assets"
+             "/static", NoCacheStaticFiles(directory=static_dir), name="static_assets"
          )

          # Performance stats that are set during /ws and can be
@@ -232,11 +328,8 @@ class Server:
          # Add lock for load/unload operations
          self._load_lock = asyncio.Lock()

-         # Subprocess handle for llama_server.exe
-         self.llama_server_process: subprocess.Popen = None
-
-         # Telemetry instance for llama server
-         self.llama_telemetry = llamacpp.LlamaTelemetry()
+         # Subprocess handle for wrapped instance of llama_server.exe, etc.
+         self.wrapped_server: WrappedServer = None

      def setup_routes(self, api_prefixes: list[str]):
          for prefix in api_prefixes:
@@ -252,16 +345,199 @@ class Server:
              self.app.post(f"{prefix}/completions")(self.completions)
              self.app.post(f"{prefix}/responses")(self.responses)
              self.app.post(f"{prefix}/log-level")(self.set_log_level)
+             self.app.websocket(f"{prefix}/logs/ws")(self.logs_ws)
+             self.app.post(f"{prefix}/add-local-model")(self.add_local_model)

              # OpenAI-compatible routes
              self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
              self.app.post(f"{prefix}/embeddings")(self.embeddings)
              self.app.get(f"{prefix}/models")(self.models)
+             self.app.get(f"{prefix}/models/{{model_id}}")(self.retrieve_model)

              # JinaAI routes (jina.ai/reranker/)
              self.app.post(f"{prefix}/reranking")(self.reranking)
              self.app.post(f"{prefix}/rerank")(self.reranking)

+             # Migration routes
+             self.app.get(f"{prefix}/migration/incompatible-models")(
+                 self.get_incompatible_models
+             )
+             self.app.post(f"{prefix}/migration/cleanup")(
+                 self.cleanup_incompatible_models
+             )
+
+     async def add_local_model(
+         self,
+         model_name: str = Form(...),
+         checkpoint: str = Form(""),
+         recipe: str = Form(...),
+         reasoning: bool = Form(False),
+         vision: bool = Form(False),
+         mmproj: str = Form(None),
+         model_files: List[UploadFile] = None,
+     ):
+         from huggingface_hub.constants import HF_HUB_CACHE
+         from lemonade.tools.llamacpp.utils import parse_checkpoint
+
+         # Upload and register a local model from files.
+         try:
+             if not model_files:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="No model files provided for upload",
+                 )
+
+             if not model_name.startswith("user."):
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="Model name must start with 'user.'",
+                 )
+
+             valid_recipes = ["llamacpp", "oga-npu", "oga-hybrid", "oga-cpu"]
+             if recipe not in valid_recipes:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail=f"Invalid recipe. Must be one of: {', '.join(valid_recipes)}",
+                 )
+
+             if recipe == "llamacpp" and not any(
+                 f.filename.lower().endswith(".gguf") for f in model_files
+             ):
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="At least one .gguf file is required for llamacpp",
+                 )
+
+             # Check if model name already exists
+             if model_name in ModelManager().supported_models:
+                 raise HTTPException(
+                     status_code=status.HTTP_409_CONFLICT,
+                     detail=(
+                         f"Model name '{model_name}' already exists. "
+                         "Please use a different name."
+                     ),
+                 )
+
+             model_name_clean = model_name.replace("user.", "")
+
+             # Files are saved to models--{model_name_clean}
+             # Note: This is based on the user's custom model name, NOT the checkpoint field
+             repo_cache_name = model_name_clean.replace("/", "--")
+             snapshot_path = os.path.join(HF_HUB_CACHE, f"models--{repo_cache_name}")
+             os.makedirs(snapshot_path, exist_ok=True)
+
+             # Extract variant from checkpoint field if provided
+             # checkpoint field format: "folder:variant" or just "folder"
+             variant = None
+             if checkpoint and ":" in checkpoint:
+                 _, variant = parse_checkpoint(checkpoint)
+                 # variant now contains just the variant[can be with or without the
+                 # .gguf extension] filename (e.g., "LFM2-VL-1.6B-F16 or LFM2-VL-1.6B-F16.gguf")
+
+             # Save uploaded files, preserving folder structure
+             for file in model_files:
+                 relative_path = file.filename
+                 path_parts = relative_path.split("/")
+
+                 if len(path_parts) > 1:
+                     internal_path = "/".join(path_parts[1:])
+                     file_path = os.path.join(snapshot_path, internal_path)
+                 else:
+                     file_path = os.path.join(snapshot_path, path_parts[0])
+
+                 os.makedirs(os.path.dirname(file_path), exist_ok=True)
+                 with open(file_path, "wb") as f:
+                     content = await file.read()
+                     f.write(content)
+
+             # Resolve actual file paths after upload (for faster loading later)
+             resolved_checkpoint = None
+             resolved_mmproj = None
+
+             # For OGA models, find genai_config.json
+             if recipe.startswith("oga-"):
+                 for root, _, files in os.walk(snapshot_path):
+                     if "genai_config.json" in files:
+                         resolved_checkpoint = root
+                         break
+                 if not resolved_checkpoint:
+                     resolved_checkpoint = snapshot_path
+
+             # For llamacpp models, find the GGUF file
+             elif recipe == "llamacpp":
+                 gguf_file_found = None
+
+                 # If variant is specified, look for that specific file
+                 if variant:
+                     search_term = (
+                         variant if variant.endswith(".gguf") else f"{variant}.gguf"
+                     )
+                     for root, _, files in os.walk(snapshot_path):
+                         if search_term in files:
+                             gguf_file_found = os.path.join(root, search_term)
+                             break
+
+                 # If no variant or variant not found, search for any .gguf file (excluding mmproj)
+                 if not gguf_file_found:
+                     for root, _, files in os.walk(snapshot_path):
+                         gguf_files = [
+                             f
+                             for f in files
+                             if f.endswith(".gguf") and "mmproj" not in f.lower()
+                         ]
+                         if gguf_files:
+                             gguf_file_found = os.path.join(root, gguf_files[0])
+                             break
+
+                 resolved_checkpoint = (
+                     gguf_file_found if gguf_file_found else snapshot_path
+                 )
+
+             # Search for mmproj file if provided
+             if mmproj:
+                 for root, _, files in os.walk(snapshot_path):
+                     if mmproj in files:
+                         resolved_mmproj = os.path.join(root, mmproj)
+                         break
+
+             # Build checkpoint for registration
+             # For llamacpp with resolved path, store the full path relative to HF_HUB_CACHE
+             if resolved_checkpoint:
+                 # Store as relative path from HF_HUB_CACHE for portability
+                 checkpoint_to_register = os.path.relpath(
+                     resolved_checkpoint, HF_HUB_CACHE
+                 )
+             elif variant:
+                 checkpoint_to_register = f"models--{repo_cache_name}:{variant}"
+             else:
+                 checkpoint_to_register = f"models--{repo_cache_name}"
+
+             # Register the model
+             ModelManager().register_local_model(
+                 model_name=model_name,
+                 checkpoint=checkpoint_to_register,
+                 recipe=recipe,
+                 reasoning=reasoning,
+                 vision=vision,
+                 mmproj=resolved_mmproj if resolved_mmproj else mmproj,
+                 snapshot_path=snapshot_path,
+             )
+
+             # Refresh local models
+             self.local_models = ModelManager().downloaded_models_enabled
+
+             return {
+                 "status": "success",
+                 "message": f"Model {model_name} uploaded and registered successfully",
+             }
+         except Exception as e:
+             if os.path.exists(checkpoint_to_register):
+                 shutil.rmtree(checkpoint_to_register)
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to upload model: {str(e)}",
+             )
+
      async def set_log_level(self, config: LogLevelConfig):
          """
          Set the logging level of the server.
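
The new add-local-model route above accepts a multipart form upload. A hedged usage sketch follows (not part of the diff; the file name, model name, and localhost:8000 address are hypothetical, and the requests package is an assumed client):

import requests  # assumed HTTP client

# The server requires custom model names to start with "user."
with open("my-model-q4.gguf", "rb") as fh:  # hypothetical local GGUF file
    resp = requests.post(
        "http://localhost:8000/api/v1/add-local-model",
        data={"model_name": "user.my-model", "recipe": "llamacpp"},
        files=[("model_files", ("my-model-q4.gguf", fh, "application/octet-stream"))],
    )
print(resp.status_code, resp.json())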
@@ -380,11 +656,13 @@ class Server:
          )
          file_handler.setLevel(logging_level)
          file_handler.setFormatter(uvicorn_formatter)
+         file_handler.addFilter(ServerLogFilter(self))

          # Set up console handler
          console_handler = logging.StreamHandler()
          console_handler.setLevel(logging_level)
          console_handler.setFormatter(uvicorn_formatter)
+         console_handler.addFilter(ServerLogFilter(self))

          # Configure root logger with both handlers
          logging.basicConfig(
@@ -407,10 +685,6 @@ class Server:
              ).run()
              sys.exit(0)

-         if self.debug_logging_enabled:
-             # Print the elapsed time for each request
-             self.setup_middleware_timer()
-
          # Let the app know what port it's running on, so
          # that the lifespan can access it
          self.app.port = self.port
@@ -507,7 +781,9 @@ class Server:

          return lc

-     async def completions(self, completion_request: CompletionRequest):
+     async def completions(
+         self, completion_request: CompletionRequest, request: Request
+     ):
          """
          Stream completion responses using HTTP chunked transfer encoding.
          """
@@ -520,8 +796,8 @@ class Server:
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

-         if self.llm_loaded.recipe == "llamacpp":
-             return llamacpp.completion(completion_request, self.llama_telemetry)
+         if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+             return self.wrapped_server.completion(completion_request)

          # Check if the model supports reasoning
          reasoning_first_token = self.llm_loaded.reasoning
@@ -559,29 +835,43 @@ class Server:
              # This is necessary because the variable is modified
              # in the inner function
              nonlocal reasoning_first_token
+             try:
+                 async for token in self._generate_tokens(**generation_args):
+                     # Handle client disconnect: stop generation and exit
+                     if await request.is_disconnected():
+                         self.stop_event.set()
+                         break

-             async for token in self._generate_tokens(**generation_args):
-                 choice = CompletionChoice(
-                     text=("<think>" + token if reasoning_first_token else token),
-                     index=0,
-                     finish_reason="stop",
-                     logprobs=None,
-                 )
+                     choice = CompletionChoice(
+                         text=(
+                             "<think>" + token if reasoning_first_token else token
+                         ),
+                         index=0,
+                         finish_reason="stop",
+                         logprobs=None,
+                     )

-                 completion = Completion(
-                     id="0",
-                     choices=[choice],
-                     model=self.llm_loaded.checkpoint,
-                     object="text_completion",
-                     created=int(time.time()),
-                 )
+                     completion = Completion(
+                         id="0",
+                         choices=[choice],
+                         model=self.llm_loaded.checkpoint,
+                         object="text_completion",
+                         created=int(time.time()),
+                     )

-                 # Format as SSE
-                 reasoning_first_token = False
-                 yield f"data: {completion.model_dump_json()}\n\n".encode("utf-8")
+                     # Format as SSE
+                     reasoning_first_token = False
+                     yield f"data: {completion.model_dump_json()}\n\n".encode(
+                         "utf-8"
+                     )

-             # Send the [DONE] marker
-             yield b"data: [DONE]\n\n"
+                 # Send the [DONE] marker only if still connected
+                 if not await request.is_disconnected():
+                     yield b"data: [DONE]\n\n"
+             except asyncio.CancelledError:
+                 # Propagate cancellation to the generator loop
+                 self.stop_event.set()
+                 return

          return StreamingResponse(
              generate(),
@@ -639,7 +929,9 @@ class Server:
              created=int(time.time()),
          )

-     async def chat_completions(self, chat_completion_request: ChatCompletionRequest):
+     async def chat_completions(
+         self, chat_completion_request: ChatCompletionRequest, request: Request
+     ):
          """
          Stream chat completion responses using HTTP chunked transfer encoding.
          """
@@ -655,10 +947,25 @@ class Server:
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

-         if self.llm_loaded.recipe == "llamacpp":
-             return llamacpp.chat_completion(
-                 chat_completion_request, self.llama_telemetry
-             )
+         if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+             if (
+                 hasattr(chat_completion_request, "enable_thinking")
+                 and chat_completion_request.enable_thinking is False
+                 and "qwen3" in self.llm_loaded.model_name.lower()
+             ):
+
+                 # Modify the last user message to include /no_think
+                 if chat_completion_request.messages:
+                     for i in range(len(chat_completion_request.messages) - 1, -1, -1):
+                         if chat_completion_request.messages[i].get("role") == "user":
+                             original_content = chat_completion_request.messages[i][
+                                 "content"
+                             ]
+                             chat_completion_request.messages[i][
+                                 "content"
+                             ] = f"/no_think\n{original_content}"
+                             break
+             return self.wrapped_server.chat_completion(chat_completion_request)

          # Convert chat messages to text using the model's chat template
          text = self.apply_chat_template(
@@ -720,68 +1027,126 @@ class Server:
              # Keep track of the full response for tool call extraction
              full_response = ""

-             async for token in self._generate_tokens(**generation_args):
-                 # Continuously look for tool calls embedded into the generated text
-                 openai_tool_calls = None
-                 if chat_completion_request.tools:
+             # Track whether we're still in the thinking phase (before </think> tag)
+             in_thinking_phase = self.llm_loaded.reasoning
+             reasoning_buffer = ""  # Accumulate reasoning tokens to detect </think>

-                     # Append the token to the full response
-                     full_response += token
+             try:
+                 async for token in self._generate_tokens(**generation_args):
+                     # Handle client disconnect: stop generation and exit
+                     if await request.is_disconnected():
+                         self.stop_event.set()
+                         break

-                     tool_calls, _ = extract_tool_calls(
-                         full_response,
-                         tool_call_pattern,
-                     )
+                     # Continuously look for tool calls embedded into the generated text
+                     openai_tool_calls = None
+                     if chat_completion_request.tools:

-                     # If there are tool calls, reset the full response for the next tool call
-                     if tool_calls:
-                         openai_tool_calls = []
-                         full_response = ""
-                         for tool_call in tool_calls:
-                             openai_tool_calls.append(
-                                 ChoiceDeltaToolCall(
-                                     index=0,
-                                     id="-",
-                                     function=ChoiceDeltaToolCallFunction(
-                                         arguments=json.dumps(tool_call["arguments"]),
-                                         name=tool_call["name"],
-                                     ),
-                                     type="function",
-                                 )
+                         # Append the token to the full response
+                         full_response += token
+
+                         tool_calls, _ = extract_tool_calls(
+                             full_response,
+                             tool_call_pattern,
                          )

-                 # Create a ChatCompletionChunk
-                 chunk = ChatCompletionChunk.model_construct(
-                     id="0",
-                     object="chat.completion.chunk",
-                     created=int(time.time()),
-                     model=self.llm_loaded.checkpoint,
-                     choices=[
-                         Choice.model_construct(
-                             index=0,
-                             delta=ChoiceDelta(
-                                 content=(
-                                     "<think>" + token
-                                     if reasoning_first_token
-                                     else token
+                         # If there are tool calls, reset the full response for the next call
+                         if tool_calls:
+                             openai_tool_calls = []
+                             full_response = ""
+                             for tool_call in tool_calls:
+                                 openai_tool_calls.append(
+                                     ChoiceDeltaToolCall(
+                                         index=0,
+                                         id="-",
+                                         function=ChoiceDeltaToolCallFunction(
+                                             arguments=json.dumps(
+                                                 tool_call["arguments"]
+                                             ),
+                                             name=tool_call["name"],
+                                         ),
+                                         type="function",
+                                     )
+                                 )
+
+                     # Create a ChatCompletionChunk with reasoning_content support
+                     # If we're in reasoning mode and haven't seen </think> yet,
+                     # send tokens as reasoning_content instead of content
+                     delta_content = None
+                     delta_reasoning = None
+
+                     if reasoning_first_token:
+                         # First token - include opening tag in reasoning
+                         delta_reasoning = "<think>" + token
+                         reasoning_first_token = False
+                         reasoning_buffer = token
+                     elif in_thinking_phase:
+                         # Still in thinking phase - accumulate and check for </think>
+                         reasoning_buffer += token
+
+                         # Check if we've seen the closing tag
+                         if "</think>" in reasoning_buffer:
+                             # Split at the closing tag
+                             before_close, after_close = reasoning_buffer.split(
+                                 "</think>", 1
+                             )
+
+                             # Send everything before + closing tag as reasoning
+                             if before_close or not reasoning_buffer.startswith(
+                                 "</think>"
+                             ):
+                                 delta_reasoning = before_close + "</think>"
+                             else:
+                                 delta_reasoning = "</think>"
+
+                             # Everything after goes to content (will be sent in next iteration)
+                             # For now, mark that we've exited thinking phase
+                             in_thinking_phase = False
+
+                             # If there's content after </think>, we need to send it too
+                             # But we send it in the current chunk as regular content
+                             if after_close:
+                                 # We have both reasoning and content in this token
+                                 # Send reasoning first, content will accumulate
+                                 delta_content = after_close
+                         else:
+                             # Still accumulating thinking, send as reasoning_content
+                             delta_reasoning = token
+                     else:
+                         # Normal content (after thinking phase ended)
+                         delta_content = token
+
+                     chunk = ChatCompletionChunk.model_construct(
+                         id="0",
+                         object="chat.completion.chunk",
+                         created=int(time.time()),
+                         model=self.llm_loaded.checkpoint,
+                         choices=[
+                             Choice.model_construct(
+                                 index=0,
+                                 delta=ChoiceDelta(
+                                     content=delta_content,
+                                     reasoning_content=delta_reasoning,
+                                     function_call=None,
+                                     role="assistant",
+                                     tool_calls=openai_tool_calls,
+                                     refusal=None,
                                  ),
-                                 function_call=None,
-                                 role="assistant",
-                                 tool_calls=openai_tool_calls,
-                                 refusal=None,
-                             ),
-                             finish_reason=None,
-                             logprobs=None,
-                         )
-                     ],
-                 )
+                                 finish_reason=None,
+                                 logprobs=None,
+                             )
+                         ],
+                     )

-                 # Format as SSE
-                 reasoning_first_token = False
-                 yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")
+                     # Format as SSE
+                     yield f"data: {chunk.model_dump_json()}\n\n".encode("utf-8")

-             # Send the [DONE] marker
-             yield b"data: [DONE]\n\n"
+                 # Send the [DONE] marker only if still connected
+                 if not await request.is_disconnected():
+                     yield b"data: [DONE]\n\n"
+             except asyncio.CancelledError:
+                 self.stop_event.set()
+                 return

          return StreamingResponse(
              generate(),
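
With the change above, streaming chat chunks can carry thinking tokens in a reasoning_content field separate from content. A minimal consumer sketch, assuming the requests package, a locally installed reasoning model (the name below is hypothetical), and an assumed localhost:8000 address:

import json
import requests  # assumed HTTP client

payload = {
    "model": "user.my-reasoning-model",  # hypothetical model name
    "messages": [{"role": "user", "content": "Why is the sky blue?"}],
    "stream": True,
}
with requests.post(
    "http://localhost:8000/api/v1/chat/completions", json=payload, stream=True
) as resp:
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue
        data = raw[len(b"data: "):]
        if data == b"[DONE]":
            break
        delta = json.loads(data)["choices"][0]["delta"]
        # Thinking tokens arrive as reasoning_content; the answer arrives as content
        print(delta.get("reasoning_content") or delta.get("content") or "", end="")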
@@ -861,7 +1226,7 @@ class Server:

          if self.llm_loaded.recipe == "llamacpp":
              try:
-                 return llamacpp.embeddings(embeddings_request, self.llama_telemetry)
+                 return self.wrapped_server.embeddings(embeddings_request)
              except Exception as e: # pylint: disable=broad-exception-caught
                  # Check if model has embeddings label
                  model_info = ModelManager().supported_models.get(
@@ -884,7 +1249,7 @@ class Server:

      async def reranking(self, reranking_request: RerankingRequest):
          """
-         Rerank documents based on their relevance to a query using the llamacpp server.
+         Rerank documents based on their relevance to a query.
          """
          # Initialize load config from reranking request
          lc = LoadConfig(model_name=reranking_request.model)
@@ -894,7 +1259,7 @@ class Server:

          if self.llm_loaded.recipe == "llamacpp":
              try:
-                 return llamacpp.reranking(reranking_request, self.llama_telemetry)
+                 return self.wrapped_server.reranking(reranking_request)
              except Exception as e: # pylint: disable=broad-exception-caught
                  # Check if model has reranking label
                  model_info = ModelManager().supported_models.get(
@@ -940,7 +1305,7 @@ class Server:
              formatted_messages.append(f"{role_marker}\n{content} <|end|>")
          return "\n".join(formatted_messages) + "\n<|assistant|>"

-     async def responses(self, responses_request: ResponsesRequest):
+     async def responses(self, responses_request: ResponsesRequest, request: Request):
          """
          Stream responses using HTTP chunked transfer encoding.
          """
@@ -953,6 +1318,12 @@ class Server:
          # Load the model if it's different from the currently loaded one
          await self.load_llm(lc)

+         if self.llm_loaded.recipe == "llamacpp":
+             raise HTTPException(
+                 status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+                 detail=f"Responses API not supported for recipe: {self.llm_loaded.recipe}",
+             )
+
          # Convert chat messages to text using the model's chat template
          if isinstance(responses_request.input, str):
              text = responses_request.input
@@ -1006,56 +1377,72 @@ class Server:

              full_response = "<think>" if reasoning_first_token else ""

-             async for token in self._generate_tokens(**generation_args):
+             try:
+                 async for token in self._generate_tokens(**generation_args):
+                     # Handle client disconnect: stop generation and exit
+                     if await request.is_disconnected():
+                         self.stop_event.set()
+                         break

-                 # Create an event
-                 delta_event = ResponseTextDeltaEvent(
-                     content_index=0,
-                     delta=("<think>" + token if reasoning_first_token else token),
-                     item_id="0 ",
-                     output_index=0,
-                     type="response.output_text.delta",
-                     sequence_number=0,
-                 )
-                 full_response += token
+                     # Create an event
+                     delta_event = ResponseTextDeltaEvent(
+                         content_index=0,
+                         delta=(
+                             "<think>" + token if reasoning_first_token else token
+                         ),
+                         item_id="0 ",
+                         logprobs=[],
+                         output_index=0,
+                         sequence_number=0,
+                         type="response.output_text.delta",
+                     )
+                     full_response += token

-                 # Format as SSE
-                 reasoning_first_token = False
-                 yield f"data: {delta_event.model_dump_json()}\n\n".encode("utf-8")
+                     # Format as SSE
+                     reasoning_first_token = False
+                     yield f"data: {delta_event.model_dump_json()}\n\n".encode(
+                         "utf-8"
+                     )

-             # Send the completed event
-             response_output_message = ResponseOutputMessage(
-                 id="0",
-                 content=[
-                     ResponseOutputText(
-                         annotations=[],
-                         text=full_response,
-                         type="output_text",
+                 # Send the completed event (only if still connected)
+                 if not await request.is_disconnected():
+                     response_output_message = ResponseOutputMessage(
+                         id="0",
+                         content=[
+                             ResponseOutputText(
+                                 annotations=[],
+                                 text=full_response,
+                                 type="output_text",
+                             )
+                         ],
+                         role="assistant",
+                         status="completed",
+                         type="message",
+                     )
+                     response = Response(
+                         id="0",
+                         model=self.llm_loaded.checkpoint,
+                         created_at=int(time.time()),
+                         object="response",
+                         output=[response_output_message],
+                         parallel_tool_calls=True,
+                         tool_choice="auto",
+                         tools=[],
+                     )
+                     completed_event = ResponseCompletedEvent(
+                         response=response,
+                         type="response.completed",
+                         sequence_number=0,
+                     )
+                     yield f"data: {completed_event.model_dump_json()}\n\n".encode(
+                         "utf-8"
                      )
-                 ],
-                 role="assistant",
-                 status="completed",
-                 type="message",
-             )
-             response = Response(
-                 id="0",
-                 model=self.llm_loaded.checkpoint,
-                 created_at=int(time.time()),
-                 object="response",
-                 output=[response_output_message],
-                 parallel_tool_calls=True,
-                 tool_choice="auto",
-                 tools=[],
-             )
-             completed_event = ResponseCompletedEvent(
-                 response=response,
-                 type="response.completed",
-                 sequence_number=0,
-             )
-             yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

-             # Send the [DONE] marker
-             yield b"data: [DONE]\n\n"
+                     # Send the [DONE] marker
+                     yield b"data: [DONE]\n\n"
+             except asyncio.CancelledError:
+                 self.stop_event.set()
+                 return

          return StreamingResponse(
              generate(),
@@ -1150,18 +1537,33 @@ class Server:
          )
          self.input_tokens = len(input_ids[0])

-         # For non-llamacpp recipes, truncate inputs to ctx_size if needed
-         if self.llm_loaded.recipe != "llamacpp" and self.input_tokens > self.ctx_size:
-             # Truncate input ids
-             truncate_amount = self.input_tokens - self.ctx_size
-             input_ids = input_ids[: self.ctx_size]
+         max_prompt_length = self.ctx_size # Default fallback
+         # For OGA models, try to read the actual max prompt length from config
+         if "oga-" in self.llm_loaded.recipe:
+             try:
+                 if model.config and model.config.get("max_prompt_length"):
+                     max_prompt_length = model.config["max_prompt_length"]
+                     logging.debug(
+                         f"Using OGA model max_prompt_length: {max_prompt_length}"
+                     )
+             # pylint: disable=broad-exception-caught
+             except Exception as e:
+                 logging.debug(f"Could not read OGA model config, using ctx_size: {e}")

+         # Apply truncation if input exceeds the limit
+         if self.input_tokens > max_prompt_length:
+             # Truncate input ids
+             truncate_amount = self.input_tokens - max_prompt_length
+             input_ids = input_ids[:max_prompt_length]
              # Update token count
-             self.input_tokens = len(input_ids)
+             if "oga-" in self.llm_loaded.recipe:
+                 self.input_tokens = len(input_ids)
+             else:
+                 self.input_tokens = len(input_ids[0])

-             # Show warning message
+             # Log warning message instead of raising exception
              truncation_message = (
-                 f"Input exceeded {self.ctx_size} tokens. "
+                 f"Input exceeded {max_prompt_length} tokens. "
                  f"Truncated {truncate_amount} tokens from the beginning."
              )
              logging.warning(truncation_message)
@@ -1285,9 +1687,11 @@ class Server:
          """
          Send performance statistics to the client.
          """
-         # If using llama server, get telemetry from the telemetry instance
-         if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
-             return self.llama_telemetry.get_telemetry_data()
+         # If using wrapped server, get telemetry from the telemetry instance
+         if self.llm_loaded and (
+             self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm"
+         ):
+             return self.wrapped_server.telemetry.get_telemetry_data()

          # For built-in server, use the existing telemetry
          return {
@@ -1387,6 +1791,7 @@ class Server:
              checkpoint=config.checkpoint,
              recipe=config.recipe,
              reasoning=config.reasoning,
+             vision=config.vision,
              mmproj=config.mmproj,
              # The pull endpoint will download an upgraded model if available, even
              # if we already have a local copy of the model
@@ -1432,9 +1837,10 @@ class Server:
          Load a registered LLM into system memory. Install the model first, if needed.
          config: the information required to load the model
          """
+         from huggingface_hub.constants import HF_HUB_CACHE
+
          try:
              await self._load_lock.acquire()
-
              # Acquire all generate locks
              for _ in range(self.max_concurrent_generations):
                  await self._generate_semaphore.acquire()
@@ -1459,6 +1865,38 @@ class Server:
              # Get additional properties from the model registry
              config_to_use = LoadConfig(**supported_models[config.model_name])

+             # For locally uploaded models, convert the relative checkpoint path to absolute path
+             model_source = supported_models.get(config.model_name, {}).get(
+                 "source", None
+             )
+             if (
+                 model_source == "local_upload"
+                 and config_to_use.checkpoint
+                 and not config_to_use.recipe.startswith("hf-")
+             ):
+                 # Check if checkpoint is a relative path (stored during upload)
+                 if not os.path.isabs(config_to_use.checkpoint):
+                     # Convert relative path to absolute by joining with HF_HUB_CACHE
+                     absolute_checkpoint = os.path.join(
+                         HF_HUB_CACHE, config_to_use.checkpoint
+                     )
+                     if os.path.exists(absolute_checkpoint):
+                         config_to_use.checkpoint = absolute_checkpoint
+                     else:
+                         logging.warning(
+                             f"Checkpoint path does not exist: {absolute_checkpoint}"
+                         )
+
+                 # Also resolve mmproj path if present
+                 if config_to_use.mmproj and not os.path.isabs(config_to_use.mmproj):
+                     absolute_mmproj = os.path.join(HF_HUB_CACHE, config_to_use.mmproj)
+                     if os.path.exists(absolute_mmproj):
+                         config_to_use.mmproj = absolute_mmproj
+                     else:
+                         logging.warning(
+                             f"MMProj path does not exist: {absolute_mmproj}"
+                         )
+
              # Caching mechanism: if the checkpoint is already loaded there is nothing else to do
              if (
                  self.llm_loaded
@@ -1466,9 +1904,9 @@ class Server:
              ):
                  if (
                      self.llm_loaded.recipe == "llamacpp"
-                     and self.llama_server_process.poll()
-                 ):
-                     # llama-server process has gone away for some reason, so we should
+                     or self.llm_loaded.recipe == "flm"
+                 ) and self.wrapped_server.process.poll():
+                     # wrapped server process has gone away for some reason, so we should
                      # proceed with loading to get it back
                      pass
                  else:
@@ -1484,12 +1922,18 @@ class Server:
              logging.info(f"Loading llm: {config.model_name}")
              try:
                  if config_to_use.recipe == "llamacpp":
-                     self.llama_server_process = llamacpp.server_load(
+                     self.wrapped_server = LlamaServer(self.llamacpp_backend)
+                     self.wrapped_server.load(
+                         model_config=config_to_use,
+                         ctx_size=self.ctx_size,
+                         do_not_upgrade=True,
+                     )
+
+                 elif config_to_use.recipe == "flm":
+                     self.wrapped_server = FlmServer()
+                     self.wrapped_server.load(
                          model_config=config_to_use,
-                         telemetry=self.llama_telemetry,
-                         backend=self.llamacpp_backend,
                          ctx_size=self.ctx_size,
-                         # Models should only upgrade when using the pull endpoint
                          do_not_upgrade=True,
                      )

@@ -1529,8 +1973,8 @@ class Server:
          for _ in range(self.max_concurrent_generations):
              await self._generate_semaphore.acquire()

-         if self.llm_loaded.recipe == "llamacpp":
-             self.llama_server_process.terminate()
+         if self.llm_loaded.recipe == "llamacpp" or self.llm_loaded.recipe == "flm":
+             self.wrapped_server.process.terminate()

          self.llm_loaded = None
          self.tokenizer = None
@@ -1567,6 +2011,36 @@ class Server:

          return {"object": "list", "data": models_list}

+     async def retrieve_model(self, model_id: str):
+         """
+         Retrieve a specific model by ID in OpenAI-compatible format.
+         """
+         # Raise an error if the model does not exist
+         if model_id not in self.local_models:
+             # Mimic the error format of the OpenAI API
+             raise HTTPException(
+                 status_code=404,
+                 detail={
+                     "message": f"model {model_id} not found",
+                     "type": "api_error",
+                     "param": None,
+                     "code": None,
+                 },
+             )
+
+         # Return the specific model
+         model_info = self.local_models[model_id]
+         model = ServerModel(
+             id=model_id,
+             owned_by="lemonade",
+             object="model",
+             created=int(time.time()),
+             checkpoint=model_info["checkpoint"],
+             recipe=model_info["recipe"],
+         )
+
+         return model
+
      def setup_middleware_timer(self):
          logging.info("Middleware set up")

@@ -1602,6 +2076,48 @@ class Server:
              logging.debug(f"Total request time: {request_time:.4f} seconds")
              return response

+     async def logs_ws(self, websocket: WebSocket):
+         if not self.log_file or not os.path.exists(self.log_file):
+             await websocket.close(code=4000)
+             return
+         await log_streamer(websocket, self.log_file)
+
+     async def get_incompatible_models(self):
+         """
+         Get information about incompatible RyzenAI models in the cache.
+         """
+         try:
+             return ModelManager().get_incompatible_ryzenai_models()
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to scan for incompatible models: {str(e)}",
+             )
+
+     async def cleanup_incompatible_models(self, request: Request):
+         """
+         Delete selected incompatible RyzenAI models from the cache.
+         """
+         try:
+             body = await request.json()
+             model_paths = body.get("model_paths", [])
+
+             if not model_paths:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail="No model_paths provided",
+                 )
+
+             result = ModelManager().cleanup_incompatible_models(model_paths)
+             return result
+         except HTTPException:
+             raise
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"Failed to cleanup models: {str(e)}",
+             )
+

  # This file was originally licensed under Apache 2.0. It has been modified.
  # Modifications Copyright (c) 2025 AMD