lemonade-sdk 7.0.0-py3-none-any.whl → 7.0.2-py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Potentially problematic release: this version of lemonade-sdk has been flagged as possibly problematic.
- lemonade/cli.py +2 -0
- lemonade/tools/accuracy.py +335 -0
- lemonade/tools/server/instructions.py +294 -0
- lemonade/tools/server/llamacpp.py +315 -0
- lemonade/tools/server/port_utils.py +57 -0
- lemonade/tools/server/pydantic_models.py +83 -0
- lemonade/tools/server/serve.py +225 -167
- lemonade/tools/server/static/styles.css +313 -0
- lemonade/tools/server/thread_utils.py +87 -0
- lemonade/tools/server/tool_calls.py +50 -43
- lemonade/version.py +1 -1
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/METADATA +4 -7
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/RECORD +21 -14
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/WHEEL +1 -1
- lemonade_server/cli.py +4 -2
- lemonade_server/model_manager.py +34 -17
- lemonade_server/server_models.json +52 -3
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.0.dist-info → lemonade_sdk-7.0.2.dist-info}/top_level.txt +0 -0
lemonade/tools/server/serve.py
CHANGED
@@ -7,12 +7,16 @@ import logging
 import traceback
 from typing import Optional, Union
 import json
+import subprocess
+from pathlib import Path

 from fastapi import FastAPI, HTTPException, status, Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
-from
+from fastapi.staticfiles import StaticFiles
 import uvicorn
+from uvicorn.config import Config
+from uvicorn.server import Server as UvicornServer
 from transformers import TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from tabulate import tabulate

@@ -24,7 +28,11 @@ from openai.types.chat.chat_completion_message_tool_call import (
     Function,
 )
 from openai.types.chat.chat_completion import Choice
-from openai.types.chat.chat_completion_chunk import
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDelta,
+    ChoiceDeltaToolCall,
+    ChoiceDeltaToolCallFunction,
+)
 from openai.types.completion_choice import Logprobs
 from openai.types.model import Model
 from openai.types.responses import (
@@ -39,11 +47,18 @@ from openai.types.responses import (
 import lemonade.api as lemonade_api
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.management_tools import ManagementTool
-
-
-
-
-
+import lemonade.tools.server.llamacpp as llamacpp
+from lemonade.tools.server.pydantic_models import (
+    DEFAULT_MAX_NEW_TOKENS,
+    LoadConfig,
+    CompletionRequest,
+    ChatCompletionRequest,
+    ResponsesRequest,
+    PullConfig,
+)
+from lemonade.tools.server.tool_calls import extract_tool_calls, get_tool_call_pattern
+from lemonade.tools.server.instructions import get_instructions_html
+from lemonade.tools.server.port_utils import lifespan

 DEFAULT_PORT = 8000
 DEFAULT_LOG_LEVEL = "info"
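The request and configuration schemas (PullConfig, LoadConfig, CompletionRequest, ChatCompletionRequest, ResponsesRequest) now live in the new lemonade.tools.server.pydantic_models module and are imported here rather than defined in serve.py; their old definitions are removed further down in this diff. A minimal sketch (not part of the diff) of constructing these models in 7.0.2, assuming the field names match those removed definitions; the model name and checkpoint strings are placeholders:

# Sketch: build the schema objects from their new home.
# Field names are taken from the class bodies removed from serve.py below;
# the concrete values are hypothetical placeholders.
from lemonade.tools.server.pydantic_models import LoadConfig, ChatCompletionRequest

load_cfg = LoadConfig(
    model_name="placeholder-model-name",
    checkpoint="placeholder-org/placeholder-checkpoint",
    recipe="llamacpp",  # the recipe string used by the new llama.cpp code path
)

chat_req = ChatCompletionRequest(
    model="placeholder-model-name",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)

print(load_cfg.model_dump())
print(chat_req.model_dump())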
@@ -101,97 +116,21 @@ class StopOnEvent(StoppingCriteria):
         return self.stop_event.is_set()


-class PullConfig(BaseModel):
-    """
-    Configurating for installing a supported LLM.
-    """
-
-    model_name: str
-
-
-class LoadConfig(BaseModel):
-    """
-    Configuration for loading a language model.
-
-    Specifies the model checkpoint, generation parameters,
-    and hardware/framework configuration (recipe) for model loading.
-    """
-
-    model_name: Optional[str] = None
-    checkpoint: Optional[str] = None
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
-    recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
-    # Indicates whether the model is a reasoning model, like DeepSeek
-    reasoning: Optional[bool] = False
-
-
-class CompletionRequest(BaseModel):
-    """
-    Request model for text completion API endpoint.
-
-    Contains a prompt, a model identifier, and a streaming
-    flag to control response delivery.
-    """
-
-    prompt: str
-    model: str
-    echo: bool = False
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    max_tokens: int | None = None
-
-
-class ChatCompletionRequest(BaseModel):
-    """
-    Request model for chat completion API endpoint.
-
-    Contains a list of chat messages, a model identifier,
-    and a streaming flag to control response delivery.
-    """
-
-    messages: list[dict]
-    model: str
-    stream: bool = False
-    logprobs: int | None = False
-    stop: list[str] | str | None = None
-    temperature: float | None = None
-    tools: list[dict] | None = None
-    max_tokens: int | None = None
-    max_completion_tokens: int | None = None
-
-
-class ResponsesRequest(BaseModel):
-    """
-    Request model for responses API endpoint.
-    """
-
-    input: list[dict] | str
-    model: str
-    max_output_tokens: int | None = None
-    temperature: float | None = None
-    stream: bool = False
-
-
 class Server(ManagementTool):
     """
     Open a web server that apps can use to communicate with the LLM.

     The server exposes these endpoints:
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
-    - /api/
+    - /api/v1/pull: install an LLM by its Lemonade Server Model Name.
+    - /api/v1/load: load a model checkpoint.
+    - /api/v1/unload: unload a model checkpoint.
+    - /api/v1/health: check whether a model is loaded and ready to serve.
+    - /api/v1/stats: performance statistics for the generation.
+    - /api/v1/halt: stop an in-progress generation from make more tokens.
+    - /api/v1/completions: completion responses using HTTP chunked transfer encoding.
+    - /api/v1/chat/completions: chat completion responses using HTTP chunked transfer encoding.
+    - /api/v1/responses: responses API using HTTP chunked transfer encoding.
+    - /api/v1/models: list all available models.
     """

     unique_name = "serve"
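The docstring above now lists the endpoints under the new versioned /api/v1 prefix. A small usage sketch (not part of the diff) that exercises two of them with the requests library, assuming a Lemonade Server instance is already listening on the DEFAULT_PORT of 8000:

# Sketch: query two of the endpoints listed in the Server docstring.
import requests

BASE = "http://localhost:8000/api/v1"

# /health: check whether a model is loaded and ready to serve
print(requests.get(f"{BASE}/health").json())

# /models: list all available models (OpenAI-compatible shape)
print(requests.get(f"{BASE}/models").json())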
@@ -200,7 +139,7 @@ class Server(ManagementTool):
         super().__init__()

         # Initialize FastAPI app
-        self.app = FastAPI()
+        self.app = FastAPI(lifespan=lifespan)

         # Add CORS middleware
         self.app.add_middleware(
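The FastAPI app is now created with a lifespan handler imported from the new lemonade.tools.server.port_utils module, whose contents are not part of this hunk. The following is only a generic sketch of the FastAPI lifespan pattern that line plugs into; the handler body is a hypothetical placeholder that reads the app.port attribute set later in _setup_server_common:

# Generic FastAPI lifespan sketch; the real handler lives in
# lemonade/tools/server/port_utils.py and is not shown in this diff.
from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: hypothetical use of the port stored on the app
    # (serve.py sets self.app.port = port in _setup_server_common)
    print(f"starting on port {getattr(app, 'port', None)}")
    yield
    # Shutdown: cleanup would go here
    print("shutting down")


app = FastAPI(lifespan=lifespan)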
@@ -212,23 +151,18 @@ class Server(ManagementTool):
         )

         # Set up custom routes
-        self.
-        self.app.post("/api/v0/load")(self.load_llm)
-        self.app.post("/api/v0/unload")(self.unload_llm)
-        self.app.get("/api/v0/health")(self.health)
-        self.app.get("/api/v0/halt")(self.halt_generation)
-        self.app.get("/api/v0/stats")(self.send_stats)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.post("/api/v0/responses")(self.responses)
-
-        # Set up OpenAI-compatible routes
-        self.app.post("/api/v0/chat/completions")(self.chat_completions)
-        self.app.post("/api/v0/completions")(self.completions)
-        self.app.get("/api/v0/models")(self.models)
+        self.setup_routes(["/api/v0", "/api/v1"])

         # Set up instructions
         self.app.get("/")(self.instructions)

+        # Mount a static assets dir for HTML responses, such
+        # as the instructions
+        static_dir = Path(__file__).parent / "static"
+        self.app.mount(
+            "/static", StaticFiles(directory=static_dir), name="static_assets"
+        )
+
         # Performance stats that are set during /ws and can be
         # fetched in /stats
         self.time_to_first_token = None
@@ -263,6 +197,28 @@ class Server(ManagementTool):
         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()

+        # Subprocess handle for llama_server.exe
+        self.llama_server_process: subprocess.Popen = None
+
+        # Telemetry instance for llama server
+        self.llama_telemetry = llamacpp.LlamaTelemetry()
+
+    def setup_routes(self, api_prefixes: list[str]):
+        for prefix in api_prefixes:
+            # Custom routes
+            self.app.post(f"{prefix}/pull")(self.pull)
+            self.app.post(f"{prefix}/load")(self.load_llm)
+            self.app.post(f"{prefix}/unload")(self.unload_llm)
+            self.app.get(f"{prefix}/health")(self.health)
+            self.app.get(f"{prefix}/halt")(self.halt_generation)
+            self.app.get(f"{prefix}/stats")(self.send_stats)
+            self.app.post(f"{prefix}/completions")(self.completions)
+            self.app.post(f"{prefix}/responses")(self.responses)
+
+            # OpenAI-compatible routes
+            self.app.post(f"{prefix}/chat/completions")(self.chat_completions)
+            self.app.get(f"{prefix}/models")(self.models)
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
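setup_routes registers each handler once per prefix, so the legacy /api/v0 paths and the new /api/v1 paths are served by the same bound methods. A sketch (not part of the diff) of checking that with FastAPI's TestClient; constructing Server() with no arguments is an assumption based on the __init__ shown above, and TestClient requires the httpx package:

# Sketch: the same route answers under both prefixes.
from fastapi.testclient import TestClient

from lemonade.tools.server.serve import Server

server = Server()  # assumed to need no constructor arguments
client = TestClient(server.app)

for prefix in ("/api/v0", "/api/v1"):
    response = client.get(f"{prefix}/models")
    print(prefix, response.status_code)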
@@ -288,15 +244,22 @@ class Server(ManagementTool):

         return parser

-    def
+    def _setup_server_common(
         self,
-
-        # we always use the default cache directory
-        _=None,
-        port: int = DEFAULT_PORT,
-        log_level: str = DEFAULT_LOG_LEVEL,
+        port: int,
         truncate_inputs: bool = False,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        threaded_mode: bool = False,
     ):
+        """
+        Common setup logic shared between run() and run_in_thread().
+
+        Args:
+            port: Port number for the server
+            truncate_inputs: Whether to truncate inputs if they exceed max length
+            log_level: Logging level to configure
+            threaded_mode: Whether this is being set up for threaded execution
+        """
         # Store truncation settings
         self.truncate_inputs = truncate_inputs

@@ -310,22 +273,27 @@ class Server(ManagementTool):

         logging.trace = trace

-        # Configure logging
-
-
-
-
-
-
+        # Configure logging based on mode
+        if threaded_mode:
+            # Configure logging for warning level (to reduce noise in threaded execution)
+            logging.getLogger("uvicorn.error").setLevel(logging.WARNING)
+        else:
+            # Configure logging to match uvicorn's format
+            logging_level = getattr(logging, log_level.upper())
+            logging.basicConfig(
+                level=logging_level,
+                format="%(levelprefix)s %(message)s",
+                datefmt="%Y-%m-%d %H:%M:%S",
+            )

-
-
-
-
-
+            # Add uvicorn's log formatter
+            logging.root.handlers[0].formatter = uvicorn.logging.DefaultFormatter(
+                fmt="%(levelprefix)s %(message)s",
+                use_colors=True,
+            )

-
-
+            # Ensure the log level is properly set
+            logging.getLogger().setLevel(logging_level)

         # Update debug logging state after setting log level
         self.debug_logging_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
@@ -334,8 +302,66 @@ class Server(ManagementTool):
         # Print the elapsed time for each request
         self.setup_middleware_timer()

+        # Let the app know what port it's running on, so
+        # that the lifespan can access it
+        self.app.port = port
+
+    def run(
+        self,
+        # ManagementTool has a required cache_dir arg, but
+        # we always use the default cache directory
+        _=None,
+        port: int = DEFAULT_PORT,
+        log_level: str = DEFAULT_LOG_LEVEL,
+        truncate_inputs: bool = False,
+    ):
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=False,
+        )
+
         uvicorn.run(self.app, host="localhost", port=port, log_level=log_level)

+    def run_in_thread(
+        self,
+        port: int = DEFAULT_PORT,
+        host: str = "localhost",
+        log_level: str = "warning",
+        truncate_inputs: bool = False,
+    ):
+        """
+        Set up the server for running in a thread.
+        Returns a uvicorn server instance that can be controlled externally.
+        """
+        # Common setup
+        self._setup_server_common(
+            port=port,
+            truncate_inputs=truncate_inputs,
+            log_level=log_level,
+            threaded_mode=True,
+        )
+
+        class CustomServer(UvicornServer):
+            """Custom Uvicorn server that can be properly shutdown from another thread"""
+
+            def install_signal_handlers(self):
+                pass
+
+        # Configure the server
+        config = Config(
+            app=self.app,
+            host=host,
+            port=port,
+            log_level=log_level,
+            log_config=None,
+        )
+
+        # Create and return the uvicorn server
+        return CustomServer(config=config)
+
     async def _show_telemetry(self):
         """
         Show telemetry data in debug mode.
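run_in_thread performs the shared setup and hands back a uvicorn server object instead of blocking, so the caller owns the thread. A usage sketch (not part of the diff) that assumes the returned CustomServer behaves like a standard uvicorn.Server, where run() blocks and setting should_exit requests shutdown:

# Sketch: drive the threaded server from a test or script.
import threading
import time

from lemonade.tools.server.serve import Server

server = Server()
uvicorn_server = server.run_in_thread(port=8123, log_level="warning")

thread = threading.Thread(target=uvicorn_server.run, daemon=True)
thread.start()

# ... issue requests against http://localhost:8123/api/v1/... here ...
time.sleep(1)

# Request shutdown and wait for the server thread to finish
uvicorn_server.should_exit = True
thread.join(timeout=10)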
@@ -363,31 +389,8 @@ class Server(ManagementTool):
         """
         Show instructions on how to use the server.
         """
-
-
-        <html>
-        <head>
-            <title>Lemonade Server</title>
-            <link rel="icon" href="data:,">
-        </head>
-        <body>
-            <h1>🍋 Welcome to Lemonade Server!</h1>
-            <p>
-                A standards-compliant server that provides REST APIs for LLM communication.
-                To get started, simply point your OpenAI-compatible application at the server's endpoint.
-            </p>
-            <div class="links">
-                <h3>Documentation:</h3>
-                <ul>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/tree/main/docs/server/apps/README.md">Examples & Usage</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_integration.md">Integration Guide</a></li>
-                    <li><a href="https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_spec.md">Server Specification</a></li>
-                </ul>
-            </div>
-        </body>
-        </html>
-        """
-        return HTMLResponse(content=html_content, status_code=200)
+
+        return get_instructions_html(port=self.app.port)

     def initialize_load_config(
         self, request: Union[ChatCompletionRequest, CompletionRequest]
@@ -530,10 +533,6 @@ class Server(ManagementTool):
         Stream chat completion responses using HTTP chunked transfer encoding.
         """

-        if chat_completion_request.tools and chat_completion_request.stream:
-            logging.warning(
-                "tools are only supported on non-streaming chat completions"
-            )
         if chat_completion_request.logprobs:
             logging.warning("logprobs is not supported on chat completion")

@@ -542,14 +541,15 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc, internal_call=True)

+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.chat_completion(
+                chat_completion_request, self.llama_telemetry
+            )
+
         # Convert chat messages to text using the model's chat template
         text = self.apply_chat_template(
             chat_completion_request.messages,
-            tools=
-                chat_completion_request.tools
-                if not chat_completion_request.stream
-                else None
-            ),
+            tools=chat_completion_request.tools,
         )

         # If the model supports reasoning, we:
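When the loaded model's recipe is llamacpp, chat completions are delegated to the new llamacpp module (which fronts an external llama server process) instead of the built-in generation path; the client-facing API is unchanged. A hedged client sketch (not part of the diff) with the openai package; the model name is a placeholder that must match a model the server knows about:

# Sketch: the same OpenAI-style call works regardless of whether the server
# routes it to the built-in generator or to the llama.cpp backend.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/api/v1",
    api_key="lemonade",  # placeholder; a local Lemonade Server is not expected to validate it
)

completion = client.chat.completions.create(
    model="placeholder-model-name",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(completion.choices[0].message.content)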
@@ -585,6 +585,12 @@ class Server(ManagementTool):
             "max_new_tokens": max_new_tokens,
         }

+        if chat_completion_request.tools:
+            # Get the tool call pattern
+            tool_call_pattern = get_tool_call_pattern(
+                self.tokenizer.auto_tokenizer.added_tokens_decoder
+            )
+
         if chat_completion_request.stream:

             # Stream the response
@@ -594,7 +600,38 @@ class Server(ManagementTool):
                 # in the inner function
                 nonlocal reasoning_first_token

+                # Keep track of the full response for tool call extraction
+                full_response = ""
+
                 async for token in self._generate_tokens(**generation_args):
+                    # Continuously look for tool calls embedded into the generated text
+                    openai_tool_calls = None
+                    if chat_completion_request.tools:
+
+                        # Append the token to the full response
+                        full_response += token
+
+                        tool_calls, _ = extract_tool_calls(
+                            full_response,
+                            tool_call_pattern,
+                        )
+
+                        # If there are tool calls, reset the full response for the next tool call
+                        if tool_calls:
+                            openai_tool_calls = []
+                            full_response = ""
+                            for tool_call in tool_calls:
+                                openai_tool_calls.append(
+                                    ChoiceDeltaToolCall(
+                                        index=0,
+                                        id="-",
+                                        function=ChoiceDeltaToolCallFunction(
+                                            arguments=json.dumps(tool_call["arguments"]),
+                                            name=tool_call["name"],
+                                        ),
+                                        type="function",
+                                    )
+                                )

                     # Create a ChatCompletionChunk
                     chunk = ChatCompletionChunk.model_construct(
@@ -613,7 +650,7 @@ class Server(ManagementTool):
                                 ),
                                 function_call=None,
                                 role="assistant",
-                                tool_calls=
+                                tool_calls=openai_tool_calls,
                                 refusal=None,
                             ),
                             finish_reason=None,
@@ -648,7 +685,7 @@ class Server(ManagementTool):
         openai_tool_calls = None
         if chat_completion_request.tools:
             tool_calls, full_response = extract_tool_calls(
-                full_response,
+                full_response, tool_call_pattern
             )
             if tool_calls:
                 openai_tool_calls = []
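Together with dropping the earlier warning, the streaming generator above scans the accumulated text for tool-call patterns and emits any matches as ChoiceDeltaToolCall entries on the chunk deltas, so tools now work with stream=True. A client-side sketch (not part of the diff) using the openai package; the tool definition and model name are illustrative placeholders:

# Sketch: consume streamed tool calls from the server.
import json
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # placeholder tool
            "description": "Get the weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

stream = client.chat.completions.create(
    model="placeholder-model-name",
    messages=[{"role": "user", "content": "What's the weather in Lisbon?"}],
    tools=tools,
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.tool_calls:
        for call in delta.tool_calls:
            print("tool call:", call.function.name, json.loads(call.function.arguments))
    elif delta.content:
        print(delta.content, end="", flush=True)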
@@ -767,6 +804,7 @@ class Server(ManagementTool):
             created_event = ResponseCreatedEvent(
                 response=response,
                 type="response.created",
+                sequence_number=0,
             )
             yield f"data: {created_event.model_dump_json()}\n\n".encode("utf-8")

@@ -781,6 +819,7 @@ class Server(ManagementTool):
                     item_id="0 ",
                     output_index=0,
                     type="response.output_text.delta",
+                    sequence_number=0,
                 )
                 full_response += token

@@ -815,6 +854,7 @@ class Server(ManagementTool):
             completed_event = ResponseCompletedEvent(
                 response=response,
                 type="response.completed",
+                sequence_number=0,
             )
             yield f"data: {completed_event.model_dump_json()}\n\n".encode("utf-8")

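These three hunks add the sequence_number field that newer versions of the openai event types require when constructing Responses API events. On the wire the endpoint still emits data: ... chunks over HTTP chunked transfer; a sketch (not part of the diff) of reading them directly with requests, where the model name is a placeholder:

# Sketch: stream the /api/v1/responses endpoint directly.
import json
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/responses",
    json={
        "model": "placeholder-model-name",
        "input": "Write a haiku about lemons.",
        "stream": True,
    },
    stream=True,
)

for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    event = json.loads(line[len(b"data: "):])
    # Events mirror the openai Responses API: response.created,
    # response.output_text.delta, response.completed, each with a sequence_number
    if event.get("type") == "response.output_text.delta":
        print(event.get("delta", ""), end="", flush=True)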
@@ -1035,6 +1075,11 @@ class Server(ManagementTool):
         """
         Send performance statistics to the client.
         """
+        # If using llama server, get telemetry from the telemetry instance
+        if self.llm_loaded and self.llm_loaded.recipe == "llamacpp":
+            return self.llama_telemetry.get_telemetry_data()
+
+        # For built-in server, use the existing telemetry
         return {
             "time_to_first_token": self.time_to_first_token,
             "tokens_per_second": self.tokens_per_second,
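send_stats now returns telemetry from the llama.cpp backend when a llamacpp-recipe model is loaded and falls back to the built-in counters otherwise; the exact shape of the llama.cpp payload comes from LlamaTelemetry.get_telemetry_data() in the new llamacpp module and is not shown here. Fetching it looks the same either way (sketch, not part of the diff):

# Sketch: read generation telemetry after a request.
import requests

stats = requests.get("http://localhost:8000/api/v1/stats").json()
# The built-in backend reports fields such as time_to_first_token and
# tokens_per_second; the llama.cpp backend returns its own telemetry dict.
print(stats)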
@@ -1246,15 +1291,25 @@ class Server(ManagementTool):

         logging.info(f"Loading llm: {model_reference}")
         try:
-
-
-
+            if config_to_use.recipe == "llamacpp":
+                self.llama_server_process = llamacpp.server_load(
+                    checkpoint=config_to_use.checkpoint,
+                    model_reference=model_reference,
+                    telemetry=self.llama_telemetry,
+                )
+
+            else:
+                self.model, self.tokenizer = lemonade_api.from_pretrained(
+                    checkpoint=config_to_use.checkpoint, recipe=config_to_use.recipe
+                )
             self.llm_loaded = config_to_use

             return {
                 "status": "success",
                 "message": f"Loaded model: {model_reference}",
             }
+        except HTTPException:
+            raise
         except Exception:  # pylint: disable=broad-exception-caught
             self.model_load_failure(model_reference)

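load_llm now branches on the recipe: llamacpp checkpoints are launched as an external llama server subprocess via llamacpp.server_load, every other recipe keeps using lemonade_api.from_pretrained, and HTTPExceptions raised inside the block are re-raised rather than being converted into a generic load failure. A request sketch against the endpoint (not part of the diff); the payload fields follow the LoadConfig model referenced earlier and the values are placeholders:

# Sketch: ask the server to load a llama.cpp-backed model.
import requests

payload = {
    "model_name": "placeholder-model-name",
    "checkpoint": "placeholder-org/placeholder-model-GGUF",
    "recipe": "llamacpp",
}
print(requests.post("http://localhost:8000/api/v1/load", json=payload).json())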
@@ -1279,6 +1334,9 @@ class Server(ManagementTool):
         for _ in range(self.max_concurrent_generations):
             await self._generate_semaphore.acquire()

+        if self.llm_loaded.recipe == "llamacpp":
+            self.llama_server_process.terminate()
+
         self.llm_loaded = None
         self.tokenizer = None
         self.model = None
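Unloading now also terminates the llama server subprocess when the loaded recipe is llamacpp; from the client it remains a single call. A sketch (not part of the diff) assuming the unload endpoint accepts an empty POST body, which the route registration above suggests:

# Sketch: unload whatever model is currently loaded.
import requests

print(requests.post("http://localhost:8000/api/v1/unload").json())
# A follow-up health check should report that no model is loaded.
print(requests.get("http://localhost:8000/api/v1/health").json())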