lemonade-sdk 8.0.5__py3-none-any.whl → 8.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

This version of lemonade-sdk has been flagged as potentially problematic.

@@ -758,15 +758,18 @@ class LemonadePerfTable(Table):
             data[key] = model_stats.get(key, "")
 
         # Create a new entry with Driver Versions and relevant Python Packages
-        sw_versions = [
-            key + ": " + value
-            for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
-        ]
-        sw_versions += [
-            pkg
-            for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
-            if any(name in pkg for name in PYTHON_PACKAGES)
-        ]
+        sw_versions = []
+        if "Driver Versions" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                key + ": " + value
+                for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
+            ]
+        if "Python Packages" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                pkg
+                for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
+                if any(name in pkg for name in PYTHON_PACKAGES)
+            ]
         if isinstance(data[Keys.RYZEN_AI_VERSION_INFO], dict):
             sw_versions += [
                 "Ryzen AI: " + value
@@ -1,13 +1,11 @@
-import sys
 import os
+import sys
 import logging
 import time
 import subprocess
-import zipfile
 import re
 import threading
 import platform
-import shutil
 
 import requests
 from tabulate import tabulate
@@ -18,12 +16,18 @@ from openai import OpenAI
 
 from lemonade_server.pydantic_models import (
     ChatCompletionRequest,
+    CompletionRequest,
     PullConfig,
     EmbeddingsRequest,
     RerankingRequest,
 )
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.utils.port import find_free_port
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+)
 
 LLAMA_VERSION = "b5787"
 
@@ -80,39 +84,6 @@ def get_binary_url_and_filename(version):
     return url, filename
 
 
-def validate_platform_support():
-    """
-    Validate platform support before attempting download
-    """
-    system = platform.system().lower()
-
-    if system not in ["windows", "linux"]:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=(
-                f"Platform {system} not supported for llamacpp. "
-                "Supported: Windows, Ubuntu Linux"
-            ),
-        )
-
-    if system == "linux":
-        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
-        try:
-            with open("/etc/os-release", "r", encoding="utf-8") as f:
-                os_info = f.read().lower()
-                if "ubuntu" not in os_info and "debian" not in os_info:
-                    logging.warning(
-                        "llamacpp binaries are built for Ubuntu. "
-                        "Compatibility with other Linux distributions is not guaranteed."
-                    )
-        except (FileNotFoundError, PermissionError, OSError) as e:
-            logging.warning(
-                "Could not determine Linux distribution (%s). "
-                "llamacpp binaries are built for Ubuntu.",
-                str(e),
-            )
-
-
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
@@ -283,7 +254,7 @@ def _launch_llama_subprocess(
     """
 
     # Get the current executable path (handles both Windows and Ubuntu structures)
-    _, exe_path = get_llama_server_paths()
+    exe_path = get_llama_server_exe_path()
 
     # Build the base command
     base_command = [exe_path, "-m", snapshot_files["variant"]]
@@ -350,68 +321,23 @@ def _launch_llama_subprocess(
 
 
 def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
-    # Validate platform support before proceeding
-    validate_platform_support()
+    # Install and/or update llama.cpp if needed
+    try:
+        install_llamacpp()
+    except NotImplementedError as e:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
+        )
 
     # Get platform-specific paths at runtime
-    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
-
-    # Check whether the llamacpp install needs an upgrade
-    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
-    if os.path.exists(version_txt_path):
-        with open(version_txt_path, "r", encoding="utf-8") as f:
-            llamacpp_installed_version = f.read()
-
-        if llamacpp_installed_version != LLAMA_VERSION:
-            # Remove the existing install, which will trigger a new install
-            # in the next code block
-            shutil.rmtree(llama_server_exe_dir)
-
-    # Download llama.cpp server if it isn't already available
-    if not os.path.exists(llama_server_exe_dir):
-        # Download llama.cpp server zip
-        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
-        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
-        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
-
-        with requests.get(llama_zip_url, stream=True) as r:
-            r.raise_for_status()
-            with open(llama_zip_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-        # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
-        with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(llama_server_exe_dir)
-
-        # Make executable on Linux - need to update paths after extraction
-        if platform.system().lower() == "linux":
-            # Re-get the paths since extraction might have changed the directory structure
-            _, updated_exe_path = get_llama_server_paths()
-            if os.path.exists(updated_exe_path):
-                os.chmod(updated_exe_path, 0o755)
-                logging.info(f"Set executable permissions for {updated_exe_path}")
-            else:
-                logging.warning(
-                    f"Could not find llama-server executable at {updated_exe_path}"
-                )
-
-        # Save version.txt
-        with open(version_txt_path, "w", encoding="utf-8") as vf:
-            vf.write(LLAMA_VERSION)
-
-        # Delete zip file
-        os.remove(llama_zip_path)
-        logging.info("Cleaned up zip file")
+    llama_server_exe_path = get_llama_server_exe_path()
 
     # Download the gguf to the hugging face cache
-    model_manager = ModelManager()
-    snapshot_files = model_manager.download_gguf(model_config)
+    snapshot_files = download_gguf(model_config.checkpoint, model_config.mmproj)
     logging.debug(f"GGUF file paths: {snapshot_files}")
 
     # Check if model supports embeddings
-    supported_models = model_manager.supported_models
+    supported_models = ModelManager().supported_models
     model_info = supported_models.get(model_config.model_name, {})
     supports_embeddings = "embeddings" in model_info.get("labels", [])
     supports_reranking = "reranking" in model_info.get("labels", [])
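
server_load no longer validates the platform or downloads and unzips the llama.cpp binaries inline; that logic moves behind install_llamacpp, get_llama_server_exe_path, and download_gguf in lemonade.tools.llamacpp.utils. A rough standalone sketch (not package code) of the error-translation pattern the new code uses, with a stub standing in for the real installer:

# Sketch: a NotImplementedError from the installer becomes an HTTP 422 for the
# API caller instead of a generic 500. The stub below stands in for
# lemonade.tools.llamacpp.utils.install_llamacpp.
from fastapi import HTTPException, status

def install_llamacpp_stub(system: str = "darwin"):
    if system not in ("windows", "linux"):
        raise NotImplementedError(f"Platform {system} not supported for llamacpp")

def load_backend():
    try:
        install_llamacpp_stub()
    except NotImplementedError as e:
        # Surface the failure as a well-formed API error
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
        )

try:
    load_backend()
except HTTPException as exc:
    print(exc.status_code, exc.detail)  # 422, "Platform darwin not supported ..."
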
@@ -523,6 +449,68 @@ def chat_completion(
         )
 
 
+def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
+    """
+    Handle text completions using the llamacpp server.
+
+    Args:
+        completion_request: The completion request containing prompt and parameters
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Completion response from the llamacpp server
+    """
+    base_url = llamacpp_address(telemetry.port)
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
+
+    # Check if streaming is requested
+    if completion_request.stream:
+
+        def event_stream():
+            try:
+                # Enable streaming
+                for chunk in client.completions.create(**request_dict):
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                yield "data: [DONE]\n\n"
+
+                # Show telemetry after completion
+                telemetry.show_telemetry()
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+        return StreamingResponse(
+            event_stream(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+    else:
+        # Non-streaming response
+        try:
+            # Disable streaming for non-streaming requests
+            response = client.completions.create(**request_dict)
+
+            # Show telemetry after completion
+            telemetry.show_telemetry()
+
+            return response
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Completion error: {str(e)}",
+            )
+
+
 def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
     """
     Generate embeddings using the llamacpp server.
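
The new completion() handler forwards OpenAI-style text completions to the bundled llama-server, so standard OpenAI clients can call the endpoint directly. A hedged usage sketch (not from the diff); the base URL with port 8000 and an /api/v1 prefix, and the model name, are assumptions to adjust for your local server configuration:

# Hypothetical client-side usage of the new completions route.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

response = client.completions.create(
    model="Qwen2.5-0.5B-Instruct-GGUF",  # example model name, not from the diff
    prompt="def fibonacci(n):",
    max_tokens=64,
    stream=False,
)
print(response.choices[0].text)
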
@@ -284,7 +284,7 @@ class Server(ManagementTool):
     def _setup_server_common(
         self,
         port: int,
-        truncate_inputs: bool = False,
+        truncate_inputs: Optional[int] = None,
         log_level: str = DEFAULT_LOG_LEVEL,
         tray: bool = False,
         log_file: str = None,
@@ -295,7 +295,7 @@ class Server(ManagementTool):
 
         Args:
             port: Port number for the server
-            truncate_inputs: Whether to truncate inputs if they exceed max length
+            truncate_inputs: Truncate messages to this length
             log_level: Logging level to configure
             threaded_mode: Whether this is being set up for threaded execution
         """
@@ -372,7 +372,7 @@ class Server(ManagementTool):
         _=None,
         port: int = DEFAULT_PORT,
         log_level: str = DEFAULT_LOG_LEVEL,
-        truncate_inputs: bool = False,
+        truncate_inputs: Optional[int] = None,
         tray: bool = False,
         log_file: str = None,
     ):
@@ -393,7 +393,7 @@ class Server(ManagementTool):
         port: int = DEFAULT_PORT,
         host: str = "localhost",
         log_level: str = "warning",
-        truncate_inputs: bool = False,
+        truncate_inputs: Optional[int] = None,
     ):
         """
         Set up the server for running in a thread.
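
Taken together, these signature changes turn truncate_inputs from a boolean flag into an explicit token budget, with None meaning no truncation. A standalone illustration (not package code) of how such a parameter is typically interpreted:

from typing import Optional

def resolve_budget(truncate_inputs: Optional[int], input_tokens: int) -> int:
    """Return how many tokens to keep: the whole input when no budget is set,
    otherwise at most `truncate_inputs` tokens."""
    if truncate_inputs is None:
        return input_tokens
    return min(input_tokens, truncate_inputs)

assert resolve_budget(None, 5000) == 5000   # no truncation requested
assert resolve_budget(2048, 5000) == 2048   # explicit token budget
assert resolve_budget(2048, 100) == 100     # short inputs pass through
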
@@ -487,6 +487,9 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)
 
+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.completion(completion_request, self.llama_telemetry)
+
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning
 
@@ -1096,29 +1099,20 @@ class Server(ManagementTool):
         )
         self.input_tokens = len(input_ids[0])
 
-        if (
-            self.llm_loaded.max_prompt_length
-            and self.input_tokens > self.llm_loaded.max_prompt_length
-        ):
-            if self.truncate_inputs:
-                # Truncate input ids
-                truncate_amount = self.input_tokens - self.llm_loaded.max_prompt_length
-                input_ids = input_ids[: self.llm_loaded.max_prompt_length]
-
-                # Update token count
-                self.input_tokens = len(input_ids)
-
-                # Show warning message
-                truncation_message = (
-                    f"Input exceeded {self.llm_loaded.max_prompt_length} tokens. "
-                    f"Truncated {truncate_amount} tokens."
-                )
-                logging.warning(truncation_message)
-            else:
-                raise RuntimeError(
-                    f"Prompt tokens ({self.input_tokens}) cannot be greater "
-                    f"than the model's max prompt length ({self.llm_loaded.max_prompt_length})"
-                )
+        if self.truncate_inputs and self.truncate_inputs > self.input_tokens:
+            # Truncate input ids
+            truncate_amount = self.input_tokens - self.truncate_inputs
+            input_ids = input_ids[: self.truncate_inputs]
+
+            # Update token count
+            self.input_tokens = len(input_ids)
+
+            # Show warning message
+            truncation_message = (
+                f"Input exceeded {self.truncate_inputs} tokens. "
+                f"Truncated {truncate_amount} tokens."
+            )
+            logging.warning(truncation_message)
 
         # Log the input tokens early to avoid this not showing due to potential crashes
         logging.debug(f"Input Tokens: {self.input_tokens}")
@@ -1314,7 +1308,7 @@ class Server(ManagementTool):
         self.tokenizer = None
         self.model = None
 
-        default_message = f"model {model_reference} not found"
+        default_message = "see stack trace and error message below"
         if message:
             detail = message
         else:
@@ -1,33 +1,92 @@
+/* === CSS Variables === */
+:root {
+    /* Colors */
+    --primary-yellow: #ffe066;
+    --primary-yellow-dark: #ffd43b;
+    --accent-gold: #e6b800;
+    --accent-gold-dark: #bfa100;
+
+    --text-primary: #222;
+    --text-secondary: #555;
+    --text-muted: #666;
+
+    --bg-primary: #fffbe9;
+    --bg-secondary: #fff8dd;
+    --bg-tertiary: #fff5d1;
+
+    /* Transitions */
+    --transition-fast: 0.2s ease;
+    --transition-medium: 0.3s ease;
+}
+
 body {
     margin: 0;
     font-family: 'Segoe UI', 'Arial', sans-serif;
-    background: #fffbe9;
-    color: #222;
+    background: linear-gradient(135deg, var(--bg-primary) 0%, var(--bg-secondary) 50%, var(--bg-tertiary) 100%);
+    color: var(--text-primary);
     min-height: 100vh;
     display: flex;
     flex-direction: column;
-    padding-bottom: 5rem;
+}
+
+body::before {
+    content: '';
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    background:
+        radial-gradient(circle at 20% 20%, rgba(255, 224, 102, 0.1) 0%, transparent 50%),
+        radial-gradient(circle at 80% 80%, rgba(255, 212, 59, 0.1) 0%, transparent 50%);
+    pointer-events: none;
+    z-index: -1;
 }
 
 .navbar {
     display: flex;
-    justify-content: center;
-    gap: 2.5rem;
-    padding: 2rem 0 1.5rem 0;
+    justify-content: space-between;
+    align-items: center;
+    padding: 1rem 3rem 0.5rem 1rem;
     font-size: 1.25rem;
     font-weight: 500;
     background: transparent;
     letter-spacing: 0.02em;
+    position: relative;
+    transition: var(--transition-medium);
 }
 
-.navbar a {
+.navbar-brand {
+    display: flex;
+    align-items: center;
+}
+
+.brand-title {
+    font-size: 1.5rem;
+    font-weight: 700;
+    color: var(--text-primary);
+    text-decoration: none;
+    letter-spacing: 0.01em;
+}
+
+.brand-title a {
+    color: inherit;
+    text-decoration: none;
+}
+
+.navbar-links {
+    display: flex;
+    gap: 2.5rem;
+}
+
+.navbar-links a {
     color: #444;
     text-decoration: none;
-    transition: color 0.2s;
+    transition: var(--transition-fast);
 }
 
-.navbar a:hover {
-    color: #e6b800;
+.navbar-links a:hover {
+    color: var(--accent-gold);
 }
 
 .main {
@@ -37,26 +96,14 @@ body {
     align-items: center;
     justify-content: flex-start;
     min-height: 60vh;
-    margin-top: 3rem;
-}
-
-.title {
-    font-size: 3rem;
-    font-weight: 700;
-    margin-bottom: 2.5rem;
-    letter-spacing: 0.01em;
-    text-align: center;
-    color: #222;
+    margin-top: 2rem;
+    padding-top: 1rem;
 }
 
 .site-footer {
-    position: fixed;
-    left: 0;
-    bottom: 0;
-    width: 100%;
-    background-color: #fffbe9;
+    background: transparent;
     padding-top: 0.5rem;
-    z-index: 100;
+    margin-top: auto;
 }
 
 .dad-joke {
@@ -483,6 +530,10 @@ body {
     background-color: #ca4747;
 }
 
+.model-label.coding {
+    background-color: #ff6b35;
+}
+
 .model-labels-container {
     display: flex;
     align-items: center;
@@ -983,6 +1034,50 @@ body {
     }
 }
 
+/* === Responsive Navbar === */
+@media (max-width: 800px) {
+    .navbar {
+        flex-direction: column;
+        gap: 1rem;
+        padding: 1rem 1rem 0.5rem 1rem;
+        align-items: center;
+    }
+
+    .navbar-brand {
+        margin-bottom: 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.3rem;
+    }
+
+    .navbar-links {
+        gap: 1.5rem;
+        font-size: 1rem;
+    }
+}
+
+@media (max-width: 600px) {
+    .navbar {
+        padding: 0.5rem 0.5rem 0.25rem 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.2rem;
+    }
+
+    .navbar-links {
+        gap: 1rem;
+        font-size: 0.9rem;
+        flex-wrap: wrap;
+        justify-content: center;
+    }
+
+    .main {
+        margin-top: 0.5rem;
+    }
+}
+
 /* Ensure form container allows tooltip overflow */
 .model-mgmt-register-form {
     position: relative;
@@ -12,14 +12,19 @@
     {{SERVER_MODELS_JS}}
 </head>
 <body>
-    <nav class="navbar">
-        <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
-        <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
-        <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
-        <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+    <nav class="navbar" id="navbar">
+        <div class="navbar-brand">
+            <span class="brand-title"><a href="https://lemonade-server.ai">🍋 Lemonade Server</a></span>
+        </div>
+        <div class="navbar-links">
+            <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
+            <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
+            <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
+            <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+            <a href="https://lemonade-server.ai/news/" target="_blank">News</a>
+        </div>
     </nav>
     <main class="main">
-        <div class="title">🍋 Lemonade Server</div>
         <div class="tab-container">
             <div class="tabs">
                 <button class="tab active" id="tab-chat" onclick="showTab('chat')">LLM Chat</button>
@@ -104,6 +109,7 @@
                 </label>
                 <select id="register-recipe" name="recipe" required>
                     <option value="llamacpp">llamacpp</option>
+                    <option value="oga-npu">oga-npu</option>
                     <option value="oga-hybrid">oga-hybrid</option>
                     <option value="oga-cpu">oga-cpu</option>
                 </select>
@@ -408,6 +414,8 @@
                 labelClass = 'reasoning';
             } else if (labelLower === 'reranking') {
                 labelClass = 'reranking';
+            } else if (labelLower === 'coding') {
+                labelClass = 'coding';
             }
             labelSpan.className = `model-label ${labelClass}`;
             labelSpan.textContent = label;
@@ -87,8 +87,15 @@ class LemonadeTray(SystemTray):
         Update the latest version information.
         """
         try:
+            # Prepare headers for GitHub API request
+            headers = {}
+            github_token = os.environ.get("GITHUB_TOKEN")
+            if github_token:
+                headers["Authorization"] = f"token {github_token}"
+
             response = requests.get(
                 "https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest",
+                headers=headers,
                 timeout=10,  # Add timeout to prevent hanging
             )
             response.raise_for_status()
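
The update check now sends an Authorization header when a GITHUB_TOKEN environment variable is set, which moves the request from GitHub's low unauthenticated rate limit (60 requests per hour per IP) to the much higher authenticated limit. A standalone sketch (not package code) of the same header-building pattern against GitHub's rate_limit endpoint, where the effect is directly visible:

import os
import requests

# Build headers the same way as the hunk above, then query the rate_limit API
headers = {}
token = os.environ.get("GITHUB_TOKEN")
if token:
    headers["Authorization"] = f"token {token}"

resp = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
resp.raise_for_status()
print(resp.json()["resources"]["core"])  # "limit" is higher when a token is set
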
lemonade/version.py CHANGED
@@ -1 +1 @@
-__version__ = "8.0.5"
+__version__ = "8.1.0"