lemonade-sdk 8.0.5-py3-none-any.whl → 8.0.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

lemonade/tools/prompt.py CHANGED
@@ -161,7 +161,11 @@ class LLMPrompt(Tool):
         # If template flag is set, then wrap prompt in template
         if template:
             # Embed prompt in model's chat template
-            if tokenizer.chat_template:
+            if not hasattr(tokenizer, "prompt_template"):
+                printing.log_warning(
+                    "Templates for this model type are not yet implemented."
+                )
+            elif tokenizer.chat_template:
                 # Use the model's built-in chat template if available
                 messages_dict = [{"role": "user", "content": prompt}]
                 prompt = tokenizer.apply_chat_template(
@@ -175,25 +179,10 @@ class LLMPrompt(Tool):
         state.save_stat(Keys.PROMPT_TEMPLATE, "Default")

         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-        if isinstance(input_ids, (list, str)):
-            # OGA models return a list of tokens (older versions)
-            # Our llama.cpp adapter returns a string
-            len_tokens_in = len(input_ids)
-        elif hasattr(input_ids, "shape"):
-            # HF models return a 2-D tensor
-            # OGA models with newer versions may return numpy arrays
-            if len(input_ids.shape) == 1:
-                # 1-D array from newer OGA versions
-                len_tokens_in = len(input_ids)
-            else:
-                # 2-D tensor from HF models
-                len_tokens_in = input_ids.shape[1]
-        else:
-            # Fallback: try to get length directly
-            len_tokens_in = len(input_ids)

         len_tokens_out = []
         response_texts = []
+        prompt_tokens = None  # will be determined in generate function
         for trial in range(n_trials):
             if n_trials > 1:
                 self.set_percent_progress(100.0 * trial / n_trials)
@@ -222,19 +211,22 @@ class LLMPrompt(Tool):

             response_array = response if isinstance(response, str) else response[0]

-            # Separate the prompt from the response
-            len_tokens_out.append(len(response_array) - len_tokens_in)
+            prompt_tokens = model.prompt_tokens
+            len_tokens_out.append(model.response_tokens)

-            input_token = 0
+            # Remove the input from the response
+            # (up to the point they diverge, which they should not)
+            counter = 0
+            len_input_ids = len(input_ids_array)
             while (
-                input_token < len_tokens_in
-                and input_ids_array[input_token] == response_array[input_token]
+                counter < len_input_ids
+                and input_ids_array[counter] == response_array[counter]
             ):
-                input_token += 1
+                counter += 1

             # Only decode the actual response (not the prompt)
             response_text = tokenizer.decode(
-                response_array[input_token:], skip_special_tokens=True
+                response_array[counter:], skip_special_tokens=True
             ).strip()
             response_texts.append(response_text)

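The trimming logic above walks the shared token prefix between the prompt and the returned sequence, then decodes only what follows. A minimal standalone sketch of the same idea, using plain lists and illustrative names rather than the package's tokenizer objects:

    def trim_prompt_tokens(input_ids, response_ids):
        """Return only the response tokens that follow the shared prefix with the prompt."""
        counter = 0
        while counter < len(input_ids) and input_ids[counter] == response_ids[counter]:
            counter += 1
        return response_ids[counter:]

    # Example: the model echoes the prompt ids back, followed by new tokens
    prompt_ids = [1, 42, 7, 9]
    full_output = [1, 42, 7, 9, 55, 88, 13]
    print(trim_prompt_tokens(prompt_ids, full_output))  # -> [55, 88, 13]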
@@ -259,7 +251,7 @@ class LLMPrompt(Tool):
             plt.savefig(figure_path)
             state.save_stat(Keys.RESPONSE_LENGTHS_HISTOGRAM, figure_path)

-        state.save_stat(Keys.PROMPT_TOKENS, len_tokens_in)
+        state.save_stat(Keys.PROMPT_TOKENS, prompt_tokens)
         state.save_stat(Keys.PROMPT, prompt)
         state.save_stat(Keys.RESPONSE_TOKENS, len_tokens_out)
         state.save_stat(Keys.RESPONSE, sanitize_text(response_texts))
@@ -758,15 +758,18 @@ class LemonadePerfTable(Table):
             data[key] = model_stats.get(key, "")

         # Create a new entry with Driver Versions and relevant Python Packages
-        sw_versions = [
-            key + ": " + value
-            for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
-        ]
-        sw_versions += [
-            pkg
-            for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
-            if any(name in pkg for name in PYTHON_PACKAGES)
-        ]
+        sw_versions = []
+        if "Driver Versions" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                key + ": " + value
+                for key, value in data[fs.Keys.SYSTEM_INFO]["Driver Versions"].items()
+            ]
+        if "Python Packages" in data[fs.Keys.SYSTEM_INFO]:
+            sw_versions += [
+                pkg
+                for pkg in data[fs.Keys.SYSTEM_INFO]["Python Packages"]
+                if any(name in pkg for name in PYTHON_PACKAGES)
+            ]
         if isinstance(data[Keys.RYZEN_AI_VERSION_INFO], dict):
             sw_versions += [
                 "Ryzen AI: " + value
@@ -1,13 +1,11 @@
-import sys
 import os
+import sys
 import logging
 import time
 import subprocess
-import zipfile
 import re
 import threading
 import platform
-import shutil

 import requests
 from tabulate import tabulate
@@ -18,12 +16,18 @@ from openai import OpenAI

 from lemonade_server.pydantic_models import (
     ChatCompletionRequest,
+    CompletionRequest,
     PullConfig,
     EmbeddingsRequest,
     RerankingRequest,
 )
 from lemonade_server.model_manager import ModelManager
 from lemonade.tools.server.utils.port import find_free_port
+from lemonade.tools.llamacpp.utils import (
+    get_llama_server_exe_path,
+    install_llamacpp,
+    download_gguf,
+)

 LLAMA_VERSION = "b5787"

@@ -80,39 +84,6 @@ def get_binary_url_and_filename(version):
     return url, filename


-def validate_platform_support():
-    """
-    Validate platform support before attempting download
-    """
-    system = platform.system().lower()
-
-    if system not in ["windows", "linux"]:
-        raise HTTPException(
-            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-            detail=(
-                f"Platform {system} not supported for llamacpp. "
-                "Supported: Windows, Ubuntu Linux"
-            ),
-        )
-
-    if system == "linux":
-        # Check if we're actually on Ubuntu/compatible distro and log a warning if not
-        try:
-            with open("/etc/os-release", "r", encoding="utf-8") as f:
-                os_info = f.read().lower()
-                if "ubuntu" not in os_info and "debian" not in os_info:
-                    logging.warning(
-                        "llamacpp binaries are built for Ubuntu. "
-                        "Compatibility with other Linux distributions is not guaranteed."
-                    )
-        except (FileNotFoundError, PermissionError, OSError) as e:
-            logging.warning(
-                "Could not determine Linux distribution (%s). "
-                "llamacpp binaries are built for Ubuntu.",
-                str(e),
-            )
-
-
 class LlamaTelemetry:
     """
     Manages telemetry data collection and display for llama server.
@@ -283,7 +254,7 @@ def _launch_llama_subprocess(
     """

     # Get the current executable path (handles both Windows and Ubuntu structures)
-    _, exe_path = get_llama_server_paths()
+    exe_path = get_llama_server_exe_path()

     # Build the base command
     base_command = [exe_path, "-m", snapshot_files["variant"]]
@@ -350,68 +321,23 @@ def _launch_llama_subprocess(


 def server_load(model_config: PullConfig, telemetry: LlamaTelemetry):
-    # Validate platform support before proceeding
-    validate_platform_support()
+    # Install and/or update llama.cpp if needed
+    try:
+        install_llamacpp()
+    except NotImplementedError as e:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
+        )

     # Get platform-specific paths at runtime
-    llama_server_exe_dir, llama_server_exe_path = get_llama_server_paths()
-
-    # Check whether the llamacpp install needs an upgrade
-    version_txt_path = os.path.join(llama_server_exe_dir, "version.txt")
-    if os.path.exists(version_txt_path):
-        with open(version_txt_path, "r", encoding="utf-8") as f:
-            llamacpp_installed_version = f.read()
-
-        if llamacpp_installed_version != LLAMA_VERSION:
-            # Remove the existing install, which will trigger a new install
-            # in the next code block
-            shutil.rmtree(llama_server_exe_dir)
-
-    # Download llama.cpp server if it isn't already available
-    if not os.path.exists(llama_server_exe_dir):
-        # Download llama.cpp server zip
-        llama_zip_url, filename = get_binary_url_and_filename(LLAMA_VERSION)
-        llama_zip_path = os.path.join(os.path.dirname(sys.executable), filename)
-        logging.info(f"Downloading llama.cpp server from {llama_zip_url}")
-
-        with requests.get(llama_zip_url, stream=True) as r:
-            r.raise_for_status()
-            with open(llama_zip_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-
-        # Extract zip
-        logging.info(f"Extracting {llama_zip_path} to {llama_server_exe_dir}")
-        with zipfile.ZipFile(llama_zip_path, "r") as zip_ref:
-            zip_ref.extractall(llama_server_exe_dir)
-
-        # Make executable on Linux - need to update paths after extraction
-        if platform.system().lower() == "linux":
-            # Re-get the paths since extraction might have changed the directory structure
-            _, updated_exe_path = get_llama_server_paths()
-            if os.path.exists(updated_exe_path):
-                os.chmod(updated_exe_path, 0o755)
-                logging.info(f"Set executable permissions for {updated_exe_path}")
-            else:
-                logging.warning(
-                    f"Could not find llama-server executable at {updated_exe_path}"
-                )
-
-        # Save version.txt
-        with open(version_txt_path, "w", encoding="utf-8") as vf:
-            vf.write(LLAMA_VERSION)
-
-        # Delete zip file
-        os.remove(llama_zip_path)
-        logging.info("Cleaned up zip file")
+    llama_server_exe_path = get_llama_server_exe_path()

     # Download the gguf to the hugging face cache
-    model_manager = ModelManager()
-    snapshot_files = model_manager.download_gguf(model_config)
+    snapshot_files = download_gguf(model_config.checkpoint, model_config.mmproj)
     logging.debug(f"GGUF file paths: {snapshot_files}")

     # Check if model supports embeddings
-    supported_models = model_manager.supported_models
+    supported_models = ModelManager().supported_models
     model_info = supported_models.get(model_config.model_name, {})
     supports_embeddings = "embeddings" in model_info.get("labels", [])
     supports_reranking = "reranking" in model_info.get("labels", [])
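The refactored server_load above delegates installation to install_llamacpp() and surfaces unsupported platforms as an HTTP 422 response instead of crashing. An isolated sketch of that error-translation pattern (the wrapper name ensure_backend_installed is hypothetical, not part of the package):

    from fastapi import HTTPException, status

    def ensure_backend_installed(install_fn):
        """Run an installer callable and convert unsupported-platform errors
        into a client-visible validation error."""
        try:
            install_fn()
        except NotImplementedError as e:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(e)
            )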
@@ -523,6 +449,68 @@ def chat_completion(
         )


+def completion(completion_request: CompletionRequest, telemetry: LlamaTelemetry):
+    """
+    Handle text completions using the llamacpp server.
+
+    Args:
+        completion_request: The completion request containing prompt and parameters
+        telemetry: Telemetry object containing the server port
+
+    Returns:
+        Completion response from the llamacpp server
+    """
+    base_url = llamacpp_address(telemetry.port)
+    client = OpenAI(
+        base_url=base_url,
+        api_key="lemonade",
+    )
+
+    # Convert Pydantic model to dict and remove unset/null values
+    request_dict = completion_request.model_dump(exclude_unset=True, exclude_none=True)
+
+    # Check if streaming is requested
+    if completion_request.stream:
+
+        def event_stream():
+            try:
+                # Enable streaming
+                for chunk in client.completions.create(**request_dict):
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                yield "data: [DONE]\n\n"
+
+                # Show telemetry after completion
+                telemetry.show_telemetry()
+
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                yield f'data: {{"error": "{str(e)}"}}\n\n'
+
+        return StreamingResponse(
+            event_stream(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+    else:
+        # Non-streaming response
+        try:
+            # Disable streaming for non-streaming requests
+            response = client.completions.create(**request_dict)
+
+            # Show telemetry after completion
+            telemetry.show_telemetry()
+
+            return response
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"Completion error: {str(e)}",
+            )
+
+
 def embeddings(embeddings_request: EmbeddingsRequest, telemetry: LlamaTelemetry):
     """
     Generate embeddings using the llamacpp server.
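With completion() wired up (and routed by the server in the next hunk), llama.cpp-backed models can answer plain text-completion requests. A hypothetical client-side call, assuming a local Lemonade Server exposing the OpenAI-compatible API at http://localhost:8000/api/v1 and a GGUF model already pulled (the base URL and model name are assumptions, not taken from the diff):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")

    completion = client.completions.create(
        model="Qwen2.5-0.5B-Instruct-GGUF",  # placeholder model name
        prompt="Lemonade is",
        max_tokens=32,
        stream=False,
    )
    print(completion.choices[0].text)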
@@ -487,6 +487,9 @@ class Server(ManagementTool):
         # Load the model if it's different from the currently loaded one
         await self.load_llm(lc)

+        if self.llm_loaded.recipe == "llamacpp":
+            return llamacpp.completion(completion_request, self.llama_telemetry)
+
         # Check if the model supports reasoning
         reasoning_first_token = self.llm_loaded.reasoning

@@ -1,33 +1,93 @@
+/* === CSS Variables === */
+:root {
+    /* Colors */
+    --primary-yellow: #ffe066;
+    --primary-yellow-dark: #ffd43b;
+    --accent-gold: #e6b800;
+    --accent-gold-dark: #bfa100;
+
+    --text-primary: #222;
+    --text-secondary: #555;
+    --text-muted: #666;
+
+    --bg-primary: #fffbe9;
+    --bg-secondary: #fff8dd;
+    --bg-tertiary: #fff5d1;
+
+    /* Transitions */
+    --transition-fast: 0.2s ease;
+    --transition-medium: 0.3s ease;
+}
+
 body {
     margin: 0;
     font-family: 'Segoe UI', 'Arial', sans-serif;
-    background: #fffbe9;
-    color: #222;
+    background: linear-gradient(135deg, var(--bg-primary) 0%, var(--bg-secondary) 50%, var(--bg-tertiary) 100%);
+    color: var(--text-primary);
     min-height: 100vh;
     display: flex;
     flex-direction: column;
     padding-bottom: 5rem;
 }

+body::before {
+    content: '';
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    background:
+        radial-gradient(circle at 20% 20%, rgba(255, 224, 102, 0.1) 0%, transparent 50%),
+        radial-gradient(circle at 80% 80%, rgba(255, 212, 59, 0.1) 0%, transparent 50%);
+    pointer-events: none;
+    z-index: -1;
+}
+
 .navbar {
     display: flex;
-    justify-content: center;
-    gap: 2.5rem;
-    padding: 2rem 0 1.5rem 0;
+    justify-content: space-between;
+    align-items: center;
+    padding: 1rem 3rem 0.5rem 1rem;
     font-size: 1.25rem;
     font-weight: 500;
     background: transparent;
     letter-spacing: 0.02em;
+    position: relative;
+    transition: var(--transition-medium);
+}
+
+.navbar-brand {
+    display: flex;
+    align-items: center;
 }

-.navbar a {
+.brand-title {
+    font-size: 1.5rem;
+    font-weight: 700;
+    color: var(--text-primary);
+    text-decoration: none;
+    letter-spacing: 0.01em;
+}
+
+.brand-title a {
+    color: inherit;
+    text-decoration: none;
+}
+
+.navbar-links {
+    display: flex;
+    gap: 2.5rem;
+}
+
+.navbar-links a {
     color: #444;
     text-decoration: none;
-    transition: color 0.2s;
+    transition: var(--transition-fast);
 }

-.navbar a:hover {
-    color: #e6b800;
+.navbar-links a:hover {
+    color: var(--accent-gold);
 }

 .main {
@@ -37,16 +97,8 @@ body {
     align-items: center;
     justify-content: flex-start;
     min-height: 60vh;
-    margin-top: 3rem;
-}
-
-.title {
-    font-size: 3rem;
-    font-weight: 700;
-    margin-bottom: 2.5rem;
-    letter-spacing: 0.01em;
-    text-align: center;
-    color: #222;
+    margin-top: 2rem;
+    padding-top: 1rem;
 }

 .site-footer {
@@ -54,7 +106,7 @@ body {
     left: 0;
     bottom: 0;
     width: 100%;
-    background-color: #fffbe9;
+    background: transparent;
     padding-top: 0.5rem;
     z-index: 100;
 }
@@ -983,6 +1035,50 @@ body {
     }
 }

+/* === Responsive Navbar === */
+@media (max-width: 800px) {
+    .navbar {
+        flex-direction: column;
+        gap: 1rem;
+        padding: 1rem 1rem 0.5rem 1rem;
+        align-items: center;
+    }
+
+    .navbar-brand {
+        margin-bottom: 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.3rem;
+    }
+
+    .navbar-links {
+        gap: 1.5rem;
+        font-size: 1rem;
+    }
+}
+
+@media (max-width: 600px) {
+    .navbar {
+        padding: 0.5rem 0.5rem 0.25rem 0.5rem;
+    }
+
+    .brand-title {
+        font-size: 1.2rem;
+    }
+
+    .navbar-links {
+        gap: 1rem;
+        font-size: 0.9rem;
+        flex-wrap: wrap;
+        justify-content: center;
+    }
+
+    .main {
+        margin-top: 0.5rem;
+    }
+}
+
 /* Ensure form container allows tooltip overflow */
 .model-mgmt-register-form {
     position: relative;
@@ -12,14 +12,19 @@
     {{SERVER_MODELS_JS}}
 </head>
 <body>
-    <nav class="navbar">
-        <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
-        <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
-        <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
-        <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+    <nav class="navbar" id="navbar">
+        <div class="navbar-brand">
+            <span class="brand-title"><a href="https://lemonade-server.ai">🍋 Lemonade Server</a></span>
+        </div>
+        <div class="navbar-links">
+            <a href="https://github.com/lemonade-sdk/lemonade" target="_blank">GitHub</a>
+            <a href="https://lemonade-server.ai/docs/" target="_blank">Docs</a>
+            <a href="https://lemonade-server.ai/docs/server/server_models/" target="_blank">Models</a>
+            <a href="https://lemonade-server.ai/docs/server/apps/" target="_blank">Featured Apps</a>
+            <a href="https://lemonade-server.ai/news/" target="_blank">News</a>
+        </div>
     </nav>
     <main class="main">
-        <div class="title">🍋 Lemonade Server</div>
         <div class="tab-container">
             <div class="tabs">
                 <button class="tab active" id="tab-chat" onclick="showTab('chat')">LLM Chat</button>
@@ -87,8 +87,15 @@ class LemonadeTray(SystemTray):
         Update the latest version information.
         """
         try:
+            # Prepare headers for GitHub API request
+            headers = {}
+            github_token = os.environ.get("GITHUB_TOKEN")
+            if github_token:
+                headers["Authorization"] = f"token {github_token}"
+
             response = requests.get(
                 "https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest",
+                headers=headers,
                 timeout=10,  # Add timeout to prevent hanging
             )
             response.raise_for_status()
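The tray's update check above only adds an Authorization header when GITHUB_TOKEN is set, which raises the GitHub API rate limit for authenticated requests. The same pattern in isolation (a standalone sketch, not the tray class itself):

    import os
    import requests

    headers = {}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        # Authenticated requests get a higher GitHub API rate limit
        headers["Authorization"] = f"token {token}"

    resp = requests.get(
        "https://api.github.com/repos/lemonade-sdk/lemonade/releases/latest",
        headers=headers,
        timeout=10,
    )
    resp.raise_for_status()
    print(resp.json().get("tag_name"))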
lemonade/version.py CHANGED
@@ -1 +1 @@
-__version__ = "8.0.5"
+__version__ = "8.0.6"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.0.5
+Version: 8.0.6
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.13
@@ -23,7 +23,7 @@ Requires-Dist: zstandard
 Requires-Dist: fastapi
 Requires-Dist: uvicorn[standard]
 Requires-Dist: openai>=1.81.0
-Requires-Dist: transformers<=4.51.3
+Requires-Dist: transformers<=4.53.2
 Requires-Dist: jinja2
 Requires-Dist: tabulate
 Requires-Dist: sentencepiece
@@ -284,7 +284,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue

 ## Maintainers

-This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), email [lemonade@amd.com](mailto:lemonade@amd.com), or join our [Discord](https://discord.gg/5xXzkMu8Zk).
+This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).

 ## License