lemonade-sdk 8.0.6-py3-none-any.whl → 8.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk has been flagged as potentially problematic.
lemonade_server/cli.py CHANGED
@@ -39,11 +39,19 @@ class ModelNotAvailableError(Exception):
     """
 
 
+class ModelLoadError(Exception):
+    """
+    The model failed to load on the server
+    """
+
+
 def serve(
     port: int = None,
     log_level: str = None,
     tray: bool = False,
     use_thread: bool = False,
+    llamacpp_backend: str = None,
+    ctx_size: int = None,
 ):
     """
     Execute the serve command
@@ -51,26 +59,33 @@ def serve(
 
     # Otherwise, start the server
     print("Starting Lemonade Server...")
-    from lemonade.tools.server.serve import Server, DEFAULT_PORT, DEFAULT_LOG_LEVEL
+    from lemonade.tools.server.serve import (
+        Server,
+        DEFAULT_PORT,
+        DEFAULT_LOG_LEVEL,
+        DEFAULT_LLAMACPP_BACKEND,
+        DEFAULT_CTX_SIZE,
+    )
 
     port = port if port is not None else DEFAULT_PORT
     log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
+    llamacpp_backend = (
+        llamacpp_backend if llamacpp_backend is not None else DEFAULT_LLAMACPP_BACKEND
+    )
 
-    # Hidden environment variable to enable input truncation (experimental feature)
-    truncate_inputs = "LEMONADE_TRUNCATE_INPUTS" in os.environ
+    # Use ctx_size if provided, otherwise use default
+    ctx_size = ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE
 
     # Start the server
-    serve_kwargs = {
-        "log_level": log_level,
-        "truncate_inputs": truncate_inputs,
-        "tray": tray,
-    }
-    server = Server()
+    server = Server(
+        port=port,
+        log_level=log_level,
+        ctx_size=ctx_size,
+        tray=tray,
+        llamacpp_backend=llamacpp_backend,
+    )
     if not use_thread:
-        server.run(
-            port=port,
-            **serve_kwargs,
-        )
+        server.run()
     else:
         from threading import Thread
         import time
@@ -78,8 +93,6 @@ def serve(
         # Start a background thread to run the server
         server_thread = Thread(
             target=server.run,
-            args=(port,),
-            kwargs=serve_kwargs,
             daemon=True,
         )
         server_thread.start()
@@ -243,7 +256,13 @@ def delete(model_names: List[str]):
         ModelManager().delete_model(model_name)
 
 
-def run(model_name: str):
+def run(
+    model_name: str,
+    port: int = None,
+    log_level: str = None,
+    llamacpp_backend: str = None,
+    ctx_size: int = None,
+):
     """
     Start the server if not running and open the webapp with the specified model
     """
@@ -254,7 +273,16 @@ def run(model_name: str):
     _, port = get_server_info()
     server_previously_running = port is not None
     if not server_previously_running:
-        port, server_thread = serve(use_thread=True, tray=True, log_level="info")
+        port, server_thread = serve(
+            port=port,
+            log_level=log_level,
+            tray=True,
+            use_thread=True,
+            llamacpp_backend=llamacpp_backend,
+            ctx_size=ctx_size,
+        )
+    else:
+        port = running_port
 
     # Pull model
     pull([model_name])
@@ -412,6 +440,29 @@ def list_models():
     print(tabulate(table_data, headers=headers, tablefmt="simple"))
 
 
+def _add_server_arguments(parser):
+    """Add common server arguments to a parser"""
+    parser.add_argument("--port", type=int, help="Port number to serve on")
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        help="Log level for the server",
+        choices=["critical", "error", "warning", "info", "debug", "trace"],
+        default="info",
+    )
+    parser.add_argument(
+        "--llamacpp",
+        type=str,
+        help=f"LlamaCpp backend to use",
+        choices=["vulkan", "rocm"],
+    )
+    parser.add_argument(
+        "--ctx-size",
+        type=int,
+        help="Context size for the model (default: 4096 for llamacpp, truncates prompts for other recipes)",
+    )
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Serve LLMs on CPU, GPU, and NPU.",
@@ -430,14 +481,7 @@ def main():
 
     # Serve command
     serve_parser = subparsers.add_parser("serve", help="Start server")
-    serve_parser.add_argument("--port", type=int, help="Port number to serve on")
-    serve_parser.add_argument(
-        "--log-level",
-        type=str,
-        help="Log level for the server",
-        choices=["critical", "error", "warning", "info", "debug", "trace"],
-        default="info",
-    )
+    _add_server_arguments(serve_parser)
     if os.name == "nt":
         serve_parser.add_argument(
             "--no-tray",
@@ -513,6 +557,7 @@ def main():
         "model",
         help="Lemonade Server model name to run",
     )
+    _add_server_arguments(run_parser)
 
     args = parser.parse_args()
 
@@ -535,6 +580,8 @@ def main():
             port=args.port,
             log_level=args.log_level,
            tray=not args.no_tray,
+            llamacpp_backend=args.llamacpp,
+            ctx_size=args.ctx_size,
         )
     elif args.command == "status":
         status()
@@ -553,7 +600,13 @@ def main():
     elif args.command == "stop":
         stop()
     elif args.command == "run":
-        run(args.model)
+        run(
+            args.model,
+            port=args.port,
+            log_level=args.log_level,
+            llamacpp_backend=args.llamacpp,
+            ctx_size=args.ctx_size,
+        )
     elif args.command == "help" or not args.command:
         parser.print_help()
 
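Taken together, the cli.py changes add two shared server options, --llamacpp and --ctx-size, wired into both the serve and run subcommands via _add_server_arguments, and move configuration from Server.run() keyword arguments into the Server constructor. A minimal sketch of the resulting call path, assuming the imports shown in the diff; the wrapper function name is ours, not the package's:

# Illustrative sketch only: mirrors the post-change flow in lemonade_server/cli.py.
from lemonade.tools.server.serve import (
    Server,
    DEFAULT_PORT,
    DEFAULT_LOG_LEVEL,
    DEFAULT_LLAMACPP_BACKEND,
    DEFAULT_CTX_SIZE,
)

def start(port=None, log_level=None, llamacpp_backend=None, ctx_size=None, tray=False):
    # CLI flags fall back to the library defaults when not supplied.
    server = Server(
        port=port if port is not None else DEFAULT_PORT,
        log_level=log_level if log_level is not None else DEFAULT_LOG_LEVEL,
        ctx_size=ctx_size if ctx_size is not None else DEFAULT_CTX_SIZE,
        tray=tray,
        llamacpp_backend=(
            llamacpp_backend
            if llamacpp_backend is not None
            else DEFAULT_LLAMACPP_BACKEND
        ),
    )
    server.run()  # run() no longer takes port/log_level; they live on the Server instance

On the command line this presumably corresponds to something like "lemonade-server serve --llamacpp vulkan --ctx-size 8192" (entry-point name assumed), with the same flags now also accepted by the run subcommand.
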
@@ -7,6 +7,7 @@ from importlib.metadata import distributions
 from lemonade_server.pydantic_models import PullConfig
 from lemonade.cache import DEFAULT_CACHE_DIR
 from lemonade.tools.llamacpp.utils import parse_checkpoint, download_gguf
+from lemonade.common.network import custom_snapshot_download
 
 USER_MODELS_FILE = os.path.join(DEFAULT_CACHE_DIR, "user_models.json")
 
@@ -175,7 +176,7 @@ class ModelManager:
         if "gguf" in checkpoint_to_download.lower():
             download_gguf(gguf_model_config.checkpoint, gguf_model_config.mmproj)
         else:
-            huggingface_hub.snapshot_download(repo_id=checkpoint_to_download)
+            custom_snapshot_download(checkpoint_to_download)
 
         # Register the model in user_models.json, creating that file if needed
         # We do this registration after the download so that we don't register
@@ -233,8 +234,8 @@ class ModelManager:
 
         try:
             # Get the local path using snapshot_download with local_files_only=True
-            snapshot_path = huggingface_hub.snapshot_download(
-                repo_id=base_checkpoint, local_files_only=True
+            snapshot_path = custom_snapshot_download(
+                base_checkpoint, local_files_only=True
             )
 
             # Navigate up to the model directory (parent of snapshots directory)
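The ModelManager hunks above swap direct huggingface_hub.snapshot_download calls for custom_snapshot_download, newly imported from lemonade.common.network. Only the call sites appear in the diff, not the helper itself; assuming it is a thin wrapper over the Hugging Face Hub API with the same return value, it might look roughly like this sketch (not the package's actual implementation):

# Hypothetical sketch of lemonade.common.network.custom_snapshot_download.
# Any extra behavior the real helper adds (custom endpoints, auth, retries)
# is not visible in the diff and is not shown here.
import huggingface_hub

def custom_snapshot_download(checkpoint: str, local_files_only: bool = False) -> str:
    # Treat the checkpoint string as the Hugging Face repo id and return the
    # local snapshot path, matching how the ModelManager code uses the result.
    return huggingface_hub.snapshot_download(
        repo_id=checkpoint,
        local_files_only=local_files_only,
    )
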
@@ -1,4 +1,4 @@
-from typing import Optional, Union, List, Any
+from typing import Optional, Union, List
 
 from pydantic import BaseModel
 
@@ -18,9 +18,6 @@ class LoadConfig(BaseModel):
     model_name: str
     checkpoint: Optional[str] = None
     recipe: Optional[str] = None
-    # Indicates the maximum prompt length allowed for that specific
-    # checkpoint + recipe combination
-    max_prompt_length: Optional[int] = None
     # Indicates whether the model is a reasoning model, like DeepSeek
     reasoning: Optional[bool] = False
     # Indicates which Multimodal Projector (mmproj) file to use
@@ -39,19 +39,16 @@
     "Llama-3.2-1B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 3000,
         "suggested": true
     },
     "Llama-3.2-3B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Phi-3-Mini-Instruct-Hybrid": {
         "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Phi-3.5-Mini-Instruct-Hybrid": {
@@ -62,13 +59,26 @@
     "Qwen-1.5-7B-Chat-Hybrid": {
         "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 3000,
+        "suggested": true
+    },
+    "Qwen-2.5-7B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-7B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
+        "suggested": true
+    },
+    "Qwen-2.5-3B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-3B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
+        "suggested": true
+    },
+    "Qwen-2.5-1.5B-Instruct-Hybrid": {
+        "checkpoint": "amd/Qwen2.5-1.5B-Instruct-awq-uint4-asym-g128-lmhead-g32-fp16-onnx-hybrid",
+        "recipe": "oga-hybrid",
         "suggested": true
     },
     "DeepSeek-R1-Distill-Llama-8B-Hybrid": {
         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true,
         "labels": ["reasoning"]
     },
@@ -76,25 +86,32 @@
         "checkpoint": "amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
         "max_prompt_length": 2000,
-        "suggested": true,
+        "suggested": false,
         "labels": ["reasoning"]
     },
     "Mistral-7B-v0.3-Instruct-Hybrid": {
         "checkpoint": "amd/Mistral-7B-Instruct-v0.3-awq-g128-int4-asym-fp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Llama-3.1-8B-Instruct-Hybrid": {
         "checkpoint": "amd/Llama-3.1-8B-Instruct-awq-asym-uint4-g128-lmhead-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
         "suggested": true
     },
     "Llama-xLAM-2-8b-fc-r-Hybrid": {
         "checkpoint": "amd/Llama-xLAM-2-8b-fc-r-awq-g128-int4-asym-bfp16-onnx-hybrid",
         "recipe": "oga-hybrid",
-        "max_prompt_length": 2000,
+        "suggested": true
+    },
+    "Qwen-2.5-7B-Instruct-NPU": {
+        "checkpoint": "amd/Qwen2.5-7B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+        "recipe": "oga-npu",
+        "suggested": true
+    },
+    "Qwen-2.5-1.5B-Instruct-NPU": {
+        "checkpoint": "amd/Qwen2.5-1.5B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix",
+        "recipe": "oga-npu",
         "suggested": true
     },
     "Llama-3.2-1B-Instruct-DirectML": {
@@ -169,6 +186,18 @@
         "suggested": true,
         "labels": ["reasoning"]
     },
+    "Qwen3-30B-A3B-Instruct-2507-GGUF": {
+        "checkpoint": "unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF:Qwen3-30B-A3B-Instruct-2507-Q4_0.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot"]
+    },
+    "Qwen3-Coder-30B-A3B-Instruct-GGUF": {
+        "checkpoint": "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["coding","hot"]
+    },
     "Gemma-3-4b-it-GGUF": {
         "checkpoint": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
         "mmproj": "mmproj-model-f16.gguf",
@@ -190,6 +219,13 @@
         "suggested": true,
         "labels": ["vision"]
     },
+    "Cogito-v2-llama-109B-MoE-GGUF": {
+        "checkpoint": "unsloth/cogito-v2-preview-llama-109B-MoE-GGUF:Q4_K_M",
+        "mmproj": "mmproj-F16.gguf",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["vision","hot"]
+    },
     "nomic-embed-text-v1-GGUF": {
         "checkpoint": "nomic-ai/nomic-embed-text-v1-GGUF:Q4_K_S",
         "recipe": "llamacpp",
@@ -217,12 +253,25 @@
     "Devstral-Small-2507-GGUF":{
         "checkpoint": "mistralai/Devstral-Small-2507_gguf:Q4_K_M",
         "recipe": "llamacpp",
-        "suggested": true
+        "suggested": true,
+        "labels": ["coding"]
     },
     "Qwen2.5-Coder-32B-Instruct-GGUF": {
         "checkpoint": "Qwen/Qwen2.5-Coder-32B-Instruct-GGUF:Q4_K_M",
         "recipe": "llamacpp",
         "suggested": true,
-        "labels": ["reasoning"]
+        "labels": ["reasoning", "coding"]
+    },
+    "gpt-oss-120b-GGUF": {
+        "checkpoint": "unsloth/gpt-oss-120b-GGUF:Q4_K_M",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot", "reasoning"]
+    },
+    "gpt-oss-20b-GGUF": {
+        "checkpoint": "unsloth/gpt-oss-20b-GGUF:Q4_K_M",
+        "recipe": "llamacpp",
+        "suggested": true,
+        "labels": ["hot", "reasoning"]
     }
 }
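With max_prompt_length removed from LoadConfig and from most registry entries, per-model prompt caps are no longer part of the model metadata; prompt length is instead governed by the new server-wide --ctx-size option. A small sketch of how a registry entry maps onto the trimmed LoadConfig, with field values copied from the diff above and the extra-field behavior left to the model's pydantic configuration:

# Sketch: validating a registry entry against the trimmed LoadConfig.
from lemonade_server.pydantic_models import LoadConfig

cfg = LoadConfig(
    model_name="Qwen3-Coder-30B-A3B-Instruct-GGUF",
    checkpoint="unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf",
    recipe="llamacpp",
)
# max_prompt_length is no longer a declared field, so a leftover value in an
# entry would be ignored or rejected depending on the model's extra-field
# policy; context limits now come from the server's ctx_size setting.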