lemonade-sdk 7.0.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lemonade-sdk might be problematic.

Files changed (61)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +125 -0
  3. lemonade/cache.py +85 -0
  4. lemonade/cli.py +135 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/analyze_model.py +26 -0
  7. lemonade/common/build.py +223 -0
  8. lemonade/common/cli_helpers.py +139 -0
  9. lemonade/common/exceptions.py +98 -0
  10. lemonade/common/filesystem.py +368 -0
  11. lemonade/common/labels.py +61 -0
  12. lemonade/common/onnx_helpers.py +176 -0
  13. lemonade/common/plugins.py +10 -0
  14. lemonade/common/printing.py +110 -0
  15. lemonade/common/status.py +490 -0
  16. lemonade/common/system_info.py +390 -0
  17. lemonade/common/tensor_helpers.py +83 -0
  18. lemonade/common/test_helpers.py +28 -0
  19. lemonade/profilers/__init__.py +1 -0
  20. lemonade/profilers/memory_tracker.py +257 -0
  21. lemonade/profilers/profiler.py +55 -0
  22. lemonade/sequence.py +363 -0
  23. lemonade/state.py +159 -0
  24. lemonade/tools/__init__.py +1 -0
  25. lemonade/tools/adapter.py +104 -0
  26. lemonade/tools/bench.py +284 -0
  27. lemonade/tools/huggingface_bench.py +267 -0
  28. lemonade/tools/huggingface_load.py +520 -0
  29. lemonade/tools/humaneval.py +258 -0
  30. lemonade/tools/llamacpp.py +261 -0
  31. lemonade/tools/llamacpp_bench.py +154 -0
  32. lemonade/tools/management_tools.py +273 -0
  33. lemonade/tools/mmlu.py +327 -0
  34. lemonade/tools/ort_genai/__init__.py +0 -0
  35. lemonade/tools/ort_genai/oga.py +1129 -0
  36. lemonade/tools/ort_genai/oga_bench.py +142 -0
  37. lemonade/tools/perplexity.py +146 -0
  38. lemonade/tools/prompt.py +228 -0
  39. lemonade/tools/quark/__init__.py +0 -0
  40. lemonade/tools/quark/quark_load.py +172 -0
  41. lemonade/tools/quark/quark_quantize.py +439 -0
  42. lemonade/tools/report/__init__.py +0 -0
  43. lemonade/tools/report/llm_report.py +203 -0
  44. lemonade/tools/report/table.py +739 -0
  45. lemonade/tools/server/__init__.py +0 -0
  46. lemonade/tools/server/serve.py +1354 -0
  47. lemonade/tools/server/tool_calls.py +146 -0
  48. lemonade/tools/tool.py +374 -0
  49. lemonade/version.py +1 -0
  50. lemonade_install/__init__.py +1 -0
  51. lemonade_install/install.py +774 -0
  52. lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
  53. lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
  54. lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
  55. lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
  56. lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
  57. lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
  58. lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
  59. lemonade_server/cli.py +260 -0
  60. lemonade_server/model_manager.py +98 -0
  61. lemonade_server/server_models.json +142 -0
lemonade_server/cli.py ADDED
@@ -0,0 +1,260 @@
+ import argparse
+ import sys
+ import os
+ from typing import Tuple
+ import psutil
+ from typing import List
+
+
+ class PullError(Exception):
+     """
+     The pull command has failed to install an LLM
+     """
+
+
+ def serve(
+     port: int,
+     log_level: str = None,
+ ):
+     """
+     Execute the serve command
+     """
+
+     # Check if Lemonade Server is already running
+     _, running_port = get_server_info()
+     if running_port is not None:
+         print(
+             (
+                 f"Lemonade Server is already running on port {running_port}\n"
+                 "Please stop the existing server before starting a new instance."
+             ),
+         )
+         sys.exit(1)
+
+     # Otherwise, start the server
+     print("Starting Lemonade Server...")
+     from lemonade.tools.server.serve import Server, DEFAULT_PORT, DEFAULT_LOG_LEVEL
+
+     server = Server()
+     port = port if port is not None else DEFAULT_PORT
+     log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
+
+     # Hidden environment variable to enable input truncation (experimental feature)
+     truncate_inputs = "LEMONADE_TRUNCATE_INPUTS" in os.environ
+
+     server.run(
+         port=port,
+         log_level=log_level,
+         truncate_inputs=truncate_inputs,
+     )
+
+
+ def stop():
+     """
+     Stop the Lemonade Server
+     """
+
+     # Check if Lemonade Server is running
+     running_pid, running_port = get_server_info()
+     if running_port is None:
+         print("Lemonade Server is not running\n")
+         return
+
+     # Stop the server
+     try:
+         process = psutil.Process(running_pid)
+         process.terminate()
+         process.wait(timeout=10)
+     except psutil.NoSuchProcess:
+         # Process already terminated
+         pass
+     except psutil.TimeoutExpired:
+         print("Timed out waiting for Lemonade Server to stop.")
+         sys.exit(1)
+     except Exception as e:  # pylint: disable=broad-exception-caught
+         print(f"Error stopping Lemonade Server: {e}")
+         sys.exit(1)
+     print("Lemonade Server stopped successfully.")
+
+
+ def pull(model_names: List[str]):
+     """
+     Install an LLM based on its Lemonade Server model name
+
+     If Lemonade Server is running, use the pull endpoint to download the model
+     so that the Lemonade Server instance is aware of the pull.
+
+     Otherwise, use ModelManager to install the model.
+     """
+
+     server_running, port = status(verbose=False)
+
+     if server_running:
+         import requests
+
+         base_url = f"http://localhost:{port}/api/v0"
+
+         for model_name in model_names:
+             # Install the model
+             pull_response = requests.post(
+                 f"{base_url}/pull", json={"model_name": model_name}
+             )
+
+             if pull_response.status_code != 200:
+                 raise PullError(
+                     f"Failed to install {model_name}. Check the "
+                     "Lemonade Server log for more information. A list of supported models "
+                     "is provided at "
+                     "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
+                 )
+     else:
+         from lemonade_server.model_manager import ModelManager
+
+         ModelManager().download_models(model_names)
+
+
+ def version():
+     """
+     Print the version number
+     """
+     from lemonade import __version__ as version_number
+
+     print(f"{version_number}")
+
+
+ def status(verbose: bool = True) -> Tuple[bool, int]:
+     """
+     Print the status of the server
+
+     Returns a tuple of:
+     1. Whether the server is running
+     2. What port the server is running on (None if server is not running)
+     """
+     _, port = get_server_info()
+     if port is None:
+         if verbose:
+             print("Server is not running")
+         return False, None
+     else:
+         if verbose:
+             print(f"Server is running on port {port}")
+         return True, port
+
+
+ def is_lemonade_server(pid):
+     """
+     Check whether or not a given PID corresponds to a Lemonade server
+     """
+     try:
+         process = psutil.Process(pid)
+         while True:
+             if process.name() in [  # Windows
+                 "lemonade-server-dev.exe",
+                 "lemonade-server.exe",
+                 "lemonade.exe",
+             ] or process.name() in [  # Linux
+                 "lemonade-server-dev",
+                 "lemonade-server",
+                 "lemonade",
+             ]:
+                 return True
+             if not process.parent():
+                 return False
+             process = process.parent()
+     except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+         return False
+     return False
+
+
+ def get_server_info() -> Tuple[int | None, int | None]:
+     """
+     Returns a tuple of:
+     1. Lemonade Server's PID
+     2. The port that Lemonade Server is running on
+     """
+     # Go over all python processes that have a port open
+     for process in psutil.process_iter(["pid", "name"]):
+         try:
+             connections = process.net_connections()
+             for conn in connections:
+                 if conn.status == "LISTEN":
+                     if is_lemonade_server(process.info["pid"]):
+                         return process.info["pid"], conn.laddr.port
+         except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+             continue
+
+     return None, None
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Serve LLMs on CPU, GPU, and NPU.",
+         usage=argparse.SUPPRESS,
+     )
+
+     # Add version flag
+     parser.add_argument(
+         "-v", "--version", action="store_true", help="Show version number"
+     )
+
+     # Create subparsers for commands
+     subparsers = parser.add_subparsers(
+         title="Available Commands", dest="command", metavar=""
+     )
+
+     # Serve command
+     serve_parser = subparsers.add_parser("serve", help="Start server")
+     serve_parser.add_argument("--port", type=int, help="Port number to serve on")
+     serve_parser.add_argument(
+         "--log-level",
+         type=str,
+         help="Log level for the server",
+         choices=["critical", "error", "warning", "info", "debug", "trace"],
+         default="info",
+     )
+
+     # Status command
+     status_parser = subparsers.add_parser("status", help="Check if server is running")
+
+     # Stop command
+     stop_parser = subparsers.add_parser("stop", help="Stop the server")
+
+     # Pull command
+     pull_parser = subparsers.add_parser(
+         "pull",
+         help="Install an LLM",
+         epilog=(
+             "More information: "
+             "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
+         ),
+     )
+     pull_parser.add_argument(
+         "model",
+         help="Lemonade Server model name",
+         nargs="+",
+     )
+
+     args = parser.parse_args()
+
+     if args.version:
+         version()
+     elif args.command == "serve":
+         serve(
+             args.port,
+             args.log_level,
+         )
+     elif args.command == "status":
+         status()
+     elif args.command == "pull":
+         pull(args.model)
+     elif args.command == "stop":
+         stop()
+     elif args.command == "help" or not args.command:
+         parser.print_help()
+
+
+ if __name__ == "__main__":
+     main()
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
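
For orientation, the functions above can also be called directly from Python rather than through the packaged console script (the process names checked in is_lemonade_server suggest the script is named lemonade-server, but that is inferred, not confirmed here). A minimal sketch, assuming lemonade-sdk and psutil are installed and using one example model name from server_models.json:

# Sketch only: status() reports whether a Lemonade Server instance is listening,
# and pull() installs a model via /api/v0/pull when a server is running,
# or via ModelManager otherwise.
from lemonade_server.cli import pull, status

running, port = status()  # prints "Server is running on port ..." or "Server is not running"
pull(["Qwen2.5-0.5B-Instruct-CPU"])  # example model name from server_models.json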
lemonade_server/model_manager.py ADDED
@@ -0,0 +1,98 @@
+ import json
+ import os
+ import huggingface_hub
+ import pkg_resources
+
+
+ class ModelManager:
+
+     @property
+     def supported_models(self) -> dict:
+         """
+         Returns a dictionary of supported models.
+         Note: Models must be downloaded before they are locally available.
+         """
+         # Load the models dictionary from the JSON file
+         server_models_file = os.path.join(
+             os.path.dirname(__file__), "server_models.json"
+         )
+         with open(server_models_file, "r", encoding="utf-8") as file:
+             models = json.load(file)
+
+         # Add the model name as a key in each entry, to make it easier
+         # to access later
+
+         for key, value in models.items():
+             value["model_name"] = key
+
+         return models
+
+     @property
+     def downloaded_hf_checkpoints(self) -> list[str]:
+         """
+         Returns a list of Hugging Face checkpoints that have been downloaded.
+         """
+         downloaded_hf_checkpoints = []
+         try:
+             hf_cache_info = huggingface_hub.scan_cache_dir()
+             downloaded_hf_checkpoints = [entry.repo_id for entry in hf_cache_info.repos]
+         except huggingface_hub.CacheNotFound:
+             pass
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             print(f"Error scanning Hugging Face cache: {e}")
+         return downloaded_hf_checkpoints
+
+     @property
+     def downloaded_models(self) -> dict:
+         """
+         Returns a dictionary of locally available models.
+         """
+         downloaded_models = {}
+         for model in self.supported_models:
+             if (
+                 self.supported_models[model]["checkpoint"]
+                 in self.downloaded_hf_checkpoints
+             ):
+                 downloaded_models[model] = self.supported_models[model]
+         return downloaded_models
+
+     @property
+     def downloaded_models_enabled(self) -> dict:
+         """
+         Returns a dictionary of locally available models that are enabled by
+         the current installation.
+         """
+         hybrid_installed = (
+             "onnxruntime-vitisai" in pkg_resources.working_set.by_key
+             and "onnxruntime-genai-directml-ryzenai" in pkg_resources.working_set.by_key
+         )
+
+         downloaded_models_enabled = {}
+         for model, value in self.downloaded_models.items():
+             if value["recipe"] == "oga-hybrid" and hybrid_installed:
+                 downloaded_models_enabled[model] = value
+             else:
+                 # All other models are CPU models right now
+                 # This logic will get more sophisticated when we
+                 # start to support more backends
+                 downloaded_models_enabled[model] = value
+
+         return downloaded_models_enabled
+
+     def download_models(self, models: list[str]):
+         """
+         Downloads the specified models from Hugging Face.
+         """
+         for model in models:
+             if model not in self.supported_models:
+                 raise ValueError(
+                     f"Model {model} is not supported. Please choose from the following: "
+                     f"{list(self.supported_models.keys())}"
+                 )
+             checkpoint = self.supported_models[model]["checkpoint"]
+             print(f"Downloading {model} ({checkpoint})")
+             huggingface_hub.snapshot_download(repo_id=checkpoint)
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
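
A minimal sketch of using the ModelManager API above, assuming lemonade-sdk and huggingface_hub are installed and network access to Hugging Face is available; the model name is just one example entry from server_models.json:

# Sketch only: list the catalog, see what is already cached locally,
# and download one example checkpoint.
from lemonade_server.model_manager import ModelManager

manager = ModelManager()
print(sorted(manager.supported_models))   # every entry in server_models.json
print(sorted(manager.downloaded_models))  # entries already present in the HF cache
manager.download_models(["Llama-3.2-1B-Instruct-CPU"])  # example model name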
lemonade_server/server_models.json ADDED
@@ -0,0 +1,142 @@
+ {
+     "Qwen2.5-0.5B-Instruct-CPU": {
+         "checkpoint": "amd/Qwen2.5-0.5B-Instruct-quantized_int4-float16-cpu-onnx",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Llama-3.2-1B-Instruct-CPU": {
+         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-uint4-float16-cpu-onnx",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Llama-3.2-3B-Instruct-CPU": {
+         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-uint4-float16-cpu-onnx",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Phi-3-Mini-Instruct-CPU": {
+         "checkpoint": "amd/Phi-3-mini-4k-instruct_int4_float16_onnx_cpu",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Qwen-1.5-7B-Chat-CPU": {
+         "checkpoint": "amd/Qwen1.5-7B-Chat_uint4_asym_g128_float16_onnx_cpu",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Llama-8B-CPU": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu",
+         "recipe": "oga-cpu",
+         "reasoning": true,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Qwen-7B-CPU": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu",
+         "recipe": "oga-cpu",
+         "reasoning": true,
+         "suggested": true
+     },
+     "Llama-3.2-1B-Instruct-Hybrid": {
+         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 3000,
+         "suggested": true
+     },
+     "Llama-3.2-3B-Instruct-Hybrid": {
+         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Phi-3-Mini-Instruct-Hybrid": {
+         "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Phi-3.5-Mini-Instruct-Hybrid": {
+         "checkpoint": "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Qwen-1.5-7B-Chat-Hybrid": {
+         "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 3000,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Llama-8B-Hybrid": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": true,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Qwen-7B-Hybrid": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": true,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Mistral-7B-v0.3-Instruct-Hybrid": {
+         "checkpoint": "amd/Mistral-7B-Instruct-v0.3-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Llama-3.1-8B-Instruct-Hybrid": {
+         "checkpoint": "amd/Llama-3.1-8B-Instruct-awq-asym-uint4-g128-lmhead-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Llama-3.2-1B-Instruct-DirectML": {
+         "checkpoint": "amd/Llama-3.2-1B-Instruct-dml-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Llama-3.2-3B-Instruct-DirectML": {
+         "checkpoint": "amd/Llama-3.2-3B-Instruct-dml-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Phi-3.5-Mini-Instruct-DirectML": {
+         "checkpoint": "amd/phi3.5-mini-instruct-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Qwen-1.5-7B-Chat-DirectML": {
+         "checkpoint": "amd/Qwen1.5-7B-Chat-dml-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Mistral-7B-v0.1-Instruct-DirectML": {
+         "checkpoint": "amd/Mistral-7B-Instruct-v0.1-awq-g128-int4-onnx-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Llama-3-8B-Instruct-DirectML": {
+         "checkpoint": "amd/llama3-8b-instruct-awq-g128-int4-onnx-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     }
+ }
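
Each entry in server_models.json carries a "checkpoint" (Hugging Face repo id), a "recipe" (oga-cpu, oga-hybrid, or oga-igpu), "reasoning" and "suggested" flags, and an optional "max_prompt_length". A minimal sketch of consuming this catalog the same way ModelManager.supported_models does (the path relative to the installed lemonade_server package is an assumption for illustration):

import json
import os

# Assumes this runs as a module inside the lemonade_server package directory,
# mirroring how ModelManager locates the JSON file.
models_path = os.path.join(os.path.dirname(__file__), "server_models.json")
with open(models_path, "r", encoding="utf-8") as f:
    models = json.load(f)

# Keep only hybrid-recipe entries that the catalog marks as suggested.
suggested_hybrid = {
    name: entry
    for name, entry in models.items()
    if entry["recipe"] == "oga-hybrid" and entry["suggested"]
}
print(sorted(suggested_hybrid))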