lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/api.py +3 -3
- lemonade/cli.py +11 -17
- lemonade/common/build.py +0 -47
- lemonade/common/network.py +50 -0
- lemonade/common/status.py +2 -21
- lemonade/common/system_info.py +19 -4
- lemonade/profilers/memory_tracker.py +3 -1
- lemonade/tools/accuracy.py +3 -4
- lemonade/tools/adapter.py +1 -2
- lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
- lemonade/tools/humaneval.py +9 -3
- lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
- lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
- lemonade/tools/mmlu.py +7 -15
- lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
- lemonade/tools/oga/utils.py +423 -0
- lemonade/tools/perplexity.py +4 -3
- lemonade/tools/prompt.py +2 -1
- lemonade/tools/quark/quark_load.py +2 -1
- lemonade/tools/quark/quark_quantize.py +5 -5
- lemonade/tools/report/table.py +3 -3
- lemonade/tools/server/llamacpp.py +154 -29
- lemonade/tools/server/serve.py +169 -146
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/styles.css +568 -0
- lemonade/tools/server/static/webapp.html +439 -0
- lemonade/tools/server/tray.py +458 -0
- lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
- lemonade/tools/server/utils/system_tray.py +395 -0
- lemonade/tools/server/{instructions.py → webapp.py} +4 -10
- lemonade/version.py +1 -1
- lemonade_install/install.py +46 -28
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/METADATA +84 -22
- lemonade_sdk-8.0.0.dist-info/RECORD +70 -0
- lemonade_server/cli.py +182 -27
- lemonade_server/model_manager.py +192 -20
- lemonade_server/pydantic_models.py +9 -4
- lemonade_server/server_models.json +5 -3
- lemonade/common/analyze_model.py +0 -26
- lemonade/common/labels.py +0 -61
- lemonade/common/onnx_helpers.py +0 -176
- lemonade/common/plugins.py +0 -10
- lemonade/common/tensor_helpers.py +0 -83
- lemonade/tools/server/static/instructions.html +0 -262
- lemonade_sdk-7.0.4.dist-info/RECORD +0 -69
- /lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
- /lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
- /lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/WHEEL +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.0.dist-info}/top_level.txt +0 -0
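Most of the churn above is a reorganization of lemonade.tools into per-backend packages (huggingface/, llamacpp/, oga/), a new lemonade/common/network.py for connectivity helpers, and lazy imports of heavy dependencies. For downstream code, the renames visible in this listing imply import-path updates along the following lines; this is a sketch inferred only from the paths shown above, and any exported names not appearing in the diffs below are not confirmed by this listing.

# Before (7.0.4): loader, adapters, and network helpers lived in one module
# from lemonade.tools.huggingface_load import HuggingfaceAdapter, is_offline, get_base_model
# from lemonade.tools.llamacpp import LlamaCppAdapter

# After (8.0.0): per-backend packages plus a shared network module
from lemonade.tools.huggingface.load import HuggingfaceLoad
from lemonade.tools.huggingface.utils import HuggingfaceAdapter, HuggingfaceTokenizerAdapter
from lemonade.common.network import is_offline, get_base_model
from lemonade.tools.llamacpp.load import LlamaCppAdapter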
lemonade/tools/huggingface/load.py
ADDED
@@ -0,0 +1,235 @@
+import argparse
+from typing import Dict, Optional
+import json
+from lemonade.tools import FirstTool
+from lemonade.state import State
+import lemonade.common.status as status
+import lemonade.common.printing as printing
+from lemonade.cache import Keys
+
+
+class HuggingfaceLoad(FirstTool):
+    """
+    Load an LLM as a torch.nn.Module using the Hugging Face transformers
+    from_pretrained() API.
+
+    Expected input: a checkpoint to load
+
+    Output state produced:
+        - state.model: instance of torch.nn.Module that implements an LLM.
+        - state.inputs: tokenized example inputs to the model, in the form of a
+          dictionary of kwargs.
+        - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
+        - state.dtype: data type of the model.
+        - state.checkpoint: pretrained checkpoint used to load the model.
+    """
+
+    unique_name = "huggingface-load"
+
+    def _imports(self):
+        pass
+
+    def __init__(self):
+        super().__init__(monitor_message="Loading Huggingface checkpoint")
+
+        self.status_stats = [Keys.DTYPE]
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Load an LLM in PyTorch using huggingface transformers",
+            add_help=add_help,
+        )
+
+        default_dtype = "float32"
+        parser.add_argument(
+            "--dtype",
+            "-d",
+            required=False,
+            default=default_dtype,
+            help=f"Data type to load the model in (default: {default_dtype}).",
+        )
+
+        choices = ["cpu", "cuda"]
+        for cuda in range(15):
+            choices.append(f"cuda:{cuda}")
+        parser.add_argument(
+            "--device",
+            required=False,
+            default=None,
+            choices=choices,
+            help="Move the model and inputs to a device using the .to() method "
+            "(default: don't call the .to() method)",
+        )
+
+        parser.add_argument(
+            "--load-kwargs",
+            required=False,
+            default="{}",
+            type=json.loads,
+            help="Arbitrary kwargs, in json format, that will be passed as "
+            "from_pretrained(**kwargs). "
+            r"Example: --load-kwargs='{\"trust_remote_code\": true} would result in "
+            "from_pretrained(trust_remote_code=True)",
+        )
+
+        parser.add_argument(
+            "--channels-last",
+            default=True,
+            type=bool,
+            help="Whether to format the model in memory using "
+            "channels-last (default: True)",
+        )
+
+        return parser
+
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+
+        from lemonade.tools.huggingface.utils import str_to_dtype
+
+        parsed_args = super().parse(state, args, known_only)
+
+        # Save stats about the user's input (do this prior to decoding)
+        state.save_stat(Keys.CHECKPOINT, parsed_args.input)
+        state.save_stat(Keys.DTYPE, parsed_args.dtype)
+
+        # Decode dtype arg into a torch value
+        parsed_args.dtype = str_to_dtype[parsed_args.dtype]
+
+        return parsed_args
+
+    def run(
+        self,
+        state: State,
+        input: str = "",
+        dtype: "torch.dtype" = None,
+        device: Optional[str] = None,
+        load_kwargs: Optional[Dict] = None,
+        channels_last: bool = True,
+    ) -> State:
+        # Import expensive modules at runtime
+        import transformers
+        import torch
+
+        from lemonade.tools.huggingface.utils import (
+            HuggingfaceTokenizerAdapter,
+            HuggingfaceAdapter,
+        )
+        from lemonade.common.network import (
+            is_offline,
+            get_base_model,
+        )
+
+        # Set default dtype
+        if dtype is None:
+            dtype_to_use = torch.float32
+        else:
+            dtype_to_use = dtype
+
+        # Auto-detect offline status
+        offline = is_offline()
+        if offline:
+            printing.log_warning(
+                "Network connectivity to huggingface.co not detected. Running in offline mode."
+            )
+
+        checkpoint = input
+
+        if load_kwargs is None:
+            load_kwargs_to_use = {}
+        else:
+            load_kwargs_to_use = load_kwargs
+
+        # Add local_files_only to kwargs in offline mode
+        if offline:
+            load_kwargs_to_use["local_files_only"] = True
+
+        if vars(state).get(Keys.MODEL):
+            raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
+
+        try:
+            model = transformers.AutoModelForCausalLM.from_pretrained(
+                checkpoint,
+                torch_dtype=dtype_to_use,
+                low_cpu_mem_usage=True,
+                **load_kwargs_to_use,
+            )
+        except Exception as e:
+            if offline and "Can't load config for" in str(e):
+                raise ValueError(
+                    f"Cannot load model {checkpoint} in offline mode. "
+                    f"The model files may not be available locally. Original error: {str(e)}"
+                )
+            raise
+
+        # Only call the model.to() method if an argument to this function
+        # provides a reason to do so
+        to_args = {}
+        if channels_last:
+            to_args["memory_format"] = torch.channels_last
+        if device:
+            to_args["device"] = device
+        if to_args:
+            model.to(**to_args)
+
+        model = model.eval()
+
+        try:
+            tokenizer_kwargs = {
+                "use_fast": False,
+                "model_max_length": 4096,
+                "padding_side": "left",
+            }
+            if offline:
+                tokenizer_kwargs["local_files_only"] = True
+
+            tokenizer = transformers.AutoTokenizer.from_pretrained(
+                checkpoint, **tokenizer_kwargs
+            )
+        except ValueError as e:
+            # Sometimes those specific tokenizer flags are not supported, in which
+            # case we try to just load a simple tokenizer
+            tokenizer_kwargs = {}
+            if offline:
+                tokenizer_kwargs["local_files_only"] = True
+
+            try:
+                tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    checkpoint, **tokenizer_kwargs
+                )
+            except Exception as e:
+                if offline and "Can't load tokenizer for" in str(e):
+                    raise ValueError(
+                        f"Cannot load tokenizer for {checkpoint} in offline mode. "
+                        f"The tokenizer files may not be available locally. "
+                        f"Original error: {str(e)}"
+                    )
+                raise
+
+        # Pass the model and inputs into state
+        state.model = HuggingfaceAdapter(model, dtype_to_use, device, tokenizer)
+
+        state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
+        state.dtype = dtype_to_use
+        state.checkpoint = checkpoint
+        state.device = device
+
+        # Save stats about the model
+        state.save_stat(Keys.CHECKPOINT, checkpoint)
+        state.save_stat(Keys.DTYPE, str(dtype_to_use).split(".")[1])
+        state.save_stat(Keys.DEVICE, device)
+
+        # Get base model information
+        base_model = get_base_model(checkpoint)
+        if base_model is not None:
+            state.save_stat("base_model", base_model)
+
+        # Create a UniqueInvocationInfo and ModelInfo so that we can display status
+        # at the end of the sequence
+        status.add_to_state(state=state, name=input, model=model)
+
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
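The offline handling in the new loader reduces to a simple pattern: probe huggingface.co, and if it is unreachable, force local_files_only=True on every from_pretrained() call so that transformers only reads the local cache. A minimal standalone sketch of that pattern follows; the checkpoint name and the probe helper are illustrative, not part of the package.

import socket

import torch
import transformers


def probably_offline(host: str = "huggingface.co") -> bool:
    # Failed DNS resolution is used as a cheap proxy for "no connectivity",
    # mirroring the is_offline() helper that moved to lemonade/common/network.py
    try:
        socket.gethostbyname(host)
        return False
    except socket.gaierror:
        return True


checkpoint = "facebook/opt-125m"  # illustrative; any Hugging Face causal LM checkpoint
load_kwargs = {"torch_dtype": torch.float32, "low_cpu_mem_usage": True}
if probably_offline():
    # Only read files already present in the local cache; never attempt a download
    load_kwargs["local_files_only"] = True

model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint, **load_kwargs).eval()
tokenizer = transformers.AutoTokenizer.from_pretrained(
    checkpoint, local_files_only=load_kwargs.get("local_files_only", False)
)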
lemonade/tools/{huggingface_load.py → huggingface/utils.py}
RENAMED
@@ -1,16 +1,12 @@
-import argparse
-
-import json
-import socket
+from typing import Dict, List, Tuple
+import time
+from contextlib import nullcontext
 import transformers
 import torch
-from huggingface_hub import model_info
 from lemonade.state import State
-
-
-from lemonade.tools import FirstTool
-from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
-from lemonade.cache import Keys
+from lemonade.tools.adapter import TokenizerAdapter
+from lemonade.tools.adapter import ModelAdapter
+from lemonade.tools.bench import Bench
 
 # Command line interfaces for tools will use string inputs for data
 # types, however the internal tool logic will need to know the actual
@@ -62,249 +58,6 @@ class HuggingfaceTokenizerAdapter(TokenizerAdapter):
         return self.tokenizer.save_pretrained(model_dir, **kwargs)
 
 
-def is_offline():
-    """
-    Check if the system is offline by attempting to connect to huggingface.co.
-
-    Returns:
-        bool: True if the system is offline (cannot connect to huggingface.co),
-        False otherwise.
-    """
-    try:
-        socket.gethostbyname("huggingface.co")
-        return False
-    except socket.gaierror:
-        return True
-
-
-def get_base_model(checkpoint: str) -> Optional[str]:
-    """
-    Get the base model information for a given checkpoint from the Hugging Face Hub.
-    Will auto-detect if we're offline and skip the network call in that case.
-
-    Args:
-        checkpoint: The model checkpoint to query
-
-    Returns:
-        The base model name if found, or None if not found or error occurs
-    """
-    # Skip network call in offline mode
-    if is_offline():
-        return None
-
-    try:
-        info = model_info(checkpoint)
-        if info.cardData and "base_model" in info.cardData:
-            if info.cardData["base_model"] is not None:
-                # This is a derived model
-                return info.cardData["base_model"]
-            else:
-                # This is itself a base model
-                return [checkpoint]
-    except Exception:  # pylint: disable=broad-except
-        pass
-    return None
-
-
-class HuggingfaceLoad(FirstTool):
-    """
-    Load an LLM as a torch.nn.Module using the Hugging Face transformers
-    from_pretrained() API.
-
-    Expected input: a checkpoint to load
-
-    Output state produced:
-        - state.model: instance of torch.nn.Module that implements an LLM.
-        - state.inputs: tokenized example inputs to the model, in the form of a
-          dictionary of kwargs.
-        - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
-        - state.dtype: data type of the model.
-        - state.checkpoint: pretrained checkpoint used to load the model.
-    """
-
-    unique_name = "huggingface-load"
-
-    def __init__(self):
-        super().__init__(monitor_message="Loading Huggingface checkpoint")
-
-        self.status_stats = [Keys.DTYPE]
-
-    @staticmethod
-    def parser(add_help: bool = True) -> argparse.ArgumentParser:
-        parser = __class__.helpful_parser(
-            short_description="Load an LLM in PyTorch using huggingface transformers",
-            add_help=add_help,
-        )
-
-        default_dtype = "float32"
-        parser.add_argument(
-            "--dtype",
-            "-d",
-            required=False,
-            default=default_dtype,
-            help=f"Data type to load the model in (default: {default_dtype}).",
-        )
-
-        choices = ["cpu", "cuda"]
-        for cuda in range(15):
-            choices.append(f"cuda:{cuda}")
-        parser.add_argument(
-            "--device",
-            required=False,
-            default=None,
-            choices=choices,
-            help="Move the model and inputs to a device using the .to() method "
-            "(default: don't call the .to() method)",
-        )
-
-        parser.add_argument(
-            "--load-kwargs",
-            required=False,
-            default="{}",
-            type=json.loads,
-            help="Arbitrary kwargs, in json format, that will be passed as "
-            "from_pretrained(**kwargs). "
-            r"Example: --load-kwargs='{\"trust_remote_code\": true} would result in "
-            "from_pretrained(trust_remote_code=True)",
-        )
-
-        parser.add_argument(
-            "--channels-last",
-            default=True,
-            type=bool,
-            help="Whether to format the model in memory using "
-            "channels-last (default: True)",
-        )
-
-        return parser
-
-    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
-
-        parsed_args = super().parse(state, args, known_only)
-
-        # Save stats about the user's input (do this prior to decoding)
-        state.save_stat(Keys.CHECKPOINT, parsed_args.input)
-        state.save_stat(Keys.DTYPE, parsed_args.dtype)
-
-        # Decode dtype arg into a torch value
-        parsed_args.dtype = str_to_dtype[parsed_args.dtype]
-
-        return parsed_args
-
-    def run(
-        self,
-        state: State,
-        input: str = "",
-        dtype: torch.dtype = torch.float32,
-        device: Optional[str] = None,
-        load_kwargs: Optional[Dict] = None,
-        channels_last: bool = True,
-    ) -> State:
-        # Auto-detect offline status
-        offline = is_offline()
-        if offline:
-            printing.log_warning(
-                "Network connectivity to huggingface.co not detected. Running in offline mode."
-            )
-
-        checkpoint = input
-
-        if load_kwargs is None:
-            load_kwargs_to_use = {}
-        else:
-            load_kwargs_to_use = load_kwargs
-
-        # Add local_files_only to kwargs in offline mode
-        if offline:
-            load_kwargs_to_use["local_files_only"] = True
-
-        if vars(state).get(Keys.MODEL):
-            raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
-
-        try:
-            model = transformers.AutoModelForCausalLM.from_pretrained(
-                checkpoint,
-                torch_dtype=dtype,
-                low_cpu_mem_usage=True,
-                **load_kwargs_to_use,
-            )
-        except Exception as e:
-            if offline and "Can't load config for" in str(e):
-                raise ValueError(
-                    f"Cannot load model {checkpoint} in offline mode. "
-                    f"The model files may not be available locally. Original error: {str(e)}"
-                )
-            raise
-
-        # Only call the model.to() method if an argument to this function
-        # provides a reason to do so
-        to_args = {}
-        if channels_last:
-            to_args["memory_format"] = torch.channels_last
-        if device:
-            to_args["device"] = device
-        if to_args:
-            model.to(**to_args)
-
-        model = model.eval()
-
-        try:
-            tokenizer_kwargs = {
-                "use_fast": False,
-                "model_max_length": 4096,
-                "padding_side": "left",
-            }
-            if offline:
-                tokenizer_kwargs["local_files_only"] = True
-
-            tokenizer = transformers.AutoTokenizer.from_pretrained(
-                checkpoint, **tokenizer_kwargs
-            )
-        except ValueError as e:
-            # Sometimes those specific tokenizer flags are not supported, in which
-            # case we try to just load a simple tokenizer
-            tokenizer_kwargs = {}
-            if offline:
-                tokenizer_kwargs["local_files_only"] = True
-
-            try:
-                tokenizer = transformers.AutoTokenizer.from_pretrained(
-                    checkpoint, **tokenizer_kwargs
-                )
-            except Exception as e:
-                if offline and "Can't load tokenizer for" in str(e):
-                    raise ValueError(
-                        f"Cannot load tokenizer for {checkpoint} in offline mode. "
-                        f"The tokenizer files may not be available locally. "
-                        f"Original error: {str(e)}"
-                    )
-                raise
-
-        # Pass the model and inputs into state
-        state.model = HuggingfaceAdapter(model, dtype, device, tokenizer)
-
-        state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
-        state.dtype = dtype
-        state.checkpoint = checkpoint
-        state.device = device
-
-        # Save stats about the model
-        state.save_stat(Keys.CHECKPOINT, checkpoint)
-        state.save_stat(Keys.DTYPE, str(dtype).split(".")[1])
-        state.save_stat(Keys.DEVICE, device)
-
-        # Get base model information
-        base_model = get_base_model(checkpoint)
-        if base_model is not None:
-            state.save_stat("base_model", base_model)
-
-        # Create a UniqueInvocationInfo and ModelInfo so that we can display status
-        # at the end of the sequence
-        status.add_to_state(state=state, name=input, model=model)
-
-        return state
-
-
 class HuggingfaceAdapter(ModelAdapter):
     """
     Wrapper class for Huggingface LLMs that handle generation arguments
@@ -522,5 +275,84 @@ class HuggingfaceAdapter(ModelAdapter):
         return text_offset, token_log_probs, token_strings, top_logprobs_list
 
 
-
-
+def benchmark_huggingface_llm(
+    model: torch.nn.Module,
+    tokenizer,
+    input_ids,
+    dtype,
+    num_beams: int,
+    target_output_tokens: int,
+    iterations: int,
+    warmup_iterations: int,
+    report_progress_fn,
+) -> List[Tuple[float, int]]:
+
+    amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
+    # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
+    # where torch.cpu.amp.autocast(enabled=False) does nothing
+    with (
+        torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
+        if amp_enabled
+        else nullcontext()
+    ):
+
+        per_iteration_result = []
+        tokens_out_len_list = []
+
+        # Early stopping is only a valid parameter with multiple beams
+        early_stopping = num_beams > 1
+
+        with torch.no_grad(), torch.inference_mode():
+            # Don't capture time for warmup
+            for count in range(warmup_iterations):
+                outputs = model.generate(
+                    input_ids,
+                    num_beams=num_beams,
+                    max_new_tokens=target_output_tokens,
+                    min_new_tokens=target_output_tokens,
+                    early_stopping=early_stopping,
+                    pad_token_id=tokenizer.eos_token_id,
+                )
+                tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
+                report_progress_fn((count + 1) / (warmup_iterations + iterations))
+
+            for count in range(iterations):
+                # CUDA synchronization is required prior to GPU benchmarking
+                # This has no negative effect on CPU-only benchmarks, and is more robust than
+                # checking `model.device == "cuda"` since it applies to multi-GPU environments
+                # Synchronization is done before collecting the start time because this will
+                # ensure that the GPU has finished initialization tasks such as loading weights
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                start_time = time.perf_counter()
+
+                outputs = model.generate(
+                    input_ids,
+                    num_beams=num_beams,
+                    max_new_tokens=target_output_tokens,
+                    min_new_tokens=target_output_tokens,
+                    early_stopping=early_stopping,
+                    pad_token_id=tokenizer.eos_token_id,
+                )
+
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                end_time = time.perf_counter()
+
+                latency = end_time - start_time
+
+                token_len = outputs.shape[1] - input_ids.shape[1]
+                tokens_out_len_list.append(token_len)
+
+                # Only count an iteration if it produced enough tokens
+                if token_len >= target_output_tokens:
+                    per_iteration_result.append((latency, token_len))
+
+                report_progress_fn(
+                    (warmup_iterations + count + 1) / (warmup_iterations + iterations)
+                )
+
+        if not per_iteration_result:
+            raise Bench.not_enough_tokens(target_output_tokens)
+
+        return per_iteration_result, tokens_out_len_list
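Because benchmark_huggingface_llm takes an already-loaded model and tokenizer, it can be exercised on its own. A hedged usage sketch follows; only the function signature and return shape come from the diff above, while the checkpoint, prompt, and no-op progress callback are illustrative.

import torch
import transformers

from lemonade.tools.huggingface.utils import benchmark_huggingface_llm

checkpoint = "facebook/opt-125m"  # illustrative small model
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.float32
).eval()

# Tokenize a fixed prompt so every iteration benchmarks the same input length
input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids

results, token_counts = benchmark_huggingface_llm(
    model=model,
    tokenizer=tokenizer,
    input_ids=input_ids,
    dtype=torch.float32,
    num_beams=1,
    target_output_tokens=32,
    iterations=5,
    warmup_iterations=1,
    report_progress_fn=lambda fraction: None,  # no-op progress callback
)

# Each entry is (latency_seconds, tokens_generated) for one measured iteration
for latency, tokens in results:
    print(f"{tokens} tokens in {latency:.2f} s -> {tokens / latency:.1f} tok/s")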
lemonade/tools/humaneval.py
CHANGED
@@ -2,9 +2,7 @@ import argparse
 import os
 import csv
 from typing import Dict, Optional, Any
-
-from human_eval.data import write_jsonl, read_problems
-from human_eval.evaluation import evaluate_functional_correctness
+
 
 from lemonade.state import State
 from lemonade.tools import Tool
@@ -95,6 +93,7 @@ class AccuracyHumaneval(Tool):
         Returns:
             Updated state with evaluation results
         """
+
         # Validate required state components
         if not hasattr(state, "model") or not hasattr(state, "tokenizer"):
             raise ValueError("State must contain both 'model' and 'tokenizer'")
@@ -128,6 +127,9 @@
 
     def _download_dataset(self, output_path: str) -> None:
         """Download HumanEval dataset if not already present."""
+
+        import requests
+
         if os.path.exists(output_path):
             printing.log_info(f"Dataset already exists at: {output_path}")
             return
@@ -170,6 +172,10 @@
         Returns:
             Dictionary containing evaluation metrics
         """
+
+        from human_eval.data import write_jsonl, read_problems
+        from human_eval.evaluation import evaluate_functional_correctness
+
         dataset = read_problems(data_path)
 
         # Limit to first N problems

lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py}
RENAMED
@@ -3,7 +3,7 @@ import statistics
 from statistics import StatisticsError
 from lemonade.state import State
 from lemonade.cache import Keys
-from lemonade.tools.llamacpp import LlamaCppAdapter
+from lemonade.tools.llamacpp.load import LlamaCppAdapter
 from lemonade.tools.bench import Bench
 
 
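The humaneval.py and load.py changes above share one pattern worth calling out: imports of heavy or optional dependencies (human_eval, requests, transformers, torch) move from module level into the methods that need them, so importing the tool no longer requires those packages to be installed. A generic sketch of the pattern, with illustrative names:

class DatasetTool:
    """Tool whose optional dependency is only needed when it actually runs."""

    def download(self, url: str, output_path: str) -> None:
        # Deferred import: 'requests' is resolved only if download() is called,
        # not when this module is imported.
        import requests

        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(output_path, "wb") as file:
            file.write(response.content)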