lemonade_sdk-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
lemonade/tools/huggingface/bench.py
@@ -0,0 +1,187 @@
+ import argparse
+ import statistics
+ from statistics import StatisticsError
+ import psutil
+ from lemonade.state import State
+ from lemonade.cache import Keys
+ from lemonade.tools.bench import Bench
+
+ default_beams = 1
+
+
+ class HuggingfaceBench(Bench):
+     """
+     Benchmarks the performance of the generate() method of an LLM loaded from
+     Huggingface Transformers (or any object that supports a
+     huggingface-like generate() method).
+
+     Required input state:
+         - DTYPE: data type of the model; used to determine if AMP should be
+           enabled to convert the input data type to match the model data
+           type.
+         - MODEL: huggingface-like instance to benchmark.
+         - INPUTS: model inputs to pass to generate() during benchmarking.
+
+     Output state produced: None
+
+     """
+
+     unique_name = "huggingface-bench"
+
+     @staticmethod
+     def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
+         # Allow inherited classes to initialize and pass in a parser, add parameters to it if so
+         if parser is None:
+             parser = __class__.helpful_parser(
+                 short_description="Benchmark a huggingface-style PyTorch LLM",
+                 add_help=add_help,
+             )
+
+         parser = Bench.parser(parser)
+
+         parser.add_argument(
+             "--num-beams",
+             required=False,
+             type=int,
+             default=default_beams,
+             help=f"Number of beams for the LLM to use (default: {default_beams})",
+         )
+
+         return parser
+
+     def get_prompt_str(self, state, token_length):
+         """
+         Returns a string with the prescribed token length.
+         """
+         model = state.model
+         tokenizer = state.tokenizer
+         test_prompt = "word " * (token_length - 2)
+         input_ids = (
+             tokenizer(test_prompt, return_tensors="pt")
+             .to(device=model.device)
+             .input_ids
+         )
+         test_token_length = input_ids.shape[1]
+         delta = test_token_length - token_length
+         if delta == 0:
+             return test_prompt
+         return "word " * max(token_length - 2 - delta, 0)
+
+     def run_prompt(
+         self,
+         state: State,
+         report_progress_fn,
+         prompt: str,
+         iterations: int,
+         warmup_iterations: int,
+         output_tokens: int,
+         num_beams: int = default_beams,
+     ):
+         """
+         We don't have access to the internal timings of generate(), so time to first
+         token (TTFT, aka prefill latency) and token/s are calculated using the following formulae:
+             prefill_latency = latency of generate(output_tokens=1)
+             execution_latency = latency of generate(output_tokens=output_tokens)
+             tokens_per_second = (new_tokens - 1) / (execution_latency - prefill_latency)
+         """
+
+         from lemonade.tools.huggingface.utils import benchmark_huggingface_llm
+
+         if self.first_run_prompt:
+             if vars(state).get(Keys.MODEL) is None:
+                 raise ValueError(
+                     f"{self.__class__.__name__} requires that a model be passed from another tool"
+                 )
+             if (
+                 vars(state).get("num_beams")
+                 and vars(state).get("num_beams") != num_beams
+             ):
+                 raise ValueError(
+                     f"Number of beams was set to {vars(state).get('num_beams')} "
+                     f"in a previous tool, but it is set to {num_beams} in "
+                     "this tool. The values must be the same."
+                 )
+
+             # Save benchmarking parameters
+             state.save_stat("num_beams", num_beams)
+
+         model = state.model
+         tokenizer = state.tokenizer
+         dtype = state.dtype
+
+         # Generate the input_ids outside the benchmarking function to make sure
+         # the same input_ids are used everywhere
+         input_ids = (
+             tokenizer(prompt, return_tensors="pt").to(device=model.device).input_ids
+         )
+         self.input_ids_len_list.append(input_ids.shape[1])
+
+         prefill_report_progress_fn = lambda x: report_progress_fn(0.5 * x)
+
+         # Benchmark prefill time (time to first token)
+         prefill_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
+             model=model,
+             tokenizer=tokenizer,
+             input_ids=input_ids,
+             dtype=dtype,
+             num_beams=num_beams,
+             target_output_tokens=1,
+             iterations=iterations,
+             warmup_iterations=warmup_iterations,
+             report_progress_fn=prefill_report_progress_fn,
+         )
+         self.tokens_out_len_list += tokens_out_len_list
+
+         time_to_first_token_per_iteration = [
+             latency for latency, _ in prefill_per_iteration_result
+         ]
+         mean_time_to_first_token = statistics.mean(time_to_first_token_per_iteration)
+         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+         self.prefill_tokens_per_second_list.append(
+             input_ids.shape[1] / mean_time_to_first_token
+         )
+         try:
+             self.std_dev_time_to_first_token_list.append(
+                 statistics.stdev(time_to_first_token_per_iteration)
+             )
+         except StatisticsError:
+             # Less than 2 measurements
+             self.std_dev_time_to_first_token_list.append(None)
+
+         decode_report_progress_fn = lambda x: report_progress_fn(0.5 + 0.5 * x)
+
+         # Benchmark generation of all tokens
+         decode_per_iteration_result, tokens_out_len_list = benchmark_huggingface_llm(
+             model=model,
+             tokenizer=tokenizer,
+             input_ids=input_ids,
+             dtype=dtype,
+             num_beams=num_beams,
+             target_output_tokens=output_tokens,
+             iterations=iterations,
+             warmup_iterations=warmup_iterations,
+             report_progress_fn=decode_report_progress_fn,
+         )
+         self.tokens_out_len_list += tokens_out_len_list
+
+         execution_latency_per_iteration = [
+             latency for latency, _ in decode_per_iteration_result
+         ]
+         token_len_per_iteration = [
+             token_len for _, token_len in decode_per_iteration_result
+         ]
+         mean_execution_latency = statistics.mean(execution_latency_per_iteration)
+         mean_decode_latency = mean_execution_latency - mean_time_to_first_token
+         mean_token_len = statistics.mean(token_len_per_iteration)
+         # Subtract 1 so that we don't count the prefill token
+         self.token_generation_tokens_per_second_list.append(
+             (mean_token_len - 1) / mean_decode_latency
+         )
+         if self.save_max_memory_used:
+             self.max_memory_used_gb_list.append(
+                 psutil.Process().memory_info().peak_wset / 1024**3
+             )
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
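
The timing arithmetic in run_prompt's docstring is easy to sanity-check numerically. Below is a minimal, self-contained sketch (hypothetical latency numbers; only the stdlib statistics module) of how mean TTFT and decode tokens/s fall out of the two generate() measurements:

import statistics

# Hypothetical (latency_seconds, output_token_count) pairs, shaped like the
# per-iteration results returned by benchmark_huggingface_llm.
prefill_results = [(0.21, 1), (0.19, 1), (0.20, 1)]    # generate(output_tokens=1)
decode_results = [(2.15, 64), (2.05, 64), (2.10, 64)]  # generate(output_tokens=64)

# TTFT (prefill latency) is simply the latency of producing one token.
mean_ttft = statistics.mean(lat for lat, _ in prefill_results)  # 0.20 s

# Full-generation latency includes prefill, so subtracting mean TTFT
# isolates the decode phase.
mean_execution = statistics.mean(lat for lat, _ in decode_results)  # 2.10 s
mean_decode_latency = mean_execution - mean_ttft  # 1.90 s

# Subtract 1 token so the token emitted during prefill is not counted.
mean_tokens_out = statistics.mean(n for _, n in decode_results)  # 64
tokens_per_second = (mean_tokens_out - 1) / mean_decode_latency  # ~33.2 tok/s

print(f"TTFT: {mean_ttft:.3f} s, decode rate: {tokens_per_second:.1f} tokens/s")

Because the prefill cost is folded into every generate() call, subtracting the mean single-token latency is what isolates the decode phase; this avoids needing access to the model's internal timers.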
lemonade/tools/huggingface/load.py
@@ -0,0 +1,235 @@
+ import argparse
+ from typing import Dict, Optional
+ import json
+ from lemonade.tools import FirstTool
+ from lemonade.state import State
+ import lemonade.common.status as status
+ import lemonade.common.printing as printing
+ from lemonade.cache import Keys
+
+
+ class HuggingfaceLoad(FirstTool):
+     """
+     Load an LLM as a torch.nn.Module using the Hugging Face transformers
+     from_pretrained() API.
+
+     Expected input: a checkpoint to load
+
+     Output state produced:
+         - state.model: instance of torch.nn.Module that implements an LLM.
+         - state.inputs: tokenized example inputs to the model, in the form of a
+           dictionary of kwargs.
+         - state.tokenizer: instance of Hugging Face PretrainedTokenizer.
+         - state.dtype: data type of the model.
+         - state.checkpoint: pretrained checkpoint used to load the model.
+     """
+
+     unique_name = "huggingface-load"
+
+     def _imports(self):
+         pass
+
+     def __init__(self):
+         super().__init__(monitor_message="Loading Huggingface checkpoint")
+
+         self.status_stats = [Keys.DTYPE]
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Load an LLM in PyTorch using huggingface transformers",
+             add_help=add_help,
+         )
+
+         default_dtype = "float32"
+         parser.add_argument(
+             "--dtype",
+             "-d",
+             required=False,
+             default=default_dtype,
+             help=f"Data type to load the model in (default: {default_dtype}).",
+         )
+
+         choices = ["cpu", "cuda"]
+         for cuda in range(15):
+             choices.append(f"cuda:{cuda}")
+         parser.add_argument(
+             "--device",
+             required=False,
+             default=None,
+             choices=choices,
+             help="Move the model and inputs to a device using the .to() method "
+             "(default: don't call the .to() method)",
+         )
+
+         parser.add_argument(
+             "--load-kwargs",
+             required=False,
+             default="{}",
+             type=json.loads,
+             help="Arbitrary kwargs, in json format, that will be passed as "
+             "from_pretrained(**kwargs). "
+             r"Example: --load-kwargs='{\"trust_remote_code\": true}' would result in "
+             "from_pretrained(trust_remote_code=True)",
+         )
+
+         parser.add_argument(
+             "--channels-last",
+             default=True,
+             type=bool,
+             help="Whether to format the model in memory using "
+             "channels-last (default: True)",
+         )
+
+         return parser
+
+     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+
+         from lemonade.tools.huggingface.utils import str_to_dtype
+
+         parsed_args = super().parse(state, args, known_only)
+
+         # Save stats about the user's input (do this prior to decoding)
+         state.save_stat(Keys.CHECKPOINT, parsed_args.input)
+         state.save_stat(Keys.DTYPE, parsed_args.dtype)
+
+         # Decode dtype arg into a torch value
+         parsed_args.dtype = str_to_dtype[parsed_args.dtype]
+
+         return parsed_args
+
+     def run(
+         self,
+         state: State,
+         input: str = "",
+         dtype: "torch.dtype" = None,
+         device: Optional[str] = None,
+         load_kwargs: Optional[Dict] = None,
+         channels_last: bool = True,
+     ) -> State:
+         # Import expensive modules at runtime
+         import transformers
+         import torch
+
+         from lemonade.tools.huggingface.utils import (
+             HuggingfaceTokenizerAdapter,
+             HuggingfaceAdapter,
+         )
+         from lemonade.common.network import (
+             is_offline,
+             get_base_model,
+         )
+
+         # Set default dtype
+         if dtype is None:
+             dtype_to_use = torch.float32
+         else:
+             dtype_to_use = dtype
+
+         # Auto-detect offline status
+         offline = is_offline()
+         if offline:
+             printing.log_warning(
+                 "Network connectivity to huggingface.co not detected. Running in offline mode."
+             )
+
+         checkpoint = input
+
+         if load_kwargs is None:
+             load_kwargs_to_use = {}
+         else:
+             load_kwargs_to_use = load_kwargs
+
+         # Add local_files_only to kwargs in offline mode
+         if offline:
+             load_kwargs_to_use["local_files_only"] = True
+
+         if vars(state).get(Keys.MODEL):
+             raise ValueError("HuggingfaceLoad must be the first tool in the sequence")
+
+         try:
+             model = transformers.AutoModelForCausalLM.from_pretrained(
+                 checkpoint,
+                 torch_dtype=dtype_to_use,
+                 low_cpu_mem_usage=True,
+                 **load_kwargs_to_use,
+             )
+         except Exception as e:
+             if offline and "Can't load config for" in str(e):
+                 raise ValueError(
+                     f"Cannot load model {checkpoint} in offline mode. "
+                     f"The model files may not be available locally. Original error: {str(e)}"
+                 )
+             raise
+
+         # Only call the model.to() method if an argument to this function
+         # provides a reason to do so
+         to_args = {}
+         if channels_last:
+             to_args["memory_format"] = torch.channels_last
+         if device:
+             to_args["device"] = device
+         if to_args:
+             model.to(**to_args)
+
+         model = model.eval()
+
+         try:
+             tokenizer_kwargs = {
+                 "use_fast": False,
+                 "model_max_length": 4096,
+                 "padding_side": "left",
+             }
+             if offline:
+                 tokenizer_kwargs["local_files_only"] = True
+
+             tokenizer = transformers.AutoTokenizer.from_pretrained(
+                 checkpoint, **tokenizer_kwargs
+             )
+         except ValueError:
+             # Sometimes those specific tokenizer flags are not supported, in which
+             # case we try to just load a simple tokenizer
+             tokenizer_kwargs = {}
+             if offline:
+                 tokenizer_kwargs["local_files_only"] = True
+
+             try:
+                 tokenizer = transformers.AutoTokenizer.from_pretrained(
+                     checkpoint, **tokenizer_kwargs
+                 )
+             except Exception as e:
+                 if offline and "Can't load tokenizer for" in str(e):
+                     raise ValueError(
+                         f"Cannot load tokenizer for {checkpoint} in offline mode. "
+                         f"The tokenizer files may not be available locally. "
+                         f"Original error: {str(e)}"
+                     )
+                 raise
+
+         # Pass the model and inputs into state
+         state.model = HuggingfaceAdapter(model, dtype_to_use, device, tokenizer)
+
+         state.tokenizer = HuggingfaceTokenizerAdapter(tokenizer, device)
+         state.dtype = dtype_to_use
+         state.checkpoint = checkpoint
+         state.device = device
+
+         # Save stats about the model
+         state.save_stat(Keys.CHECKPOINT, checkpoint)
+         state.save_stat(Keys.DTYPE, str(dtype_to_use).split(".")[1])
+         state.save_stat(Keys.DEVICE, device)
+
+         # Get base model information
+         base_model = get_base_model(checkpoint)
+         if base_model is not None:
+             state.save_stat("base_model", base_model)
+
+         # Create a UniqueInvocationInfo and ModelInfo so that we can display status
+         # at the end of the sequence
+         status.add_to_state(state=state, name=input, model=model)
+
+         return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
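
Because --load-kwargs is declared with type=json.loads, argparse hands HuggingfaceLoad a ready-made dict, which run() then splats into from_pretrained(). A minimal standalone sketch of that round trip (toy parser only, not the full lemonade CLI):

import argparse
import json

# Same pattern as HuggingfaceLoad.parser(): argparse parses the JSON for us.
parser = argparse.ArgumentParser()
parser.add_argument("--load-kwargs", default="{}", type=json.loads)

args = parser.parse_args(["--load-kwargs", '{"trust_remote_code": true}'])
print(args.load_kwargs)  # {'trust_remote_code': True}

# run() then forwards these kwargs into the transformers call, roughly:
#   transformers.AutoModelForCausalLM.from_pretrained(
#       checkpoint, torch_dtype=dtype, low_cpu_mem_usage=True, **args.load_kwargs
#   )

Note that argparse also applies the type converter to the string default, so omitting the flag yields an empty dict rather than the literal string "{}".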