lemonade-sdk 9.1.1 (lemonade_sdk-9.1.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
lemonade/tools/llamacpp/bench.py
@@ -0,0 +1,255 @@
+ import argparse
+ import statistics
+ from statistics import StatisticsError
+ from lemonade.state import State
+ from lemonade.tools.tool import Tool
+ from lemonade.tools.llamacpp.utils import LlamaCppAdapter
+ from lemonade.tools.bench import (
+     Bench,
+     default_prompt_length,
+     default_iterations,
+     default_output_tokens,
+     default_warmup_runs,
+ )
+
+
+ class LlamaCppBench(Bench):
+     """
+     Benchmark a llama.cpp model
+     """
+
+     unique_name = "llamacpp-bench"
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Benchmark an LLM in llama.cpp",
+             add_help=add_help,
+         )
+
+         parser = Bench.parser(parser)
+
+         parser.add_argument(
+             "--cli",
+             action="store_true",
+             help="Set this flag to use llama-cli.exe to benchmark model performance. "
+             "This executable will be called once per iteration. Otherwise, "
+             "llama-bench.exe is used by default. In this default behavior, "
+             "the only valid prompt format is integer token lengths. Also, the "
+             "warmup-iterations parameter is ignored and the default value for number of "
+             "threads is 16.",
+         )
+
+         return parser
+
+     def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+         """
+         Helper function to parse CLI arguments into the args expected by run()
+         """
+
+         # Call the Tool parse method, NOT the Bench parse method
+         parsed_args = Tool.parse(self, state, args, known_only)
+
+         if parsed_args.cli:
+             parsed_args = super().parse(state, args, known_only)
+         else:
+             # Make sure prompts is a list of integers
+             if parsed_args.prompts is None:
+                 parsed_args.prompts = [default_prompt_length]
+             prompt_ints = []
+             for prompt_item in parsed_args.prompts:
+                 if prompt_item.isdigit():
+                     prompt_ints.append(int(prompt_item))
+                 else:
+                     raise Exception(
+                         f"When not using the --cli flag to {self.unique_name}, prompts "
+                         "must be given as integer token lengths."
+                     )
+             parsed_args.prompts = prompt_ints
+
+         return parsed_args
+
+     def run_prompt(
+         self,
+         state: State,
+         report_progress_fn,
+         prompt: str,
+         iterations: int,
+         warmup_iterations: int,
+         output_tokens: int,
+     ):
+         """
+         Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+         """
+
+         if self.first_run_prompt:
+
+             if not hasattr(state, "model") or not isinstance(
+                 state.model, LlamaCppAdapter
+             ):
+                 raise Exception(
+                     f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+                     "loaded first. Please run llamacpp-load before this tool."
+                 )
+         model: LlamaCppAdapter = state.model
+
+         per_iteration_tokens_per_second = []
+         per_iteration_time_to_first_token = []
+         per_iteration_peak_wset = []
+
+         for iteration in range(iterations + warmup_iterations):
+             try:
+                 # Use the adapter's generate method which already has the timeout
+                 # and error handling
+                 model.time_to_first_token = None
+                 model.tokens_per_second = None
+                 raw_output, stderr = model.generate(
+                     prompt,
+                     max_new_tokens=output_tokens,
+                     return_raw=True,
+                     save_max_memory_used=self.save_max_memory_used,
+                 )
+
+                 if model.time_to_first_token is None or model.tokens_per_second is None:
+                     error_msg = (
+                         "Could not find timing information in llama.cpp output.\n"
+                     )
+                     error_msg += "Raw output:\n" + raw_output + "\n"
+                     error_msg += "Stderr:\n" + stderr
+                     raise Exception(error_msg)
+
+                 self.tokens_out_len_list.append(model.response_tokens)
+
+                 if iteration > warmup_iterations - 1:
+                     per_iteration_tokens_per_second.append(model.tokens_per_second)
+                     per_iteration_time_to_first_token.append(model.time_to_first_token)
+                     per_iteration_peak_wset.append(model.peak_wset)
+
+                 report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+             except Exception as e:
+                 error_msg = f"Failed to run benchmark: {str(e)}"
+                 raise Exception(error_msg)
+
+         self.input_ids_len_list.append(model.prompt_tokens)
+         mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
+         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+         self.prefill_tokens_per_second_list.append(
+             model.prompt_tokens / mean_time_to_first_token
+         )
+         self.token_generation_tokens_per_second_list.append(
+             statistics.mean(per_iteration_tokens_per_second)
+         )
+         try:
+             self.std_dev_time_to_first_token_list.append(
+                 statistics.stdev(per_iteration_time_to_first_token)
+             )
+         except StatisticsError:
+             # Less than 2 measurements
+             self.std_dev_time_to_first_token_list.append(None)
+         try:
+             self.std_dev_token_generation_tokens_per_second_list.append(
+                 statistics.stdev(per_iteration_tokens_per_second)
+             )
+         except StatisticsError:
+             # Less than 2 measurements
+             self.std_dev_token_generation_tokens_per_second_list.append(None)
+         if self.save_max_memory_used:
+             filtered_list = [
+                 item for item in per_iteration_peak_wset if item is not None
+             ]
+             mean_gb_used = (
+                 None
+                 if len(filtered_list) == 0
+                 else statistics.mean(filtered_list) / 1024**3
+             )
+             self.max_memory_used_gb_list.append(mean_gb_used)
+
+     def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+         if prompts is None:
+             prompts = [default_prompt_length]
+         elif isinstance(prompts, int):
+             prompts = [prompts]
+
+         state.save_stat("prompts", prompts)
+         state.save_stat("iterations", iterations)
+         state.save_stat("output_tokens", output_tokens)
+
+         counter = 0
+         report_progress_fn = lambda x: self.set_percent_progress(
+             100 * (counter + x) / len(prompts)
+         )
+         self.first_run_prompt = True
+         for counter, prompt in enumerate(prompts):
+             report_progress_fn(0)
+
+             self.run_prompt_llama_bench_exe(
+                 state,
+                 prompt,
+                 iterations,
+                 output_tokens,
+             )
+             self.first_run_prompt = False
+
+         self.set_percent_progress(None)
+         self.save_stats(state)
+         return state
+
+     def run_prompt_llama_bench_exe(self, state, prompt, iterations, output_tokens):
+
+         model: LlamaCppAdapter = state.model
+         prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset = (
+             model.benchmark(prompt, iterations, output_tokens)
+         )
+         self.input_ids_len_list.append(prompt_length)
+         self.prefill_tokens_per_second_list.append(pp_tps)
+         self.std_dev_prefill_tokens_per_second_list.append(pp_tps_sd)
+         self.mean_time_to_first_token_list.append(prompt_length / pp_tps)
+         self.token_generation_tokens_per_second_list.append(tg_tps)
+         self.std_dev_token_generation_tokens_per_second_list.append(tg_tps_sd)
+         self.tokens_out_len_list.append(output_tokens * iterations)
+         if self.save_max_memory_used:
+             if peak_wset is not None:
+                 self.max_memory_used_gb_list.append(peak_wset / 1024**3)
+             else:
+                 self.max_memory_used_gb_list.append(None)
+
+     def run(
+         self,
+         state: State,
+         prompts: list[str] = None,
+         iterations: int = default_iterations,
+         warmup_iterations: int = default_warmup_runs,
+         output_tokens: int = default_output_tokens,
+         cli: bool = False,
+         **kwargs,
+     ) -> State:
+         """
+         Args:
+             - prompts: List of input prompts used as starting points for LLM text generation
+             - iterations: Number of benchmarking samples to take; results are
+                 reported as the median and mean of the samples.
+             - warmup_iterations: Subset of the iterations to treat as warmup,
+                 and not included in the results.
+             - output_tokens: Number of new tokens for the LLM to create.
+             - cli: Use multiple calls to llama-cli.exe instead of llama-bench.exe
+             - kwargs: Additional parameters used by bench tools
+         """
+
+         # Check that state has the attribute model and that it is a LlamaCppAdapter
+         if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+             raise Exception("Load model using llamacpp-load first.")
+
+         if cli:
+             state = super().run(
+                 state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+             )
+         else:
+             state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+         return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
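
The two benchmarking paths above can also be driven directly from Python. A minimal sketch, assuming `state` already carries a `LlamaCppAdapter` produced by the `llamacpp-load` tool shown in the next file (the construction of `state` itself is not part of this diff):

    from lemonade.tools.llamacpp.bench import LlamaCppBench

    bench = LlamaCppBench()

    # Default path: llama-bench.exe, prompts given as integer token lengths
    state = bench.run(state, prompts=[128, 256], iterations=5, output_tokens=64)

    # --cli path: llama-cli.exe is launched once per iteration, and the
    # warmup iterations are excluded from the reported statistics
    state = bench.run(state, prompts=[128], iterations=5, warmup_iterations=2, cli=True)
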
lemonade/tools/llamacpp/load.py
@@ -0,0 +1,222 @@
+ import argparse
+ import os
+ import lemonade.common.printing as printing
+ import lemonade.common.status as status
+ from lemonade.state import State
+ from lemonade.tools import FirstTool
+ from lemonade.cache import Keys
+
+
+ class LoadLlamaCpp(FirstTool):
+     unique_name = "llamacpp-load"
+
+     def __init__(self):
+         super().__init__(monitor_message="Loading llama.cpp model")
+
+         self.status_stats = [
+             Keys.DEVICE,
+         ]
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Wrap llama.cpp models with an API",
+             add_help=add_help,
+         )
+
+         parser.add_argument(
+             "-d",
+             "--device",
+             choices=["cpu", "igpu"],
+             default="igpu",
+             help="Which device to load the model onto (default: igpu)",
+         )
+
+         default_threads = -1
+         parser.add_argument(
+             "--threads",
+             required=False,
+             type=int,
+             default=default_threads,
+             help=f"Number of threads to use during generation (default: {default_threads})",
+         )
+
+         context_size = 4096
+         parser.add_argument(
+             "--context-size",
+             required=False,
+             type=int,
+             default=context_size,
+             help=f"Size of the prompt context (default: {context_size}; 0 = loaded from model)",
+         )
+
+         output_tokens = 512
+         parser.add_argument(
+             "--output-tokens",
+             required=False,
+             type=int,
+             default=output_tokens,
+             help=f"Maximum number of output tokens to generate (default: {output_tokens})",
+         )
+
+         parser.add_argument(
+             "--reasoning",
+             action="store_true",
+             help="Set this flag to indicate the model is a reasoning model",
+         )
+
+         parser.add_argument(
+             "--backend",
+             choices=["vulkan", "rocm", "cpu"],
+             default="vulkan",
+             help="Backend to use for llama.cpp (default: vulkan)",
+         )
+
+         return parser
+
+     def run(
+         self,
+         state: State,
+         input: str = "",
+         device: str = "igpu",
+         context_size: int = 512,
+         threads: int = 1,
+         output_tokens: int = 512,
+         reasoning: bool = False,
+         backend: str = "vulkan",
+     ) -> State:
+         """
+         Load a llama.cpp model
+         """
+
+         from lemonade.common.network import is_offline
+         from lemonade.tools.llamacpp.utils import (
+             install_llamacpp,
+             get_llama_cli_exe_path,
+             get_llama_bench_exe_path,
+             get_llama_installed_version,
+             parse_checkpoint,
+             download_gguf,
+             resolve_local_gguf_model,
+             get_local_checkpoint_path,
+             LlamaCppTokenizerAdapter,
+             LlamaCppAdapter,
+         )
+
+         install_llamacpp(backend)
+
+         extension = ""
+
+         # Check if input is a local folder containing a .GGUF model
+         if os.path.isdir(input):
+             # input is a local folder
+             local_model_folder = os.path.abspath(input)
+             checkpoint = "local_model"
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+             state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
+
+             # See if there is a file ending in ".gguf" in this folder
+             dir = os.listdir(input)
+             gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
+             if len(gguf_files) == 0:
+                 raise ValueError(
+                     f"The folder {input} does not contain a GGUF model file."
+                 )
+             model_to_use = gguf_files[0]
+             full_model_path = os.path.join(local_model_folder, model_to_use)
+             extension = ".gguf"
+
+         elif input.endswith(".gguf") and os.path.isfile(input):
+             # input is a local .gguf file
+             full_model_path = os.path.abspath(input)
+             checkpoint = "local_model"
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+             state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+             model_to_use = os.path.basename(full_model_path)
+             extension = ".gguf"
+
+         else:
+             # Input is a model checkpoint
+             checkpoint = input
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+             # Make sure that a variant is provided for the GGUF model
+             base_checkpoint, variant = parse_checkpoint(checkpoint)
+             if variant is None:
+                 raise ValueError(
+                     "You are required to provide a 'variant' when "
+                     "selecting a GGUF model. The variant is provided "
+                     "as CHECKPOINT:VARIANT. For example: "
+                     "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                     "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
+                 )
+
+             # Auto-detect offline status
+             offline = is_offline()
+             if offline:
+                 printing.log_warning(
+                     "Network connectivity to huggingface.co not detected. Running in offline mode."
+                 )
+                 full_model_path, model_to_use = get_local_checkpoint_path(
+                     base_checkpoint, variant
+                 )
+                 if not full_model_path:
+                     raise ValueError(
+                         f"Model {checkpoint} is not available locally. "
+                         "Cannot download in offline mode."
+                     )
+
+             else:
+                 # First, try to resolve from local cache to avoid unnecessary downloads
+                 base_checkpoint, variant = parse_checkpoint(checkpoint)
+                 snapshot_files = resolve_local_gguf_model(
+                     base_checkpoint, variant, None
+                 )
+
+                 # If not found locally, download from internet
+                 if not snapshot_files:
+                     snapshot_files = download_gguf(checkpoint)
+
+                 full_model_path = snapshot_files["variant"]
+                 model_to_use = os.path.basename(full_model_path)
+
+         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+         llama_bench_exe_path = get_llama_bench_exe_path(backend)
+         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
+
+         # Get the directory containing the executable for shared libraries
+         lib_dir = os.path.dirname(llama_cli_exe_path)
+
+         # Pass the model and inputs into state
+         state.model = LlamaCppAdapter(
+             model=full_model_path,
+             device=device,
+             output_tokens=output_tokens,
+             context_size=context_size,
+             threads=threads,
+             executable=llama_cli_exe_path,
+             bench_executable=llama_bench_exe_path,
+             reasoning=reasoning,
+             lib_dir=lib_dir,
+             state=state,
+         )
+         state.tokenizer = LlamaCppTokenizerAdapter()
+         state.device = device
+
+         # Save initial stats
+         state.save_stat(Keys.DEVICE, device)
+         state.save_stat(
+             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
+         )
+
+         status.add_to_state(
+             state=state, name=input, model=model_to_use, extension=extension
+         )
+         return state
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
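
The run() method above accepts three forms of input, matching the three branches in the code: a local folder containing a .gguf file, a specific local .gguf file, or a checkpoint in CHECKPOINT:VARIANT form. A minimal sketch of each, assuming an existing `state` object (its construction is not shown in this diff; the local paths are placeholders):

    from lemonade.tools.llamacpp.load import LoadLlamaCpp

    load = LoadLlamaCpp()

    # 1) A local folder that contains a .gguf file (the first one found is used)
    state = load.run(state, input="/models/my-model-dir", device="igpu")

    # 2) A specific local .gguf file
    state = load.run(state, input="/models/my-model-q4_0.gguf", backend="vulkan")

    # 3) A checkpoint in CHECKPOINT:VARIANT form, resolved from the local cache
    #    or downloaded (unless offline mode is detected)
    state = load.run(state, input="Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0")
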