lemonade-sdk 8.1.4-py3-none-any.whl → 8.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.

Files changed (53)
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/tools/adapter.py CHANGED
@@ -10,11 +10,14 @@ class ModelAdapter(abc.ABC):
  """
  Self-benchmarking ModelAdapters can store their results in the
  tokens_per_second and time_to_first_token members.
+ ModelAdapters that run generate in a different process can store the
+ peak memory used (bytes) by that process in the peak_wset member.
  """
  self.tokens_per_second = None
  self.time_to_first_token = None
  self.prompt_tokens = None
  self.response_tokens = None
+ self.peak_wset = None

  self.type = "generic"

@@ -27,7 +30,9 @@ class ModelAdapter(abc.ABC):
  with recipe components, which themselves may not support a lot of arguments.

  The generate method should store prompt and response lengths (in tokens)
- in the prompt_tokens and response_tokens members.
+ in the prompt_tokens and response_tokens members. If a different process is used,
+ the generate method can also store the peak memory used by that process in the
+ peak_wset member.
  """

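Hypothetical illustration of the new peak_wset contract (not part of this diff): a ModelAdapter subclass that runs generation in a child process could record that process's peak memory as a sketch like the following. The subclass, the executable name, and the polling loop are placeholders, and the import path is assumed from this file's location.

# Hypothetical example, not part of this diff.
import subprocess
import time

import psutil

from lemonade.tools.adapter import ModelAdapter  # import path assumed from this file


class SubprocessAdapter(ModelAdapter):
    def generate(self, prompt, **kwargs):
        proc = subprocess.Popen(["my-generator", prompt])  # placeholder command
        child = psutil.Process(proc.pid)
        peak_bytes = 0
        while proc.poll() is None:
            try:
                info = child.memory_info()
                # peak_wset exists on Windows; fall back to current rss elsewhere
                peak_bytes = max(peak_bytes, getattr(info, "peak_wset", info.rss))
            except psutil.NoSuchProcess:
                break
            time.sleep(0.1)
        self.peak_wset = peak_bytes or None
        # A real adapter would also set prompt_tokens and response_tokens here.
        return ""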
lemonade/tools/bench.py CHANGED
@@ -2,7 +2,6 @@ from abc import ABC, abstractmethod
  import argparse
  import os
  import platform
- import psutil
  from lemonade.state import State
  from lemonade.tools import Tool
  from lemonade.cache import Keys
@@ -29,7 +28,9 @@ class Bench(Tool, ABC):
  Keys.SECONDS_TO_FIRST_TOKEN,
  Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
  Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+ Keys.STD_DEV_TOKENS_PER_SECOND,
  Keys.PREFILL_TOKENS_PER_SECOND,
+ Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
  Keys.PROMPT_TOKENS,
  Keys.RESPONSE_TOKENS,
  Keys.MAX_MEMORY_USED_GBYTE,
@@ -42,7 +43,9 @@ class Bench(Tool, ABC):
  self.mean_time_to_first_token_list = []
  self.std_dev_time_to_first_token_list = []
  self.prefill_tokens_per_second_list = []
+ self.std_dev_prefill_tokens_per_second_list = []
  self.token_generation_tokens_per_second_list = []
+ self.std_dev_token_generation_tokens_per_second_list = []
  self.max_memory_used_gb_list = []

  # Max memory used can only be measured on Windows systems
@@ -88,7 +91,7 @@ class Bench(Tool, ABC):
  default=[str(default_prompt_length)],
  metavar="PROMPT",
  help="Input one or more prompts to the LLM. Three formats are supported. "
- "1) integer: use a synthetic prompt with the specified length "
+ "1) integer: use a synthetic prompt with the specified token length "
  "2) str: use a user-provided prompt string "
  "3) path/to/prompt.txt: load the prompt from a text file. "
  f"(default: {default_prompt_length}) ",
@@ -190,11 +193,6 @@ class Bench(Tool, ABC):
  )
  self.first_run_prompt = False

- if self.save_max_memory_used:
- self.max_memory_used_gb_list.append(
- psutil.Process().memory_info().peak_wset / 1024**3
- )
-
  self.set_percent_progress(None)
  self.save_stats(state)

@@ -211,7 +209,10 @@
  output_tokens,
  **kwargs,
  ):
- pass
+ """
+ The run_prompt method should append the appropriate value to each of the per prompt
+ measurement statistics lists that are members of the Bench class.
+ """

  @staticmethod
  def get_item_or_list(lst):
@@ -246,10 +247,27 @@ class Bench(Tool, ABC):
  Keys.PREFILL_TOKENS_PER_SECOND,
  self.get_item_or_list(self.prefill_tokens_per_second_list),
  )
+ if not all(
+ element is None for element in self.std_dev_prefill_tokens_per_second_list
+ ):
+ state.save_stat(
+ Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+ self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+ )
  state.save_stat(
  Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
  self.get_item_or_list(self.token_generation_tokens_per_second_list),
  )
+ if not all(
+ element is None
+ for element in self.std_dev_token_generation_tokens_per_second_list
+ ):
+ state.save_stat(
+ Keys.STD_DEV_TOKENS_PER_SECOND,
+ self.get_item_or_list(
+ self.std_dev_token_generation_tokens_per_second_list
+ ),
+ )
  if self.save_max_memory_used:
  state.save_stat(
  Keys.MAX_MEMORY_USED_GBYTE,
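For context on the run_prompt contract documented above, the sketch below shows a hypothetical Bench subclass appending one value per prompt to each per-prompt statistics list, including the new std-dev lists, so that Bench.save_stats() can persist them. It is not part of this diff: the signature is abridged to the arguments visible here and the measurement values are placeholders.

# Hypothetical example, not from this diff.
import statistics

from lemonade.tools.bench import Bench


class ExampleBench(Bench):
    unique_name = "example-bench"

    def run_prompt(self, state, report_progress_fn, prompt, iterations,
                   warmup_iterations, output_tokens, **kwargs):
        ttft_samples = [0.41, 0.39, 0.40]   # placeholder seconds-to-first-token samples
        tps_samples = [52.0, 54.5, 53.1]    # placeholder tokens/second samples

        self.input_ids_len_list.append(len(prompt.split()))
        self.tokens_out_len_list.append(output_tokens)
        self.mean_time_to_first_token_list.append(statistics.mean(ttft_samples))
        self.std_dev_time_to_first_token_list.append(statistics.stdev(ttft_samples))
        self.prefill_tokens_per_second_list.append(500.0)          # placeholder
        self.std_dev_prefill_tokens_per_second_list.append(10.0)   # placeholder
        self.token_generation_tokens_per_second_list.append(statistics.mean(tps_samples))
        self.std_dev_token_generation_tokens_per_second_list.append(
            statistics.stdev(tps_samples)
        )
        if self.save_max_memory_used:
            self.max_memory_used_gb_list.append(None)
        report_progress_fn(1.0)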
lemonade/tools/flm/__init__.py ADDED
@@ -0,0 +1 @@
+ # FLM (FastFlowLM) utilities for Lemonade SDK
lemonade/tools/flm/utils.py ADDED
@@ -0,0 +1,303 @@
+ """
+ FLM (FastFlowLM) utilities for installation, version checking, and model management.
+ """
+
+ import os
+ import logging
+ import subprocess
+ import tempfile
+ import time
+ from typing import List, Optional
+
+ import requests
+ from packaging.version import Version, InvalidVersion
+
+
+ def get_flm_latest_version() -> Optional[str]:
+ """
+ Get and return the latest FLM version from "https://github.com/FastFlowLM/FastFlowLM/tags"
+ This uses the GitHub tags API.
+ """
+ url = "https://api.github.com/repos/FastFlowLM/FastFlowLM/tags"
+ try:
+ response = requests.get(url, timeout=10)
+ response.raise_for_status()
+ tags = response.json()
+ if not tags:
+ return None
+ # Tags are sorted in reverse chronological order; find the first that looks like a version
+ for tag in tags:
+ tag_name = tag.get("name", "")
+ # Accept tags of the form v0.9.10, 0.9.10, etc.
+ if tag_name.startswith("v"):
+ version_candidate = tag_name[1:]
+ else:
+ version_candidate = tag_name
+ try:
+ # validate it's a version string
+ _ = Version(version_candidate)
+ return version_candidate
+ except InvalidVersion:
+ continue
+ return None
+ except requests.exceptions.RequestException as e:
+ logging.debug("Error retrieving latest FLM version: %s", e)
+ return None
+
+
+ def check_flm_version() -> Optional[str]:
+ """
+ Check if FLM is installed and return version, or None if not available.
+ """
+ latest_version_str = get_flm_latest_version()
+ try:
+ result = subprocess.run(
+ ["flm", "version"],
+ capture_output=True,
+ text=True,
+ check=True,
+ encoding="utf-8",
+ errors="replace",
+ )
+
+ # Parse version from output like "FLM v0.9.4"
+ output = result.stdout.strip()
+ if output.startswith("FLM v"):
+ version_str = output[5:] # Remove "FLM v" prefix
+ return version_str, latest_version_str
+ return None, latest_version_str
+
+ except (subprocess.CalledProcessError, FileNotFoundError):
+ return None, latest_version_str
+
+
+ def refresh_environment():
+ """
+ Refresh PATH to pick up newly installed executables.
+ """
+ if os.name == "nt": # Windows
+ # On Windows, we need to refresh the PATH from registry
+ import winreg
+
+ try:
+ with winreg.OpenKey(
+ winreg.HKEY_LOCAL_MACHINE,
+ r"SYSTEM\CurrentControlSet\Control\Session Manager\Environment",
+ ) as key:
+ path_value, _ = winreg.QueryValueEx(key, "PATH")
+ os.environ["PATH"] = path_value + ";" + os.environ.get("PATH", "")
+ except Exception as e: # pylint: disable=broad-except
+ logging.debug("Could not refresh PATH from registry: %s", e)
+
+ # Also try to add common installation paths
+ common_paths = [
+ r"C:\Program Files\FLM",
+ r"C:\Program Files (x86)\FLM",
+ os.path.expanduser(r"~\AppData\Local\FLM"),
+ ]
+ for path in common_paths:
+ if os.path.exists(path) and path not in os.environ.get("PATH", ""):
+ os.environ["PATH"] = path + ";" + os.environ.get("PATH", "")
+
+
+ def install_flm():
+ """
+ Check if FLM is installed and at minimum version.
+ If not, download and run the GUI installer, then wait for completion.
+ """
+ # Check current FLM installation
+ current_version, latest_version = check_flm_version()
+
+ if (
+ current_version
+ and latest_version
+ and Version(current_version) == Version(latest_version)
+ ):
+ logging.info(
+ "FLM v%s is already installed and is up to date (latest version: v%s).",
+ current_version,
+ latest_version,
+ )
+ return
+
+ if current_version:
+ if not latest_version:
+ logging.info(
+ "Unable to detect the latest FLM version; continuing with installed FLM v%s.",
+ current_version,
+ )
+ return
+ logging.info(
+ "FLM v%s is installed but below latest version v%s. Upgrading...",
+ current_version,
+ latest_version,
+ )
+ verysilent = True
+ else:
+ logging.info("FLM not found. Installing FLM v%s or later...", latest_version)
+ verysilent = False
+
+ # Download the installer
+ # pylint: disable=line-too-long
+ installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
+ installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+ installer_args = [installer_path, "/VERYSILENT"] if verysilent else [installer_path]
+
+ try:
+ # Remove existing installer if present
+ if os.path.exists(installer_path):
+ os.remove(installer_path)
+
+ logging.info("Downloading FLM installer...")
+ response = requests.get(installer_url, stream=True, timeout=30)
+ response.raise_for_status()
+
+ # Save installer to disk
+ with open(installer_path, "wb") as f:
+ for chunk in response.iter_content(chunk_size=8192):
+ f.write(chunk)
+ f.flush()
+ os.fsync(f.fileno())
+
+ logging.info("Downloaded FLM installer to %s", installer_path)
+
+ # Launch the installer GUI
+ logging.warning(
+ "Launching FLM installer GUI. Please complete the installation..."
+ if not verysilent
+ else "Installing FLM..."
+ )
+
+ # Launch installer and wait for it to complete
+ if os.name == "nt": # Windows
+ process = subprocess.Popen(installer_args, shell=True)
+ else:
+ process = subprocess.Popen(installer_args)
+
+ # Wait for installer to complete
+ process.wait()
+
+ if process.returncode != 0:
+ raise RuntimeError(
+ f"FLM installer failed with exit code {process.returncode}"
+ )
+
+ logging.info("FLM installer completed successfully")
+
+ # Refresh environment to pick up new PATH entries
+ refresh_environment()
+
+ # Wait a moment for system to update
+ time.sleep(2)
+
+ # Verify installation
+ max_retries = 10
+ for attempt in range(max_retries):
+ new_version, latest_version = check_flm_version()
+ if new_version and Version(new_version) == Version(latest_version):
+ logging.info("FLM v%s successfully installed and verified", new_version)
+ return
+
+ if attempt < max_retries - 1:
+ logging.debug(
+ "FLM not yet available in PATH, retrying... (attempt %d/%d)",
+ attempt + 1,
+ max_retries,
+ )
+ time.sleep(3)
+ refresh_environment()
+
+ # Final check failed
+ raise RuntimeError(
+ "FLM installation completed but 'flm' command is not available in PATH. "
+ "Please ensure FLM is properly installed and available in your system PATH."
+ )
+
+ except requests.RequestException as e:
+ raise RuntimeError(f"Failed to download FLM installer: {e}") from e
+ except Exception as e:
+ raise RuntimeError(f"FLM installation failed: {e}") from e
+ finally:
+ # Clean up installer file
+ if os.path.exists(installer_path):
+ try:
+ os.remove(installer_path)
+ except OSError:
+ pass # Ignore cleanup errors
+
+
+ def download_flm_model(config_checkpoint, _=None, do_not_upgrade=False) -> dict:
+ """
+ Downloads the FLM model for the given configuration.
+
+ Args:
+ config_checkpoint: name of the FLM model to install.
+ _: placeholder for `config_mmproj`, which is standard
+ for WrappedServer (see llamacpp/utils.py) .
+ do_not_upgrade: whether to re-download the model if it is already
+ available.
+ """
+
+ if do_not_upgrade:
+ command = ["flm", "pull", f"{config_checkpoint}"]
+ else:
+ command = ["flm", "pull", f"{config_checkpoint}", "--force"]
+
+ subprocess.run(command, check=True)
+
+
+ def get_flm_installed_models() -> List[str]:
+ """
+ Parse FLM model list and return installed model checkpoints.
+
+ Returns:
+ List of installed FLM model checkpoints (e.g., ["llama3.2:1b", "gemma3:4b"])
+ """
+ try:
+ result = subprocess.run(
+ ["flm", "list"],
+ capture_output=True,
+ text=True,
+ check=True,
+ encoding="utf-8",
+ errors="replace",
+ )
+
+ # Check if we got valid output
+ if not result.stdout:
+ return []
+
+ installed_checkpoints = []
+
+ lines = result.stdout.strip().split("\n")
+ for line in lines:
+ line = line.strip()
+ if line.startswith("- "):
+ # Remove the leading "- " and parse the model info
+ model_info = line[2:].strip()
+
+ # Check if model is installed (✅)
+ if model_info.endswith(" ✅"):
+ checkpoint = model_info[:-2].strip()
+ installed_checkpoints.append(checkpoint)
+
+ return installed_checkpoints
+
+ except (
+ subprocess.CalledProcessError,
+ FileNotFoundError,
+ AttributeError,
+ NotADirectoryError,
+ ):
+ # FLM not installed, not available, or output parsing failed
+ return []
+
+
+ def is_flm_available() -> bool:
+ """
+ Check if FLM is available and meets minimum version requirements.
+ """
+ current_version, latest_version = check_flm_version()
+ return current_version is not None and Version(current_version) == Version(
+ latest_version
+ )
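A usage sketch for the new FLM helpers follows. It is not part of the package; the import path is assumed from the file location lemonade/tools/flm/utils.py and the model name is illustrative. Note that, despite its Optional[str] annotation, check_flm_version() returns an (installed, latest) tuple, which is how install_flm() and is_flm_available() consume it.

# Usage sketch, not part of the package.
from lemonade.tools.flm.utils import (
    download_flm_model,
    get_flm_installed_models,
    install_flm,
    is_flm_available,
)

if not is_flm_available():
    # Downloads flm-setup.exe and runs it (/VERYSILENT when upgrading an existing
    # install); raises RuntimeError if "flm" never appears on PATH afterwards.
    install_flm()

if "llama3.2:1b" not in get_flm_installed_models():  # illustrative model name
    # do_not_upgrade=True issues a plain "flm pull"; False adds "--force"
    download_flm_model("llama3.2:1b", do_not_upgrade=True)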
lemonade/tools/huggingface/bench.py CHANGED
@@ -1,6 +1,7 @@
  import argparse
  import statistics
  from statistics import StatisticsError
+ import psutil
  from lemonade.state import State
  from lemonade.cache import Keys
  from lemonade.tools.bench import Bench
@@ -75,7 +76,7 @@ class HuggingfaceBench(Bench):
  warmup_iterations: int,
  output_tokens: int,
  num_beams: int = default_beams,
- ) -> State:
+ ):
  """
  We don't have access to the internal timings of generate(), so time to first
  token (TTFT, aka prefill latency) and token/s are calculated using the following formulae:
@@ -176,6 +177,10 @@ class HuggingfaceBench(Bench):
  self.token_generation_tokens_per_second_list.append(
  (mean_token_len - 1) / mean_decode_latency
  )
+ if self.save_max_memory_used:
+ self.max_memory_used_gb_list.append(
+ psutil.Process().memory_info().peak_wset / 1024**3
+ )


  # This file was originally licensed under Apache 2.0. It has been modified.
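The peak_wset field read above is only present in psutil's memory_info() result on Windows, which matches the "max memory used can only be measured on Windows systems" note in bench.py. An illustrative, portable variant (not part of the package) could fall back to rss:

# Illustrative only, not part of the package.
import psutil

info = psutil.Process().memory_info()
# peak_wset is Windows-only; rss (current resident set size) is available everywhere
peak_bytes = getattr(info, "peak_wset", info.rss)
peak_gb = peak_bytes / 1024**3
print(f"Peak (or current) memory: {peak_gb:.2f} GB")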
lemonade/tools/llamacpp/bench.py CHANGED
@@ -2,9 +2,15 @@ import argparse
  import statistics
  from statistics import StatisticsError
  from lemonade.state import State
- from lemonade.cache import Keys
+ from lemonade.tools.tool import Tool
  from lemonade.tools.llamacpp.utils import LlamaCppAdapter
- from lemonade.tools.bench import Bench
+ from lemonade.tools.bench import (
+ Bench,
+ default_prompt_length,
+ default_iterations,
+ default_output_tokens,
+ default_warmup_runs,
+ )


  class LlamaCppBench(Bench):
@@ -14,16 +20,6 @@ class LlamaCppBench(Bench):

  unique_name = "llamacpp-bench"

- def __init__(self):
- super().__init__()
-
- # Additional statistics generated by this bench tool
- self.status_stats.insert(
- self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
- Keys.STD_DEV_TOKENS_PER_SECOND,
- )
- self.std_dev_token_generation_tokens_per_second_list = []
-
  @staticmethod
  def parser(add_help: bool = True) -> argparse.ArgumentParser:
  parser = __class__.helpful_parser(
@@ -33,8 +29,46 @@ class LlamaCppBench(Bench):

  parser = Bench.parser(parser)

+ parser.add_argument(
+ "--cli",
+ action="store_true",
+ help="Set this flag to use llama-cli.exe to benchmark model performance. "
+ "This executable will be called once per iteration. Otherwise, "
+ "llama-bench.exe is used by default. In this default behavior behavior, "
+ "the only valid prompt format is integer token lengths. Also, the "
+ "warmup-iterations parameter is ignored and the default value for number of "
+ "threads is 16.",
+ )
+
  return parser

+ def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+ """
+ Helper function to parse CLI arguments into the args expected by run()
+ """
+
+ # Call Tool parse method, NOT the Bench parse method
+ parsed_args = Tool.parse(self, state, args, known_only)
+
+ if parsed_args.cli:
+ parsed_args = super().parse(state, args, known_only)
+ else:
+ # Make sure prompts is a list of integers
+ if parsed_args.prompts is None:
+ parsed_args.prompts = [default_prompt_length]
+ prompt_ints = []
+ for prompt_item in parsed_args.prompts:
+ if prompt_item.isdigit():
+ prompt_ints.append(int(prompt_item))
+ else:
+ raise Exception(
+ f"When not using the --cli flag to {self.unique_name}, the prompt format "
+ "must be in integer format."
+ )
+ parsed_args.prompts = prompt_ints
+
+ return parsed_args
+
  def run_prompt(
  self,
  state: State,
@@ -43,7 +77,7 @@ class LlamaCppBench(Bench):
  iterations: int,
  warmup_iterations: int,
  output_tokens: int,
- ) -> State:
+ ):
  """
  Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
  """
@@ -61,6 +95,7 @@ class LlamaCppBench(Bench):

  per_iteration_tokens_per_second = []
  per_iteration_time_to_first_token = []
+ per_iteration_peak_wset = []

  for iteration in range(iterations + warmup_iterations):
  try:
@@ -69,7 +104,10 @@
  model.time_to_first_token = None
  model.tokens_per_second = None
  raw_output, stderr = model.generate(
- prompt, max_new_tokens=output_tokens, return_raw=True
+ prompt,
+ max_new_tokens=output_tokens,
+ return_raw=True,
+ save_max_memory_used=self.save_max_memory_used,
  )

  if model.time_to_first_token is None or model.tokens_per_second is None:
@@ -85,6 +123,7 @@
  if iteration > warmup_iterations - 1:
  per_iteration_tokens_per_second.append(model.tokens_per_second)
  per_iteration_time_to_first_token.append(model.time_to_first_token)
+ per_iteration_peak_wset.append(model.peak_wset)

  report_progress_fn((iteration + 1) / (warmup_iterations + iterations))

@@ -115,21 +154,101 @@
  except StatisticsError:
  # Less than 2 measurements
  self.std_dev_token_generation_tokens_per_second_list.append(None)
+ if self.save_max_memory_used:
+ filtered_list = [
+ item for item in per_iteration_peak_wset if item is not None
+ ]
+ mean_gb_used = (
+ None
+ if len(filtered_list) == 0
+ else statistics.mean(filtered_list) / 1024**3
+ )
+ self.max_memory_used_gb_list.append(mean_gb_used)
+
+ def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+ if prompts is None:
+ prompts = [default_prompt_length]
+ elif isinstance(prompts, int):
+ prompts = [prompts]
+
+ state.save_stat("prompts", prompts)
+ state.save_stat("iterations", iterations)
+ state.save_stat("output_tokens", output_tokens)

- def save_stats(self, state):
- super().save_stats(state)
-
- # Save additional statistics
- if not all(
- element is None
- for element in self.std_dev_token_generation_tokens_per_second_list
- ):
- state.save_stat(
- Keys.STD_DEV_TOKENS_PER_SECOND,
- self.get_item_or_list(
- self.std_dev_token_generation_tokens_per_second_list
- ),
+ counter = 0
+ report_progress_fn = lambda x: self.set_percent_progress(
+ 100 * (counter + x) / len(prompts)
+ )
+ self.first_run_prompt = True
+ for counter, prompt in enumerate(prompts):
+ report_progress_fn(0)
+
+ self.run_prompt_llama_bench_exe(
+ state,
+ prompt,
+ iterations,
+ output_tokens,
  )
+ self.first_run_prompt = False
+
+ self.set_percent_progress(None)
+ self.save_stats(state)
+ return state
+
+ def run_prompt_llama_bench_exe(self, state, prompt, iterations, output_tokens):
+
+ model: LlamaCppAdapter = state.model
+ prompt_length, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd, peak_wset = (
+ model.benchmark(prompt, iterations, output_tokens)
+ )
+ self.input_ids_len_list.append(prompt_length)
+ self.prefill_tokens_per_second_list.append(pp_tps)
+ self.std_dev_prefill_tokens_per_second_list.append(pp_tps_sd)
+ self.mean_time_to_first_token_list.append(prompt_length / pp_tps)
+ self.token_generation_tokens_per_second_list.append(tg_tps)
+ self.std_dev_token_generation_tokens_per_second_list.append(tg_tps_sd)
+ self.tokens_out_len_list.append(output_tokens * iterations)
+ if self.save_max_memory_used:
+ if peak_wset is not None:
+ self.max_memory_used_gb_list.append(peak_wset / 1024**3)
+ else:
+ self.max_memory_used_gb_list.append(None)
+
+ def run(
+ self,
+ state: State,
+ prompts: list[str] = None,
+ iterations: int = default_iterations,
+ warmup_iterations: int = default_warmup_runs,
+ output_tokens: int = default_output_tokens,
+ cli: bool = False,
+ **kwargs,
+ ) -> State:
+ """
+ Args:
+ - prompts: List of input prompts used as starting points for LLM text generation
+ - iterations: Number of benchmarking samples to take; results are
+ reported as the median and mean of the samples.
+ - warmup_iterations: Subset of the iterations to treat as warmup,
+ and not included in the results.
+ - output_tokens: Number of new tokens LLM to create.
+ - cli: Use multiple calls to llama-cpp.exe instead of llama-bench.exe
+ - kwargs: Additional parameters used by bench tools
+ """
+
+ # Check that state has the attribute model and it is a LlamaCPP model
+ if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+ raise Exception("Load model using llamacpp-load first.")
+
+ if cli:
+ state = super().run(
+ state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+ )
+ else:
+ state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+ return state


  # This file was originally licensed under Apache 2.0. It has been modified.
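In the default llama-bench.exe path above, time to first token is not timed directly; it is derived from the reported prefill throughput as prompt_length / pp_tps (see run_prompt_llama_bench_exe). A quick worked example with illustrative numbers, not measured values:

# Illustrative arithmetic only, not part of the package.
prompt_length = 256          # synthetic prompt length in tokens
pp_tps = 512.0               # prefill tokens/second reported by llama-bench
mean_ttft = prompt_length / pp_tps
print(f"Derived time to first token: {mean_ttft:.2f} s")  # 0.50 s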