lemonade-sdk 8.1.11__py3-none-any.whl → 8.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic.
- lemonade/cache.py +6 -1
- lemonade/common/status.py +4 -4
- lemonade/common/system_info.py +0 -26
- lemonade/tools/accuracy.py +143 -48
- lemonade/tools/adapter.py +6 -1
- lemonade/tools/bench.py +26 -8
- lemonade/tools/flm/utils.py +70 -22
- lemonade/tools/huggingface/bench.py +6 -1
- lemonade/tools/llamacpp/bench.py +146 -27
- lemonade/tools/llamacpp/load.py +30 -2
- lemonade/tools/llamacpp/utils.py +317 -21
- lemonade/tools/oga/bench.py +5 -26
- lemonade/tools/oga/load.py +49 -123
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/report/table.py +76 -8
- lemonade/tools/server/flm.py +2 -6
- lemonade/tools/server/llamacpp.py +43 -2
- lemonade/tools/server/serve.py +354 -18
- lemonade/tools/server/static/js/chat.js +15 -77
- lemonade/tools/server/static/js/model-settings.js +24 -3
- lemonade/tools/server/static/js/models.js +440 -37
- lemonade/tools/server/static/js/shared.js +61 -8
- lemonade/tools/server/static/logs.html +157 -13
- lemonade/tools/server/static/styles.css +204 -0
- lemonade/tools/server/static/webapp.html +39 -1
- lemonade/version.py +1 -1
- lemonade_install/install.py +33 -579
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +6 -4
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/RECORD +38 -37
- lemonade_server/cli.py +10 -0
- lemonade_server/model_manager.py +172 -11
- lemonade_server/pydantic_models.py +3 -0
- lemonade_server/server_models.json +102 -66
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
lemonade/cache.py
CHANGED
@@ -43,7 +43,11 @@ def build_name(input_name):
     """

     if os.path.isdir(input_name):
+        # Input is a folder so no good way to determine a model name
         input_name_sanitized = "local_model"
+    elif os.path.isfile(input_name):
+        # Use the filename without its extension
+        input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")
@@ -63,8 +67,9 @@ class Keys:
     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
-    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+    STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
     PROMPT = "prompt"
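For reference, a minimal standalone sketch of the name-sanitization logic this change introduces in build_name (the helper name sanitize_input_name and the example paths are illustrative, not part of the package):

    import os

    def sanitize_input_name(input_name: str) -> str:
        # Folders get a generic name, files use their basename without the
        # extension, and anything else is treated as a checkpoint-style name.
        if os.path.isdir(input_name):
            return "local_model"
        elif os.path.isfile(input_name):
            return os.path.splitext(os.path.basename(input_name))[0]
        return input_name.replace("/", "_")

    # e.g. "models/llama-3.2-1b.gguf" -> "llama-3.2-1b" (when that file exists),
    # "meta-llama/Llama-3.2-1B" -> "meta-llama_Llama-3.2-1B"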
lemonade/common/status.py
CHANGED
@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
         if print_file_name:
             print(f"{self.script_name}{self.extension}:")

-        # Print invocation about the model (only applies to scripts, not ONNX files
+        # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
         # LLMs, which have no extension)
         if not (
-            self.extension
+            self.extension in [".onnx", ".gguf"]
             or self.extension == build.state_file_name
             or self.extension == ""
         ):
@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):

         if self.depth == 0:
             print(f"{self.indent}\tLocation:\t{self.file}", end="")
-            if self.extension
+            if self.extension in [".onnx", ".gguf"]:
                 print()
             else:
                 print(f", line {self.line}")
@@ -314,7 +314,7 @@ class UniqueInvocationInfo(BasicInfo):
         Print information about a given model or submodel.
         """

-        if self.extension
+        if self.extension in [".onnx", ".gguf"] or self.extension == "":
             self.indent = "\t" * (2 * self.depth)
         else:
             self.indent = "\t" * (2 * self.depth + 1)
lemonade/common/system_info.py
CHANGED
@@ -1110,32 +1110,6 @@ class LinuxSystemInfo(SystemInfo):

         return ""

-    def _get_nvidia_vram_smi_linux(self) -> float:
-        """
-        Get NVIDIA GPU VRAM on Linux using nvidia-smi command.
-
-        Returns:
-            float: VRAM in GB, or 0.0 if detection fails
-        """
-        try:
-            output = (
-                subprocess.check_output(
-                    "nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits",
-                    shell=True,
-                    stderr=subprocess.DEVNULL,
-                )
-                .decode()
-                .strip()
-            )
-
-            # nvidia-smi returns memory in MB
-            vram_mb = int(output.split("\n")[0])
-            vram_gb = round(vram_mb / 1024, 1)
-            return vram_gb
-        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
-            pass
-        return 0.0
-
     @staticmethod
     def get_processor_name() -> str:
         """
lemonade/tools/accuracy.py
CHANGED
@@ -83,42 +83,116 @@ class LMEvalHarness(Tool):

         return parser

-    def
-        """
+    def _scale_metric(self, metric_name, value):
+        """
+        Scale metric value appropriately based on type and range
+
+        Args:
+            metric_name: Name of the metric (e.g., "acc,none", "ppl")
+            value: Numeric value of the metric
+
+        Returns:
+            tuple: (scaled_value, units, display_string)
+        """
+        fraction_metrics = {
+            "acc",
+            "accuracy",
+            "f1",
+            "exact_match",
+            "em",
+            "win_rate",
+            "recall",
+            "precision",
+            "rouge",
+            "bleu",
+            "meteor",
+            "bertscore",
+            "match",
+            "correct",
+            "pass",
+            "success_rate",
+        }
+
+        metric_base = metric_name.split(",")[0].lower()
+        is_fraction = any(
+            frac_metric in metric_base for frac_metric in fraction_metrics
+        )
+        is_in_unit_range = 0 <= value <= 1

-        if
+        if is_fraction and is_in_unit_range:
+            scaled_value = float(value) * 100
+            units = "%"
+            display_str = f"{value:.4f} ({scaled_value:.2f}%)"
+        else:
+            scaled_value = float(value)
+            units = "raw"
+            display_str = f"{value:.4f}"
+
+        return scaled_value, units, display_str
+
+    def _process_results(self, results_path, state):
+        """
+        Process evaluation results and save to state stats
+
+        Args:
+            results_path: Can be either a direct JSON file path or a directory path
+            state: State object to save metrics to
+        """
+        results_file_path = None
+
+        # Determine if this is a file or directory and find the JSON file
+        if os.path.isfile(results_path) and results_path.endswith(".json"):
+            # Direct JSON file path (modern format)
+            results_file_path = results_path
+        elif os.path.isdir(results_path):
+            # Look for model subdirectories
+            model_dirs = [
+                d
+                for d in os.listdir(results_path)
+                if os.path.isdir(os.path.join(results_path, d))
+            ]
+
+            if model_dirs:
+                # Format: results_dir/model_name/results_*.json
+                model_dir = os.path.join(results_path, model_dirs[0])
+                printing.log_info(f"Found model directory: {model_dir}")
+
+                results_files = [
+                    f
+                    for f in os.listdir(model_dir)
+                    if f.startswith("results_") and f.endswith(".json")
+                ]
+
+                if results_files:
+                    results_files.sort(reverse=True)
+                    results_file_path = os.path.join(model_dir, results_files[0])
+                else:
+                    printing.log_warning(f"No results files found in {model_dir}")
+                    return
+            else:
+                printing.log_warning(f"No model directories found in {results_path}")
+                return
+        else:
+            # Handle case where lm-eval adds timestamp to expected filename
+            results_dir = os.path.dirname(results_path)
+            if os.path.exists(results_dir):
+                json_files = [f for f in os.listdir(results_dir) if f.endswith(".json")]
+                if json_files:
+                    results_file_path = os.path.join(results_dir, json_files[0])
+                    printing.log_info(f"Found results file: {results_file_path}")
+                else:
+                    printing.log_warning(f"No JSON results file found in {results_dir}")
+                    return
+            else:
+                printing.log_warning(f"Results path not found at {results_path}")
+                return
+
+        if not results_file_path or not os.path.exists(results_file_path):
+            printing.log_warning(f"Results file not found at {results_file_path}")
             return

-        # Sort by timestamp
-        results_files.sort(reverse=True)
-        results_file_path = os.path.join(model_dir, results_files[0])
         printing.log_info(f"Processing results from {results_file_path}")

-        # Read and process results
         try:
             with open(results_file_path, "r", encoding="utf-8") as f:
                 results = json.load(f)
@@ -132,18 +206,21 @@ class LMEvalHarness(Tool):
                     if isinstance(value, (int, float)) and not metric.startswith(
                         "alias"
                     ):
-                        # Format metric name for stats
-                        clean_metric = metric.
+                        # Format metric name for stats - remove ,none suffix
+                        clean_metric = metric.split(",")[0]  # Remove ,none suffix
                         stat_name = f"lm_eval_{task_name}_{clean_metric}"

-                        #
+                        # Scale metric appropriately
+                        scaled_value, units, value_str = self._scale_metric(
+                            metric, value
+                        )
+                        display_str = f" {metric}: {value_str}"
+
+                        state.save_stat(stat_name, scaled_value)
+                        state.save_stat(f"{stat_name}_units", units)
                         self.status_stats.append(stat_name)

-                        printing.log_info(
-                            f" {metric}: {value:.4f} ({value*100:.2f}%)"
-                        )
+                        printing.log_info(display_str)

                 # Save summary metrics if available
                 avg_metrics = {}
@@ -167,12 +244,17 @@ class LMEvalHarness(Tool):
                     if values:
                         avg_value = sum(values) / len(values)
                         stat_name = f"lm_eval_average_{metric}"
-                        self.
-                            f"Average {metric}: {avg_value:.4f} ({avg_value*100:.2f}%)"
+
+                        # Apply same scaling logic as individual metrics
+                        scaled_avg, units, value_str = self._scale_metric(
+                            metric, avg_value
                         )
+                        display_str = f"Average {metric}: {value_str}"
+
+                        state.save_stat(stat_name, scaled_avg)
+                        state.save_stat(f"{stat_name}_units", units)
+                        self.status_stats.append(stat_name)
+                        printing.log_info(display_str)

             except (IOError, json.JSONDecodeError) as e:
                 printing.log_error(f"Error processing results: {e}")
@@ -189,6 +271,20 @@ class LMEvalHarness(Tool):
         output_path: Optional[str] = None,
     ) -> State:

+        # Check if lm-eval is available
+        try:
+            # pylint: disable=unused-import
+            import lm_eval
+        except ImportError:
+            error_msg = (
+                "lm-eval-harness is required but not installed. "
+                "Please install it using one of the following commands:\n"
+                " pip install lemonade-sdk[dev]\n"
+                " pip install -e .[dev]\n"
+            )
+            printing.log_error(error_msg)
+            raise ImportError(error_msg)
+
         import requests
         from lemonade.tools.server.utils.thread import ServerRunner

@@ -261,7 +357,7 @@ class LMEvalHarness(Tool):
             raise RuntimeError("Failed to start the server")

         # Build API URL
-        results_file = os.path.join(output_path, f"{task}_results")
+        results_file = os.path.join(output_path, f"{task}_results.json")

         printing.log_info(f"Running lm-eval-harness on {task}...")

@@ -312,9 +408,8 @@ class LMEvalHarness(Tool):
                     "Results obtained successfully but couldn't display due to encoding issues"
                 )

-            # Process results from the
-            self._process_results(results_dir, state)
+            # Process results from the JSON file
+            self._process_results(results_file, state)

         except subprocess.CalledProcessError as e:
             printing.log_error(f"Error running lm-eval-harness: {e}")
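As a quick illustration of the scaling rule that _scale_metric applies, here is a standalone re-implementation sketch (the fraction_metrics set is abridged and the metric values are made up):

    def scale_metric(metric_name: str, value: float):
        # Accuracy-style fractions in [0, 1] are reported as percentages; everything else stays raw.
        fraction_metrics = {"acc", "accuracy", "f1", "exact_match", "em"}
        metric_base = metric_name.split(",")[0].lower()
        if any(m in metric_base for m in fraction_metrics) and 0 <= value <= 1:
            return value * 100, "%", f"{value:.4f} ({value * 100:.2f}%)"
        return float(value), "raw", f"{value:.4f}"

    print(scale_metric("acc,none", 0.8125))  # (81.25, '%', '0.8125 (81.25%)')
    print(scale_metric("ppl", 7.42))         # (7.42, 'raw', '7.4200')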
lemonade/tools/adapter.py
CHANGED
@@ -10,11 +10,14 @@ class ModelAdapter(abc.ABC):
         """
         Self-benchmarking ModelAdapters can store their results in the
         tokens_per_second and time_to_first_token members.
+        ModelAdapters that run generate in a different process can store the
+        peak memory used (bytes) by that process in the peak_wset member.
         """
         self.tokens_per_second = None
         self.time_to_first_token = None
         self.prompt_tokens = None
         self.response_tokens = None
+        self.peak_wset = None

         self.type = "generic"

@@ -27,7 +30,9 @@ class ModelAdapter(abc.ABC):
         with recipe components, which themselves may not support a lot of arguments.

         The generate method should store prompt and response lengths (in tokens)
-        in the prompt_tokens and response_tokens members.
+        in the prompt_tokens and response_tokens members. If a different process is used,
+        the generate method can also store the peak memory used by that process in the
+        peak_wset member.
         """

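The new peak_wset member holds a peak working-set measurement in bytes, the same quantity the benchmarking tools read via psutil (see the HuggingfaceBench change below). A minimal sketch of that measurement, assuming a Windows host since psutil only exposes peak_wset there:

    import os
    import psutil

    proc = psutil.Process()
    if os.name == "nt":
        # Peak working set of the current process, in bytes
        peak_wset_bytes = proc.memory_info().peak_wset
        print(f"Peak memory: {peak_wset_bytes / 1024**3:.2f} GB")
    else:
        print("peak_wset is not available on this platform")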
lemonade/tools/bench.py
CHANGED
@@ -2,7 +2,6 @@ from abc import ABC, abstractmethod
 import argparse
 import os
 import platform
-import psutil
 from lemonade.state import State
 from lemonade.tools import Tool
 from lemonade.cache import Keys
@@ -29,7 +28,9 @@ class Bench(Tool, ABC):
         Keys.SECONDS_TO_FIRST_TOKEN,
         Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
         Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        Keys.STD_DEV_TOKENS_PER_SECOND,
         Keys.PREFILL_TOKENS_PER_SECOND,
+        Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
         Keys.PROMPT_TOKENS,
         Keys.RESPONSE_TOKENS,
         Keys.MAX_MEMORY_USED_GBYTE,
@@ -42,7 +43,9 @@ class Bench(Tool, ABC):
         self.mean_time_to_first_token_list = []
         self.std_dev_time_to_first_token_list = []
         self.prefill_tokens_per_second_list = []
+        self.std_dev_prefill_tokens_per_second_list = []
         self.token_generation_tokens_per_second_list = []
+        self.std_dev_token_generation_tokens_per_second_list = []
         self.max_memory_used_gb_list = []

         # Max memory used can only be measured on Windows systems
@@ -88,7 +91,7 @@ class Bench(Tool, ABC):
             default=[str(default_prompt_length)],
             metavar="PROMPT",
             help="Input one or more prompts to the LLM. Three formats are supported. "
-            "1) integer: use a synthetic prompt with the specified length "
+            "1) integer: use a synthetic prompt with the specified token length "
             "2) str: use a user-provided prompt string "
            "3) path/to/prompt.txt: load the prompt from a text file. "
             f"(default: {default_prompt_length}) ",
@@ -190,11 +193,6 @@ class Bench(Tool, ABC):
             )
             self.first_run_prompt = False

-            if self.save_max_memory_used:
-                self.max_memory_used_gb_list.append(
-                    psutil.Process().memory_info().peak_wset / 1024**3
-                )
-
         self.set_percent_progress(None)
         self.save_stats(state)

@@ -211,7 +209,10 @@ class Bench(Tool, ABC):
         output_tokens,
         **kwargs,
     ):
-
+        """
+        The run_prompt method should append the appropriate value to each of the per prompt
+        measurement statistics lists that are members of the Bench class.
+        """

     @staticmethod
     def get_item_or_list(lst):
@@ -246,10 +247,27 @@ class Bench(Tool, ABC):
             Keys.PREFILL_TOKENS_PER_SECOND,
             self.get_item_or_list(self.prefill_tokens_per_second_list),
         )
+        if not all(
+            element is None for element in self.std_dev_prefill_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+                self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+            )
         state.save_stat(
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             self.get_item_or_list(self.token_generation_tokens_per_second_list),
         )
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
         if self.save_max_memory_used:
             state.save_stat(
                 Keys.MAX_MEMORY_USED_GBYTE,
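The std-dev stats added above are only persisted when at least one measurement was actually recorded; a small sketch of the all-None guard used in save_stats (the list values are made up):

    std_devs = [None, None, 1.7]  # per-prompt std-dev values; None when a backend cannot report one

    # Mirrors the guard in Bench.save_stats: skip the stat entirely if nothing was measured
    if not all(element is None for element in std_devs):
        print("saving std_dev_prefill_tokens_per_second:", std_devs)
    else:
        print("no std-dev measurements; stat not saved")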
lemonade/tools/flm/utils.py
CHANGED
@@ -10,16 +10,46 @@ import time
 from typing import List, Optional

 import requests
-from packaging.version import Version
+from packaging.version import Version, InvalidVersion


+def get_flm_latest_version() -> Optional[str]:
+    """
+    Get and return the latest FLM version from "https://github.com/FastFlowLM/FastFlowLM/tags"
+    This uses the GitHub tags API.
+    """
+    url = "https://api.github.com/repos/FastFlowLM/FastFlowLM/tags"
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        tags = response.json()
+        if not tags:
+            return None
+        # Tags are sorted in reverse chronological order; find the first that looks like a version
+        for tag in tags:
+            tag_name = tag.get("name", "")
+            # Accept tags of the form v0.9.10, 0.9.10, etc.
+            if tag_name.startswith("v"):
+                version_candidate = tag_name[1:]
+            else:
+                version_candidate = tag_name
+            try:
+                # validate it's a version string
+                _ = Version(version_candidate)
+                return version_candidate
+            except InvalidVersion:
+                continue
+        return None
+    except requests.exceptions.RequestException as e:
+        logging.debug("Error retrieving latest FLM version: %s", e)
+        return None


 def check_flm_version() -> Optional[str]:
     """
     Check if FLM is installed and return version, or None if not available.
     """
+    latest_version_str = get_flm_latest_version()
     try:
         result = subprocess.run(
             ["flm", "version"],
@@ -34,11 +64,11 @@ def check_flm_version() -> Optional[str]:
         output = result.stdout.strip()
         if output.startswith("FLM v"):
             version_str = output[5:]  # Remove "FLM v" prefix
-            return version_str
-        return None
+            return version_str, latest_version_str
+        return None, latest_version_str

     except (subprocess.CalledProcessError, FileNotFoundError):
-        return None
+        return None, latest_version_str


 def refresh_environment():
@@ -76,31 +106,42 @@ def install_flm():
     If not, download and run the GUI installer, then wait for completion.
     """
     # Check current FLM installation
-    current_version = check_flm_version()
+    current_version, latest_version = check_flm_version()

-    if
+    if (
+        current_version
+        and latest_version
+        and Version(current_version) == Version(latest_version)
+    ):
         logging.info(
-            "FLM v%s is already installed and
+            "FLM v%s is already installed and is up to date (latest version: v%s).",
             current_version,
+            latest_version,
         )
         return

     if current_version:
+        if not latest_version:
+            logging.info(
+                "Unable to detect the latest FLM version; continuing with installed FLM v%s.",
+                current_version,
+            )
+            return
         logging.info(
-            "FLM v%s is installed but below
+            "FLM v%s is installed but below latest version v%s. Upgrading...",
             current_version,
+            latest_version,
         )
+        verysilent = True
     else:
-        logging.info(
-        )
+        logging.info("FLM not found. Installing FLM v%s or later...", latest_version)
+        verysilent = False

     # Download the installer
     # pylint: disable=line-too-long
     installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
     installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+    installer_args = [installer_path, "/VERYSILENT"] if verysilent else [installer_path]

     try:
         # Remove existing installer if present
@@ -123,13 +164,15 @@ def install_flm():
         # Launch the installer GUI
         logging.warning(
             "Launching FLM installer GUI. Please complete the installation..."
+            if not verysilent
+            else "Installing FLM..."
         )

         # Launch installer and wait for it to complete
         if os.name == "nt":  # Windows
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args, shell=True)
         else:
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args)

         # Wait for installer to complete
         process.wait()
@@ -150,8 +193,8 @@ def install_flm():
         # Verify installation
         max_retries = 10
         for attempt in range(max_retries):
-            new_version = check_flm_version()
-            if new_version and Version(new_version)
+            new_version, latest_version = check_flm_version()
+            if new_version and Version(new_version) == Version(latest_version):
                 logging.info("FLM v%s successfully installed and verified", new_version)
                 return

@@ -240,7 +283,12 @@ def get_flm_installed_models() -> List[str]:

         return installed_checkpoints

-    except (
+    except (
+        subprocess.CalledProcessError,
+        FileNotFoundError,
+        AttributeError,
+        NotADirectoryError,
+    ):
         # FLM not installed, not available, or output parsing failed
         return []

@@ -249,7 +297,7 @@ def is_flm_available() -> bool:
     """
     Check if FLM is available and meets minimum version requirements.
     """
-    current_version = check_flm_version()
-    return current_version is not None and Version(current_version)
+    current_version, latest_version = check_flm_version()
+    return current_version is not None and Version(current_version) == Version(
+        latest_version
     )

lemonade/tools/huggingface/bench.py
CHANGED
@@ -1,6 +1,7 @@
 import argparse
 import statistics
 from statistics import StatisticsError
+import psutil
 from lemonade.state import State
 from lemonade.cache import Keys
 from lemonade.tools.bench import Bench
@@ -75,7 +76,7 @@ class HuggingfaceBench(Bench):
         warmup_iterations: int,
         output_tokens: int,
         num_beams: int = default_beams,
-    )
+    ):
         """
         We don't have access to the internal timings of generate(), so time to first
         token (TTFT, aka prefill latency) and token/s are calculated using the following formulae:
@@ -176,6 +177,10 @@ class HuggingfaceBench(Bench):
         self.token_generation_tokens_per_second_list.append(
             (mean_token_len - 1) / mean_decode_latency
         )
+        if self.save_max_memory_used:
+            self.max_memory_used_gb_list.append(
+                psutil.Process().memory_info().peak_wset / 1024**3
+            )


 # This file was originally licensed under Apache 2.0. It has been modified.
|