PyPI - lemonade-sdk - Versions diffs - 7.0.4__py3-none-any.whl → 8.0.1__py3-none-any.whl - Mend

lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lemonade-sdk might be problematic. Click here for more details.

Files changed (56) hide show

lemonade/api.py +3 -3
lemonade/cli.py +11 -17
lemonade/common/build.py +0 -47
lemonade/common/network.py +50 -0
lemonade/common/status.py +2 -21
lemonade/common/system_info.py +19 -4
lemonade/profilers/memory_tracker.py +3 -1
lemonade/tools/accuracy.py +3 -4
lemonade/tools/adapter.py +1 -2
lemonade/tools/{huggingface_bench.py → huggingface/bench.py} +2 -87
lemonade/tools/huggingface/load.py +235 -0
lemonade/tools/{huggingface_load.py → huggingface/utils.py} +87 -255
lemonade/tools/humaneval.py +9 -3
lemonade/tools/{llamacpp_bench.py → llamacpp/bench.py} +1 -1
lemonade/tools/{llamacpp.py → llamacpp/load.py} +18 -2
lemonade/tools/mmlu.py +7 -15
lemonade/tools/{ort_genai/oga.py → oga/load.py} +31 -422
lemonade/tools/oga/utils.py +423 -0
lemonade/tools/perplexity.py +4 -3
lemonade/tools/prompt.py +2 -1
lemonade/tools/quark/quark_load.py +2 -1
lemonade/tools/quark/quark_quantize.py +5 -5
lemonade/tools/report/table.py +3 -3
lemonade/tools/server/llamacpp.py +188 -45
lemonade/tools/server/serve.py +184 -146
lemonade/tools/server/static/favicon.ico +0 -0
lemonade/tools/server/static/styles.css +568 -0
lemonade/tools/server/static/webapp.html +439 -0
lemonade/tools/server/tray.py +458 -0
lemonade/tools/server/{port_utils.py → utils/port.py} +22 -3
lemonade/tools/server/utils/system_tray.py +395 -0
lemonade/tools/server/{instructions.py → webapp.py} +4 -10
lemonade/version.py +1 -1
lemonade_install/install.py +46 -28
lemonade_sdk-8.0.1.dist-info/METADATA +179 -0
lemonade_sdk-8.0.1.dist-info/RECORD +70 -0
lemonade_server/cli.py +182 -27
lemonade_server/model_manager.py +192 -20
lemonade_server/pydantic_models.py +9 -4
lemonade_server/server_models.json +5 -3
lemonade/common/analyze_model.py +0 -26
lemonade/common/labels.py +0 -61
lemonade/common/onnx_helpers.py +0 -176
lemonade/common/plugins.py +0 -10
lemonade/common/tensor_helpers.py +0 -83
lemonade/tools/server/static/instructions.html +0 -262
lemonade_sdk-7.0.4.dist-info/METADATA +0 -113
lemonade_sdk-7.0.4.dist-info/RECORD +0 -69
/lemonade/tools/{ort_genai → oga}/__init__.py +0 -0
/lemonade/tools/{ort_genai/oga_bench.py → oga/bench.py} +0 -0
/lemonade/tools/server/{thread_utils.py → utils/thread.py} +0 -0
{lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/WHEEL +0 -0
{lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/entry_points.txt +0 -0
{lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/licenses/LICENSE +0 -0
{lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/licenses/NOTICE.md +0 -0
{lemonade_sdk-7.0.4.dist-info → lemonade_sdk-8.0.1.dist-info}/top_level.txt +0 -0

lemonade/api.py CHANGED Viewed

@@ -57,7 +57,7 @@ def from_pretrained(
         # Huggingface supports all checkpoints, so there is nothing to check for
         import torch
-        from lemonade.tools.huggingface_load import HuggingfaceLoad
+        from lemonade.tools.huggingface.load import HuggingfaceLoad
         state = _make_state(recipe, checkpoint)
@@ -73,7 +73,7 @@ def from_pretrained(
         # Huggingface Transformers recipe for discrete GPU (Nvidia, Instinct, Radeon)
         import torch
-        from lemonade.tools.huggingface_load import HuggingfaceLoad
+        from lemonade.tools.huggingface.load import HuggingfaceLoad
         state = _make_state(recipe, checkpoint)
@@ -87,7 +87,7 @@ def from_pretrained(
         return state.model, state.tokenizer
     elif recipe.startswith("oga-"):
-        import lemonade.tools.ort_genai.oga as oga
+        import lemonade.tools.oga.load as oga
         # Make sure the user chose a supported runtime, e.g., oga-cpu
         user_backend = recipe.split("oga-")[1]

lemonade/cli.py CHANGED Viewed

@@ -1,4 +1,8 @@
 import os
+# pylint: disable=C0413
+# Prevent HF warnings from showing on every import
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
 from lemonade.version import __version__ as version_number
 from lemonade.tools import FirstTool, NiceHelpFormatter
 from lemonade.profilers.memory_tracker import MemoryTracker
@@ -8,12 +12,12 @@ from lemonade.sequence import Sequence
 from lemonade.tools.management_tools import Cache, Version, SystemInfo
 from lemonade.state import State
-from lemonade.tools.huggingface_load import HuggingfaceLoad
-from lemonade.tools.huggingface_bench import HuggingfaceBench
-from lemonade.tools.ort_genai.oga_bench import OgaBench
-from lemonade.tools.llamacpp_bench import LlamaCppBench
-from lemonade.tools.llamacpp import LoadLlamaCpp
+from lemonade.tools.huggingface.load import HuggingfaceLoad
+from lemonade.tools.huggingface.bench import HuggingfaceBench
+from lemonade.tools.oga.load import OgaLoad
+from lemonade.tools.oga.bench import OgaBench
+from lemonade.tools.llamacpp.bench import LlamaCppBench
+from lemonade.tools.llamacpp.load import LoadLlamaCpp
 import lemonade.cache as cache
 from lemonade.tools.mmlu import AccuracyMMLU
@@ -24,7 +28,6 @@ from lemonade.tools.prompt import LLMPrompt
 from lemonade.tools.quark.quark_load import QuarkLoad
 from lemonade.tools.quark.quark_quantize import QuarkQuantize
 from lemonade.tools.report.llm_report import LemonadeReport
-from lemonade.tools.server.serve import Server
 def main():
@@ -40,26 +43,17 @@ def main():
         LMEvalHarness,
         LLMPrompt,
         HuggingfaceBench,
+        OgaLoad,
         OgaBench,
         QuarkQuantize,
         QuarkLoad,
         LemonadeReport,
-        Server,
         # Inherited from lemonade
         Cache,
         Version,
         SystemInfo,
     ]
-    # Import onnxruntime-genai recipes
-    try:
-        from lemonade.tools.ort_genai.oga import OgaLoad
-        tools = tools + [OgaLoad]
-    except ModuleNotFoundError:
-        pass
     # List the available profilers
     profilers = [MemoryTracker]

lemonade/common/build.py CHANGED Viewed

@@ -6,8 +6,6 @@ from typing import Dict
 import hashlib
 import psutil
 import yaml
-import torch
-import numpy as np
 import lemonade.common.exceptions as exp
 state_file_name = "state.yaml"
@@ -101,51 +99,6 @@ def unique_id():
     return hashlib.sha256(f"{pid}{start_time}".encode()).hexdigest()
-def get_shapes_and_dtypes(inputs: dict):
-    """
-    Return the shape and data type of each value in the inputs dict
-    """
-    shapes = {}
-    dtypes = {}
-    for key in sorted(inputs):
-        value = inputs[key]
-        if isinstance(
-            value,
-            (list, tuple),
-        ):
-            for v, i in zip(value, range(len(value))):
-                if isinstance(v, (list, tuple)):
-                    # Handle nested lists/tuples, for example past_key_values
-                    # in an LLM that has KV-caching enabled
-                    for v2, i2 in zip(v, range(len(v))):
-                        subsubkey = f"{key}[{i}][{i2}]"
-                        shapes[subsubkey] = np.array(v2).shape
-                        dtypes[subsubkey] = np.array(v2).dtype.name
-                else:
-                    # Handle single list/tuple
-                    subkey = f"{key}[{i}]"
-                    shapes[subkey] = np.array(v).shape
-                    dtypes[subkey] = np.array(v).dtype.name
-        elif torch.is_tensor(value):
-            shapes[key] = np.array(value.detach()).shape
-            dtypes[key] = np.array(value.detach()).dtype.name
-        elif isinstance(value, np.ndarray):
-            shapes[key] = value.shape
-            dtypes[key] = value.dtype.name
-        elif isinstance(value, (bool, int, float)):
-            shapes[key] = (1,)
-            dtypes[key] = type(value).__name__
-        elif value is None:
-            pass
-        else:
-            raise exp.Error(
-                "One of the provided inputs contains the unsupported "
-                f' type {type(value)} at key "{key}".'
-            )
-    return shapes, dtypes
 class Logger:
     """
     Redirects stdout to file (and console if needed)

lemonade/common/network.py ADDED Viewed

@@ -0,0 +1,50 @@
+import os
+from typing import Optional
+import socket
+from huggingface_hub import model_info
+def is_offline():
+    """
+    Check if the system is offline by attempting to connect to huggingface.co.
+    Returns:
+        bool: True if the system is offline (cannot connect to huggingface.co),
+              False otherwise.
+    """
+    if os.environ.get("LEMONADE_OFFLINE"):
+        return True
+    try:
+        socket.gethostbyname("huggingface.co")
+        return False
+    except socket.gaierror:
+        return True
+def get_base_model(checkpoint: str) -> Optional[str]:
+    """
+    Get the base model information for a given checkpoint from the Hugging Face Hub.
+    Will auto-detect if we're offline and skip the network call in that case.
+    Args:
+        checkpoint: The model checkpoint to query
+    Returns:
+        The base model name if found, or None if not found or error occurs
+    """
+    # Skip network call in offline mode
+    if is_offline():
+        return None
+    try:
+        info = model_info(checkpoint)
+        if info.cardData and "base_model" in info.cardData:
+            if info.cardData["base_model"] is not None:
+                # This is a derived model
+                return info.cardData["base_model"]
+            else:
+                # This is itself a base model
+                return [checkpoint]
+    except Exception:  # pylint: disable=broad-except
+        pass
+    return None

lemonade/common/status.py CHANGED Viewed

@@ -7,12 +7,10 @@ import dataclasses
 from typing import Callable, List, Union, Dict, Optional
 import textwrap
 import psutil
-import torch
 from lemonade.common import printing
 from lemonade.state import State
 import lemonade.common.build as build
 import lemonade.common.filesystem as fs
-import lemonade.common.analyze_model as analyze_model
 def _pretty_print_key(key: str) -> str:
@@ -64,7 +62,6 @@ class SkipFields:
     file_name: bool = False
     model_name: bool = False
-    parameters: bool = False
     location: bool = False
     input_shape: bool = False
     build_dir: bool = False
@@ -147,18 +144,6 @@ class UniqueInvocationInfo(BasicInfo):
                 print(f", line {self.line}")
             self.skip.location = True
-    def _print_parameters(self):
-        if self.skip.parameters or self.params is None:
-            return
-        # Display number of parameters and size
-        parameters_size = parameters_to_size(self.params)
-        print(
-            f"{self.indent}\tParameters:\t{'{:,}'.format(self.params)} ({parameters_size})"
-        )
-        self.skip.parameters = True
     def _print_unique_input_shape(
         self,
         exec_time_formatted: str,
@@ -348,7 +333,6 @@ class UniqueInvocationInfo(BasicInfo):
         if (self.depth == 0 and not model_visited) or (self.depth != 0):
             # Print this information only once per model
             self._print_location()
-            self._print_parameters()
         self._print_unique_input_shape(
             exec_time_formatted, invocation_idx, multiple_unique_invocations
         )
@@ -362,16 +346,13 @@ class UniqueInvocationInfo(BasicInfo):
 @dataclasses.dataclass
 class ModelInfo(BasicInfo):
-    model: torch.nn.Module = None
+    model: str = None
     old_forward: Union[Callable, None] = None
     unique_invocations: Union[Dict[str, UniqueInvocationInfo], None] = (
         dataclasses.field(default_factory=dict)
     )
     last_unique_invocation_executed: Union[str, None] = None
-    def __post_init__(self):
-        self.params = analyze_model.count_parameters(self.model)
 def recursive_print(
     models_found: Dict[str, ModelInfo],
@@ -447,7 +428,7 @@ def stop_logger_forward() -> None:
 def add_to_state(
     state: State,
     name: str,
-    model: Union[str, torch.nn.Module],
+    model: str,
     extension: str = "",
     input_shapes: Optional[Dict] = None,
 ):

lemonade/common/system_info.py CHANGED Viewed

@@ -3,6 +3,7 @@ import importlib.metadata
 import platform
 import re
 import subprocess
+import ctypes
 class SystemInfo(ABC):
@@ -184,11 +185,25 @@ class WindowsSystemInfo(SystemInfo):
             str: Windows power setting.
         """
         try:
-            out = subprocess.check_output(["powercfg", "/getactivescheme"]).decode()
-            return re.search(r"\((.*?)\)", out).group(1)
+            # Capture output as bytes
+            out_bytes = subprocess.check_output(["powercfg", "/getactivescheme"])
+            # Get system's OEM code page (e.g., cp437, cp850)
+            oem_cp = "cp" + str(ctypes.windll.kernel32.GetOEMCP())
+            # Decode using detected OEM code page
+            out = out_bytes.decode(oem_cp)
+            # Extract power scheme name from parentheses
+            match = re.search(r"\((.*?)\)", out)
+            if match:
+                return match.group(1)
+            return "Power scheme name not found in output"
         except subprocess.CalledProcessError:
-            pass
-        return "Windows power setting not found"
+            return "Windows power setting not found (command failed)"
+        except Exception as e:  # pylint: disable=broad-except
+            return f"Error retrieving power setting: {str(e)}"
     def get_dict(self) -> dict:
         """

lemonade/profilers/memory_tracker.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 import time
 import textwrap
 from multiprocessing import Process, Queue
-import matplotlib.pyplot as plt
 import psutil
 import yaml
 import lemonade.common.filesystem as fs
@@ -101,6 +100,9 @@ class MemoryTracker(Profiler):
             self.tracking_active = False
     def generate_results(self, state, timestamp, _):
+        import matplotlib.pyplot as plt
         if self.tracker_process is None:
             return

lemonade/tools/accuracy.py CHANGED Viewed

@@ -7,15 +7,11 @@ import sys
 import time
 from typing import Optional
-import requests
 from lemonade.state import State
 from lemonade.tools import Tool
 import lemonade.common.printing as printing
 import lemonade.common.build as build
-from lemonade.tools.server.thread_utils import ServerRunner
 def is_port_in_use(port, host="localhost"):
     """
@@ -193,6 +189,9 @@ class LMEvalHarness(Tool):
         output_path: Optional[str] = None,
     ) -> State:
+        import requests
+        from lemonade.tools.server.utils.thread import ServerRunner
         model = state.model
         tokenizer = state.tokenizer

lemonade/tools/adapter.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import abc
-from transformers import AutoTokenizer
 class ModelAdapter(abc.ABC):
@@ -31,7 +30,7 @@ class TokenizerAdapter(abc.ABC):
     Base class for adapting an LLM's tokenizer to work with lemonade's standard tools
     """
-    def __init__(self, tokenizer: AutoTokenizer = None):
+    def __init__(self, tokenizer=None):
         self.auto_tokenizer = tokenizer
     @abc.abstractmethod

lemonade/tools/{huggingface_bench.py → huggingface/bench.py} RENAMED Viewed

@@ -1,10 +1,6 @@
 import argparse
-from typing import List, Tuple
-import time
 import statistics
 from statistics import StatisticsError
-from contextlib import nullcontext
-import torch
 from lemonade.state import State
 from lemonade.cache import Keys
 from lemonade.tools.bench import Bench
@@ -12,89 +8,6 @@ from lemonade.tools.bench import Bench
 default_beams = 1
-def benchmark_huggingface_llm(
-    model: torch.nn.Module,
-    tokenizer,
-    input_ids,
-    dtype,
-    num_beams: int,
-    target_output_tokens: int,
-    iterations: int,
-    warmup_iterations: int,
-    report_progress_fn,
-) -> List[Tuple[float, int]]:
-    amp_enabled = True if (dtype == torch.float16 or dtype == torch.bfloat16) else False
-    # The "if amp_enabled else nullcontext()" is to get around a bug in PyTorch 2.1
-    # where torch.cpu.amp.autocast(enabled=False) does nothing
-    with (
-        torch.cpu.amp.autocast(enabled=amp_enabled, dtype=dtype)
-        if amp_enabled
-        else nullcontext()
-    ):
-        per_iteration_result = []
-        tokens_out_len_list = []
-        # Early stopping is only a valid parameter with multiple beams
-        early_stopping = num_beams > 1
-        with torch.no_grad(), torch.inference_mode():
-            # Don't capture time for warmup
-            for count in range(warmup_iterations):
-                outputs = model.generate(
-                    input_ids,
-                    num_beams=num_beams,
-                    max_new_tokens=target_output_tokens,
-                    min_new_tokens=target_output_tokens,
-                    early_stopping=early_stopping,
-                    pad_token_id=tokenizer.eos_token_id,
-                )
-                tokens_out_len_list.append(outputs.shape[1] - input_ids.shape[1])
-                report_progress_fn((count + 1) / (warmup_iterations + iterations))
-            for count in range(iterations):
-                # CUDA synchronization is required prior to GPU benchmarking
-                # This has no negative effect on CPU-only benchmarks, and is more robust than
-                # checking `model.device == "cuda"` since it applies to multi-GPU environments
-                # Synchronization is done before collecting the start time because this will
-                # ensure that the GPU has finished initialization tasks such as loading weights
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                start_time = time.perf_counter()
-                outputs = model.generate(
-                    input_ids,
-                    num_beams=num_beams,
-                    max_new_tokens=target_output_tokens,
-                    min_new_tokens=target_output_tokens,
-                    early_stopping=early_stopping,
-                    pad_token_id=tokenizer.eos_token_id,
-                )
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                end_time = time.perf_counter()
-                latency = end_time - start_time
-                token_len = outputs.shape[1] - input_ids.shape[1]
-                tokens_out_len_list.append(token_len)
-                # Only count an iteration if it produced enough tokens
-                if token_len >= target_output_tokens:
-                    per_iteration_result.append((latency, token_len))
-                report_progress_fn(
-                    (warmup_iterations + count + 1) / (warmup_iterations + iterations)
-                )
-        if not per_iteration_result:
-            raise Bench.not_enough_tokens(target_output_tokens)
-    return per_iteration_result, tokens_out_len_list
 class HuggingfaceBench(Bench):
     """
     Benchmarks the performance of the generate() method of an LLM loaded from
@@ -171,6 +84,8 @@ class HuggingfaceBench(Bench):
             tokens_per_second = (new_tokens - 1) / (execution_latency - prefill_latency)
         """
+        from lemonade.tools.huggingface.utils import benchmark_huggingface_llm
         if self.first_run_prompt:
             if vars(state).get(Keys.MODEL) is None:
                 raise ValueError(

lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.1__py3-none-any.whl

Potentially problematic release.

lemonade-sdk 7.0.4__py3-none-any.whl → 8.0.1__py3-none-any.whl