lemonade-sdk 8.0.4-py3-none-any.whl → 8.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lemonade/api.py +50 -0
- lemonade/cache.py +3 -1
- lemonade/common/inference_engines.py +415 -0
- lemonade/common/system_info.py +493 -47
- lemonade/tools/adapter.py +6 -0
- lemonade/tools/huggingface/utils.py +6 -5
- lemonade/tools/llamacpp/bench.py +26 -46
- lemonade/tools/llamacpp/load.py +104 -196
- lemonade/tools/llamacpp/utils.py +612 -0
- lemonade/tools/management_tools.py +53 -7
- lemonade/tools/oga/bench.py +5 -6
- lemonade/tools/oga/utils.py +8 -2
- lemonade/tools/prompt.py +17 -25
- lemonade/tools/report/table.py +12 -9
- lemonade/tools/server/llamacpp.py +80 -92
- lemonade/tools/server/serve.py +32 -0
- lemonade/tools/server/static/styles.css +137 -58
- lemonade/tools/server/static/webapp.html +34 -8
- lemonade/tools/server/tray.py +7 -0
- lemonade/version.py +1 -1
- lemonade_sdk-8.0.6.dist-info/METADATA +295 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/RECORD +30 -28
- lemonade_server/cli.py +168 -22
- lemonade_server/model_manager.py +4 -148
- lemonade_server/server_models.json +11 -0
- lemonade_sdk-8.0.4.dist-info/METADATA +0 -176
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/WHEEL +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/entry_points.txt +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/LICENSE +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/licenses/NOTICE.md +0 -0
- {lemonade_sdk-8.0.4.dist-info → lemonade_sdk-8.0.6.dist-info}/top_level.txt +0 -0
lemonade/tools/adapter.py
CHANGED
@@ -13,6 +13,9 @@ class ModelAdapter(abc.ABC):
         """
         self.tokens_per_second = None
         self.time_to_first_token = None
+        self.prompt_tokens = None
+        self.response_tokens = None
+
         self.type = "generic"
 
     @abc.abstractmethod
@@ -22,6 +25,9 @@ class ModelAdapter(abc.ABC):
 
         We try to keep the signature here minimal to allow for maximum compatibility
         with recipe components, which themselves may not support a lot of arguments.
+
+        The generate method should store prompt and response lengths (in tokens)
+        in the prompt_tokens and response_tokens members.
         """
 
 
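The docstring added above establishes a small contract for adapters: generate() should record the prompt and response lengths, in tokens, on the new prompt_tokens and response_tokens members. Below is a minimal sketch of an adapter honoring that contract; EchoAdapter and its whitespace "tokenizer" are made up for illustration and are not part of the lemonade SDK.

# Minimal sketch of the prompt_tokens / response_tokens contract described above.
# EchoAdapter and its whitespace "tokenizer" are illustrative stand-ins.
import abc


class ModelAdapter(abc.ABC):
    def __init__(self):
        self.tokens_per_second = None
        self.time_to_first_token = None
        self.prompt_tokens = None
        self.response_tokens = None

    @abc.abstractmethod
    def generate(self, input_ids, **kwargs):
        ...


class EchoAdapter(ModelAdapter):
    def generate(self, input_ids, **kwargs):
        # Treat each whitespace-separated word as one "token"
        prompt_tokens = input_ids.split()
        response = ["hello", "world"]

        # The contract: store prompt and response lengths (in tokens)
        self.prompt_tokens = len(prompt_tokens)
        self.response_tokens = len(response)
        return [" ".join(prompt_tokens + response)]


adapter = EchoAdapter()
adapter.generate("What is the capital of France?")
print(adapter.prompt_tokens, adapter.response_tokens)  # 6 2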
lemonade/tools/huggingface/utils.py
CHANGED
@@ -108,7 +108,9 @@ class HuggingfaceAdapter(ModelAdapter):
         with torch.no_grad(), torch.inference_mode():
             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
 
-
+        self.prompt_tokens = input_ids.shape[1]
+        self.response_tokens = len(outputs[0]) - self.prompt_tokens
+        return outputs
 
     def _model_call(self, input_tensor):
         """Forward pass through the model to get logits
@@ -341,12 +343,11 @@ def benchmark_huggingface_llm(
 
         latency = end_time - start_time
 
-
-        tokens_out_len_list.append(token_len)
+        tokens_out_len_list.append(model.response_tokens)
 
         # Only count an iteration if it produced enough tokens
-        if …
-            per_iteration_result.append((latency, …
+        if model.response_tokens >= target_output_tokens:
+            per_iteration_result.append((latency, model.response_tokens))
 
         report_progress_fn(
             (warmup_iterations + count + 1) / (warmup_iterations + iterations)
lemonade/tools/llamacpp/bench.py
CHANGED
@@ -3,27 +3,31 @@ import statistics
 from statistics import StatisticsError
 from lemonade.state import State
 from lemonade.cache import Keys
-from lemonade.tools.llamacpp.…
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
 from lemonade.tools.bench import Bench
 
 
 class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
 
-    unique_name = "…
+    unique_name = "llamacpp-bench"
 
     def __init__(self):
         super().__init__()
 
         # Additional statistics generated by this bench tool
-        self.status_stats…
+        self.status_stats.insert(
+            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
             Keys.STD_DEV_TOKENS_PER_SECOND,
-
+        )
         self.std_dev_token_generation_tokens_per_second_list = []
 
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Benchmark…
+            short_description="Benchmark an LLM in llama.cpp",
             add_help=add_help,
         )
 
@@ -53,38 +57,20 @@ class LlamaCppBench(Bench):
                 f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
                 "loaded first. Please run load-llama-cpp before this tool."
             )
+        model: LlamaCppAdapter = state.model
 
-
-
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
 
         for iteration in range(iterations + warmup_iterations):
             try:
                 # Use the adapter's generate method which already has the timeout
                 # and error handling
-
-
-
-
-
-                input_tokens = None
-
-                # Look for timing in both stdout and stderr
-                for output in [raw_output, stderr]:
-                    for line in output.splitlines():
-                        if "llama_perf_context_print: eval time =" in line:
-                            parts = line.split("(")[1].strip()
-                            parts = parts.split(",")
-                            ms_per_token = float(
-                                parts[0].split("ms per token")[0].strip()
-                            )
-                        if "llama_perf_context_print: prompt eval time =" in line:
-                            parts = line.split("=")[1].split("/")
-                            time_to_first_token_ms = float(
-                                parts[0].split("ms")[0].strip()
-                            )
-                            input_tokens = int(parts[1].split("tokens")[0].strip())
-
-                if ms_per_token is None or time_to_first_token_ms is None:
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(prompt, return_raw=True)
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
                     error_msg = (
                         "Could not find timing information in llama.cpp output.\n"
                     )
@@ -92,17 +78,11 @@ class LlamaCppBench(Bench):
                     error_msg += "Stderr:\n" + stderr
                     raise Exception(error_msg)
 
-
-                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
-                # as performance data for generating a few tokens is not relevant.
-                tokens_per_second = 0
-                if output_tokens > 5 and ms_per_token > 0:
-                    tokens_per_second = 1000 / ms_per_token
-                time_to_first_token = time_to_first_token_ms / 1000
+                self.tokens_out_len_list.append(model.response_tokens)
 
                 if iteration > warmup_iterations - 1:
-
-
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
 
                 report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
 
@@ -110,25 +90,25 @@
             error_msg = f"Failed to run benchmark: {str(e)}"
             raise Exception(error_msg)
 
-        self.input_ids_len_list.append(…
-        mean_time_to_first_token = statistics.mean(…
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
         self.prefill_tokens_per_second_list.append(
-
+            model.prompt_tokens / mean_time_to_first_token
         )
         self.token_generation_tokens_per_second_list.append(
-            statistics.mean(…
+            statistics.mean(per_iteration_tokens_per_second)
         )
         try:
             self.std_dev_time_to_first_token_list.append(
-                statistics.stdev(…
+                statistics.stdev(per_iteration_time_to_first_token)
             )
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_time_to_first_token_list.append(None)
         try:
             self.std_dev_token_generation_tokens_per_second_list.append(
-                statistics.stdev(…
+                statistics.stdev(per_iteration_tokens_per_second)
             )
         except StatisticsError:
             # Less than 2 measurements
lemonade/tools/llamacpp/load.py
CHANGED
@@ -1,166 +1,22 @@
 import argparse
 import os
-
-import subprocess
-from lemonade.state import State
+import lemonade.common.printing as printing
 import lemonade.common.status as status
+from lemonade.state import State
 from lemonade.tools import FirstTool
-from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter
 from lemonade.cache import Keys
 
 
-class LlamaCppAdapter(ModelAdapter):
-    def __init__(
-        self, model, output_tokens, context_size, threads, executable, lib_dir=None
-    ):
-        super().__init__()
-
-        self.model = os.path.normpath(model)
-        self.output_tokens = output_tokens
-        self.context_size = context_size
-        self.threads = threads
-        self.executable = os.path.normpath(executable)
-        self.lib_dir = lib_dir
-
-    def generate(
-        self,
-        input_ids: str,
-        max_new_tokens: Optional[int] = None,
-        temperature: float = 0.8,
-        top_p: float = 0.95,
-        top_k: int = 40,
-        return_raw: bool = False,
-        **kwargs,  # pylint: disable=unused-argument
-    ):
-        """
-        Pass a text prompt into the llamacpp inference CLI.
-
-        The input_ids arg here should receive the original text that
-        would normally be encoded by a tokenizer.
-
-        Args:
-            input_ids: The input text prompt
-            max_new_tokens: Maximum number of tokens to generate
-            temperature: Temperature for sampling (0.0 = greedy)
-            top_p: Top-p sampling threshold
-            top_k: Top-k sampling threshold
-            return_raw: If True, returns the complete raw output including timing info
-            **kwargs: Additional arguments (ignored)
-
-        Returns:
-            List containing a single string with the generated text, or raw output if
-            return_raw=True
-        """
-
-        prompt = input_ids
-        n_predict = max_new_tokens if max_new_tokens is not None else self.output_tokens
-
-        cmd = [
-            self.executable,
-            "-m",
-            self.model,
-            "--ctx-size",
-            str(self.context_size),
-            "-n",
-            str(n_predict),
-            "-t",
-            str(self.threads),
-            "-p",
-            prompt,
-            "--temp",
-            str(temperature),
-            "--top-p",
-            str(top_p),
-            "--top-k",
-            str(top_k),
-            "-e",
-            "-no-cnv",
-        ]
-
-        cmd = [str(m) for m in cmd]
-
-        try:
-            # Set up environment with library path for Linux
-            env = os.environ.copy()
-            if self.lib_dir and os.name != "nt":  # Not Windows
-                current_ld_path = env.get("LD_LIBRARY_PATH", "")
-                if current_ld_path:
-                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
-                else:
-                    env["LD_LIBRARY_PATH"] = self.lib_dir
-
-            process = subprocess.Popen(
-                cmd,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                universal_newlines=True,
-                encoding="utf-8",
-                errors="replace",
-                env=env,
-            )
-
-            raw_output, stderr = process.communicate(timeout=600)
-            if process.returncode != 0:
-                error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
-                error_msg += f"Command: {' '.join(cmd)}\n"
-                error_msg += f"Error output:\n{stderr}\n"
-                error_msg += f"Standard output:\n{raw_output}"
-                raise Exception(error_msg)
-
-            if raw_output is None:
-                raise Exception("No output received from llama.cpp process")
-
-            # Parse timing information
-            for line in raw_output.splitlines():
-                if "llama_perf_context_print: eval time =" in line:
-                    parts = line.split("(")[1].strip()
-                    parts = parts.split(",")
-                    ms_per_token = float(parts[0].split("ms per token")[0].strip())
-                    self.tokens_per_second = (
-                        1000 / ms_per_token if ms_per_token > 0 else 0
-                    )
-                if "llama_perf_context_print: prompt eval time =" in line:
-                    parts = line.split("=")[1].split("/")[0]
-                    time_to_first_token_ms = float(parts.split("ms")[0].strip())
-                    self.time_to_first_token = time_to_first_token_ms / 1000
-
-            if return_raw:
-                return [raw_output, stderr]
-
-            # Find where the prompt ends and the generated text begins
-            prompt_found = False
-            output_text = ""
-            prompt_first_line = prompt.split("\n")[0]
-            for line in raw_output.splitlines():
-                if prompt_first_line in line:
-                    prompt_found = True
-                if prompt_found:
-                    line = line.replace("</s> [end of text]", "")
-                    output_text = output_text + line
-
-            if not prompt_found:
-                raise Exception(
-                    f"Could not find prompt '{prompt_first_line}' in llama.cpp output. "
-                    "This usually means the model failed to process the prompt correctly.\n"
-                    f"Raw output:\n{raw_output}\n"
-                    f"Stderr:\n{stderr}"
-                )
-
-            # Return list containing the generated text
-            return [output_text]
-
-        except Exception as e:
-            error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
-            error_msg += f"Command: {' '.join(cmd)}"
-            raise Exception(error_msg)
-
-
 class LoadLlamaCpp(FirstTool):
-    unique_name = "load…
+    unique_name = "llamacpp-load"
 
     def __init__(self):
         super().__init__(monitor_message="Loading llama.cpp model")
 
+        self.status_stats = [
+            Keys.DEVICE,
+        ]
+
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
@@ -169,28 +25,29 @@ class LoadLlamaCpp(FirstTool):
         )
 
         parser.add_argument(
-            "…
-
-
-
+            "-d",
+            "--device",
+            choices=["cpu", "igpu"],
+            default="igpu",
+            help="Which device to load the model on to (default: igpu)",
         )
 
-        default_threads = 1
+        default_threads = -1
         parser.add_argument(
             "--threads",
             required=False,
             type=int,
            default=default_threads,
-            help=f"Number of threads to use…
+            help=f"Number of threads to use during generation (default: {default_threads})",
         )
 
-        context_size =…
+        context_size = 4096
        parser.add_argument(
             "--context-size",
             required=False,
             type=int,
             default=context_size,
-            help=f"…
+            help=f"Size of the prompt context (default: {context_size}. 0 = loaded from model)",
         )
 
         output_tokens = 512
@@ -199,14 +56,13 @@ class LoadLlamaCpp(FirstTool):
             required=False,
             type=int,
             default=output_tokens,
-            help=f"Maximum number of output tokens…
+            help=f"Maximum number of output tokens to generate (default: {output_tokens})",
         )
 
         parser.add_argument(
-            "--…
-
-
-            help="Path to a .gguf model file",
+            "--reasoning",
+            action="store_true",
+            help="Set this flag to indicate the model is a reasoning model",
         )
 
         return parser
@@ -215,61 +71,113 @@ class LoadLlamaCpp(FirstTool):
         self,
         state: State,
         input: str = "",
+        device: str = "igpu",
         context_size: int = 512,
         threads: int = 1,
         output_tokens: int = 512,
-
-        executable: str = None,
-        lib_dir: Optional[str] = None,
+        reasoning: bool = False,
     ) -> State:
         """
         Load a llama.cpp model
         """
 
-        from lemonade.common.network import…
-
-
-
+        from lemonade.common.network import is_offline
+        from lemonade.tools.llamacpp.utils import (
+            install_llamacpp,
+            get_llama_cli_exe_path,
+            get_llama_installed_version,
+            parse_checkpoint,
+            download_gguf,
+            get_local_checkpoint_path,
+            LlamaCppTokenizerAdapter,
+            LlamaCppAdapter,
+        )
 
-        #…
-
+        # Validate and install llama.cpp, if needed
+        install_llamacpp()
+
+        # Check if input is a local folder containing a .GGUF model
+        if os.path.isdir(input):
+            # input is a local folder
+            local_model_folder = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
+
+            # See if there is a file ending in ".gguf" in this folder
+            dir = os.listdir(input)
+            gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
+            if len(gguf_files) == 0:
+                raise ValueError(
+                    f"The folder {input} does not contain a GGUF model file."
+                )
+            model_to_use = gguf_files[0]
+            full_model_path = os.path.join(local_model_folder, model_to_use)
 
-        if model_binary:
-            model_to_use = os.path.normpath(model_binary)
         else:
-
-
+            # Input is a model checkpoint
+            checkpoint = input
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+            # Make sure that a variant is provided for the GGUF model
+            base_checkpoint, variant = parse_checkpoint(checkpoint)
+            if variant is None:
+                raise ValueError(
+                    "You are required to provide a 'variant' when "
+                    "selecting a GGUF model. The variant is provided "
+                    "as CHECKPOINT:VARIANT. For example: "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
+                )
+
+            # Auto-detect offline status
+            offline = is_offline()
+            if offline:
+                printing.log_warning(
+                    "Network connectivity to huggingface.co not detected. Running in offline mode."
+                )
+                full_model_path, model_to_use = get_local_checkpoint_path(
+                    base_checkpoint, variant
+                )
+                if not full_model_path:
+                    raise ValueError(
+                        f"Model {checkpoint} is not available locally."
+                        f"Cannot download in offline mode."
+                    )
+
+            else:
+
+                snapshot_files = download_gguf(checkpoint)
+                full_model_path = snapshot_files["variant"]
+                model_to_use = os.path.basename(full_model_path)
 
-
-
+        llama_cli_exe_path = get_llama_cli_exe_path()
+        printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
 
-
-
-                f"{self.__class__.unique_name} requires the preceding tool to pass a "
-                "Llamacpp model, "
-                "or for the user to supply a model with `--model-binary`"
-            )
+        # Get the directory containing the executable for shared libraries
+        lib_dir = os.path.dirname(llama_cli_exe_path)
 
+        # Pass the model and inputs into state
         state.model = LlamaCppAdapter(
-            model=…
+            model=full_model_path,
+            device=device,
             output_tokens=output_tokens,
             context_size=context_size,
             threads=threads,
-            executable=…
+            executable=llama_cli_exe_path,
+            reasoning=reasoning,
             lib_dir=lib_dir,
         )
-        state.tokenizer =…
+        state.tokenizer = LlamaCppTokenizerAdapter()
+        state.device = device
 
-        # Save stats
-        state.save_stat(Keys.…
-
-        # Get base model information if this is a converted HF model
-        base_model = get_base_model(input)
-        if base_model is not None:
-            state.save_stat("base_model", base_model)
+        # Save initial stats
+        state.save_stat(Keys.DEVICE, device)
+        state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
 
         status.add_to_state(state=state, name=input, model=model_to_use)
-
         return state
 
 
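The error message above documents the new GGUF checkpoint convention: checkpoints are written as CHECKPOINT:VARIANT, where the variant is either a quantization label or an explicit .gguf filename. The helpers referenced here (install_llamacpp, parse_checkpoint, download_gguf, and friends) live in the new lemonade/tools/llamacpp/utils.py, which this view does not expand. Below is a hypothetical sketch of the splitting that convention implies; the real parse_checkpoint may behave differently.

# Hypothetical sketch of the CHECKPOINT:VARIANT convention described in the error
# message above; the real parse_checkpoint lives in lemonade/tools/llamacpp/utils.py
# (not shown in this diff) and may differ.
def parse_checkpoint_sketch(checkpoint: str):
    if ":" in checkpoint:
        base, variant = checkpoint.split(":", 1)
        return base, variant
    return checkpoint, None


print(parse_checkpoint_sketch("Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0"))
# ('Qwen/Qwen2.5-Coder-3B-Instruct-GGUF', 'Q4_0')
print(parse_checkpoint_sketch("Qwen/Qwen2.5-Coder-3B-Instruct-GGUF"))
# ('Qwen/Qwen2.5-Coder-3B-Instruct-GGUF', None)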