lemonade-sdk 8.1.11__tar.gz → 8.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- {lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.2.0}/PKG-INFO +5 -3
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/README.md +1 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.py +3 -2
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cache.py +6 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/status.py +4 -4
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/system_info.py +0 -26
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/bench.py +22 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/utils.py +70 -22
- lemonade_sdk-8.2.0/src/lemonade/tools/llamacpp/bench.py +224 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/load.py +30 -2
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/utils.py +234 -15
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/bench.py +0 -26
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/load.py +38 -142
- lemonade_sdk-8.2.0/src/lemonade/tools/oga/migration.py +403 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/table.py +6 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/flm.py +2 -6
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/llamacpp.py +20 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/serve.py +335 -17
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/models.js +416 -18
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/shared.js +44 -6
- lemonade_sdk-8.2.0/src/lemonade/tools/server/static/logs.html +57 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/styles.css +204 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/webapp.html +32 -0
- lemonade_sdk-8.2.0/src/lemonade/version.py +1 -0
- lemonade_sdk-8.2.0/src/lemonade_install/install.py +239 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0/src/lemonade_sdk.egg-info}/PKG-INFO +5 -3
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/requires.txt +3 -2
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/cli.py +10 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/model_manager.py +172 -11
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/server_models.json +102 -66
- lemonade_sdk-8.1.11/src/lemonade/tools/llamacpp/bench.py +0 -136
- lemonade_sdk-8.1.11/src/lemonade/tools/server/static/logs.html +0 -47
- lemonade_sdk-8.1.11/src/lemonade/version.py +0 -1
- lemonade_sdk-8.1.11/src/lemonade_install/install.py +0 -785
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/LICENSE +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/pyproject.toml +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.cfg +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/inference_engines.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/network.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/agt_power.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/hwinfo_power.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/utils.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/utils.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/prompt.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/chat.js +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/macos_tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/port.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/windows_tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/wrapped_server.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/pydantic_models.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/settings.py +0 -0

{lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.1.11
+Version: 8.2.0
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.14

@@ -29,12 +29,13 @@ Requires-Dist: tabulate
 Requires-Dist: sentencepiece
 Requires-Dist: huggingface-hub[hf_xet]==0.33.0
 Requires-Dist: python-dotenv
+Requires-Dist: python-multipart
 Requires-Dist: rumps>=0.4.0; sys_platform == "darwin"
 Provides-Extra: oga-ryzenai
-Requires-Dist: onnxruntime-genai-directml-ryzenai==0.
+Requires-Dist: onnxruntime-genai-directml-ryzenai==0.9.2; extra == "oga-ryzenai"
 Requires-Dist: protobuf>=6.30.1; extra == "oga-ryzenai"
 Provides-Extra: oga-cpu
-Requires-Dist: onnxruntime-genai==0.
+Requires-Dist: onnxruntime-genai==0.9.2; extra == "oga-cpu"
 Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
 Provides-Extra: dev
 Requires-Dist: torch>=2.6.0; extra == "dev"

@@ -264,6 +265,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/README.md

@@ -207,6 +207,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.py

@@ -49,6 +49,7 @@ setup(
         "sentencepiece",
         "huggingface-hub[hf_xet]==0.33.0",
         "python-dotenv",
+        "python-multipart",
         # macOS-specific dependencies
         "rumps>=0.4.0; sys_platform == 'darwin'",
     ],

@@ -57,11 +58,11 @@ setup(
         # applications, without including developer-focused tools
         # Primary NPU extra using unified PyPI package
         "oga-ryzenai": [
-            "onnxruntime-genai-directml-ryzenai==0.
+            "onnxruntime-genai-directml-ryzenai==0.9.2",
             "protobuf>=6.30.1",
         ],
         "oga-cpu": [
-            "onnxruntime-genai==0.
+            "onnxruntime-genai==0.9.2",
             "onnxruntime >=1.22.0",
         ],
         # Developer-focused tools for benchmarking, accuracy testing, and

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cache.py

@@ -43,7 +43,11 @@ def build_name(input_name):
     """

     if os.path.isdir(input_name):
+        # Input is a folder so no good way to determine a model name
         input_name_sanitized = "local_model"
+    elif os.path.isfile(input_name):
+        # Use the filename without its extension
+        input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")

@@ -63,8 +67,9 @@ class Keys:
     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
-    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+    STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
     PROMPT = "prompt"
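For reference, the build_name() change in src/lemonade/cache.py above adds a dedicated branch for single-file inputs. A minimal sketch of the resulting name resolution, assuming standard os.path semantics (the helper name below is illustrative and not part of the package):

```python
# Illustrative sketch, not from the release: mirrors the updated build_name() branches.
import os

def build_name_sketch(input_name: str) -> str:
    if os.path.isdir(input_name):
        # A folder gives no good way to determine a model name
        return "local_model"
    elif os.path.isfile(input_name):
        # A single file contributes its basename without the extension
        return os.path.splitext(os.path.basename(input_name))[0]
    else:
        # Anything else (e.g., a Hugging Face checkpoint) is sanitized
        return input_name.replace("/", "_")

# e.g. "models/Qwen2.5-0.5B.gguf" -> "Qwen2.5-0.5B"  (when that file exists)
#      "meta-llama/Llama-3.2-1B"  -> "meta-llama_Llama-3.2-1B"
```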

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/status.py

@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
         if print_file_name:
             print(f"{self.script_name}{self.extension}:")

-        # Print invocation about the model (only applies to scripts, not ONNX files
+        # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
         # LLMs, which have no extension)
         if not (
-            self.extension == ".onnx"
+            self.extension in [".onnx", ".gguf"]
             or self.extension == build.state_file_name
             or self.extension == ""
         ):

@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):

         if self.depth == 0:
             print(f"{self.indent}\tLocation:\t{self.file}", end="")
-            if self.extension == ".onnx":
+            if self.extension in [".onnx", ".gguf"]:
                 print()
             else:
                 print(f", line {self.line}")

@@ -314,7 +314,7 @@ class UniqueInvocationInfo(BasicInfo):
         Print information about a given model or submodel.
         """

-        if self.extension == ".onnx" or self.extension == "":
+        if self.extension in [".onnx", ".gguf"] or self.extension == "":
             self.indent = "\t" * (2 * self.depth)
         else:
             self.indent = "\t" * (2 * self.depth + 1)

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/system_info.py

@@ -1110,32 +1110,6 @@ class LinuxSystemInfo(SystemInfo):

         return ""

-    def _get_nvidia_vram_smi_linux(self) -> float:
-        """
-        Get NVIDIA GPU VRAM on Linux using nvidia-smi command.
-
-        Returns:
-            float: VRAM in GB, or 0.0 if detection fails
-        """
-        try:
-            output = (
-                subprocess.check_output(
-                    "nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits",
-                    shell=True,
-                    stderr=subprocess.DEVNULL,
-                )
-                .decode()
-                .strip()
-            )
-
-            # nvidia-smi returns memory in MB
-            vram_mb = int(output.split("\n")[0])
-            vram_gb = round(vram_mb / 1024, 1)
-            return vram_gb
-        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
-            pass
-        return 0.0
-
     @staticmethod
     def get_processor_name() -> str:
         """

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/bench.py

@@ -29,7 +29,9 @@ class Bench(Tool, ABC):
         Keys.SECONDS_TO_FIRST_TOKEN,
         Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
         Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        Keys.STD_DEV_TOKENS_PER_SECOND,
         Keys.PREFILL_TOKENS_PER_SECOND,
+        Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
         Keys.PROMPT_TOKENS,
         Keys.RESPONSE_TOKENS,
         Keys.MAX_MEMORY_USED_GBYTE,

@@ -42,7 +44,9 @@ class Bench(Tool, ABC):
         self.mean_time_to_first_token_list = []
         self.std_dev_time_to_first_token_list = []
         self.prefill_tokens_per_second_list = []
+        self.std_dev_prefill_tokens_per_second_list = []
         self.token_generation_tokens_per_second_list = []
+        self.std_dev_token_generation_tokens_per_second_list = []
         self.max_memory_used_gb_list = []

         # Max memory used can only be measured on Windows systems

@@ -88,7 +92,7 @@ class Bench(Tool, ABC):
             default=[str(default_prompt_length)],
             metavar="PROMPT",
             help="Input one or more prompts to the LLM. Three formats are supported. "
-            "1) integer: use a synthetic prompt with the specified length "
+            "1) integer: use a synthetic prompt with the specified token length "
             "2) str: use a user-provided prompt string "
            "3) path/to/prompt.txt: load the prompt from a text file. "
             f"(default: {default_prompt_length}) ",

@@ -246,10 +250,27 @@ class Bench(Tool, ABC):
             Keys.PREFILL_TOKENS_PER_SECOND,
             self.get_item_or_list(self.prefill_tokens_per_second_list),
         )
+        if not all(
+            element is None for element in self.std_dev_prefill_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+                self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+            )
         state.save_stat(
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             self.get_item_or_list(self.token_generation_tokens_per_second_list),
         )
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
         if self.save_max_memory_used:
             state.save_stat(
                 Keys.MAX_MEMORY_USED_GBYTE,

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/utils.py

@@ -10,16 +10,46 @@ import time
 from typing import List, Optional

 import requests
-from packaging.version import Version
+from packaging.version import Version, InvalidVersion


-
+def get_flm_latest_version() -> Optional[str]:
+    """
+    Get and return the latest FLM version from "https://github.com/FastFlowLM/FastFlowLM/tags"
+    This uses the GitHub tags API.
+    """
+    url = "https://api.github.com/repos/FastFlowLM/FastFlowLM/tags"
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        tags = response.json()
+        if not tags:
+            return None
+        # Tags are sorted in reverse chronological order; find the first that looks like a version
+        for tag in tags:
+            tag_name = tag.get("name", "")
+            # Accept tags of the form v0.9.10, 0.9.10, etc.
+            if tag_name.startswith("v"):
+                version_candidate = tag_name[1:]
+            else:
+                version_candidate = tag_name
+            try:
+                # validate it's a version string
+                _ = Version(version_candidate)
+                return version_candidate
+            except InvalidVersion:
+                continue
+        return None
+    except requests.exceptions.RequestException as e:
+        logging.debug("Error retrieving latest FLM version: %s", e)
+        return None


 def check_flm_version() -> Optional[str]:
     """
     Check if FLM is installed and return version, or None if not available.
     """
+    latest_version_str = get_flm_latest_version()
     try:
         result = subprocess.run(
             ["flm", "version"],

@@ -34,11 +64,11 @@ def check_flm_version() -> Optional[str]:
         output = result.stdout.strip()
         if output.startswith("FLM v"):
             version_str = output[5:]  # Remove "FLM v" prefix
-            return version_str
-        return None
+            return version_str, latest_version_str
+        return None, latest_version_str

     except (subprocess.CalledProcessError, FileNotFoundError):
-        return None
+        return None, latest_version_str


 def refresh_environment():

@@ -76,31 +106,42 @@ def install_flm():
     If not, download and run the GUI installer, then wait for completion.
     """
     # Check current FLM installation
-    current_version = check_flm_version()
+    current_version, latest_version = check_flm_version()

-    if
+    if (
+        current_version
+        and latest_version
+        and Version(current_version) == Version(latest_version)
+    ):
         logging.info(
-            "FLM v%s is already installed and
+            "FLM v%s is already installed and is up to date (latest version: v%s).",
             current_version,
-
+            latest_version,
         )
         return

     if current_version:
+        if not latest_version:
+            logging.info(
+                "Unable to detect the latest FLM version; continuing with installed FLM v%s.",
+                current_version,
+            )
+            return
         logging.info(
-            "FLM v%s is installed but below
+            "FLM v%s is installed but below latest version v%s. Upgrading...",
             current_version,
-
+            latest_version,
         )
+        verysilent = True
     else:
-        logging.info(
-
-        )
+        logging.info("FLM not found. Installing FLM v%s or later...", latest_version)
+        verysilent = False

     # Download the installer
     # pylint: disable=line-too-long
     installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
     installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+    installer_args = [installer_path, "/VERYSILENT"] if verysilent else [installer_path]

     try:
         # Remove existing installer if present

@@ -123,13 +164,15 @@ def install_flm():
         # Launch the installer GUI
         logging.warning(
             "Launching FLM installer GUI. Please complete the installation..."
+            if not verysilent
+            else "Installing FLM..."
         )

         # Launch installer and wait for it to complete
         if os.name == "nt":  # Windows
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args, shell=True)
         else:
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args)

         # Wait for installer to complete
         process.wait()

@@ -150,8 +193,8 @@ def install_flm():
     # Verify installation
     max_retries = 10
     for attempt in range(max_retries):
-        new_version = check_flm_version()
-        if new_version and Version(new_version)
+        new_version, latest_version = check_flm_version()
+        if new_version and Version(new_version) == Version(latest_version):
             logging.info("FLM v%s successfully installed and verified", new_version)
             return

@@ -240,7 +283,12 @@ def get_flm_installed_models() -> List[str]:

         return installed_checkpoints

-    except (
+    except (
+        subprocess.CalledProcessError,
+        FileNotFoundError,
+        AttributeError,
+        NotADirectoryError,
+    ):
         # FLM not installed, not available, or output parsing failed
         return []

@@ -249,7 +297,7 @@ def is_flm_available() -> bool:
     """
     Check if FLM is available and meets minimum version requirements.
     """
-    current_version = check_flm_version()
-    return current_version is not None and Version(current_version)
-
+    current_version, latest_version = check_flm_version()
+    return current_version is not None and Version(current_version) == Version(
+        latest_version
     )
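Note that check_flm_version() in src/lemonade/tools/flm/utils.py now returns an (installed, latest) pair instead of a single string, with the latest version taken from the GitHub tags API. A minimal sketch of how a caller might consume the new return value; the import path is assumed from the package layout and the printed messages are illustrative:

```python
# Illustrative sketch, not from the release: consuming the new (installed, latest)
# pair returned by check_flm_version() in 8.2.0.
from packaging.version import Version

from lemonade.tools.flm.utils import check_flm_version  # assumed import path

installed, latest = check_flm_version()  # e.g. ("0.9.8", "0.9.10") or (None, "0.9.10")

if installed is None:
    print("FLM is not installed")
elif latest is None:
    print(f"FLM v{installed} installed; latest release could not be determined")
elif Version(installed) == Version(latest):
    print(f"FLM v{installed} is up to date")
else:
    print(f"FLM v{installed} installed; latest release tag is v{latest}")
```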

lemonade_sdk-8.2.0/src/lemonade/tools/llamacpp/bench.py

@@ -0,0 +1,224 @@
+import argparse
+import statistics
+from statistics import StatisticsError
+from lemonade.state import State
+from lemonade.tools.tool import Tool
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
+from lemonade.tools.bench import (
+    Bench,
+    default_prompt_length,
+    default_iterations,
+    default_output_tokens,
+    default_warmup_runs,
+)
+
+
+class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
+
+    unique_name = "llamacpp-bench"
+
+    def __init__(self, monitor_message="Benchmarking LLM"):
+        super().__init__(monitor_message)
+
+        # Don't track memory usage since we are using a llamacpp executable for compute
+        self.save_max_memory_used = False
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Benchmark an LLM in llama.cpp",
+            add_help=add_help,
+        )
+
+        parser = Bench.parser(parser)
+
+        parser.add_argument(
+            "--cli",
+            action="store_true",
+            help="Set this flag to use llama-cli.exe to benchmark model performance. This executable will be called "
+            "once per iteration. Otherwise, llama-bench.exe is used by default. In this default behavior behavior, "
+            "the only valid prompt format is integer token lengths. Also, the warmup-iterations parameter is "
+            "ignored and the default value for number of threads is 16.",
+        )
+
+        return parser
+
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+        """
+        Helper function to parse CLI arguments into the args expected by run()
+        """
+
+        # Call Tool parse method, NOT the Bench parse method
+        parsed_args = Tool.parse(self, state, args, known_only)
+
+        if parsed_args.cli:
+            parsed_args = super().parse(state, args, known_only)
+        else:
+            # Make sure prompts is a list of integers
+            if parsed_args.prompts is None:
+                parsed_args.prompts = [default_prompt_length]
+            prompt_ints = []
+            for prompt_item in parsed_args.prompts:
+                if prompt_item.isdigit():
+                    prompt_ints.append(int(prompt_item))
+                else:
+                    raise Exception(
+                        f"When not using the --cli flag to {self.unique_name}, the prompt format must "
+                        "be in integer format."
+                    )
+            parsed_args.prompts = prompt_ints
+
+        return parsed_args
+
+    def run_prompt(
+        self,
+        state: State,
+        report_progress_fn,
+        prompt: str,
+        iterations: int,
+        warmup_iterations: int,
+        output_tokens: int,
+    ) -> State:
+        """
+        Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+        """
+
+        if self.first_run_prompt:
+
+            if not hasattr(state, "model") or not isinstance(
+                state.model, LlamaCppAdapter
+            ):
+                raise Exception(
+                    f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+                    "loaded first. Please run load-llama-cpp before this tool."
+                )
+        model: LlamaCppAdapter = state.model
+
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
+
+        for iteration in range(iterations + warmup_iterations):
+            try:
+                # Use the adapter's generate method which already has the timeout
+                # and error handling
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(
+                    prompt, max_new_tokens=output_tokens, return_raw=True
+                )
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
+                    error_msg = (
+                        "Could not find timing information in llama.cpp output.\n"
+                    )
+                    error_msg += "Raw output:\n" + raw_output + "\n"
+                    error_msg += "Stderr:\n" + stderr
+                    raise Exception(error_msg)
+
+                self.tokens_out_len_list.append(model.response_tokens)
+
+                if iteration > warmup_iterations - 1:
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
+
+                report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+            except Exception as e:
+                error_msg = f"Failed to run benchmark: {str(e)}"
+                raise Exception(error_msg)
+
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
+        self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+        self.prefill_tokens_per_second_list.append(
+            model.prompt_tokens / mean_time_to_first_token
+        )
+        self.token_generation_tokens_per_second_list.append(
+            statistics.mean(per_iteration_tokens_per_second)
+        )
+        try:
+            self.std_dev_time_to_first_token_list.append(
+                statistics.stdev(per_iteration_time_to_first_token)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_time_to_first_token_list.append(None)
+        try:
+            self.std_dev_token_generation_tokens_per_second_list.append(
+                statistics.stdev(per_iteration_tokens_per_second)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_token_generation_tokens_per_second_list.append(None)
+
+    def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+        if prompts is None:
+            prompts = [default_prompt_length]
+        elif isinstance(prompts, int):
+            prompts = [prompts]
+
+        state.save_stat("prompts", prompts)
+        state.save_stat("iterations", iterations)
+        state.save_stat("output_tokens", output_tokens)
+
+        model: LlamaCppAdapter = state.model
+        prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd = model.benchmark(
+            prompts, iterations, output_tokens
+        )
+        self.input_ids_len_list = prompt_lengths
+        self.prefill_tokens_per_second_list = pp_tps
+        if iterations > 1:
+            self.std_dev_prefill_tokens_per_second_list = pp_tps_sd
+        self.mean_time_to_first_token_list = [
+            tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)
+        ]
+        self.token_generation_tokens_per_second_list = [tg_tps]
+        if iterations > 1:
+            self.std_dev_token_generation_tokens_per_second_list = [tg_tps_sd]
+        self.tokens_out_len_list = [output_tokens] * len(prompts) * iterations
+
+        self.save_stats(state)
+        return state
+
+    def run(
+        self,
+        state: State,
+        prompts: list[str] = None,
+        iterations: int = default_iterations,
+        warmup_iterations: int = default_warmup_runs,
+        output_tokens: int = default_output_tokens,
+        cli: bool = False,
+        **kwargs,
+    ) -> State:
+        """
+        Args:
+            - prompts: List of input prompts used as starting points for LLM text generation
+            - iterations: Number of benchmarking samples to take; results are
+                reported as the median and mean of the samples.
+            - warmup_iterations: Subset of the iterations to treat as warmup,
+                and not included in the results.
+            - output_tokens: Number of new tokens LLM to create.
+            - ggml: Use llama-bench.exe directly
+            - kwargs: Additional parameters used by bench tools
+        """
+
+        # Check that state has the attribute model and it is a LlamaCPP model
+        if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+            raise Exception("Load model using llamacpp-load first.")
+
+        if cli:
+            state = super().run(
+                state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+            )
+        else:
+            state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
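In run_llama_bench_exe() above, time-to-first-token is not measured directly; it is derived from each prompt length and the prefill throughput that llama-bench reports. A short worked example with made-up numbers:

```python
# Illustrative numbers only: deriving mean time-to-first-token from prefill
# throughput, mirroring the list comprehension in run_llama_bench_exe() above.
prompt_lengths = [256, 1024]   # synthetic prompt sizes, in tokens
pp_tps = [512.0, 480.0]        # prefill tokens/second reported per prompt size

mean_ttft = [tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)]
print(mean_ttft)  # [0.5, 2.1333...] seconds to first token
```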

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/load.py

@@ -93,9 +93,11 @@ class LoadLlamaCpp(FirstTool):
         from lemonade.tools.llamacpp.utils import (
             install_llamacpp,
             get_llama_cli_exe_path,
+            get_llama_bench_exe_path,
             get_llama_installed_version,
             parse_checkpoint,
             download_gguf,
+            resolve_local_gguf_model,
             get_local_checkpoint_path,
             LlamaCppTokenizerAdapter,
             LlamaCppAdapter,

@@ -103,6 +105,8 @@ class LoadLlamaCpp(FirstTool):

         install_llamacpp(backend)

+        extension = ""
+
         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
             # input is a local folder

@@ -121,6 +125,17 @@ class LoadLlamaCpp(FirstTool):
             )
             model_to_use = gguf_files[0]
             full_model_path = os.path.join(local_model_folder, model_to_use)
+            extension = ".gguf"
+
+        elif input.endswith(".gguf") and os.path.isfile(input):
+            # input is a local .gguf file
+            full_model_path = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+            model_to_use = os.path.basename(full_model_path)
+            extension = ".gguf"

         else:
             # Input is a model checkpoint

@@ -155,12 +170,21 @@ class LoadLlamaCpp(FirstTool):
                 )

             else:
+                # First, try to resolve from local cache to avoid unnecessary downloads
+                base_checkpoint, variant = parse_checkpoint(checkpoint)
+                snapshot_files = resolve_local_gguf_model(
+                    base_checkpoint, variant, None
+                )
+
+                # If not found locally, download from internet
+                if not snapshot_files:
+                    snapshot_files = download_gguf(checkpoint)

-                snapshot_files = download_gguf(checkpoint)
                 full_model_path = snapshot_files["variant"]
                 model_to_use = os.path.basename(full_model_path)

         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+        llama_bench_exe_path = get_llama_bench_exe_path(backend)
         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")

         # Get the directory containing the executable for shared libraries

@@ -174,8 +198,10 @@ class LoadLlamaCpp(FirstTool):
             context_size=context_size,
             threads=threads,
             executable=llama_cli_exe_path,
+            bench_executable=llama_bench_exe_path,
             reasoning=reasoning,
             lib_dir=lib_dir,
+            state=state,
         )
         state.tokenizer = LlamaCppTokenizerAdapter()
         state.device = device

@@ -186,7 +212,9 @@ class LoadLlamaCpp(FirstTool):
             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
         )

-        status.add_to_state(
+        status.add_to_state(
+            state=state, name=input, model=model_to_use, extension=extension
+        )
         return state
