lemonade-sdk 8.1.10__py3-none-any.whl → 8.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)
  1. lemonade/cache.py +6 -1
  2. lemonade/common/status.py +4 -4
  3. lemonade/tools/bench.py +22 -1
  4. lemonade/tools/flm/__init__.py +1 -0
  5. lemonade/tools/flm/utils.py +255 -0
  6. lemonade/tools/llamacpp/bench.py +111 -23
  7. lemonade/tools/llamacpp/load.py +20 -1
  8. lemonade/tools/llamacpp/utils.py +210 -17
  9. lemonade/tools/oga/bench.py +0 -26
  10. lemonade/tools/report/table.py +6 -0
  11. lemonade/tools/server/flm.py +133 -0
  12. lemonade/tools/server/llamacpp.py +23 -5
  13. lemonade/tools/server/serve.py +260 -135
  14. lemonade/tools/server/static/js/chat.js +165 -82
  15. lemonade/tools/server/static/js/models.js +87 -54
  16. lemonade/tools/server/static/js/shared.js +9 -6
  17. lemonade/tools/server/static/logs.html +57 -0
  18. lemonade/tools/server/static/styles.css +159 -8
  19. lemonade/tools/server/static/webapp.html +28 -10
  20. lemonade/tools/server/tray.py +94 -38
  21. lemonade/tools/server/utils/macos_tray.py +226 -0
  22. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  23. lemonade/tools/server/webapp.py +4 -1
  24. lemonade/tools/server/wrapped_server.py +91 -25
  25. lemonade/version.py +1 -1
  26. lemonade_install/install.py +25 -2
  27. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/METADATA +10 -6
  28. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/RECORD +37 -32
  29. lemonade_server/cli.py +103 -14
  30. lemonade_server/model_manager.py +186 -45
  31. lemonade_server/pydantic_models.py +25 -1
  32. lemonade_server/server_models.json +175 -62
  33. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/WHEEL +0 -0
  34. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/entry_points.txt +0 -0
  35. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/LICENSE +0 -0
  36. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/licenses/NOTICE.md +0 -0
  37. {lemonade_sdk-8.1.10.dist-info → lemonade_sdk-8.1.12.dist-info}/top_level.txt +0 -0
lemonade/cache.py CHANGED
@@ -43,7 +43,11 @@ def build_name(input_name):
     """
 
     if os.path.isdir(input_name):
+        # Input is a folder so no good way to determine a model name
         input_name_sanitized = "local_model"
+    elif os.path.isfile(input_name):
+        # Use the filename without its extension
+        input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")
@@ -63,8 +67,9 @@ class Keys:
     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
-    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+    STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
     PROMPT = "prompt"
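
Note: the build_name() change above gives standalone model files their own cache name instead of falling through to checkpoint sanitization. A minimal sketch of the resulting behavior (the helper name and example path below are illustrative, not part of the package):

    import os

    def sanitize_input_name(input_name: str) -> str:
        # Mirrors the branches added to lemonade/cache.py (illustrative only)
        if os.path.isdir(input_name):
            # A folder gives no good way to determine a model name
            return "local_model"
        if os.path.isfile(input_name):
            # e.g. "./models/qwen2.5-0.5b-q4_k_m.gguf" -> "qwen2.5-0.5b-q4_k_m"
            return os.path.splitext(os.path.basename(input_name))[0]
        # Checkpoint strings such as "org/model" are sanitized as before
        return input_name.replace("/", "_")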
lemonade/common/status.py CHANGED
@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
         if print_file_name:
             print(f"{self.script_name}{self.extension}:")
 
-        # Print invocation about the model (only applies to scripts, not ONNX files or
+        # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
         # LLMs, which have no extension)
         if not (
-            self.extension == ".onnx"
+            self.extension in [".onnx", ".gguf"]
             or self.extension == build.state_file_name
             or self.extension == ""
         ):
@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):
 
         if self.depth == 0:
             print(f"{self.indent}\tLocation:\t{self.file}", end="")
-            if self.extension == ".onnx":
+            if self.extension in [".onnx", ".gguf"]:
                 print()
             else:
                 print(f", line {self.line}")
@@ -314,7 +314,7 @@
        Print information about a given model or submodel.
        """
 
-        if self.extension == ".onnx" or self.extension == "":
+        if self.extension in [".onnx", ".gguf"] or self.extension == "":
            self.indent = "\t" * (2 * self.depth)
        else:
            self.indent = "\t" * (2 * self.depth + 1)
lemonade/tools/bench.py CHANGED
@@ -29,7 +29,9 @@ class Bench(Tool, ABC):
             Keys.SECONDS_TO_FIRST_TOKEN,
             Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+            Keys.STD_DEV_TOKENS_PER_SECOND,
             Keys.PREFILL_TOKENS_PER_SECOND,
+            Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
             Keys.PROMPT_TOKENS,
             Keys.RESPONSE_TOKENS,
             Keys.MAX_MEMORY_USED_GBYTE,
@@ -42,7 +44,9 @@ class Bench(Tool, ABC):
         self.mean_time_to_first_token_list = []
         self.std_dev_time_to_first_token_list = []
         self.prefill_tokens_per_second_list = []
+        self.std_dev_prefill_tokens_per_second_list = []
         self.token_generation_tokens_per_second_list = []
+        self.std_dev_token_generation_tokens_per_second_list = []
         self.max_memory_used_gb_list = []
 
         # Max memory used can only be measured on Windows systems
@@ -88,7 +92,7 @@ class Bench(Tool, ABC):
             default=[str(default_prompt_length)],
             metavar="PROMPT",
             help="Input one or more prompts to the LLM. Three formats are supported. "
-            "1) integer: use a synthetic prompt with the specified length "
+            "1) integer: use a synthetic prompt with the specified token length "
             "2) str: use a user-provided prompt string "
             "3) path/to/prompt.txt: load the prompt from a text file. "
             f"(default: {default_prompt_length}) ",
@@ -246,10 +250,27 @@ class Bench(Tool, ABC):
             Keys.PREFILL_TOKENS_PER_SECOND,
             self.get_item_or_list(self.prefill_tokens_per_second_list),
         )
+        if not all(
+            element is None for element in self.std_dev_prefill_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+                self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+            )
         state.save_stat(
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             self.get_item_or_list(self.token_generation_tokens_per_second_list),
         )
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
         if self.save_max_memory_used:
             state.save_stat(
                 Keys.MAX_MEMORY_USED_GBYTE,
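
Note: the new STD_DEV_* stats in Bench.save_stats() are only written when at least one prompt produced a measurable spread; the per-prompt entries stay None when fewer than two samples exist. A minimal sketch of that guard pattern, using a hypothetical helper and made-up throughput numbers:

    import statistics
    from statistics import StatisticsError

    def std_dev_or_none(samples):
        # Sample standard deviation, or None when there are fewer than 2 measurements
        try:
            return statistics.stdev(samples)
        except StatisticsError:
            return None

    per_prompt_std_devs = [
        std_dev_or_none(samples) for samples in [[101.2, 98.7, 103.4], [97.0]]
    ]
    if not all(value is None for value in per_prompt_std_devs):
        # Only then would save_stats() persist the std-dev stat
        print(per_prompt_std_devs)  # [2.35..., None]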
lemonade/tools/flm/__init__.py ADDED
@@ -0,0 +1 @@
+# FLM (FastFlowLM) utilities for Lemonade SDK
lemonade/tools/flm/utils.py ADDED
@@ -0,0 +1,255 @@
+"""
+FLM (FastFlowLM) utilities for installation, version checking, and model management.
+"""
+
+import os
+import logging
+import subprocess
+import tempfile
+import time
+from typing import List, Optional
+
+import requests
+from packaging.version import Version
+
+
+FLM_MINIMUM_VERSION = "0.9.12"
+
+
+def check_flm_version() -> Optional[str]:
+    """
+    Check if FLM is installed and return version, or None if not available.
+    """
+    try:
+        result = subprocess.run(
+            ["flm", "version"],
+            capture_output=True,
+            text=True,
+            check=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+
+        # Parse version from output like "FLM v0.9.4"
+        output = result.stdout.strip()
+        if output.startswith("FLM v"):
+            version_str = output[5:]  # Remove "FLM v" prefix
+            return version_str
+        return None
+
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None
+
+
+def refresh_environment():
+    """
+    Refresh PATH to pick up newly installed executables.
+    """
+    if os.name == "nt":  # Windows
+        # On Windows, we need to refresh the PATH from registry
+        import winreg
+
+        try:
+            with winreg.OpenKey(
+                winreg.HKEY_LOCAL_MACHINE,
+                r"SYSTEM\CurrentControlSet\Control\Session Manager\Environment",
+            ) as key:
+                path_value, _ = winreg.QueryValueEx(key, "PATH")
+                os.environ["PATH"] = path_value + ";" + os.environ.get("PATH", "")
+        except Exception as e:  # pylint: disable=broad-except
+            logging.debug("Could not refresh PATH from registry: %s", e)
+
+        # Also try to add common installation paths
+        common_paths = [
+            r"C:\Program Files\FLM",
+            r"C:\Program Files (x86)\FLM",
+            os.path.expanduser(r"~\AppData\Local\FLM"),
+        ]
+        for path in common_paths:
+            if os.path.exists(path) and path not in os.environ.get("PATH", ""):
+                os.environ["PATH"] = path + ";" + os.environ.get("PATH", "")
+
+
+def install_flm():
+    """
+    Check if FLM is installed and at minimum version.
+    If not, download and run the GUI installer, then wait for completion.
+    """
+    # Check current FLM installation
+    current_version = check_flm_version()
+
+    if current_version and Version(current_version) >= Version(FLM_MINIMUM_VERSION):
+        logging.info(
+            "FLM v%s is already installed and meets minimum version requirement (v%s)",
+            current_version,
+            FLM_MINIMUM_VERSION,
+        )
+        return
+
+    if current_version:
+        logging.info(
+            "FLM v%s is installed but below minimum version v%s. Upgrading...",
+            current_version,
+            FLM_MINIMUM_VERSION,
+        )
+    else:
+        logging.info(
+            "FLM not found. Installing FLM v%s or later...", FLM_MINIMUM_VERSION
+        )
+
+    # Download the installer
+    # pylint: disable=line-too-long
+    installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
+    installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+
+    try:
+        # Remove existing installer if present
+        if os.path.exists(installer_path):
+            os.remove(installer_path)
+
+        logging.info("Downloading FLM installer...")
+        response = requests.get(installer_url, stream=True, timeout=30)
+        response.raise_for_status()
+
+        # Save installer to disk
+        with open(installer_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+            f.flush()
+            os.fsync(f.fileno())
+
+        logging.info("Downloaded FLM installer to %s", installer_path)
+
+        # Launch the installer GUI
+        logging.warning(
+            "Launching FLM installer GUI. Please complete the installation..."
+        )
+
+        # Launch installer and wait for it to complete
+        if os.name == "nt":  # Windows
+            process = subprocess.Popen([installer_path], shell=True)
+        else:
+            process = subprocess.Popen([installer_path])
+
+        # Wait for installer to complete
+        process.wait()
+
+        if process.returncode != 0:
+            raise RuntimeError(
+                f"FLM installer failed with exit code {process.returncode}"
+            )
+
+        logging.info("FLM installer completed successfully")
+
+        # Refresh environment to pick up new PATH entries
+        refresh_environment()
+
+        # Wait a moment for system to update
+        time.sleep(2)
+
+        # Verify installation
+        max_retries = 10
+        for attempt in range(max_retries):
+            new_version = check_flm_version()
+            if new_version and Version(new_version) >= Version(FLM_MINIMUM_VERSION):
+                logging.info("FLM v%s successfully installed and verified", new_version)
+                return
+
+            if attempt < max_retries - 1:
+                logging.debug(
+                    "FLM not yet available in PATH, retrying... (attempt %d/%d)",
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(3)
+                refresh_environment()
+
+        # Final check failed
+        raise RuntimeError(
+            "FLM installation completed but 'flm' command is not available in PATH. "
+            "Please ensure FLM is properly installed and available in your system PATH."
+        )
+
+    except requests.RequestException as e:
+        raise RuntimeError(f"Failed to download FLM installer: {e}") from e
+    except Exception as e:
+        raise RuntimeError(f"FLM installation failed: {e}") from e
+    finally:
+        # Clean up installer file
+        if os.path.exists(installer_path):
+            try:
+                os.remove(installer_path)
+            except OSError:
+                pass  # Ignore cleanup errors
+
+
+def download_flm_model(config_checkpoint, _=None, do_not_upgrade=False) -> dict:
+    """
+    Downloads the FLM model for the given configuration.
+
+    Args:
+        config_checkpoint: name of the FLM model to install.
+        _: placeholder for `config_mmproj`, which is standard
+            for WrappedServer (see llamacpp/utils.py).
+        do_not_upgrade: whether to re-download the model if it is already
+            available.
+    """
+
+    if do_not_upgrade:
+        command = ["flm", "pull", f"{config_checkpoint}"]
+    else:
+        command = ["flm", "pull", f"{config_checkpoint}", "--force"]
+
+    subprocess.run(command, check=True)
+
+
+def get_flm_installed_models() -> List[str]:
+    """
+    Parse FLM model list and return installed model checkpoints.
+
+    Returns:
+        List of installed FLM model checkpoints (e.g., ["llama3.2:1b", "gemma3:4b"])
+    """
+    try:
+        result = subprocess.run(
+            ["flm", "list"],
+            capture_output=True,
+            text=True,
+            check=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+
+        # Check if we got valid output
+        if not result.stdout:
+            return []
+
+        installed_checkpoints = []
+
+        lines = result.stdout.strip().split("\n")
+        for line in lines:
+            line = line.strip()
+            if line.startswith("- "):
+                # Remove the leading "- " and parse the model info
+                model_info = line[2:].strip()
+
+                # Check if model is installed (✅)
+                if model_info.endswith(" ✅"):
+                    checkpoint = model_info[:-2].strip()
+                    installed_checkpoints.append(checkpoint)
+
+        return installed_checkpoints
+
+    except (subprocess.CalledProcessError, FileNotFoundError, AttributeError):
+        # FLM not installed, not available, or output parsing failed
+        return []
+
+
+def is_flm_available() -> bool:
+    """
+    Check if FLM is available and meets minimum version requirements.
+    """
+    current_version = check_flm_version()
+    return current_version is not None and Version(current_version) >= Version(
+        FLM_MINIMUM_VERSION
+    )
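
A plausible way the new FLM helpers fit together is sketched below; the calling code is hypothetical, but the imported functions and the "llama3.2:1b" checkpoint name come from the module and its docstrings:

    from lemonade.tools.flm.utils import (
        install_flm,
        is_flm_available,
        download_flm_model,
        get_flm_installed_models,
    )

    install_flm()  # no-op if FLM >= 0.9.12 is already installed and on PATH

    if is_flm_available():
        # Skip the forced re-download when the model has already been pulled
        already_pulled = "llama3.2:1b" in get_flm_installed_models()
        download_flm_model("llama3.2:1b", do_not_upgrade=already_pulled)
        print(get_flm_installed_models())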
lemonade/tools/llamacpp/bench.py CHANGED
@@ -2,9 +2,15 @@ import argparse
 import statistics
 from statistics import StatisticsError
 from lemonade.state import State
-from lemonade.cache import Keys
+from lemonade.tools.tool import Tool
 from lemonade.tools.llamacpp.utils import LlamaCppAdapter
-from lemonade.tools.bench import Bench
+from lemonade.tools.bench import (
+    Bench,
+    default_prompt_length,
+    default_iterations,
+    default_output_tokens,
+    default_warmup_runs,
+)
 
 
 class LlamaCppBench(Bench):
@@ -14,15 +20,11 @@ class LlamaCppBench(Bench):
 
     unique_name = "llamacpp-bench"
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, monitor_message="Benchmarking LLM"):
+        super().__init__(monitor_message)
 
-        # Additional statistics generated by this bench tool
-        self.status_stats.insert(
-            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
-            Keys.STD_DEV_TOKENS_PER_SECOND,
-        )
-        self.std_dev_token_generation_tokens_per_second_list = []
+        # Don't track memory usage since we are using a llamacpp executable for compute
+        self.save_max_memory_used = False
 
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
@@ -33,8 +35,44 @@ class LlamaCppBench(Bench):
 
         parser = Bench.parser(parser)
 
+        parser.add_argument(
+            "--cli",
+            action="store_true",
+            help="Set this flag to use llama-cli.exe to benchmark model performance. This executable will be called "
+            "once per iteration. Otherwise, llama-bench.exe is used by default. In this default behavior behavior, "
+            "the only valid prompt format is integer token lengths. Also, the warmup-iterations parameter is "
+            "ignored and the default value for number of threads is 16.",
+        )
+
         return parser
 
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+        """
+        Helper function to parse CLI arguments into the args expected by run()
+        """
+
+        # Call Tool parse method, NOT the Bench parse method
+        parsed_args = Tool.parse(self, state, args, known_only)
+
+        if parsed_args.cli:
+            parsed_args = super().parse(state, args, known_only)
+        else:
+            # Make sure prompts is a list of integers
+            if parsed_args.prompts is None:
+                parsed_args.prompts = [default_prompt_length]
+            prompt_ints = []
+            for prompt_item in parsed_args.prompts:
+                if prompt_item.isdigit():
+                    prompt_ints.append(int(prompt_item))
+                else:
+                    raise Exception(
+                        f"When not using the --cli flag to {self.unique_name}, the prompt format must "
+                        "be in integer format."
+                    )
+            parsed_args.prompts = prompt_ints
+
+        return parsed_args
+
     def run_prompt(
         self,
         state: State,
@@ -116,20 +154,70 @@ class LlamaCppBench(Bench):
             # Less than 2 measurements
             self.std_dev_token_generation_tokens_per_second_list.append(None)
 
-    def save_stats(self, state):
-        super().save_stats(state)
-
-        # Save additional statistics
-        if not all(
-            element is None
-            for element in self.std_dev_token_generation_tokens_per_second_list
-        ):
-            state.save_stat(
-                Keys.STD_DEV_TOKENS_PER_SECOND,
-                self.get_item_or_list(
-                    self.std_dev_token_generation_tokens_per_second_list
-                ),
+    def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+        if prompts is None:
+            prompts = [default_prompt_length]
+        elif isinstance(prompts, int):
+            prompts = [prompts]
+
+        state.save_stat("prompts", prompts)
+        state.save_stat("iterations", iterations)
+        state.save_stat("output_tokens", output_tokens)
+
+        model: LlamaCppAdapter = state.model
+        prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd = model.benchmark(
+            prompts, iterations, output_tokens
+        )
+        self.input_ids_len_list = prompt_lengths
+        self.prefill_tokens_per_second_list = pp_tps
+        if iterations > 1:
+            self.std_dev_prefill_tokens_per_second_list = pp_tps_sd
+        self.mean_time_to_first_token_list = [
+            tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)
+        ]
+        self.token_generation_tokens_per_second_list = [tg_tps]
+        if iterations > 1:
+            self.std_dev_token_generation_tokens_per_second_list = [tg_tps_sd]
+        self.tokens_out_len_list = [output_tokens] * len(prompts) * iterations
+
+        self.save_stats(state)
+        return state
+
+    def run(
+        self,
+        state: State,
+        prompts: list[str] = None,
+        iterations: int = default_iterations,
+        warmup_iterations: int = default_warmup_runs,
+        output_tokens: int = default_output_tokens,
+        cli: bool = False,
+        **kwargs,
+    ) -> State:
+        """
+        Args:
+          - prompts: List of input prompts used as starting points for LLM text generation
+          - iterations: Number of benchmarking samples to take; results are
+            reported as the median and mean of the samples.
+          - warmup_iterations: Subset of the iterations to treat as warmup,
+            and not included in the results.
+          - output_tokens: Number of new tokens LLM to create.
+          - ggml: Use llama-bench.exe directly
+          - kwargs: Additional parameters used by bench tools
+        """
+
+        # Check that state has the attribute model and it is a LlamaCPP model
+        if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+            raise Exception("Load model using llamacpp-load first.")
+
+        if cli:
+            state = super().run(
+                state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
             )
+        else:
+            state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+        return state
 
 
 # This file was originally licensed under Apache 2.0. It has been modified.
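
In the default path (without --cli), run_llama_bench_exe() derives mean time-to-first-token from the prefill throughput returned by LlamaCppAdapter.benchmark(), i.e. prompt tokens divided by prefill tokens per second. A small worked sketch with made-up measurements:

    prompt_lengths = [128, 512]      # prefill tokens per prompt (illustrative)
    prefill_tps = [2048.0, 1900.0]   # prefill tokens/second per prompt (illustrative)

    mean_time_to_first_token = [
        tokens / tps for tokens, tps in zip(prompt_lengths, prefill_tps)
    ]
    print(mean_time_to_first_token)  # [0.0625, 0.269...] seconds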
lemonade/tools/llamacpp/load.py CHANGED
@@ -93,6 +93,7 @@ class LoadLlamaCpp(FirstTool):
         from lemonade.tools.llamacpp.utils import (
             install_llamacpp,
             get_llama_cli_exe_path,
+            get_llama_bench_exe_path,
             get_llama_installed_version,
             parse_checkpoint,
             download_gguf,
@@ -103,6 +104,8 @@
 
         install_llamacpp(backend)
 
+        extension = ""
+
         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
             # input is a local folder
@@ -121,6 +124,17 @@
                 )
             model_to_use = gguf_files[0]
             full_model_path = os.path.join(local_model_folder, model_to_use)
+            extension = ".gguf"
+
+        elif input.endswith(".gguf") and os.path.isfile(input):
+            # input is a local .gguf file
+            full_model_path = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+            model_to_use = os.path.basename(full_model_path)
+            extension = ".gguf"
 
         else:
             # Input is a model checkpoint
@@ -161,6 +175,7 @@
             model_to_use = os.path.basename(full_model_path)
 
         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+        llama_bench_exe_path = get_llama_bench_exe_path(backend)
         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
 
         # Get the directory containing the executable for shared libraries
@@ -174,8 +189,10 @@
             context_size=context_size,
             threads=threads,
             executable=llama_cli_exe_path,
+            bench_executable=llama_bench_exe_path,
             reasoning=reasoning,
             lib_dir=lib_dir,
+            state=state,
         )
         state.tokenizer = LlamaCppTokenizerAdapter()
         state.device = device
@@ -186,7 +203,9 @@
             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
         )
 
-        status.add_to_state(state=state, name=input, model=model_to_use)
+        status.add_to_state(
+            state=state, name=input, model=model_to_use, extension=extension
+        )
         return state
 
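
For reference, the new branch in LoadLlamaCpp treats a standalone .gguf path roughly as follows (a stand-alone sketch of the same logic; the example path is made up and the state/stat plumbing is omitted):

    import os

    input = "./models/qwen2.5-0.5b-instruct-q4_k_m.gguf"  # hypothetical local file

    if input.endswith(".gguf") and os.path.isfile(input):
        full_model_path = os.path.abspath(input)
        checkpoint = "local_model"                     # recorded as Keys.CHECKPOINT
        model_to_use = os.path.basename(full_model_path)
        extension = ".gguf"                            # forwarded to status.add_to_state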