lemonade-sdk 8.1.11__tar.gz → 8.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.
Files changed (91)
  1. {lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.12}/PKG-INFO +2 -1
  2. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/README.md +1 -0
  3. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/cache.py +6 -1
  4. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/status.py +4 -4
  5. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/bench.py +22 -1
  6. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/flm/utils.py +1 -1
  7. lemonade_sdk-8.1.12/src/lemonade/tools/llamacpp/bench.py +224 -0
  8. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/llamacpp/load.py +20 -1
  9. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/llamacpp/utils.py +152 -7
  10. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/bench.py +0 -26
  11. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/report/table.py +6 -0
  12. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/flm.py +2 -6
  13. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/serve.py +1 -1
  14. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/shared.js +4 -3
  15. lemonade_sdk-8.1.12/src/lemonade/tools/server/static/logs.html +57 -0
  16. lemonade_sdk-8.1.12/src/lemonade/version.py +1 -0
  17. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12/src/lemonade_sdk.egg-info}/PKG-INFO +2 -1
  18. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/server_models.json +14 -1
  19. lemonade_sdk-8.1.11/src/lemonade/tools/llamacpp/bench.py +0 -136
  20. lemonade_sdk-8.1.11/src/lemonade/tools/server/static/logs.html +0 -47
  21. lemonade_sdk-8.1.11/src/lemonade/version.py +0 -1
  22. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/LICENSE +0 -0
  23. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/NOTICE.md +0 -0
  24. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/pyproject.toml +0 -0
  25. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/setup.cfg +0 -0
  26. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/setup.py +0 -0
  27. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/__init__.py +0 -0
  28. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/api.py +0 -0
  29. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/cli.py +0 -0
  30. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/__init__.py +0 -0
  31. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/build.py +0 -0
  32. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/cli_helpers.py +0 -0
  33. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/exceptions.py +0 -0
  34. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/filesystem.py +0 -0
  35. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/inference_engines.py +0 -0
  36. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/network.py +0 -0
  37. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/printing.py +0 -0
  38. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/system_info.py +0 -0
  39. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/test_helpers.py +0 -0
  40. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/__init__.py +0 -0
  41. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/agt_power.py +0 -0
  42. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/hwinfo_power.py +0 -0
  43. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/memory_tracker.py +0 -0
  44. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/profiler.py +0 -0
  45. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/sequence.py +0 -0
  46. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/state.py +0 -0
  47. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/__init__.py +0 -0
  48. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/accuracy.py +0 -0
  49. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/adapter.py +0 -0
  50. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/flm/__init__.py +0 -0
  51. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/huggingface/bench.py +0 -0
  52. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/huggingface/load.py +0 -0
  53. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/huggingface/utils.py +0 -0
  54. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/humaneval.py +0 -0
  55. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/management_tools.py +0 -0
  56. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/mmlu.py +0 -0
  57. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/__init__.py +0 -0
  58. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/load.py +0 -0
  59. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/utils.py +0 -0
  60. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/perplexity.py +0 -0
  61. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/prompt.py +0 -0
  62. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/report/__init__.py +0 -0
  63. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/report/llm_report.py +0 -0
  64. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/__init__.py +0 -0
  65. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/llamacpp.py +0 -0
  66. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/favicon.ico +0 -0
  67. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/chat.js +0 -0
  68. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
  69. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/models.js +0 -0
  70. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/styles.css +0 -0
  71. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/webapp.html +0 -0
  72. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/tool_calls.py +0 -0
  73. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/tray.py +0 -0
  74. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/macos_tray.py +0 -0
  75. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/port.py +0 -0
  76. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/thread.py +0 -0
  77. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/windows_tray.py +0 -0
  78. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/webapp.py +0 -0
  79. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/wrapped_server.py +0 -0
  80. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/tool.py +0 -0
  81. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_install/__init__.py +0 -0
  82. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_install/install.py +0 -0
  83. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/SOURCES.txt +0 -0
  84. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  85. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  86. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/requires.txt +0 -0
  87. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  88. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/cli.py +0 -0
  89. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/model_manager.py +0 -0
  90. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/pydantic_models.py +0 -0
  91. {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/settings.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.1.11
+Version: 8.1.12
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.14
@@ -264,6 +264,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).
@@ -207,6 +207,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).
@@ -43,7 +43,11 @@ def build_name(input_name):
     """

     if os.path.isdir(input_name):
+        # Input is a folder so no good way to determine a model name
         input_name_sanitized = "local_model"
+    elif os.path.isfile(input_name):
+        # Use the filename without its extension
+        input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")
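
For orientation, a minimal standalone sketch of the updated naming behavior above (an illustration, not the packaged function): a directory still maps to "local_model", a file path now maps to its basename without the extension, and anything else is treated as a checkpoint name and sanitized.

    import os

    def build_name_sketch(input_name):
        # Folder: no good way to determine a model name
        if os.path.isdir(input_name):
            return "local_model"
        # File: use the filename without its extension (new in 8.1.12)
        if os.path.isfile(input_name):
            return os.path.splitext(os.path.basename(input_name))[0]
        # Otherwise: sanitize a checkpoint-style name
        return input_name.replace("/", "_")

    # e.g. an existing local file "models/llama-q4.gguf" would yield "llama-q4"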
@@ -63,8 +67,9 @@ class Keys:
     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
-    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+    STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
     PROMPT = "prompt"
@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
         if print_file_name:
             print(f"{self.script_name}{self.extension}:")

-        # Print invocation about the model (only applies to scripts, not ONNX files or
+        # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
         # LLMs, which have no extension)
         if not (
-            self.extension == ".onnx"
+            self.extension in [".onnx", ".gguf"]
             or self.extension == build.state_file_name
             or self.extension == ""
         ):
@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):

         if self.depth == 0:
             print(f"{self.indent}\tLocation:\t{self.file}", end="")
-            if self.extension == ".onnx":
+            if self.extension in [".onnx", ".gguf"]:
                 print()
             else:
                 print(f", line {self.line}")
@@ -314,7 +314,7 @@ class UniqueInvocationInfo(BasicInfo):
         Print information about a given model or submodel.
         """

-        if self.extension == ".onnx" or self.extension == "":
+        if self.extension in [".onnx", ".gguf"] or self.extension == "":
             self.indent = "\t" * (2 * self.depth)
         else:
             self.indent = "\t" * (2 * self.depth + 1)
@@ -29,7 +29,9 @@ class Bench(Tool, ABC):
         Keys.SECONDS_TO_FIRST_TOKEN,
         Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
         Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        Keys.STD_DEV_TOKENS_PER_SECOND,
         Keys.PREFILL_TOKENS_PER_SECOND,
+        Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
         Keys.PROMPT_TOKENS,
         Keys.RESPONSE_TOKENS,
         Keys.MAX_MEMORY_USED_GBYTE,
@@ -42,7 +44,9 @@ class Bench(Tool, ABC):
         self.mean_time_to_first_token_list = []
         self.std_dev_time_to_first_token_list = []
         self.prefill_tokens_per_second_list = []
+        self.std_dev_prefill_tokens_per_second_list = []
         self.token_generation_tokens_per_second_list = []
+        self.std_dev_token_generation_tokens_per_second_list = []
         self.max_memory_used_gb_list = []

         # Max memory used can only be measured on Windows systems
@@ -88,7 +92,7 @@ class Bench(Tool, ABC):
             default=[str(default_prompt_length)],
             metavar="PROMPT",
             help="Input one or more prompts to the LLM. Three formats are supported. "
-            "1) integer: use a synthetic prompt with the specified length "
+            "1) integer: use a synthetic prompt with the specified token length "
            "2) str: use a user-provided prompt string "
             "3) path/to/prompt.txt: load the prompt from a text file. "
             f"(default: {default_prompt_length}) ",
@@ -246,10 +250,27 @@ class Bench(Tool, ABC):
             Keys.PREFILL_TOKENS_PER_SECOND,
             self.get_item_or_list(self.prefill_tokens_per_second_list),
         )
+        if not all(
+            element is None for element in self.std_dev_prefill_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+                self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+            )
         state.save_stat(
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             self.get_item_or_list(self.token_generation_tokens_per_second_list),
         )
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
         if self.save_max_memory_used:
             state.save_stat(
                 Keys.MAX_MEMORY_USED_GBYTE,
@@ -13,7 +13,7 @@ import requests
 from packaging.version import Version


-FLM_MINIMUM_VERSION = "0.9.10"
+FLM_MINIMUM_VERSION = "0.9.12"


 def check_flm_version() -> Optional[str]:
@@ -0,0 +1,224 @@
+import argparse
+import statistics
+from statistics import StatisticsError
+from lemonade.state import State
+from lemonade.tools.tool import Tool
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
+from lemonade.tools.bench import (
+    Bench,
+    default_prompt_length,
+    default_iterations,
+    default_output_tokens,
+    default_warmup_runs,
+)
+
+
+class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
+
+    unique_name = "llamacpp-bench"
+
+    def __init__(self, monitor_message="Benchmarking LLM"):
+        super().__init__(monitor_message)
+
+        # Don't track memory usage since we are using a llamacpp executable for compute
+        self.save_max_memory_used = False
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Benchmark an LLM in llama.cpp",
+            add_help=add_help,
+        )
+
+        parser = Bench.parser(parser)
+
+        parser.add_argument(
+            "--cli",
+            action="store_true",
+            help="Set this flag to use llama-cli.exe to benchmark model performance. This executable will be called "
+            "once per iteration. Otherwise, llama-bench.exe is used by default. In this default behavior behavior, "
+            "the only valid prompt format is integer token lengths. Also, the warmup-iterations parameter is "
+            "ignored and the default value for number of threads is 16.",
+        )
+
+        return parser
+
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+        """
+        Helper function to parse CLI arguments into the args expected by run()
+        """
+
+        # Call Tool parse method, NOT the Bench parse method
+        parsed_args = Tool.parse(self, state, args, known_only)
+
+        if parsed_args.cli:
+            parsed_args = super().parse(state, args, known_only)
+        else:
+            # Make sure prompts is a list of integers
+            if parsed_args.prompts is None:
+                parsed_args.prompts = [default_prompt_length]
+            prompt_ints = []
+            for prompt_item in parsed_args.prompts:
+                if prompt_item.isdigit():
+                    prompt_ints.append(int(prompt_item))
+                else:
+                    raise Exception(
+                        f"When not using the --cli flag to {self.unique_name}, the prompt format must "
+                        "be in integer format."
+                    )
+            parsed_args.prompts = prompt_ints
+
+        return parsed_args
+
+    def run_prompt(
+        self,
+        state: State,
+        report_progress_fn,
+        prompt: str,
+        iterations: int,
+        warmup_iterations: int,
+        output_tokens: int,
+    ) -> State:
+        """
+        Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+        """
+
+        if self.first_run_prompt:
+
+            if not hasattr(state, "model") or not isinstance(
+                state.model, LlamaCppAdapter
+            ):
+                raise Exception(
+                    f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+                    "loaded first. Please run load-llama-cpp before this tool."
+                )
+        model: LlamaCppAdapter = state.model
+
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
+
+        for iteration in range(iterations + warmup_iterations):
+            try:
+                # Use the adapter's generate method which already has the timeout
+                # and error handling
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(
+                    prompt, max_new_tokens=output_tokens, return_raw=True
+                )
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
+                    error_msg = (
+                        "Could not find timing information in llama.cpp output.\n"
+                    )
+                    error_msg += "Raw output:\n" + raw_output + "\n"
+                    error_msg += "Stderr:\n" + stderr
+                    raise Exception(error_msg)
+
+                self.tokens_out_len_list.append(model.response_tokens)
+
+                if iteration > warmup_iterations - 1:
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
+
+                report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+            except Exception as e:
+                error_msg = f"Failed to run benchmark: {str(e)}"
+                raise Exception(error_msg)
+
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
+        self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+        self.prefill_tokens_per_second_list.append(
+            model.prompt_tokens / mean_time_to_first_token
+        )
+        self.token_generation_tokens_per_second_list.append(
+            statistics.mean(per_iteration_tokens_per_second)
+        )
+        try:
+            self.std_dev_time_to_first_token_list.append(
+                statistics.stdev(per_iteration_time_to_first_token)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_time_to_first_token_list.append(None)
+        try:
+            self.std_dev_token_generation_tokens_per_second_list.append(
+                statistics.stdev(per_iteration_tokens_per_second)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_token_generation_tokens_per_second_list.append(None)
+
+    def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+        if prompts is None:
+            prompts = [default_prompt_length]
+        elif isinstance(prompts, int):
+            prompts = [prompts]
+
+        state.save_stat("prompts", prompts)
+        state.save_stat("iterations", iterations)
+        state.save_stat("output_tokens", output_tokens)
+
+        model: LlamaCppAdapter = state.model
+        prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd = model.benchmark(
+            prompts, iterations, output_tokens
+        )
+        self.input_ids_len_list = prompt_lengths
+        self.prefill_tokens_per_second_list = pp_tps
+        if iterations > 1:
+            self.std_dev_prefill_tokens_per_second_list = pp_tps_sd
+        self.mean_time_to_first_token_list = [
+            tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)
+        ]
+        self.token_generation_tokens_per_second_list = [tg_tps]
+        if iterations > 1:
+            self.std_dev_token_generation_tokens_per_second_list = [tg_tps_sd]
+        self.tokens_out_len_list = [output_tokens] * len(prompts) * iterations
+
+        self.save_stats(state)
+        return state
+
+    def run(
+        self,
+        state: State,
+        prompts: list[str] = None,
+        iterations: int = default_iterations,
+        warmup_iterations: int = default_warmup_runs,
+        output_tokens: int = default_output_tokens,
+        cli: bool = False,
+        **kwargs,
+    ) -> State:
+        """
+        Args:
+        - prompts: List of input prompts used as starting points for LLM text generation
+        - iterations: Number of benchmarking samples to take; results are
+            reported as the median and mean of the samples.
+        - warmup_iterations: Subset of the iterations to treat as warmup,
+            and not included in the results.
+        - output_tokens: Number of new tokens LLM to create.
+        - ggml: Use llama-bench.exe directly
+        - kwargs: Additional parameters used by bench tools
+        """

+        # Check that state has the attribute model and it is a LlamaCPP model
+        if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+            raise Exception("Load model using llamacpp-load first.")
+
+        if cli:
+            state = super().run(
+                state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+            )
+        else:
+            state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
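
Worth noting from run_llama_bench_exe() above: llama-bench reports prefill throughput rather than latency, so the tool derives mean time to first token as prompt length divided by prefill tokens per second. A quick sanity check of that conversion (the numbers are invented for illustration):

    prompt_tokens = 128
    prefill_tps = 412.31                  # prefill tokens/second reported by llama-bench
    ttft = prompt_tokens / prefill_tps    # ~0.31 seconds to first token
    print(round(ttft, 3))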
@@ -93,6 +93,7 @@ class LoadLlamaCpp(FirstTool):
         from lemonade.tools.llamacpp.utils import (
             install_llamacpp,
             get_llama_cli_exe_path,
+            get_llama_bench_exe_path,
             get_llama_installed_version,
             parse_checkpoint,
             download_gguf,
@@ -103,6 +104,8 @@ class LoadLlamaCpp(FirstTool):

         install_llamacpp(backend)

+        extension = ""
+
         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
             # input is a local folder
@@ -121,6 +124,17 @@ class LoadLlamaCpp(FirstTool):
             )
             model_to_use = gguf_files[0]
             full_model_path = os.path.join(local_model_folder, model_to_use)
+            extension = ".gguf"
+
+        elif input.endswith(".gguf") and os.path.isfile(input):
+            # input is a local .gguf file
+            full_model_path = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+            model_to_use = os.path.basename(full_model_path)
+            extension = ".gguf"

         else:
             # Input is a model checkpoint
@@ -161,6 +175,7 @@ class LoadLlamaCpp(FirstTool):
             model_to_use = os.path.basename(full_model_path)

         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+        llama_bench_exe_path = get_llama_bench_exe_path(backend)
         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")

         # Get the directory containing the executable for shared libraries
@@ -174,8 +189,10 @@ class LoadLlamaCpp(FirstTool):
             context_size=context_size,
             threads=threads,
             executable=llama_cli_exe_path,
+            bench_executable=llama_bench_exe_path,
             reasoning=reasoning,
             lib_dir=lib_dir,
+            state=state,
         )
         state.tokenizer = LlamaCppTokenizerAdapter()
         state.device = device
@@ -186,7 +203,9 @@ class LoadLlamaCpp(FirstTool):
             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
         )

-        status.add_to_state(state=state, name=input, model=model_to_use)
+        status.add_to_state(
+            state=state, name=input, model=model_to_use, extension=extension
+        )
         return state


@@ -7,6 +7,7 @@ import zipfile
 from typing import Optional
 import subprocess
 import requests
+import lemonade.common.build as build
 import lemonade.common.printing as printing
 from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter

@@ -175,6 +176,13 @@ def get_llama_cli_exe_path(backend: str):
     return get_llama_exe_path("llama-cli", backend)


+def get_llama_bench_exe_path(backend: str):
+    """
+    Get path to platform-specific llama-bench executable
+    """
+    return get_llama_exe_path("llama-bench", backend)
+
+
 def get_version_txt_path(backend: str):
     """
     Get path to text file that contains version information
@@ -406,6 +414,7 @@ def install_llamacpp(backend):
     exe_paths = [
         (get_llama_server_exe_path(backend), "llama-server"),
         (get_llama_cli_exe_path(backend), "llama-cli"),
+        (get_llama_bench_exe_path(backend), "llama-bench"),
     ]

     for exe_path, exe_name in exe_paths:
@@ -699,8 +708,10 @@ class LlamaCppAdapter(ModelAdapter):
        context_size,
        threads,
        executable,
+       bench_executable,
        reasoning=False,
        lib_dir=None,
+       state=None,
    ):
        super().__init__()

@@ -712,8 +723,10 @@ class LlamaCppAdapter(ModelAdapter):
         self.context_size = context_size
         self.threads = threads
         self.executable = os.path.normpath(executable)
+        self.bench_executable = os.path.normpath(bench_executable)
         self.reasoning = reasoning
         self.lib_dir = lib_dir
+        self.state = state

     def generate(
         self,
@@ -754,32 +767,54 @@
             self.executable,
             "-m",
             self.model,
-            "--ctx-size",
+            "--ctx-size",  # size of the prompt context, 0 = loaded from model
             str(self.context_size),
-            "-n",
+            "-n",  # number of tokens to predict, -1 = infinity, =2 - until context filled
             str(n_predict),
-            "-t",
+            "-t",  # number of threads to use during generation
             str(self.threads),
             "-p",
             prompt,
+            "-b",  # logical maximum batch size
+            "1",
+            "-ub",  # physical maximum batch size
+            "1",
             "--temp",
             str(temperature),
             "--top-p",
             str(top_p),
             "--top-k",
             str(top_k),
-            "-e",
-            "-no-cnv",
-            "--reasoning-format",
+            "-e",  # process escape sequences
+            "--no-conversation",  # disable conversation mode
+            "--reasoning-format",  # leaves thoughts unparsed in message content
             "none",
         ]

+        # If prompt exceeds 500 characters, then use a file
+        if len(prompt) < 500:
+            cmd += ["-p", prompt]
+        else:
+            # Create prompt file in cache directory
+            prompt_file = os.path.join(
+                build.output_dir(self.state.cache_dir, self.state.build_name),
+                "prompt.txt",
+            )
+            with open(prompt_file, "w", encoding="utf-8") as file:
+                file.write(prompt)
+            cmd += ["-f", prompt_file]
+
         # Configure GPU layers: 99 for GPU, 0 for CPU-only
         ngl_value = "99" if self.device == "igpu" else "0"
         cmd = cmd + ["-ngl", ngl_value]

         cmd = [str(m) for m in cmd]

+        # save llama-cli command
+        self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+            " ".join(cmd)
+        ]
+
         try:
             # Set up environment with library path for Linux
             env = os.environ.copy()
@@ -809,6 +844,15 @@
             )

             raw_output, stderr = process.communicate(timeout=600)
+
+            # save llama-cli command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_cli_stderr = getattr(
+                self.state, "llama_cli_stderr", []
+            ) + [
+                [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+            ]
+
             if process.returncode != 0:
                 error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                 error_msg += f"Command: {' '.join(cmd)}\n"
@@ -873,7 +917,108 @@
             return [output_text]

         except Exception as e:
-            error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
+            error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
+            error_msg += f"Command: {' '.join(cmd)}"
+            raise Exception(error_msg)
+
+    def benchmark(self, prompts, iterations, output_tokens):
+        """
+        Runs the llama-bench.exe tool to measure TTFT and TPS
+        """
+        cmd = [
+            self.bench_executable,
+            "-m",
+            self.model,
+            "-r",
+            iterations,
+            "-p",
+            ",".join([str(p) for p in prompts]),
+            "-n",
+            output_tokens,
+            "-t",
+            self.threads if self.threads > 0 else 16,
+            "-b",
+            1,
+            "-ub",
+            1,
+        ]
+        cmd = [str(m) for m in cmd]
+
+        # save llama-bench command
+        self.state.llama_bench_cmd = " ".join(cmd)
+
+        try:
+            # Set up environment with library path for Linux
+            env = os.environ.copy()
+
+            # Load environment variables from .env file in the executable directory
+            exe_dir = os.path.dirname(self.executable)
+            env_file_path = os.path.join(exe_dir, ".env")
+            if os.path.exists(env_file_path):
+                load_dotenv(env_file_path, override=True)
+                env.update(os.environ)
+
+            if self.lib_dir and os.name != "nt":  # Not Windows
+                current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                if current_ld_path:
+                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                else:
+                    env["LD_LIBRARY_PATH"] = self.lib_dir
+
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+                encoding="utf-8",
+                errors="replace",
+                env=env,
+            )
+
+            raw_output, stderr = process.communicate(timeout=600)
+
+            # save llama-bench command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_bench_standard_output = raw_output.splitlines()
+
+            if process.returncode != 0:
+                error_msg = (
+                    f"llama-bench.exe failed with return code {process.returncode}.\n"
+                )
+                error_msg += f"Command: {' '.join(cmd)}\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{raw_output}"
+                raise Exception(error_msg)
+
+            if raw_output is None:
+                raise Exception("No output received from llama-bench.exe process")
+
+            # Parse information from llama-bench.exe output
+            prompt_lengths = []
+            pp_tps = []
+            pp_tps_sd = []
+            tg_tps = None
+            tg_tps_sd = None
+
+            for line in self.state.llama_bench_standard_output:
+                # Parse TPS information
+                for p in prompts:
+                    if f"pp{p:d}" in line:
+                        parts = line.split("|")
+                        timings = parts[-2].strip().split(" ")
+                        prompt_lengths.append(p)
+                        pp_tps.append(float(timings[0]))
+                        pp_tps_sd.append(float(timings[-1]))
+                if f"tg{output_tokens:d}" in line:
+                    parts = line.split("|")
+                    timings = parts[-2].strip().split(" ")
+                    tg_tps = float(timings[0])
+                    tg_tps_sd = float(timings[-1])
+
+            return prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd
+
+        except Exception as e:
+            error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
             error_msg += f"Command: {' '.join(cmd)}"
             raise Exception(error_msg)

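
The table parsing in benchmark() above relies on llama-bench's markdown-style output, where the next-to-last column holds "mean ± std dev" tokens per second for each pp/tg test. A minimal sketch of that parsing on an illustrative row (the model name and numbers below are made up, not taken from this package):

    # Illustrative llama-bench style table row; values are invented for the example
    sample = "| llama 8B Q4_K | 4.58 GiB | 8.03 B | CPU | 16 | pp128 | 412.31 ± 5.12 |"

    parts = sample.split("|")
    timings = parts[-2].strip().split(" ")  # ["412.31", "±", "5.12"]
    mean_tps = float(timings[0])            # mean tokens/second for the test
    std_dev_tps = float(timings[-1])        # standard deviation across repetitions
    print(mean_tps, std_dev_tps)            # 412.31 5.12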