lemonade-sdk 8.1.11__tar.gz → 8.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- {lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.12}/PKG-INFO +2 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/README.md +1 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/cache.py +6 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/status.py +4 -4
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/bench.py +22 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/flm/utils.py +1 -1
- lemonade_sdk-8.1.12/src/lemonade/tools/llamacpp/bench.py +224 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/llamacpp/load.py +20 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/llamacpp/utils.py +152 -7
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/bench.py +0 -26
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/report/table.py +6 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/flm.py +2 -6
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/serve.py +1 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/shared.js +4 -3
- lemonade_sdk-8.1.12/src/lemonade/tools/server/static/logs.html +57 -0
- lemonade_sdk-8.1.12/src/lemonade/version.py +1 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12/src/lemonade_sdk.egg-info}/PKG-INFO +2 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/server_models.json +14 -1
- lemonade_sdk-8.1.11/src/lemonade/tools/llamacpp/bench.py +0 -136
- lemonade_sdk-8.1.11/src/lemonade/tools/server/static/logs.html +0 -47
- lemonade_sdk-8.1.11/src/lemonade/version.py +0 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/LICENSE +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/pyproject.toml +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/setup.cfg +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/setup.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/inference_engines.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/network.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/agt_power.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/hwinfo_power.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/flm/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/huggingface/utils.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/load.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/oga/utils.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/prompt.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/llamacpp.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/chat.js +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/js/models.js +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/styles.css +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/static/webapp.html +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/macos_tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/port.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/utils/windows_tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/server/wrapped_server.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_install/install.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/SOURCES.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/requires.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/cli.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/model_manager.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/pydantic_models.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade_server/settings.py +0 -0
{lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.1.11
+Version: 8.1.12
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.14

@@ -264,6 +264,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).

{lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/README.md

@@ -207,6 +207,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).

{lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/cache.py

@@ -43,7 +43,11 @@ def build_name(input_name):
     """

     if os.path.isdir(input_name):
+        # Input is a folder so no good way to determine a model name
         input_name_sanitized = "local_model"
+    elif os.path.isfile(input_name):
+        # Use the filename without its extension
+        input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")

@@ -63,8 +67,9 @@ class Keys:
     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
-    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+    STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
     PROMPT = "prompt"

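For reference, a minimal sketch of how the updated naming logic in `build_name` treats the three kinds of input (a local folder, a local file, and a checkpoint string). The helper name `build_name_sketch` and the example paths are hypothetical; the function only mirrors the branches shown in the hunk above:

```python
import os

def build_name_sketch(input_name: str) -> str:
    # Hypothetical stand-in that mirrors the branches added to build_name above
    if os.path.isdir(input_name):
        # A folder gives no good way to determine a model name
        return "local_model"
    elif os.path.isfile(input_name):
        # A file is named after its basename, without the extension
        return os.path.splitext(os.path.basename(input_name))[0]
    else:
        # Anything else is treated as a checkpoint name and sanitized
        return input_name.replace("/", "_")

# Assuming ./models/qwen2.5-0.5b-q4_0.gguf exists on disk:
#   build_name_sketch("models/qwen2.5-0.5b-q4_0.gguf") -> "qwen2.5-0.5b-q4_0"
# A Hugging Face-style checkpoint string is sanitized instead:
#   build_name_sketch("org/model-name") -> "org_model-name"
```
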
{lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/common/status.py

@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
         if print_file_name:
             print(f"{self.script_name}{self.extension}:")

-        # Print invocation about the model (only applies to scripts, not ONNX files
+        # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
         # LLMs, which have no extension)
         if not (
-            self.extension
+            self.extension in [".onnx", ".gguf"]
             or self.extension == build.state_file_name
             or self.extension == ""
         ):

@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):

         if self.depth == 0:
             print(f"{self.indent}\tLocation:\t{self.file}", end="")
-            if self.extension
+            if self.extension in [".onnx", ".gguf"]:
                 print()
             else:
                 print(f", line {self.line}")

@@ -314,7 +314,7 @@ class UniqueInvocationInfo(BasicInfo):
         Print information about a given model or submodel.
         """

-        if self.extension
+        if self.extension in [".onnx", ".gguf"] or self.extension == "":
             self.indent = "\t" * (2 * self.depth)
         else:
             self.indent = "\t" * (2 * self.depth + 1)

{lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/bench.py

@@ -29,7 +29,9 @@ class Bench(Tool, ABC):
         Keys.SECONDS_TO_FIRST_TOKEN,
         Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
         Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        Keys.STD_DEV_TOKENS_PER_SECOND,
         Keys.PREFILL_TOKENS_PER_SECOND,
+        Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
         Keys.PROMPT_TOKENS,
         Keys.RESPONSE_TOKENS,
         Keys.MAX_MEMORY_USED_GBYTE,

@@ -42,7 +44,9 @@ class Bench(Tool, ABC):
         self.mean_time_to_first_token_list = []
         self.std_dev_time_to_first_token_list = []
         self.prefill_tokens_per_second_list = []
+        self.std_dev_prefill_tokens_per_second_list = []
         self.token_generation_tokens_per_second_list = []
+        self.std_dev_token_generation_tokens_per_second_list = []
         self.max_memory_used_gb_list = []

         # Max memory used can only be measured on Windows systems

@@ -88,7 +92,7 @@ class Bench(Tool, ABC):
             default=[str(default_prompt_length)],
             metavar="PROMPT",
             help="Input one or more prompts to the LLM. Three formats are supported. "
-            "1) integer: use a synthetic prompt with the specified length "
+            "1) integer: use a synthetic prompt with the specified token length "
             "2) str: use a user-provided prompt string "
             "3) path/to/prompt.txt: load the prompt from a text file. "
             f"(default: {default_prompt_length}) ",

@@ -246,10 +250,27 @@ class Bench(Tool, ABC):
             Keys.PREFILL_TOKENS_PER_SECOND,
             self.get_item_or_list(self.prefill_tokens_per_second_list),
         )
+        if not all(
+            element is None for element in self.std_dev_prefill_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+                self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+            )
         state.save_stat(
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             self.get_item_or_list(self.token_generation_tokens_per_second_list),
         )
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
         if self.save_max_memory_used:
             state.save_stat(
                 Keys.MAX_MEMORY_USED_GBYTE,

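One note on the new `if not all(element is None ...)` guards above: a standard deviation only exists when at least two non-warmup iterations were measured, so the per-prompt lists can legitimately contain `None` entries, and the corresponding stat is skipped when every entry is `None`. A minimal standard-library sketch of the pattern (the helper name is hypothetical):

```python
import statistics

def std_dev_or_none(samples):
    # Mirrors the bench tools' pattern: stdev is undefined for fewer than 2 samples
    try:
        return statistics.stdev(samples)
    except statistics.StatisticsError:
        return None  # stored in the list; save_stats() then skips the stat

print(std_dev_or_none([42.1]))        # None  -> std-dev stat is not saved
print(std_dev_or_none([42.1, 43.7]))  # ~1.13 -> std-dev stat is saved
```
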
lemonade_sdk-8.1.12/src/lemonade/tools/llamacpp/bench.py (new file)

@@ -0,0 +1,224 @@
+import argparse
+import statistics
+from statistics import StatisticsError
+from lemonade.state import State
+from lemonade.tools.tool import Tool
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
+from lemonade.tools.bench import (
+    Bench,
+    default_prompt_length,
+    default_iterations,
+    default_output_tokens,
+    default_warmup_runs,
+)
+
+
+class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
+
+    unique_name = "llamacpp-bench"
+
+    def __init__(self, monitor_message="Benchmarking LLM"):
+        super().__init__(monitor_message)
+
+        # Don't track memory usage since we are using a llamacpp executable for compute
+        self.save_max_memory_used = False
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Benchmark an LLM in llama.cpp",
+            add_help=add_help,
+        )
+
+        parser = Bench.parser(parser)
+
+        parser.add_argument(
+            "--cli",
+            action="store_true",
+            help="Set this flag to use llama-cli.exe to benchmark model performance. This executable will be called "
+            "once per iteration. Otherwise, llama-bench.exe is used by default. In this default behavior behavior, "
+            "the only valid prompt format is integer token lengths. Also, the warmup-iterations parameter is "
+            "ignored and the default value for number of threads is 16.",
+        )
+
+        return parser
+
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+        """
+        Helper function to parse CLI arguments into the args expected by run()
+        """
+
+        # Call Tool parse method, NOT the Bench parse method
+        parsed_args = Tool.parse(self, state, args, known_only)
+
+        if parsed_args.cli:
+            parsed_args = super().parse(state, args, known_only)
+        else:
+            # Make sure prompts is a list of integers
+            if parsed_args.prompts is None:
+                parsed_args.prompts = [default_prompt_length]
+            prompt_ints = []
+            for prompt_item in parsed_args.prompts:
+                if prompt_item.isdigit():
+                    prompt_ints.append(int(prompt_item))
+                else:
+                    raise Exception(
+                        f"When not using the --cli flag to {self.unique_name}, the prompt format must "
+                        "be in integer format."
+                    )
+            parsed_args.prompts = prompt_ints
+
+        return parsed_args
+
+    def run_prompt(
+        self,
+        state: State,
+        report_progress_fn,
+        prompt: str,
+        iterations: int,
+        warmup_iterations: int,
+        output_tokens: int,
+    ) -> State:
+        """
+        Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+        """
+
+        if self.first_run_prompt:
+
+            if not hasattr(state, "model") or not isinstance(
+                state.model, LlamaCppAdapter
+            ):
+                raise Exception(
+                    f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+                    "loaded first. Please run load-llama-cpp before this tool."
+                )
+        model: LlamaCppAdapter = state.model
+
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
+
+        for iteration in range(iterations + warmup_iterations):
+            try:
+                # Use the adapter's generate method which already has the timeout
+                # and error handling
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(
+                    prompt, max_new_tokens=output_tokens, return_raw=True
+                )
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
+                    error_msg = (
+                        "Could not find timing information in llama.cpp output.\n"
+                    )
+                    error_msg += "Raw output:\n" + raw_output + "\n"
+                    error_msg += "Stderr:\n" + stderr
+                    raise Exception(error_msg)
+
+                self.tokens_out_len_list.append(model.response_tokens)
+
+                if iteration > warmup_iterations - 1:
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
+
+                report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+            except Exception as e:
+                error_msg = f"Failed to run benchmark: {str(e)}"
+                raise Exception(error_msg)
+
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
+        self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+        self.prefill_tokens_per_second_list.append(
+            model.prompt_tokens / mean_time_to_first_token
+        )
+        self.token_generation_tokens_per_second_list.append(
+            statistics.mean(per_iteration_tokens_per_second)
+        )
+        try:
+            self.std_dev_time_to_first_token_list.append(
+                statistics.stdev(per_iteration_time_to_first_token)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_time_to_first_token_list.append(None)
+        try:
+            self.std_dev_token_generation_tokens_per_second_list.append(
+                statistics.stdev(per_iteration_tokens_per_second)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_token_generation_tokens_per_second_list.append(None)
+
+    def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+        if prompts is None:
+            prompts = [default_prompt_length]
+        elif isinstance(prompts, int):
+            prompts = [prompts]
+
+        state.save_stat("prompts", prompts)
+        state.save_stat("iterations", iterations)
+        state.save_stat("output_tokens", output_tokens)
+
+        model: LlamaCppAdapter = state.model
+        prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd = model.benchmark(
+            prompts, iterations, output_tokens
+        )
+        self.input_ids_len_list = prompt_lengths
+        self.prefill_tokens_per_second_list = pp_tps
+        if iterations > 1:
+            self.std_dev_prefill_tokens_per_second_list = pp_tps_sd
+        self.mean_time_to_first_token_list = [
+            tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)
+        ]
+        self.token_generation_tokens_per_second_list = [tg_tps]
+        if iterations > 1:
+            self.std_dev_token_generation_tokens_per_second_list = [tg_tps_sd]
+        self.tokens_out_len_list = [output_tokens] * len(prompts) * iterations
+
+        self.save_stats(state)
+        return state
+
+    def run(
+        self,
+        state: State,
+        prompts: list[str] = None,
+        iterations: int = default_iterations,
+        warmup_iterations: int = default_warmup_runs,
+        output_tokens: int = default_output_tokens,
+        cli: bool = False,
+        **kwargs,
+    ) -> State:
+        """
+        Args:
+            - prompts: List of input prompts used as starting points for LLM text generation
+            - iterations: Number of benchmarking samples to take; results are
+                reported as the median and mean of the samples.
+            - warmup_iterations: Subset of the iterations to treat as warmup,
+                and not included in the results.
+            - output_tokens: Number of new tokens LLM to create.
+            - ggml: Use llama-bench.exe directly
+            - kwargs: Additional parameters used by bench tools
+        """
+
+        # Check that state has the attribute model and it is a LlamaCPP model
+        if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+            raise Exception("Load model using llamacpp-load first.")
+
+        if cli:
+            state = super().run(
+                state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+            )
+        else:
+            state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD

{lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/llamacpp/load.py

@@ -93,6 +93,7 @@ class LoadLlamaCpp(FirstTool):
         from lemonade.tools.llamacpp.utils import (
             install_llamacpp,
             get_llama_cli_exe_path,
+            get_llama_bench_exe_path,
             get_llama_installed_version,
             parse_checkpoint,
             download_gguf,

@@ -103,6 +104,8 @@ class LoadLlamaCpp(FirstTool):

         install_llamacpp(backend)

+        extension = ""
+
         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
             # input is a local folder

@@ -121,6 +124,17 @@ class LoadLlamaCpp(FirstTool):
                 )
             model_to_use = gguf_files[0]
             full_model_path = os.path.join(local_model_folder, model_to_use)
+            extension = ".gguf"
+
+        elif input.endswith(".gguf") and os.path.isfile(input):
+            # input is a local .gguf file
+            full_model_path = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+            model_to_use = os.path.basename(full_model_path)
+            extension = ".gguf"

         else:
             # Input is a model checkpoint

@@ -161,6 +175,7 @@ class LoadLlamaCpp(FirstTool):
             model_to_use = os.path.basename(full_model_path)

         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+        llama_bench_exe_path = get_llama_bench_exe_path(backend)
         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")

         # Get the directory containing the executable for shared libraries

@@ -174,8 +189,10 @@ class LoadLlamaCpp(FirstTool):
             context_size=context_size,
             threads=threads,
             executable=llama_cli_exe_path,
+            bench_executable=llama_bench_exe_path,
             reasoning=reasoning,
             lib_dir=lib_dir,
+            state=state,
         )
         state.tokenizer = LlamaCppTokenizerAdapter()
         state.device = device

@@ -186,7 +203,9 @@ class LoadLlamaCpp(FirstTool):
             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
         )

-        status.add_to_state(
+        status.add_to_state(
+            state=state, name=input, model=model_to_use, extension=extension
+        )
         return state


{lemonade_sdk-8.1.11 → lemonade_sdk-8.1.12}/src/lemonade/tools/llamacpp/utils.py

@@ -7,6 +7,7 @@ import zipfile
 from typing import Optional
 import subprocess
 import requests
+import lemonade.common.build as build
 import lemonade.common.printing as printing
 from lemonade.tools.adapter import PassthroughTokenizer, ModelAdapter


@@ -175,6 +176,13 @@ def get_llama_cli_exe_path(backend: str):
     return get_llama_exe_path("llama-cli", backend)


+def get_llama_bench_exe_path(backend: str):
+    """
+    Get path to platform-specific llama-bench executable
+    """
+    return get_llama_exe_path("llama-bench", backend)
+
+
 def get_version_txt_path(backend: str):
     """
     Get path to text file that contains version information

@@ -406,6 +414,7 @@ def install_llamacpp(backend):
     exe_paths = [
         (get_llama_server_exe_path(backend), "llama-server"),
         (get_llama_cli_exe_path(backend), "llama-cli"),
+        (get_llama_bench_exe_path(backend), "llama-bench"),
     ]

     for exe_path, exe_name in exe_paths:

@@ -699,8 +708,10 @@ class LlamaCppAdapter(ModelAdapter):
         context_size,
         threads,
         executable,
+        bench_executable,
         reasoning=False,
         lib_dir=None,
+        state=None,
     ):
         super().__init__()


@@ -712,8 +723,10 @@ class LlamaCppAdapter(ModelAdapter):
         self.context_size = context_size
         self.threads = threads
         self.executable = os.path.normpath(executable)
+        self.bench_executable = os.path.normpath(bench_executable)
         self.reasoning = reasoning
         self.lib_dir = lib_dir
+        self.state = state

     def generate(
         self,

@@ -754,32 +767,54 @@ class LlamaCppAdapter(ModelAdapter):
             self.executable,
             "-m",
             self.model,
-            "--ctx-size",
+            "--ctx-size",  # size of the prompt context, 0 = loaded from model
             str(self.context_size),
-            "-n",
+            "-n",  # number of tokens to predict, -1 = infinity, -2 = until context filled
             str(n_predict),
-            "-t",
+            "-t",  # number of threads to use during generation
             str(self.threads),
             "-p",
             prompt,
+            "-b",  # logical maximum batch size
+            "1",
+            "-ub",  # physical maximum batch size
+            "1",
             "--temp",
             str(temperature),
             "--top-p",
             str(top_p),
             "--top-k",
             str(top_k),
-            "-e",
-            "
-            "--reasoning-format",
+            "-e",  # process escape sequences
+            "--no-conversation",  # disable conversation mode
+            "--reasoning-format",  # leaves thoughts unparsed in message content
             "none",
         ]

+        # If prompt exceeds 500 characters, then use a file
+        if len(prompt) < 500:
+            cmd += ["-p", prompt]
+        else:
+            # Create prompt file in cache directory
+            prompt_file = os.path.join(
+                build.output_dir(self.state.cache_dir, self.state.build_name),
+                "prompt.txt",
+            )
+            with open(prompt_file, "w", encoding="utf-8") as file:
+                file.write(prompt)
+            cmd += ["-f", prompt_file]
+
         # Configure GPU layers: 99 for GPU, 0 for CPU-only
         ngl_value = "99" if self.device == "igpu" else "0"
         cmd = cmd + ["-ngl", ngl_value]

         cmd = [str(m) for m in cmd]

+        # save llama-cli command
+        self.state.llama_cli_cmd = getattr(self.state, "llama_cli_cmd", []) + [
+            " ".join(cmd)
+        ]
+
         try:
             # Set up environment with library path for Linux
             env = os.environ.copy()

@@ -809,6 +844,15 @@ class LlamaCppAdapter(ModelAdapter):
             )

             raw_output, stderr = process.communicate(timeout=600)
+
+            # save llama-cli command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_cli_stderr = getattr(
+                self.state, "llama_cli_stderr", []
+            ) + [
+                [line for line in stderr.splitlines() if line.startswith("llama_perf_")]
+            ]
+
             if process.returncode != 0:
                 error_msg = f"llama.cpp failed with return code {process.returncode}.\n"
                 error_msg += f"Command: {' '.join(cmd)}\n"

@@ -873,7 +917,108 @@ class LlamaCppAdapter(ModelAdapter):
                 return [output_text]

         except Exception as e:
-            error_msg = f"Failed to run llama.
+            error_msg = f"Failed to run llama-cli.exe command: {str(e)}\n"
+            error_msg += f"Command: {' '.join(cmd)}"
+            raise Exception(error_msg)
+
+    def benchmark(self, prompts, iterations, output_tokens):
+        """
+        Runs the llama-bench.exe tool to measure TTFT and TPS
+        """
+        cmd = [
+            self.bench_executable,
+            "-m",
+            self.model,
+            "-r",
+            iterations,
+            "-p",
+            ",".join([str(p) for p in prompts]),
+            "-n",
+            output_tokens,
+            "-t",
+            self.threads if self.threads > 0 else 16,
+            "-b",
+            1,
+            "-ub",
+            1,
+        ]
+        cmd = [str(m) for m in cmd]
+
+        # save llama-bench command
+        self.state.llama_bench_cmd = " ".join(cmd)
+
+        try:
+            # Set up environment with library path for Linux
+            env = os.environ.copy()
+
+            # Load environment variables from .env file in the executable directory
+            exe_dir = os.path.dirname(self.executable)
+            env_file_path = os.path.join(exe_dir, ".env")
+            if os.path.exists(env_file_path):
+                load_dotenv(env_file_path, override=True)
+                env.update(os.environ)
+
+            if self.lib_dir and os.name != "nt":  # Not Windows
+                current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                if current_ld_path:
+                    env["LD_LIBRARY_PATH"] = f"{self.lib_dir}:{current_ld_path}"
+                else:
+                    env["LD_LIBRARY_PATH"] = self.lib_dir
+
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+                encoding="utf-8",
+                errors="replace",
+                env=env,
+            )
+
+            raw_output, stderr = process.communicate(timeout=600)
+
+            # save llama-bench command output with performance info to state
+            # (can be viewed in state.yaml file in cache)
+            self.state.llama_bench_standard_output = raw_output.splitlines()
+
+            if process.returncode != 0:
+                error_msg = (
+                    f"llama-bench.exe failed with return code {process.returncode}.\n"
+                )
+                error_msg += f"Command: {' '.join(cmd)}\n"
+                error_msg += f"Error output:\n{stderr}\n"
+                error_msg += f"Standard output:\n{raw_output}"
+                raise Exception(error_msg)
+
+            if raw_output is None:
+                raise Exception("No output received from llama-bench.exe process")
+
+            # Parse information from llama-bench.exe output
+            prompt_lengths = []
+            pp_tps = []
+            pp_tps_sd = []
+            tg_tps = None
+            tg_tps_sd = None
+
+            for line in self.state.llama_bench_standard_output:
+                # Parse TPS information
+                for p in prompts:
+                    if f"pp{p:d}" in line:
+                        parts = line.split("|")
+                        timings = parts[-2].strip().split(" ")
+                        prompt_lengths.append(p)
+                        pp_tps.append(float(timings[0]))
+                        pp_tps_sd.append(float(timings[-1]))
+                if f"tg{output_tokens:d}" in line:
+                    parts = line.split("|")
+                    timings = parts[-2].strip().split(" ")
+                    tg_tps = float(timings[0])
+                    tg_tps_sd = float(timings[-1])
+
+            return prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd
+
+        except Exception as e:
+            error_msg = f"Failed to run llama-bench.exe command: {str(e)}\n"
             error_msg += f"Command: {' '.join(cmd)}"
             raise Exception(error_msg)

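To make the table parsing in `benchmark()` concrete: llama-bench prints its results as a markdown-style table in which the next-to-last column holds `mean ± stddev` tokens-per-second values for tests named like `pp512` (prompt processing) and `tg128` (token generation). Below is a minimal sketch of the same split-on-`|` parsing against an assumed example row; the row contents are hypothetical and the exact column set can vary between llama.cpp builds.

```python
# Hypothetical llama-bench table row; real output depends on the model and build.
line = "| llama 1B Q4_K | 0.74 GiB | 1.24 B | Vulkan | 16 | pp512 | 645.32 ± 4.18 |"

parts = line.split("|")                  # split the table row into cells
timings = parts[-2].strip().split(" ")   # next-to-last cell: "645.32 ± 4.18"

mean_tps = float(timings[0])             # 645.32 -> mean tokens/second
std_dev_tps = float(timings[-1])         # 4.18   -> standard deviation
print(mean_tps, std_dev_tps)
```

This is also why `run_llama_bench_exe` in the new llamacpp bench tool can derive mean time-to-first-token as the prompt length divided by the prompt-processing tokens-per-second reported for that `ppN` row.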