lemonade-sdk 8.1.11__tar.gz → 8.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- {lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.2.0}/PKG-INFO +5 -3
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/README.md +1 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.py +3 -2
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cache.py +6 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/status.py +4 -4
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/system_info.py +0 -26
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/bench.py +22 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/utils.py +70 -22
- lemonade_sdk-8.2.0/src/lemonade/tools/llamacpp/bench.py +224 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/load.py +30 -2
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/utils.py +234 -15
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/bench.py +0 -26
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/load.py +38 -142
- lemonade_sdk-8.2.0/src/lemonade/tools/oga/migration.py +403 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/table.py +6 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/flm.py +2 -6
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/llamacpp.py +20 -1
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/serve.py +335 -17
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/models.js +416 -18
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/shared.js +44 -6
- lemonade_sdk-8.2.0/src/lemonade/tools/server/static/logs.html +57 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/styles.css +204 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/webapp.html +32 -0
- lemonade_sdk-8.2.0/src/lemonade/version.py +1 -0
- lemonade_sdk-8.2.0/src/lemonade_install/install.py +239 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0/src/lemonade_sdk.egg-info}/PKG-INFO +5 -3
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/requires.txt +3 -2
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/cli.py +10 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/model_manager.py +172 -11
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/server_models.json +102 -66
- lemonade_sdk-8.1.11/src/lemonade/tools/llamacpp/bench.py +0 -136
- lemonade_sdk-8.1.11/src/lemonade/tools/server/static/logs.html +0 -47
- lemonade_sdk-8.1.11/src/lemonade/version.py +0 -1
- lemonade_sdk-8.1.11/src/lemonade_install/install.py +0 -785
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/LICENSE +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/NOTICE.md +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/pyproject.toml +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.cfg +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/inference_engines.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/network.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/agt_power.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/hwinfo_power.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/huggingface/utils.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/oga/utils.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/prompt.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/chat.js +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/static/js/model-settings.js +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/macos_tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/port.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/utils/windows_tray.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/server/wrapped_server.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/pydantic_models.py +0 -0
- {lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade_server/settings.py +0 -0

{lemonade_sdk-8.1.11/src/lemonade_sdk.egg-info → lemonade_sdk-8.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.1.11
+Version: 8.2.0
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.14

@@ -29,12 +29,13 @@ Requires-Dist: tabulate
 Requires-Dist: sentencepiece
 Requires-Dist: huggingface-hub[hf_xet]==0.33.0
 Requires-Dist: python-dotenv
+Requires-Dist: python-multipart
 Requires-Dist: rumps>=0.4.0; sys_platform == "darwin"
 Provides-Extra: oga-ryzenai
-Requires-Dist: onnxruntime-genai-directml-ryzenai==0.
+Requires-Dist: onnxruntime-genai-directml-ryzenai==0.9.2; extra == "oga-ryzenai"
 Requires-Dist: protobuf>=6.30.1; extra == "oga-ryzenai"
 Provides-Extra: oga-cpu
-Requires-Dist: onnxruntime-genai==0.
+Requires-Dist: onnxruntime-genai==0.9.2; extra == "oga-cpu"
 Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
 Provides-Extra: dev
 Requires-Dist: torch>=2.6.0; extra == "dev"

@@ -264,6 +265,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/README.md

@@ -207,6 +207,7 @@ This project is:
 - [OnnxRuntime GenAI](https://github.com/microsoft/onnxruntime-genai)
 - [Hugging Face Hub](https://github.com/huggingface/huggingface_hub)
 - [OpenAI API](https://github.com/openai/openai-python)
+- [IRON/MLIR-AIE](https://github.com/Xilinx/mlir-aie)
 - and more...
 - Accelerated by mentorship from the OCV Catalyst program.
 - Licensed under the [Apache 2.0 License](https://github.com/lemonade-sdk/lemonade/blob/main/LICENSE).

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/setup.py

@@ -49,6 +49,7 @@ setup(
         "sentencepiece",
         "huggingface-hub[hf_xet]==0.33.0",
         "python-dotenv",
+        "python-multipart",
         # macOS-specific dependencies
         "rumps>=0.4.0; sys_platform == 'darwin'",
     ],

@@ -57,11 +58,11 @@ setup(
         # applications, without including developer-focused tools
         # Primary NPU extra using unified PyPI package
         "oga-ryzenai": [
-            "onnxruntime-genai-directml-ryzenai==0.
+            "onnxruntime-genai-directml-ryzenai==0.9.2",
             "protobuf>=6.30.1",
         ],
         "oga-cpu": [
-            "onnxruntime-genai==0.
+            "onnxruntime-genai==0.9.2",
             "onnxruntime >=1.22.0",
         ],
         # Developer-focused tools for benchmarking, accuracy testing, and

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/cache.py

@@ -43,7 +43,11 @@ def build_name(input_name):
     """

     if os.path.isdir(input_name):
+        # Input is a folder so no good way to determine a model name
         input_name_sanitized = "local_model"
+    elif os.path.isfile(input_name):
+        # Use the filename without its extension
+        input_name_sanitized = os.path.splitext(os.path.basename(input_name))[0]
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")

@@ -63,8 +67,9 @@ class Keys:
     TOKEN_GENERATION_TOKENS_PER_SECOND = "token_generation_tokens_per_second"
     STD_DEV_TOKENS_PER_SECOND = "std_dev_tokens_per_second"
     SECONDS_TO_FIRST_TOKEN = "seconds_to_first_token"
-    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
     STD_DEV_SECONDS_TO_FIRST_TOKEN = "std_dev_seconds_to_first_token"
+    PREFILL_TOKENS_PER_SECOND = "prefill_tokens_per_second"
+    STD_DEV_PREFILL_TOKENS_PER_SECOND = "std_dev_prefill_tokens_per_second"
     CHECKPOINT = "checkpoint"
     DTYPE = "dtype"
     PROMPT = "prompt"
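For reference, the build_name() change in src/lemonade/cache.py above adds a dedicated branch for single-file inputs. A minimal sketch of the resulting name resolution, assuming standard os.path semantics (the helper name below is illustrative and not part of the package):

```python
# Illustrative sketch, not from the release: mirrors the updated build_name() branches.
import os

def build_name_sketch(input_name: str) -> str:
    if os.path.isdir(input_name):
        # A folder gives no good way to determine a model name
        return "local_model"
    elif os.path.isfile(input_name):
        # A single file contributes its basename without the extension
        return os.path.splitext(os.path.basename(input_name))[0]
    else:
        # Anything else (e.g., a Hugging Face checkpoint) is sanitized
        return input_name.replace("/", "_")

# e.g. "models/Qwen2.5-0.5B.gguf" -> "Qwen2.5-0.5B"  (when that file exists)
#      "meta-llama/Llama-3.2-1B"  -> "meta-llama_Llama-3.2-1B"
```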

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/status.py

@@ -112,10 +112,10 @@ class UniqueInvocationInfo(BasicInfo):
         if print_file_name:
             print(f"{self.script_name}{self.extension}:")

-        # Print invocation about the model (only applies to scripts, not ONNX files
+        # Print invocation about the model (only applies to scripts, not ONNX or GGUF files, nor
         # LLMs, which have no extension)
         if not (
-            self.extension == ".onnx"
+            self.extension in [".onnx", ".gguf"]
             or self.extension == build.state_file_name
             or self.extension == ""
         ):

@@ -138,7 +138,7 @@ class UniqueInvocationInfo(BasicInfo):

         if self.depth == 0:
             print(f"{self.indent}\tLocation:\t{self.file}", end="")
-            if self.extension == ".onnx":
+            if self.extension in [".onnx", ".gguf"]:
                 print()
             else:
                 print(f", line {self.line}")

@@ -314,7 +314,7 @@ class UniqueInvocationInfo(BasicInfo):
         Print information about a given model or submodel.
         """

-        if self.extension == ".onnx" or self.extension == "":
+        if self.extension in [".onnx", ".gguf"] or self.extension == "":
             self.indent = "\t" * (2 * self.depth)
         else:
             self.indent = "\t" * (2 * self.depth + 1)

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/common/system_info.py

@@ -1110,32 +1110,6 @@ class LinuxSystemInfo(SystemInfo):

         return ""

-    def _get_nvidia_vram_smi_linux(self) -> float:
-        """
-        Get NVIDIA GPU VRAM on Linux using nvidia-smi command.
-
-        Returns:
-            float: VRAM in GB, or 0.0 if detection fails
-        """
-        try:
-            output = (
-                subprocess.check_output(
-                    "nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits",
-                    shell=True,
-                    stderr=subprocess.DEVNULL,
-                )
-                .decode()
-                .strip()
-            )
-
-            # nvidia-smi returns memory in MB
-            vram_mb = int(output.split("\n")[0])
-            vram_gb = round(vram_mb / 1024, 1)
-            return vram_gb
-        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
-            pass
-        return 0.0
-
     @staticmethod
     def get_processor_name() -> str:
         """

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/bench.py

@@ -29,7 +29,9 @@ class Bench(Tool, ABC):
         Keys.SECONDS_TO_FIRST_TOKEN,
         Keys.STD_DEV_SECONDS_TO_FIRST_TOKEN,
         Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
+        Keys.STD_DEV_TOKENS_PER_SECOND,
         Keys.PREFILL_TOKENS_PER_SECOND,
+        Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
         Keys.PROMPT_TOKENS,
         Keys.RESPONSE_TOKENS,
         Keys.MAX_MEMORY_USED_GBYTE,

@@ -42,7 +44,9 @@ class Bench(Tool, ABC):
         self.mean_time_to_first_token_list = []
         self.std_dev_time_to_first_token_list = []
         self.prefill_tokens_per_second_list = []
+        self.std_dev_prefill_tokens_per_second_list = []
         self.token_generation_tokens_per_second_list = []
+        self.std_dev_token_generation_tokens_per_second_list = []
         self.max_memory_used_gb_list = []

         # Max memory used can only be measured on Windows systems

@@ -88,7 +92,7 @@ class Bench(Tool, ABC):
             default=[str(default_prompt_length)],
             metavar="PROMPT",
             help="Input one or more prompts to the LLM. Three formats are supported. "
-            "1) integer: use a synthetic prompt with the specified length "
+            "1) integer: use a synthetic prompt with the specified token length "
             "2) str: use a user-provided prompt string "
            "3) path/to/prompt.txt: load the prompt from a text file. "
             f"(default: {default_prompt_length}) ",

@@ -246,10 +250,27 @@ class Bench(Tool, ABC):
             Keys.PREFILL_TOKENS_PER_SECOND,
             self.get_item_or_list(self.prefill_tokens_per_second_list),
         )
+        if not all(
+            element is None for element in self.std_dev_prefill_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_PREFILL_TOKENS_PER_SECOND,
+                self.get_item_or_list(self.std_dev_prefill_tokens_per_second_list),
+            )
         state.save_stat(
             Keys.TOKEN_GENERATION_TOKENS_PER_SECOND,
             self.get_item_or_list(self.token_generation_tokens_per_second_list),
         )
+        if not all(
+            element is None
+            for element in self.std_dev_token_generation_tokens_per_second_list
+        ):
+            state.save_stat(
+                Keys.STD_DEV_TOKENS_PER_SECOND,
+                self.get_item_or_list(
+                    self.std_dev_token_generation_tokens_per_second_list
+                ),
+            )
         if self.save_max_memory_used:
             state.save_stat(
                 Keys.MAX_MEMORY_USED_GBYTE,

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/flm/utils.py

@@ -10,16 +10,46 @@ import time
 from typing import List, Optional

 import requests
-from packaging.version import Version
+from packaging.version import Version, InvalidVersion


-
+def get_flm_latest_version() -> Optional[str]:
+    """
+    Get and return the latest FLM version from "https://github.com/FastFlowLM/FastFlowLM/tags"
+    This uses the GitHub tags API.
+    """
+    url = "https://api.github.com/repos/FastFlowLM/FastFlowLM/tags"
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        tags = response.json()
+        if not tags:
+            return None
+        # Tags are sorted in reverse chronological order; find the first that looks like a version
+        for tag in tags:
+            tag_name = tag.get("name", "")
+            # Accept tags of the form v0.9.10, 0.9.10, etc.
+            if tag_name.startswith("v"):
+                version_candidate = tag_name[1:]
+            else:
+                version_candidate = tag_name
+            try:
+                # validate it's a version string
+                _ = Version(version_candidate)
+                return version_candidate
+            except InvalidVersion:
+                continue
+        return None
+    except requests.exceptions.RequestException as e:
+        logging.debug("Error retrieving latest FLM version: %s", e)
+        return None


 def check_flm_version() -> Optional[str]:
     """
     Check if FLM is installed and return version, or None if not available.
     """
+    latest_version_str = get_flm_latest_version()
     try:
         result = subprocess.run(
             ["flm", "version"],

@@ -34,11 +64,11 @@ def check_flm_version() -> Optional[str]:
         output = result.stdout.strip()
         if output.startswith("FLM v"):
             version_str = output[5:]  # Remove "FLM v" prefix
-            return version_str
-        return None
+            return version_str, latest_version_str
+        return None, latest_version_str

     except (subprocess.CalledProcessError, FileNotFoundError):
-        return None
+        return None, latest_version_str


 def refresh_environment():

@@ -76,31 +106,42 @@ def install_flm():
     If not, download and run the GUI installer, then wait for completion.
     """
     # Check current FLM installation
-    current_version = check_flm_version()
+    current_version, latest_version = check_flm_version()

-    if
+    if (
+        current_version
+        and latest_version
+        and Version(current_version) == Version(latest_version)
+    ):
         logging.info(
-            "FLM v%s is already installed and
+            "FLM v%s is already installed and is up to date (latest version: v%s).",
             current_version,
-
+            latest_version,
         )
         return

     if current_version:
+        if not latest_version:
+            logging.info(
+                "Unable to detect the latest FLM version; continuing with installed FLM v%s.",
+                current_version,
+            )
+            return
         logging.info(
-            "FLM v%s is installed but below
+            "FLM v%s is installed but below latest version v%s. Upgrading...",
             current_version,
-
+            latest_version,
         )
+        verysilent = True
     else:
-        logging.info(
-
-        )
+        logging.info("FLM not found. Installing FLM v%s or later...", latest_version)
+        verysilent = False

     # Download the installer
     # pylint: disable=line-too-long
     installer_url = "https://github.com/FastFlowLM/FastFlowLM/releases/latest/download/flm-setup.exe"
     installer_path = os.path.join(tempfile.gettempdir(), "flm-setup.exe")
+    installer_args = [installer_path, "/VERYSILENT"] if verysilent else [installer_path]

     try:
         # Remove existing installer if present

@@ -123,13 +164,15 @@ def install_flm():
         # Launch the installer GUI
         logging.warning(
             "Launching FLM installer GUI. Please complete the installation..."
+            if not verysilent
+            else "Installing FLM..."
         )

         # Launch installer and wait for it to complete
         if os.name == "nt":  # Windows
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args, shell=True)
         else:
-            process = subprocess.Popen(
+            process = subprocess.Popen(installer_args)

         # Wait for installer to complete
         process.wait()

@@ -150,8 +193,8 @@ def install_flm():
     # Verify installation
     max_retries = 10
     for attempt in range(max_retries):
-        new_version = check_flm_version()
-        if new_version and Version(new_version)
+        new_version, latest_version = check_flm_version()
+        if new_version and Version(new_version) == Version(latest_version):
             logging.info("FLM v%s successfully installed and verified", new_version)
             return

@@ -240,7 +283,12 @@ def get_flm_installed_models() -> List[str]:

         return installed_checkpoints

-    except (
+    except (
+        subprocess.CalledProcessError,
+        FileNotFoundError,
+        AttributeError,
+        NotADirectoryError,
+    ):
         # FLM not installed, not available, or output parsing failed
         return []

@@ -249,7 +297,7 @@ def is_flm_available() -> bool:
     """
     Check if FLM is available and meets minimum version requirements.
     """
-    current_version = check_flm_version()
-    return current_version is not None and Version(current_version)
-
+    current_version, latest_version = check_flm_version()
+    return current_version is not None and Version(current_version) == Version(
+        latest_version
     )
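Note that check_flm_version() in src/lemonade/tools/flm/utils.py now returns an (installed, latest) pair instead of a single string, with the latest version taken from the GitHub tags API. A minimal sketch of how a caller might consume the new return value; the import path is assumed from the package layout and the printed messages are illustrative:

```python
# Illustrative sketch, not from the release: consuming the new (installed, latest)
# pair returned by check_flm_version() in 8.2.0.
from packaging.version import Version

from lemonade.tools.flm.utils import check_flm_version  # assumed import path

installed, latest = check_flm_version()  # e.g. ("0.9.8", "0.9.10") or (None, "0.9.10")

if installed is None:
    print("FLM is not installed")
elif latest is None:
    print(f"FLM v{installed} installed; latest release could not be determined")
elif Version(installed) == Version(latest):
    print(f"FLM v{installed} is up to date")
else:
    print(f"FLM v{installed} installed; latest release tag is v{latest}")
```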

lemonade_sdk-8.2.0/src/lemonade/tools/llamacpp/bench.py

@@ -0,0 +1,224 @@
+import argparse
+import statistics
+from statistics import StatisticsError
+from lemonade.state import State
+from lemonade.tools.tool import Tool
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
+from lemonade.tools.bench import (
+    Bench,
+    default_prompt_length,
+    default_iterations,
+    default_output_tokens,
+    default_warmup_runs,
+)
+
+
+class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
+
+    unique_name = "llamacpp-bench"
+
+    def __init__(self, monitor_message="Benchmarking LLM"):
+        super().__init__(monitor_message)
+
+        # Don't track memory usage since we are using a llamacpp executable for compute
+        self.save_max_memory_used = False
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Benchmark an LLM in llama.cpp",
+            add_help=add_help,
+        )
+
+        parser = Bench.parser(parser)
+
+        parser.add_argument(
+            "--cli",
+            action="store_true",
+            help="Set this flag to use llama-cli.exe to benchmark model performance. This executable will be called "
+            "once per iteration. Otherwise, llama-bench.exe is used by default. In this default behavior behavior, "
+            "the only valid prompt format is integer token lengths. Also, the warmup-iterations parameter is "
+            "ignored and the default value for number of threads is 16.",
+        )
+
+        return parser
+
+    def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
+        """
+        Helper function to parse CLI arguments into the args expected by run()
+        """
+
+        # Call Tool parse method, NOT the Bench parse method
+        parsed_args = Tool.parse(self, state, args, known_only)
+
+        if parsed_args.cli:
+            parsed_args = super().parse(state, args, known_only)
+        else:
+            # Make sure prompts is a list of integers
+            if parsed_args.prompts is None:
+                parsed_args.prompts = [default_prompt_length]
+            prompt_ints = []
+            for prompt_item in parsed_args.prompts:
+                if prompt_item.isdigit():
+                    prompt_ints.append(int(prompt_item))
+                else:
+                    raise Exception(
+                        f"When not using the --cli flag to {self.unique_name}, the prompt format must "
+                        "be in integer format."
+                    )
+            parsed_args.prompts = prompt_ints
+
+        return parsed_args
+
+    def run_prompt(
+        self,
+        state: State,
+        report_progress_fn,
+        prompt: str,
+        iterations: int,
+        warmup_iterations: int,
+        output_tokens: int,
+    ) -> State:
+        """
+        Benchmark llama.cpp model that was loaded by LoadLlamaCpp.
+        """
+
+        if self.first_run_prompt:
+
+            if not hasattr(state, "model") or not isinstance(
+                state.model, LlamaCppAdapter
+            ):
+                raise Exception(
+                    f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
+                    "loaded first. Please run load-llama-cpp before this tool."
+                )
+        model: LlamaCppAdapter = state.model
+
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
+
+        for iteration in range(iterations + warmup_iterations):
+            try:
+                # Use the adapter's generate method which already has the timeout
+                # and error handling
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(
+                    prompt, max_new_tokens=output_tokens, return_raw=True
+                )
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
+                    error_msg = (
+                        "Could not find timing information in llama.cpp output.\n"
+                    )
+                    error_msg += "Raw output:\n" + raw_output + "\n"
+                    error_msg += "Stderr:\n" + stderr
+                    raise Exception(error_msg)
+
+                self.tokens_out_len_list.append(model.response_tokens)
+
+                if iteration > warmup_iterations - 1:
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
+
+                report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
+
+            except Exception as e:
+                error_msg = f"Failed to run benchmark: {str(e)}"
+                raise Exception(error_msg)
+
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
+        self.mean_time_to_first_token_list.append(mean_time_to_first_token)
+        self.prefill_tokens_per_second_list.append(
+            model.prompt_tokens / mean_time_to_first_token
+        )
+        self.token_generation_tokens_per_second_list.append(
+            statistics.mean(per_iteration_tokens_per_second)
+        )
+        try:
+            self.std_dev_time_to_first_token_list.append(
+                statistics.stdev(per_iteration_time_to_first_token)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_time_to_first_token_list.append(None)
+        try:
+            self.std_dev_token_generation_tokens_per_second_list.append(
+                statistics.stdev(per_iteration_tokens_per_second)
+            )
+        except StatisticsError:
+            # Less than 2 measurements
+            self.std_dev_token_generation_tokens_per_second_list.append(None)
+
+    def run_llama_bench_exe(self, state, prompts, iterations, output_tokens):
+
+        if prompts is None:
+            prompts = [default_prompt_length]
+        elif isinstance(prompts, int):
+            prompts = [prompts]
+
+        state.save_stat("prompts", prompts)
+        state.save_stat("iterations", iterations)
+        state.save_stat("output_tokens", output_tokens)
+
+        model: LlamaCppAdapter = state.model
+        prompt_lengths, pp_tps, pp_tps_sd, tg_tps, tg_tps_sd = model.benchmark(
+            prompts, iterations, output_tokens
+        )
+        self.input_ids_len_list = prompt_lengths
+        self.prefill_tokens_per_second_list = pp_tps
+        if iterations > 1:
+            self.std_dev_prefill_tokens_per_second_list = pp_tps_sd
+        self.mean_time_to_first_token_list = [
+            tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)
+        ]
+        self.token_generation_tokens_per_second_list = [tg_tps]
+        if iterations > 1:
+            self.std_dev_token_generation_tokens_per_second_list = [tg_tps_sd]
+        self.tokens_out_len_list = [output_tokens] * len(prompts) * iterations
+
+        self.save_stats(state)
+        return state
+
+    def run(
+        self,
+        state: State,
+        prompts: list[str] = None,
+        iterations: int = default_iterations,
+        warmup_iterations: int = default_warmup_runs,
+        output_tokens: int = default_output_tokens,
+        cli: bool = False,
+        **kwargs,
+    ) -> State:
+        """
+        Args:
+            - prompts: List of input prompts used as starting points for LLM text generation
+            - iterations: Number of benchmarking samples to take; results are
+                reported as the median and mean of the samples.
+            - warmup_iterations: Subset of the iterations to treat as warmup,
+                and not included in the results.
+            - output_tokens: Number of new tokens LLM to create.
+            - ggml: Use llama-bench.exe directly
+            - kwargs: Additional parameters used by bench tools
+        """
+
+        # Check that state has the attribute model and it is a LlamaCPP model
+        if not hasattr(state, "model") or not isinstance(state.model, LlamaCppAdapter):
+            raise Exception("Load model using llamacpp-load first.")
+
+        if cli:
+            state = super().run(
+                state, prompts, iterations, warmup_iterations, output_tokens, **kwargs
+            )
+        else:
+            state = self.run_llama_bench_exe(state, prompts, iterations, output_tokens)
+
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
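In run_llama_bench_exe() above, time-to-first-token is not measured directly; it is derived from each prompt length and the prefill throughput that llama-bench reports. A short worked example with made-up numbers:

```python
# Illustrative numbers only: deriving mean time-to-first-token from prefill
# throughput, mirroring the list comprehension in run_llama_bench_exe() above.
prompt_lengths = [256, 1024]   # synthetic prompt sizes, in tokens
pp_tps = [512.0, 480.0]        # prefill tokens/second reported per prompt size

mean_ttft = [tokens / tps for tokens, tps in zip(prompt_lengths, pp_tps)]
print(mean_ttft)  # [0.5, 2.1333...] seconds to first token
```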

{lemonade_sdk-8.1.11 → lemonade_sdk-8.2.0}/src/lemonade/tools/llamacpp/load.py

@@ -93,9 +93,11 @@ class LoadLlamaCpp(FirstTool):
         from lemonade.tools.llamacpp.utils import (
             install_llamacpp,
             get_llama_cli_exe_path,
+            get_llama_bench_exe_path,
             get_llama_installed_version,
             parse_checkpoint,
             download_gguf,
+            resolve_local_gguf_model,
             get_local_checkpoint_path,
             LlamaCppTokenizerAdapter,
             LlamaCppAdapter,

@@ -103,6 +105,8 @@ class LoadLlamaCpp(FirstTool):

         install_llamacpp(backend)

+        extension = ""
+
         # Check if input is a local folder containing a .GGUF model
         if os.path.isdir(input):
             # input is a local folder

@@ -121,6 +125,17 @@ class LoadLlamaCpp(FirstTool):
             )
             model_to_use = gguf_files[0]
             full_model_path = os.path.join(local_model_folder, model_to_use)
+            extension = ".gguf"
+
+        elif input.endswith(".gguf") and os.path.isfile(input):
+            # input is a local .gguf file
+            full_model_path = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+            model_to_use = os.path.basename(full_model_path)
+            extension = ".gguf"

         else:
             # Input is a model checkpoint

@@ -155,12 +170,21 @@ class LoadLlamaCpp(FirstTool):
                 )

             else:
+                # First, try to resolve from local cache to avoid unnecessary downloads
+                base_checkpoint, variant = parse_checkpoint(checkpoint)
+                snapshot_files = resolve_local_gguf_model(
+                    base_checkpoint, variant, None
+                )
+
+                # If not found locally, download from internet
+                if not snapshot_files:
+                    snapshot_files = download_gguf(checkpoint)

-                snapshot_files = download_gguf(checkpoint)
                 full_model_path = snapshot_files["variant"]
                 model_to_use = os.path.basename(full_model_path)

         llama_cli_exe_path = get_llama_cli_exe_path(backend)
+        llama_bench_exe_path = get_llama_bench_exe_path(backend)
         printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")

         # Get the directory containing the executable for shared libraries

@@ -174,8 +198,10 @@ class LoadLlamaCpp(FirstTool):
             context_size=context_size,
             threads=threads,
             executable=llama_cli_exe_path,
+            bench_executable=llama_bench_exe_path,
             reasoning=reasoning,
             lib_dir=lib_dir,
+            state=state,
         )
         state.tokenizer = LlamaCppTokenizerAdapter()
         state.device = device

@@ -186,7 +212,9 @@ class LoadLlamaCpp(FirstTool):
             Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version(backend)
         )

-        status.add_to_state(
+        status.add_to_state(
+            state=state, name=input, model=model_to_use, extension=extension
+        )
         return state
