lemonade-sdk 8.0.5__tar.gz → 8.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lemonade-sdk might be problematic.
- {lemonade_sdk-8.0.5/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.0}/PKG-INFO +32 -21
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/README.md +7 -3
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/setup.py +26 -19
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/cache.py +3 -1
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/network.py +18 -1
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/adapter.py +6 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/huggingface/utils.py +6 -5
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/llamacpp/bench.py +28 -46
- lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/load.py +185 -0
- lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/utils.py +612 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/bench.py +5 -6
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/load.py +239 -112
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/utils.py +27 -9
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/prompt.py +17 -25
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/report/table.py +12 -9
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/llamacpp.py +80 -92
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/serve.py +22 -28
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/static/styles.css +121 -26
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/static/webapp.html +14 -6
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/tray.py +7 -0
- lemonade_sdk-8.1.0/src/lemonade/version.py +1 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_install/install.py +65 -84
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0/src/lemonade_sdk.egg-info}/PKG-INFO +32 -21
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/requires.txt +18 -8
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/cli.py +1 -1
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/model_manager.py +8 -151
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/pydantic_models.py +1 -4
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/server_models.json +44 -9
- lemonade_sdk-8.0.5/src/lemonade/tools/llamacpp/load.py +0 -277
- lemonade_sdk-8.0.5/src/lemonade/version.py +0 -1
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/LICENSE +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/NOTICE.md +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/setup.cfg +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/cli.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/inference_engines.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/status.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/profilers/profiler.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/bench.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/humaneval.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/mmlu.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/perplexity.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/quark/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/quark/quark_load.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/quark/quark_quantize.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/utils/port.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/utils/system_tray.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
--- lemonade_sdk-8.0.5/src/lemonade_sdk.egg-info/PKG-INFO
+++ lemonade_sdk-8.1.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.0.5
+Version: 8.1.0
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.13
@@ -22,16 +22,15 @@ Requires-Dist: pytz
 Requires-Dist: zstandard
 Requires-Dist: fastapi
 Requires-Dist: uvicorn[standard]
-Requires-Dist: openai>=1.81.0
-Requires-Dist: transformers<=4.
+Requires-Dist: openai<1.97.1,>=1.81.0
+Requires-Dist: transformers<=4.53.2
 Requires-Dist: jinja2
 Requires-Dist: tabulate
 Requires-Dist: sentencepiece
 Requires-Dist: huggingface-hub==0.33.0
-Provides-Extra: oga-hybrid
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: protobuf>=6.30.1; extra == "oga-hybrid"
+Provides-Extra: oga-ryzenai
+Requires-Dist: onnxruntime-genai-directml-ryzenai==0.7.0.2; extra == "oga-ryzenai"
+Requires-Dist: protobuf>=6.30.1; extra == "oga-ryzenai"
 Provides-Extra: oga-cpu
 Requires-Dist: onnxruntime-genai==0.8.2; extra == "oga-cpu"
 Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
@@ -43,14 +42,32 @@ Requires-Dist: pandas>=1.5.3; extra == "dev"
 Requires-Dist: matplotlib; extra == "dev"
 Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
 Requires-Dist: lm-eval[api]; extra == "dev"
+Provides-Extra: oga-hybrid
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid"
+Provides-Extra: oga-unified
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-unified"
 Provides-Extra: oga-hybrid-minimal
-Requires-Dist: lemonade-sdk[oga-
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid-minimal"
 Provides-Extra: oga-cpu-minimal
 Requires-Dist: lemonade-sdk[oga-cpu]; extra == "oga-cpu-minimal"
+Provides-Extra: oga-npu-minimal
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-npu-minimal"
 Provides-Extra: llm
 Requires-Dist: lemonade-sdk[dev]; extra == "llm"
 Provides-Extra: llm-oga-cpu
 Requires-Dist: lemonade-sdk[dev,oga-cpu]; extra == "llm-oga-cpu"
+Provides-Extra: llm-oga-npu
+Requires-Dist: onnx==1.16.0; extra == "llm-oga-npu"
+Requires-Dist: onnxruntime==1.18.0; extra == "llm-oga-npu"
+Requires-Dist: numpy==1.26.4; extra == "llm-oga-npu"
+Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-npu"
+Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-npu"
+Provides-Extra: llm-oga-hybrid
+Requires-Dist: onnx==1.16.1; extra == "llm-oga-hybrid"
+Requires-Dist: numpy==1.26.4; extra == "llm-oga-hybrid"
+Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-hybrid"
+Provides-Extra: llm-oga-unified
+Requires-Dist: lemonade-sdk[dev,llm-oga-hybrid]; extra == "llm-oga-unified"
 Provides-Extra: llm-oga-igpu
 Requires-Dist: onnxruntime-genai-directml==0.6.0; extra == "llm-oga-igpu"
 Requires-Dist: onnxruntime-directml<1.22.0,>=1.19.0; extra == "llm-oga-igpu"
@@ -61,16 +78,6 @@ Requires-Dist: onnxruntime-genai-cuda==0.8.2; extra == "llm-oga-cuda"
 Requires-Dist: onnxruntime-gpu>=1.22.0; extra == "llm-oga-cuda"
 Requires-Dist: transformers<=4.51.3; extra == "llm-oga-cuda"
 Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-cuda"
-Provides-Extra: llm-oga-npu
-Requires-Dist: onnx==1.16.0; extra == "llm-oga-npu"
-Requires-Dist: onnxruntime==1.18.0; extra == "llm-oga-npu"
-Requires-Dist: numpy==1.26.4; extra == "llm-oga-npu"
-Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-npu"
-Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-npu"
-Provides-Extra: llm-oga-hybrid
-Requires-Dist: lemonade-sdk[dev,oga-hybrid]; extra == "llm-oga-hybrid"
-Provides-Extra: llm-oga-unified
-Requires-Dist: lemonade-sdk[llm-oga-hybrid]; extra == "llm-oga-unified"
 Dynamic: author-email
 Dynamic: description
 Dynamic: description-content-type
@@ -174,7 +181,7 @@ lemonade-server list

 ## Model Library

-Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/
+Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/server_models/).

 You can also import custom GGUF and ONNX models from Hugging Face by using our [Model Manager](http://localhost:8000/#model-management) (requires server to be running).
 <p align="center">
@@ -263,7 +270,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```

-For more detailed integration instructions, see the [Integration Guide](./server_integration.md).
+For more detailed integration instructions, see the [Integration Guide](./docs/server/server_integration.md).

 ## Beyond an LLM Server

@@ -272,6 +279,10 @@ The [Lemonade SDK](./docs/README.md) also include the following components:
 - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
 - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with prompting templates, accuracy testing, performance benchmarking, and memory profiling to characterize your models on your hardware.

+## FAQ
+
+To read our frequently asked questions, see our [FAQ Guide](./docs/faq.md)
+
 ## Contributing

 We are actively seeking collaborators from across the industry. If you would like to contribute to this project, please check out our [contribution guide](./docs/contribute.md).
@@ -284,7 +295,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue

 ## Maintainers

-This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues),
+This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).

 ## License

--- lemonade_sdk-8.0.5/README.md
+++ lemonade_sdk-8.1.0/README.md
@@ -92,7 +92,7 @@ lemonade-server list

 ## Model Library

-Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/
+Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/server_models/).

 You can also import custom GGUF and ONNX models from Hugging Face by using our [Model Manager](http://localhost:8000/#model-management) (requires server to be running).
 <p align="center">
@@ -181,7 +181,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```

-For more detailed integration instructions, see the [Integration Guide](./server_integration.md).
+For more detailed integration instructions, see the [Integration Guide](./docs/server/server_integration.md).

 ## Beyond an LLM Server

@@ -190,6 +190,10 @@ The [Lemonade SDK](./docs/README.md) also include the following components:
 - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
 - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with prompting templates, accuracy testing, performance benchmarking, and memory profiling to characterize your models on your hardware.

+## FAQ
+
+To read our frequently asked questions, see our [FAQ Guide](./docs/faq.md)
+
 ## Contributing

 We are actively seeking collaborators from across the industry. If you would like to contribute to this project, please check out our [contribution guide](./docs/contribute.md).
@@ -202,7 +206,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue

 ## Maintainers

-This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues),
+This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).

 ## License

--- lemonade_sdk-8.0.5/setup.py
+++ lemonade_sdk-8.1.0/setup.py
@@ -44,8 +44,8 @@ setup(
         "zstandard",
         "fastapi",
         "uvicorn[standard]",
-        "openai>=1.81.0",
-        "transformers<=4.
+        "openai>=1.81.0,<1.97.1",
+        "transformers<=4.53.2",
         "jinja2",
         "tabulate",
         "sentencepiece",
@@ -54,11 +54,9 @@ setup(
     extras_require={
         # The non-dev extras are meant to deploy specific backends into end-user
         # applications, without including developer-focused tools
-
-
-
-            "onnx==1.16.1",
-            "numpy==1.26.4",
+        # Primary NPU extra using unified PyPI package
+        "oga-ryzenai": [
+            "onnxruntime-genai-directml-ryzenai==0.7.0.2",
             "protobuf>=6.30.1",
         ],
         "oga-cpu": [
@@ -81,11 +79,31 @@ setup(
             "lm-eval[api]",
         ],
         # Keep backwards compatibility for old extras names
-        "oga-hybrid
+        "oga-hybrid": ["lemonade-sdk[oga-ryzenai]"],
+        "oga-unified": ["lemonade-sdk[oga-ryzenai]"],
+        "oga-hybrid-minimal": ["lemonade-sdk[oga-ryzenai]"],
         "oga-cpu-minimal": ["lemonade-sdk[oga-cpu]"],
+        "oga-npu-minimal": ["lemonade-sdk[oga-ryzenai]"],
         "llm": ["lemonade-sdk[dev]"],
         "llm-oga-cpu": ["lemonade-sdk[dev,oga-cpu]"],
         # The following extras are deprecated and/or not commonly used
+        "llm-oga-npu": [
+            "onnx==1.16.0",
+            # NPU requires specific onnxruntime version for Ryzen AI compatibility
+            # This may conflict with other OGA extras that require >=1.22.0
+            "onnxruntime==1.18.0",
+            "numpy==1.26.4",
+            "protobuf>=6.30.1",
+            "lemonade-sdk[dev]",
+        ],
+        "llm-oga-hybrid": [
+            # Note: `lemonade-install --ryzenai hybrid` is necessary
+            # to complete installation for RAI 1.4.0.
+            "onnx==1.16.1",
+            "numpy==1.26.4",
+            "protobuf>=6.30.1",
+        ],
+        "llm-oga-unified": ["lemonade-sdk[dev, llm-oga-hybrid]"],
         "llm-oga-igpu": [
             "onnxruntime-genai-directml==0.6.0",
             "onnxruntime-directml>=1.19.0,<1.22.0",
@@ -98,17 +116,6 @@ setup(
             "transformers<=4.51.3",
             "lemonade-sdk[dev]",
         ],
-        "llm-oga-npu": [
-            "onnx==1.16.0",
-            # NPU requires specific onnxruntime version for Ryzen AI compatibility
-            # This may conflict with other OGA extras that require >=1.22.0
-            "onnxruntime==1.18.0",
-            "numpy==1.26.4",
-            "protobuf>=6.30.1",
-            "lemonade-sdk[dev]",
-        ],
-        "llm-oga-hybrid": ["lemonade-sdk[dev,oga-hybrid]"],
-        "llm-oga-unified": ["lemonade-sdk[llm-oga-hybrid]"],
     },
     classifiers=[],
     entry_points={

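The backwards-compatibility aliases above work by having the package require itself with the new extra, so `pip install "lemonade-sdk[oga-hybrid]"` resolves to the same dependency set as `lemonade-sdk[oga-ryzenai]`. A quick way to confirm this is an illustrative sketch that assumes lemonade-sdk 8.1.0 is installed:

```python
# Print the Requires-Dist entries attached to the legacy "oga-hybrid" extra.
# With 8.1.0 installed, this should show the self-referential alias,
# e.g. 'lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid"'.
from importlib.metadata import requires

reqs = requires("lemonade-sdk") or []
print([r for r in reqs if 'extra == "oga-hybrid"' in r])
```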
--- lemonade_sdk-8.0.5/src/lemonade/cache.py
+++ lemonade_sdk-8.1.0/src/lemonade/cache.py
@@ -34,7 +34,7 @@ def build_name(input_name):
     """
     Name the lemonade build by concatenating these two factors:
     1. Sanitize the input name (typically a model checkpoint name) by
-        replacing any `/` characters with `_
+        replacing any `/` characters with `_` and ':' characters with '-'.
     2. Timestamp to ensure that builds in the same cache will not
         collide in the same build directory.

@@ -47,6 +47,7 @@ def build_name(input_name):
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")
+        input_name_sanitized = input_name_sanitized.replace(":", "-")

         # Get the formatted timestamp string
         timestamp = get_timestamp()
@@ -79,6 +80,7 @@ class Keys:
     MAX_MEMORY_USED_GB = "max_memory_used_GB"
     MAX_MEMORY_USED_GBYTE = "max_memory_used_gbyte"
     RYZEN_AI_VERSION_INFO = "ryzen_ai_version_info"
+    LLAMA_CLI_VERSION_INFO = "llama_cli_version_info"


 # This file was originally licensed under Apache 2.0. It has been modified.

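The new `:` replacement matters for GGUF checkpoints, which are written as `CHECKPOINT:VARIANT` (see the llamacpp loader further down), and `:` is not a legal character in Windows directory names. A minimal sketch of the sanitization step, using a hypothetical input name:

```python
# The two replace() calls from build_name, applied to a GGUF-style
# "CHECKPOINT:VARIANT" name (example input only).
input_name = "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0"
sanitized = input_name.replace("/", "_").replace(":", "-")
print(sanitized)  # Qwen_Qwen2.5-Coder-3B-Instruct-GGUF-Q4_0
```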
--- lemonade_sdk-8.0.5/src/lemonade/common/network.py
+++ lemonade_sdk-8.1.0/src/lemonade/common/network.py
@@ -1,7 +1,7 @@
 import os
 from typing import Optional
 import socket
-from huggingface_hub import model_info
+from huggingface_hub import model_info, snapshot_download


 def is_offline():
@@ -48,3 +48,20 @@ def get_base_model(checkpoint: str) -> Optional[str]:
     except Exception:  # pylint: disable=broad-except
         pass
     return None
+
+
+def custom_snapshot_download(repo_id, **kwargs):
+    """
+    Custom snapshot download with retry logic for Windows symlink privilege errors.
+    """
+    for attempt in range(2):
+        try:
+            return snapshot_download(repo_id=repo_id, **kwargs)
+        except OSError as e:
+            if (
+                hasattr(e, "winerror")
+                and e.winerror == 1314  # pylint: disable=no-member
+                and attempt < 1
+            ):
+                continue
+            raise

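`custom_snapshot_download` retries `snapshot_download` exactly once when Windows raises WinError 1314 (`ERROR_PRIVILEGE_NOT_HELD`, the missing symlink-privilege case) and re-raises everything else. A usage sketch follows; the repo id and `allow_patterns` filter are example values, with `allow_patterns` being a standard `snapshot_download` keyword that passes through `**kwargs`:

```python
from lemonade.common.network import custom_snapshot_download

# Download (or reuse) a local snapshot; retried once on WinError 1314.
local_dir = custom_snapshot_download(
    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF",  # example repo id
    allow_patterns=["*q4_0.gguf"],          # example filter, forwarded via **kwargs
)
print(local_dir)  # path to the local snapshot directory
```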
--- lemonade_sdk-8.0.5/src/lemonade/tools/adapter.py
+++ lemonade_sdk-8.1.0/src/lemonade/tools/adapter.py
@@ -13,6 +13,9 @@ class ModelAdapter(abc.ABC):
         """
         self.tokens_per_second = None
         self.time_to_first_token = None
+        self.prompt_tokens = None
+        self.response_tokens = None
+
         self.type = "generic"

     @abc.abstractmethod
@@ -22,6 +25,9 @@ class ModelAdapter(abc.ABC):

         We try to keep the signature here minimal to allow for maximum compatibility
         with recipe components, which themselves may not support a lot of arguments.
+
+        The generate method should store prompt and response lengths (in tokens)
+        in the prompt_tokens and response_tokens members.
         """

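A minimal sketch (not part of the package) of what the expanded contract asks of subclasses: `generate()` should populate `prompt_tokens` and `response_tokens` alongside the existing timing members. The class name, toy generation logic, and keyword arguments here are invented for illustration:

```python
from lemonade.tools.adapter import ModelAdapter


class EchoAdapter(ModelAdapter):
    """Toy adapter: 'generates' by appending placeholder tokens to the prompt."""

    def generate(self, input_ids, max_new_tokens=8, **kwargs):
        # Pretend generation: echo the prompt plus max_new_tokens zeros.
        outputs = list(input_ids) + [0] * max_new_tokens
        # The contract added in 8.1.0: record prompt/response lengths in tokens.
        self.prompt_tokens = len(input_ids)
        self.response_tokens = len(outputs) - self.prompt_tokens
        return [outputs]
```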
--- lemonade_sdk-8.0.5/src/lemonade/tools/huggingface/utils.py
+++ lemonade_sdk-8.1.0/src/lemonade/tools/huggingface/utils.py
@@ -108,7 +108,9 @@ class HuggingfaceAdapter(ModelAdapter):
         with torch.no_grad(), torch.inference_mode():
             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)

-
+        self.prompt_tokens = input_ids.shape[1]
+        self.response_tokens = len(outputs[0]) - self.prompt_tokens
+        return outputs

     def _model_call(self, input_tensor):
         """Forward pass through the model to get logits
@@ -341,12 +343,11 @@ def benchmark_huggingface_llm(

         latency = end_time - start_time

-
-        tokens_out_len_list.append(token_len)
+        tokens_out_len_list.append(model.response_tokens)

         # Only count an iteration if it produced enough tokens
-        if
-        per_iteration_result.append((latency,
+        if model.response_tokens >= target_output_tokens:
+            per_iteration_result.append((latency, model.response_tokens))

         report_progress_fn(
             (warmup_iterations + count + 1) / (warmup_iterations + iterations)

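The arithmetic in the first hunk relies on Hugging Face `generate()` returning sequences that still include the prompt, so subtracting the prompt length leaves only the newly generated tokens. A self-contained check with made-up shapes:

```python
import torch

input_ids = torch.ones((1, 7), dtype=torch.long)  # batch of 1, 7 prompt tokens
outputs = torch.ones((1, 19), dtype=torch.long)   # prompt (7) + generated (12)

prompt_tokens = input_ids.shape[1]                 # 7
response_tokens = len(outputs[0]) - prompt_tokens  # 12
assert (prompt_tokens, response_tokens) == (7, 12)
```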
--- lemonade_sdk-8.0.5/src/lemonade/tools/llamacpp/bench.py
+++ lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/bench.py
@@ -3,27 +3,31 @@ import statistics
 from statistics import StatisticsError
 from lemonade.state import State
 from lemonade.cache import Keys
-from lemonade.tools.llamacpp.
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
 from lemonade.tools.bench import Bench


 class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """

-    unique_name = "
+    unique_name = "llamacpp-bench"

     def __init__(self):
         super().__init__()

         # Additional statistics generated by this bench tool
-        self.status_stats
+        self.status_stats.insert(
+            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
             Keys.STD_DEV_TOKENS_PER_SECOND,
-
+        )
         self.std_dev_token_generation_tokens_per_second_list = []

     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Benchmark
+            short_description="Benchmark an LLM in llama.cpp",
             add_help=add_help,
         )
@@ -53,38 +57,22 @@ class LlamaCppBench(Bench):
                 f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
                 "loaded first. Please run load-llama-cpp before this tool."
             )
+        model: LlamaCppAdapter = state.model

-
-
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []

         for iteration in range(iterations + warmup_iterations):
             try:
                 # Use the adapter's generate method which already has the timeout
                 # and error handling
-
-
-
-
-
-
-
-                # Look for timing in both stdout and stderr
-                for output in [raw_output, stderr]:
-                    for line in output.splitlines():
-                        if "llama_perf_context_print: eval time =" in line:
-                            parts = line.split("(")[1].strip()
-                            parts = parts.split(",")
-                            ms_per_token = float(
-                                parts[0].split("ms per token")[0].strip()
-                            )
-                        if "llama_perf_context_print: prompt eval time =" in line:
-                            parts = line.split("=")[1].split("/")
-                            time_to_first_token_ms = float(
-                                parts[0].split("ms")[0].strip()
-                            )
-                            input_tokens = int(parts[1].split("tokens")[0].strip())
-
-                if ms_per_token is None or time_to_first_token_ms is None:
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(
+                    prompt, max_new_tokens=output_tokens, return_raw=True
+                )
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
                     error_msg = (
                         "Could not find timing information in llama.cpp output.\n"
                     )
@@ -92,17 +80,11 @@
                     error_msg += "Stderr:\n" + stderr
                     raise Exception(error_msg)

-
-                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
-                # as performance data for generating a few tokens is not relevant.
-                tokens_per_second = 0
-                if output_tokens > 5 and ms_per_token > 0:
-                    tokens_per_second = 1000 / ms_per_token
-                time_to_first_token = time_to_first_token_ms / 1000
+                self.tokens_out_len_list.append(model.response_tokens)

                 if iteration > warmup_iterations - 1:
-
-
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)

                 report_progress_fn((iteration + 1) / (warmup_iterations + iterations))

@@ -110,25 +92,25 @@
                 error_msg = f"Failed to run benchmark: {str(e)}"
                 raise Exception(error_msg)

-        self.input_ids_len_list.append(
-        mean_time_to_first_token = statistics.mean(
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
         self.prefill_tokens_per_second_list.append(
-
+            model.prompt_tokens / mean_time_to_first_token
         )
         self.token_generation_tokens_per_second_list.append(
-            statistics.mean(
+            statistics.mean(per_iteration_tokens_per_second)
         )
         try:
             self.std_dev_time_to_first_token_list.append(
-                statistics.stdev(
+                statistics.stdev(per_iteration_time_to_first_token)
             )
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_time_to_first_token_list.append(None)
         try:
             self.std_dev_token_generation_tokens_per_second_list.append(
-                statistics.stdev(
+                statistics.stdev(per_iteration_tokens_per_second)
             )
         except StatisticsError:
             # Less than 2 measurements

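The `StatisticsError` handling at the end exists because `statistics.stdev` needs at least two samples; with a single measured iteration the standard deviation is recorded as `None` rather than crashing. The pattern in isolation:

```python
import statistics

samples = [57.3]  # a single benchmark measurement
try:
    std_dev = statistics.stdev(samples)
except statistics.StatisticsError:
    std_dev = None  # fewer than two measurements
print(std_dev)  # None
```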
--- /dev/null
+++ lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/load.py
@@ -0,0 +1,185 @@
+import argparse
+import os
+import lemonade.common.printing as printing
+import lemonade.common.status as status
+from lemonade.state import State
+from lemonade.tools import FirstTool
+from lemonade.cache import Keys
+
+
+class LoadLlamaCpp(FirstTool):
+    unique_name = "llamacpp-load"
+
+    def __init__(self):
+        super().__init__(monitor_message="Loading llama.cpp model")
+
+        self.status_stats = [
+            Keys.DEVICE,
+        ]
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Wrap llama.cpp models with an API",
+            add_help=add_help,
+        )
+
+        parser.add_argument(
+            "-d",
+            "--device",
+            choices=["cpu", "igpu"],
+            default="igpu",
+            help="Which device to load the model on to (default: igpu)",
+        )
+
+        default_threads = -1
+        parser.add_argument(
+            "--threads",
+            required=False,
+            type=int,
+            default=default_threads,
+            help=f"Number of threads to use during generation (default: {default_threads})",
+        )
+
+        context_size = 4096
+        parser.add_argument(
+            "--context-size",
+            required=False,
+            type=int,
+            default=context_size,
+            help=f"Size of the prompt context (default: {context_size}. 0 = loaded from model)",
+        )
+
+        output_tokens = 512
+        parser.add_argument(
+            "--output-tokens",
+            required=False,
+            type=int,
+            default=output_tokens,
+            help=f"Maximum number of output tokens to generate (default: {output_tokens})",
+        )
+
+        parser.add_argument(
+            "--reasoning",
+            action="store_true",
+            help="Set this flag to indicate the model is a reasoning model",
+        )
+
+        return parser
+
+    def run(
+        self,
+        state: State,
+        input: str = "",
+        device: str = "igpu",
+        context_size: int = 512,
+        threads: int = 1,
+        output_tokens: int = 512,
+        reasoning: bool = False,
+    ) -> State:
+        """
+        Load a llama.cpp model
+        """
+
+        from lemonade.common.network import is_offline
+        from lemonade.tools.llamacpp.utils import (
+            install_llamacpp,
+            get_llama_cli_exe_path,
+            get_llama_installed_version,
+            parse_checkpoint,
+            download_gguf,
+            get_local_checkpoint_path,
+            LlamaCppTokenizerAdapter,
+            LlamaCppAdapter,
+        )
+
+        # Validate and install llama.cpp, if needed
+        install_llamacpp()
+
+        # Check if input is a local folder containing a .GGUF model
+        if os.path.isdir(input):
+            # input is a local folder
+            local_model_folder = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
+
+            # See if there is a file ending in ".gguf" in this folder
+            dir = os.listdir(input)
+            gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
+            if len(gguf_files) == 0:
+                raise ValueError(
+                    f"The folder {input} does not contain a GGUF model file."
+                )
+            model_to_use = gguf_files[0]
+            full_model_path = os.path.join(local_model_folder, model_to_use)
+
+        else:
+            # Input is a model checkpoint
+            checkpoint = input
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+            # Make sure that a variant is provided for the GGUF model
+            base_checkpoint, variant = parse_checkpoint(checkpoint)
+            if variant is None:
+                raise ValueError(
+                    "You are required to provide a 'variant' when "
+                    "selecting a GGUF model. The variant is provided "
+                    "as CHECKPOINT:VARIANT. For example: "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
+                )
+
+            # Auto-detect offline status
+            offline = is_offline()
+            if offline:
+                printing.log_warning(
+                    "Network connectivity to huggingface.co not detected. Running in offline mode."
+                )
+                full_model_path, model_to_use = get_local_checkpoint_path(
+                    base_checkpoint, variant
+                )
+                if not full_model_path:
+                    raise ValueError(
+                        f"Model {checkpoint} is not available locally."
+                        f"Cannot download in offline mode."
+                    )
+
+            else:
+
+                snapshot_files = download_gguf(checkpoint)
+                full_model_path = snapshot_files["variant"]
+                model_to_use = os.path.basename(full_model_path)
+
+        llama_cli_exe_path = get_llama_cli_exe_path()
+        printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
+
+        # Get the directory containing the executable for shared libraries
+        lib_dir = os.path.dirname(llama_cli_exe_path)
+
+        # Pass the model and inputs into state
+        state.model = LlamaCppAdapter(
+            model=full_model_path,
+            device=device,
+            output_tokens=output_tokens,
+            context_size=context_size,
+            threads=threads,
+            executable=llama_cli_exe_path,
+            reasoning=reasoning,
+            lib_dir=lib_dir,
+        )
+        state.tokenizer = LlamaCppTokenizerAdapter()
+        state.device = device
+
+        # Save initial stats
+        state.save_stat(Keys.DEVICE, device)
+        state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
+
+        status.add_to_state(state=state, name=input, model=model_to_use)
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD