lemonade-sdk 8.0.5__tar.gz → 8.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lemonade-sdk might be problematic.
Files changed (79)
  1. {lemonade_sdk-8.0.5/src/lemonade_sdk.egg-info → lemonade_sdk-8.1.0}/PKG-INFO +32 -21
  2. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/README.md +7 -3
  3. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/setup.py +26 -19
  4. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/cache.py +3 -1
  5. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/network.py +18 -1
  6. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/adapter.py +6 -0
  7. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/huggingface/utils.py +6 -5
  8. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/llamacpp/bench.py +28 -46
  9. lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/load.py +185 -0
  10. lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/utils.py +612 -0
  11. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/bench.py +5 -6
  12. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/load.py +239 -112
  13. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/utils.py +27 -9
  14. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/prompt.py +17 -25
  15. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/report/table.py +12 -9
  16. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/llamacpp.py +80 -92
  17. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/serve.py +22 -28
  18. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/static/styles.css +121 -26
  19. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/static/webapp.html +14 -6
  20. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/tray.py +7 -0
  21. lemonade_sdk-8.1.0/src/lemonade/version.py +1 -0
  22. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_install/install.py +65 -84
  23. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0/src/lemonade_sdk.egg-info}/PKG-INFO +32 -21
  24. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/SOURCES.txt +1 -0
  25. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/requires.txt +18 -8
  26. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/cli.py +1 -1
  27. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/model_manager.py +8 -151
  28. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/pydantic_models.py +1 -4
  29. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_server/server_models.json +44 -9
  30. lemonade_sdk-8.0.5/src/lemonade/tools/llamacpp/load.py +0 -277
  31. lemonade_sdk-8.0.5/src/lemonade/version.py +0 -1
  32. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/LICENSE +0 -0
  33. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/NOTICE.md +0 -0
  34. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/setup.cfg +0 -0
  35. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/__init__.py +0 -0
  36. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/api.py +0 -0
  37. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/cli.py +0 -0
  38. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/__init__.py +0 -0
  39. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/build.py +0 -0
  40. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/cli_helpers.py +0 -0
  41. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/exceptions.py +0 -0
  42. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/filesystem.py +0 -0
  43. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/inference_engines.py +0 -0
  44. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/printing.py +0 -0
  45. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/status.py +0 -0
  46. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/system_info.py +0 -0
  47. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/common/test_helpers.py +0 -0
  48. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/profilers/__init__.py +0 -0
  49. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/profilers/memory_tracker.py +0 -0
  50. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/profilers/profiler.py +0 -0
  51. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/sequence.py +0 -0
  52. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/state.py +0 -0
  53. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/__init__.py +0 -0
  54. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/accuracy.py +0 -0
  55. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/bench.py +0 -0
  56. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/huggingface/bench.py +0 -0
  57. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/huggingface/load.py +0 -0
  58. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/humaneval.py +0 -0
  59. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/management_tools.py +0 -0
  60. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/mmlu.py +0 -0
  61. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/oga/__init__.py +0 -0
  62. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/perplexity.py +0 -0
  63. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/quark/__init__.py +0 -0
  64. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/quark/quark_load.py +0 -0
  65. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/quark/quark_quantize.py +0 -0
  66. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/report/__init__.py +0 -0
  67. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/report/llm_report.py +0 -0
  68. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/__init__.py +0 -0
  69. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/static/favicon.ico +0 -0
  70. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/tool_calls.py +0 -0
  71. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/utils/port.py +0 -0
  72. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/utils/system_tray.py +0 -0
  73. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/utils/thread.py +0 -0
  74. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/server/webapp.py +0 -0
  75. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade/tools/tool.py +0 -0
  76. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_install/__init__.py +0 -0
  77. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  78. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  79. {lemonade_sdk-8.0.5 → lemonade_sdk-8.1.0}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
--- lemonade_sdk-8.0.5/src/lemonade_sdk.egg-info/PKG-INFO
+++ lemonade_sdk-8.1.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lemonade-sdk
-Version: 8.0.5
+Version: 8.1.0
 Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
 Author-email: lemonade@amd.com
 Requires-Python: >=3.10, <3.13
@@ -22,16 +22,15 @@ Requires-Dist: pytz
 Requires-Dist: zstandard
 Requires-Dist: fastapi
 Requires-Dist: uvicorn[standard]
-Requires-Dist: openai>=1.81.0
-Requires-Dist: transformers<=4.51.3
+Requires-Dist: openai<1.97.1,>=1.81.0
+Requires-Dist: transformers<=4.53.2
 Requires-Dist: jinja2
 Requires-Dist: tabulate
 Requires-Dist: sentencepiece
 Requires-Dist: huggingface-hub==0.33.0
-Provides-Extra: oga-hybrid
-Requires-Dist: onnx==1.16.1; extra == "oga-hybrid"
-Requires-Dist: numpy==1.26.4; extra == "oga-hybrid"
-Requires-Dist: protobuf>=6.30.1; extra == "oga-hybrid"
+Provides-Extra: oga-ryzenai
+Requires-Dist: onnxruntime-genai-directml-ryzenai==0.7.0.2; extra == "oga-ryzenai"
+Requires-Dist: protobuf>=6.30.1; extra == "oga-ryzenai"
 Provides-Extra: oga-cpu
 Requires-Dist: onnxruntime-genai==0.8.2; extra == "oga-cpu"
 Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
@@ -43,14 +42,32 @@ Requires-Dist: pandas>=1.5.3; extra == "dev"
 Requires-Dist: matplotlib; extra == "dev"
 Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
 Requires-Dist: lm-eval[api]; extra == "dev"
+Provides-Extra: oga-hybrid
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid"
+Provides-Extra: oga-unified
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-unified"
 Provides-Extra: oga-hybrid-minimal
-Requires-Dist: lemonade-sdk[oga-hybrid]; extra == "oga-hybrid-minimal"
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-hybrid-minimal"
 Provides-Extra: oga-cpu-minimal
 Requires-Dist: lemonade-sdk[oga-cpu]; extra == "oga-cpu-minimal"
+Provides-Extra: oga-npu-minimal
+Requires-Dist: lemonade-sdk[oga-ryzenai]; extra == "oga-npu-minimal"
 Provides-Extra: llm
 Requires-Dist: lemonade-sdk[dev]; extra == "llm"
 Provides-Extra: llm-oga-cpu
 Requires-Dist: lemonade-sdk[dev,oga-cpu]; extra == "llm-oga-cpu"
+Provides-Extra: llm-oga-npu
+Requires-Dist: onnx==1.16.0; extra == "llm-oga-npu"
+Requires-Dist: onnxruntime==1.18.0; extra == "llm-oga-npu"
+Requires-Dist: numpy==1.26.4; extra == "llm-oga-npu"
+Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-npu"
+Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-npu"
+Provides-Extra: llm-oga-hybrid
+Requires-Dist: onnx==1.16.1; extra == "llm-oga-hybrid"
+Requires-Dist: numpy==1.26.4; extra == "llm-oga-hybrid"
+Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-hybrid"
+Provides-Extra: llm-oga-unified
+Requires-Dist: lemonade-sdk[dev,llm-oga-hybrid]; extra == "llm-oga-unified"
 Provides-Extra: llm-oga-igpu
 Requires-Dist: onnxruntime-genai-directml==0.6.0; extra == "llm-oga-igpu"
 Requires-Dist: onnxruntime-directml<1.22.0,>=1.19.0; extra == "llm-oga-igpu"
@@ -61,16 +78,6 @@ Requires-Dist: onnxruntime-genai-cuda==0.8.2; extra == "llm-oga-cuda"
 Requires-Dist: onnxruntime-gpu>=1.22.0; extra == "llm-oga-cuda"
 Requires-Dist: transformers<=4.51.3; extra == "llm-oga-cuda"
 Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-cuda"
-Provides-Extra: llm-oga-npu
-Requires-Dist: onnx==1.16.0; extra == "llm-oga-npu"
-Requires-Dist: onnxruntime==1.18.0; extra == "llm-oga-npu"
-Requires-Dist: numpy==1.26.4; extra == "llm-oga-npu"
-Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-npu"
-Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-npu"
-Provides-Extra: llm-oga-hybrid
-Requires-Dist: lemonade-sdk[dev,oga-hybrid]; extra == "llm-oga-hybrid"
-Provides-Extra: llm-oga-unified
-Requires-Dist: lemonade-sdk[llm-oga-hybrid]; extra == "llm-oga-unified"
 Dynamic: author-email
 Dynamic: description
 Dynamic: description-content-type
@@ -174,7 +181,7 @@ lemonade-server list
 
 ## Model Library
 
-Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/models/).
+Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/server_models/).
 
 You can also import custom GGUF and ONNX models from Hugging Face by using our [Model Manager](http://localhost:8000/#model-management) (requires server to be running).
 <p align="center">
@@ -263,7 +270,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-For more detailed integration instructions, see the [Integration Guide](./server_integration.md).
+For more detailed integration instructions, see the [Integration Guide](./docs/server/server_integration.md).
 
 ## Beyond an LLM Server
 
@@ -272,6 +279,10 @@ The [Lemonade SDK](./docs/README.md) also include the following components:
 - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
 - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with prompting templates, accuracy testing, performance benchmarking, and memory profiling to characterize your models on your hardware.
 
+## FAQ
+
+To read our frequently asked questions, see our [FAQ Guide](./docs/faq.md)
+
 ## Contributing
 
 We are actively seeking collaborators from across the industry. If you would like to contribute to this project, please check out our [contribution guide](./docs/contribute.md).
@@ -284,7 +295,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue
 
 ## Maintainers
 
-This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), email [lemonade@amd.com](mailto:lemonade@amd.com), or join our [Discord](https://discord.gg/5xXzkMu8Zk).
+This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).
 
 ## License
 
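A quick way to confirm that an upgrade actually picked up the metadata changes above is the standard library's `importlib.metadata`; this minimal check is editorial illustration, not part of the diff:

```python
# Minimal sketch (stdlib only): verify the upgraded metadata locally.
from importlib.metadata import metadata, version

print(version("lemonade-sdk"))       # expect "8.1.0" after upgrading
md = metadata("lemonade-sdk")
print(md.get_all("Provides-Extra"))  # should now include "oga-ryzenai"
```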
--- lemonade_sdk-8.0.5/README.md
+++ lemonade_sdk-8.1.0/README.md
@@ -92,7 +92,7 @@ lemonade-server list
 
 ## Model Library
 
-Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/models/).
+Lemonade supports both GGUF and ONNX models as detailed in the [Supported Configuration](#supported-configurations) section. A list of all built-in models is available [here](https://lemonade-server.ai/docs/server/server_models/).
 
 You can also import custom GGUF and ONNX models from Hugging Face by using our [Model Manager](http://localhost:8000/#model-management) (requires server to be running).
 <p align="center">
@@ -181,7 +181,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-For more detailed integration instructions, see the [Integration Guide](./server_integration.md).
+For more detailed integration instructions, see the [Integration Guide](./docs/server/server_integration.md).
 
 ## Beyond an LLM Server
 
@@ -190,6 +190,10 @@ The [Lemonade SDK](./docs/README.md) also include the following components:
 - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
 - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with prompting templates, accuracy testing, performance benchmarking, and memory profiling to characterize your models on your hardware.
 
+## FAQ
+
+To read our frequently asked questions, see our [FAQ Guide](./docs/faq.md)
+
 ## Contributing
 
 We are actively seeking collaborators from across the industry. If you would like to contribute to this project, please check out our [contribution guide](./docs/contribute.md).
@@ -202,7 +206,7 @@ New contributors can find beginner-friendly issues tagged with "Good First Issue
 
 ## Maintainers
 
-This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), email [lemonade@amd.com](mailto:lemonade@amd.com), or join our [Discord](https://discord.gg/5xXzkMu8Zk).
+This project is sponsored by AMD. It is maintained by @danielholanda @jeremyfowers @ramkrishna @vgodsoe in equal measure. You can reach us by filing an [issue](https://github.com/lemonade-sdk/lemonade/issues), emailing [lemonade@amd.com](mailto:lemonade@amd.com), or joining our [Discord](https://discord.gg/5xXzkMu8Zk).
 
 ## License
 
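The `completion = client.chat.completions.create(` context in the hunks above comes from the README's OpenAI-compatibility example. For orientation, a minimal client sketch follows; the base URL, API key, and model name are assumptions for illustration and are not taken from this diff:

```python
# Illustrative client sketch; base_url, api_key, and model are assumed values.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v1", api_key="lemonade")
completion = client.chat.completions.create(
    model="Qwen2.5-Coder-3B-Instruct-GGUF",  # hypothetical model id
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```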
--- lemonade_sdk-8.0.5/setup.py
+++ lemonade_sdk-8.1.0/setup.py
@@ -44,8 +44,8 @@ setup(
         "zstandard",
         "fastapi",
         "uvicorn[standard]",
-        "openai>=1.81.0",
-        "transformers<=4.51.3",
+        "openai>=1.81.0,<1.97.1",
+        "transformers<=4.53.2",
         "jinja2",
         "tabulate",
         "sentencepiece",
@@ -54,11 +54,9 @@ setup(
     extras_require={
         # The non-dev extras are meant to deploy specific backends into end-user
        # applications, without including developer-focused tools
-        "oga-hybrid": [
-            # Note: `lemonade-install --ryzenai hybrid` is necessary
-            # to complete installation
-            "onnx==1.16.1",
-            "numpy==1.26.4",
+        # Primary NPU extra using unified PyPI package
+        "oga-ryzenai": [
+            "onnxruntime-genai-directml-ryzenai==0.7.0.2",
             "protobuf>=6.30.1",
         ],
         "oga-cpu": [
@@ -81,11 +79,31 @@ setup(
             "lm-eval[api]",
         ],
         # Keep backwards compatibility for old extras names
-        "oga-hybrid-minimal": ["lemonade-sdk[oga-hybrid]"],
+        "oga-hybrid": ["lemonade-sdk[oga-ryzenai]"],
+        "oga-unified": ["lemonade-sdk[oga-ryzenai]"],
+        "oga-hybrid-minimal": ["lemonade-sdk[oga-ryzenai]"],
         "oga-cpu-minimal": ["lemonade-sdk[oga-cpu]"],
+        "oga-npu-minimal": ["lemonade-sdk[oga-ryzenai]"],
         "llm": ["lemonade-sdk[dev]"],
         "llm-oga-cpu": ["lemonade-sdk[dev,oga-cpu]"],
         # The following extras are deprecated and/or not commonly used
+        "llm-oga-npu": [
+            "onnx==1.16.0",
+            # NPU requires specific onnxruntime version for Ryzen AI compatibility
+            # This may conflict with other OGA extras that require >=1.22.0
+            "onnxruntime==1.18.0",
+            "numpy==1.26.4",
+            "protobuf>=6.30.1",
+            "lemonade-sdk[dev]",
+        ],
+        "llm-oga-hybrid": [
+            # Note: `lemonade-install --ryzenai hybrid` is necessary
+            # to complete installation for RAI 1.4.0.
+            "onnx==1.16.1",
+            "numpy==1.26.4",
+            "protobuf>=6.30.1",
+        ],
+        "llm-oga-unified": ["lemonade-sdk[dev, llm-oga-hybrid]"],
         "llm-oga-igpu": [
             "onnxruntime-genai-directml==0.6.0",
             "onnxruntime-directml>=1.19.0,<1.22.0",
@@ -98,17 +116,6 @@ setup(
             "transformers<=4.51.3",
             "lemonade-sdk[dev]",
         ],
-        "llm-oga-npu": [
-            "onnx==1.16.0",
-            # NPU requires specific onnxruntime version for Ryzen AI compatibility
-            # This may conflict with other OGA extras that require >=1.22.0
-            "onnxruntime==1.18.0",
-            "numpy==1.26.4",
-            "protobuf>=6.30.1",
-            "lemonade-sdk[dev]",
-        ],
-        "llm-oga-hybrid": ["lemonade-sdk[dev,oga-hybrid]"],
-        "llm-oga-unified": ["lemonade-sdk[llm-oga-hybrid]"],
     },
     classifiers=[],
     entry_points={
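The backwards-compatibility mechanism above is worth noting: a legacy extra stays installable by declaring a self-referencing requirement on the package with the new extra enabled, so `pip install lemonade-sdk[oga-hybrid]` now resolves to the `oga-ryzenai` dependency set. A standalone sketch of the pattern, trimmed from the diff:

```python
# Sketch of the self-referential extras-aliasing pattern used above.
from setuptools import setup

setup(
    name="lemonade-sdk",
    extras_require={
        "oga-ryzenai": [
            "onnxruntime-genai-directml-ryzenai==0.7.0.2",
            "protobuf>=6.30.1",
        ],
        # Legacy name kept as a one-line alias to the new extra:
        "oga-hybrid": ["lemonade-sdk[oga-ryzenai]"],
    },
)
```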
--- lemonade_sdk-8.0.5/src/lemonade/cache.py
+++ lemonade_sdk-8.1.0/src/lemonade/cache.py
@@ -34,7 +34,7 @@ def build_name(input_name):
     """
     Name the lemonade build by concatenating these two factors:
       1. Sanitize the input name (typically a model checkpoint name) by
-         replacing any `/` characters with `_`.
+         replacing any `/` characters with `_` and ':' characters with '-'.
       2. Timestamp to ensure that builds in the same cache will not
         collide in the same build directory.
 
@@ -47,6 +47,7 @@ def build_name(input_name):
     else:
         # Sanitize the input name
         input_name_sanitized = input_name.replace("/", "_")
+        input_name_sanitized = input_name_sanitized.replace(":", "-")
 
         # Get the formatted timestamp string
         timestamp = get_timestamp()
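The extra `:` replacement matters because GGUF checkpoints are addressed as `CHECKPOINT:VARIANT` (see the new `llamacpp-load` tool later in this diff), and `:` is not a legal character in Windows directory names. A quick illustration of the sanitization (timestamp step omitted):

```python
# Illustration of the sanitization added above; the input is a GGUF
# checkpoint-with-variant name of the kind used elsewhere in this release.
input_name = "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0"
input_name_sanitized = input_name.replace("/", "_").replace(":", "-")
print(input_name_sanitized)  # Qwen_Qwen2.5-Coder-3B-Instruct-GGUF-Q4_0
```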
@@ -79,6 +80,7 @@ class Keys:
     MAX_MEMORY_USED_GB = "max_memory_used_GB"
     MAX_MEMORY_USED_GBYTE = "max_memory_used_gbyte"
     RYZEN_AI_VERSION_INFO = "ryzen_ai_version_info"
+    LLAMA_CLI_VERSION_INFO = "llama_cli_version_info"
 
 
 # This file was originally licensed under Apache 2.0. It has been modified.
--- lemonade_sdk-8.0.5/src/lemonade/common/network.py
+++ lemonade_sdk-8.1.0/src/lemonade/common/network.py
@@ -1,7 +1,7 @@
 import os
 from typing import Optional
 import socket
-from huggingface_hub import model_info
+from huggingface_hub import model_info, snapshot_download
 
 
 def is_offline():
@@ -48,3 +48,20 @@ def get_base_model(checkpoint: str) -> Optional[str]:
     except Exception:  # pylint: disable=broad-except
         pass
     return None
+
+
+def custom_snapshot_download(repo_id, **kwargs):
+    """
+    Custom snapshot download with retry logic for Windows symlink privilege errors.
+    """
+    for attempt in range(2):
+        try:
+            return snapshot_download(repo_id=repo_id, **kwargs)
+        except OSError as e:
+            if (
+                hasattr(e, "winerror")
+                and e.winerror == 1314  # pylint: disable=no-member
+                and attempt < 1
+            ):
+                continue
+            raise
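For context: on Windows, `snapshot_download` can raise `OSError` with `winerror` 1314 ("a required privilege is not held by the client") when the process is not allowed to create symlinks; `huggingface_hub` can fall back to copying files instead, so a single retry is usually enough. A usage sketch, with an illustrative repo id and kwargs:

```python
# Usage sketch; kwargs are forwarded to huggingface_hub.snapshot_download.
from lemonade.common.network import custom_snapshot_download

path = custom_snapshot_download(
    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF",  # illustrative repo id
    allow_patterns=["*q4_0.gguf"],          # only fetch the variant we need
)
print(path)  # local snapshot directory
```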
--- lemonade_sdk-8.0.5/src/lemonade/tools/adapter.py
+++ lemonade_sdk-8.1.0/src/lemonade/tools/adapter.py
@@ -13,6 +13,9 @@ class ModelAdapter(abc.ABC):
         """
         self.tokens_per_second = None
         self.time_to_first_token = None
+        self.prompt_tokens = None
+        self.response_tokens = None
+
         self.type = "generic"
 
     @abc.abstractmethod
@@ -22,6 +25,9 @@ class ModelAdapter(abc.ABC):
 
         We try to keep the signature here minimal to allow for maximum compatibility
         with recipe components, which themselves may not support a lot of arguments.
+
+        The generate method should store prompt and response lengths (in tokens)
+        in the prompt_tokens and response_tokens members.
         """
 
 
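The new `prompt_tokens`/`response_tokens` members set up a small contract: each adapter's `generate` records how many tokens went in and came out so downstream tools (the benchmarks below) can read the counts back instead of re-deriving them. A toy subclass sketch; the fake generation and import path are illustrative assumptions:

```python
from lemonade.tools.adapter import ModelAdapter  # path assumed from this diff


class ToyAdapter(ModelAdapter):
    """Illustrative adapter that 'generates' by appending padding tokens."""

    def generate(self, input_ids, max_new_tokens=8):
        outputs = list(input_ids) + [0] * max_new_tokens  # fake generation
        # The contract from the docstring above: store lengths in tokens
        self.prompt_tokens = len(input_ids)
        self.response_tokens = len(outputs) - self.prompt_tokens
        return [outputs]
```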
--- lemonade_sdk-8.0.5/src/lemonade/tools/huggingface/utils.py
+++ lemonade_sdk-8.1.0/src/lemonade/tools/huggingface/utils.py
@@ -108,7 +108,9 @@ class HuggingfaceAdapter(ModelAdapter):
         with torch.no_grad(), torch.inference_mode():
             outputs = self.model.generate(input_ids=input_ids, **generation_kwargs)
 
-        return outputs
+        self.prompt_tokens = input_ids.shape[1]
+        self.response_tokens = len(outputs[0]) - self.prompt_tokens
+        return outputs
 
     def _model_call(self, input_tensor):
         """Forward pass through the model to get logits
@@ -341,12 +343,11 @@ def benchmark_huggingface_llm(
 
         latency = end_time - start_time
 
-        token_len = outputs.shape[1] - input_ids.shape[1]
-        tokens_out_len_list.append(token_len)
+        tokens_out_len_list.append(model.response_tokens)
 
         # Only count an iteration if it produced enough tokens
-        if token_len >= target_output_tokens:
-            per_iteration_result.append((latency, token_len))
+        if model.response_tokens >= target_output_tokens:
+            per_iteration_result.append((latency, model.response_tokens))
 
         report_progress_fn(
             (warmup_iterations + count + 1) / (warmup_iterations + iterations)
--- lemonade_sdk-8.0.5/src/lemonade/tools/llamacpp/bench.py
+++ lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/bench.py
@@ -3,27 +3,31 @@ import statistics
 from statistics import StatisticsError
 from lemonade.state import State
 from lemonade.cache import Keys
-from lemonade.tools.llamacpp.load import LlamaCppAdapter
+from lemonade.tools.llamacpp.utils import LlamaCppAdapter
 from lemonade.tools.bench import Bench
 
 
 class LlamaCppBench(Bench):
+    """
+    Benchmark a llama.cpp model
+    """
 
-    unique_name = "llama-cpp-bench"
+    unique_name = "llamacpp-bench"
 
     def __init__(self):
         super().__init__()
 
         # Additional statistics generated by this bench tool
-        self.status_stats += [
+        self.status_stats.insert(
+            self.status_stats.index(Keys.TOKEN_GENERATION_TOKENS_PER_SECOND) + 1,
             Keys.STD_DEV_TOKENS_PER_SECOND,
-        ]
+        )
         self.std_dev_token_generation_tokens_per_second_list = []
 
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Benchmark a llama.cpp model",
+            short_description="Benchmark an LLM in llama.cpp",
             add_help=add_help,
         )
 
@@ -53,38 +57,22 @@ class LlamaCppBench(Bench):
                 f"{self.__class__.unique_name} requires a LlamaCppAdapter model to be "
                 "loaded first. Please run load-llama-cpp before this tool."
             )
+        model: LlamaCppAdapter = state.model
 
-        iteration_tokens_per_second = []
-        iteration_time_to_first_token = []
+        per_iteration_tokens_per_second = []
+        per_iteration_time_to_first_token = []
 
         for iteration in range(iterations + warmup_iterations):
             try:
                 # Use the adapter's generate method which already has the timeout
                 # and error handling
-                raw_output, stderr = state.model.generate(prompt, return_raw=True)
-
-                # Parse the timing information from the output
-                ms_per_token = None
-                time_to_first_token_ms = None
-                input_tokens = None
-
-                # Look for timing in both stdout and stderr
-                for output in [raw_output, stderr]:
-                    for line in output.splitlines():
-                        if "llama_perf_context_print: eval time =" in line:
-                            parts = line.split("(")[1].strip()
-                            parts = parts.split(",")
-                            ms_per_token = float(
-                                parts[0].split("ms per token")[0].strip()
-                            )
-                        if "llama_perf_context_print: prompt eval time =" in line:
-                            parts = line.split("=")[1].split("/")
-                            time_to_first_token_ms = float(
-                                parts[0].split("ms")[0].strip()
-                            )
-                            input_tokens = int(parts[1].split("tokens")[0].strip())
-
-                if ms_per_token is None or time_to_first_token_ms is None:
+                model.time_to_first_token = None
+                model.tokens_per_second = None
+                raw_output, stderr = model.generate(
+                    prompt, max_new_tokens=output_tokens, return_raw=True
+                )
+
+                if model.time_to_first_token is None or model.tokens_per_second is None:
                     error_msg = (
                         "Could not find timing information in llama.cpp output.\n"
                     )
@@ -92,17 +80,11 @@ class LlamaCppBench(Bench):
                     error_msg += "Stderr:\n" + stderr
                     raise Exception(error_msg)
 
-                # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0
-                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
-                # as performance data for generating a few tokens is not relevant.
-                tokens_per_second = 0
-                if output_tokens > 5 and ms_per_token > 0:
-                    tokens_per_second = 1000 / ms_per_token
-                time_to_first_token = time_to_first_token_ms / 1000
+                self.tokens_out_len_list.append(model.response_tokens)
 
                 if iteration > warmup_iterations - 1:
-                    iteration_tokens_per_second.append(tokens_per_second)
-                    iteration_time_to_first_token.append(time_to_first_token)
+                    per_iteration_tokens_per_second.append(model.tokens_per_second)
+                    per_iteration_time_to_first_token.append(model.time_to_first_token)
 
                 report_progress_fn((iteration + 1) / (warmup_iterations + iterations))
 
@@ -110,25 +92,25 @@ class LlamaCppBench(Bench):
                 error_msg = f"Failed to run benchmark: {str(e)}"
                 raise Exception(error_msg)
 
-        self.input_ids_len_list.append(input_tokens)
-        mean_time_to_first_token = statistics.mean(iteration_time_to_first_token)
+        self.input_ids_len_list.append(model.prompt_tokens)
+        mean_time_to_first_token = statistics.mean(per_iteration_time_to_first_token)
         self.mean_time_to_first_token_list.append(mean_time_to_first_token)
         self.prefill_tokens_per_second_list.append(
-            input_tokens / mean_time_to_first_token
+            model.prompt_tokens / mean_time_to_first_token
        )
         self.token_generation_tokens_per_second_list.append(
-            statistics.mean(iteration_tokens_per_second)
+            statistics.mean(per_iteration_tokens_per_second)
         )
         try:
             self.std_dev_time_to_first_token_list.append(
-                statistics.stdev(iteration_time_to_first_token)
+                statistics.stdev(per_iteration_time_to_first_token)
             )
         except StatisticsError:
             # Less than 2 measurements
             self.std_dev_time_to_first_token_list.append(None)
         try:
             self.std_dev_token_generation_tokens_per_second_list.append(
-                statistics.stdev(iteration_tokens_per_second)
+                statistics.stdev(per_iteration_tokens_per_second)
             )
         except StatisticsError:
             # Less than 2 measurements
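The aggregation the refactored bench performs is unchanged in meaning: prefill tokens/second is prompt length divided by mean time-to-first-token, and the standard deviations require at least two measured iterations (hence the `StatisticsError` guards). A worked example with made-up numbers:

```python
import statistics

prompt_tokens = 64
per_iteration_time_to_first_token = [0.50, 0.46, 0.48]  # seconds
per_iteration_tokens_per_second = [21.8, 22.4, 22.0]    # generation speed

mean_ttft = statistics.mean(per_iteration_time_to_first_token)  # 0.48 s
prefill_tps = prompt_tokens / mean_ttft                         # ~133 tokens/s
std_tps = statistics.stdev(per_iteration_tokens_per_second)     # ~0.31
```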
--- /dev/null
+++ lemonade_sdk-8.1.0/src/lemonade/tools/llamacpp/load.py
@@ -0,0 +1,185 @@
+import argparse
+import os
+import lemonade.common.printing as printing
+import lemonade.common.status as status
+from lemonade.state import State
+from lemonade.tools import FirstTool
+from lemonade.cache import Keys
+
+
+class LoadLlamaCpp(FirstTool):
+    unique_name = "llamacpp-load"
+
+    def __init__(self):
+        super().__init__(monitor_message="Loading llama.cpp model")
+
+        self.status_stats = [
+            Keys.DEVICE,
+        ]
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Wrap llama.cpp models with an API",
+            add_help=add_help,
+        )
+
+        parser.add_argument(
+            "-d",
+            "--device",
+            choices=["cpu", "igpu"],
+            default="igpu",
+            help="Which device to load the model on to (default: igpu)",
+        )
+
+        default_threads = -1
+        parser.add_argument(
+            "--threads",
+            required=False,
+            type=int,
+            default=default_threads,
+            help=f"Number of threads to use during generation (default: {default_threads})",
+        )
+
+        context_size = 4096
+        parser.add_argument(
+            "--context-size",
+            required=False,
+            type=int,
+            default=context_size,
+            help=f"Size of the prompt context (default: {context_size}. 0 = loaded from model)",
+        )
+
+        output_tokens = 512
+        parser.add_argument(
+            "--output-tokens",
+            required=False,
+            type=int,
+            default=output_tokens,
+            help=f"Maximum number of output tokens to generate (default: {output_tokens})",
+        )
+
+        parser.add_argument(
+            "--reasoning",
+            action="store_true",
+            help="Set this flag to indicate the model is a reasoning model",
+        )
+
+        return parser
+
+    def run(
+        self,
+        state: State,
+        input: str = "",
+        device: str = "igpu",
+        context_size: int = 512,
+        threads: int = 1,
+        output_tokens: int = 512,
+        reasoning: bool = False,
+    ) -> State:
+        """
+        Load a llama.cpp model
+        """
+
+        from lemonade.common.network import is_offline
+        from lemonade.tools.llamacpp.utils import (
+            install_llamacpp,
+            get_llama_cli_exe_path,
+            get_llama_installed_version,
+            parse_checkpoint,
+            download_gguf,
+            get_local_checkpoint_path,
+            LlamaCppTokenizerAdapter,
+            LlamaCppAdapter,
+        )
+
+        # Validate and install llama.cpp, if needed
+        install_llamacpp()
+
+        # Check if input is a local folder containing a .GGUF model
+        if os.path.isdir(input):
+            # input is a local folder
+            local_model_folder = os.path.abspath(input)
+            checkpoint = "local_model"
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+            state.save_stat(Keys.LOCAL_MODEL_FOLDER, local_model_folder)
+
+            # See if there is a file ending in ".gguf" in this folder
+            dir = os.listdir(input)
+            gguf_files = [filename for filename in dir if filename.endswith(".gguf")]
+            if len(gguf_files) == 0:
+                raise ValueError(
+                    f"The folder {input} does not contain a GGUF model file."
+                )
+            model_to_use = gguf_files[0]
+            full_model_path = os.path.join(local_model_folder, model_to_use)
+
+        else:
+            # Input is a model checkpoint
+            checkpoint = input
+            state.checkpoint = checkpoint
+            state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+            # Make sure that a variant is provided for the GGUF model
+            base_checkpoint, variant = parse_checkpoint(checkpoint)
+            if variant is None:
+                raise ValueError(
+                    "You are required to provide a 'variant' when "
+                    "selecting a GGUF model. The variant is provided "
+                    "as CHECKPOINT:VARIANT. For example: "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                    "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
+                )
+
+            # Auto-detect offline status
+            offline = is_offline()
+            if offline:
+                printing.log_warning(
+                    "Network connectivity to huggingface.co not detected. Running in offline mode."
+                )
+                full_model_path, model_to_use = get_local_checkpoint_path(
+                    base_checkpoint, variant
+                )
+                if not full_model_path:
+                    raise ValueError(
+                        f"Model {checkpoint} is not available locally."
+                        f"Cannot download in offline mode."
+                    )
+
+            else:
+
+                snapshot_files = download_gguf(checkpoint)
+                full_model_path = snapshot_files["variant"]
+                model_to_use = os.path.basename(full_model_path)
+
+        llama_cli_exe_path = get_llama_cli_exe_path()
+        printing.log_info(f"Using llama_cli for GGUF model: {llama_cli_exe_path}")
+
+        # Get the directory containing the executable for shared libraries
+        lib_dir = os.path.dirname(llama_cli_exe_path)
+
+        # Pass the model and inputs into state
+        state.model = LlamaCppAdapter(
+            model=full_model_path,
+            device=device,
+            output_tokens=output_tokens,
+            context_size=context_size,
+            threads=threads,
+            executable=llama_cli_exe_path,
+            reasoning=reasoning,
+            lib_dir=lib_dir,
+        )
+        state.tokenizer = LlamaCppTokenizerAdapter()
+        state.device = device
+
+        # Save initial stats
+        state.save_stat(Keys.DEVICE, device)
+        state.save_stat(Keys.LLAMA_CLI_VERSION_INFO, get_llama_installed_version())
+
+        status.add_to_state(state=state, name=input, model=model_to_use)
+        return state
+
+
+# This file was originally licensed under Apache 2.0. It has been modified.
+# Modifications Copyright (c) 2025 AMD
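Tying the new file to the renamed bench tool: a CLI run would look something like `lemonade -i Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 llamacpp-load --device igpu llamacpp-bench` (syntax inferred from the tool names in this diff, not quoted from docs). Programmatically, a hedged sketch:

```python
# Hedged sketch; the State constructor arguments are assumptions, while the
# run() signature comes from load.py above.
from lemonade.state import State
from lemonade.tools.llamacpp.load import LoadLlamaCpp

state = State(cache_dir="~/.cache/lemonade", build_name="demo")  # args assumed
state = LoadLlamaCpp().run(
    state,
    input="Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0",  # CHECKPOINT:VARIANT
    device="igpu",
)
print(type(state.model))  # LlamaCppAdapter wrapping the downloaded .gguf
```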