lemonade-sdk 8.0.2__tar.gz → 8.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic. Click here for more details.

Files changed (77) hide show
  1. {lemonade_sdk-8.0.2/src/lemonade_sdk.egg-info → lemonade_sdk-8.0.4}/PKG-INFO +33 -36
  2. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/README.md +4 -11
  3. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/setup.py +26 -26
  4. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/cli.py +2 -2
  5. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/profilers/profiler.py +4 -1
  6. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/humaneval.py +1 -1
  7. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/mmlu.py +1 -1
  8. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/load.py +3 -9
  9. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/perplexity.py +2 -2
  10. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/prompt.py +21 -6
  11. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/quark_load.py +1 -1
  12. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/quark_quantize.py +2 -2
  13. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/table.py +80 -0
  14. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/llamacpp.py +148 -16
  15. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/serve.py +73 -0
  16. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/styles.css +424 -4
  17. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/webapp.html +337 -38
  18. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/tray.py +25 -9
  19. lemonade_sdk-8.0.4/src/lemonade/version.py +1 -0
  20. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4/src/lemonade_sdk.egg-info}/PKG-INFO +33 -36
  21. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/requires.txt +23 -16
  22. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/model_manager.py +123 -36
  23. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/pydantic_models.py +25 -1
  24. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/server_models.json +53 -43
  25. lemonade_sdk-8.0.2/src/lemonade/version.py +0 -1
  26. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/LICENSE +0 -0
  27. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/NOTICE.md +0 -0
  28. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/pyproject.toml +0 -0
  29. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/setup.cfg +0 -0
  30. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/__init__.py +0 -0
  31. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/api.py +0 -0
  32. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/cache.py +0 -0
  33. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/__init__.py +0 -0
  34. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/build.py +0 -0
  35. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/cli_helpers.py +0 -0
  36. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/exceptions.py +0 -0
  37. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/filesystem.py +0 -0
  38. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/network.py +0 -0
  39. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/printing.py +0 -0
  40. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/status.py +0 -0
  41. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/system_info.py +0 -0
  42. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/test_helpers.py +0 -0
  43. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/profilers/__init__.py +0 -0
  44. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/profilers/memory_tracker.py +0 -0
  45. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/sequence.py +0 -0
  46. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/state.py +0 -0
  47. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/__init__.py +0 -0
  48. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/accuracy.py +0 -0
  49. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/adapter.py +0 -0
  50. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/bench.py +0 -0
  51. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/bench.py +0 -0
  52. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/load.py +0 -0
  53. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/utils.py +0 -0
  54. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/llamacpp/bench.py +0 -0
  55. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/llamacpp/load.py +0 -0
  56. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/management_tools.py +0 -0
  57. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/__init__.py +0 -0
  58. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/bench.py +0 -0
  59. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/utils.py +0 -0
  60. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/__init__.py +0 -0
  61. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/__init__.py +0 -0
  62. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/llm_report.py +0 -0
  63. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/__init__.py +0 -0
  64. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/favicon.ico +0 -0
  65. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/tool_calls.py +0 -0
  66. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/port.py +0 -0
  67. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/system_tray.py +0 -0
  68. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/thread.py +0 -0
  69. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/webapp.py +0 -0
  70. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/tool.py +0 -0
  71. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_install/__init__.py +0 -0
  72. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_install/install.py +0 -0
  73. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/SOURCES.txt +0 -0
  74. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
  75. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
  76. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
  77. {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/cli.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lemonade-sdk
3
- Version: 8.0.2
3
+ Version: 8.0.4
4
4
  Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
5
5
  Author-email: lemonade@amd.com
6
6
  Requires-Python: >=3.10, <3.12
@@ -26,45 +26,49 @@ Requires-Dist: openai>=1.81.0
26
26
  Requires-Dist: transformers<=4.51.3
27
27
  Requires-Dist: jinja2
28
28
  Requires-Dist: tabulate
29
- Requires-Dist: huggingface-hub==0.30.2
29
+ Requires-Dist: sentencepiece
30
+ Requires-Dist: huggingface-hub==0.33.0
31
+ Provides-Extra: oga-hybrid
32
+ Requires-Dist: onnx==1.16.1; extra == "oga-hybrid"
33
+ Requires-Dist: numpy==1.26.4; extra == "oga-hybrid"
34
+ Requires-Dist: protobuf>=6.30.1; extra == "oga-hybrid"
35
+ Provides-Extra: oga-cpu
36
+ Requires-Dist: onnxruntime-genai==0.8.2; extra == "oga-cpu"
37
+ Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
38
+ Provides-Extra: dev
39
+ Requires-Dist: torch>=2.6.0; extra == "dev"
40
+ Requires-Dist: accelerate; extra == "dev"
41
+ Requires-Dist: datasets; extra == "dev"
42
+ Requires-Dist: pandas>=1.5.3; extra == "dev"
43
+ Requires-Dist: matplotlib; extra == "dev"
44
+ Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
45
+ Requires-Dist: lm-eval[api]; extra == "dev"
30
46
  Provides-Extra: oga-hybrid-minimal
31
- Requires-Dist: onnx==1.16.1; extra == "oga-hybrid-minimal"
32
- Requires-Dist: numpy==1.26.4; extra == "oga-hybrid-minimal"
33
- Requires-Dist: protobuf>=6.30.1; extra == "oga-hybrid-minimal"
47
+ Requires-Dist: lemonade-sdk[oga-hybrid]; extra == "oga-hybrid-minimal"
34
48
  Provides-Extra: oga-cpu-minimal
35
- Requires-Dist: onnxruntime-genai==0.6.0; extra == "oga-cpu-minimal"
36
- Requires-Dist: onnxruntime<1.22.0,>=1.10.1; extra == "oga-cpu-minimal"
49
+ Requires-Dist: lemonade-sdk[oga-cpu]; extra == "oga-cpu-minimal"
37
50
  Provides-Extra: llm
38
- Requires-Dist: torch>=2.6.0; extra == "llm"
39
- Requires-Dist: accelerate; extra == "llm"
40
- Requires-Dist: sentencepiece; extra == "llm"
41
- Requires-Dist: datasets; extra == "llm"
42
- Requires-Dist: pandas>=1.5.3; extra == "llm"
43
- Requires-Dist: matplotlib; extra == "llm"
44
- Requires-Dist: human-eval-windows==1.0.4; extra == "llm"
45
- Requires-Dist: lm-eval[api]; extra == "llm"
51
+ Requires-Dist: lemonade-sdk[dev]; extra == "llm"
46
52
  Provides-Extra: llm-oga-cpu
47
- Requires-Dist: lemonade-sdk[oga-cpu-minimal]; extra == "llm-oga-cpu"
48
- Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cpu"
53
+ Requires-Dist: lemonade-sdk[dev,oga-cpu]; extra == "llm-oga-cpu"
49
54
  Provides-Extra: llm-oga-igpu
50
55
  Requires-Dist: onnxruntime-genai-directml==0.6.0; extra == "llm-oga-igpu"
51
56
  Requires-Dist: onnxruntime-directml<1.22.0,>=1.19.0; extra == "llm-oga-igpu"
52
57
  Requires-Dist: transformers<4.45.0; extra == "llm-oga-igpu"
53
- Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-igpu"
58
+ Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-igpu"
54
59
  Provides-Extra: llm-oga-cuda
55
- Requires-Dist: onnxruntime-genai-cuda==0.6.0; extra == "llm-oga-cuda"
56
- Requires-Dist: onnxruntime-gpu<1.22.0,>=1.19.1; extra == "llm-oga-cuda"
57
- Requires-Dist: transformers<4.45.0; extra == "llm-oga-cuda"
58
- Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cuda"
60
+ Requires-Dist: onnxruntime-genai-cuda==0.8.2; extra == "llm-oga-cuda"
61
+ Requires-Dist: onnxruntime-gpu>=1.22.0; extra == "llm-oga-cuda"
62
+ Requires-Dist: transformers<=4.51.3; extra == "llm-oga-cuda"
63
+ Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-cuda"
59
64
  Provides-Extra: llm-oga-npu
60
65
  Requires-Dist: onnx==1.16.0; extra == "llm-oga-npu"
61
66
  Requires-Dist: onnxruntime==1.18.0; extra == "llm-oga-npu"
62
67
  Requires-Dist: numpy==1.26.4; extra == "llm-oga-npu"
63
68
  Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-npu"
64
- Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-npu"
69
+ Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-npu"
65
70
  Provides-Extra: llm-oga-hybrid
66
- Requires-Dist: lemonade-sdk[oga-hybrid-minimal]; extra == "llm-oga-hybrid"
67
- Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-hybrid"
71
+ Requires-Dist: lemonade-sdk[dev,oga-hybrid]; extra == "llm-oga-hybrid"
68
72
  Provides-Extra: llm-oga-unified
69
73
  Requires-Dist: lemonade-sdk[llm-oga-hybrid]; extra == "llm-oga-unified"
70
74
  Dynamic: author-email
@@ -78,7 +82,7 @@ Dynamic: summary
78
82
 
79
83
  [![Lemonade tests](https://github.com/lemonade-sdk/lemonade/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/lemonade-sdk/lemonade/tree/main/test "Check out our tests")
80
84
  [![OS - Windows | Linux](https://img.shields.io/badge/OS-windows%20%7C%20linux-blue)](docs/README.md#installation "Check out our instructions")
81
- [![Made with Python](https://img.shields.io/badge/Python-3.8,3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
85
+ [![Made with Python](https://img.shields.io/badge/Python-3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
82
86
 
83
87
  ## 🍋 Lemonade SDK: Quickly serve, benchmark and deploy LLMs
84
88
 
@@ -93,8 +97,8 @@ The [Lemonade SDK](./docs/README.md) makes it easy to run Large Language Models
93
97
  The [Lemonade SDK](./docs/README.md) is comprised of the following:
94
98
 
95
99
  - 🌐 **[Lemonade Server](https://lemonade-server.ai/docs)**: A local LLM server for running ONNX and GGUF models using the OpenAI API standard. Install and enable your applications with NPU and GPU acceleration in minutes.
96
- - 🐍 **Lemonade API**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
97
- - 🖥️ **Lemonade CLI**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
100
+ - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
101
+ - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
98
102
  - Prompting with templates.
99
103
  - Measuring accuracy with a variety of tests.
100
104
  - Benchmarking to get the time-to-first-token and tokens per second.
@@ -149,14 +153,7 @@ Maximum LLM performance requires the right hardware accelerator with the right i
149
153
  </tbody>
150
154
  </table>
151
155
 
152
-
153
-
154
- #### Inference Engines Overview
155
- | Engine | Description |
156
- | :--- | :--- |
157
- | **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). |
158
- | **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). |
159
- | **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. |
156
+ To learn more about the supported hardware and software, visit the documentation [here](./docs/README.md#software-and-hardware-overview).
160
157
 
161
158
  ## Integrate Lemonade Server with Your Application
162
159
 
@@ -1,6 +1,6 @@
1
1
  [![Lemonade tests](https://github.com/lemonade-sdk/lemonade/actions/workflows/test_lemonade.yml/badge.svg)](https://github.com/lemonade-sdk/lemonade/tree/main/test "Check out our tests")
2
2
  [![OS - Windows | Linux](https://img.shields.io/badge/OS-windows%20%7C%20linux-blue)](docs/README.md#installation "Check out our instructions")
3
- [![Made with Python](https://img.shields.io/badge/Python-3.8,3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
3
+ [![Made with Python](https://img.shields.io/badge/Python-3.10-blue?logo=python&logoColor=white)](docs/README.md#installation "Check out our instructions")
4
4
 
5
5
  ## 🍋 Lemonade SDK: Quickly serve, benchmark and deploy LLMs
6
6
 
@@ -15,8 +15,8 @@ The [Lemonade SDK](./docs/README.md) makes it easy to run Large Language Models
15
15
  The [Lemonade SDK](./docs/README.md) is comprised of the following:
16
16
 
17
17
  - 🌐 **[Lemonade Server](https://lemonade-server.ai/docs)**: A local LLM server for running ONNX and GGUF models using the OpenAI API standard. Install and enable your applications with NPU and GPU acceleration in minutes.
18
- - 🐍 **Lemonade API**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
19
- - 🖥️ **Lemonade CLI**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
18
+ - 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
19
+ - 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
20
20
  - Prompting with templates.
21
21
  - Measuring accuracy with a variety of tests.
22
22
  - Benchmarking to get the time-to-first-token and tokens per second.
@@ -71,14 +71,7 @@ Maximum LLM performance requires the right hardware accelerator with the right i
71
71
  </tbody>
72
72
  </table>
73
73
 
74
-
75
-
76
- #### Inference Engines Overview
77
- | Engine | Description |
78
- | :--- | :--- |
79
- | **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). |
80
- | **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). |
81
- | **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. |
74
+ To learn more about the supported hardware and software, visit the documentation [here](./docs/README.md#software-and-hardware-overview).
82
75
 
83
76
  ## Integrate Lemonade Server with Your Application
84
77
 
@@ -48,29 +48,30 @@ setup(
48
48
  "transformers<=4.51.3",
49
49
  "jinja2",
50
50
  "tabulate",
51
- # huggingface-hub==0.31.0 introduces a new transfer protocol that was causing us issues
52
- "huggingface-hub==0.30.2",
51
+ "sentencepiece",
52
+ "huggingface-hub==0.33.0",
53
53
  ],
54
54
  extras_require={
55
- # The -minimal extras are meant to deploy specific backends into end-user
55
+ # The non-dev extras are meant to deploy specific backends into end-user
56
56
  # applications, without including developer-focused tools
57
- "oga-hybrid-minimal": [
57
+ "oga-hybrid": [
58
58
  # Note: `lemonade-install --ryzenai hybrid` is necessary
59
59
  # to complete installation
60
60
  "onnx==1.16.1",
61
61
  "numpy==1.26.4",
62
62
  "protobuf>=6.30.1",
63
63
  ],
64
- "oga-cpu-minimal": [
65
- "onnxruntime-genai==0.6.0",
66
- "onnxruntime >=1.10.1,<1.22.0",
64
+ "oga-cpu": [
65
+ "onnxruntime-genai==0.8.2",
66
+ "onnxruntime >=1.22.0",
67
67
  ],
68
- "llm": [
68
+ # Developer-focused tools for benchmarking, accuracy testing, and
69
+ # model preparation (ONNX export, quantization, device-specific optimization, etc.)
70
+ "dev": [
69
71
  # Minimal dependencies for developers to use all features of
70
72
  # Lemonade SDK, including building and optimizing models
71
73
  "torch>=2.6.0",
72
74
  "accelerate",
73
- "sentencepiece",
74
75
  "datasets",
75
76
  "pandas>=1.5.3",
76
77
  "matplotlib",
@@ -79,36 +80,35 @@ setup(
79
80
  "human-eval-windows==1.0.4",
80
81
  "lm-eval[api]",
81
82
  ],
82
- "llm-oga-cpu": [
83
- "lemonade-sdk[oga-cpu-minimal]",
84
- "lemonade-sdk[llm]",
85
- ],
83
+ # Keep backwards compatibility for old extras names
84
+ "oga-hybrid-minimal": ["lemonade-sdk[oga-hybrid]"],
85
+ "oga-cpu-minimal": ["lemonade-sdk[oga-cpu]"],
86
+ "llm": ["lemonade-sdk[dev]"],
87
+ "llm-oga-cpu": ["lemonade-sdk[dev,oga-cpu]"],
88
+ # The following extras are deprecated and/or not commonly used
86
89
  "llm-oga-igpu": [
87
90
  "onnxruntime-genai-directml==0.6.0",
88
91
  "onnxruntime-directml>=1.19.0,<1.22.0",
89
92
  "transformers<4.45.0",
90
- "lemonade-sdk[llm]",
93
+ "lemonade-sdk[dev]",
91
94
  ],
92
95
  "llm-oga-cuda": [
93
- "onnxruntime-genai-cuda==0.6.0",
94
- "onnxruntime-gpu >=1.19.1,<1.22.0",
95
- "transformers<4.45.0",
96
- "lemonade-sdk[llm]",
96
+ "onnxruntime-genai-cuda==0.8.2",
97
+ "onnxruntime-gpu >=1.22.0",
98
+ "transformers<=4.51.3",
99
+ "lemonade-sdk[dev]",
97
100
  ],
98
101
  "llm-oga-npu": [
99
102
  "onnx==1.16.0",
103
+ # NPU requires specific onnxruntime version for Ryzen AI compatibility
104
+ # This may conflict with other OGA extras that require >=1.22.0
100
105
  "onnxruntime==1.18.0",
101
106
  "numpy==1.26.4",
102
107
  "protobuf>=6.30.1",
103
- "lemonade-sdk[llm]",
104
- ],
105
- "llm-oga-hybrid": [
106
- "lemonade-sdk[oga-hybrid-minimal]",
107
- "lemonade-sdk[llm]",
108
- ],
109
- "llm-oga-unified": [
110
- "lemonade-sdk[llm-oga-hybrid]",
108
+ "lemonade-sdk[dev]",
111
109
  ],
110
+ "llm-oga-hybrid": ["lemonade-sdk[dev,oga-hybrid]"],
111
+ "llm-oga-unified": ["lemonade-sdk[llm-oga-hybrid]"],
112
112
  },
113
113
  classifiers=[],
114
114
  entry_points={
@@ -90,9 +90,9 @@ https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md""",
90
90
  )
91
91
 
92
92
  profiler_instances = [
93
- profiler(global_args[profiler.unique_name])
93
+ profiler(global_args[profiler.unique_name.replace("-", "_")])
94
94
  for profiler in profilers
95
- if global_args.get(profiler.unique_name, None) is not None
95
+ if global_args.get(profiler.unique_name.replace("-", "_"), None) is not None
96
96
  ]
97
97
 
98
98
  if len(evaluation_tools) > 0:
@@ -48,7 +48,10 @@ class Profiler(abc.ABC):
48
48
  This method is called so that the profiler can create its output files.
49
49
  The state is passed so that build info can be gathered and stats can be written.
50
50
  The timestamp can be used for filename in current working directory.
51
- The start times contain a list of tools and start times.
51
+ The start times parameter is a dict with the keys being the tool names and
52
+ the values being the time the tool started. There is an initial "warmup" key
53
+ that has a start time before the first tool and a "cool down" key that contains the
54
+ time when the last tool ended.
52
55
  """
53
56
 
54
57
 
@@ -24,7 +24,7 @@ class AccuracyHumaneval(Tool):
24
24
  - pass@10: Percentage of problems solved within 10 generation attempts
25
25
  - pass@100: Percentage of problems solved within 100 generation attempts
26
26
 
27
- See docs/lemonade/humaneval_accuracy.md for more details
27
+ See docs/dev_cli/humaneval_accuracy.md for more details
28
28
  """
29
29
 
30
30
  unique_name = "accuracy-humaneval"
@@ -27,7 +27,7 @@ def min_handle_none(*args: int):
27
27
 
28
28
  class AccuracyMMLU(Tool):
29
29
  """
30
- See docs/lemonade/mmlu_accuracy.md for more details
30
+ See docs/dev_cli/mmlu_accuracy.md for more details
31
31
  """
32
32
 
33
33
  unique_name = "accuracy-mmlu"
@@ -1,12 +1,6 @@
1
1
  # onnxruntime_genai is not lint-friendly yet and PyLint can't
2
2
  # find any of the class methods
3
3
  # pylint: disable=no-member
4
- #
5
- # Model builder constraints:
6
- # 11/10/24 Need transformers <4.45.0 OR onnxruntime-genai 0.5.0 (which must be built from source)
7
- # (transformers v4.45 changes the format of the tokenizer.json file which will be supported in
8
- # onnxruntime-genai 0.5)
9
- #
10
4
 
11
5
  import argparse
12
6
  import os
@@ -51,8 +45,8 @@ def import_error_heler(e: Exception):
51
45
  """
52
46
  raise ImportError(
53
47
  f"{e}\n Please install lemonade-sdk with "
54
- "one of the llm-oga extras, for example:\n"
55
- "pip install lemonade-sdk[llm-oga-cpu]\n"
48
+ "one of the oga extras, for example:\n"
49
+ "pip install lemonade-sdk[dev,oga-cpu]\n"
56
50
  "See https://lemonade_server.ai/install_options.html for details"
57
51
  )
58
52
 
@@ -64,7 +58,7 @@ class OgaLoad(FirstTool):
64
58
  Input: path to a checkpoint.
65
59
  Supported choices for cpu and igpu from HF model repository:
66
60
  LLM models on Huggingface supported by model_builder. See documentation
67
- (https://github.com/lemonade-sdk/lemonade/blob/main/docs/ort_genai_igpu.md)
61
+ (https://github.com/lemonade-sdk/lemonade/blob/main/docs/dev_cli/ort_genai_igpu.md)
68
62
  for supported models.
69
63
  Supported choices for npu from HF model repository:
70
64
  Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
@@ -17,7 +17,7 @@ class AccuracyPerplexity(Tool):
17
17
 
18
18
  Output state produced: None
19
19
 
20
- See docs/lemonade/perplexity.md for more details.
20
+ See docs/dev_cli/perplexity.md for more details.
21
21
  """
22
22
 
23
23
  unique_name = "accuracy-perplexity"
@@ -63,7 +63,7 @@ class AccuracyPerplexity(Tool):
63
63
  # try-except will allow a few more LLMs to work
64
64
  max_length = 2048
65
65
  # Set stride to half of the maximum input length for overlapping window processing
66
- # Refer to docs/perplexity.md for more information on sliding window
66
+ # Refer to docs/dev_cli/perplexity.md for more information on sliding window
67
67
  stride = max_length // 2
68
68
  # Determine the total sequence length of the tokenized input
69
69
  seq_len = encodings.input_ids.size(1)
@@ -176,12 +176,21 @@ class LLMPrompt(Tool):
176
176
 
177
177
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids
178
178
  if isinstance(input_ids, (list, str)):
179
- # OGA models return a list of tokens
179
+ # OGA models return a list of tokens (older versions)
180
180
  # Our llama.cpp adapter returns a string
181
181
  len_tokens_in = len(input_ids)
182
- else:
182
+ elif hasattr(input_ids, "shape"):
183
183
  # HF models return a 2-D tensor
184
- len_tokens_in = input_ids.shape[1]
184
+ # OGA models with newer versions may return numpy arrays
185
+ if len(input_ids.shape) == 1:
186
+ # 1-D array from newer OGA versions
187
+ len_tokens_in = len(input_ids)
188
+ else:
189
+ # 2-D tensor from HF models
190
+ len_tokens_in = input_ids.shape[1]
191
+ else:
192
+ # Fallback: try to get length directly
193
+ len_tokens_in = len(input_ids)
185
194
 
186
195
  len_tokens_out = []
187
196
  response_texts = []
@@ -202,9 +211,15 @@ class LLMPrompt(Tool):
202
211
  random_seed += 1
203
212
 
204
213
  # Flatten the input and response
205
- input_ids_array = (
206
- input_ids if isinstance(input_ids, (list, str)) else input_ids[0]
207
- )
214
+ if isinstance(input_ids, (list, str)):
215
+ input_ids_array = input_ids
216
+ elif hasattr(input_ids, "shape") and len(input_ids.shape) == 1:
217
+ # 1-D array from newer OGA versions - already flat
218
+ input_ids_array = input_ids
219
+ else:
220
+ # 2-D tensor from HF models - take first row
221
+ input_ids_array = input_ids[0]
222
+
208
223
  response_array = response if isinstance(response, str) else response[0]
209
224
 
210
225
  # Separate the prompt from the response
@@ -18,7 +18,7 @@ class QuarkLoad(Tool):
18
18
  Output:
19
19
  - state of the loaded model
20
20
 
21
- See docs/quark.md for more details.
21
+ See docs/dev_cli/quark.md for more details.
22
22
  """
23
23
 
24
24
  unique_name = "quark-load"
@@ -25,7 +25,7 @@ class QuarkQuantize(Tool):
25
25
  Output:
26
26
  - Modifies `state` with quantized and optionally exported model.
27
27
 
28
- See docs/quark.md for more details.
28
+ See docs/dev_cli/quark.md for more details.
29
29
  """
30
30
 
31
31
  unique_name = "quark-quantize"
@@ -94,7 +94,7 @@ class QuarkQuantize(Tool):
94
94
  help="Number of samples for calibration.",
95
95
  )
96
96
 
97
- # See docs/quark.md for more details.
97
+ # See docs/dev_cli/quark.md for more details.
98
98
  parser.add_argument(
99
99
  "--quant-scheme",
100
100
  type=str,
@@ -74,6 +74,7 @@ class SimpleStat(TableColumn):
74
74
  align="center",
75
75
  omit_if_lean=False,
76
76
  wrap=None,
77
+ stat_fn=None,
77
78
  ):
78
79
  self.column_header = column_header
79
80
  self.stat = stat
@@ -81,6 +82,7 @@ class SimpleStat(TableColumn):
81
82
  self.align = align
82
83
  self.omit_if_lean = omit_if_lean
83
84
  self.wrap = wrap or self.default_wrap
85
+ self.stat_fn = stat_fn
84
86
 
85
87
  def get_str(self, build_stats, lean=False):
86
88
  if lean and self.omit_if_lean:
@@ -88,6 +90,8 @@ class SimpleStat(TableColumn):
88
90
  data = build_stats.get(self.stat, None)
89
91
  if data is None:
90
92
  return ""
93
+ if self.stat_fn:
94
+ data = self.stat_fn(data)
91
95
  cell_str = "\n".join(
92
96
  [_wrap(f"{x:{self.format_str}}", self.wrap) for x in _to_list(data)]
93
97
  )
@@ -233,6 +237,47 @@ class AdditionalStat(TableColumn):
233
237
  return "\n".join(cell_entry)
234
238
 
235
239
 
240
+ class DictListStat(TableColumn):
241
+ """
242
+ A statistic that is a list of dicts; the values for a given list of keys are
243
+ pulled out of each dict and placed in the cell
244
+ """
245
+
246
+ def __init__(
247
+ self,
248
+ column_header,
249
+ statistic_name,
250
+ key_format_list,
251
+ align="center",
252
+ omit_if_lean=False,
253
+ wrap=None,
254
+ ):
255
+ self.column_header = column_header
256
+ self.statistic_name = statistic_name
257
+ self.key_format_list = key_format_list
258
+ self.align = align
259
+ self.omit_if_lean = omit_if_lean
260
+ self.wrap = wrap or self.default_wrap
261
+
262
+ def get_str(self, build_stats, lean=False):
263
+ if lean and self.omit_if_lean:
264
+ return None
265
+ stat = build_stats.get(self.statistic_name, None)
266
+ if not stat:
267
+ return ""
268
+ cell_entry = []
269
+ for stat_dict in stat:
270
+ line = [
271
+ format_str.format(stat_dict[key])
272
+ for key, format_str in self.key_format_list
273
+ ]
274
+ cell_entry.append(" ".join(line))
275
+ return "\n".join(cell_entry)
276
+
277
+ def get_keys(self):
278
+ return [self.statistic_name]
279
+
280
+
236
281
  ################################################################################
237
282
  # ABSTRACT BASE CLASS FOR DEFINING A TABLE
238
283
  ################################################################################
@@ -350,6 +395,28 @@ class Table(ABC):
350
395
  headers.append(column.column_header)
351
396
  col_align += (column.align,)
352
397
 
398
+ # Stat column headers
399
+ stat_columns = self.table_descriptor.get("stat_columns", [])
400
+ stat_columns_include = []
401
+ for column in stat_columns:
402
+ # Check to see that at least one build has data for the column
403
+ keep_column = False
404
+ if not (self.lean and column.omit_if_lean):
405
+ keys = column.get_keys()
406
+ for build_stats in self.all_stats:
407
+ found = [(key in build_stats) for key in keys]
408
+ if any(found):
409
+ keep_column = True
410
+ headers.append(column.column_header)
411
+ col_align += (column.align,)
412
+ break
413
+ stat_columns_include.append(keep_column)
414
+ stat_columns = [
415
+ column
416
+ for column, include in zip(stat_columns, stat_columns_include)
417
+ if include
418
+ ]
419
+
353
420
  # Final headers
354
421
  last_columns = self.table_descriptor.get("last_columns", [])
355
422
  for column in last_columns:
@@ -386,6 +453,12 @@ class Table(ABC):
386
453
  if entry_str is not None:
387
454
  row.append(entry_str)
388
455
 
456
+ # Per stat columns
457
+ for entry in stat_columns:
458
+ entry_str = entry.get_str(build_stats, self.lean)
459
+ if entry_str is not None:
460
+ row.append(entry_str)
461
+
389
462
  # Final columns
390
463
  for entry in last_columns:
391
464
  entry_str = entry.get_str(build_stats, self.lean)
@@ -514,6 +587,12 @@ class LemonadePerfTable(Table):
514
587
  Keys.STD_DEV_TOKENS_PER_SECOND,
515
588
  ".2f",
516
589
  ),
590
+ SimpleStat(
591
+ _wrap("Total Generated Tokens", 9),
592
+ Keys.RESPONSE_TOKENS,
593
+ "d",
594
+ stat_fn=sum,
595
+ ),
517
596
  SimpleStat(
518
597
  _wrap("Memory Used (GB)", 8), Keys.MAX_MEMORY_USED_GBYTE, ".3f"
519
598
  ),
@@ -537,6 +616,7 @@ class LemonadePerfTable(Table):
537
616
  )
538
617
  ],
539
618
  },
619
+ "stat_columns": [],
540
620
  "last_columns": [
541
621
  SimpleStat(
542
622
  "System Info",