lemonade-sdk 8.0.2__tar.gz → 8.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lemonade-sdk might be problematic. Click here for more details.
- {lemonade_sdk-8.0.2/src/lemonade_sdk.egg-info → lemonade_sdk-8.0.4}/PKG-INFO +33 -36
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/README.md +4 -11
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/setup.py +26 -26
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/cli.py +2 -2
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/profilers/profiler.py +4 -1
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/humaneval.py +1 -1
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/mmlu.py +1 -1
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/load.py +3 -9
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/perplexity.py +2 -2
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/prompt.py +21 -6
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/quark_load.py +1 -1
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/quark_quantize.py +2 -2
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/table.py +80 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/llamacpp.py +148 -16
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/serve.py +73 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/styles.css +424 -4
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/webapp.html +337 -38
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/tray.py +25 -9
- lemonade_sdk-8.0.4/src/lemonade/version.py +1 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4/src/lemonade_sdk.egg-info}/PKG-INFO +33 -36
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/requires.txt +23 -16
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/model_manager.py +123 -36
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/pydantic_models.py +25 -1
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/server_models.json +53 -43
- lemonade_sdk-8.0.2/src/lemonade/version.py +0 -1
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/LICENSE +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/NOTICE.md +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/pyproject.toml +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/setup.cfg +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/api.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/cache.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/build.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/cli_helpers.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/exceptions.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/filesystem.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/network.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/printing.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/status.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/system_info.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/common/test_helpers.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/profilers/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/profilers/memory_tracker.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/sequence.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/state.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/accuracy.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/adapter.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/bench.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/bench.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/load.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/huggingface/utils.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/llamacpp/bench.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/llamacpp/load.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/management_tools.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/bench.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/oga/utils.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/quark/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/report/llm_report.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/static/favicon.ico +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/tool_calls.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/port.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/system_tray.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/utils/thread.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/server/webapp.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade/tools/tool.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_install/__init__.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_install/install.py +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/SOURCES.txt +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/dependency_links.txt +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/entry_points.txt +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_sdk.egg-info/top_level.txt +0 -0
- {lemonade_sdk-8.0.2 → lemonade_sdk-8.0.4}/src/lemonade_server/cli.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lemonade-sdk
|
|
3
|
-
Version: 8.0.2
|
|
3
|
+
Version: 8.0.4
|
|
4
4
|
Summary: Lemonade SDK: Your LLM Aide for Validation and Deployment
|
|
5
5
|
Author-email: lemonade@amd.com
|
|
6
6
|
Requires-Python: >=3.10, <3.12
|
|
@@ -26,45 +26,49 @@ Requires-Dist: openai>=1.81.0
|
|
|
26
26
|
Requires-Dist: transformers<=4.51.3
|
|
27
27
|
Requires-Dist: jinja2
|
|
28
28
|
Requires-Dist: tabulate
|
|
29
|
-
Requires-Dist:
|
|
29
|
+
Requires-Dist: sentencepiece
|
|
30
|
+
Requires-Dist: huggingface-hub==0.33.0
|
|
31
|
+
Provides-Extra: oga-hybrid
|
|
32
|
+
Requires-Dist: onnx==1.16.1; extra == "oga-hybrid"
|
|
33
|
+
Requires-Dist: numpy==1.26.4; extra == "oga-hybrid"
|
|
34
|
+
Requires-Dist: protobuf>=6.30.1; extra == "oga-hybrid"
|
|
35
|
+
Provides-Extra: oga-cpu
|
|
36
|
+
Requires-Dist: onnxruntime-genai==0.8.2; extra == "oga-cpu"
|
|
37
|
+
Requires-Dist: onnxruntime>=1.22.0; extra == "oga-cpu"
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: torch>=2.6.0; extra == "dev"
|
|
40
|
+
Requires-Dist: accelerate; extra == "dev"
|
|
41
|
+
Requires-Dist: datasets; extra == "dev"
|
|
42
|
+
Requires-Dist: pandas>=1.5.3; extra == "dev"
|
|
43
|
+
Requires-Dist: matplotlib; extra == "dev"
|
|
44
|
+
Requires-Dist: human-eval-windows==1.0.4; extra == "dev"
|
|
45
|
+
Requires-Dist: lm-eval[api]; extra == "dev"
|
|
30
46
|
Provides-Extra: oga-hybrid-minimal
|
|
31
|
-
Requires-Dist: onnx==1.16.1; extra == "oga-hybrid-minimal"
|
|
32
|
-
Requires-Dist: numpy==1.26.4; extra == "oga-hybrid-minimal"
|
|
33
|
-
Requires-Dist: protobuf>=6.30.1; extra == "oga-hybrid-minimal"
|
|
47
|
+
Requires-Dist: lemonade-sdk[oga-hybrid]; extra == "oga-hybrid-minimal"
|
|
34
48
|
Provides-Extra: oga-cpu-minimal
|
|
35
|
-
Requires-Dist:
|
|
36
|
-
Requires-Dist: onnxruntime<1.22.0,>=1.10.1; extra == "oga-cpu-minimal"
|
|
49
|
+
Requires-Dist: lemonade-sdk[oga-cpu]; extra == "oga-cpu-minimal"
|
|
37
50
|
Provides-Extra: llm
|
|
38
|
-
Requires-Dist: torch>=2.6.0; extra == "llm"
|
|
39
|
-
Requires-Dist: accelerate; extra == "llm"
|
|
40
|
-
Requires-Dist: sentencepiece; extra == "llm"
|
|
41
|
-
Requires-Dist: datasets; extra == "llm"
|
|
42
|
-
Requires-Dist: pandas>=1.5.3; extra == "llm"
|
|
43
|
-
Requires-Dist: matplotlib; extra == "llm"
|
|
44
|
-
Requires-Dist: human-eval-windows==1.0.4; extra == "llm"
|
|
45
|
-
Requires-Dist: lm-eval[api]; extra == "llm"
|
|
51
|
+
Requires-Dist: lemonade-sdk[dev]; extra == "llm"
|
|
46
52
|
Provides-Extra: llm-oga-cpu
|
|
47
|
-
Requires-Dist: lemonade-sdk[oga-cpu-minimal]; extra == "llm-oga-cpu"
|
|
48
|
-
Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cpu"
|
|
53
|
+
Requires-Dist: lemonade-sdk[dev,oga-cpu]; extra == "llm-oga-cpu"
|
|
49
54
|
Provides-Extra: llm-oga-igpu
|
|
50
55
|
Requires-Dist: onnxruntime-genai-directml==0.6.0; extra == "llm-oga-igpu"
|
|
51
56
|
Requires-Dist: onnxruntime-directml<1.22.0,>=1.19.0; extra == "llm-oga-igpu"
|
|
52
57
|
Requires-Dist: transformers<4.45.0; extra == "llm-oga-igpu"
|
|
53
|
-
Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-igpu"
|
|
58
|
+
Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-igpu"
|
|
54
59
|
Provides-Extra: llm-oga-cuda
|
|
55
|
-
Requires-Dist: onnxruntime-genai-cuda==0.
|
|
56
|
-
Requires-Dist: onnxruntime-gpu
|
|
57
|
-
Requires-Dist: transformers
|
|
58
|
-
Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-cuda"
|
|
60
|
+
Requires-Dist: onnxruntime-genai-cuda==0.8.2; extra == "llm-oga-cuda"
|
|
61
|
+
Requires-Dist: onnxruntime-gpu>=1.22.0; extra == "llm-oga-cuda"
|
|
62
|
+
Requires-Dist: transformers<=4.51.3; extra == "llm-oga-cuda"
|
|
63
|
+
Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-cuda"
|
|
59
64
|
Provides-Extra: llm-oga-npu
|
|
60
65
|
Requires-Dist: onnx==1.16.0; extra == "llm-oga-npu"
|
|
61
66
|
Requires-Dist: onnxruntime==1.18.0; extra == "llm-oga-npu"
|
|
62
67
|
Requires-Dist: numpy==1.26.4; extra == "llm-oga-npu"
|
|
63
68
|
Requires-Dist: protobuf>=6.30.1; extra == "llm-oga-npu"
|
|
64
|
-
Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-npu"
|
|
69
|
+
Requires-Dist: lemonade-sdk[dev]; extra == "llm-oga-npu"
|
|
65
70
|
Provides-Extra: llm-oga-hybrid
|
|
66
|
-
Requires-Dist: lemonade-sdk[oga-hybrid-minimal]; extra == "llm-oga-hybrid"
|
|
67
|
-
Requires-Dist: lemonade-sdk[llm]; extra == "llm-oga-hybrid"
|
|
71
|
+
Requires-Dist: lemonade-sdk[dev,oga-hybrid]; extra == "llm-oga-hybrid"
|
|
68
72
|
Provides-Extra: llm-oga-unified
|
|
69
73
|
Requires-Dist: lemonade-sdk[llm-oga-hybrid]; extra == "llm-oga-unified"
|
|
70
74
|
Dynamic: author-email
|
|
@@ -78,7 +82,7 @@ Dynamic: summary
|
|
|
78
82
|
|
|
79
83
|
[](https://github.com/lemonade-sdk/lemonade/tree/main/test "Check out our tests")
|
|
80
84
|
[](docs/README.md#installation "Check out our instructions")
|
|
81
|
-
[](docs/README.md#installation "Check out our instructions")
|
|
82
86
|
|
|
83
87
|
## 🍋 Lemonade SDK: Quickly serve, benchmark and deploy LLMs
|
|
84
88
|
|
|
@@ -93,8 +97,8 @@ The [Lemonade SDK](./docs/README.md) makes it easy to run Large Language Models
|
|
|
93
97
|
The [Lemonade SDK](./docs/README.md) is comprised of the following:
|
|
94
98
|
|
|
95
99
|
- 🌐 **[Lemonade Server](https://lemonade-server.ai/docs)**: A local LLM server for running ONNX and GGUF models using the OpenAI API standard. Install and enable your applications with NPU and GPU acceleration in minutes.
|
|
96
|
-
- 🐍 **Lemonade API**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
|
|
97
|
-
- 🖥️ **Lemonade CLI**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
|
|
100
|
+
- 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
|
|
101
|
+
- 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
|
|
98
102
|
- Prompting with templates.
|
|
99
103
|
- Measuring accuracy with a variety of tests.
|
|
100
104
|
- Benchmarking to get the time-to-first-token and tokens per second.
|
|
@@ -149,14 +153,7 @@ Maximum LLM performance requires the right hardware accelerator with the right i
|
|
|
149
153
|
</tbody>
|
|
150
154
|
</table>
|
|
151
155
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
#### Inference Engines Overview
|
|
155
|
-
| Engine | Description |
|
|
156
|
-
| :--- | :--- |
|
|
157
|
-
| **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). |
|
|
158
|
-
| **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). |
|
|
159
|
-
| **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. |
|
|
156
|
+
To learn more about the supported hardware and software, visit the documentation [here](./docs/README.md#software-and-hardware-overview).
|
|
160
157
|
|
|
161
158
|
## Integrate Lemonade Server with Your Application
|
|
162
159
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[](https://github.com/lemonade-sdk/lemonade/tree/main/test "Check out our tests")
|
|
2
2
|
[](docs/README.md#installation "Check out our instructions")
|
|
3
|
-
[](docs/README.md#installation "Check out our instructions")
|
|
4
4
|
|
|
5
5
|
## 🍋 Lemonade SDK: Quickly serve, benchmark and deploy LLMs
|
|
6
6
|
|
|
@@ -15,8 +15,8 @@ The [Lemonade SDK](./docs/README.md) makes it easy to run Large Language Models
|
|
|
15
15
|
The [Lemonade SDK](./docs/README.md) is comprised of the following:
|
|
16
16
|
|
|
17
17
|
- 🌐 **[Lemonade Server](https://lemonade-server.ai/docs)**: A local LLM server for running ONNX and GGUF models using the OpenAI API standard. Install and enable your applications with NPU and GPU acceleration in minutes.
|
|
18
|
-
- 🐍 **Lemonade API**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
|
|
19
|
-
- 🖥️ **Lemonade CLI**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
|
|
18
|
+
- 🐍 **[Lemonade API](./docs/lemonade_api.md)**: High-level Python API to directly integrate Lemonade LLMs into Python applications.
|
|
19
|
+
- 🖥️ **[Lemonade CLI](./docs/dev_cli/README.md)**: The `lemonade` CLI lets you mix-and-match LLMs (ONNX, GGUF, SafeTensors) with measurement tools to characterize your models on your hardware. The available tools are:
|
|
20
20
|
- Prompting with templates.
|
|
21
21
|
- Measuring accuracy with a variety of tests.
|
|
22
22
|
- Benchmarking to get the time-to-first-token and tokens per second.
|
|
@@ -71,14 +71,7 @@ Maximum LLM performance requires the right hardware accelerator with the right i
|
|
|
71
71
|
</tbody>
|
|
72
72
|
</table>
|
|
73
73
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
#### Inference Engines Overview
|
|
77
|
-
| Engine | Description |
|
|
78
|
-
| :--- | :--- |
|
|
79
|
-
| **OnnxRuntime GenAI (OGA)** | Microsoft engine that runs `.onnx` models and enables hardware vendors to provide their own execution providers (EPs) to support specialized hardware, such as neural processing units (NPUs). |
|
|
80
|
-
| **llamacpp** | Community-driven engine with strong GPU acceleration, support for thousands of `.gguf` models, and advanced features such as vision-language models (VLMs) and mixture-of-experts (MoEs). |
|
|
81
|
-
| **Hugging Face (HF)** | Hugging Face's `transformers` library can run the original `.safetensors` trained weights for models on Meta's PyTorch engine, which provides a source of truth for accuracy measurement. |
|
|
74
|
+
To learn more about the supported hardware and software, visit the documentation [here](./docs/README.md#software-and-hardware-overview).
|
|
82
75
|
|
|
83
76
|
## Integrate Lemonade Server with Your Application
|
|
84
77
|
|
|
@@ -48,29 +48,30 @@ setup(
|
|
|
48
48
|
"transformers<=4.51.3",
|
|
49
49
|
"jinja2",
|
|
50
50
|
"tabulate",
|
|
51
|
-
|
|
52
|
-
"huggingface-hub==0.
|
|
51
|
+
"sentencepiece",
|
|
52
|
+
"huggingface-hub==0.33.0",
|
|
53
53
|
],
|
|
54
54
|
extras_require={
|
|
55
|
-
# The -
|
|
55
|
+
# The non-dev extras are meant to deploy specific backends into end-user
|
|
56
56
|
# applications, without including developer-focused tools
|
|
57
|
-
"oga-hybrid
|
|
57
|
+
"oga-hybrid": [
|
|
58
58
|
# Note: `lemonade-install --ryzenai hybrid` is necessary
|
|
59
59
|
# to complete installation
|
|
60
60
|
"onnx==1.16.1",
|
|
61
61
|
"numpy==1.26.4",
|
|
62
62
|
"protobuf>=6.30.1",
|
|
63
63
|
],
|
|
64
|
-
"oga-cpu
|
|
65
|
-
"onnxruntime-genai==0.
|
|
66
|
-
"onnxruntime >=1.
|
|
64
|
+
"oga-cpu": [
|
|
65
|
+
"onnxruntime-genai==0.8.2",
|
|
66
|
+
"onnxruntime >=1.22.0",
|
|
67
67
|
],
|
|
68
|
-
|
|
68
|
+
# Developer-focused tools for benchmarking, accuracy testing, and
|
|
69
|
+
# model preparation (ONNX export, quantization, device-specifc optimization, etc.)
|
|
70
|
+
"dev": [
|
|
69
71
|
# Minimal dependencies for developers to use all features of
|
|
70
72
|
# Lemonade SDK, including building and optimizing models
|
|
71
73
|
"torch>=2.6.0",
|
|
72
74
|
"accelerate",
|
|
73
|
-
"sentencepiece",
|
|
74
75
|
"datasets",
|
|
75
76
|
"pandas>=1.5.3",
|
|
76
77
|
"matplotlib",
|
|
@@ -79,36 +80,35 @@ setup(
|
|
|
79
80
|
"human-eval-windows==1.0.4",
|
|
80
81
|
"lm-eval[api]",
|
|
81
82
|
],
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
],
|
|
83
|
+
# Keep backwards compatibility for old extras names
|
|
84
|
+
"oga-hybrid-minimal": ["lemonade-sdk[oga-hybrid]"],
|
|
85
|
+
"oga-cpu-minimal": ["lemonade-sdk[oga-cpu]"],
|
|
86
|
+
"llm": ["lemonade-sdk[dev]"],
|
|
87
|
+
"llm-oga-cpu": ["lemonade-sdk[dev,oga-cpu]"],
|
|
88
|
+
# The following extras are deprecated and/or not commonly used
|
|
86
89
|
"llm-oga-igpu": [
|
|
87
90
|
"onnxruntime-genai-directml==0.6.0",
|
|
88
91
|
"onnxruntime-directml>=1.19.0,<1.22.0",
|
|
89
92
|
"transformers<4.45.0",
|
|
90
|
-
"lemonade-sdk[
|
|
93
|
+
"lemonade-sdk[dev]",
|
|
91
94
|
],
|
|
92
95
|
"llm-oga-cuda": [
|
|
93
|
-
"onnxruntime-genai-cuda==0.
|
|
94
|
-
"onnxruntime-gpu >=1.
|
|
95
|
-
"transformers
|
|
96
|
-
"lemonade-sdk[
|
|
96
|
+
"onnxruntime-genai-cuda==0.8.2",
|
|
97
|
+
"onnxruntime-gpu >=1.22.0",
|
|
98
|
+
"transformers<=4.51.3",
|
|
99
|
+
"lemonade-sdk[dev]",
|
|
97
100
|
],
|
|
98
101
|
"llm-oga-npu": [
|
|
99
102
|
"onnx==1.16.0",
|
|
103
|
+
# NPU requires specific onnxruntime version for Ryzen AI compatibility
|
|
104
|
+
# This may conflict with other OGA extras that require >=1.22.0
|
|
100
105
|
"onnxruntime==1.18.0",
|
|
101
106
|
"numpy==1.26.4",
|
|
102
107
|
"protobuf>=6.30.1",
|
|
103
|
-
"lemonade-sdk[
|
|
104
|
-
],
|
|
105
|
-
"llm-oga-hybrid": [
|
|
106
|
-
"lemonade-sdk[oga-hybrid-minimal]",
|
|
107
|
-
"lemonade-sdk[llm]",
|
|
108
|
-
],
|
|
109
|
-
"llm-oga-unified": [
|
|
110
|
-
"lemonade-sdk[llm-oga-hybrid]",
|
|
108
|
+
"lemonade-sdk[dev]",
|
|
111
109
|
],
|
|
110
|
+
"llm-oga-hybrid": ["lemonade-sdk[dev,oga-hybrid]"],
|
|
111
|
+
"llm-oga-unified": ["lemonade-sdk[llm-oga-hybrid]"],
|
|
112
112
|
},
|
|
113
113
|
classifiers=[],
|
|
114
114
|
entry_points={
|
|
@@ -90,9 +90,9 @@ https://github.com/lemonade-sdk/lemonade/blob/main/docs/README.md""",
|
|
|
90
90
|
)
|
|
91
91
|
|
|
92
92
|
profiler_instances = [
|
|
93
|
-
profiler(global_args[profiler.unique_name])
|
|
93
|
+
profiler(global_args[profiler.unique_name.replace("-", "_")])
|
|
94
94
|
for profiler in profilers
|
|
95
|
-
if global_args.get(profiler.unique_name, None) is not None
|
|
95
|
+
if global_args.get(profiler.unique_name.replace("-", "_"), None) is not None
|
|
96
96
|
]
|
|
97
97
|
|
|
98
98
|
if len(evaluation_tools) > 0:
|
|
@@ -48,7 +48,10 @@ class Profiler(abc.ABC):
|
|
|
48
48
|
This method is called so that the profiler can create its output files.
|
|
49
49
|
The state is passed so that build info can be gathered and stats can be written.
|
|
50
50
|
The timestamp can be used for filename in current working directory.
|
|
51
|
-
The start times
|
|
51
|
+
The start times parameter is a dict with the keys being the tools names and
|
|
52
|
+
the values being the time the tool started. There is an initial "warmup" key
|
|
53
|
+
that has a start time before the first tool and a "cool down" key that contains the
|
|
54
|
+
time when the last tool ended.
|
|
52
55
|
"""
|
|
53
56
|
|
|
54
57
|
|
|
@@ -24,7 +24,7 @@ class AccuracyHumaneval(Tool):
|
|
|
24
24
|
- pass@10: Percentage of problems solved within 10 generation attempts
|
|
25
25
|
- pass@100: Percentage of problems solved within 100 generation attempts
|
|
26
26
|
|
|
27
|
-
See docs/humaneval_accuracy.md for more details
|
|
27
|
+
See docs/dev_cli/humaneval_accuracy.md for more details
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
30
|
unique_name = "accuracy-humaneval"
|
|
@@ -1,12 +1,6 @@
|
|
|
1
1
|
# onnxruntime_genai is not lint-friendly yet and PyLint can't
|
|
2
2
|
# find any of the class methods
|
|
3
3
|
# pylint: disable=no-member
|
|
4
|
-
#
|
|
5
|
-
# Model builder constraints:
|
|
6
|
-
# 11/10/24 Need transformers <4.45.0 OR onnxruntime-genai 0.5.0 (which must be built from source)
|
|
7
|
-
# (transformers v4.45 changes the format of the tokenizer.json file which will be supported in
|
|
8
|
-
# onnxruntime-genai 0.5)
|
|
9
|
-
#
|
|
10
4
|
|
|
11
5
|
import argparse
|
|
12
6
|
import os
|
|
@@ -51,8 +45,8 @@ def import_error_heler(e: Exception):
|
|
|
51
45
|
"""
|
|
52
46
|
raise ImportError(
|
|
53
47
|
f"{e}\n Please install lemonade-sdk with "
|
|
54
|
-
"one of the
|
|
55
|
-
"pip install lemonade-sdk[
|
|
48
|
+
"one of the oga extras, for example:\n"
|
|
49
|
+
"pip install lemonade-sdk[dev,oga-cpu]\n"
|
|
56
50
|
"See https://lemonade_server.ai/install_options.html for details"
|
|
57
51
|
)
|
|
58
52
|
|
|
@@ -64,7 +58,7 @@ class OgaLoad(FirstTool):
|
|
|
64
58
|
Input: path to a checkpoint.
|
|
65
59
|
Supported choices for cpu and igpu from HF model repository:
|
|
66
60
|
LLM models on Huggingface supported by model_builder. See documentation
|
|
67
|
-
(https://github.com/lemonade-sdk/lemonade/blob/main/docs/ort_genai_igpu.md)
|
|
61
|
+
(https://github.com/lemonade-sdk/lemonade/blob/main/docs/dev_cli/ort_genai_igpu.md)
|
|
68
62
|
for supported models.
|
|
69
63
|
Supported choices for npu from HF model repository:
|
|
70
64
|
Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
|
|
@@ -17,7 +17,7 @@ class AccuracyPerplexity(Tool):
|
|
|
17
17
|
|
|
18
18
|
Output state produced: None
|
|
19
19
|
|
|
20
|
-
See docs/perplexity.md for more details.
|
|
20
|
+
See docs/dev_cli/perplexity.md for more details.
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
23
|
unique_name = "accuracy-perplexity"
|
|
@@ -63,7 +63,7 @@ class AccuracyPerplexity(Tool):
|
|
|
63
63
|
# try-except will allow a few more LLMs to work
|
|
64
64
|
max_length = 2048
|
|
65
65
|
# Set stride to half of the maximum input length for overlapping window processing
|
|
66
|
-
# Refer to docs/perplexity.md for more information on sliding window
|
|
66
|
+
# Refer to docs/dev_cli/perplexity.md for more information on sliding window
|
|
67
67
|
stride = max_length // 2
|
|
68
68
|
# Determine the total sequence length of the tokenized input
|
|
69
69
|
seq_len = encodings.input_ids.size(1)
|
|
@@ -176,12 +176,21 @@ class LLMPrompt(Tool):
|
|
|
176
176
|
|
|
177
177
|
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
|
|
178
178
|
if isinstance(input_ids, (list, str)):
|
|
179
|
-
# OGA models return a list of tokens
|
|
179
|
+
# OGA models return a list of tokens (older versions)
|
|
180
180
|
# Our llama.cpp adapter returns a string
|
|
181
181
|
len_tokens_in = len(input_ids)
|
|
182
|
-
|
|
182
|
+
elif hasattr(input_ids, "shape"):
|
|
183
183
|
# HF models return a 2-D tensor
|
|
184
|
-
|
|
184
|
+
# OGA models with newer versions may return numpy arrays
|
|
185
|
+
if len(input_ids.shape) == 1:
|
|
186
|
+
# 1-D array from newer OGA versions
|
|
187
|
+
len_tokens_in = len(input_ids)
|
|
188
|
+
else:
|
|
189
|
+
# 2-D tensor from HF models
|
|
190
|
+
len_tokens_in = input_ids.shape[1]
|
|
191
|
+
else:
|
|
192
|
+
# Fallback: try to get length directly
|
|
193
|
+
len_tokens_in = len(input_ids)
|
|
185
194
|
|
|
186
195
|
len_tokens_out = []
|
|
187
196
|
response_texts = []
|
|
@@ -202,9 +211,15 @@ class LLMPrompt(Tool):
|
|
|
202
211
|
random_seed += 1
|
|
203
212
|
|
|
204
213
|
# Flatten the input and response
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
)
|
|
214
|
+
if isinstance(input_ids, (list, str)):
|
|
215
|
+
input_ids_array = input_ids
|
|
216
|
+
elif hasattr(input_ids, "shape") and len(input_ids.shape) == 1:
|
|
217
|
+
# 1-D array from newer OGA versions - already flat
|
|
218
|
+
input_ids_array = input_ids
|
|
219
|
+
else:
|
|
220
|
+
# 2-D tensor from HF models - take first row
|
|
221
|
+
input_ids_array = input_ids[0]
|
|
222
|
+
|
|
208
223
|
response_array = response if isinstance(response, str) else response[0]
|
|
209
224
|
|
|
210
225
|
# Separate the prompt from the response
|
|
@@ -25,7 +25,7 @@ class QuarkQuantize(Tool):
|
|
|
25
25
|
Output:
|
|
26
26
|
- Modifies `state` with quantized and optionally exported model.
|
|
27
27
|
|
|
28
|
-
See docs/quark.md for more details.
|
|
28
|
+
See docs/dev_cli/quark.md for more details.
|
|
29
29
|
"""
|
|
30
30
|
|
|
31
31
|
unique_name = "quark-quantize"
|
|
@@ -94,7 +94,7 @@ class QuarkQuantize(Tool):
|
|
|
94
94
|
help="Number of samples for calibration.",
|
|
95
95
|
)
|
|
96
96
|
|
|
97
|
-
# See docs/quark.md for more details.
|
|
97
|
+
# See docs/dev_cli/quark.md for more details.
|
|
98
98
|
parser.add_argument(
|
|
99
99
|
"--quant-scheme",
|
|
100
100
|
type=str,
|
|
@@ -74,6 +74,7 @@ class SimpleStat(TableColumn):
|
|
|
74
74
|
align="center",
|
|
75
75
|
omit_if_lean=False,
|
|
76
76
|
wrap=None,
|
|
77
|
+
stat_fn=None,
|
|
77
78
|
):
|
|
78
79
|
self.column_header = column_header
|
|
79
80
|
self.stat = stat
|
|
@@ -81,6 +82,7 @@ class SimpleStat(TableColumn):
|
|
|
81
82
|
self.align = align
|
|
82
83
|
self.omit_if_lean = omit_if_lean
|
|
83
84
|
self.wrap = wrap or self.default_wrap
|
|
85
|
+
self.stat_fn = stat_fn
|
|
84
86
|
|
|
85
87
|
def get_str(self, build_stats, lean=False):
|
|
86
88
|
if lean and self.omit_if_lean:
|
|
@@ -88,6 +90,8 @@ class SimpleStat(TableColumn):
|
|
|
88
90
|
data = build_stats.get(self.stat, None)
|
|
89
91
|
if data is None:
|
|
90
92
|
return ""
|
|
93
|
+
if self.stat_fn:
|
|
94
|
+
data = self.stat_fn(data)
|
|
91
95
|
cell_str = "\n".join(
|
|
92
96
|
[_wrap(f"{x:{self.format_str}}", self.wrap) for x in _to_list(data)]
|
|
93
97
|
)
|
|
@@ -233,6 +237,47 @@ class AdditionalStat(TableColumn):
|
|
|
233
237
|
return "\n".join(cell_entry)
|
|
234
238
|
|
|
235
239
|
|
|
240
|
+
class DictListStat(TableColumn):
|
|
241
|
+
"""
|
|
242
|
+
A statistic that is a list of dicts and values from a given list of keys will be
|
|
243
|
+
pulled out of each dict and placed in the cell
|
|
244
|
+
"""
|
|
245
|
+
|
|
246
|
+
def __init__(
|
|
247
|
+
self,
|
|
248
|
+
column_header,
|
|
249
|
+
statistic_name,
|
|
250
|
+
key_format_list,
|
|
251
|
+
align="center",
|
|
252
|
+
omit_if_lean=False,
|
|
253
|
+
wrap=None,
|
|
254
|
+
):
|
|
255
|
+
self.column_header = column_header
|
|
256
|
+
self.statistic_name = statistic_name
|
|
257
|
+
self.key_format_list = key_format_list
|
|
258
|
+
self.align = align
|
|
259
|
+
self.omit_if_lean = omit_if_lean
|
|
260
|
+
self.wrap = wrap or self.default_wrap
|
|
261
|
+
|
|
262
|
+
def get_str(self, build_stats, lean=False):
|
|
263
|
+
if lean and self.omit_if_lean:
|
|
264
|
+
return None
|
|
265
|
+
stat = build_stats.get(self.statistic_name, None)
|
|
266
|
+
if not stat:
|
|
267
|
+
return ""
|
|
268
|
+
cell_entry = []
|
|
269
|
+
for stat_dict in stat:
|
|
270
|
+
line = [
|
|
271
|
+
format_str.format(stat_dict[key])
|
|
272
|
+
for key, format_str in self.key_format_list
|
|
273
|
+
]
|
|
274
|
+
cell_entry.append(" ".join(line))
|
|
275
|
+
return "\n".join(cell_entry)
|
|
276
|
+
|
|
277
|
+
def get_keys(self):
|
|
278
|
+
return [self.statistic_name]
|
|
279
|
+
|
|
280
|
+
|
|
236
281
|
################################################################################
|
|
237
282
|
# ABSTRACT BASE CLASS FOR DEFINING A TABLE
|
|
238
283
|
################################################################################
|
|
@@ -350,6 +395,28 @@ class Table(ABC):
|
|
|
350
395
|
headers.append(column.column_header)
|
|
351
396
|
col_align += (column.align,)
|
|
352
397
|
|
|
398
|
+
# Stat column headers
|
|
399
|
+
stat_columns = self.table_descriptor.get("stat_columns", [])
|
|
400
|
+
stat_columns_include = []
|
|
401
|
+
for column in stat_columns:
|
|
402
|
+
# Check to see that at least one build has data for the column
|
|
403
|
+
keep_column = False
|
|
404
|
+
if not (self.lean and column.omit_if_lean):
|
|
405
|
+
keys = column.get_keys()
|
|
406
|
+
for build_stats in self.all_stats:
|
|
407
|
+
found = [(key in build_stats) for key in keys]
|
|
408
|
+
if any(found):
|
|
409
|
+
keep_column = True
|
|
410
|
+
headers.append(column.column_header)
|
|
411
|
+
col_align += (column.align,)
|
|
412
|
+
break
|
|
413
|
+
stat_columns_include.append(keep_column)
|
|
414
|
+
stat_columns = [
|
|
415
|
+
column
|
|
416
|
+
for column, include in zip(stat_columns, stat_columns_include)
|
|
417
|
+
if include
|
|
418
|
+
]
|
|
419
|
+
|
|
353
420
|
# Final headers
|
|
354
421
|
last_columns = self.table_descriptor.get("last_columns", [])
|
|
355
422
|
for column in last_columns:
|
|
@@ -386,6 +453,12 @@ class Table(ABC):
|
|
|
386
453
|
if entry_str is not None:
|
|
387
454
|
row.append(entry_str)
|
|
388
455
|
|
|
456
|
+
# Per stat columns
|
|
457
|
+
for entry in stat_columns:
|
|
458
|
+
entry_str = entry.get_str(build_stats, self.lean)
|
|
459
|
+
if entry_str is not None:
|
|
460
|
+
row.append(entry_str)
|
|
461
|
+
|
|
389
462
|
# Final columns
|
|
390
463
|
for entry in last_columns:
|
|
391
464
|
entry_str = entry.get_str(build_stats, self.lean)
|
|
@@ -514,6 +587,12 @@ class LemonadePerfTable(Table):
|
|
|
514
587
|
Keys.STD_DEV_TOKENS_PER_SECOND,
|
|
515
588
|
".2f",
|
|
516
589
|
),
|
|
590
|
+
SimpleStat(
|
|
591
|
+
_wrap("Total Generated Tokens", 9),
|
|
592
|
+
Keys.RESPONSE_TOKENS,
|
|
593
|
+
"d",
|
|
594
|
+
stat_fn=sum,
|
|
595
|
+
),
|
|
517
596
|
SimpleStat(
|
|
518
597
|
_wrap("Memory Used (GB)", 8), Keys.MAX_MEMORY_USED_GBYTE, ".3f"
|
|
519
598
|
),
|
|
@@ -537,6 +616,7 @@ class LemonadePerfTable(Table):
|
|
|
537
616
|
)
|
|
538
617
|
],
|
|
539
618
|
},
|
|
619
|
+
"stat_columns": [],
|
|
540
620
|
"last_columns": [
|
|
541
621
|
SimpleStat(
|
|
542
622
|
"System Info",
|