lemonade-sdk 7.0.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lemonade-sdk might be problematic.

Files changed (61)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +125 -0
  3. lemonade/cache.py +85 -0
  4. lemonade/cli.py +135 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/analyze_model.py +26 -0
  7. lemonade/common/build.py +223 -0
  8. lemonade/common/cli_helpers.py +139 -0
  9. lemonade/common/exceptions.py +98 -0
  10. lemonade/common/filesystem.py +368 -0
  11. lemonade/common/labels.py +61 -0
  12. lemonade/common/onnx_helpers.py +176 -0
  13. lemonade/common/plugins.py +10 -0
  14. lemonade/common/printing.py +110 -0
  15. lemonade/common/status.py +490 -0
  16. lemonade/common/system_info.py +390 -0
  17. lemonade/common/tensor_helpers.py +83 -0
  18. lemonade/common/test_helpers.py +28 -0
  19. lemonade/profilers/__init__.py +1 -0
  20. lemonade/profilers/memory_tracker.py +257 -0
  21. lemonade/profilers/profiler.py +55 -0
  22. lemonade/sequence.py +363 -0
  23. lemonade/state.py +159 -0
  24. lemonade/tools/__init__.py +1 -0
  25. lemonade/tools/adapter.py +104 -0
  26. lemonade/tools/bench.py +284 -0
  27. lemonade/tools/huggingface_bench.py +267 -0
  28. lemonade/tools/huggingface_load.py +520 -0
  29. lemonade/tools/humaneval.py +258 -0
  30. lemonade/tools/llamacpp.py +261 -0
  31. lemonade/tools/llamacpp_bench.py +154 -0
  32. lemonade/tools/management_tools.py +273 -0
  33. lemonade/tools/mmlu.py +327 -0
  34. lemonade/tools/ort_genai/__init__.py +0 -0
  35. lemonade/tools/ort_genai/oga.py +1129 -0
  36. lemonade/tools/ort_genai/oga_bench.py +142 -0
  37. lemonade/tools/perplexity.py +146 -0
  38. lemonade/tools/prompt.py +228 -0
  39. lemonade/tools/quark/__init__.py +0 -0
  40. lemonade/tools/quark/quark_load.py +172 -0
  41. lemonade/tools/quark/quark_quantize.py +439 -0
  42. lemonade/tools/report/__init__.py +0 -0
  43. lemonade/tools/report/llm_report.py +203 -0
  44. lemonade/tools/report/table.py +739 -0
  45. lemonade/tools/server/__init__.py +0 -0
  46. lemonade/tools/server/serve.py +1354 -0
  47. lemonade/tools/server/tool_calls.py +146 -0
  48. lemonade/tools/tool.py +374 -0
  49. lemonade/version.py +1 -0
  50. lemonade_install/__init__.py +1 -0
  51. lemonade_install/install.py +774 -0
  52. lemonade_sdk-7.0.0.dist-info/METADATA +116 -0
  53. lemonade_sdk-7.0.0.dist-info/RECORD +61 -0
  54. lemonade_sdk-7.0.0.dist-info/WHEEL +5 -0
  55. lemonade_sdk-7.0.0.dist-info/entry_points.txt +4 -0
  56. lemonade_sdk-7.0.0.dist-info/licenses/LICENSE +201 -0
  57. lemonade_sdk-7.0.0.dist-info/licenses/NOTICE.md +21 -0
  58. lemonade_sdk-7.0.0.dist-info/top_level.txt +3 -0
  59. lemonade_server/cli.py +260 -0
  60. lemonade_server/model_manager.py +98 -0
  61. lemonade_server/server_models.json +142 -0
lemonade_server/cli.py ADDED
@@ -0,0 +1,260 @@
+ import argparse
+ import sys
+ import os
+ from typing import Tuple
+ import psutil
+ from typing import List
+
+
+ class PullError(Exception):
+     """
+     The pull command has failed to install an LLM
+     """
+
+
+ def serve(
+     port: int,
+     log_level: str = None,
+ ):
+     """
+     Execute the serve command
+     """
+
+     # Check if Lemonade Server is already running
+     _, running_port = get_server_info()
+     if running_port is not None:
+         print(
+             (
+                 f"Lemonade Server is already running on port {running_port}\n"
+                 "Please stop the existing server before starting a new instance."
+             ),
+         )
+         sys.exit(1)
+
+     # Otherwise, start the server
+     print("Starting Lemonade Server...")
+     from lemonade.tools.server.serve import Server, DEFAULT_PORT, DEFAULT_LOG_LEVEL
+
+     server = Server()
+     port = port if port is not None else DEFAULT_PORT
+     log_level = log_level if log_level is not None else DEFAULT_LOG_LEVEL
+
+     # Hidden environment variable to enable input truncation (experimental feature)
+     truncate_inputs = "LEMONADE_TRUNCATE_INPUTS" in os.environ
+
+     server.run(
+         port=port,
+         log_level=log_level,
+         truncate_inputs=truncate_inputs,
+     )
+
+
+ def stop():
+     """
+     Stop the Lemonade Server
+     """
+
+     # Check if Lemonade Server is running
+     running_pid, running_port = get_server_info()
+     if running_port is None:
+         print("Lemonade Server is not running\n")
+         return
+
+     # Stop the server
+     try:
+         process = psutil.Process(running_pid)
+         process.terminate()
+         process.wait(timeout=10)
+     except psutil.NoSuchProcess:
+         # Process already terminated
+         pass
+     except psutil.TimeoutExpired:
+         print("Timed out waiting for Lemonade Server to stop.")
+         sys.exit(1)
+     except Exception as e:  # pylint: disable=broad-exception-caught
+         print(f"Error stopping Lemonade Server: {e}")
+         sys.exit(1)
+     print("Lemonade Server stopped successfully.")
+
+
+ def pull(model_names: List[str]):
+     """
+     Install an LLM based on its Lemonade Server model name
+
+     If Lemonade Server is running, use the pull endpoint to download the model
+     so that the Lemonade Server instance is aware of the pull.
+
+     Otherwise, use ModelManager to install the model.
+     """
+
+     server_running, port = status(verbose=False)
+
+     if server_running:
+         import requests
+
+         base_url = f"http://localhost:{port}/api/v0"
+
+         for model_name in model_names:
+             # Install the model
+             pull_response = requests.post(
+                 f"{base_url}/pull", json={"model_name": model_name}
+             )
+
+             if pull_response.status_code != 200:
+                 raise PullError(
+                     f"Failed to install {model_name}. Check the "
+                     "Lemonade Server log for more information. A list of supported models "
+                     "is provided at "
+                     "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
+                 )
+     else:
+         from lemonade_server.model_manager import ModelManager
+
+         ModelManager().download_models(model_names)
+
+
+ def version():
+     """
+     Print the version number
+     """
+     from lemonade import __version__ as version_number
+
+     print(f"{version_number}")
+
+
+ def status(verbose: bool = True) -> Tuple[bool, int]:
+     """
+     Print the status of the server
+
+     Returns a tuple of:
+     1. Whether the server is running
+     2. What port the server is running on (None if server is not running)
+     """
+     _, port = get_server_info()
+     if port is None:
+         if verbose:
+             print("Server is not running")
+         return False, None
+     else:
+         if verbose:
+             print(f"Server is running on port {port}")
+         return True, port
+
+
+ def is_lemonade_server(pid):
+     """
+     Check whether or not a given PID corresponds to a Lemonade server
+     """
+     try:
+         process = psutil.Process(pid)
+         while True:
+             if process.name() in [  # Windows
+                 "lemonade-server-dev.exe",
+                 "lemonade-server.exe",
+                 "lemonade.exe",
+             ] or process.name() in [  # Linux
+                 "lemonade-server-dev",
+                 "lemonade-server",
+                 "lemonade",
+             ]:
+                 return True
+             if not process.parent():
+                 return False
+             process = process.parent()
+     except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+         return False
+     return False
+
+
+ def get_server_info() -> Tuple[int | None, int | None]:
+     """
+     Returns a tuple of:
+     1. Lemonade Server's PID
+     2. The port that Lemonade Server is running on
+     """
+     # Go over all python processes that have a port open
+     for process in psutil.process_iter(["pid", "name"]):
+         try:
+             connections = process.net_connections()
+             for conn in connections:
+                 if conn.status == "LISTEN":
+                     if is_lemonade_server(process.info["pid"]):
+                         return process.info["pid"], conn.laddr.port
+         except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+             continue
+
+     return None, None
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Serve LLMs on CPU, GPU, and NPU.",
+         usage=argparse.SUPPRESS,
+     )
+
+     # Add version flag
+     parser.add_argument(
+         "-v", "--version", action="store_true", help="Show version number"
+     )
+
+     # Create subparsers for commands
+     subparsers = parser.add_subparsers(
+         title="Available Commands", dest="command", metavar=""
+     )
+
+     # Serve command
+     serve_parser = subparsers.add_parser("serve", help="Start server")
+     serve_parser.add_argument("--port", type=int, help="Port number to serve on")
+     serve_parser.add_argument(
+         "--log-level",
+         type=str,
+         help="Log level for the server",
+         choices=["critical", "error", "warning", "info", "debug", "trace"],
+         default="info",
+     )
+
+     # Status command
+     status_parser = subparsers.add_parser("status", help="Check if server is running")
+
+     # Stop command
+     stop_parser = subparsers.add_parser("stop", help="Stop the server")
+
+     # Pull command
+     pull_parser = subparsers.add_parser(
+         "pull",
+         help="Install an LLM",
+         epilog=(
+             "More information: "
+             "https://github.com/lemonade-sdk/lemonade/blob/main/docs/server/server_models.md"
+         ),
+     )
+     pull_parser.add_argument(
+         "model",
+         help="Lemonade Server model name",
+         nargs="+",
+     )
+
+     args = parser.parse_args()
+
+     if args.version:
+         version()
+     elif args.command == "serve":
+         serve(
+             args.port,
+             args.log_level,
+         )
+     elif args.command == "status":
+         status()
+     elif args.command == "pull":
+         pull(args.model)
+     elif args.command == "stop":
+         stop()
+     elif args.command == "help" or not args.command:
+         parser.print_help()
+
+
+ if __name__ == "__main__":
+     main()
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
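
For orientation, the functions above can also be called directly from Python rather than through the packaged console script (the process names checked in is_lemonade_server suggest the script is named lemonade-server, but that is inferred, not confirmed here). A minimal sketch, assuming lemonade-sdk and psutil are installed and using one example model name from server_models.json:

# Sketch only: status() reports whether a Lemonade Server instance is listening,
# and pull() installs a model via /api/v0/pull when a server is running,
# or via ModelManager otherwise.
from lemonade_server.cli import pull, status

running, port = status()  # prints "Server is running on port ..." or "Server is not running"
pull(["Qwen2.5-0.5B-Instruct-CPU"])  # example model name from server_models.json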
lemonade_server/model_manager.py ADDED
@@ -0,0 +1,98 @@
+ import json
+ import os
+ import huggingface_hub
+ import pkg_resources
+
+
+ class ModelManager:
+
+     @property
+     def supported_models(self) -> dict:
+         """
+         Returns a dictionary of supported models.
+         Note: Models must be downloaded before they are locally available.
+         """
+         # Load the models dictionary from the JSON file
+         server_models_file = os.path.join(
+             os.path.dirname(__file__), "server_models.json"
+         )
+         with open(server_models_file, "r", encoding="utf-8") as file:
+             models = json.load(file)
+
+         # Add the model name as a key in each entry, to make it easier
+         # to access later
+
+         for key, value in models.items():
+             value["model_name"] = key
+
+         return models
+
+     @property
+     def downloaded_hf_checkpoints(self) -> list[str]:
+         """
+         Returns a list of Hugging Face checkpoints that have been downloaded.
+         """
+         downloaded_hf_checkpoints = []
+         try:
+             hf_cache_info = huggingface_hub.scan_cache_dir()
+             downloaded_hf_checkpoints = [entry.repo_id for entry in hf_cache_info.repos]
+         except huggingface_hub.CacheNotFound:
+             pass
+         except Exception as e:  # pylint: disable=broad-exception-caught
+             print(f"Error scanning Hugging Face cache: {e}")
+         return downloaded_hf_checkpoints
+
+     @property
+     def downloaded_models(self) -> dict:
+         """
+         Returns a dictionary of locally available models.
+         """
+         downloaded_models = {}
+         for model in self.supported_models:
+             if (
+                 self.supported_models[model]["checkpoint"]
+                 in self.downloaded_hf_checkpoints
+             ):
+                 downloaded_models[model] = self.supported_models[model]
+         return downloaded_models
+
+     @property
+     def downloaded_models_enabled(self) -> dict:
+         """
+         Returns a dictionary of locally available models that are enabled by
+         the current installation.
+         """
+         hybrid_installed = (
+             "onnxruntime-vitisai" in pkg_resources.working_set.by_key
+             and "onnxruntime-genai-directml-ryzenai" in pkg_resources.working_set.by_key
+         )
+
+         downloaded_models_enabled = {}
+         for model, value in self.downloaded_models.items():
+             if value["recipe"] == "oga-hybrid" and hybrid_installed:
+                 downloaded_models_enabled[model] = value
+             else:
+                 # All other models are CPU models right now
+                 # This logic will get more sophisticated when we
+                 # start to support more backends
+                 downloaded_models_enabled[model] = value
+
+         return downloaded_models_enabled
+
+     def download_models(self, models: list[str]):
+         """
+         Downloads the specified models from Hugging Face.
+         """
+         for model in models:
+             if model not in self.supported_models:
+                 raise ValueError(
+                     f"Model {model} is not supported. Please choose from the following: "
+                     f"{list(self.supported_models.keys())}"
+                 )
+             checkpoint = self.supported_models[model]["checkpoint"]
+             print(f"Downloading {model} ({checkpoint})")
+             huggingface_hub.snapshot_download(repo_id=checkpoint)
+
+
+ # This file was originally licensed under Apache 2.0. It has been modified.
+ # Modifications Copyright (c) 2025 AMD
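
A minimal sketch of using the ModelManager API above, assuming lemonade-sdk and huggingface_hub are installed and network access to Hugging Face is available; the model name is just one example entry from server_models.json:

# Sketch only: list the catalog, see what is already cached locally,
# and download one example checkpoint.
from lemonade_server.model_manager import ModelManager

manager = ModelManager()
print(sorted(manager.supported_models))   # every entry in server_models.json
print(sorted(manager.downloaded_models))  # entries already present in the HF cache
manager.download_models(["Llama-3.2-1B-Instruct-CPU"])  # example model name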
lemonade_server/server_models.json ADDED
@@ -0,0 +1,142 @@
+ {
+     "Qwen2.5-0.5B-Instruct-CPU": {
+         "checkpoint": "amd/Qwen2.5-0.5B-Instruct-quantized_int4-float16-cpu-onnx",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Llama-3.2-1B-Instruct-CPU": {
+         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-uint4-float16-cpu-onnx",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Llama-3.2-3B-Instruct-CPU": {
+         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-uint4-float16-cpu-onnx",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Phi-3-Mini-Instruct-CPU": {
+         "checkpoint": "amd/Phi-3-mini-4k-instruct_int4_float16_onnx_cpu",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "Qwen-1.5-7B-Chat-CPU": {
+         "checkpoint": "amd/Qwen1.5-7B-Chat_uint4_asym_g128_float16_onnx_cpu",
+         "recipe": "oga-cpu",
+         "reasoning": false,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Llama-8B-CPU": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu",
+         "recipe": "oga-cpu",
+         "reasoning": true,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Qwen-7B-CPU": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu",
+         "recipe": "oga-cpu",
+         "reasoning": true,
+         "suggested": true
+     },
+     "Llama-3.2-1B-Instruct-Hybrid": {
+         "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 3000,
+         "suggested": true
+     },
+     "Llama-3.2-3B-Instruct-Hybrid": {
+         "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Phi-3-Mini-Instruct-Hybrid": {
+         "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Phi-3.5-Mini-Instruct-Hybrid": {
+         "checkpoint": "amd/Phi-3.5-mini-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Qwen-1.5-7B-Chat-Hybrid": {
+         "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 3000,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Llama-8B-Hybrid": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": true,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "DeepSeek-R1-Distill-Qwen-7B-Hybrid": {
+         "checkpoint": "amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": true,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Mistral-7B-v0.3-Instruct-Hybrid": {
+         "checkpoint": "amd/Mistral-7B-Instruct-v0.3-awq-g128-int4-asym-fp16-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Llama-3.1-8B-Instruct-Hybrid": {
+         "checkpoint": "amd/Llama-3.1-8B-Instruct-awq-asym-uint4-g128-lmhead-onnx-hybrid",
+         "recipe": "oga-hybrid",
+         "reasoning": false,
+         "max_prompt_length": 2000,
+         "suggested": true
+     },
+     "Llama-3.2-1B-Instruct-DirectML": {
+         "checkpoint": "amd/Llama-3.2-1B-Instruct-dml-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Llama-3.2-3B-Instruct-DirectML": {
+         "checkpoint": "amd/Llama-3.2-3B-Instruct-dml-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Phi-3.5-Mini-Instruct-DirectML": {
+         "checkpoint": "amd/phi3.5-mini-instruct-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Qwen-1.5-7B-Chat-DirectML": {
+         "checkpoint": "amd/Qwen1.5-7B-Chat-dml-int4-awq-block-128-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Mistral-7B-v0.1-Instruct-DirectML": {
+         "checkpoint": "amd/Mistral-7B-Instruct-v0.1-awq-g128-int4-onnx-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     },
+     "Llama-3-8B-Instruct-DirectML": {
+         "checkpoint": "amd/llama3-8b-instruct-awq-g128-int4-onnx-directml",
+         "recipe": "oga-igpu",
+         "reasoning": false,
+         "suggested": false
+     }
+ }
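
Each entry in server_models.json carries a "checkpoint" (Hugging Face repo id), a "recipe" (oga-cpu, oga-hybrid, or oga-igpu), "reasoning" and "suggested" flags, and an optional "max_prompt_length". A minimal sketch of consuming this catalog the same way ModelManager.supported_models does (the path relative to the installed lemonade_server package is an assumption for illustration):

import json
import os

# Assumes this runs as a module inside the lemonade_server package directory,
# mirroring how ModelManager locates the JSON file.
models_path = os.path.join(os.path.dirname(__file__), "server_models.json")
with open(models_path, "r", encoding="utf-8") as f:
    models = json.load(f)

# Keep only hybrid-recipe entries that the catalog marks as suggested.
suggested_hybrid = {
    name: entry
    for name, entry in models.items()
    if entry["recipe"] == "oga-hybrid" and entry["suggested"]
}
print(sorted(suggested_hybrid))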