lemonade-sdk 9.1.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- lemonade/__init__.py +5 -0
- lemonade/api.py +180 -0
- lemonade/cache.py +92 -0
- lemonade/cli.py +173 -0
- lemonade/common/__init__.py +0 -0
- lemonade/common/build.py +176 -0
- lemonade/common/cli_helpers.py +139 -0
- lemonade/common/exceptions.py +98 -0
- lemonade/common/filesystem.py +368 -0
- lemonade/common/inference_engines.py +408 -0
- lemonade/common/network.py +93 -0
- lemonade/common/printing.py +110 -0
- lemonade/common/status.py +471 -0
- lemonade/common/system_info.py +1411 -0
- lemonade/common/test_helpers.py +28 -0
- lemonade/profilers/__init__.py +1 -0
- lemonade/profilers/agt_power.py +437 -0
- lemonade/profilers/hwinfo_power.py +429 -0
- lemonade/profilers/memory_tracker.py +259 -0
- lemonade/profilers/profiler.py +58 -0
- lemonade/sequence.py +363 -0
- lemonade/state.py +159 -0
- lemonade/tools/__init__.py +1 -0
- lemonade/tools/accuracy.py +432 -0
- lemonade/tools/adapter.py +114 -0
- lemonade/tools/bench.py +302 -0
- lemonade/tools/flm/__init__.py +1 -0
- lemonade/tools/flm/utils.py +305 -0
- lemonade/tools/huggingface/bench.py +187 -0
- lemonade/tools/huggingface/load.py +235 -0
- lemonade/tools/huggingface/utils.py +359 -0
- lemonade/tools/humaneval.py +264 -0
- lemonade/tools/llamacpp/bench.py +255 -0
- lemonade/tools/llamacpp/load.py +222 -0
- lemonade/tools/llamacpp/utils.py +1260 -0
- lemonade/tools/management_tools.py +319 -0
- lemonade/tools/mmlu.py +319 -0
- lemonade/tools/oga/__init__.py +0 -0
- lemonade/tools/oga/bench.py +120 -0
- lemonade/tools/oga/load.py +804 -0
- lemonade/tools/oga/migration.py +403 -0
- lemonade/tools/oga/utils.py +462 -0
- lemonade/tools/perplexity.py +147 -0
- lemonade/tools/prompt.py +263 -0
- lemonade/tools/report/__init__.py +0 -0
- lemonade/tools/report/llm_report.py +203 -0
- lemonade/tools/report/table.py +899 -0
- lemonade/tools/server/__init__.py +0 -0
- lemonade/tools/server/flm.py +133 -0
- lemonade/tools/server/llamacpp.py +320 -0
- lemonade/tools/server/serve.py +2123 -0
- lemonade/tools/server/static/favicon.ico +0 -0
- lemonade/tools/server/static/index.html +279 -0
- lemonade/tools/server/static/js/chat.js +1059 -0
- lemonade/tools/server/static/js/model-settings.js +183 -0
- lemonade/tools/server/static/js/models.js +1395 -0
- lemonade/tools/server/static/js/shared.js +556 -0
- lemonade/tools/server/static/logs.html +191 -0
- lemonade/tools/server/static/styles.css +2654 -0
- lemonade/tools/server/static/webapp.html +321 -0
- lemonade/tools/server/tool_calls.py +153 -0
- lemonade/tools/server/tray.py +664 -0
- lemonade/tools/server/utils/macos_tray.py +226 -0
- lemonade/tools/server/utils/port.py +77 -0
- lemonade/tools/server/utils/thread.py +85 -0
- lemonade/tools/server/utils/windows_tray.py +408 -0
- lemonade/tools/server/webapp.py +34 -0
- lemonade/tools/server/wrapped_server.py +559 -0
- lemonade/tools/tool.py +374 -0
- lemonade/version.py +1 -0
- lemonade_install/__init__.py +1 -0
- lemonade_install/install.py +239 -0
- lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
- lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
- lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
- lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
- lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
- lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
- lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
- lemonade_server/cli.py +805 -0
- lemonade_server/model_manager.py +758 -0
- lemonade_server/pydantic_models.py +159 -0
- lemonade_server/server_models.json +643 -0
- lemonade_server/settings.py +39 -0
lemonade/tools/oga/load.py
@@ -0,0 +1,804 @@
# onnxruntime_genai is not lint-friendly yet and PyLint can't
# find any of the class methods
# pylint: disable=no-member

import argparse
import subprocess
import sys
import os
import json
import webbrowser
from fnmatch import fnmatch

from lemonade.state import State
from lemonade.tools import FirstTool
from lemonade.cache import Keys
import lemonade.common.status as status
import lemonade.common.printing as printing
from lemonade_install.install import (
    _get_ryzenai_version_info,
    SUPPORTED_RYZEN_AI_SERIES,
    NPU_DRIVER_DOWNLOAD_URL,
    REQUIRED_NPU_DRIVER_VERSION,
)

# ONNX Runtime GenAI models will be cached in this subfolder of the lemonade cache folder
oga_models_path = "oga_models"

# ONNX Runtime GenAI model builder tool uses this subfolder of the lemonade cache as its cache
oga_model_builder_cache_path = "model_builder"

# Mapping from processor to execution provider, used in pathnames and by model_builder
execution_providers = {
    "cpu": "cpu",
    "npu": "npu",
    "igpu": "dml",
    "hybrid": "hybrid",
    "cuda": "cuda",
}


def find_onnx_files_recursively(directory):
    """
    Recursively search for ONNX files in a directory and its subdirectories.
    """
    for _, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".onnx"):
                return True
    return False


def _get_npu_driver_version():
    """
    Get the NPU driver version using PowerShell directly.
    Returns the driver version string or None if not found.
    """
    try:

        # Use PowerShell directly to avoid wmi issues in embedded Python environments
        powershell_cmd = [
            "powershell",
            "-NoProfile",
            "-ExecutionPolicy",
            "Bypass",
            "-Command",
            (
                "Get-WmiObject -Class Win32_PnPSignedDriver | "
                'Where-Object { $_.DeviceName -like "*NPU Compute Accelerator Device*" } | '
                "Select-Object -ExpandProperty DriverVersion"
            ),
        ]

        result = subprocess.run(
            powershell_cmd, capture_output=True, text=True, check=True, timeout=30
        )

        driver_version = result.stdout.strip()

        if driver_version and driver_version != "":
            return driver_version
        else:
            return None

    except Exception:  # pylint: disable=broad-except
        return None


def _compare_driver_versions(current_version, required_version):
    """
    Compare two driver version strings.
    Returns True if current_version >= required_version, False otherwise.
    Uses packaging.version for proper semantic version comparison.
    """
    from packaging.version import Version

    return Version(current_version) >= Version(required_version)


def import_error_heler(e: Exception):
    """
    Print a helpful message in the event of an import error
    """
    raise ImportError(
        f"{e}\n Please install lemonade-sdk with "
        "one of the oga extras, for example:\n"
        "pip install lemonade-sdk[dev,oga-cpu]\n"
        "See https://lemonade-server.ai/install_options.html for details"
    )


def _open_driver_install_page():
    """
    Opens the driver installation page in the user's default web browser.
    """
    try:
        driver_page_url = "https://lemonade-server.ai/driver_install.html"
        printing.log_info(f"Opening driver installation guide: {driver_page_url}")
        webbrowser.open(driver_page_url)
    except Exception as e:  # pylint: disable=broad-except
        printing.log_info(f"Could not open browser automatically: {e}")
        printing.log_info(
            "Please visit https://lemonade-server.ai/driver_install.html "
            "for driver installation instructions."
        )


class OgaLoad(FirstTool):
    """
    Tool that loads an LLM in OnnxRuntime-GenAI for use with CPU or DirectML execution providers.

    Input: path to a checkpoint.
        Supported choices for cpu and igpu from HF model repository:
            LLM models on Huggingface supported by model_builder. See documentation
            (https://github.com/lemonade-sdk/lemonade/blob/main/docs/dev_cli/ort_genai_igpu.md)
            for supported models.
        Supported choices for npu from HF model repository:
            Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
        Local models for cpu, igpu, or npu:
            The specified checkpoint is converted to a local path, via mapping to lower case
            and replacing '/' with '_'. If this model already exists in the 'models' folder
            of the lemonade cache and if it has a subfolder <device>-<dtype>, then this model
            will be used. If the --force flag is used and the model is built with model_builder,
            then it will be rebuilt.



    Output:
        state.model: handle to a Huggingface-style LLM loaded on DirectML device
        state.tokenizer = Huggingface-style LLM tokenizer instance
        state.dtype = data type of the model on DirectML device
        state.checkpoint = name of the checkpoint used to load state.model

    Note: This tool expects the onnxruntime-genai-directml library to be pre-installed.
        If that library is not installed, this tool will not load.
    """

    unique_name = "oga-load"

    def __init__(self):
        super().__init__(monitor_message="Loading OnnxRuntime-GenAI model")

        self.status_stats = [
            Keys.DTYPE,
            Keys.DEVICE,
            Keys.LOCAL_MODEL_FOLDER,
        ]

    @staticmethod
    def parser(add_help: bool = True) -> argparse.ArgumentParser:
        parser = __class__.helpful_parser(
            short_description="Load model in onnxruntime-genai (OGA)",
            add_help=add_help,
        )

        parser.add_argument(
            "-ip",
            "--input_path",
            default="",
            help="the local huggingface model in your disk",
        )

        parser.add_argument(
            "-d",
            "--device",
            choices=["igpu", "npu", "cpu", "hybrid", "cuda"],
            default="igpu",
            help="Which device to load the model on to (default: igpu)",
        )

        parser.add_argument(
            "--dtype",
            choices=["int4", "fp16", "fp32"],
            required=True,
            help="Data type to load the model in",
        )

        parser.add_argument(
            "--int4-block-size",
            default=None,
            help="Specify the block_size for int4 quantization.",
            choices=[16, 32, 64, 128, 256],
            type=int,
        )

        parser.add_argument(
            "--force",
            action="store_true",
            help="Forces downloading of Hugging-Face model again (if changed). Additionally for"
            " cpu and igpu devices only, forces model_builder to run again on the HF model"
            " (changed or not).",
        )

        parser.add_argument(
            "--download-only",
            action="store_true",
            help="Download the model if needed, but don't load it",
        )

        parser.add_argument(
            "--trust-remote-code",
            action="store_true",
            help="Set this flag to use models whose code is on the Hugging Face hub rather "
            "than natively in the OnnxRuntime Gen AI libraries. Please review the model code "
            "in advance as this is a security risk.",
        )

        parser.add_argument(
            "--subfolder",
            default=None,
            help="Subfolder where model is located <LEMONADE CACHE>/oga_models/<MODELNAME>"
            "/<SUBFOLDER>, default is <EP for device>-<dtype>. The EPs are: "
            f'{", ".join([value + " for " + key for key, value in execution_providers.items()])}.',
        )

        return parser

    @staticmethod
    def _validate_model_configuration(device, dtype, checkpoint):
        """
        Validate if the device, dtype, platform and checkpoint combination are consistent with
        HuggingFace checkpoint naming conventions and specifically for AMD models for NPU
        and hybrid flows.

        Returns True if device, dtype, and model are consistent.
        """

        hf_supported_models = {
            "cpu": {"int4": "*/*", "fp32": "*/*"},
            "igpu": {"int4": "*/*", "fp16": "*/*"},
            "npu": {"int4": "*/*"},
            "hybrid": {"int4": "*/*"},
            "cuda": {"int4": "*/*", "fp16": "*/*"},
        }

        hf_supported = (
            device in hf_supported_models
            and dtype in hf_supported_models[device]
            and fnmatch(checkpoint, hf_supported_models[device][dtype])
        )
        return hf_supported

    @staticmethod
    def _setup_model_paths(
        state, checkpoint, device, dtype, subfolder, int4_block_size
    ):
        """
        Determines and returns the following model path information for models produced by OGA
        model builder:

            full_model_path - Full path to where the OGA model files are stored.
            oga_models_subfolder - The subfolder of the oga_models folder where the model files
                are stored. (<full_model_path> = <oga_models>/<oga_models_subfolder>)
                This subfolder is usually
                    <checkpoint_string>/<device>-<dtype>[-block-<int4_block_size]>
                but the if the argument subfolder is not None it will override the latter part
                of this path.
            model_exists_locally - True if full_model_path is a folder that contains files

        Note: Model files already in ONNX format on Hugging Face will be stored in the
            Hugging Face cache, not this folder. The <oga_models> folder contains model
            files that have locally been quantized/converted to OGA format and any other
            models that have been manually added by the user.
        """
        from lemonade.common.network import custom_snapshot_download

        if subfolder is None:
            subfolder = f"{execution_providers[device]}-{dtype}"
            subfolder += (
                f"-block-{int4_block_size}"
                if dtype == "int4" and int4_block_size is not None
                else ""
            )

        # First, check in the lemonade oga_models cache
        oga_models_subfolder = os.path.join(
            checkpoint.replace("/", "_").lower(), subfolder
        )
        full_model_path = os.path.join(
            state.cache_dir, oga_models_path, oga_models_subfolder
        )
        model_exists_locally = os.path.isdir(full_model_path) and os.listdir(
            full_model_path
        )

        # If not found in lemonade cache, check in Hugging Face cache
        if not model_exists_locally:
            try:
                snapshot_path = custom_snapshot_download(
                    checkpoint,
                    local_files_only=True,
                )

                # Check if the snapshot contains ONNX files
                if os.path.isdir(snapshot_path) and os.listdir(snapshot_path):
                    is_onnx_model = any(
                        filename.endswith(".onnx")
                        for filename in os.listdir(snapshot_path)
                    )

                    if is_onnx_model:
                        # If the model is in HF cache and has ONNX files, use it
                        full_model_path = snapshot_path
                        model_exists_locally = True
                        printing.log_info(
                            f"Found ONNX model in Hugging Face cache: {full_model_path}"
                        )
            except Exception as e:  # pylint: disable=broad-exception-caught
                # Log any errors but continue with the original path
                printing.log_info(f"Error checking Hugging Face cache: {e}")

        return full_model_path, model_exists_locally

    @staticmethod
    def _setup_model_dependencies(full_model_path, device, ryzenai_version, oga_path):
        # pylint: disable=unused-argument
        """
        Sets up model dependencies for hybrid and NPU inference by:
        1. Configuring the custom_ops_library path in genai_config.json.
        2. Adding DLL source directories to PATH for dependent DLL discovery.
        3. Check NPU driver version if required for device and ryzenai_version.
        """

        # For RyzenAI 1.6.0, check NPU driver version for NPU and hybrid devices
        if device in ["npu", "hybrid"]:
            required_driver_version = REQUIRED_NPU_DRIVER_VERSION

            current_driver_version = _get_npu_driver_version()
            rai_version, _ = _get_ryzenai_version_info(device)

            if not current_driver_version:
                printing.log_warning(
                    f"NPU driver not found. {device.upper()} inference requires NPU driver "
                    f"version {required_driver_version}.\n"
                    "Please download and install the NPU Driver from:\n"
                    f"{NPU_DRIVER_DOWNLOAD_URL}\n"
                    "NPU functionality may not work properly."
                )
                _open_driver_install_page()

            elif not _compare_driver_versions(
                current_driver_version, required_driver_version
            ):
                printing.log_warning(
                    f"Incorrect NPU driver version detected: {current_driver_version}\n"
                    f"{device.upper()} inference with RyzenAI {rai_version} requires driver "
                    f"version {required_driver_version} or higher.\n"
                    "Please download and install the correct NPU Driver from:\n"
                    f"{NPU_DRIVER_DOWNLOAD_URL}\n"
                    "NPU functionality may not work properly."
                )
                _open_driver_install_page()

        # Setup DLL paths for NPU/hybrid inference
        # Use sys.prefix to get the environment root (works for both venv and conda)
        # - Conda: sys.executable is at env/python.exe, sys.prefix is env/
        # - Venv: sys.executable is at .venv/Scripts/python.exe, sys.prefix is .venv/
        env_path = sys.prefix
        dll_source_path = os.path.join(
            env_path, "Lib", "site-packages", "onnxruntime_genai"
        )
        required_dlls = ["libutf8_validity.dll", "abseil_dll.dll"]

        # Validate that all required DLLs exist in the source directory
        missing_dlls = []

        for dll_name in required_dlls:
            dll_source = os.path.join(dll_source_path, dll_name)
            if not os.path.exists(dll_source):
                missing_dlls.append(dll_source)

        if missing_dlls:
            dll_list = "\n - ".join(missing_dlls)
            raise RuntimeError(
                f"Required DLLs not found for {device} inference:\n - {dll_list}\n"
                f"Please ensure your RyzenAI installation is complete and supports {device}.\n"
                "Please reinstall the RyzenAI Software for your platform. Run:\n"
                " pip install lemonade-sdk[oga-ryzenai]\n"
            )

        # Add the DLL source directory to PATH
        current_path = os.environ.get("PATH", "")
        if dll_source_path not in current_path:
            os.environ["PATH"] = dll_source_path + os.pathsep + current_path

    @staticmethod
    def _is_preoptimized_model(input_model_path):
        """
        Checks if the 'custom_ops_library' field exists in the genai_config.json file
        to determine if this is a pre-optimized model for hybrid as well
        as NPU only.

        Args:
            input_model_path (str): Path to the input model directory.

        Returns:
            bool: True if 'custom_ops_library' exists, False otherwise.
        """
        config_path = os.path.join(input_model_path, "genai_config.json")
        if not os.path.exists(config_path):
            printing.log_info(f"Model's `genai_config.json` not found in {config_path}")
            return False

        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
        if (
            "model" in config
            and "decoder" in config["model"]
            and "session_options" in config["model"]["decoder"]
        ):
            return "custom_ops_library" in config["model"]["decoder"]["session_options"]
        return False

    @staticmethod
    def _download_and_build_safetensors_model(
        checkpoint, device, dtype, full_model_path, int4_block_size, input_path, state
    ):
        """
        Uses OGA model builder to quantize safetensors format model and convert to ONNX
        format. The model files are saved to the full_model_path folder.
        """

        try:
            import onnxruntime_genai.models.builder as model_builder
        except ImportError as e:
            import_error_heler(e)

        printing.log_info(f"Building {checkpoint} for {device} using {dtype}")
        extra_options = {}
        if int4_block_size is not None:
            extra_options["int4-block-size"] = int4_block_size
        try:
            model_builder.create_model(
                checkpoint,
                input_path,
                full_model_path,
                dtype,
                execution_providers[device],
                os.path.join(state.cache_dir, oga_model_builder_cache_path),
                **extra_options,
            )
        except NotImplementedError as e:
            raise NotImplementedError("[Model builder] " + str(e)) from e
        except OSError as e:
            raise ValueError("[Model builder] " + str(e)) from e

        return full_model_path

    @staticmethod
    def _load_model_and_setup_state(
        state, full_model_path, checkpoint, trust_remote_code
    ):
        """
        Loads the OGA model from local folder and then loads the tokenizer.
        Will auto-detect if we're offline.
        """

        try:
            from lemonade.tools.oga.utils import OrtGenaiModel, OrtGenaiTokenizer
            from lemonade.common.network import is_offline
        except ImportError as e:
            import_error_heler(e)

        try:
            state.model = OrtGenaiModel(full_model_path)
        except Exception as e:
            if "invalid unordered_map<K, T>" in str(e):
                raise ValueError(
                    "Error initializing model: Invalid configuration detected.\n"
                    "Please check the following:\n"
                    f"1. Please check your model's config file in {full_model_path} "
                    "and ensure custom_ops_library points to the valid "
                    "onnx_custom_ops.dll path.\n"
                    "2. Make sure the NPU driver is loaded.\n"
                    "3. Make sure hybrid has been installed on a Ryzen AI "
                    f"{'or '.join(SUPPORTED_RYZEN_AI_SERIES)}-series processor."
                ) from e
            raise

        # Auto-detect offline mode
        offline = is_offline()

        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            import_error_heler(e)

        try:
            # Always try to use local files first
            local_files_only = True

            hf_tokenizer = AutoTokenizer.from_pretrained(
                full_model_path,
                local_files_only=local_files_only,
                trust_remote_code=trust_remote_code,
            )
        except ValueError as e:
            if "trust_remote_code" in str(e):
                raise ValueError(
                    "This model requires you to execute code from the repo. Please review it "
                    "and if you trust it, then use the `--trust-remote-code` flag with oga-load."
                )

            if offline and "Can't load tokenizer for" in str(e):
                raise ValueError(
                    f"Cannot load tokenizer for {checkpoint} in offline mode. "
                    f"The tokenizer files may not be available locally in {full_model_path}."
                )
            raise

        state.tokenizer = OrtGenaiTokenizer(
            state.model.model,
            hf_tokenizer,
        )

        status.add_to_state(state=state, name=checkpoint, model=checkpoint)

    @staticmethod
    def _cleanup_environment(saved_state):
        """
        Restores environment to its original state after inference is complete.
        """
        if saved_state:
            os.chdir(saved_state["cwd"])
            os.environ["PATH"] = saved_state["path"]

    def _generate_model_for_oga(self, output_model_path, device, input_model_path):
        """
        Uses the model_generate tool to generate the model for OGA hybrid or npu targets.
        """
        try:
            import model_generate
        except ImportError as e:
            raise ImportError(
                f"{e}\nYou are trying to use a developer tool that may not be "
                "installed. Please install the required package using:\n"
                "pip install -e .[dev,oga-ryzenai] \
                    --extra-index-url https://pypi.amd.com/simple"
            )

        # Determine the appropriate flag based on the device type
        if device == "hybrid":
            device_flag = "hybrid"
        elif device == "npu":
            device_flag = "npu"
        else:
            raise ValueError(f"Unsupported device type for model generation: {device}")

        printing.log_info(
            f"Generating model for device: {device_flag}, \
                input: {input_model_path}, output: {output_model_path}"
        )

        try:
            if device_flag == "npu":
                model_generate.generate_npu_model(
                    input_model=input_model_path,
                    output_dir=output_model_path,
                    packed_const=False,
                )
            else:  # hybrid
                model_generate.generate_hybrid_model(
                    input_model=input_model_path,
                    output_dir=output_model_path,
                    script_option="jit_npu",
                    mode="bf16",
                    dml_only=False,
                )
        except Exception as e:
            raise RuntimeError(
                f"Failed to generate model for {device_flag} device. Error: {e}"
            ) from e

    def run(
        self,
        state: State,
        input: str,
        input_path: str = "",
        device: str = "igpu",
        dtype: str = "int4",
        int4_block_size: int = None,
        force: bool = False,
        download_only: bool = False,
        trust_remote_code=False,
        subfolder: str = None,
        do_not_upgrade: bool = False,
    ) -> State:
        from lemonade.common.network import (
            custom_snapshot_download,
            get_base_model,
            is_offline,
        )

        # Auto-detect offline status
        offline = is_offline()
        if offline:
            printing.log_warning(
                "Network connectivity to huggingface.co not detected. Running in offline mode."
            )

        state.device = device
        state.dtype = dtype

        # Log initial stats
        state.save_stat(Keys.DTYPE, dtype)
        state.save_stat(Keys.DEVICE, device)
        if device in ["hybrid", "npu"]:
            ryzenai_version, _ = _get_ryzenai_version_info(device)
            ryzen_ai_version_info = {"version": ryzenai_version}
            state.save_stat(Keys.RYZEN_AI_VERSION_INFO, ryzen_ai_version_info)

        # Check if input is a local folder
        if os.path.isdir(input):
            # input is a local folder
            full_model_path = os.path.abspath(input)
            checkpoint = "local_model"
            state.checkpoint = checkpoint
            state.save_stat(Keys.CHECKPOINT, checkpoint)
            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
            # See if there is a file ending in ".onnx" in this folder
            has_onnx_file = find_onnx_files_recursively(input)
            if not has_onnx_file:
                raise ValueError(
                    f"The folder {input} does not contain an ONNX model file."
                )
            if force:
                raise ValueError(
                    "Your input (-i, --input) points to a local folder, which is not "
                    "compatible with the force argument."
                )

        else:
            # input is a model checkpoint
            checkpoint = input
            state.checkpoint = checkpoint
            state.save_stat(Keys.CHECKPOINT, checkpoint)

            # Get base model information
            if not offline:
                base_model = get_base_model(checkpoint)
                if base_model is not None:
                    state.save_stat("base_model", base_model)

            # Setup paths
            full_model_path, model_exists_locally = self._setup_model_paths(
                state, checkpoint, device, dtype, subfolder, int4_block_size
            )

            # If in offline mode, we can only use locally available models
            if offline and not model_exists_locally:
                raise ValueError(
                    f"Model {checkpoint} is not available locally for {device} with {dtype}. "
                    f"Cannot download in offline mode. Check {full_model_path}"
                )

            # Handle download/build if needed
            if (not model_exists_locally) or force:
                if offline:
                    raise ValueError(
                        f"Cannot download or build model {checkpoint} in offline mode"
                    )

                # Validate configuration
                hf_supported = self._validate_model_configuration(
                    device, dtype, checkpoint
                )

                if not hf_supported:
                    raise ValueError(
                        "The (device, dtype, checkpoint) combination is not supported: "
                        f"({device}, {dtype}, {checkpoint})"
                    )
                input_model_path = custom_snapshot_download(
                    checkpoint,
                    ignore_patterns=["*.md", "*.txt"],
                    local_files_only=offline or do_not_upgrade,
                )
                # Check if model is ONNX or safetensors
                is_onnx_model = any(
                    [
                        filename.endswith(".onnx")
                        for filename in os.listdir(input_model_path)
                    ]
                )
                is_preoptimized_onnx = is_onnx_model and self._is_preoptimized_model(
                    input_model_path
                )
                is_safetensors_model = any(
                    [
                        filename.endswith(".safetensors")
                        for filename in os.listdir(input_model_path)
                    ]
                )
                if not (is_onnx_model or is_safetensors_model):
                    raise ValueError(
                        f"The model {checkpoint} is not supported. "
                        "It does not contain ONNX or safetensors files."
                    )
                if device in ["npu", "hybrid"]:
                    if is_onnx_model:
                        if is_preoptimized_onnx:
                            # Use HuggingFace cache path as it is
                            full_model_path = input_model_path
                        else:
                            # If ONNX but not modified yet for Hybrid or NPU,
                            # needs further optimization
                            self._generate_model_for_oga(
                                full_model_path,
                                device,
                                input_model_path,
                            )
                    elif is_safetensors_model:
                        config_path = os.path.join(input_model_path, "config.json")
                        if os.path.exists(config_path):
                            with open(config_path, "r", encoding="utf-8") as f:
                                config = json.load(f)
                            if "quantization_config" in config:
                                # If quantized, use subprocess to generate the model
                                self._generate_model_for_oga(
                                    full_model_path, device, input_model_path
                                )
                            else:
                                raise ValueError(
                                    f"The safetensors model {checkpoint} is not quantized. "
                                    "Only quantized safetensors models are supported"
                                    " on npu or hybrid targets."
                                )
                        else:
                            raise ValueError(
                                f"config.json not found for safetensors model: {checkpoint}"
                            )
                    else:
                        raise ValueError(
                            f"Unsupported model type for checkpoint: {checkpoint}"
                        )
                else:
                    if is_onnx_model:
                        # Use HuggingFace cache path as it is
                        full_model_path = input_model_path
                    else:
                        self._download_and_build_safetensors_model(
                            checkpoint,
                            device,
                            dtype,
                            full_model_path,
                            int4_block_size,
                            input_path,
                            state,
                        )
            state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)

        # Load model if download-only argument is not set
        if not download_only:
            # Get version information for NPU/Hybrid devices
            if device in ["hybrid", "npu"]:
                ryzenai_version, oga_path = _get_ryzenai_version_info(device)
            else:
                ryzenai_version, oga_path = None, None

            saved_env_state = None

            # Setup model dependencies for NPU/Hybrid devices
            if device in ["hybrid", "npu"]:
                self._setup_model_dependencies(
                    full_model_path, device, ryzenai_version, oga_path
                )

            try:
                if device == "npu":
                    # Set USE_AIE_RoPE based on model type
                    os.environ["USE_AIE_RoPE"] = (
                        "0" if "phi-" in checkpoint.lower() else "1"
                    )

                self._load_model_and_setup_state(
                    state, full_model_path, checkpoint, trust_remote_code
                )
            finally:
                self._cleanup_environment(saved_env_state)

        return state


# This file was originally licensed under Apache 2.0. It has been modified.
# Modifications Copyright (c) 2025 AMD