lemonade-sdk 9.1.1 (lemonade_sdk-9.1.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. lemonade/__init__.py +5 -0
  2. lemonade/api.py +180 -0
  3. lemonade/cache.py +92 -0
  4. lemonade/cli.py +173 -0
  5. lemonade/common/__init__.py +0 -0
  6. lemonade/common/build.py +176 -0
  7. lemonade/common/cli_helpers.py +139 -0
  8. lemonade/common/exceptions.py +98 -0
  9. lemonade/common/filesystem.py +368 -0
  10. lemonade/common/inference_engines.py +408 -0
  11. lemonade/common/network.py +93 -0
  12. lemonade/common/printing.py +110 -0
  13. lemonade/common/status.py +471 -0
  14. lemonade/common/system_info.py +1411 -0
  15. lemonade/common/test_helpers.py +28 -0
  16. lemonade/profilers/__init__.py +1 -0
  17. lemonade/profilers/agt_power.py +437 -0
  18. lemonade/profilers/hwinfo_power.py +429 -0
  19. lemonade/profilers/memory_tracker.py +259 -0
  20. lemonade/profilers/profiler.py +58 -0
  21. lemonade/sequence.py +363 -0
  22. lemonade/state.py +159 -0
  23. lemonade/tools/__init__.py +1 -0
  24. lemonade/tools/accuracy.py +432 -0
  25. lemonade/tools/adapter.py +114 -0
  26. lemonade/tools/bench.py +302 -0
  27. lemonade/tools/flm/__init__.py +1 -0
  28. lemonade/tools/flm/utils.py +305 -0
  29. lemonade/tools/huggingface/bench.py +187 -0
  30. lemonade/tools/huggingface/load.py +235 -0
  31. lemonade/tools/huggingface/utils.py +359 -0
  32. lemonade/tools/humaneval.py +264 -0
  33. lemonade/tools/llamacpp/bench.py +255 -0
  34. lemonade/tools/llamacpp/load.py +222 -0
  35. lemonade/tools/llamacpp/utils.py +1260 -0
  36. lemonade/tools/management_tools.py +319 -0
  37. lemonade/tools/mmlu.py +319 -0
  38. lemonade/tools/oga/__init__.py +0 -0
  39. lemonade/tools/oga/bench.py +120 -0
  40. lemonade/tools/oga/load.py +804 -0
  41. lemonade/tools/oga/migration.py +403 -0
  42. lemonade/tools/oga/utils.py +462 -0
  43. lemonade/tools/perplexity.py +147 -0
  44. lemonade/tools/prompt.py +263 -0
  45. lemonade/tools/report/__init__.py +0 -0
  46. lemonade/tools/report/llm_report.py +203 -0
  47. lemonade/tools/report/table.py +899 -0
  48. lemonade/tools/server/__init__.py +0 -0
  49. lemonade/tools/server/flm.py +133 -0
  50. lemonade/tools/server/llamacpp.py +320 -0
  51. lemonade/tools/server/serve.py +2123 -0
  52. lemonade/tools/server/static/favicon.ico +0 -0
  53. lemonade/tools/server/static/index.html +279 -0
  54. lemonade/tools/server/static/js/chat.js +1059 -0
  55. lemonade/tools/server/static/js/model-settings.js +183 -0
  56. lemonade/tools/server/static/js/models.js +1395 -0
  57. lemonade/tools/server/static/js/shared.js +556 -0
  58. lemonade/tools/server/static/logs.html +191 -0
  59. lemonade/tools/server/static/styles.css +2654 -0
  60. lemonade/tools/server/static/webapp.html +321 -0
  61. lemonade/tools/server/tool_calls.py +153 -0
  62. lemonade/tools/server/tray.py +664 -0
  63. lemonade/tools/server/utils/macos_tray.py +226 -0
  64. lemonade/tools/server/utils/port.py +77 -0
  65. lemonade/tools/server/utils/thread.py +85 -0
  66. lemonade/tools/server/utils/windows_tray.py +408 -0
  67. lemonade/tools/server/webapp.py +34 -0
  68. lemonade/tools/server/wrapped_server.py +559 -0
  69. lemonade/tools/tool.py +374 -0
  70. lemonade/version.py +1 -0
  71. lemonade_install/__init__.py +1 -0
  72. lemonade_install/install.py +239 -0
  73. lemonade_sdk-9.1.1.dist-info/METADATA +276 -0
  74. lemonade_sdk-9.1.1.dist-info/RECORD +84 -0
  75. lemonade_sdk-9.1.1.dist-info/WHEEL +5 -0
  76. lemonade_sdk-9.1.1.dist-info/entry_points.txt +5 -0
  77. lemonade_sdk-9.1.1.dist-info/licenses/LICENSE +201 -0
  78. lemonade_sdk-9.1.1.dist-info/licenses/NOTICE.md +47 -0
  79. lemonade_sdk-9.1.1.dist-info/top_level.txt +3 -0
  80. lemonade_server/cli.py +805 -0
  81. lemonade_server/model_manager.py +758 -0
  82. lemonade_server/pydantic_models.py +159 -0
  83. lemonade_server/server_models.json +643 -0
  84. lemonade_server/settings.py +39 -0
lemonade/tools/oga/load.py
@@ -0,0 +1,804 @@
+ # onnxruntime_genai is not lint-friendly yet and PyLint can't
+ # find any of the class methods
+ # pylint: disable=no-member
+
+ import argparse
+ import subprocess
+ import sys
+ import os
+ import json
+ import webbrowser
+ from fnmatch import fnmatch
+
+ from lemonade.state import State
+ from lemonade.tools import FirstTool
+ from lemonade.cache import Keys
+ import lemonade.common.status as status
+ import lemonade.common.printing as printing
+ from lemonade_install.install import (
+     _get_ryzenai_version_info,
+     SUPPORTED_RYZEN_AI_SERIES,
+     NPU_DRIVER_DOWNLOAD_URL,
+     REQUIRED_NPU_DRIVER_VERSION,
+ )
+
+ # ONNX Runtime GenAI models will be cached in this subfolder of the lemonade cache folder
+ oga_models_path = "oga_models"
+
+ # ONNX Runtime GenAI model builder tool uses this subfolder of the lemonade cache as its cache
+ oga_model_builder_cache_path = "model_builder"
+
+ # Mapping from processor to execution provider, used in pathnames and by model_builder
+ execution_providers = {
+     "cpu": "cpu",
+     "npu": "npu",
+     "igpu": "dml",
+     "hybrid": "hybrid",
+     "cuda": "cuda",
+ }
+
+
+ def find_onnx_files_recursively(directory):
+     """
+     Recursively search a directory and its subdirectories for ONNX files.
+     Returns True if any ".onnx" file is found, False otherwise.
+     """
+     for _, _, files in os.walk(directory):
+         for file in files:
+             if file.endswith(".onnx"):
+                 return True
+     return False
+
+
+ def _get_npu_driver_version():
+     """
+     Get the NPU driver version using PowerShell directly.
+     Returns the driver version string or None if not found.
+     """
+     try:
+
+         # Use PowerShell directly to avoid wmi issues in embedded Python environments
+         powershell_cmd = [
+             "powershell",
+             "-NoProfile",
+             "-ExecutionPolicy",
+             "Bypass",
+             "-Command",
+             (
+                 "Get-WmiObject -Class Win32_PnPSignedDriver | "
+                 'Where-Object { $_.DeviceName -like "*NPU Compute Accelerator Device*" } | '
+                 "Select-Object -ExpandProperty DriverVersion"
+             ),
+         ]
+
+         result = subprocess.run(
+             powershell_cmd, capture_output=True, text=True, check=True, timeout=30
+         )
+
+         driver_version = result.stdout.strip()
+
+         if driver_version and driver_version != "":
+             return driver_version
+         else:
+             return None
+
+     except Exception:  # pylint: disable=broad-except
+         return None
+
+
+ def _compare_driver_versions(current_version, required_version):
+     """
+     Compare two driver version strings.
+     Returns True if current_version >= required_version, False otherwise.
+     Uses packaging.version for proper semantic version comparison.
+     """
+     from packaging.version import Version
+
+     return Version(current_version) >= Version(required_version)
+
+
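For reference, the check above relies on packaging's PEP 440-style comparison, which handles four-part Windows driver versions numerically rather than as strings. A minimal editorial sketch (not part of the package), using hypothetical driver versions:

    from packaging.version import Version

    # Hypothetical driver version strings, compared component by component
    assert Version("32.0.203.240") >= Version("32.0.203.237")      # newer driver passes
    assert not Version("32.0.203.186") >= Version("32.0.203.237")  # older driver fails the check
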
+ def import_error_helper(e: Exception):
+     """
+     Raise an ImportError with a helpful installation message in the event of an import error
+     """
+     raise ImportError(
+         f"{e}\n Please install lemonade-sdk with "
+         "one of the oga extras, for example:\n"
+         "pip install lemonade-sdk[dev,oga-cpu]\n"
+         "See https://lemonade-server.ai/install_options.html for details"
+     )
+
+
+ def _open_driver_install_page():
+     """
+     Opens the driver installation page in the user's default web browser.
+     """
+     try:
+         driver_page_url = "https://lemonade-server.ai/driver_install.html"
+         printing.log_info(f"Opening driver installation guide: {driver_page_url}")
+         webbrowser.open(driver_page_url)
+     except Exception as e:  # pylint: disable=broad-except
+         printing.log_info(f"Could not open browser automatically: {e}")
+         printing.log_info(
+             "Please visit https://lemonade-server.ai/driver_install.html "
+             "for driver installation instructions."
+         )
+
+
+ class OgaLoad(FirstTool):
+     """
+     Tool that loads an LLM with OnnxRuntime-GenAI (OGA) for the CPU, DirectML (igpu),
+     NPU, hybrid, or CUDA execution providers.
+
+     Input: path to a checkpoint.
+         Supported choices for cpu and igpu from HF model repository:
+             LLM models on Hugging Face supported by model_builder. See the documentation
+             (https://github.com/lemonade-sdk/lemonade/blob/main/docs/dev_cli/ort_genai_igpu.md)
+             for supported models.
+         Supported choices for npu from HF model repository:
+             Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern.
+         Local models for cpu, igpu, or npu:
+             The specified checkpoint is converted to a local path by mapping it to lower case
+             and replacing '/' with '_'. If this model already exists in the 'oga_models' folder
+             of the lemonade cache and has a subfolder <device>-<dtype>, then that copy
+             will be used. If the --force flag is used and the model is built with model_builder,
+             then it will be rebuilt.
+
+     Output:
+         state.model: handle to a Huggingface-style LLM loaded on the selected device
+         state.tokenizer: Huggingface-style LLM tokenizer instance
+         state.dtype: data type of the loaded model
+         state.checkpoint: name of the checkpoint used to load state.model
+
+     Note: This tool expects the onnxruntime-genai-directml library to be pre-installed.
+         If that library is not installed, this tool will not load.
+     """
+
+     unique_name = "oga-load"
+
+     def __init__(self):
+         super().__init__(monitor_message="Loading OnnxRuntime-GenAI model")
+
+         self.status_stats = [
+             Keys.DTYPE,
+             Keys.DEVICE,
+             Keys.LOCAL_MODEL_FOLDER,
+         ]
+
+     @staticmethod
+     def parser(add_help: bool = True) -> argparse.ArgumentParser:
+         parser = __class__.helpful_parser(
+             short_description="Load model in onnxruntime-genai (OGA)",
+             add_help=add_help,
+         )
+
+         parser.add_argument(
+             "-ip",
+             "--input_path",
+             default="",
+             help="Path to a local Hugging Face model on disk",
+         )
+
+         parser.add_argument(
+             "-d",
+             "--device",
+             choices=["igpu", "npu", "cpu", "hybrid", "cuda"],
+             default="igpu",
+             help="Which device to load the model onto (default: igpu)",
+         )
+
+         parser.add_argument(
+             "--dtype",
+             choices=["int4", "fp16", "fp32"],
+             required=True,
+             help="Data type to load the model in",
+         )
+
+         parser.add_argument(
+             "--int4-block-size",
+             default=None,
+             help="Specify the block_size for int4 quantization.",
+             choices=[16, 32, 64, 128, 256],
+             type=int,
+         )
+
+         parser.add_argument(
+             "--force",
+             action="store_true",
+             help="Forces downloading of the Hugging Face model again (if changed). Additionally, for"
+             " cpu and igpu devices only, forces model_builder to run again on the HF model"
+             " (changed or not).",
+         )
+
+         parser.add_argument(
+             "--download-only",
+             action="store_true",
+             help="Download the model if needed, but don't load it",
+         )
+
+         parser.add_argument(
+             "--trust-remote-code",
+             action="store_true",
+             help="Set this flag to use models whose code is on the Hugging Face hub rather "
+             "than natively in the OnnxRuntime Gen AI libraries. Please review the model code "
+             "in advance as this is a security risk.",
+         )
+
+         parser.add_argument(
+             "--subfolder",
+             default=None,
+             help="Subfolder where the model is located: <LEMONADE CACHE>/oga_models/<MODELNAME>"
+             "/<SUBFOLDER>, default is <EP for device>-<dtype>. The EPs are: "
+             f'{", ".join([value + " for " + key for key, value in execution_providers.items()])}.',
+         )
+
+         return parser
+
+     @staticmethod
+     def _validate_model_configuration(device, dtype, checkpoint):
+         """
+         Validate whether the device, dtype, platform, and checkpoint combination is consistent
+         with Hugging Face checkpoint naming conventions, specifically for AMD models targeting
+         the NPU and hybrid flows.
+
+         Returns True if device, dtype, and model are consistent.
+         """
+
+         hf_supported_models = {
+             "cpu": {"int4": "*/*", "fp32": "*/*"},
+             "igpu": {"int4": "*/*", "fp16": "*/*"},
+             "npu": {"int4": "*/*"},
+             "hybrid": {"int4": "*/*"},
+             "cuda": {"int4": "*/*", "fp16": "*/*"},
+         }
+
+         hf_supported = (
+             device in hf_supported_models
+             and dtype in hf_supported_models[device]
+             and fnmatch(checkpoint, hf_supported_models[device][dtype])
+         )
+         return hf_supported
+
+     @staticmethod
+     def _setup_model_paths(
+         state, checkpoint, device, dtype, subfolder, int4_block_size
+     ):
+         """
+         Determines and returns the following model path information for models produced by the
+         OGA model builder:
+
+             full_model_path - Full path to where the OGA model files are stored.
+             oga_models_subfolder - The subfolder of the oga_models folder where the model files
+                 are stored.  (<full_model_path> = <oga_models>/<oga_models_subfolder>)
+                 This subfolder is usually
+                     <checkpoint_string>/<device>-<dtype>[-block-<int4_block_size>]
+                 but if the subfolder argument is not None it overrides the latter part
+                 of this path.
+             model_exists_locally - True if full_model_path is a folder that contains files
+
+         Note: Model files already in ONNX format on Hugging Face will be stored in the
+             Hugging Face cache, not this folder.  The <oga_models> folder contains model
+             files that have been locally quantized/converted to OGA format and any other
+             models that have been manually added by the user.
+         """
+         from lemonade.common.network import custom_snapshot_download
+
+         if subfolder is None:
+             subfolder = f"{execution_providers[device]}-{dtype}"
+             subfolder += (
+                 f"-block-{int4_block_size}"
+                 if dtype == "int4" and int4_block_size is not None
+                 else ""
+             )
+
+         # First, check in the lemonade oga_models cache
+         oga_models_subfolder = os.path.join(
+             checkpoint.replace("/", "_").lower(), subfolder
+         )
+         full_model_path = os.path.join(
+             state.cache_dir, oga_models_path, oga_models_subfolder
+         )
+         model_exists_locally = os.path.isdir(full_model_path) and os.listdir(
+             full_model_path
+         )
+
+         # If not found in lemonade cache, check in Hugging Face cache
+         if not model_exists_locally:
+             try:
+                 snapshot_path = custom_snapshot_download(
+                     checkpoint,
+                     local_files_only=True,
+                 )
+
+                 # Check if the snapshot contains ONNX files
+                 if os.path.isdir(snapshot_path) and os.listdir(snapshot_path):
+                     is_onnx_model = any(
+                         filename.endswith(".onnx")
+                         for filename in os.listdir(snapshot_path)
+                     )
+
+                     if is_onnx_model:
+                         # If the model is in HF cache and has ONNX files, use it
+                         full_model_path = snapshot_path
+                         model_exists_locally = True
+                         printing.log_info(
+                             f"Found ONNX model in Hugging Face cache: {full_model_path}"
+                         )
+             except Exception as e:  # pylint: disable=broad-exception-caught
+                 # Log any errors but continue with the original path
+                 printing.log_info(f"Error checking Hugging Face cache: {e}")
+
+         return full_model_path, model_exists_locally
+
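To make the cache layout described above concrete, here is an editorial sketch (not part of the package) of the path mapping, reusing the execution_providers and oga_models_path definitions from the top of this file; the checkpoint name and cache directory are hypothetical:

    import os

    cache_dir = os.path.expanduser("~/.cache/lemonade")  # assumed cache location
    checkpoint = "example-org/Example-LLM-1B"            # hypothetical checkpoint
    device, dtype = "igpu", "int4"

    subfolder = f"{execution_providers[device]}-{dtype}"  # -> "dml-int4"
    oga_models_subfolder = os.path.join(checkpoint.replace("/", "_").lower(), subfolder)
    full_model_path = os.path.join(cache_dir, oga_models_path, oga_models_subfolder)
    # -> <cache_dir>/oga_models/example-org_example-llm-1b/dml-int4
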
+     @staticmethod
+     def _setup_model_dependencies(full_model_path, device, ryzenai_version, oga_path):
+         # pylint: disable=unused-argument
+         """
+         Sets up model dependencies for hybrid and NPU inference by:
+         1. Configuring the custom_ops_library path in genai_config.json.
+         2. Adding DLL source directories to PATH for dependent DLL discovery.
+         3. Checking the NPU driver version if required for the device and ryzenai_version.
+         """
+
+         # For RyzenAI 1.6.0, check NPU driver version for NPU and hybrid devices
+         if device in ["npu", "hybrid"]:
+             required_driver_version = REQUIRED_NPU_DRIVER_VERSION
+
+             current_driver_version = _get_npu_driver_version()
+             rai_version, _ = _get_ryzenai_version_info(device)
+
+             if not current_driver_version:
+                 printing.log_warning(
+                     f"NPU driver not found. {device.upper()} inference requires NPU driver "
+                     f"version {required_driver_version}.\n"
+                     "Please download and install the NPU Driver from:\n"
+                     f"{NPU_DRIVER_DOWNLOAD_URL}\n"
+                     "NPU functionality may not work properly."
+                 )
+                 _open_driver_install_page()
+
+             elif not _compare_driver_versions(
+                 current_driver_version, required_driver_version
+             ):
+                 printing.log_warning(
+                     f"Incorrect NPU driver version detected: {current_driver_version}\n"
+                     f"{device.upper()} inference with RyzenAI {rai_version} requires driver "
+                     f"version {required_driver_version} or higher.\n"
+                     "Please download and install the correct NPU Driver from:\n"
+                     f"{NPU_DRIVER_DOWNLOAD_URL}\n"
+                     "NPU functionality may not work properly."
+                 )
+                 _open_driver_install_page()
+
+         # Setup DLL paths for NPU/hybrid inference
+         # Use sys.prefix to get the environment root (works for both venv and conda)
+         # - Conda: sys.executable is at env/python.exe, sys.prefix is env/
+         # - Venv: sys.executable is at .venv/Scripts/python.exe, sys.prefix is .venv/
+         env_path = sys.prefix
+         dll_source_path = os.path.join(
+             env_path, "Lib", "site-packages", "onnxruntime_genai"
+         )
+         required_dlls = ["libutf8_validity.dll", "abseil_dll.dll"]
+
+         # Validate that all required DLLs exist in the source directory
+         missing_dlls = []
+
+         for dll_name in required_dlls:
+             dll_source = os.path.join(dll_source_path, dll_name)
+             if not os.path.exists(dll_source):
+                 missing_dlls.append(dll_source)
+
+         if missing_dlls:
+             dll_list = "\n - ".join(missing_dlls)
+             raise RuntimeError(
+                 f"Required DLLs not found for {device} inference:\n - {dll_list}\n"
+                 f"Please ensure your RyzenAI installation is complete and supports {device}.\n"
+                 "Please reinstall the RyzenAI Software for your platform. Run:\n"
+                 " pip install lemonade-sdk[oga-ryzenai]\n"
+             )
+
+         # Add the DLL source directory to PATH
+         current_path = os.environ.get("PATH", "")
+         if dll_source_path not in current_path:
+             os.environ["PATH"] = dll_source_path + os.pathsep + current_path
+
+     @staticmethod
+     def _is_preoptimized_model(input_model_path):
+         """
+         Checks if the 'custom_ops_library' field exists in the genai_config.json file
+         to determine whether this is a pre-optimized model for hybrid as well as
+         NPU-only flows.
+
+         Args:
+             input_model_path (str): Path to the input model directory.
+
+         Returns:
+             bool: True if 'custom_ops_library' exists, False otherwise.
+         """
+         config_path = os.path.join(input_model_path, "genai_config.json")
+         if not os.path.exists(config_path):
+             printing.log_info(f"Model's `genai_config.json` not found in {config_path}")
+             return False
+
+         with open(config_path, "r", encoding="utf-8") as f:
+             config = json.load(f)
+             if (
+                 "model" in config
+                 and "decoder" in config["model"]
+                 and "session_options" in config["model"]["decoder"]
+             ):
+                 return "custom_ops_library" in config["model"]["decoder"]["session_options"]
+             return False
+
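As an editorial illustration (not shipped in the package), the genai_config.json structure that _is_preoptimized_model inspects looks roughly like the following Python dict; the DLL path is an example value:

    example_genai_config = {
        "model": {
            "decoder": {
                "session_options": {
                    # Presence of this key marks a model pre-optimized for NPU/hybrid
                    "custom_ops_library": "onnx_custom_ops.dll",
                }
            }
        }
    }
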
+     @staticmethod
+     def _download_and_build_safetensors_model(
+         checkpoint, device, dtype, full_model_path, int4_block_size, input_path, state
+     ):
+         """
+         Uses the OGA model builder to quantize a safetensors-format model and convert it
+         to ONNX format.  The model files are saved to the full_model_path folder.
+         """
+
+         try:
+             import onnxruntime_genai.models.builder as model_builder
+         except ImportError as e:
+             import_error_helper(e)
+
+         printing.log_info(f"Building {checkpoint} for {device} using {dtype}")
+         extra_options = {}
+         if int4_block_size is not None:
+             extra_options["int4-block-size"] = int4_block_size
+         try:
+             model_builder.create_model(
+                 checkpoint,
+                 input_path,
+                 full_model_path,
+                 dtype,
+                 execution_providers[device],
+                 os.path.join(state.cache_dir, oga_model_builder_cache_path),
+                 **extra_options,
+             )
+         except NotImplementedError as e:
+             raise NotImplementedError("[Model builder] " + str(e)) from e
+         except OSError as e:
+             raise ValueError("[Model builder] " + str(e)) from e
+
+         return full_model_path
+
+     @staticmethod
+     def _load_model_and_setup_state(
+         state, full_model_path, checkpoint, trust_remote_code
+     ):
+         """
+         Loads the OGA model from a local folder and then loads the tokenizer.
+         Auto-detects whether we are offline.
+         """
+
+         try:
+             from lemonade.tools.oga.utils import OrtGenaiModel, OrtGenaiTokenizer
+             from lemonade.common.network import is_offline
+         except ImportError as e:
+             import_error_helper(e)
+
+         try:
+             state.model = OrtGenaiModel(full_model_path)
+         except Exception as e:
+             if "invalid unordered_map<K, T>" in str(e):
+                 raise ValueError(
+                     "Error initializing model: Invalid configuration detected.\n"
+                     "Please check the following:\n"
+                     f"1. Please check your model's config file in {full_model_path} "
+                     "and ensure custom_ops_library points to the valid "
+                     "onnx_custom_ops.dll path.\n"
+                     "2. Make sure the NPU driver is loaded.\n"
+                     "3. Make sure hybrid has been installed on a Ryzen AI "
+                     f"{'or '.join(SUPPORTED_RYZEN_AI_SERIES)}-series processor."
+                 ) from e
+             raise
+
+         # Auto-detect offline mode
+         offline = is_offline()
+
+         try:
+             from transformers import AutoTokenizer
+         except ImportError as e:
+             import_error_helper(e)
+
+         try:
+             # Always try to use local files first
+             local_files_only = True
+
+             hf_tokenizer = AutoTokenizer.from_pretrained(
+                 full_model_path,
+                 local_files_only=local_files_only,
+                 trust_remote_code=trust_remote_code,
+             )
+         except ValueError as e:
+             if "trust_remote_code" in str(e):
+                 raise ValueError(
+                     "This model requires you to execute code from the repo. Please review it "
+                     "and if you trust it, then use the `--trust-remote-code` flag with oga-load."
+                 )
+
+             if offline and "Can't load tokenizer for" in str(e):
+                 raise ValueError(
+                     f"Cannot load tokenizer for {checkpoint} in offline mode. "
+                     f"The tokenizer files may not be available locally in {full_model_path}."
+                 )
+             raise
+
+         state.tokenizer = OrtGenaiTokenizer(
+             state.model.model,
+             hf_tokenizer,
+         )
+
+         status.add_to_state(state=state, name=checkpoint, model=checkpoint)
+
+     @staticmethod
+     def _cleanup_environment(saved_state):
+         """
+         Restores environment to its original state after inference is complete.
+         """
+         if saved_state:
+             os.chdir(saved_state["cwd"])
+             os.environ["PATH"] = saved_state["path"]
+
+     def _generate_model_for_oga(self, output_model_path, device, input_model_path):
+         """
+         Uses the model_generate tool to generate the model for OGA hybrid or npu targets.
+         """
+         try:
+             import model_generate
+         except ImportError as e:
+             raise ImportError(
+                 f"{e}\nYou are trying to use a developer tool that may not be "
+                 "installed. Please install the required package using:\n"
+                 "pip install -e .[dev,oga-ryzenai] \
+                 --extra-index-url https://pypi.amd.com/simple"
+             )
+
+         # Determine the appropriate flag based on the device type
+         if device == "hybrid":
+             device_flag = "hybrid"
+         elif device == "npu":
+             device_flag = "npu"
+         else:
+             raise ValueError(f"Unsupported device type for model generation: {device}")
+
+         printing.log_info(
+             f"Generating model for device: {device_flag}, \
+             input: {input_model_path}, output: {output_model_path}"
+         )
+
+         try:
+             if device_flag == "npu":
+                 model_generate.generate_npu_model(
+                     input_model=input_model_path,
+                     output_dir=output_model_path,
+                     packed_const=False,
+                 )
+             else:  # hybrid
+                 model_generate.generate_hybrid_model(
+                     input_model=input_model_path,
+                     output_dir=output_model_path,
+                     script_option="jit_npu",
+                     mode="bf16",
+                     dml_only=False,
+                 )
+         except Exception as e:
+             raise RuntimeError(
+                 f"Failed to generate model for {device_flag} device. Error: {e}"
+             ) from e
+
+     def run(
+         self,
+         state: State,
+         input: str,
+         input_path: str = "",
+         device: str = "igpu",
+         dtype: str = "int4",
+         int4_block_size: int = None,
+         force: bool = False,
+         download_only: bool = False,
+         trust_remote_code=False,
+         subfolder: str = None,
+         do_not_upgrade: bool = False,
+     ) -> State:
+         from lemonade.common.network import (
+             custom_snapshot_download,
+             get_base_model,
+             is_offline,
+         )
+
+         # Auto-detect offline status
+         offline = is_offline()
+         if offline:
+             printing.log_warning(
+                 "Network connectivity to huggingface.co not detected. Running in offline mode."
+             )
+
+         state.device = device
+         state.dtype = dtype
+
+         # Log initial stats
+         state.save_stat(Keys.DTYPE, dtype)
+         state.save_stat(Keys.DEVICE, device)
+         if device in ["hybrid", "npu"]:
+             ryzenai_version, _ = _get_ryzenai_version_info(device)
+             ryzen_ai_version_info = {"version": ryzenai_version}
+             state.save_stat(Keys.RYZEN_AI_VERSION_INFO, ryzen_ai_version_info)
+
+         # Check if input is a local folder
+         if os.path.isdir(input):
+             # input is a local folder
+             full_model_path = os.path.abspath(input)
+             checkpoint = "local_model"
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+             state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+             # See if there is a file ending in ".onnx" in this folder
+             has_onnx_file = find_onnx_files_recursively(input)
+             if not has_onnx_file:
+                 raise ValueError(
+                     f"The folder {input} does not contain an ONNX model file."
+                 )
+             if force:
+                 raise ValueError(
+                     "Your input (-i, --input) points to a local folder, which is not "
+                     "compatible with the force argument."
+                 )
+
+         else:
+             # input is a model checkpoint
+             checkpoint = input
+             state.checkpoint = checkpoint
+             state.save_stat(Keys.CHECKPOINT, checkpoint)
+
+             # Get base model information
+             if not offline:
+                 base_model = get_base_model(checkpoint)
+                 if base_model is not None:
+                     state.save_stat("base_model", base_model)
+
+             # Setup paths
+             full_model_path, model_exists_locally = self._setup_model_paths(
+                 state, checkpoint, device, dtype, subfolder, int4_block_size
+             )
+
+             # If in offline mode, we can only use locally available models
+             if offline and not model_exists_locally:
+                 raise ValueError(
+                     f"Model {checkpoint} is not available locally for {device} with {dtype}. "
+                     f"Cannot download in offline mode. Check {full_model_path}"
+                 )
+
+             # Handle download/build if needed
+             if (not model_exists_locally) or force:
+                 if offline:
+                     raise ValueError(
+                         f"Cannot download or build model {checkpoint} in offline mode"
+                     )
+
+                 # Validate configuration
+                 hf_supported = self._validate_model_configuration(
+                     device, dtype, checkpoint
+                 )
+
+                 if not hf_supported:
+                     raise ValueError(
+                         "The (device, dtype, checkpoint) combination is not supported: "
+                         f"({device}, {dtype}, {checkpoint})"
+                     )
+                 input_model_path = custom_snapshot_download(
+                     checkpoint,
+                     ignore_patterns=["*.md", "*.txt"],
+                     local_files_only=offline or do_not_upgrade,
+                 )
+                 # Check if model is ONNX or safetensors
+                 is_onnx_model = any(
+                     [
+                         filename.endswith(".onnx")
+                         for filename in os.listdir(input_model_path)
+                     ]
+                 )
+                 is_preoptimized_onnx = is_onnx_model and self._is_preoptimized_model(
+                     input_model_path
+                 )
+                 is_safetensors_model = any(
+                     [
+                         filename.endswith(".safetensors")
+                         for filename in os.listdir(input_model_path)
+                     ]
+                 )
+                 if not (is_onnx_model or is_safetensors_model):
+                     raise ValueError(
+                         f"The model {checkpoint} is not supported. "
+                         "It does not contain ONNX or safetensors files."
+                     )
+                 if device in ["npu", "hybrid"]:
+                     if is_onnx_model:
+                         if is_preoptimized_onnx:
+                             # Use HuggingFace cache path as it is
+                             full_model_path = input_model_path
+                         else:
+                             # If ONNX but not modified yet for Hybrid or NPU,
+                             # needs further optimization
+                             self._generate_model_for_oga(
+                                 full_model_path,
+                                 device,
+                                 input_model_path,
+                             )
+                     elif is_safetensors_model:
+                         config_path = os.path.join(input_model_path, "config.json")
+                         if os.path.exists(config_path):
+                             with open(config_path, "r", encoding="utf-8") as f:
+                                 config = json.load(f)
+                                 if "quantization_config" in config:
+                                     # If quantized, use subprocess to generate the model
+                                     self._generate_model_for_oga(
+                                         full_model_path, device, input_model_path
+                                     )
+                                 else:
+                                     raise ValueError(
+                                         f"The safetensors model {checkpoint} is not quantized. "
+                                         "Only quantized safetensors models are supported"
+                                         " on npu or hybrid targets."
+                                     )
+                         else:
+                             raise ValueError(
+                                 f"config.json not found for safetensors model: {checkpoint}"
+                             )
+                     else:
+                         raise ValueError(
+                             f"Unsupported model type for checkpoint: {checkpoint}"
+                         )
+                 else:
+                     if is_onnx_model:
+                         # Use HuggingFace cache path as it is
+                         full_model_path = input_model_path
+                     else:
+                         self._download_and_build_safetensors_model(
+                             checkpoint,
+                             device,
+                             dtype,
+                             full_model_path,
+                             int4_block_size,
+                             input_path,
+                             state,
+                         )
+             state.save_stat(Keys.LOCAL_MODEL_FOLDER, full_model_path)
+
+         # Load model if download-only argument is not set
+         if not download_only:
+             # Get version information for NPU/Hybrid devices
+             if device in ["hybrid", "npu"]:
+                 ryzenai_version, oga_path = _get_ryzenai_version_info(device)
+             else:
+                 ryzenai_version, oga_path = None, None
+
+             saved_env_state = None
+
+             # Setup model dependencies for NPU/Hybrid devices
+             if device in ["hybrid", "npu"]:
+                 self._setup_model_dependencies(
+                     full_model_path, device, ryzenai_version, oga_path
+                 )
+
+             try:
+                 if device == "npu":
+                     # Set USE_AIE_RoPE based on model type
+                     os.environ["USE_AIE_RoPE"] = (
+                         "0" if "phi-" in checkpoint.lower() else "1"
+                     )
+
+                 self._load_model_and_setup_state(
+                     state, full_model_path, checkpoint, trust_remote_code
+                 )
+             finally:
+                 self._cleanup_environment(saved_env_state)
+
+         return state
801
+
802
+
803
+ # This file was originally licensed under Apache 2.0. It has been modified.
804
+ # Modifications Copyright (c) 2025 AMD
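For context, a hedged sketch of driving OgaLoad programmatically; the State constructor arguments, cache directory, and checkpoint below are assumptions for illustration and not an excerpt from the package:

    from lemonade.state import State
    from lemonade.tools.oga.load import OgaLoad

    # Assumed State constructor arguments; the real signature may differ.
    state = State(cache_dir="~/.cache/lemonade", build_name="oga_demo")

    # The checkpoint is hypothetical; device/dtype mirror the oga-load parser flags above.
    state = OgaLoad().run(
        state,
        input="example-org/Example-LLM-1B",
        device="igpu",
        dtype="int4",
    )
    print(type(state.model), type(state.tokenizer))

The same device and dtype options are exposed on the dev CLI through the oga-load tool's --device and --dtype flags defined in the parser above.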