lemonade-sdk: lemonade_sdk-8.1.4-py3-none-any.whl → lemonade_sdk-8.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lemonade-sdk might be problematic. Click here for more details.

Files changed (53):
  1. lemonade/cache.py +6 -1
  2. lemonade/cli.py +47 -5
  3. lemonade/common/inference_engines.py +13 -4
  4. lemonade/common/status.py +4 -4
  5. lemonade/common/system_info.py +544 -1
  6. lemonade/profilers/agt_power.py +437 -0
  7. lemonade/profilers/hwinfo_power.py +429 -0
  8. lemonade/tools/accuracy.py +143 -48
  9. lemonade/tools/adapter.py +6 -1
  10. lemonade/tools/bench.py +26 -8
  11. lemonade/tools/flm/__init__.py +1 -0
  12. lemonade/tools/flm/utils.py +303 -0
  13. lemonade/tools/huggingface/bench.py +6 -1
  14. lemonade/tools/llamacpp/bench.py +146 -27
  15. lemonade/tools/llamacpp/load.py +30 -2
  16. lemonade/tools/llamacpp/utils.py +393 -33
  17. lemonade/tools/oga/bench.py +5 -26
  18. lemonade/tools/oga/load.py +60 -121
  19. lemonade/tools/oga/migration.py +403 -0
  20. lemonade/tools/report/table.py +76 -8
  21. lemonade/tools/server/flm.py +133 -0
  22. lemonade/tools/server/llamacpp.py +220 -553
  23. lemonade/tools/server/serve.py +684 -168
  24. lemonade/tools/server/static/js/chat.js +666 -342
  25. lemonade/tools/server/static/js/model-settings.js +24 -3
  26. lemonade/tools/server/static/js/models.js +597 -73
  27. lemonade/tools/server/static/js/shared.js +79 -14
  28. lemonade/tools/server/static/logs.html +191 -0
  29. lemonade/tools/server/static/styles.css +491 -66
  30. lemonade/tools/server/static/webapp.html +83 -31
  31. lemonade/tools/server/tray.py +158 -38
  32. lemonade/tools/server/utils/macos_tray.py +226 -0
  33. lemonade/tools/server/utils/{system_tray.py → windows_tray.py} +13 -0
  34. lemonade/tools/server/webapp.py +4 -1
  35. lemonade/tools/server/wrapped_server.py +559 -0
  36. lemonade/version.py +1 -1
  37. lemonade_install/install.py +54 -611
  38. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/METADATA +29 -72
  39. lemonade_sdk-8.2.2.dist-info/RECORD +83 -0
  40. lemonade_server/cli.py +145 -37
  41. lemonade_server/model_manager.py +521 -37
  42. lemonade_server/pydantic_models.py +28 -1
  43. lemonade_server/server_models.json +246 -92
  44. lemonade_server/settings.py +39 -39
  45. lemonade/tools/quark/__init__.py +0 -0
  46. lemonade/tools/quark/quark_load.py +0 -173
  47. lemonade/tools/quark/quark_quantize.py +0 -439
  48. lemonade_sdk-8.1.4.dist-info/RECORD +0 -77
  49. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/WHEEL +0 -0
  50. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/entry_points.txt +0 -0
  51. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/LICENSE +0 -0
  52. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/licenses/NOTICE.md +0 -0
  53. {lemonade_sdk-8.1.4.dist-info → lemonade_sdk-8.2.2.dist-info}/top_level.txt +0 -0
@@ -1,39 +1,39 @@
1
- import json
2
- import os
3
- from lemonade.cache import DEFAULT_CACHE_DIR
4
-
5
- # Define the path for the user settings file, placing it in the cache directory
6
- USER_SETTINGS_FILE = os.path.join(DEFAULT_CACHE_DIR, "user_settings.json")
7
-
8
-
9
- def save_setting(key, value):
10
- """Save a setting to the user_settings.json file."""
11
- # Ensure the cache directory exists
12
- os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True)
13
-
14
- settings = {}
15
- if os.path.exists(USER_SETTINGS_FILE):
16
- with open(USER_SETTINGS_FILE, "r") as f:
17
- try:
18
- settings = json.load(f)
19
- except json.JSONDecodeError:
20
- # If the file is empty or corrupt, start with a fresh dictionary
21
- pass
22
-
23
- settings[key] = value
24
- with open(USER_SETTINGS_FILE, "w") as f:
25
- json.dump(settings, f, indent=4)
26
-
27
-
28
- def load_setting(key, default=None):
29
- """Load a setting from the user_settings.json file."""
30
- if not os.path.exists(USER_SETTINGS_FILE):
31
- return default
32
-
33
- with open(USER_SETTINGS_FILE, "r") as f:
34
- try:
35
- settings = json.load(f)
36
- return settings.get(key, default)
37
- except json.JSONDecodeError:
38
- # Return default if the file is empty or corrupt
39
- return default
1
+ import json
2
+ import os
3
+ from lemonade.cache import DEFAULT_CACHE_DIR
4
+
5
+ # Define the path for the user settings file, placing it in the cache directory
6
+ USER_SETTINGS_FILE = os.path.join(DEFAULT_CACHE_DIR, "user_settings.json")
7
+
8
+
9
+ def save_setting(key, value):
10
+ """Save a setting to the user_settings.json file."""
11
+ # Ensure the cache directory exists
12
+ os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True)
13
+
14
+ settings = {}
15
+ if os.path.exists(USER_SETTINGS_FILE):
16
+ with open(USER_SETTINGS_FILE, "r") as f:
17
+ try:
18
+ settings = json.load(f)
19
+ except json.JSONDecodeError:
20
+ # If the file is empty or corrupt, start with a fresh dictionary
21
+ pass
22
+
23
+ settings[key] = value
24
+ with open(USER_SETTINGS_FILE, "w") as f:
25
+ json.dump(settings, f, indent=4)
26
+
27
+
28
+ def load_setting(key, default=None):
29
+ """Load a setting from the user_settings.json file."""
30
+ if not os.path.exists(USER_SETTINGS_FILE):
31
+ return default
32
+
33
+ with open(USER_SETTINGS_FILE, "r") as f:
34
+ try:
35
+ settings = json.load(f)
36
+ return settings.get(key, default)
37
+ except json.JSONDecodeError:
38
+ # Return default if the file is empty or corrupt
39
+ return default
File without changes
@@ -1,173 +0,0 @@
1
- import argparse
2
- import os
3
- import sys
4
-
5
- from lemonade.state import State
6
- from lemonade.tools import Tool
7
- import lemonade.common.printing as printing
8
- import lemonade.common.build as build
9
- from lemonade_install.install import DEFAULT_QUARK_DIR
10
-
11
-
12
- class QuarkLoad(Tool):
13
- """
14
- Load a model Quantized and exported using Quark.
15
- Required Input State:
16
- - state.model: Pretrained model instance to be quantized.
17
- - state.tokenizer: Tokenizer instance from Hugging Face.
18
- Output:
19
- - state of the loaded model
20
-
21
- See docs/dev_cli/quark.md for more details.
22
- """
23
-
24
- unique_name = "quark-load"
25
-
26
- def __init__(self):
27
- super().__init__(monitor_message="Load Quark Quantized model")
28
-
29
- @staticmethod
30
- def parser(add_help: bool = True) -> argparse.ArgumentParser:
31
- parser = __class__.helpful_parser(
32
- short_description="Load a quantized model using Quark",
33
- add_help=add_help,
34
- )
35
-
36
- parser.add_argument(
37
- "--quant-scheme",
38
- type=str,
39
- required=True,
40
- default=None,
41
- help="Supported quantization schemes in Quark",
42
- )
43
-
44
- parser.add_argument(
45
- "--quant-algo",
46
- type=str,
47
- required=True,
48
- default=None,
49
- choices=["awq", "gptq", "autosmoothquant", None],
50
- help="Supported quantization algorithms in Quark",
51
- )
52
-
53
- parser.add_argument(
54
- "--torch-compile", action="store_true", help="Model torch compile"
55
- )
56
-
57
- parser.add_argument(
58
- "--safetensors-model-reload",
59
- action="store_true",
60
- help="Safetensors model reload",
61
- )
62
-
63
- parser.add_argument(
64
- "--safetensors-model-dir",
65
- default=None,
66
- help="Directory of safetensors model",
67
- )
68
-
69
- parser.add_argument(
70
- "--params-load", action="store_true", help="Model parameters load"
71
- )
72
-
73
- parser.add_argument("--json-path", help="Specify the path of saved json file")
74
-
75
- parser.add_argument(
76
- "--safetensors-path",
77
- default=None,
78
- help="Specify the path of saved safetensors file",
79
- )
80
-
81
- return parser
82
-
83
- def run(
84
- self,
85
- state: State,
86
- quant_scheme: str,
87
- quant_algo: str,
88
- torch_compile: bool = False,
89
- safetensors_model_reload: bool = False,
90
- safetensors_model_dir: str = None,
91
- params_load: bool = False,
92
- json_path: str = None,
93
- safetensors_path: str = None,
94
- ) -> State:
95
- """
96
- Executes the QuarkLoad process.
97
- Returns:
98
- State: The updated state after loading the model.
99
- Raises:
100
- Exception: If an error occurs during the QuarkLoad process.
101
- """
102
-
103
- import torch
104
-
105
- try:
106
- if os.path.isdir(DEFAULT_QUARK_DIR):
107
- quark_llm_path = os.path.join(
108
- DEFAULT_QUARK_DIR, "examples", "torch", "language_modeling"
109
- )
110
- sys.path.insert(0, quark_llm_path)
111
- else:
112
- raise FileNotFoundError(
113
- f"The directory {DEFAULT_QUARK_DIR} does not exist. \
114
- Please check your installation."
115
- )
116
-
117
- # Default load path specific to recipe
118
- # This will NOT work
119
- # The default path is now uniquely craeated with timestamp
120
- # Default load path will not work. Need to pass explicit load path
121
- model_export_path = os.path.join(
122
- build.output_dir(state.cache_dir, state.build_name),
123
- "exported_model",
124
- quant_scheme,
125
- quant_algo,
126
- )
127
-
128
- # Set default paths only if current values are None
129
- if safetensors_model_dir is None:
130
- safetensors_model_dir = model_export_path
131
- if safetensors_path is None:
132
- safetensors_path = os.path.join(model_export_path, "model.safetensors")
133
- printing.log_info("Loading model ...")
134
- if not params_load and not safetensors_model_reload:
135
- raise ValueError(
136
- " Specify load format: 'params_load' or 'safetensors_model_reload'."
137
- )
138
-
139
- # Reload quantized model if specified
140
- from quark.torch import load_params, import_model_info
141
-
142
- if params_load:
143
- printing.log_info(
144
- "Restoring quantized model from JSON/safetensors files"
145
- )
146
- model = load_params(
147
- model,
148
- json_path=json_path,
149
- safetensors_path=safetensors_path,
150
- )
151
- elif safetensors_model_reload:
152
- printing.log_info(
153
- "Restoring quantized model from quark_safetensors files"
154
- )
155
- model = import_model_info(model, model_info_dir=safetensors_model_dir)
156
-
157
- if torch_compile:
158
- printing.log_info("torch.compile...")
159
- model = torch.compile(model)
160
-
161
- state.model = model
162
- state.dtype = model.dtype
163
-
164
- printing.log_info("Quark Load process completed.")
165
-
166
- except Exception as e:
167
- printing.log_error(f"An error occurred during the QuarkLoad process: {e}")
168
- raise
169
- return state
170
-
171
-
172
- # This file was originally licensed under Apache 2.0. It has been modified.
173
- # Modifications Copyright (c) 2025 AMD
@@ -1,439 +0,0 @@
1
- import argparse
2
- import os
3
- import sys
4
- from pathlib import Path
5
- from lemonade.state import State
6
- from lemonade.tools import Tool
7
- import lemonade.common.printing as printing
8
- import lemonade.common.build as build
9
- from lemonade_install.install import DEFAULT_QUARK_DIR
10
-
11
-
12
- class QuarkQuantize(Tool):
13
- """
14
- Quantize a model using the Quark Quantization tool.
15
-
16
- This Tool performs the following steps:
17
- 1. Downloads and extracts necessary resources from AMD Quark Web Page.
18
- 2. Based on the target model, it prepares the model, tokenizer, and calibration data.
19
- 3. Optionally quantizes, freezes, and exports the model.
20
- 4. Optionally evaluates the model.
21
-
22
- Required Input State:
23
- - state.model.model: Pretrained model instance to be quantized.
24
- - state.tokenizer: Tokenizer instance from Hugging Face.
25
- Output:
26
- - Modifies `state` with quantized and optionally exported model.
27
-
28
- See docs/dev_cli/quark.md for more details.
29
- """
30
-
31
- unique_name = "quark-quantize"
32
-
33
- def __init__(self):
34
- super().__init__(monitor_message="Quark Quantizing model")
35
-
36
- @staticmethod
37
- def parser(add_help: bool = True) -> argparse.ArgumentParser:
38
- parser = __class__.helpful_parser(
39
- short_description="Quantize a model using Quark",
40
- add_help=add_help,
41
- )
42
- parser.add_argument(
43
- "--device",
44
- default="cpu",
45
- choices=["cuda", "cpu"],
46
- help="Device for running the quantizer",
47
- )
48
- parser.add_argument("--multi-gpu", action="store_true")
49
- parser.add_argument(
50
- "--data-type",
51
- default="auto",
52
- choices=["auto", "float16", "bfloat16", "float32"],
53
- help="Input datatype of the model",
54
- )
55
- parser.add_argument(
56
- "--seq-len", type=int, default=512, help="Sequence length of data"
57
- )
58
- parser.add_argument(
59
- "--batch-size", type=int, default=1, help="Batch size for calibration."
60
- )
61
- parser.add_argument(
62
- "--num-fewshot",
63
- type=int,
64
- default=None,
65
- metavar="N",
66
- help="Number of examples in few-shot context",
67
- )
68
- parser.add_argument(
69
- "--output-dir", default=None, help="Output directory for exported model"
70
- )
71
- parser.add_argument(
72
- "--no-weight-matrix-merge",
73
- action="store_true",
74
- help="If set, merges onnx model and weight \
75
- together before export.\
76
- By default, for onnx export, spits out a model.onnx and a model.weights",
77
- )
78
- parser.add_argument(
79
- "--dataset",
80
- default="pileval",
81
- choices=[
82
- "pileval",
83
- "wikitext",
84
- "pileval_for_awq_benchmark",
85
- "wikitext_for_gptq_benchmark",
86
- "HuggingFaceH4/ultrachat_200k",
87
- ],
88
- help="Dataset for calibration",
89
- )
90
- parser.add_argument(
91
- "--num-calib-data",
92
- type=int,
93
- default=512,
94
- help="Number of samples for calibration.",
95
- )
96
-
97
- # See docs/dev_cli/quark.md for more details.
98
- parser.add_argument(
99
- "--quant-scheme",
100
- type=str,
101
- default=None,
102
- choices=[
103
- "w_fp8_a_fp8",
104
- "w_int4_per_channel_sym",
105
- "w_uint4_per_group_asym",
106
- "w_int4_per_group_sym",
107
- "w_uint4_a_bfloat16_per_group_asym",
108
- "w_int8_per_tensor_sym",
109
- "w_int8_per_group_sym",
110
- "w_uint8_per_group_asym",
111
- "w_int8_a_int8_per_tensor_sym",
112
- "w_int8_a_int8_per_tensor_sym_dynamic",
113
- "w_uint8_a_uint8_per_tensor_asym",
114
- "w_fp8_a_fp8_o_fp8",
115
- "w_mx_fp8",
116
- "w_mx_fp8_a_mx_fp8",
117
- "w_int8_a_int8_per_token_dynamic",
118
- "w_bfp16",
119
- "w_bfp16_a_bfp16",
120
- "w_mx6",
121
- "w_mx6_a_mx6",
122
- "w_fp8_per_channel_sym",
123
- "w_int4_per_channel_asym",
124
- "w_int4_per_group_asym",
125
- "w_uint4_per_group_sym",
126
- "w_uint4_per_channel_sym",
127
- "w_uint4_per_channel_asym",
128
- "w_int8_per_tensor_percentile",
129
- "w_int8_per_tensor_mse",
130
- "w_uint8_per_tensor_percentile",
131
- "w_uint8_per_tensor_mse",
132
- "w_mx_fp4_per_group_sym",
133
- "w_mx_fp6_e3m2_per_group_sym",
134
- "w_mx_fp6_e2m3_per_group_sym",
135
- "w_mx_int8_per_group_sym",
136
- "w_uint4_per_channel_a_int8_per_tensor",
137
- "w_uint4_per_group_a_int8_per_tensor",
138
- "w_bfp16_per_group_sym",
139
- None,
140
- ],
141
- help="Supported quantization schemes in Quark",
142
- )
143
- parser.add_argument(
144
- "--quant-algo",
145
- type=str,
146
- default=None,
147
- choices=["awq", "gptq", "autosmoothquant", None],
148
- help="Support quantization algorithms in Quark",
149
- )
150
- parser.add_argument(
151
- "--pre-optimization-config-file-path",
152
- type=str,
153
- default=None,
154
- help="The JSON file path of pre-optimization config",
155
- )
156
- parser.add_argument(
157
- "--quant-algo-config-file-path",
158
- type=str,
159
- default=None,
160
- help="The JSON file path of quantization algorithm config",
161
- )
162
- parser.add_argument(
163
- "--group-size",
164
- type=int,
165
- default=128,
166
- help="Group size for per_group quantization",
167
- )
168
- parser.add_argument(
169
- "--pack-method",
170
- type=str,
171
- default="reorder",
172
- choices=["order", "reorder"],
173
- help="Pack method for awq_export",
174
- )
175
- parser.add_argument(
176
- "--exclude-layers",
177
- type=str,
178
- nargs="*",
179
- default=None,
180
- help="List of layers to exclude from quantization.",
181
- )
182
- parser.add_argument(
183
- "--kv-cache-dtype",
184
- default=None,
185
- choices=["fp8", None],
186
- help="KV Cache dtype.",
187
- )
188
- parser.add_argument(
189
- "--pre-quantization-optimization",
190
- action="append",
191
- default=[],
192
- choices=["rotation", "smoothquant"],
193
- help="Pre Quantization Optimization.",
194
- )
195
- parser.add_argument(
196
- "--model-export",
197
- default=None,
198
- action="append",
199
- choices=[
200
- None,
201
- "onnx",
202
- "vllm_adopted_safetensors",
203
- "quark_safetensors",
204
- "gguf",
205
- ],
206
- help="Model export format",
207
- )
208
- parser.add_argument(
209
- "--custom-mode",
210
- default="quark",
211
- type=str,
212
- choices=["quark", "awq", "fp8"],
213
- help="Custom mode for export \
214
- This is especially relevant for npu/hybrid export",
215
- )
216
- parser.add_argument(
217
- "--torch-compile",
218
- action="store_true",
219
- help="Compile the quantized model using torch.compile",
220
- )
221
- parser.add_argument(
222
- "--params-save", action="store_true", help="Save model params"
223
- )
224
- parser.add_argument(
225
- "--save-dir",
226
- help="Directory to save model parameters as \
227
- safetensors or pth, in the case when --params_save is used.",
228
- )
229
- parser.add_argument(
230
- "--log-severity-level", type=int, default=3, help="DEBUG=1, INFO=2, ERROR=3"
231
- )
232
- parser.add_argument("--skip-quantization", action="store_true")
233
-
234
- return parser
235
-
236
- def run(self, state: State, **kwargs) -> State:
237
- """
238
- Executes the QuarkQuantize process.
239
-
240
- Args:
241
- state (State): The current state of the process, containing necessary
242
- information such as cache directory and build name.
243
- **kwargs: Additional keyword arguments that may include:
244
- - output_dir (str): Directory to save the output model.
245
- - safetensors_model_dir (str): Directory to save the safetensors model.
246
- - save_dir (str): Directory to save model parameters.
247
- - safetensors_path (str): Path to the safetensors model.
248
- - quant_algo (str): The quantization algorithm to use.
249
- - quant_algo_config_file_path (str): Path to the quantization algorithm
250
- configuration file.
251
- - model_dir (str): Directory of the model.
252
- Returns:
253
- State: The updated state after the quantization process.
254
- Raises:
255
- Exception: If an error occurs during the QuarkQuantize process
256
- and when installation path does not exist.
257
- """
258
-
259
- try:
260
-
261
- if os.path.isdir(DEFAULT_QUARK_DIR):
262
- quark_llm_path = os.path.join(
263
- DEFAULT_QUARK_DIR, "examples", "torch", "language_modeling"
264
- )
265
- sys.path.extend([quark_llm_path])
266
- else:
267
- raise FileNotFoundError(
268
- f"The directory {DEFAULT_QUARK_DIR} does not exist. \
269
- Please check your installation."
270
- )
271
- model_build_path = os.path.join(
272
- build.output_dir(state.cache_dir, state.build_name)
273
- )
274
- model_export_path = os.path.join(
275
- model_build_path,
276
- "exported_model",
277
- kwargs.get("quant_scheme"),
278
- kwargs.get("quant_algo"),
279
- )
280
- # Set default paths only if current values are None
281
- if kwargs.get("model_dir") is None:
282
- kwargs["model_dir"] = model_build_path
283
- if kwargs.get("output_dir") is None:
284
- kwargs["output_dir"] = model_export_path
285
- if kwargs.get("save_dir") is None:
286
- kwargs["save_dir"] = os.path.join(model_export_path, "model_params")
287
-
288
- from llm_utils.model_preparation import get_model_type
289
-
290
- model_type = get_model_type(state.model.model)
291
-
292
- quant_algo = kwargs.get("quant_algo")
293
- kwargs["quant_algo_config_file_path"] = os.path.join(
294
- quark_llm_path,
295
- "llm_ptq",
296
- "models",
297
- model_type,
298
- f"{quant_algo}_config.json",
299
- )
300
-
301
- self._quantize(state, **kwargs)
302
-
303
- except Exception as e:
304
- printing.log_error(f"Error during the QuarkQuantize process: {e}")
305
- raise
306
- return state
307
-
308
- def _quantize(self, state: State, **kwargs) -> None:
309
- """
310
- Main quantization and export process.
311
-
312
- This method is responsible for:
313
- - Loading the model and tokenizer.
314
- - Preparing the calibration dataset.
315
- - Quantizing the model.
316
- - Optionally exporting, compiling, and evaluating the model.
317
- """
318
-
319
- import torch
320
- from transformers import AutoProcessor
321
-
322
- # Importing quark utils after adding to sys.path
323
- from llm_utils.data_preparation import get_calib_dataloader
324
- from llm_utils.model_preparation import get_model_type
325
- from llm_ptq.configuration_preparation import get_config, get_export_config
326
- from quark.torch import ModelQuantizer, ModelExporter, save_params
327
-
328
- model = state.model.model
329
- tokenizer = state.tokenizer
330
-
331
- # 1. Load Model
332
- printing.log_info("Loading model ...")
333
- model_type = get_model_type(model)
334
-
335
- # [mllama specifics]
336
- if model_type == "mllama" and kwargs.get("model_export") is not None:
337
- processor = AutoProcessor.from_pretrained(kwargs.get("model_dir"))
338
- export_dir = Path(kwargs.get("output_dir"))
339
- export_dir.mkdir(parents=True, exist_ok=True)
340
- processor.save_pretrained(kwargs.get("output_dir"))
341
-
342
- # 2. Load dataset
343
- printing.log_info("Loading dataset ...")
344
- main_device = model.device if kwargs.get("multi_gpu") else kwargs.get("device")
345
- calib_dataloader = get_calib_dataloader(
346
- dataset_name=kwargs.get("dataset"),
347
- tokenizer=tokenizer,
348
- batch_size=1,
349
- num_calib_data=kwargs.get("num_calib_data"),
350
- seqlen=kwargs.get("seq_len"),
351
- device=main_device,
352
- )
353
-
354
- # 3. Quantize model
355
- if not kwargs.get("skip_quantization"):
356
- printing.log_info("Starting quantization process ...")
357
- args = argparse.Namespace(**kwargs)
358
- quant_config = get_config(args, model_type)
359
- quant_config.log_severity_level = kwargs.get("log_severity_level", 3)
360
- quantizer = ModelQuantizer(quant_config)
361
- model = quantizer.quantize_model(model, calib_dataloader)
362
- printing.log_info("Quantization completed.")
363
-
364
- if (
365
- kwargs.get("model_export") is not None
366
- or kwargs.get("params_save")
367
- or kwargs.get("torch_compile")
368
- ):
369
- printing.log_info("Freezing the quantized model ...")
370
- model = quantizer.freeze(model)
371
-
372
- # 4. Export model
373
- if kwargs.get("model_export") is not None:
374
- printing.log_info("Exporting the model ...")
375
- export_path = kwargs.get("output_dir")
376
-
377
- args = argparse.Namespace(**kwargs)
378
- export_config = get_export_config(args, model_type)
379
- exporter = ModelExporter(config=export_config, export_dir=export_path)
380
- if "quark_safetensors" in kwargs.get("model_export"):
381
- printing.log_info("Exporting quark native json and safetensors...")
382
- with torch.no_grad():
383
- quant_config = get_config(args, model_type)
384
- exporter.export_model_info(
385
- model,
386
- quant_config=quant_config,
387
- tokenizer=tokenizer,
388
- custom_mode=kwargs.get("custom_mode"),
389
- )
390
- if "vllm_adopted_safetensors" in kwargs.get("model_export"):
391
- printing.log_info("Exporting vllm adopted json and safetensors...")
392
- with torch.inference_mode():
393
- exporter.export_model_info(
394
- model,
395
- model_type=model_type,
396
- model_dtype=state.dtype,
397
- export_type="vllm-adopt",
398
- )
399
- if "onnx" in kwargs.get("model_export"):
400
- printing.log_info("Exporting onnx graph...")
401
- with torch.inference_mode():
402
- batch_iter = iter(calib_dataloader)
403
- input_args = next(batch_iter)
404
- if kwargs.get("quant_scheme") in [
405
- "w_int4_per_channel_sym",
406
- "w_uint4_per_group_asym",
407
- "w_int4_per_group_sym",
408
- "w_uint4_a_bfloat16_per_group_asym",
409
- ]:
410
- uint4_int4_flag = True
411
- else:
412
- uint4_int4_flag = False
413
- exporter.export_onnx_model(
414
- model, input_args, uint4_int4_flag=uint4_int4_flag
415
- )
416
- if "gguf" in kwargs.get("model_export"):
417
- printing.log_info("Exporting gguf model...")
418
- with torch.inference_mode():
419
- exporter.export_gguf_model(
420
- model, kwargs.get("model_dir"), model_type
421
- )
422
-
423
- # 6. [Optional] Compile model
424
- if kwargs.get("torch_compile"):
425
- printing.log_info("torch.compile...")
426
- model = torch.compile(model)
427
-
428
- # 7. Save model parameters
429
- if kwargs.get("params_save"):
430
- printing.log_info("Saving model parameters ...")
431
- save_params(model, model_type=model_type, export_dir=kwargs.get("save_dir"))
432
-
433
- state.model.model = model
434
- state.dtype = model.dtype
435
- printing.log_info("QuarkQuantize process completed.")
436
-
437
-
438
- # This file was originally licensed under Apache 2.0. It has been modified.
439
- # Modifications Copyright (c) 2025 AMD