mct-nightly 2.3.0.20250323.559__py3-none-any.whl → 2.3.0.20250325.524__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {mct_nightly-2.3.0.20250323.559.dist-info → mct_nightly-2.3.0.20250325.524.dist-info}/METADATA +1 -1
  2. {mct_nightly-2.3.0.20250323.559.dist-info → mct_nightly-2.3.0.20250325.524.dist-info}/RECORD +17 -17
  3. {mct_nightly-2.3.0.20250323.559.dist-info → mct_nightly-2.3.0.20250325.524.dist-info}/WHEEL +1 -1
  4. model_compression_toolkit/__init__.py +1 -1
  5. model_compression_toolkit/core/common/graph/base_graph.py +14 -4
  6. model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py +32 -96
  7. model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +17 -42
  8. model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +179 -60
  9. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py +22 -10
  10. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py +1 -5
  11. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py +14 -94
  12. model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +132 -312
  13. model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py +1 -1
  14. model_compression_toolkit/core/pytorch/reader/graph_builders.py +2 -0
  15. model_compression_toolkit/core/runner.py +2 -12
  16. {mct_nightly-2.3.0.20250323.559.dist-info → mct_nightly-2.3.0.20250325.524.dist-info}/licenses/LICENSE.md +0 -0
  17. {mct_nightly-2.3.0.20250323.559.dist-info → mct_nightly-2.3.0.20250325.524.dist-info}/top_level.txt +0 -0
model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py
@@ -12,11 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+import copy
+from collections import defaultdict

-from typing import Callable, Dict, List
+from tqdm import tqdm
+
+from typing import Dict, List, Tuple

 import numpy as np

+from model_compression_toolkit.constants import EPS
 from model_compression_toolkit.core.common import BaseNode
 from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation
 from model_compression_toolkit.core.common.framework_info import FrameworkInfo
@@ -29,7 +34,10 @@ from model_compression_toolkit.core.common.mixed_precision.resource_utilization_
     TargetInclusionCriterion, BitwidthMode
 from model_compression_toolkit.core.common.mixed_precision.mixed_precision_ru_helper import \
     MixedPrecisionRUHelper
+from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \
+    MixedPrecisionIntegerLPSolver
 from model_compression_toolkit.core.common.mixed_precision.sensitivity_evaluation import SensitivityEvaluation
+from model_compression_toolkit.core.common.substitutions.apply_substitutions import substitute
 from model_compression_toolkit.logger import Logger


@@ -43,8 +51,7 @@ class MixedPrecisionSearchManager:
                  fw_info: FrameworkInfo,
                  fw_impl: FrameworkImplementation,
                  sensitivity_evaluator: SensitivityEvaluation,
-                 target_resource_utilization: ResourceUtilization,
-                 original_graph: Graph = None):
+                 target_resource_utilization: ResourceUtilization):
         """

         Args:
@@ -54,96 +61,208 @@ class MixedPrecisionSearchManager:
             sensitivity_evaluator: A SensitivityEvaluation which provides a function that evaluates the sensitivity of
                 a bit-width configuration for the MP model.
             target_resource_utilization: Target Resource Utilization to bound our feasible solution space s.t the configuration does not violate it.
-            original_graph: In case we have a search over a virtual graph (if we have BOPS utilization target), then this argument
-                will contain the original graph (for config reconstruction purposes).
         """

-        self.graph = graph
-        self.original_graph = graph if original_graph is None else original_graph
         self.fw_info = fw_info
         self.fw_impl = fw_impl
+
+        self.original_graph = graph
+        # graph for mp search
+        self.mp_graph, self.using_virtual_graph = self._get_mp_graph(graph, target_resource_utilization)
+        del graph  # so that it's not used by mistake
+
         self.sensitivity_evaluator = sensitivity_evaluator
+        self.target_resource_utilization = target_resource_utilization
+
+        self.mp_topo_configurable_nodes = self.mp_graph.get_configurable_sorted_nodes(fw_info)
         self.layer_to_bitwidth_mapping = self.get_search_space()
-        self.compute_metric_fn = self.get_sensitivity_metric()
-        self._cuts = None

-        # To define RU Total constraints we need to compute weights and activations even if they have no constraints
-        # TODO currently this logic is duplicated in linear_programming.py
-        targets = target_resource_utilization.get_restricted_targets()
-        if RUTarget.TOTAL in targets:
-            targets = targets.union({RUTarget.ACTIVATION, RUTarget.WEIGHTS}) - {RUTarget.TOTAL}
-        self.ru_targets_to_compute = targets
+        self.ru_targets = target_resource_utilization.get_restricted_targets()
+        self.ru_helper = MixedPrecisionRUHelper(self.mp_graph, fw_info, fw_impl)

-        self.ru_helper = MixedPrecisionRUHelper(graph, fw_info, fw_impl)
-        self.target_resource_utilization = target_resource_utilization
-        self.min_ru_config = self.graph.get_min_candidates_config(fw_info)
-        self.max_ru_config = self.graph.get_max_candidates_config(fw_info)
-        self.min_ru = self.ru_helper.compute_utilization(self.ru_targets_to_compute, self.min_ru_config)
-        self.non_conf_ru_dict = self.ru_helper.compute_utilization(self.ru_targets_to_compute, None)
+        self.min_ru_config = self.mp_graph.get_min_candidates_config(fw_info)
+        self.max_ru_config = self.mp_graph.get_max_candidates_config(fw_info)
+        self.min_ru = self.ru_helper.compute_utilization(self.ru_targets, self.min_ru_config)

-        self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.graph,
+        self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.mp_graph,
                                                                        original_graph=self.original_graph)

-    def get_search_space(self) -> Dict[int, List[int]]:
+    def search(self) -> List[int]:
         """
-        The search space is a mapping from a node's index to a list of integers (possible bitwidths candidates indeces
-        for the node).
+        Run mixed precision search.

         Returns:
-            The entire search space of the graph.
+            Indices of the selected bit-widths candidates.
         """
+        candidates_sensitivity = self._build_sensitivity_mapping()
+        candidates_ru = self._compute_relative_ru_matrices()
+        rel_target_ru = self._get_relative_ru_constraint_per_mem_element()
+        solver = MixedPrecisionIntegerLPSolver(candidates_sensitivity, candidates_ru, rel_target_ru)
+        config = solver.run()

-        indices_mapping = {}
-        nodes_to_configure = self.graph.get_configurable_sorted_nodes(self.fw_info)
-        for idx, n in enumerate(nodes_to_configure):
-            # For each node, get all possible bitwidth indices for it
-            # (which is a list from 0 to the length of the candidates mp_config list of the node).
-            indices_mapping[idx] = list(range(len(n.candidates_quantization_cfg)))  # all search_methods space
-        return indices_mapping
+        if self.using_virtual_graph:
+            config = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(config)
+        return config

-    def get_sensitivity_metric(self) -> Callable:
+    def _get_relative_ru_constraint_per_mem_element(self) -> Dict[RUTarget, np.ndarray]:
         """
+        Computes resource utilization constraint with respect to the minimal bit configuration, i.e. corresponding
+        constraint for each memory element is the relative utilization between the target utilization and
+        element's utilization for min-bit configuration.
+
+        Returns:
+            A dictionary of relative resource utilization constraints per ru target.
+
+        Raises:
+            ValueError: if target resource utilization cannot be satisfied (utilization for the minimal bit
+                configuration exceeds the requested target utilization for any target).
+        """
+        target_ru = self.target_resource_utilization.get_resource_utilization_dict(restricted_only=True)
+        rel_target_ru = {
+            ru_target: ru - self.min_ru[ru_target] for ru_target, ru in target_ru.items()
+        }
+        unsatisfiable_targets = {
+            ru_target.value: target_ru[ru_target] for ru_target, ru in rel_target_ru.items() if any(ru < 0)
+        }
+        if unsatisfiable_targets:
+            raise ValueError(f"The model cannot be quantized to meet the specified resource utilization for the "
+                             f"following targets: {unsatisfiable_targets}")
+        return rel_target_ru
+
+    def _build_sensitivity_mapping(self, eps: float = EPS) -> Dict[int, Dict[int, float]]:
+        """
+        This function measures the sensitivity of a change in a bitwidth of a layer on the entire model.
+        It builds a mapping from a node's index, to its bitwidht's effect on the model sensitivity.
+        For each node and some possible node's bitwidth (according to the given search space), we use
+        the framework function compute_metric_fn in order to infer
+        a batch of images, and compute (using the inference results) the sensitivity metric of
+        the configured mixed-precision model.
+
+        Args:
+            eps: Epsilon value to manually increase metric value (if necessary) for numerical stability

-        Returns: Return a function (from the framework implementation) to compute a metric that
-        indicates the similarity of the mixed-precision model (to the float model) for a given
-        mixed-precision configuration.
+        Returns:
+            Mapping from each node's index in a graph, to a dictionary from the bitwidth index (of this node) to
+            the sensitivity of the model.

         """
-        # Get from the framework an evaluation function on how a MP configuration,
-        # affects the expected loss.

-        return self.sensitivity_evaluator.compute_metric
+        Logger.info('Starting to evaluate metrics')
+        layer_to_metrics_mapping = {}
+
+        compute_metric = self.sensitivity_evaluator.compute_metric
+        if self.using_virtual_graph:
+            origin_max_config = self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(
+                self.max_ru_config)
+            max_config_value = compute_metric(origin_max_config)
+        else:
+            max_config_value = compute_metric(self.max_ru_config)
+
+        for node_idx, layer_possible_bitwidths_indices in tqdm(self.layer_to_bitwidth_mapping.items(),
+                                                               total=len(self.layer_to_bitwidth_mapping)):
+            layer_to_metrics_mapping[node_idx] = {}
+
+            for bitwidth_idx in layer_possible_bitwidths_indices:
+                if self.max_ru_config[node_idx] == bitwidth_idx:
+                    # This is a computation of the metric for the max configuration, assign pre-calculated value
+                    layer_to_metrics_mapping[node_idx][bitwidth_idx] = max_config_value
+                    continue
+
+                # Create a configuration that differs at one layer only from the baseline model
+                mp_model_configuration = self.max_ru_config.copy()
+                mp_model_configuration[node_idx] = bitwidth_idx
+
+                # Build a distance matrix using the function we got from the framework implementation.
+                if self.using_virtual_graph:
+                    # Reconstructing original graph's configuration from virtual graph's configuration
+                    origin_mp_model_configuration = \
+                        self.config_reconstruction_helper.reconstruct_config_from_virtual_graph(
+                            mp_model_configuration,
+                            changed_virtual_nodes_idx=[node_idx],
+                            original_base_config=origin_max_config)
+                    origin_changed_nodes_indices = [i for i, c in enumerate(origin_max_config) if
+                                                    c != origin_mp_model_configuration[i]]
+                    metric_value = compute_metric(
+                        origin_mp_model_configuration,
+                        origin_changed_nodes_indices,
+                        origin_max_config)
+                else:
+                    metric_value = compute_metric(
+                        mp_model_configuration,
+                        [node_idx],
+                        self.max_ru_config)
+
+                layer_to_metrics_mapping[node_idx][bitwidth_idx] = max(metric_value, max_config_value + eps)

-    def compute_resource_utilization_matrix(self, target: RUTarget) -> np.ndarray:
+        # Finalize distance metric mapping
+        self.finalize_distance_metric(layer_to_metrics_mapping)
+
+        return layer_to_metrics_mapping
+
+    def _get_mp_graph(self, graph: Graph, target_resource_utilization: ResourceUtilization) -> Tuple[Graph, bool]:
         """
-        Computes and builds a resource utilization matrix, to be used for the mixed-precision search problem formalization.
-        Utilization is computed relative to the minimal configuration, i.e. utilization for it will be 0.
+        Get graph for mixed precision search. Virtual graph is built if bops is restricted and both activation and
+        weights are configurable.

         Args:
-            target: The resource target for which the resource utilization is calculated (a RUTarget value).
+            graph: input graph.
+            target_resource_utilization: target resource utilization.
+
+        Returns:
+            Graph for mixed precision search (virtual or original), and a boolean flag whether a virtual graph has been
+            constructed.
+        """
+        if (target_resource_utilization.bops_restricted() and
+                graph.has_any_configurable_activation() and
+                graph.has_any_configurable_weights()):
+            mp_graph = substitute(copy.deepcopy(graph),
+                                  self.fw_impl.get_substitutions_virtual_weights_activation_coupling())
+            return mp_graph, True
+
+        return graph, False
+
+    def get_search_space(self) -> Dict[int, List[int]]:
+        """
+        The search space is a mapping from a node's index to a list of integers (possible bitwidths candidates indeces
+        for the node).

         Returns:
-            A resource utilization matrix of shape (num configurations, num memory elements). Num memory elements
-            depends on the target, e.g. num nodes or num cuts, for which utilization is computed.
+            The entire search space of the graph.
         """
-        assert isinstance(target, RUTarget), f"{target} is not a valid resource target"

-        configurable_sorted_nodes = self.graph.get_configurable_sorted_nodes(self.fw_info)
+        indices_mapping = {}
+        for idx, n in enumerate(self.mp_topo_configurable_nodes):
+            # For each node, get all possible bitwidth indices for it
+            # (which is a list from 0 to the length of the candidates mp_config list of the node).
+            indices_mapping[idx] = list(range(len(n.candidates_quantization_cfg)))  # all search_methods space
+        return indices_mapping
+
+    def _compute_relative_ru_matrices(self) -> Dict[RUTarget, np.ndarray]:
+        """
+        Computes and builds a resource utilization matrix for all restricted targets, to be used for the
+        mixed-precision search problem formalization.
+        Utilization is computed relative to the minimal configuration, i.e. utilization for it will be 0.

-        ru_matrix = []
-        for c, c_n in enumerate(configurable_sorted_nodes):
+        Returns:
+            A dictionary containing resource utilization matrix of shape (num configurations, num memory elements)
+            per ru target. Num memory elements depends on the target, e.g. num cuts or 1 for cumulative metrics.
+        """
+        rus_per_candidate = defaultdict(list)
+        for c, c_n in enumerate(self.mp_topo_configurable_nodes):
             for candidate_idx in range(len(c_n.candidates_quantization_cfg)):
                 if candidate_idx == self.min_ru_config[c]:
-                    candidate_rus = self.min_ru[target]
+                    candidate_rus = self.min_ru
                 else:
-                    candidate_rus = self.compute_node_ru_for_candidate(c, candidate_idx, target)
+                    candidate_rus = self.compute_ru_for_candidate(c, candidate_idx)

-                ru_matrix.append(np.asarray(candidate_rus))
+                for target, ru in candidate_rus.items():
+                    rus_per_candidate[target].append(ru)

-        np_ru_matrix = np.array(ru_matrix) - self.min_ru[target]  # num configurations X num elements
-        return np_ru_matrix
+        # Each target contains a matrix of num configurations X num elements
+        relative_rus = {target: np.array(ru) - self.min_ru[target] for target, ru in rus_per_candidate.items()}
+        return relative_rus

-    def compute_node_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int, target: RUTarget) -> np.ndarray:
+    def compute_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int) -> Dict[RUTarget, np.ndarray]:
         """
         Computes a resource utilization vector after replacing the given node's configuration candidate in the minimal
         target configuration with the given candidate index.
@@ -151,13 +270,13 @@ class MixedPrecisionSearchManager:
         Args:
             conf_node_idx: The index of a node in a sorted configurable nodes list.
             candidate_idx: Quantization config candidate to be used for the node's resource utilization computation.
-            target: The target for which the resource utilization is calculated (a RUTarget value).

-        Returns: Node's resource utilization vector.
+        Returns:
+            Node's resource utilization vector.

         """
         cfg = self.replace_config_in_index(self.min_ru_config, conf_node_idx, candidate_idx)
-        return self.ru_helper.compute_utilization({target}, cfg)[target]
+        return self.ru_helper.compute_utilization(self.ru_targets, cfg)

     @staticmethod
     def replace_config_in_index(mp_cfg: List[int], idx: int, value: int) -> List[int]:
@@ -191,7 +310,7 @@ class MixedPrecisionSearchManager:
         act_qcs, w_qcs = self.ru_helper.get_quantization_candidates(config)
         ru = self.ru_helper.ru_calculator.compute_resource_utilization(
             target_criterion=TargetInclusionCriterion.AnyQuantized, bitwidth_mode=BitwidthMode.QCustom, act_qcs=act_qcs,
-            w_qcs=w_qcs, ru_targets=self.ru_targets_to_compute, allow_unused_qcs=True)
+            w_qcs=w_qcs, ru_targets=self.ru_targets, allow_unused_qcs=True)
         return ru

     def finalize_distance_metric(self, layer_to_metrics_mapping: Dict[int, Dict[int, float]]):
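
Note on the refactor above: instead of computing one utilization matrix per target on demand, the manager now assembles a sensitivity mapping, a per-target utilization matrix, and a per-target constraint, all expressed relative to the minimal-bit configuration, and hands them to MixedPrecisionIntegerLPSolver. The standalone NumPy sketch below (toy numbers, not the MCT API) illustrates that relative arithmetic for a single weights-memory target; in the actual code the utilization is kept per memory element (e.g. per activation cut), so the constraint is vector-valued rather than a single scalar.

import numpy as np

# Toy setup: 2 configurable nodes, 2 bit-width candidates each (index 0 = min-bit).
# Absolute weights-memory utilization per (node, candidate).
abs_ru = np.array([[100., 400.],   # node 0: e.g. 2-bit vs 8-bit weights
                   [200., 800.]])  # node 1
min_ru = abs_ru[:, 0].sum()        # utilization of the all-min-bit configuration: 300

# Relative matrix: extra utilization of choosing each candidate instead of the min-bit one.
rel_ru = abs_ru - abs_ru[:, [0]]   # [[0, 300], [0, 600]]

target = 900.                      # requested weights-memory budget
rel_target = target - min_ru       # 600 of headroom beyond the min-bit baseline

# A configuration (one candidate index per node) is feasible if its summed deltas fit the headroom.
config = [1, 0]                    # node 0 at the larger bit-width, node 1 at min-bit
print(rel_ru[np.arange(len(config)), config].sum() <= rel_target)  # True (300 <= 600)
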
model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py
@@ -51,25 +51,34 @@ class ResourceUtilization:
     bops: float = np.inf

     def weight_restricted(self):
-        return self.weights_memory < np.inf
+        return self._is_restricted(self.weights_memory)

     def activation_restricted(self):
-        return self.activation_memory < np.inf
+        return self._is_restricted(self.activation_memory)

     def total_mem_restricted(self):
-        return self.total_memory < np.inf
+        return self._is_restricted(self.total_memory)

     def bops_restricted(self):
-        return self.bops < np.inf
+        return self._is_restricted(self.bops)

-    def get_resource_utilization_dict(self) -> Dict[RUTarget, float]:
+    def get_resource_utilization_dict(self, restricted_only: bool = False) -> Dict[RUTarget, float]:
         """
-        Returns: a dictionary with the ResourceUtilization object's values for each resource utilization target.
+        Get resource utilization as a dictionary.
+
+        Args:
+            restricted_only: whether to include only targets with restricted utilization.
+
+        Returns:
+            A dictionary containing the resource utilization with targets as keys.
         """
-        return {RUTarget.WEIGHTS: self.weights_memory,
-                RUTarget.ACTIVATION: self.activation_memory,
-                RUTarget.TOTAL: self.total_memory,
-                RUTarget.BOPS: self.bops}
+        ru_dict = {RUTarget.WEIGHTS: self.weights_memory,
+                   RUTarget.ACTIVATION: self.activation_memory,
+                   RUTarget.TOTAL: self.total_memory,
+                   RUTarget.BOPS: self.bops}
+        if restricted_only:
+            ru_dict = {k: v for k, v in ru_dict.items() if self._is_restricted(v)}
+        return ru_dict

     def is_satisfied_by(self, ru: 'ResourceUtilization') -> bool:
         """
@@ -114,3 +123,6 @@ class ResourceUtilization:
         if RUTarget.BOPS in targets:
             summary.append(f"BOPS: {self.bops}")
         return ', '.join(summary)
+
+    def _is_restricted(self, v):
+        return v < np.inf
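
To illustrate the new restricted_only flag, here is a minimal self-contained mirror of just the fields and methods shown in the hunk above (a sketch for clarity; the RUTarget enum here is a stand-in, not the package's definition):

from dataclasses import dataclass
from enum import Enum

import numpy as np


class RUTarget(Enum):  # stand-in for the package's RUTarget enum
    WEIGHTS = 'weights'
    ACTIVATION = 'activation'
    TOTAL = 'total'
    BOPS = 'bops'


@dataclass
class ResourceUtilization:  # mirrors only what the diff above shows
    weights_memory: float = np.inf
    activation_memory: float = np.inf
    total_memory: float = np.inf
    bops: float = np.inf

    def get_resource_utilization_dict(self, restricted_only: bool = False):
        ru_dict = {RUTarget.WEIGHTS: self.weights_memory,
                   RUTarget.ACTIVATION: self.activation_memory,
                   RUTarget.TOTAL: self.total_memory,
                   RUTarget.BOPS: self.bops}
        if restricted_only:
            ru_dict = {k: v for k, v in ru_dict.items() if self._is_restricted(v)}
        return ru_dict

    def _is_restricted(self, v):
        return v < np.inf


ru = ResourceUtilization(weights_memory=2 ** 20)  # only weights memory is bounded
print(ru.get_resource_utilization_dict(restricted_only=True))  # only the WEIGHTS entry remains
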
model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py
@@ -431,8 +431,7 @@ class ResourceUtilizationCalculator:
         Returns:
             Node's activation utilization.
         """
-        if qc and bitwidth_mode != BitwidthMode.QCustom:
-            raise ValueError(self.unexpected_qc_error)
+        self._validate_custom_qcs(qc, bitwidth_mode)

         if target_criterion:
             # only check whether the node meets the criterion
@@ -470,9 +469,6 @@
             - Total BOPS count of the network.
             - Detailed BOPS count per node.
         """
-        self._validate_custom_qcs(act_qcs, bitwidth_mode)
-        self._validate_custom_qcs(w_qcs, bitwidth_mode)
-
         nodes_bops = {}
         for n in self.graph.get_topo_sorted_nodes():
             w_qc = w_qcs.get(n.name) if w_qcs else None
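
The removed inline check and the removed duplicate calls both funnel into the shared _validate_custom_qcs helper, whose body is not part of this diff. A standalone sketch of the rule it enforces, with a stand-in enum and an illustrative error message:

from enum import Enum


class BitwidthMode(Enum):  # stand-in for the calculator's BitwidthMode
    QCustom = 'custom'
    QMaxBit = 'max'
    QDefaultSP = 'default_sp'


def _validate_custom_qcs(qc, bitwidth_mode):
    # Same rule as the removed inline check: custom quantization configs are only
    # meaningful when the bit-width mode is QCustom.
    if qc and bitwidth_mode != BitwidthMode.QCustom:
        raise ValueError('Custom quantization configs are not expected for this bitwidth mode.')


_validate_custom_qcs(qc={'conv1': object()}, bitwidth_mode=BitwidthMode.QCustom)   # ok
# _validate_custom_qcs(qc={'conv1': object()}, bitwidth_mode=BitwidthMode.QMaxBit)  # raises ValueError
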
model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py
@@ -16,10 +16,7 @@ import copy
 from typing import Callable, Any

 from model_compression_toolkit.core import FrameworkInfo, ResourceUtilization, CoreConfig, QuantizationErrorMethod
-from model_compression_toolkit.core.common import Graph
 from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation
-from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \
-    RUTarget
 from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \
     ResourceUtilizationCalculator, BitwidthMode, TargetInclusionCriterion
 from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner
@@ -31,13 +28,10 @@ def compute_resource_utilization_data(in_model: Any,
                                       core_config: CoreConfig,
                                       fqc: FrameworkQuantizationCapabilities,
                                       fw_info: FrameworkInfo,
-                                      fw_impl: FrameworkImplementation,
-                                      transformed_graph: Graph = None,
-                                      mixed_precision_enable: bool = True) -> ResourceUtilization:
+                                      fw_impl: FrameworkImplementation) -> ResourceUtilization:
     """
-    Compute Resource Utilization information that can be relevant for defining target ResourceUtilization for mixed precision search.
-    Calculates maximal activation tensor size, the sum of the model's weight parameters and the total memory combining both weights
-    and maximal activation tensor size.
+    Compute Resource Utilization of a model with the default single precision quantization.
+    This can serve as a basis for defining target Resource Utilization for mixed precision search.

     Args:
         in_model: Model to build graph from (the model that intended to be quantized).
@@ -47,100 +41,26 @@ def compute_resource_utilization_data(in_model: Any,
             the attached framework operator's information.
         fw_info: Information needed for quantization about the specific framework.
         fw_impl: FrameworkImplementation object with a specific framework methods implementation.
-        transformed_graph: An internal graph representation of the input model. Defaults to None.
-            If no graph is provided, a graph will be constructed using the specified model.
-        mixed_precision_enable: Indicates if mixed precision is enabled, defaults to True.
-            If disabled, computes resource utilization using base quantization
-            configurations across all layers.

     Returns:
         ResourceUtilization: An object encapsulating the calculated resource utilization computations.

     """
-    core_config = _create_core_config_for_ru(core_config)
-    # We assume that the resource_utilization_data API is used to compute the model resource utilization for
-    # mixed precision scenario, so we run graph preparation under the assumption of enabled mixed precision.
-    if transformed_graph is None:
-        transformed_graph = graph_preparation_runner(in_model,
-                                                     representative_data_gen,
-                                                     core_config.quantization_config,
-                                                     fw_info,
-                                                     fw_impl,
-                                                     fqc,
-                                                     bit_width_config=core_config.bit_width_config,
-                                                     mixed_precision_enable=mixed_precision_enable,
-                                                     running_gptq=False)
-
-    ru_calculator = ResourceUtilizationCalculator(transformed_graph, fw_impl, fw_info)
-    ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, BitwidthMode.Q8Bit,
-                                                    ru_targets=set(RUTarget) - {RUTarget.BOPS})
-    ru.bops, _ = ru_calculator.compute_bops(TargetInclusionCriterion.AnyQuantized, BitwidthMode.Float)
-    return ru
-
-
-def requires_mixed_precision(in_model: Any,
-                             target_resource_utilization: ResourceUtilization,
-                             representative_data_gen: Callable,
-                             core_config: CoreConfig,
-                             fqc: FrameworkQuantizationCapabilities,
-                             fw_info: FrameworkInfo,
-                             fw_impl: FrameworkImplementation) -> bool:
-    """
-    The function checks whether the model requires mixed precision to meet the requested target resource utilization.
-    This is determined by whether the target memory usage of the weights is less than the available memory,
-    the target maximum size of an activation tensor is less than the available memory,
-    and the target number of BOPs is less than the available BOPs.
-    If any of these conditions are met, the function returns True. Otherwise, it returns False.
-
-    Args:
-        in_model: The model to be evaluated.
-        target_resource_utilization: The resource utilization of the target device.
-        representative_data_gen: A function that generates representative data for the model.
-        core_config: CoreConfig containing parameters of how the model should be quantized.
-        fqc: FrameworkQuantizationCapabilities object that models the inference target platform and
-             the attached framework operator's information.
-        fw_info: Information needed for quantization about the specific framework.
-        fw_impl: FrameworkImplementation object with a specific framework methods implementation.
-
-    Returns: A boolean indicating if mixed precision is needed.
-    """
-    # Any target resource utilization other than weights will always require MP calculation.
-    if target_resource_utilization.activation_restricted() or \
-            target_resource_utilization.total_mem_restricted() or \
-            target_resource_utilization.bops_restricted():
-        return True
-
-    core_config = _create_core_config_for_ru(core_config)
+    core_config = copy.deepcopy(core_config)
+    # For resource utilization graph_preparation_runner runs with gptq=False (the default value). HMSE is not supported
+    # without GPTQ and will raise an error later so we replace it with MSE.
+    if core_config.quantization_config.weights_error_method == QuantizationErrorMethod.HMSE:
+        core_config.quantization_config.weights_error_method = QuantizationErrorMethod.MSE

     transformed_graph = graph_preparation_runner(in_model,
-                                                 representative_data_gen,
-                                                 core_config.quantization_config,
-                                                 fw_info,
-                                                 fw_impl,
-                                                 fqc,
+                                                 representative_data_gen=representative_data_gen,
+                                                 quantization_config=core_config.quantization_config,
+                                                 fw_info=fw_info,
+                                                 fw_impl=fw_impl,
+                                                 fqc=fqc,
                                                  bit_width_config=core_config.bit_width_config,
                                                  mixed_precision_enable=False,
                                                  running_gptq=False)

     ru_calculator = ResourceUtilizationCalculator(transformed_graph, fw_impl, fw_info)
-    max_ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, BitwidthMode.QMaxBit,
-                                                        ru_targets=target_resource_utilization.get_restricted_targets())
-    return not target_resource_utilization.is_satisfied_by(max_ru)
-
-
-def _create_core_config_for_ru(core_config: CoreConfig) -> CoreConfig:
-    """
-    Create a core config to use for resource utilization computation.
-
-    Args:
-        core_config: input core config
-
-    Returns:
-        Core config for resource utilization.
-    """
-    core_config = copy.deepcopy(core_config)
-    # For resource utilization graph_preparation_runner runs with gptq=False (the default value). HMSE is not supported
-    # without GPTQ and will raise an error later so we replace it with MSE.
-    if core_config.quantization_config.weights_error_method == QuantizationErrorMethod.HMSE:
-        core_config.quantization_config.weights_error_method = QuantizationErrorMethod.MSE
-    return core_config
+    return ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, BitwidthMode.QDefaultSP)
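
The HMSE handling that used to live in _create_core_config_for_ru is now inlined at the top of compute_resource_utilization_data. A small sketch of that fallback, wrapped in a hypothetical helper name for illustration (only the QuantizationErrorMethod import is taken from this module; everything else is an assumption):

import copy

from model_compression_toolkit.core import QuantizationErrorMethod


def _single_precision_ru_config(core_config):
    """Hypothetical helper mirroring the fallback the simplified function now inlines."""
    core_config = copy.deepcopy(core_config)  # don't mutate the caller's config
    if core_config.quantization_config.weights_error_method == QuantizationErrorMethod.HMSE:
        # HMSE needs GPTQ; the utilization pass runs with running_gptq=False, so fall back to MSE.
        core_config.quantization_config.weights_error_method = QuantizationErrorMethod.MSE
    return core_config
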