mct-nightly 2.2.0.20250113.134913__py3-none-any.whl → 2.2.0.20250114.134534__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. {mct_nightly-2.2.0.20250113.134913.dist-info → mct_nightly-2.2.0.20250114.134534.dist-info}/METADATA +1 -1
  2. {mct_nightly-2.2.0.20250113.134913.dist-info → mct_nightly-2.2.0.20250114.134534.dist-info}/RECORD +102 -104
  3. model_compression_toolkit/__init__.py +2 -2
  4. model_compression_toolkit/core/common/framework_info.py +1 -3
  5. model_compression_toolkit/core/common/fusion/layer_fusing.py +6 -5
  6. model_compression_toolkit/core/common/graph/base_graph.py +20 -21
  7. model_compression_toolkit/core/common/graph/base_node.py +44 -17
  8. model_compression_toolkit/core/common/mixed_precision/mixed_precision_candidates_filter.py +7 -6
  9. model_compression_toolkit/core/common/mixed_precision/mixed_precision_ru_helper.py +187 -0
  10. model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +0 -6
  11. model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +35 -162
  12. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py +36 -62
  13. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py +668 -0
  14. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py +25 -202
  15. model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +74 -51
  16. model_compression_toolkit/core/common/mixed_precision/sensitivity_evaluation.py +3 -5
  17. model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py +2 -2
  18. model_compression_toolkit/core/common/pruning/greedy_mask_calculator.py +7 -6
  19. model_compression_toolkit/core/common/pruning/mask/per_channel_mask.py +0 -1
  20. model_compression_toolkit/core/common/pruning/mask/per_simd_group_mask.py +0 -1
  21. model_compression_toolkit/core/common/pruning/pruner.py +5 -3
  22. model_compression_toolkit/core/common/quantization/bit_width_config.py +6 -12
  23. model_compression_toolkit/core/common/quantization/filter_nodes_candidates.py +1 -2
  24. model_compression_toolkit/core/common/quantization/node_quantization_config.py +2 -2
  25. model_compression_toolkit/core/common/quantization/quantization_config.py +1 -1
  26. model_compression_toolkit/core/common/quantization/quantization_fn_selection.py +1 -1
  27. model_compression_toolkit/core/common/quantization/quantization_params_fn_selection.py +1 -1
  28. model_compression_toolkit/core/common/quantization/quantization_params_generation/error_functions.py +1 -1
  29. model_compression_toolkit/core/common/quantization/quantization_params_generation/power_of_two_selection.py +1 -1
  30. model_compression_toolkit/core/common/quantization/quantization_params_generation/qparams_activations_computation.py +1 -1
  31. model_compression_toolkit/core/common/quantization/quantization_params_generation/symmetric_selection.py +1 -1
  32. model_compression_toolkit/core/common/quantization/quantization_params_generation/uniform_selection.py +1 -1
  33. model_compression_toolkit/core/common/quantization/set_node_quantization_config.py +15 -14
  34. model_compression_toolkit/core/common/substitutions/batchnorm_reconstruction.py +1 -1
  35. model_compression_toolkit/core/common/substitutions/batchnorm_refusing.py +1 -1
  36. model_compression_toolkit/core/common/substitutions/shift_negative_activation.py +5 -5
  37. model_compression_toolkit/core/graph_prep_runner.py +12 -11
  38. model_compression_toolkit/core/keras/default_framework_info.py +1 -1
  39. model_compression_toolkit/core/keras/mixed_precision/configurable_weights_quantizer.py +1 -2
  40. model_compression_toolkit/core/keras/resource_utilization_data_facade.py +5 -6
  41. model_compression_toolkit/core/pytorch/back2framework/pytorch_model_builder.py +1 -1
  42. model_compression_toolkit/core/pytorch/default_framework_info.py +1 -1
  43. model_compression_toolkit/core/pytorch/mixed_precision/configurable_activation_quantizer.py +1 -1
  44. model_compression_toolkit/core/pytorch/mixed_precision/configurable_weights_quantizer.py +1 -1
  45. model_compression_toolkit/core/pytorch/resource_utilization_data_facade.py +4 -5
  46. model_compression_toolkit/core/runner.py +33 -60
  47. model_compression_toolkit/exporter/model_wrapper/keras/builder/node_to_quantizer.py +1 -1
  48. model_compression_toolkit/exporter/model_wrapper/pytorch/builder/node_to_quantizer.py +1 -1
  49. model_compression_toolkit/gptq/keras/quantization_facade.py +8 -9
  50. model_compression_toolkit/gptq/keras/quantizer/soft_rounding/symmetric_soft_quantizer.py +1 -1
  51. model_compression_toolkit/gptq/keras/quantizer/soft_rounding/uniform_soft_quantizer.py +1 -1
  52. model_compression_toolkit/gptq/keras/quantizer/ste_rounding/symmetric_ste.py +1 -1
  53. model_compression_toolkit/gptq/pytorch/quantization_facade.py +8 -9
  54. model_compression_toolkit/gptq/pytorch/quantizer/soft_rounding/symmetric_soft_quantizer.py +1 -1
  55. model_compression_toolkit/gptq/pytorch/quantizer/soft_rounding/uniform_soft_quantizer.py +1 -1
  56. model_compression_toolkit/gptq/pytorch/quantizer/ste_rounding/symmetric_ste.py +1 -1
  57. model_compression_toolkit/metadata.py +11 -10
  58. model_compression_toolkit/pruning/keras/pruning_facade.py +5 -6
  59. model_compression_toolkit/pruning/pytorch/pruning_facade.py +6 -7
  60. model_compression_toolkit/ptq/keras/quantization_facade.py +8 -9
  61. model_compression_toolkit/ptq/pytorch/quantization_facade.py +8 -9
  62. model_compression_toolkit/qat/keras/quantization_facade.py +5 -6
  63. model_compression_toolkit/qat/keras/quantizer/lsq/symmetric_lsq.py +1 -1
  64. model_compression_toolkit/qat/keras/quantizer/ste_rounding/symmetric_ste.py +1 -1
  65. model_compression_toolkit/qat/pytorch/quantization_facade.py +5 -9
  66. model_compression_toolkit/qat/pytorch/quantizer/lsq/symmetric_lsq.py +1 -1
  67. model_compression_toolkit/qat/pytorch/quantizer/lsq/uniform_lsq.py +1 -1
  68. model_compression_toolkit/qat/pytorch/quantizer/ste_rounding/symmetric_ste.py +1 -1
  69. model_compression_toolkit/qat/pytorch/quantizer/ste_rounding/uniform_ste.py +1 -1
  70. model_compression_toolkit/target_platform_capabilities/__init__.py +9 -0
  71. model_compression_toolkit/target_platform_capabilities/constants.py +1 -1
  72. model_compression_toolkit/target_platform_capabilities/schema/mct_current_schema.py +2 -2
  73. model_compression_toolkit/target_platform_capabilities/schema/schema_functions.py +18 -18
  74. model_compression_toolkit/target_platform_capabilities/schema/v1.py +13 -13
  75. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/__init__.py +6 -6
  76. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attach2fw.py +10 -10
  77. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attach2keras.py +3 -3
  78. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attach2pytorch.py +3 -2
  79. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/current_tpc.py +8 -8
  80. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework/target_platform_capabilities.py → targetplatform2framework/framework_quantization_capabilities.py} +40 -40
  81. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework/target_platform_capabilities_component.py → targetplatform2framework/framework_quantization_capabilities_component.py} +2 -2
  82. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/layer_filter_params.py +0 -1
  83. model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/operations_to_layers.py +8 -8
  84. model_compression_toolkit/target_platform_capabilities/tpc_io_handler.py +24 -24
  85. model_compression_toolkit/target_platform_capabilities/tpc_models/get_target_platform_capabilities.py +18 -18
  86. model_compression_toolkit/target_platform_capabilities/tpc_models/imx500_tpc/latest/__init__.py +3 -3
  87. model_compression_toolkit/target_platform_capabilities/tpc_models/imx500_tpc/v1/{tp_model.py → tpc.py} +31 -32
  88. model_compression_toolkit/target_platform_capabilities/tpc_models/qnnpack_tpc/latest/__init__.py +3 -3
  89. model_compression_toolkit/target_platform_capabilities/tpc_models/qnnpack_tpc/v1/{tp_model.py → tpc.py} +27 -27
  90. model_compression_toolkit/target_platform_capabilities/tpc_models/tflite_tpc/latest/__init__.py +4 -4
  91. model_compression_toolkit/target_platform_capabilities/tpc_models/tflite_tpc/v1/{tp_model.py → tpc.py} +27 -27
  92. model_compression_toolkit/trainable_infrastructure/common/get_quantizers.py +1 -2
  93. model_compression_toolkit/trainable_infrastructure/common/trainable_quantizer_config.py +2 -1
  94. model_compression_toolkit/trainable_infrastructure/keras/activation_quantizers/lsq/symmetric_lsq.py +1 -2
  95. model_compression_toolkit/trainable_infrastructure/keras/config_serialization.py +1 -1
  96. model_compression_toolkit/xquant/common/model_folding_utils.py +7 -6
  97. model_compression_toolkit/xquant/keras/keras_report_utils.py +4 -4
  98. model_compression_toolkit/xquant/pytorch/pytorch_report_utils.py +3 -3
  99. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_aggregation_methods.py +0 -105
  100. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_functions_mapping.py +0 -33
  101. model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_methods.py +0 -528
  102. model_compression_toolkit/target_platform_capabilities/target_platform/__init__.py +0 -23
  103. {mct_nightly-2.2.0.20250113.134913.dist-info → mct_nightly-2.2.0.20250114.134534.dist-info}/LICENSE.md +0 -0
  104. {mct_nightly-2.2.0.20250113.134913.dist-info → mct_nightly-2.2.0.20250114.134534.dist-info}/WHEEL +0 -0
  105. {mct_nightly-2.2.0.20250113.134913.dist-info → mct_nightly-2.2.0.20250114.134534.dist-info}/top_level.txt +0 -0
  106. /model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attribute_filter.py +0 -0
@@ -13,27 +13,23 @@
13
13
  # limitations under the License.
14
14
  # ==============================================================================
15
15
  import copy
16
- from collections import defaultdict
16
+ from typing import Callable, Any
17
17
 
18
- import numpy as np
19
- from typing import Callable, Any, Dict, Tuple
20
-
21
- from model_compression_toolkit.logger import Logger
22
- from model_compression_toolkit.constants import FLOAT_BITWIDTH, BITS_TO_BYTES
23
18
  from model_compression_toolkit.core import FrameworkInfo, ResourceUtilization, CoreConfig, QuantizationErrorMethod
24
19
  from model_compression_toolkit.core.common import Graph
25
20
  from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation
26
- from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX
21
+ from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \
22
+ RUTarget
23
+ from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \
24
+ ResourceUtilizationCalculator, BitwidthMode, TargetInclusionCriterion
27
25
  from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner
28
- from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities
29
- from model_compression_toolkit.target_platform_capabilities.schema.mct_current_schema import QuantizationConfigOptions
30
- from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import calc_graph_cuts
26
+ from model_compression_toolkit.target_platform_capabilities import FrameworkQuantizationCapabilities
31
27
 
32
28
 
33
29
  def compute_resource_utilization_data(in_model: Any,
34
30
  representative_data_gen: Callable,
35
31
  core_config: CoreConfig,
36
- tpc: TargetPlatformCapabilities,
32
+ fqc: FrameworkQuantizationCapabilities,
37
33
  fw_info: FrameworkInfo,
38
34
  fw_impl: FrameworkImplementation,
39
35
  transformed_graph: Graph = None,
@@ -47,7 +43,7 @@ def compute_resource_utilization_data(in_model: Any,
47
43
  in_model: Model to build graph from (the model that is intended to be quantized).
48
44
  representative_data_gen: Dataset used for calibration.
49
45
  core_config: CoreConfig containing parameters of how the model should be quantized.
50
- tpc: TargetPlatformCapabilities object that models the inference target platform and
46
+ fqc: FrameworkQuantizationCapabilities object that models the inference target platform and
51
47
  the attached framework operator's information.
52
48
  fw_info: Information needed for quantization about the specific framework.
53
49
  fw_impl: FrameworkImplementation object with a specific framework methods implementation.
@@ -70,183 +66,23 @@ def compute_resource_utilization_data(in_model: Any,
70
66
  core_config.quantization_config,
71
67
  fw_info,
72
68
  fw_impl,
73
- tpc,
69
+ fqc,
74
70
  bit_width_config=core_config.bit_width_config,
75
- mixed_precision_enable=mixed_precision_enable)
76
-
77
- # Compute parameters sum
78
- weights_memory_bytes, weights_params = compute_nodes_weights_params(graph=transformed_graph, fw_info=fw_info)
79
- total_weights_params = 0 if len(weights_params) == 0 else sum(weights_params)
80
-
81
- # Compute max activation tensor
82
- activation_output_sizes_bytes, activation_output_sizes = compute_activation_output_maxcut_sizes(graph=transformed_graph)
83
- max_activation_tensor_size = 0 if len(activation_output_sizes) == 0 else max(activation_output_sizes)
84
-
85
- # Compute total memory utilization - parameters sum + max activation tensor
86
- total_size = total_weights_params + max_activation_tensor_size
87
-
88
- # Compute BOPS utilization - total count of bit-operations for all configurable layers with kernel
89
- bops_count = compute_total_bops(graph=transformed_graph, fw_info=fw_info, fw_impl=fw_impl)
90
- bops_count = np.inf if len(bops_count) == 0 else sum(bops_count)
91
-
92
- return ResourceUtilization(weights_memory=total_weights_params,
93
- activation_memory=max_activation_tensor_size,
94
- total_memory=total_size,
95
- bops=bops_count)
96
-
97
-
98
- def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> Tuple[np.ndarray, np.ndarray]:
99
- """
100
- Calculates the memory usage in bytes and the number of weight parameters for each node within a graph.
101
- Memory calculations are based on the maximum bit-width used for quantization per node.
102
-
103
- Args:
104
- graph: A finalized Graph object, representing the model structure.
105
- fw_info: FrameworkInfo object containing details about the specific framework's
106
- quantization attributes for different layers' weights.
107
-
108
- Returns:
109
- A tuple containing two arrays:
110
- - The first array represents the memory in bytes for each node's weights when quantized at the maximal bit-width.
111
- - The second array represents the total number of weight parameters for each node.
112
- """
113
- weights_params = []
114
- weights_memory_bytes = []
115
- for n in graph.nodes:
116
- # TODO: when enabling multiple attribute quantization by default (currently,
117
- # only kernel quantization is enabled) we should include other attributes memory in the sum of all
118
- # weights memory.
119
- # When implementing this, we should just go over all attributes in the node instead of counting only kernels.
120
- kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0]
121
- if kernel_attr is not None and not n.reuse:
122
- kernel_candidates = n.get_all_weights_attr_candidates(kernel_attr)
123
-
124
- if len(kernel_candidates) > 0 and any([c.enable_weights_quantization for c in kernel_candidates]):
125
- max_weight_bits = max([kc.weights_n_bits for kc in kernel_candidates])
126
- node_num_weights_params = 0
127
- for attr in fw_info.get_kernel_op_attributes(n.type):
128
- if attr is not None:
129
- node_num_weights_params += n.get_weights_by_keys(attr).flatten().shape[0]
130
-
131
- weights_params.append(node_num_weights_params)
132
-
133
- # multiply num params by num bits and divide by BITS_TO_BYTES to convert from bits to bytes
134
- weights_memory_bytes.append(node_num_weights_params * max_weight_bits / BITS_TO_BYTES)
135
-
136
- return np.array(weights_memory_bytes), np.array(weights_params)
137
-
138
-
139
- def compute_activation_output_maxcut_sizes(graph: Graph) -> Tuple[np.ndarray, np.ndarray]:
140
- """
141
- Computes an array of the respective output tensor maxcut size and an array of the output tensor
142
- cut size in bytes for each cut.
143
-
144
- Args:
145
- graph: A finalized Graph object, representing the model structure.
146
-
147
- Returns:
148
- A tuple containing two arrays:
149
- - The first is an array of the size of each activation max-cut size in bytes, calculated
150
- using the maximal bit-width for quantization.
151
- - The second array an array of the size of each activation max-cut activation size in number of parameters.
152
-
153
- """
154
- cuts = calc_graph_cuts(graph)
155
-
156
- # map nodes to cuts.
157
- node_to_cat_mapping = defaultdict(list)
158
- for i, cut in enumerate(cuts):
159
- mem_element_names = [m.node_name for m in cut.mem_elements.elements]
160
- for m_name in mem_element_names:
161
- if len(graph.find_node_by_name(m_name)) > 0:
162
- node_to_cat_mapping[m_name].append(i)
163
- else:
164
- Logger.critical(f"Missing node: {m_name}") # pragma: no cover
71
+ mixed_precision_enable=mixed_precision_enable,
72
+ running_gptq=False)
165
73
 
166
- activation_outputs = np.zeros(len(cuts))
167
- activation_outputs_bytes = np.zeros(len(cuts))
168
- for n in graph.nodes:
169
- # Go over all nodes that have activation quantization enabled.
170
- if n.has_activation_quantization_enabled_candidate():
171
- # Fetch maximum bits required for activations quantization.
172
- max_activation_bits = max([qc.activation_quantization_cfg.activation_n_bits for qc in n.candidates_quantization_cfg])
173
- node_output_size = n.get_total_output_params()
174
- for cut_index in node_to_cat_mapping[n.name]:
175
- activation_outputs[cut_index] += node_output_size
176
- # Calculate activation size in bytes and append to list
177
- activation_outputs_bytes[cut_index] += node_output_size * max_activation_bits / BITS_TO_BYTES
178
-
179
- return activation_outputs_bytes, activation_outputs
180
-
181
-
182
- # TODO maxcut: add test for this function and remove no cover
183
- def compute_activation_output_sizes(graph: Graph) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover
184
- """
185
- Computes an array of the respective output tensor size and an array of the output tensor size in bytes for
186
- each node.
187
-
188
- Args:
189
- graph: A finalized Graph object, representing the model structure.
190
-
191
- Returns:
192
- A tuple containing two arrays:
193
- - The first array represents the size of each node's activation output tensor size in bytes,
194
- calculated using the maximal bit-width for quantization.
195
- - The second array represents the size of each node's activation output tensor size.
196
-
197
- """
198
- activation_outputs = []
199
- activation_outputs_bytes = []
200
- for n in graph.nodes:
201
- # Go over all nodes that have configurable activation.
202
- if n.has_activation_quantization_enabled_candidate():
203
- # Fetch maximum bits required for quantizing activations
204
- max_activation_bits = max([qc.activation_quantization_cfg.activation_n_bits for qc in n.candidates_quantization_cfg])
205
- node_output_size = n.get_total_output_params()
206
- activation_outputs.append(node_output_size)
207
- # Calculate activation size in bytes and append to list
208
- activation_outputs_bytes.append(node_output_size * max_activation_bits / BITS_TO_BYTES)
209
-
210
- return np.array(activation_outputs_bytes), np.array(activation_outputs)
211
-
212
-
213
- def compute_total_bops(graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation) -> np.ndarray:
214
- """
215
- Computes a vector with the respective Bit-operations count for each configurable node that includes MAC operations.
216
- The computation assumes that the graph is a representation of a float model, thus, BOPs computation uses 32-bit.
217
-
218
- Args:
219
- graph: Finalized Graph object.
220
- fw_info: FrameworkInfo object about the specific framework
221
- (e.g., attributes of different layers' weights to quantize).
222
- fw_impl: FrameworkImplementation object with a specific framework methods implementation.
223
-
224
- Returns: A vector of nodes' Bit-operations count.
225
-
226
- """
227
-
228
- bops = []
229
-
230
- # Go over all configurable nodes that have kernels.
231
- for n in graph.get_topo_sorted_nodes():
232
- if n.has_kernel_weight_to_quantize(fw_info):
233
- # If node doesn't have weights then its MAC count is 0, and we shouldn't consider it in the BOPS count.
234
- incoming_edges = graph.incoming_edges(n, sort_by_attr=EDGE_SINK_INDEX)
235
- assert len(incoming_edges) == 1, f"Can't compute BOPS metric for node {n.name} with multiple inputs."
236
-
237
- node_mac = fw_impl.get_node_mac_operations(n, fw_info)
238
-
239
- node_bops = (FLOAT_BITWIDTH ** 2) * node_mac
240
- bops.append(node_bops)
241
-
242
- return np.array(bops)
74
+ ru_calculator = ResourceUtilizationCalculator(transformed_graph, fw_impl, fw_info)
75
+ ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, BitwidthMode.Q8Bit,
76
+ ru_targets=set(RUTarget) - {RUTarget.BOPS})
77
+ ru.bops, _ = ru_calculator.compute_bops(TargetInclusionCriterion.AnyQuantized, BitwidthMode.Float)
78
+ return ru
243
79
 
244
80
 
245
81
  def requires_mixed_precision(in_model: Any,
246
82
  target_resource_utilization: ResourceUtilization,
247
83
  representative_data_gen: Callable,
248
84
  core_config: CoreConfig,
249
- tpc: TargetPlatformCapabilities,
85
+ fqc: FrameworkQuantizationCapabilities,
250
86
  fw_info: FrameworkInfo,
251
87
  fw_impl: FrameworkImplementation) -> bool:
252
88
  """
@@ -261,14 +97,13 @@ def requires_mixed_precision(in_model: Any,
261
97
  target_resource_utilization: The resource utilization of the target device.
262
98
  representative_data_gen: A function that generates representative data for the model.
263
99
  core_config: CoreConfig containing parameters of how the model should be quantized.
264
- tpc: TargetPlatformCapabilities object that models the inference target platform and
100
+ fqc: FrameworkQuantizationCapabilities object that models the inference target platform and
265
101
  the attached framework operator's information.
266
102
  fw_info: Information needed for quantization about the specific framework.
267
103
  fw_impl: FrameworkImplementation object with a specific framework methods implementation.
268
104
 
269
105
  Returns: A boolean indicating if mixed precision is needed.
270
106
  """
271
- is_mixed_precision = False
272
107
  core_config = _create_core_config_for_ru(core_config)
273
108
 
274
109
  transformed_graph = graph_preparation_runner(in_model,
@@ -276,27 +111,15 @@ def requires_mixed_precision(in_model: Any,
276
111
  core_config.quantization_config,
277
112
  fw_info,
278
113
  fw_impl,
279
- tpc,
114
+ fqc,
280
115
  bit_width_config=core_config.bit_width_config,
281
- mixed_precision_enable=False)
282
- # Compute max weights memory in bytes
283
- weights_memory_by_layer_bytes, _ = compute_nodes_weights_params(transformed_graph, fw_info)
284
- total_weights_memory_bytes = 0 if len(weights_memory_by_layer_bytes) == 0 else sum(weights_memory_by_layer_bytes)
285
-
286
- # Compute max activation tensor in bytes
287
- activation_memory_estimation_bytes, _ = compute_activation_output_maxcut_sizes(transformed_graph)
288
- max_activation_memory_estimation_bytes = 0 if len(activation_memory_estimation_bytes) == 0 \
289
- else max(activation_memory_estimation_bytes)
290
-
291
- # Compute BOPS utilization - total count of bit-operations for all configurable layers with kernel
292
- bops_count = compute_total_bops(graph=transformed_graph, fw_info=fw_info, fw_impl=fw_impl)
293
- bops_count = np.inf if len(bops_count) == 0 else sum(bops_count)
116
+ mixed_precision_enable=False,
117
+ running_gptq=False)
294
118
 
295
- is_mixed_precision |= target_resource_utilization.weights_memory < total_weights_memory_bytes
296
- is_mixed_precision |= target_resource_utilization.activation_memory < max_activation_memory_estimation_bytes
297
- is_mixed_precision |= target_resource_utilization.total_memory < total_weights_memory_bytes + max_activation_memory_estimation_bytes
298
- is_mixed_precision |= target_resource_utilization.bops < bops_count
299
- return is_mixed_precision
119
+ ru_calculator = ResourceUtilizationCalculator(transformed_graph, fw_impl, fw_info)
120
+ max_ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, BitwidthMode.QMaxBit,
121
+ ru_targets=target_resource_utilization.get_restricted_metrics())
122
+ return not target_resource_utilization.is_satisfied_by(max_ru)
300
123
 
301
124
 
302
125
  def _create_core_config_for_ru(core_config: CoreConfig) -> CoreConfig:
@@ -16,7 +16,7 @@
16
16
  import numpy as np
17
17
  from pulp import *
18
18
  from tqdm import tqdm
19
- from typing import Dict, List, Tuple, Callable
19
+ from typing import Dict, Tuple, Set, Any
20
20
 
21
21
  from model_compression_toolkit.logger import Logger
22
22
  from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import ResourceUtilization, RUTarget
@@ -167,72 +167,95 @@ def _formalize_problem(layer_to_indicator_vars_mapping: Dict[int, Dict[int, LpVa
167
167
  indicators_arr = np.array(indicators)
168
168
  indicators_matrix = np.diag(indicators_arr)
169
169
 
170
- for target, ru_value in target_resource_utilization.get_resource_utilization_dict().items():
171
- if not np.isinf(ru_value):
172
- non_conf_ru_vector = None if search_manager.non_conf_ru_dict is None \
173
- else search_manager.non_conf_ru_dict.get(target)
174
- _add_set_of_ru_constraints(search_manager=search_manager,
175
- target=target,
176
- target_resource_utilization_value=ru_value,
177
- indicators_matrix=indicators_matrix,
178
- lp_problem=lp_problem,
179
- non_conf_ru_vector=non_conf_ru_vector)
170
+ _add_ru_constraints(search_manager=search_manager,
171
+ target_resource_utilization=target_resource_utilization,
172
+ indicators_matrix=indicators_matrix,
173
+ lp_problem=lp_problem,
174
+ non_conf_ru_dict=search_manager.non_conf_ru_dict)
180
175
  else: # pragma: no cover
181
176
  Logger.critical("Unable to execute mixed-precision search: 'target_resource_utilization' is None. "
182
177
  "A valid 'target_resource_utilization' is required.")
183
178
  return lp_problem
184
179
 
185
180
 
186
- def _add_set_of_ru_constraints(search_manager: MixedPrecisionSearchManager,
187
- target: RUTarget,
188
- target_resource_utilization_value: float,
189
- indicators_matrix: np.ndarray,
190
- lp_problem: LpProblem,
191
- non_conf_ru_vector: np.ndarray):
181
+ def _add_ru_constraints(search_manager: MixedPrecisionSearchManager,
182
+ target_resource_utilization: ResourceUtilization,
183
+ indicators_matrix: np.ndarray,
184
+ lp_problem: LpProblem,
185
+ non_conf_ru_dict: Optional[Dict[RUTarget, np.ndarray]]):
192
186
  """
193
- Adding a constraint for the Lp problem for the given target resource utilization.
187
+ Adding targets constraints for the Lp problem for the given target resource utilization.
194
188
  The update to the Lp problem object is done inplace.
195
189
 
196
190
  Args:
197
191
  search_manager: MixedPrecisionSearchManager object to be used for resource utilization constraints formalization.
198
- target: A RUTarget.
199
- target_resource_utilization_value: Target resource utilization value of the given target resource utilization
200
- for which the constraint is added.
192
+ target_resource_utilization: Target resource utilization.
201
193
  indicators_matrix: A diagonal matrix of the Lp problem's indicators.
202
194
  lp_problem: An Lp problem object to add constraint to.
203
- non_conf_ru_vector: A non-configurable nodes' resource utilization vector.
195
+ non_conf_ru_dict: A non-configurable nodes' resource utilization vectors for the constrained targets.
196
+ """
197
+ ru_indicated_vectors = {}
198
+ # targets to add constraints for
199
+ constraints_targets = target_resource_utilization.get_restricted_metrics()
200
+ # to add constraints for Total target we need to compute weight and activation
201
+ targets_to_compute = constraints_targets
202
+ if RUTarget.TOTAL in constraints_targets:
203
+ targets_to_compute = targets_to_compute.union({RUTarget.ACTIVATION, RUTarget.WEIGHTS}) - {RUTarget.TOTAL}
204
+
205
+ for target in targets_to_compute:
206
+ ru_matrix = search_manager.compute_resource_utilization_matrix(target) # num configurations X num elements
207
+ indicated_ru_matrix = np.matmul(ru_matrix.T, indicators_matrix) # num elements X num configurations
208
+
209
+ # Sum the indicated values over all configurations, and add the value for minimal configuration once.
210
+ # Indicated utilization values are relative to the minimal configuration, i.e. they represent the extra memory
211
+ # that would be required if that configuration is selected).
212
+ # Each element in a vector is an lp object representing the configurations sum term for a memory element.
213
+ ru_vec = indicated_ru_matrix.sum(axis=1) + search_manager.min_ru[target]
214
+
215
+ non_conf_ru_vec = non_conf_ru_dict[target]
216
+ if non_conf_ru_vec is not None and non_conf_ru_vec.size:
217
+ # add non-conf value as additional mem elements so that they get aggregated
218
+ ru_vec = np.concatenate([ru_vec, non_conf_ru_vec])
219
+ ru_indicated_vectors[target] = ru_vec
220
+
221
+ # add constraints only for the restricted targets in target resource utilization.
222
+ for target in constraints_targets:
223
+ target_resource_utilization_value = target_resource_utilization.get_resource_utilization_dict()[target]
224
+ aggr_ru = _aggregate_for_lp(ru_indicated_vectors, target)
225
+ for v in aggr_ru:
226
+ if isinstance(v, float):
227
+ if v > target_resource_utilization_value:
228
+ Logger.critical(
229
+ f"The model cannot be quantized to meet the specified target resource utilization {target.value} "
230
+ f"with the value {target_resource_utilization_value}.") # pragma: no cover
231
+ else:
232
+ lp_problem += v <= target_resource_utilization_value
233
+
204
234
 
235
+ def _aggregate_for_lp(targets_ru_vec: Dict[RUTarget, Any], target: RUTarget) -> list:
205
236
  """
237
+ Aggregate resource utilization values for the LP.
206
238
 
207
- ru_matrix = search_manager.compute_resource_utilization_matrix(target)
208
- indicated_ru_matrix = np.matmul(ru_matrix, indicators_matrix)
209
- # Need to re-organize the tensor such that the configurations' axis will be second,
210
- # and all metric values' axis will come afterword
211
- indicated_ru_matrix = np.moveaxis(indicated_ru_matrix, source=len(indicated_ru_matrix.shape) - 1, destination=1)
212
-
213
- # In order to get the result resource utilization according to a chosen set of indicators, we sum each row in
214
- # the result matrix. Each row represents the resource utilization values for a specific resource utilization metric,
215
- # such that only elements corresponding to a configuration which implied by the set of indicators will have some
216
- # positive value different than 0 (and will contribute to the total resource utilization).
217
- ru_sum_vector = np.array([
218
- np.sum(indicated_ru_matrix[i], axis=0) + # sum of metric values over all configurations in a row
219
- search_manager.min_ru[target][i] for i in range(indicated_ru_matrix.shape[0])])
220
-
221
- # search_manager.compute_ru_functions contains a pair of ru_metric and ru_aggregation for each ru target
222
- # get aggregated ru, considering both configurable and non-configurable nodes
223
- if non_conf_ru_vector is None or len(non_conf_ru_vector) == 0:
224
- aggr_ru = search_manager.compute_ru_functions[target].aggregate_fn(ru_sum_vector)
225
- else:
226
- aggr_ru = search_manager.compute_ru_functions[target].aggregate_fn(np.concatenate([ru_sum_vector, non_conf_ru_vector]))
227
-
228
- for v in aggr_ru:
229
- if isinstance(v, float):
230
- if v > target_resource_utilization_value:
231
- Logger.critical(
232
- f"The model cannot be quantized to meet the specified target resource utilization {target.value} "
233
- f"with the value {target_resource_utilization_value}.") # pragma: no cover
234
- else:
235
- lp_problem += v <= target_resource_utilization_value
239
+ Args:
240
+ targets_ru_vec: resource utilization vectors for all precomputed targets.
241
+ target: resource utilization target.
242
+
243
+ Returns:
244
+ Aggregated resource utilization.
245
+ """
246
+ if target == RUTarget.TOTAL:
247
+ w = lpSum(targets_ru_vec[RUTarget.WEIGHTS])
248
+ act_ru_vec = targets_ru_vec[RUTarget.ACTIVATION]
249
+ return [w + v for v in act_ru_vec]
250
+
251
+ if target in [RUTarget.WEIGHTS, RUTarget.BOPS]:
252
+ return [lpSum(targets_ru_vec[target])]
253
+
254
+ if target == RUTarget.ACTIVATION:
255
+ # for max aggregation, each value constitutes a separate constraint
256
+ return list(targets_ru_vec[target])
257
+
258
+ raise ValueError(f'Unexpected target {target}.') # pragma: no cover
236
259
 
237
260
 
238
261
  def _build_layer_to_metrics_mapping(search_manager: MixedPrecisionSearchManager,
@@ -113,11 +113,9 @@ class SensitivityEvaluation:
113
113
  # in the new built MP model.
114
114
  self.baseline_model, self.model_mp, self.conf_node2layers = self._build_models()
115
115
 
116
- # Build images batches for inference comparison
117
- self.images_batches = self._get_images_batches(quant_config.num_of_images)
118
-
119
- # Casting images tensors to the framework tensor type.
120
- self.images_batches = [self.fw_impl.to_tensor(img) for img in self.images_batches]
116
+ # Build images batches for inference comparison and cast to framework type
117
+ images_batches = self._get_images_batches(quant_config.num_of_images)
118
+ self.images_batches = [self.fw_impl.to_tensor(img) for img in images_batches]
121
119
 
122
120
  # Initiating baseline_tensors_list since it is not initiated in SensitivityEvaluationManager init.
123
121
  self.baseline_tensors_list = self._init_baseline_tensors_list()
@@ -80,8 +80,8 @@ def greedy_solution_refinement_procedure(mp_solution: List[int],
80
80
  updated_ru.append(node_updated_ru)
81
81
 
82
82
  # filter out new configs that don't hold the resource utilization restrictions
83
- node_filtered_ru = [(node_idx, ru) for node_idx, ru in zip(valid_candidates, updated_ru) if
84
- target_resource_utilization.holds_constraints(ru)]
83
+ node_filtered_ru = [(node_idx, ru) for node_idx, ru in zip(valid_candidates, updated_ru)
84
+ if target_resource_utilization.is_satisfied_by(ru)]
85
85
 
86
86
  if len(node_filtered_ru) > 0:
87
87
  sorted_by_ru = sorted(node_filtered_ru, key=lambda node_ru: (node_ru[1].total_memory,
@@ -24,7 +24,8 @@ from model_compression_toolkit.core.common.pruning.memory_calculator import Memo
24
24
  from model_compression_toolkit.core.common.pruning.pruning_framework_implementation import PruningFrameworkImplementation
25
25
  from model_compression_toolkit.core.common.pruning.mask.per_simd_group_mask import PerSIMDGroupMask
26
26
  from model_compression_toolkit.logger import Logger
27
- from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities
27
+ from model_compression_toolkit.target_platform_capabilities.targetplatform2framework.framework_quantization_capabilities import \
28
+ FrameworkQuantizationCapabilities
28
29
 
29
30
 
30
31
  class GreedyMaskCalculator:
@@ -42,7 +43,7 @@ class GreedyMaskCalculator:
42
43
  target_resource_utilization: ResourceUtilization,
43
44
  graph: Graph,
44
45
  fw_impl: PruningFrameworkImplementation,
45
- tpc: TargetPlatformCapabilities,
46
+ fqc: FrameworkQuantizationCapabilities,
46
47
  simd_groups_indices: Dict[BaseNode, List[List[int]]]):
47
48
  """
48
49
  Args:
@@ -52,7 +53,7 @@ class GreedyMaskCalculator:
52
53
  target_resource_utilization (ResourceUtilization): The target resource utilization to achieve.
53
54
  graph (Graph): The computational graph of the model.
54
55
  fw_impl (PruningFrameworkImplementation): Framework-specific implementation details.
55
- tpc (TargetPlatformCapabilities): Platform-specific constraints and capabilities.
56
+ fqc (FrameworkQuantizationCapabilities): Platform-specific constraints and capabilities.
56
57
  simd_groups_indices (Dict[BaseNode, List[List[int]]]): Indices of SIMD groups in each node.
57
58
  """
58
59
  self.prunable_nodes = prunable_nodes
@@ -60,7 +61,7 @@ class GreedyMaskCalculator:
60
61
  self.target_resource_utilization = target_resource_utilization
61
62
  self.graph = graph
62
63
  self.fw_impl = fw_impl
63
- self.tpc = tpc
64
+ self.fqc = fqc
64
65
 
65
66
  self.simd_groups_indices = simd_groups_indices
66
67
  self.simd_groups_scores = simd_groups_scores
@@ -90,7 +91,7 @@ class GreedyMaskCalculator:
90
91
  """
91
92
  # Iteratively unprune the graph while monitoring the memory footprint.
92
93
  current_memory = self.memory_calculator.get_pruned_graph_memory(masks=self.oc_pruning_mask.get_mask(),
93
- include_padded_channels=self.tpc.is_simd_padding)
94
+ include_padded_channels=self.fqc.is_simd_padding)
94
95
  if current_memory > self.target_resource_utilization.weights_memory:
95
96
  Logger.critical(f"Insufficient memory for the target resource utilization: current memory {current_memory}, "
96
97
  f"target memory {self.target_resource_utilization.weights_memory}.")
@@ -105,7 +106,7 @@ class GreedyMaskCalculator:
105
106
  group_index=group_to_remain_idx,
106
107
  mask_indicator=MaskIndicator.REMAINED)
107
108
  current_memory = self.memory_calculator.get_pruned_graph_memory(masks=self.oc_pruning_mask.get_mask(),
108
- include_padded_channels=self.tpc.is_simd_padding)
109
+ include_padded_channels=self.fqc.is_simd_padding)
109
110
 
110
111
  # If the target memory is exceeded, revert the last addition.
111
112
  if current_memory > self.target_resource_utilization.weights_memory:
@@ -23,7 +23,6 @@ from model_compression_toolkit.core.common.mixed_precision.resource_utilization_
23
23
  from model_compression_toolkit.core.common.pruning.memory_calculator import MemoryCalculator
24
24
  from model_compression_toolkit.core.common.pruning.pruning_framework_implementation import PruningFrameworkImplementation
25
25
  from model_compression_toolkit.logger import Logger
26
- from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities
27
26
 
28
27
  class MaskIndicator(Enum):
29
28
  """
@@ -23,7 +23,6 @@ from model_compression_toolkit.core.common.pruning.mask.per_channel_mask import
23
23
  from model_compression_toolkit.core.common.pruning.memory_calculator import MemoryCalculator
24
24
  from model_compression_toolkit.core.common.pruning.pruning_framework_implementation import PruningFrameworkImplementation
25
25
  from model_compression_toolkit.logger import Logger
26
- from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities
27
26
 
28
27
  class PerSIMDGroupMask:
29
28
  def __init__(self,
@@ -29,7 +29,9 @@ from model_compression_toolkit.core.common.pruning.pruning_framework_implementat
29
29
  from model_compression_toolkit.core.common.pruning.pruning_info import PruningInfo, \
30
30
  unroll_simd_scores_to_per_channel_scores
31
31
  from model_compression_toolkit.logger import Logger
32
- from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities
32
+ from model_compression_toolkit.target_platform_capabilities.targetplatform2framework import \
33
+ FrameworkQuantizationCapabilities
34
+
33
35
 
34
36
  class Pruner:
35
37
  """
@@ -43,7 +45,7 @@ class Pruner:
43
45
  target_resource_utilization: ResourceUtilization,
44
46
  representative_data_gen: Callable,
45
47
  pruning_config: PruningConfig,
46
- target_platform_capabilities: TargetPlatformCapabilities):
48
+ target_platform_capabilities: FrameworkQuantizationCapabilities):
47
49
  """
48
50
  Args:
49
51
  float_graph (Graph): The floating-point representation of the model's computation graph.
@@ -52,7 +54,7 @@ class Pruner:
52
54
  target_resource_utilization (ResourceUtilization): The target resource utilization to be achieved after pruning.
53
55
  representative_data_gen (Callable): Generator function for representative dataset used in pruning analysis.
54
56
  pruning_config (PruningConfig): Configuration object specifying how pruning should be performed.
55
- target_platform_capabilities (TargetPlatformCapabilities): Object encapsulating the capabilities of the target hardware platform.
57
+ target_platform_capabilities (FrameworkQuantizationCapabilities): Object encapsulating the capabilities of the target hardware platform.
56
58
  """
57
59
  self.float_graph = float_graph
58
60
  self.fw_info = fw_info
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  # ==============================================================================
15
+ from dataclasses import dataclass, field
15
16
  from typing import List, Union, Dict
16
17
 
17
18
  from model_compression_toolkit.core.common import Graph
@@ -19,6 +20,7 @@ from model_compression_toolkit.core.common.matchers.node_matcher import BaseNode
19
20
  from model_compression_toolkit.logger import Logger
20
21
 
21
22
 
23
+ @dataclass
22
24
  class ManualBitWidthSelection:
23
25
  """
24
26
  Class to encapsulate the manual bit width selection configuration for a specific filter.
@@ -27,13 +29,11 @@ class ManualBitWidthSelection:
27
29
  filter (BaseNodeMatcher): The filter used to select nodes for bit width manipulation.
28
30
  bit_width (int): The bit width to be applied to the selected nodes.
29
31
  """
30
- def __init__(self,
31
- filter: BaseNodeMatcher,
32
- bit_width: int):
33
- self.filter = filter
34
- self.bit_width = bit_width
32
+ filter: BaseNodeMatcher
33
+ bit_width: int
35
34
 
36
35
 
36
+ @dataclass
37
37
  class BitWidthConfig:
38
38
  """
39
39
  Class to manage manual bit-width configurations.
@@ -41,13 +41,7 @@ class BitWidthConfig:
41
41
  Attributes:
42
42
  manual_activation_bit_width_selection_list (List[ManualBitWidthSelection]): A list of ManualBitWidthSelection objects defining manual bit-width configurations.
43
43
  """
44
- def __init__(self,
45
- manual_activation_bit_width_selection_list: List[ManualBitWidthSelection] = None):
46
- self.manual_activation_bit_width_selection_list = [] if manual_activation_bit_width_selection_list is None else manual_activation_bit_width_selection_list
47
-
48
- def __repr__(self):
49
- # Used for debugging, thus no cover.
50
- return str(self.__dict__) # pragma: no cover
44
+ manual_activation_bit_width_selection_list: List[ManualBitWidthSelection] = field(default_factory=list)
51
45
 
52
46
  def set_manual_activation_bit_width(self,
53
47
  filters: Union[List[BaseNodeMatcher], BaseNodeMatcher],
@@ -15,8 +15,7 @@
15
15
  import copy
16
16
  from typing import List
17
17
 
18
- from model_compression_toolkit.target_platform_capabilities.target_platform import QuantizationMethod
19
-
18
+ from mct_quantizers import QuantizationMethod
20
19
  from model_compression_toolkit.core.common import Graph, BaseNode
21
20
  from model_compression_toolkit.constants import FLOAT_BITWIDTH
22
21
  from model_compression_toolkit.core.common.quantization.candidate_node_quantization_config import \
@@ -401,9 +401,9 @@ class NodeWeightsQuantizationConfig(BaseNodeQuantizationConfig):
401
401
  # therefore, we need to look for the attribute in the op_cfg that is contained in the node attribute's name.
402
402
  attrs_included_in_name = {k: v for k, v in op_cfg.attr_weights_configs_mapping.items() if k in attr}
403
403
  if len(attrs_included_in_name) > 1: # pragma: no cover
404
- Logger.critical(f"Found multiple attribute in TPC OpConfig that are contained "
404
+ Logger.critical(f"Found multiple attribute in FQC OpConfig that are contained "
405
405
  f"in the attribute name '{attr}'."
406
- f"Please fix the TPC attribute names mapping such that each operator's attribute would "
406
+ f"Please fix the FQC attribute names mapping such that each operator's attribute would "
407
407
  f"have a unique matching name.")
408
408
  if len(attrs_included_in_name) == 0:
409
409
  attr_cfg = op_cfg.default_weight_attr_config