mct-nightly 2.2.0.20250113.527__py3-none-any.whl → 2.2.0.20250114.84821__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mct_nightly-2.2.0.20250113.527.dist-info → mct_nightly-2.2.0.20250114.84821.dist-info}/METADATA +1 -1
- {mct_nightly-2.2.0.20250113.527.dist-info → mct_nightly-2.2.0.20250114.84821.dist-info}/RECORD +103 -105
- model_compression_toolkit/__init__.py +2 -2
- model_compression_toolkit/core/common/framework_info.py +1 -3
- model_compression_toolkit/core/common/fusion/layer_fusing.py +6 -5
- model_compression_toolkit/core/common/graph/base_graph.py +20 -21
- model_compression_toolkit/core/common/graph/base_node.py +44 -17
- model_compression_toolkit/core/common/mixed_precision/mixed_precision_candidates_filter.py +7 -6
- model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +0 -6
- model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +26 -135
- model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py +36 -62
- model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py +667 -0
- model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py +25 -202
- model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_methods.py +164 -470
- model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +30 -7
- model_compression_toolkit/core/common/mixed_precision/sensitivity_evaluation.py +3 -5
- model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py +2 -2
- model_compression_toolkit/core/common/pruning/greedy_mask_calculator.py +7 -6
- model_compression_toolkit/core/common/pruning/mask/per_channel_mask.py +0 -1
- model_compression_toolkit/core/common/pruning/mask/per_simd_group_mask.py +0 -1
- model_compression_toolkit/core/common/pruning/pruner.py +5 -3
- model_compression_toolkit/core/common/quantization/bit_width_config.py +6 -12
- model_compression_toolkit/core/common/quantization/filter_nodes_candidates.py +1 -2
- model_compression_toolkit/core/common/quantization/node_quantization_config.py +2 -2
- model_compression_toolkit/core/common/quantization/quantization_config.py +1 -1
- model_compression_toolkit/core/common/quantization/quantization_fn_selection.py +1 -1
- model_compression_toolkit/core/common/quantization/quantization_params_fn_selection.py +1 -1
- model_compression_toolkit/core/common/quantization/quantization_params_generation/error_functions.py +1 -1
- model_compression_toolkit/core/common/quantization/quantization_params_generation/power_of_two_selection.py +1 -1
- model_compression_toolkit/core/common/quantization/quantization_params_generation/qparams_activations_computation.py +1 -1
- model_compression_toolkit/core/common/quantization/quantization_params_generation/symmetric_selection.py +1 -1
- model_compression_toolkit/core/common/quantization/quantization_params_generation/uniform_selection.py +1 -1
- model_compression_toolkit/core/common/quantization/set_node_quantization_config.py +15 -14
- model_compression_toolkit/core/common/substitutions/batchnorm_reconstruction.py +1 -1
- model_compression_toolkit/core/common/substitutions/batchnorm_refusing.py +1 -1
- model_compression_toolkit/core/common/substitutions/shift_negative_activation.py +5 -5
- model_compression_toolkit/core/graph_prep_runner.py +12 -11
- model_compression_toolkit/core/keras/data_util.py +24 -5
- model_compression_toolkit/core/keras/default_framework_info.py +1 -1
- model_compression_toolkit/core/keras/mixed_precision/configurable_weights_quantizer.py +1 -2
- model_compression_toolkit/core/keras/resource_utilization_data_facade.py +5 -6
- model_compression_toolkit/core/pytorch/back2framework/pytorch_model_builder.py +1 -1
- model_compression_toolkit/core/pytorch/default_framework_info.py +1 -1
- model_compression_toolkit/core/pytorch/mixed_precision/configurable_activation_quantizer.py +1 -1
- model_compression_toolkit/core/pytorch/mixed_precision/configurable_weights_quantizer.py +1 -1
- model_compression_toolkit/core/pytorch/resource_utilization_data_facade.py +4 -5
- model_compression_toolkit/core/runner.py +33 -60
- model_compression_toolkit/exporter/model_wrapper/keras/builder/node_to_quantizer.py +1 -1
- model_compression_toolkit/exporter/model_wrapper/pytorch/builder/node_to_quantizer.py +1 -1
- model_compression_toolkit/gptq/keras/quantization_facade.py +8 -9
- model_compression_toolkit/gptq/keras/quantizer/soft_rounding/symmetric_soft_quantizer.py +1 -1
- model_compression_toolkit/gptq/keras/quantizer/soft_rounding/uniform_soft_quantizer.py +1 -1
- model_compression_toolkit/gptq/keras/quantizer/ste_rounding/symmetric_ste.py +1 -1
- model_compression_toolkit/gptq/pytorch/quantization_facade.py +8 -9
- model_compression_toolkit/gptq/pytorch/quantizer/soft_rounding/symmetric_soft_quantizer.py +1 -1
- model_compression_toolkit/gptq/pytorch/quantizer/soft_rounding/uniform_soft_quantizer.py +1 -1
- model_compression_toolkit/gptq/pytorch/quantizer/ste_rounding/symmetric_ste.py +1 -1
- model_compression_toolkit/metadata.py +11 -10
- model_compression_toolkit/pruning/keras/pruning_facade.py +5 -6
- model_compression_toolkit/pruning/pytorch/pruning_facade.py +6 -7
- model_compression_toolkit/ptq/keras/quantization_facade.py +8 -9
- model_compression_toolkit/ptq/pytorch/quantization_facade.py +8 -9
- model_compression_toolkit/qat/keras/quantization_facade.py +5 -6
- model_compression_toolkit/qat/keras/quantizer/lsq/symmetric_lsq.py +1 -1
- model_compression_toolkit/qat/keras/quantizer/ste_rounding/symmetric_ste.py +1 -1
- model_compression_toolkit/qat/pytorch/quantization_facade.py +5 -9
- model_compression_toolkit/qat/pytorch/quantizer/lsq/symmetric_lsq.py +1 -1
- model_compression_toolkit/qat/pytorch/quantizer/lsq/uniform_lsq.py +1 -1
- model_compression_toolkit/qat/pytorch/quantizer/ste_rounding/symmetric_ste.py +1 -1
- model_compression_toolkit/qat/pytorch/quantizer/ste_rounding/uniform_ste.py +1 -1
- model_compression_toolkit/target_platform_capabilities/__init__.py +9 -0
- model_compression_toolkit/target_platform_capabilities/constants.py +1 -1
- model_compression_toolkit/target_platform_capabilities/schema/mct_current_schema.py +2 -2
- model_compression_toolkit/target_platform_capabilities/schema/schema_functions.py +18 -18
- model_compression_toolkit/target_platform_capabilities/schema/v1.py +13 -13
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/__init__.py +6 -6
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attach2fw.py +10 -10
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attach2keras.py +3 -3
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attach2pytorch.py +3 -2
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/current_tpc.py +8 -8
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework/target_platform_capabilities.py → targetplatform2framework/framework_quantization_capabilities.py} +40 -40
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework/target_platform_capabilities_component.py → targetplatform2framework/framework_quantization_capabilities_component.py} +2 -2
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/layer_filter_params.py +0 -1
- model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/operations_to_layers.py +8 -8
- model_compression_toolkit/target_platform_capabilities/tpc_io_handler.py +24 -24
- model_compression_toolkit/target_platform_capabilities/tpc_models/get_target_platform_capabilities.py +18 -18
- model_compression_toolkit/target_platform_capabilities/tpc_models/imx500_tpc/latest/__init__.py +3 -3
- model_compression_toolkit/target_platform_capabilities/tpc_models/imx500_tpc/v1/{tp_model.py → tpc.py} +31 -32
- model_compression_toolkit/target_platform_capabilities/tpc_models/qnnpack_tpc/latest/__init__.py +3 -3
- model_compression_toolkit/target_platform_capabilities/tpc_models/qnnpack_tpc/v1/{tp_model.py → tpc.py} +27 -27
- model_compression_toolkit/target_platform_capabilities/tpc_models/tflite_tpc/latest/__init__.py +4 -4
- model_compression_toolkit/target_platform_capabilities/tpc_models/tflite_tpc/v1/{tp_model.py → tpc.py} +27 -27
- model_compression_toolkit/trainable_infrastructure/common/get_quantizers.py +1 -2
- model_compression_toolkit/trainable_infrastructure/common/trainable_quantizer_config.py +2 -1
- model_compression_toolkit/trainable_infrastructure/keras/activation_quantizers/lsq/symmetric_lsq.py +1 -2
- model_compression_toolkit/trainable_infrastructure/keras/config_serialization.py +1 -1
- model_compression_toolkit/xquant/common/model_folding_utils.py +7 -6
- model_compression_toolkit/xquant/keras/keras_report_utils.py +4 -4
- model_compression_toolkit/xquant/pytorch/pytorch_report_utils.py +3 -3
- model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_aggregation_methods.py +0 -105
- model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_functions_mapping.py +0 -33
- model_compression_toolkit/target_platform_capabilities/target_platform/__init__.py +0 -23
- {mct_nightly-2.2.0.20250113.527.dist-info → mct_nightly-2.2.0.20250114.84821.dist-info}/LICENSE.md +0 -0
- {mct_nightly-2.2.0.20250113.527.dist-info → mct_nightly-2.2.0.20250114.84821.dist-info}/WHEEL +0 -0
- {mct_nightly-2.2.0.20250113.527.dist-info → mct_nightly-2.2.0.20250114.84821.dist-info}/top_level.txt +0 -0
- /model_compression_toolkit/target_platform_capabilities/{target_platform/targetplatform2framework → targetplatform2framework}/attribute_filter.py +0 -0
model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_methods.py
CHANGED
@@ -12,389 +12,191 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
# ==============================================================================
|
15
|
-
from
|
16
|
-
from functools import partial
|
17
|
-
from typing import List, Optional
|
18
|
-
from copy import deepcopy
|
15
|
+
from typing import List, Set, Dict, Optional, Tuple
|
19
16
|
|
20
17
|
import numpy as np
|
21
18
|
|
22
19
|
from model_compression_toolkit.core import FrameworkInfo
|
23
20
|
from model_compression_toolkit.core.common import Graph, BaseNode
|
24
|
-
from model_compression_toolkit.constants import BITS_TO_BYTES, FLOAT_BITWIDTH
|
25
21
|
from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation
|
26
|
-
from model_compression_toolkit.core.common.graph.
|
27
|
-
from model_compression_toolkit.core.common.graph.virtual_activation_weights_node import VirtualActivationWeightsNode
|
28
|
-
|
29
|
-
|
30
|
-
from model_compression_toolkit.core.common.
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
22
|
+
from model_compression_toolkit.core.common.graph.memory_graph.cut import Cut
|
23
|
+
from model_compression_toolkit.core.common.graph.virtual_activation_weights_node import VirtualActivationWeightsNode
|
24
|
+
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \
|
25
|
+
RUTarget
|
26
|
+
from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \
|
27
|
+
ResourceUtilizationCalculator, BitwidthMode, TargetInclusionCriterion
|
28
|
+
from model_compression_toolkit.core.common.quantization.node_quantization_config import NodeWeightsQuantizationConfig, \
|
29
|
+
NodeActivationQuantizationConfig
|
30
|
+
|
31
|
+
|
32
|
+
# TODO take into account Virtual nodes. Are candidates defined with respect to virtual or original nodes?
|
33
|
+
# Can we use the virtual graph only for bops and the original graph for everything else?
|
34
|
+
|
35
|
+
class MixedPrecisionRUHelper:
|
36
|
+
""" Helper class for resource utilization computations for mixed precision optimization. """
|
37
|
+
|
38
|
+
def __init__(self, graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation):
|
39
|
+
self.graph = graph
|
40
|
+
self.fw_info = fw_info
|
41
|
+
self.fw_impl = fw_impl
|
42
|
+
self.ru_calculator = ResourceUtilizationCalculator(graph, fw_impl, fw_info)
|
43
|
+
|
44
|
+
def compute_utilization(self, ru_targets: Set[RUTarget], mp_cfg: Optional[List[int]]) -> Dict[RUTarget, np.ndarray]:
|
45
|
+
"""
|
46
|
+
Compute utilization of requested targets for a specific configuration in the format expected by LP problem
|
47
|
+
formulation, namely an array of ru values corresponding to graph's configurable nodes in the topological order.
|
48
|
+
For activation target, the array contains values for activation cuts in unspecified order (as long as it is
|
49
|
+
consistent between configurations).
|
50
|
+
|
51
|
+
Args:
|
52
|
+
ru_targets: resource utilization targets to compute.
|
53
|
+
mp_cfg: a list of candidates indices for configurable layers.
|
54
|
+
|
55
|
+
Returns:
|
56
|
+
Dict of the computed utilization per target.
|
57
|
+
"""
|
58
|
+
|
59
|
+
ru = {}
|
60
|
+
|
61
|
+
act_qcs, w_qcs = self.get_configurable_qcs(mp_cfg) if mp_cfg else (None, None)
|
62
|
+
w_util = None
|
63
|
+
if RUTarget.WEIGHTS in ru_targets:
|
64
|
+
w_util = self._weights_utilization(w_qcs)
|
65
|
+
ru[RUTarget.WEIGHTS] = np.array(list(w_util.values()))
|
66
|
+
|
67
|
+
# TODO make mp agnostic to activation method
|
68
|
+
if RUTarget.ACTIVATION in ru_targets:
|
69
|
+
act_util = self._activation_maxcut_utilization(act_qcs)
|
70
|
+
ru[RUTarget.ACTIVATION] = np.array(list(act_util.values()))
|
71
|
+
|
72
|
+
# TODO use maxcut
|
73
|
+
if RUTarget.TOTAL in ru_targets:
|
74
|
+
act_tensors_util = self._activation_tensor_utilization(act_qcs)
|
75
|
+
w_util = w_util or self._weights_utilization(w_qcs)
|
76
|
+
total = {n: (w_util.get(n, 0), act_tensors_util.get(n, 0))
|
77
|
+
# for n in self.graph.nodes if n in act_tensors_util or n in w_util}
|
78
|
+
for n in self.graph.get_topo_sorted_nodes() if n in act_tensors_util or n in w_util}
|
79
|
+
ru[RUTarget.TOTAL] = np.array(list(total.values()))
|
80
|
+
|
81
|
+
if RUTarget.BOPS in ru_targets:
|
82
|
+
ru[RUTarget.BOPS] = self._bops_utilization(mp_cfg)
|
83
|
+
|
84
|
+
return ru
|
85
|
+
|
86
|
+
def get_configurable_qcs(self, mp_cfg) \
|
87
|
+
-> Tuple[Dict[BaseNode, NodeActivationQuantizationConfig], Dict[BaseNode, NodeWeightsQuantizationConfig]]:
|
88
|
+
"""
|
89
|
+
Retrieve quantization candidates objects for weights and activations from the configuration list.
|
90
|
+
|
91
|
+
Args:
|
92
|
+
mp_cfg: a list of candidates indices for configurable layers.
|
93
|
+
|
94
|
+
Returns:
|
95
|
+
Mapping between nodes to weights quantization config, and a mapping between nodes and activation
|
96
|
+
quantization config.
|
97
|
+
"""
|
98
|
+
mp_nodes = self.graph.get_configurable_sorted_nodes(self.fw_info)
|
99
|
+
node_qcs = {n: n.candidates_quantization_cfg[mp_cfg[i]] for i, n in enumerate(mp_nodes)}
|
100
|
+
act_qcs = {n: node_qcs[n].activation_quantization_cfg
|
101
|
+
for n in self.graph.get_activation_configurable_nodes()}
|
102
|
+
w_qcs = {n: node_qcs[n].weights_quantization_cfg
|
103
|
+
for n in self.graph.get_weights_configurable_nodes(self.fw_info)}
|
104
|
+
return act_qcs, w_qcs
|
105
|
+
|
106
|
+
def _weights_utilization(self, w_qcs: Optional[Dict[BaseNode, NodeWeightsQuantizationConfig]]) -> Dict[BaseNode, float]:
|
107
|
+
"""
|
108
|
+
Compute weights utilization for configurable weights if configuration is passed,
|
109
|
+
or for non-configurable nodes otherwise.
|
110
|
+
|
111
|
+
Args:
|
112
|
+
w_qcs: nodes quantization configuration to compute, or None.
|
113
|
+
|
114
|
+
Returns:
|
115
|
+
Weight utilization per node.
|
116
|
+
"""
|
117
|
+
if w_qcs:
|
118
|
+
target_criterion = TargetInclusionCriterion.QConfigurable
|
119
|
+
bitwidth_mode = BitwidthMode.QCustom
|
120
|
+
else:
|
121
|
+
target_criterion = TargetInclusionCriterion.QNonConfigurable
|
122
|
+
bitwidth_mode = BitwidthMode.QDefaultSP
|
123
|
+
|
124
|
+
_, nodes_util, _ = self.ru_calculator.compute_weights_utilization(target_criterion=target_criterion,
|
125
|
+
bitwidth_mode=bitwidth_mode,
|
126
|
+
w_qcs=w_qcs)
|
127
|
+
nodes_util = {n: u.bytes for n, u in nodes_util.items()}
|
128
|
+
return nodes_util
|
129
|
+
|
130
|
+
def _activation_maxcut_utilization(self, act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]]) \
|
131
|
+
-> Optional[Dict[Cut, float]]:
|
132
|
+
"""
|
133
|
+
Compute activation utilization using MaxCut for all quantized nodes if configuration is passed.
|
134
|
+
|
135
|
+
Args:
|
136
|
+
act_qcs: nodes activation configuration or None.
|
137
|
+
|
138
|
+
Returns:
|
139
|
+
Activation utilization per cut, or empty dict if no configuration was passed.
|
140
|
+
"""
|
141
|
+
if act_qcs:
|
142
|
+
_, cuts_util, _ = self.ru_calculator.compute_cut_activation_utilization(TargetInclusionCriterion.AnyQuantized,
|
143
|
+
bitwidth_mode=BitwidthMode.QCustom,
|
144
|
+
act_qcs=act_qcs)
|
145
|
+
cuts_util = {c: u.bytes for c, u in cuts_util.items()}
|
146
|
+
return cuts_util
|
127
147
|
|
128
|
-
Args:
|
129
|
-
mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node)
|
130
|
-
graph: Graph object.
|
131
|
-
fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize)
|
132
|
-
(not used in this method).
|
133
|
-
fw_impl: FrameworkImplementation object with specific framework methods implementation(not used in this method).
|
134
|
-
cuts: a list of graph cuts (optional. if not provided calculated locally).
|
135
|
-
TODO maxcut: refactor - need to remove the cuts so all metric functions signatures are the same.
|
136
|
-
|
137
|
-
Returns: A vector of node's cut memory sizes.
|
138
|
-
Note that the vector is not necessarily of the same length as the given config.
|
139
|
-
|
140
|
-
"""
|
141
|
-
if len(mp_cfg) == 0:
|
142
148
|
# Computing non-configurable nodes resource utilization for max-cut is included in the calculation of the
|
143
149
|
# configurable nodes.
|
144
|
-
return
|
145
|
-
|
146
|
-
activation_cut_memory = []
|
147
|
-
mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info)
|
148
|
-
# Go over all nodes that should be taken into consideration when computing the weights memory utilization.
|
149
|
-
nodes_act_nbits = {}
|
150
|
-
for n in graph.get_sorted_activation_configurable_nodes():
|
151
|
-
node_idx = mp_nodes.index(n.name)
|
152
|
-
node_qc = n.candidates_quantization_cfg[mp_cfg[node_idx]]
|
153
|
-
node_nbits = node_qc.activation_quantization_cfg.activation_n_bits
|
154
|
-
nodes_act_nbits[n.name] = node_nbits
|
155
|
-
|
156
|
-
if cuts is None:
|
157
|
-
cuts = calc_graph_cuts(graph)
|
158
|
-
|
159
|
-
for i, cut in enumerate(cuts):
|
160
|
-
mem_elements = [m.node_name for m in cut.mem_elements.elements]
|
161
|
-
mem = 0
|
162
|
-
for op_name in mem_elements:
|
163
|
-
n = graph.find_node_by_name(op_name)[0]
|
164
|
-
if n.is_activation_quantization_enabled():
|
165
|
-
base_nbits = n.candidates_quantization_cfg[0].activation_quantization_cfg.activation_n_bits
|
166
|
-
mem += _compute_node_activation_memory(n, nodes_act_nbits.get(op_name, base_nbits))
|
167
|
-
|
168
|
-
activation_cut_memory.append(mem)
|
169
|
-
|
170
|
-
return np.array(activation_cut_memory)
|
171
|
-
|
172
|
-
|
173
|
-
# TODO maxcut: add test for this function and remove no cover
|
174
|
-
def activation_output_size_utilization(mp_cfg: List[int],
|
175
|
-
graph: Graph,
|
176
|
-
fw_info: FrameworkInfo,
|
177
|
-
fw_impl: FrameworkImplementation) -> np.ndarray: # pragma: no cover
|
178
|
-
"""
|
179
|
-
Computes a resource utilization vector with the respective output memory size for each activation configurable node,
|
180
|
-
according to the given mixed-precision configuration.
|
181
|
-
If an empty configuration is given, then computes resource utilization vector for non-configurable nodes.
|
182
|
-
|
183
|
-
Args:
|
184
|
-
mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node)
|
185
|
-
graph: Graph object.
|
186
|
-
fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize)
|
187
|
-
(not used in this method).
|
188
|
-
fw_impl: FrameworkImplementation object with specific framework methods implementation(not used in this method).
|
189
|
-
|
190
|
-
Returns: A vector of node's activation memory sizes.
|
191
|
-
Note that the vector is not necessarily of the same length as the given config.
|
192
|
-
|
193
|
-
"""
|
194
|
-
activation_memory = []
|
195
|
-
mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info)
|
196
|
-
activation_mp_nodes = [n.name for n in graph.get_sorted_activation_configurable_nodes()]
|
197
|
-
|
198
|
-
if len(mp_cfg) == 0:
|
199
|
-
# Computing non-configurable nodes resource utilization
|
200
|
-
for n in graph.nodes:
|
201
|
-
non_configurable_node = n.name not in activation_mp_nodes \
|
202
|
-
and n.has_activation_quantization_enabled_candidate() \
|
203
|
-
and n.is_all_activation_candidates_equal()
|
204
|
-
|
205
|
-
if non_configurable_node:
|
206
|
-
node_nbits = n.candidates_quantization_cfg[0].activation_quantization_cfg.activation_n_bits
|
207
|
-
node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_nbits)
|
208
|
-
activation_memory.append(node_activation_memory_in_bytes)
|
209
|
-
else:
|
210
|
-
# Go over all nodes that should be taken into consideration when computing the weights memory utilization.
|
211
|
-
for n in graph.get_sorted_activation_configurable_nodes():
|
212
|
-
node_idx = mp_nodes.index(n.name)
|
213
|
-
node_qc = n.candidates_quantization_cfg[mp_cfg[node_idx]]
|
214
|
-
node_nbits = node_qc.activation_quantization_cfg.activation_n_bits
|
215
|
-
|
216
|
-
node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_nbits)
|
217
|
-
|
218
|
-
activation_memory.append(node_activation_memory_in_bytes)
|
219
|
-
|
220
|
-
return np.array(activation_memory)
|
221
|
-
|
222
|
-
|
223
|
-
def total_weights_activation_utilization(mp_cfg: List[int],
|
224
|
-
graph: Graph,
|
225
|
-
fw_info: FrameworkInfo,
|
226
|
-
fw_impl: FrameworkImplementation) -> np.ndarray:
|
227
|
-
"""
|
228
|
-
Computes resource utilization tensor with the respective weights size and output memory size for each activation configurable node,
|
229
|
-
according to the given mixed-precision configuration.
|
230
|
-
If an empty configuration is given, then computes resource utilization vector for non-configurable nodes.
|
150
|
+
return {}
|
231
151
|
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
(not used in this method).
|
237
|
-
fw_impl: FrameworkImplementation object with specific framework methods implementation(not used in this method).
|
152
|
+
def _activation_tensor_utilization(self, act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]]):
|
153
|
+
"""
|
154
|
+
Compute activation tensors utilization for configurable nodes if configuration is passed or
|
155
|
+
for non-configurable nodes otherwise.
|
238
156
|
|
239
|
-
|
240
|
-
|
157
|
+
Args:
|
158
|
+
act_qcs: activation quantization configuration or None.
|
241
159
|
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
non_configurable = False
|
252
|
-
node_weights_memory_in_bytes, node_activation_memory_in_bytes = 0, 0
|
253
|
-
|
254
|
-
# Non-configurable Weights
|
255
|
-
# TODO: currently considering only kernel attributes in weights memory utilization.
|
256
|
-
# When enabling multi-attribute quantization we need to modify this method to count all attributes.
|
257
|
-
kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0]
|
258
|
-
if kernel_attr is not None:
|
259
|
-
is_non_configurable_weights = n.name not in weights_mp_nodes and \
|
260
|
-
n.is_all_weights_candidates_equal(kernel_attr) and \
|
261
|
-
not n.reuse
|
262
|
-
|
263
|
-
if is_non_configurable_weights:
|
264
|
-
node_nbits = (n.candidates_quantization_cfg[0].weights_quantization_cfg
|
265
|
-
.get_attr_config(kernel_attr).weights_n_bits)
|
266
|
-
node_weights_memory_in_bytes = _compute_node_weights_memory(n, node_nbits, fw_info)
|
267
|
-
non_configurable = True
|
268
|
-
|
269
|
-
# Non-configurable Activation
|
270
|
-
is_non_configurable_activation = n.name not in activation_mp_nodes and \
|
271
|
-
n.has_activation_quantization_enabled_candidate() and \
|
272
|
-
n.is_all_activation_candidates_equal()
|
273
|
-
|
274
|
-
if is_non_configurable_activation:
|
275
|
-
node_nbits = n.candidates_quantization_cfg[0].activation_quantization_cfg.activation_n_bits
|
276
|
-
node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_nbits)
|
277
|
-
non_configurable = True
|
278
|
-
|
279
|
-
if non_configurable:
|
280
|
-
weights_activation_memory.append(
|
281
|
-
np.array([node_weights_memory_in_bytes, node_activation_memory_in_bytes]))
|
282
|
-
else:
|
283
|
-
# Go over all nodes that should be taken into consideration when computing the weights or
|
284
|
-
# activation memory utilization (all configurable nodes).
|
285
|
-
for node_idx, n in enumerate(graph.get_configurable_sorted_nodes(fw_info)):
|
286
|
-
# TODO: currently considering only kernel attributes in weights memory utilization. When enabling multi-attribute
|
287
|
-
# quantization we need to modify this method to count all attributes.
|
288
|
-
|
289
|
-
node_qc = n.candidates_quantization_cfg[mp_cfg[node_idx]]
|
290
|
-
|
291
|
-
# Compute node's weights memory (if no weights to quantize then set to 0)
|
292
|
-
node_weights_memory_in_bytes = 0
|
293
|
-
kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0]
|
294
|
-
if kernel_attr is not None:
|
295
|
-
if n.is_weights_quantization_enabled(kernel_attr) and not n.is_all_weights_candidates_equal(kernel_attr):
|
296
|
-
node_weights_nbits = node_qc.weights_quantization_cfg.get_attr_config(kernel_attr).weights_n_bits
|
297
|
-
node_weights_memory_in_bytes = _compute_node_weights_memory(n, node_weights_nbits, fw_info)
|
298
|
-
|
299
|
-
# Compute node's activation memory (if node's activation are not being quantized then set to 0)
|
300
|
-
node_activation_nbits = node_qc.activation_quantization_cfg.activation_n_bits
|
301
|
-
node_activation_memory_in_bytes = 0
|
302
|
-
if n.is_activation_quantization_enabled() and not n.is_all_activation_candidates_equal():
|
303
|
-
node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_activation_nbits)
|
304
|
-
|
305
|
-
weights_activation_memory.append(np.array([node_weights_memory_in_bytes, node_activation_memory_in_bytes]))
|
306
|
-
|
307
|
-
return np.array(weights_activation_memory)
|
308
|
-
|
309
|
-
|
310
|
-
def bops_utilization(mp_cfg: List[int],
|
311
|
-
graph: Graph,
|
312
|
-
fw_info: FrameworkInfo,
|
313
|
-
fw_impl: FrameworkImplementation,
|
314
|
-
set_constraints: bool = True) -> np.ndarray:
|
315
|
-
"""
|
316
|
-
Computes a resource utilization vector with the respective bit-operations (BOPS) count for each configurable node,
|
317
|
-
according to the given mixed-precision configuration of a virtual graph with composed nodes.
|
160
|
+
Returns:
|
161
|
+
Activation utilization per node.
|
162
|
+
"""
|
163
|
+
if act_qcs:
|
164
|
+
target_criterion = TargetInclusionCriterion.QConfigurable
|
165
|
+
bitwidth_mode = BitwidthMode.QCustom
|
166
|
+
else:
|
167
|
+
target_criterion = TargetInclusionCriterion.QNonConfigurable
|
168
|
+
bitwidth_mode = BitwidthMode.QDefaultSP
|
318
169
|
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
fw_impl: FrameworkImplementation object with specific framework methods implementation.
|
324
|
-
set_constraints: A flag for utilizing the method for resource utilization computation of a
|
325
|
-
given config not for LP formalization purposes.
|
170
|
+
_, nodes_util = self.ru_calculator.compute_activation_tensors_utilization(target_criterion=target_criterion,
|
171
|
+
bitwidth_mode=bitwidth_mode,
|
172
|
+
act_qcs=act_qcs)
|
173
|
+
return {n: u.bytes for n, u in nodes_util.items()}
|
326
174
|
|
327
|
-
|
328
|
-
|
175
|
+
def _bops_utilization(self, mp_cfg: List[int]):
|
176
|
+
"""
|
177
|
+
Computes a resource utilization vector with the respective bit-operations (BOPS) count for each configurable node,
|
178
|
+
according to the given mixed-precision configuration of a virtual graph with composed nodes.
|
329
179
|
|
330
|
-
|
331
|
-
|
332
|
-
if not set_constraints:
|
333
|
-
return _bops_utilization(mp_cfg,
|
334
|
-
graph,
|
335
|
-
fw_info,
|
336
|
-
fw_impl)
|
180
|
+
Args:
|
181
|
+
mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node)
|
337
182
|
|
338
|
-
|
339
|
-
|
183
|
+
Returns: A vector of node's BOPS count.
|
184
|
+
Note that the vector is not necessarily of the same length as the given config.
|
340
185
|
|
341
|
-
|
186
|
+
"""
|
187
|
+
# TODO keeping old implementation for now
|
342
188
|
|
343
|
-
|
344
|
-
|
189
|
+
# BOPs utilization method considers non-configurable nodes, therefore, it doesn't need separate implementation
|
190
|
+
# for non-configurable nodes for setting a constraint (no need for separate implementation for len(mp_cfg) = 0).
|
345
191
|
|
346
|
-
|
192
|
+
virtual_bops_nodes = [n for n in self.graph.get_topo_sorted_nodes() if isinstance(n, VirtualActivationWeightsNode)]
|
347
193
|
|
194
|
+
mp_nodes = self.graph.get_configurable_sorted_nodes_names(self.fw_info)
|
348
195
|
|
349
|
-
|
350
|
-
|
351
|
-
fw_info: FrameworkInfo,
|
352
|
-
fw_impl: FrameworkImplementation) -> np.ndarray:
|
353
|
-
"""
|
354
|
-
Computes a resource utilization vector with the respective bit-operations (BOPS) count for each configurable node,
|
355
|
-
according to the given mixed-precision configuration of an original graph.
|
196
|
+
bops = [n.get_bops_count(self.fw_impl, self.fw_info, candidate_idx=_get_node_cfg_idx(n, mp_cfg, mp_nodes))
|
197
|
+
for n in virtual_bops_nodes]
|
356
198
|
|
357
|
-
|
358
|
-
mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node)
|
359
|
-
graph: Graph object.
|
360
|
-
fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize).
|
361
|
-
fw_impl: FrameworkImplementation object with specific framework methods implementation.
|
362
|
-
|
363
|
-
Returns: A vector of node's BOPS count.
|
364
|
-
|
365
|
-
"""
|
366
|
-
|
367
|
-
mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info)
|
368
|
-
|
369
|
-
# Go over all nodes that should be taken into consideration when computing the BOPS utilization.
|
370
|
-
bops = []
|
371
|
-
for n in graph.get_topo_sorted_nodes():
|
372
|
-
if n.has_kernel_weight_to_quantize(fw_info) and not n.has_positional_weights:
|
373
|
-
# If node doesn't have weights then its MAC count is 0, and we shouldn't consider it in the BOPS count.
|
374
|
-
incoming_edges = graph.incoming_edges(n, sort_by_attr=EDGE_SINK_INDEX)
|
375
|
-
if len(incoming_edges) != 1:
|
376
|
-
Logger.critical(f"Unable to compute BOPS metric for node {n.name} due to multiple inputs.") # pragma: no cover
|
377
|
-
input_activation_node = incoming_edges[0].source_node
|
378
|
-
if len(graph.out_edges(input_activation_node)) > 1:
|
379
|
-
# In the case where the activation node has multiple outgoing edges
|
380
|
-
# we don't consider this edge in the BOPS utilization calculation
|
381
|
-
continue
|
382
|
-
|
383
|
-
input_activation_node_cfg = input_activation_node.candidates_quantization_cfg[_get_node_cfg_idx(input_activation_node, mp_cfg, mp_nodes)]
|
384
|
-
|
385
|
-
node_mac = fw_impl.get_node_mac_operations(n, fw_info)
|
386
|
-
|
387
|
-
node_qc = n.candidates_quantization_cfg[_get_node_cfg_idx(n, mp_cfg, mp_nodes)]
|
388
|
-
kenrel_node_qc = node_qc.weights_quantization_cfg.get_attr_config(fw_info.get_kernel_op_attributes(n.type)[0])
|
389
|
-
node_weights_nbits = kenrel_node_qc.weights_n_bits if \
|
390
|
-
kenrel_node_qc.enable_weights_quantization else FLOAT_BITWIDTH
|
391
|
-
input_activation_nbits = input_activation_node_cfg.activation_quantization_cfg.activation_n_bits if \
|
392
|
-
input_activation_node_cfg.activation_quantization_cfg.enable_activation_quantization else FLOAT_BITWIDTH
|
393
|
-
|
394
|
-
node_bops = node_weights_nbits * input_activation_nbits * node_mac
|
395
|
-
bops.append(node_bops)
|
396
|
-
|
397
|
-
return np.array(bops)
|
199
|
+
return np.array(bops)
|
398
200
|
|
399
201
|
|
400
202
|
def _get_node_cfg_idx(node: BaseNode, mp_cfg: List[int], sorted_configurable_nodes_names: List[str]) -> int:
|
@@ -414,115 +216,7 @@ def _get_node_cfg_idx(node: BaseNode, mp_cfg: List[int], sorted_configurable_nod
|
|
414
216
|
if node.name in sorted_configurable_nodes_names:
|
415
217
|
node_idx = sorted_configurable_nodes_names.index(node.name)
|
416
218
|
return mp_cfg[node_idx]
|
417
|
-
else:
|
219
|
+
else: # pragma: no cover
|
418
220
|
assert len(node.candidates_quantization_cfg) > 0, \
|
419
221
|
"Any node should have at least one candidate configuration."
|
420
222
|
return 0
|
421
|
-
|
422
|
-
|
423
|
-
def _get_origin_weights_node(n: BaseNode) -> BaseNode:
    """
    Resolve a possibly-virtual node to the original node that holds its weights.

    Used when a resource utilization computation runs on a virtual graph.

    Args:
        n: A possibly virtual node.

    Returns: The wrapped weights node if n is a VirtualActivationWeightsNode,
        the origin node if n is a VirtualSplitWeightsNode, otherwise n itself.

    """

    if isinstance(n, VirtualActivationWeightsNode):
        origin = n.original_weights_node
    elif isinstance(n, VirtualSplitWeightsNode):
        origin = n.origin_node
    else:
        # Not a virtual weights node: it already is the original node.
        origin = n

    return origin
|
441
|
-
|
442
|
-
|
443
|
-
def _get_origin_activation_node(n: BaseNode) -> BaseNode:
    """
    Resolve a possibly-virtual node to the original node that produces its activation.

    Used when a resource utilization computation runs on a virtual graph.

    Args:
        n: A possibly virtual node.

    Returns: The wrapped activation node if n is a VirtualActivationWeightsNode,
        the origin node if n is a VirtualSplitActivationNode, otherwise n itself.

    """

    if isinstance(n, VirtualActivationWeightsNode):
        origin = n.original_activation_node
    elif isinstance(n, VirtualSplitActivationNode):
        origin = n.origin_node
    else:
        # Not a virtual activation node: it already is the original node.
        origin = n

    return origin
|
461
|
-
|
462
|
-
|
463
|
-
def _compute_node_weights_memory(n: BaseNode, node_nbits: int, fw_info: FrameworkInfo) -> float:
    """
    Compute the memory taken by a node's kernel weights at a given bit-width.

    Args:
        n: The node whose weights' memory is computed (may be a virtual node).
        node_nbits: The bit-width in which the node's weights are quantized.
        fw_info: FrameworkInfo object about the specific framework.

    Returns: Number of kernel weight parameters multiplied by node_nbits and
        divided by BITS_TO_BYTES.

    """

    # Weights are read from the original node, in case n is a virtual node.
    weights_node = _get_origin_weights_node(n)
    kernel_attrs = fw_info.get_kernel_op_attributes(weights_node.type)

    # Count parameters over every kernel attribute; None entries are skipped.
    total_params = sum(
        weights_node.get_weights_by_keys(attr).flatten().shape[0]
        for attr in kernel_attrs
        if attr is not None
    )

    return total_params * node_nbits / BITS_TO_BYTES
|
484
|
-
|
485
|
-
|
486
|
-
def _compute_node_activation_memory(n: BaseNode, node_nbits: int) -> float:
    """
    Compute the memory taken by a node's activation tensor at a given bit-width.

    Args:
        n: The node whose activation tensor memory is computed (may be a virtual node).
        node_nbits: The bit-width in which the node's activation tensor is quantized.

    Returns: The node's total output parameters multiplied by node_nbits and
        divided by BITS_TO_BYTES.

    """

    # The output size is read from the original node, in case n is a virtual node.
    output_params = _get_origin_activation_node(n).get_total_output_params()

    return output_params * node_nbits / BITS_TO_BYTES
|
502
|
-
|
503
|
-
|
504
|
-
class MpRuMetric(Enum):
    """
    Resource utilization computation functions, one per utilization metric.

    Each member's value is a callable; calling the member itself forwards the
    given positional arguments to that callable.

    WEIGHTS_SIZE - applies the weights_size_utilization function

    ACTIVATION_MAXCUT_SIZE - applies the activation_maxcut_size_utilization function

    ACTIVATION_OUTPUT_SIZE - applies the activation_output_size_utilization function

    TOTAL_WEIGHTS_ACTIVATION_SIZE - applies the total_weights_activation_utilization function

    BOPS_COUNT - applies the bops_utilization function

    """

    # The functions are wrapped in partial so they are stored as member values
    # (a plain function assigned in an Enum body is treated as a method).
    # NOTE(review): recent Python releases change how partial objects behave in
    # class bodies (see the enum.member documentation) - confirm before upgrading.
    WEIGHTS_SIZE = partial(weights_size_utilization)
    ACTIVATION_MAXCUT_SIZE = partial(activation_maxcut_size_utilization)
    ACTIVATION_OUTPUT_SIZE = partial(activation_output_size_utilization)
    TOTAL_WEIGHTS_ACTIVATION_SIZE = partial(total_weights_activation_utilization)
    BOPS_COUNT = partial(bops_utilization)

    def __call__(self, *args):
        # Delegate to the utilization function stored as this member's value.
        compute_fn = self.value
        return compute_fn(*args)
|