mct-nightly 1.7.1.31122022.post351-py3-none-any.whl → 1.8.0.1042023.post423-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mct_nightly-1.7.1.31122022.post351.dist-info → mct_nightly-1.8.0.1042023.post423.dist-info}/METADATA +16 -16
- {mct_nightly-1.7.1.31122022.post351.dist-info → mct_nightly-1.8.0.1042023.post423.dist-info}/RECORD +193 -150
- {mct_nightly-1.7.1.31122022.post351.dist-info → mct_nightly-1.8.0.1042023.post423.dist-info}/WHEEL +1 -1
- model_compression_toolkit/__init__.py +13 -14
- model_compression_toolkit/core/common/back2framework/base_model_builder.py +1 -1
- model_compression_toolkit/core/common/collectors/base_collector.py +7 -4
- model_compression_toolkit/core/common/collectors/statistics_collector.py +2 -2
- model_compression_toolkit/core/common/constants.py +9 -4
- model_compression_toolkit/core/common/framework_implementation.py +32 -30
- model_compression_toolkit/core/common/graph/base_graph.py +8 -6
- model_compression_toolkit/core/common/logger.py +10 -2
- model_compression_toolkit/core/common/matchers/base_matcher.py +3 -3
- model_compression_toolkit/core/common/mixed_precision/mixed_precision_quantization_config.py +2 -1
- model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +2 -2
- model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +2 -2
- model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +6 -1
- model_compression_toolkit/core/common/model_validation.py +2 -1
- model_compression_toolkit/core/common/quantization/node_quantization_config.py +3 -1
- model_compression_toolkit/core/common/quantization/quantization_params_fn_selection.py +7 -4
- model_compression_toolkit/core/common/quantization/quantization_params_generation/lut_kmeans_params.py +4 -2
- model_compression_toolkit/core/common/quantization/quantization_params_generation/qparams_activations_computation.py +14 -17
- model_compression_toolkit/core/common/quantization/quantizers/quantizers_helpers.py +9 -2
- model_compression_toolkit/core/common/quantization/quantizers/uniform_quantizers.py +5 -4
- model_compression_toolkit/core/common/quantization/set_node_quantization_config.py +3 -3
- model_compression_toolkit/core/common/substitutions/batchnorm_reconstruction.py +7 -0
- model_compression_toolkit/core/common/substitutions/batchnorm_refusing.py +13 -8
- model_compression_toolkit/core/common/substitutions/shift_negative_activation.py +17 -12
- model_compression_toolkit/core/common/substitutions/weights_activation_split.py +1 -1
- model_compression_toolkit/core/common/target_platform/current_tp_model.py +3 -1
- model_compression_toolkit/core/common/target_platform/targetplatform2framework/attribute_filter.py +17 -4
- model_compression_toolkit/core/common/target_platform/targetplatform2framework/operations_to_layers.py +2 -4
- model_compression_toolkit/core/common/target_platform/targetplatform2framework/target_platform_capabilities.py +3 -5
- model_compression_toolkit/core/keras/back2framework/instance_builder.py +12 -21
- model_compression_toolkit/core/keras/back2framework/keras_model_builder.py +40 -14
- model_compression_toolkit/core/keras/back2framework/model_gradients.py +51 -27
- model_compression_toolkit/core/keras/constants.py +1 -0
- model_compression_toolkit/core/keras/graph_substitutions/substitutions/multi_head_attention_decomposition.py +2 -1
- model_compression_toolkit/core/keras/kpi_data_facade.py +2 -2
- model_compression_toolkit/core/keras/quantization_facade.py +3 -3
- model_compression_toolkit/core/keras/quantizer/fake_quant_builder.py +15 -9
- model_compression_toolkit/core/keras/quantizer/input_layer_quantize_transform.py +2 -1
- model_compression_toolkit/core/keras/quantizer/lut_fake_quant.py +1 -1
- model_compression_toolkit/core/keras/reader/common.py +3 -2
- model_compression_toolkit/core/pytorch/back2framework/instance_builder.py +14 -1
- model_compression_toolkit/core/pytorch/back2framework/model_gradients.py +88 -46
- model_compression_toolkit/core/pytorch/back2framework/pytorch_model_builder.py +27 -12
- model_compression_toolkit/core/pytorch/back2framework/quantization_wrapper/wrapper_quantize_config.py +2 -3
- model_compression_toolkit/core/pytorch/constants.py +5 -0
- model_compression_toolkit/core/pytorch/graph_substitutions/substitutions/multi_head_attention_decomposition.py +9 -14
- model_compression_toolkit/core/pytorch/graph_substitutions/substitutions/reshape_with_static_shapes.py +16 -2
- model_compression_toolkit/core/pytorch/kpi_data_facade.py +2 -2
- model_compression_toolkit/core/pytorch/quantization_facade.py +2 -2
- model_compression_toolkit/core/pytorch/quantizer/fake_quant_builder.py +7 -5
- model_compression_toolkit/core/pytorch/quantizer/lut_fake_quant.py +1 -1
- model_compression_toolkit/core/tpc_models/get_target_platform_capabilities.py +6 -2
- model_compression_toolkit/{exporter/model_wrapper/keras/quantize_configs → core/tpc_models/imx500_tpc}/__init__.py +1 -1
- model_compression_toolkit/core/tpc_models/imx500_tpc/latest/__init__.py +24 -0
- model_compression_toolkit/core/tpc_models/imx500_tpc/target_platform_capabilities.py +45 -0
- model_compression_toolkit/core/tpc_models/imx500_tpc/v1/__init__.py +16 -0
- model_compression_toolkit/core/tpc_models/imx500_tpc/v1/tp_model.py +156 -0
- model_compression_toolkit/core/tpc_models/imx500_tpc/v1/tpc_keras.py +101 -0
- model_compression_toolkit/core/tpc_models/imx500_tpc/v1/tpc_pytorch.py +95 -0
- model_compression_toolkit/exporter/__init__.py +5 -0
- model_compression_toolkit/exporter/model_exporter/__init__.py +0 -12
- model_compression_toolkit/exporter/model_exporter/fw_agonstic/exporter.py +1 -1
- model_compression_toolkit/exporter/model_exporter/keras/fakely_quant_keras_exporter.py +12 -39
- model_compression_toolkit/exporter/model_exporter/keras/keras_export_facade.py +39 -27
- model_compression_toolkit/exporter/model_exporter/pytorch/fakely_quant_onnx_pytorch_exporter.py +10 -2
- model_compression_toolkit/exporter/model_exporter/pytorch/fakely_quant_torchscript_pytorch_exporter.py +6 -2
- model_compression_toolkit/exporter/model_exporter/pytorch/pytorch_export_facade.py +48 -35
- model_compression_toolkit/exporter/model_exporter/tflite/fakely_quant_tflite_exporter.py +3 -2
- model_compression_toolkit/exporter/model_exporter/tflite/int8_tflite_exporter.py +180 -0
- model_compression_toolkit/exporter/model_exporter/tflite/tflite_export_facade.py +44 -26
- model_compression_toolkit/exporter/model_wrapper/__init__.py +4 -4
- model_compression_toolkit/exporter/model_wrapper/keras/builder/fully_quantized_model_builder.py +34 -137
- model_compression_toolkit/exporter/model_wrapper/keras/builder/node_to_quantizer.py +143 -0
- model_compression_toolkit/exporter/model_wrapper/keras/builder/node_to_quantizers.py +46 -0
- model_compression_toolkit/exporter/model_wrapper/keras/validate_layer.py +56 -22
- model_compression_toolkit/exporter/model_wrapper/pytorch/builder/fully_quantized_model_builder.py +29 -112
- model_compression_toolkit/exporter/model_wrapper/pytorch/builder/node_to_quantizer.py +83 -79
- model_compression_toolkit/exporter/model_wrapper/pytorch/builder/node_to_quantizers.py +47 -0
- model_compression_toolkit/exporter/model_wrapper/pytorch/validate_layer.py +44 -0
- model_compression_toolkit/gptq/__init__.py +6 -0
- model_compression_toolkit/gptq/common/gptq_config.py +57 -127
- model_compression_toolkit/gptq/common/gptq_constants.py +20 -6
- model_compression_toolkit/gptq/common/gptq_graph.py +22 -0
- model_compression_toolkit/gptq/common/gptq_training.py +32 -26
- model_compression_toolkit/gptq/keras/gptq_loss.py +1 -1
- model_compression_toolkit/gptq/keras/gptq_training.py +73 -39
- model_compression_toolkit/gptq/keras/graph_info.py +24 -43
- model_compression_toolkit/gptq/keras/quantization_facade.py +10 -18
- model_compression_toolkit/gptq/keras/quantizer/__init__.py +2 -1
- model_compression_toolkit/gptq/keras/quantizer/base_keras_gptq_quantizer.py +112 -0
- model_compression_toolkit/gptq/keras/quantizer/quant_utils.py +13 -14
- model_compression_toolkit/gptq/keras/quantizer/quantization_builder.py +78 -0
- model_compression_toolkit/gptq/keras/quantizer/regularization_factory.py +45 -0
- model_compression_toolkit/gptq/keras/{optimizers → quantizer/soft_rounding}/__init__.py +1 -1
- model_compression_toolkit/gptq/keras/quantizer/soft_rounding/soft_quantizer_reg.py +112 -0
- model_compression_toolkit/gptq/keras/quantizer/soft_rounding/symmetric_soft_quantizer.py +256 -0
- model_compression_toolkit/gptq/keras/quantizer/ste_rounding/symmetric_ste.py +68 -168
- model_compression_toolkit/gptq/pytorch/gptq_training.py +78 -39
- model_compression_toolkit/gptq/pytorch/graph_info.py +81 -0
- model_compression_toolkit/gptq/pytorch/quantization_facade.py +12 -18
- model_compression_toolkit/gptq/pytorch/quantizer/__init__.py +5 -1
- model_compression_toolkit/gptq/pytorch/quantizer/base_pytorch_gptq_quantizer.py +92 -0
- model_compression_toolkit/gptq/pytorch/quantizer/quant_utils.py +10 -119
- model_compression_toolkit/gptq/pytorch/quantizer/quantization_builder.py +75 -0
- model_compression_toolkit/gptq/pytorch/quantizer/regularization_factory.py +45 -0
- model_compression_toolkit/{exporter/model_wrapper/keras/quantizers → gptq/pytorch/quantizer/soft_rounding}/__init__.py +1 -1
- model_compression_toolkit/gptq/pytorch/quantizer/soft_rounding/soft_quantizer_reg.py +115 -0
- model_compression_toolkit/gptq/pytorch/quantizer/soft_rounding/symmetric_soft_quantizer.py +244 -0
- model_compression_toolkit/gptq/pytorch/quantizer/soft_rounding/uniform_soft_quantizer.py +196 -0
- model_compression_toolkit/gptq/pytorch/quantizer/ste_rounding/symmetric_ste.py +182 -0
- model_compression_toolkit/ptq/keras/quantization_facade.py +3 -3
- model_compression_toolkit/ptq/pytorch/quantization_facade.py +7 -6
- model_compression_toolkit/qat/common/qat_config.py +68 -0
- model_compression_toolkit/qat/keras/quantization_facade.py +55 -48
- model_compression_toolkit/qat/keras/quantizer/__init__.py +3 -0
- model_compression_toolkit/qat/keras/quantizer/base_keras_qat_quantizer.py +49 -0
- model_compression_toolkit/qat/keras/quantizer/quant_utils.py +48 -0
- model_compression_toolkit/qat/keras/quantizer/quantization_builder.py +77 -0
- model_compression_toolkit/qat/keras/quantizer/ste_rounding/symmetric_ste.py +283 -0
- model_compression_toolkit/qat/keras/quantizer/ste_rounding/uniform_ste.py +158 -46
- model_compression_toolkit/qat/pytorch/quantization_facade.py +190 -11
- model_compression_toolkit/qat/pytorch/quantizer/__init__.py +17 -0
- model_compression_toolkit/qat/pytorch/quantizer/base_pytorch_qat_quantizer.py +49 -0
- model_compression_toolkit/qat/pytorch/quantizer/quantization_builder.py +74 -0
- model_compression_toolkit/qat/pytorch/quantizer/quantizer_utils.py +136 -0
- model_compression_toolkit/qat/pytorch/quantizer/ste_rounding/symmetric_ste.py +204 -0
- model_compression_toolkit/qat/pytorch/quantizer/ste_rounding/uniform_ste.py +190 -0
- model_compression_toolkit/quantizers_infrastructure/__init__.py +23 -0
- model_compression_toolkit/{gptq/keras/quantizer/configs → quantizers_infrastructure/inferable_infrastructure}/__init__.py +1 -1
- model_compression_toolkit/{gptq/keras/quantizer/gumbel_rounding → quantizers_infrastructure/inferable_infrastructure/common}/__init__.py +1 -1
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/common/base_inferable_quantizer.py +87 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/common/constants.py +41 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/common/get_all_subclasses.py +31 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/common/get_quantizers.py +53 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/common/quant_utils.py +49 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/__init__.py +14 -0
- model_compression_toolkit/{qunatizers_infrastructure → quantizers_infrastructure/inferable_infrastructure}/keras/load_model.py +26 -8
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantize_wrapper.py +345 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizer_utils.py +85 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/__init__.py +27 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/activation_inferable_quantizers/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/activation_inferable_quantizers/activation_lut_pot_inferable_quantizer.py +148 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/activation_inferable_quantizers/activation_pot_inferable_quantizer.py +65 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/activation_inferable_quantizers/activation_symmetric_inferable_quantizer.py +86 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/activation_inferable_quantizers/activation_uniform_inferable_quantizer.py +111 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/base_keras_inferable_quantizer.py +56 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/constants.py +25 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/weights_inferable_quantizers/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/weights_inferable_quantizers/weights_lut_pot_inferable_quantizer.py +79 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/weights_inferable_quantizers/weights_lut_symmetric_inferable_quantizer.py +179 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/weights_inferable_quantizers/weights_pot_inferable_quantizer.py +67 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/weights_inferable_quantizers/weights_symmetric_inferable_quantizer.py +87 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/quantizers/weights_inferable_quantizers/weights_uniform_inferable_quantizer.py +163 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/keras/validation_functions.py +66 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantize_wrapper.py +269 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizer_utils.py +152 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/__init__.py +35 -0
- model_compression_toolkit/{exporter/model_wrapper/pytorch/quantizers → quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/activation_inferable_quantizers}/__init__.py +1 -1
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/activation_inferable_quantizers/activation_lut_pot_inferable_quantizer.py +97 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/activation_inferable_quantizers/activation_pot_inferable_quantizer.py +62 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/activation_inferable_quantizers/activation_symmetric_inferable_quantizer.py +83 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/activation_inferable_quantizers/activation_uniform_inferable_quantizer.py +100 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/base_lut_symmetric_inferable_quantizer.py +95 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/base_pytorch_inferable_quantizer.py +48 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/base_symmetric_inferable_quantizer.py +70 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/base_uniform_inferable_quantizer.py +57 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/constants.py +26 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/weights_inferable_quantizers/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/weights_inferable_quantizers/weights_lut_pot_inferable_quantizer.py +77 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/weights_inferable_quantizers/weights_lut_symmetric_inferable_quantizer.py +106 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/weights_inferable_quantizers/weights_pot_inferable_quantizer.py +66 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/weights_inferable_quantizers/weights_symmetric_inferable_quantizer.py +104 -0
- model_compression_toolkit/quantizers_infrastructure/inferable_infrastructure/pytorch/quantizers/weights_inferable_quantizers/weights_uniform_inferable_quantizer.py +109 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/common/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/common/base_trainable_quantizer.py +200 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/common/get_quantizer_config.py +116 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/common/get_quantizers.py +65 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/common/quant_utils.py +36 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/common/trainable_quantizer_config.py +97 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/keras/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/keras/base_keras_quantizer.py +90 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/keras/config_serialization.py +80 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/keras/quantizer_utils.py +48 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/pytorch/__init__.py +14 -0
- model_compression_toolkit/quantizers_infrastructure/trainable_infrastructure/pytorch/base_pytorch_quantizer.py +66 -0
- model_compression_toolkit/exporter/model_wrapper/keras/builder/quantize_config_to_node.py +0 -66
- model_compression_toolkit/exporter/model_wrapper/keras/builder/quantizer_to_node.py +0 -134
- model_compression_toolkit/exporter/model_wrapper/keras/extended_quantize_wrapper.py +0 -81
- model_compression_toolkit/exporter/model_wrapper/keras/quantize_configs/activation_quantize_config.py +0 -81
- model_compression_toolkit/exporter/model_wrapper/keras/quantize_configs/weights_activation_quantize_config.py +0 -128
- model_compression_toolkit/exporter/model_wrapper/keras/quantize_configs/weights_quantize_config.py +0 -107
- model_compression_toolkit/exporter/model_wrapper/keras/quantizers/fq_quantizer.py +0 -99
- model_compression_toolkit/exporter/model_wrapper/keras/quantizers/weights_uniform_quantizer.py +0 -105
- model_compression_toolkit/exporter/model_wrapper/pytorch/builder/node_to_quantize_config.py +0 -61
- model_compression_toolkit/exporter/model_wrapper/pytorch/quantizers/fq_quantizer.py +0 -59
- model_compression_toolkit/exporter/model_wrapper/pytorch/quantizers/uniform_weights_quantizer.py +0 -67
- model_compression_toolkit/exporter/model_wrapper/pytorch/wrappers_quantize_configs/activation_quantize_config.py +0 -52
- model_compression_toolkit/exporter/model_wrapper/pytorch/wrappers_quantize_configs/no_quantization_quantize_config.py +0 -46
- model_compression_toolkit/exporter/model_wrapper/pytorch/wrappers_quantize_configs/weights_activation_quantize_config.py +0 -54
- model_compression_toolkit/exporter/model_wrapper/pytorch/wrappers_quantize_configs/weights_quantize_config.py +0 -52
- model_compression_toolkit/gptq/keras/gptq_model_builder.py +0 -104
- model_compression_toolkit/gptq/keras/optimizers/sam_optimizer.py +0 -119
- model_compression_toolkit/gptq/keras/quantizer/config_factory.py +0 -62
- model_compression_toolkit/gptq/keras/quantizer/configs/base_quantizer_gptq_config.py +0 -65
- model_compression_toolkit/gptq/keras/quantizer/configs/weight_quantizer_gptq_config.py +0 -269
- model_compression_toolkit/gptq/keras/quantizer/gumbel_rounding/base_gumbel_rounding.py +0 -263
- model_compression_toolkit/gptq/keras/quantizer/gumbel_rounding/gumbel_softmax.py +0 -75
- model_compression_toolkit/gptq/keras/quantizer/gumbel_rounding/symmetric_gumbel.py +0 -266
- model_compression_toolkit/gptq/keras/quantizer/gumbel_rounding/uniform_gumbel.py +0 -247
- model_compression_toolkit/gptq/keras/quantizer/kernel_functions.py +0 -50
- model_compression_toolkit/gptq/keras/quantizer/ste_rounding/uniform_ste.py +0 -49
- model_compression_toolkit/gptq/pytorch/gptq_graph_info.py +0 -94
- model_compression_toolkit/gptq/pytorch/gptq_model_builder.py +0 -113
- model_compression_toolkit/gptq/pytorch/quantizer/gptq_quantizer.py +0 -71
- model_compression_toolkit/gptq/pytorch/quantizer/gumbel_rounding/__init__.py +0 -14
- model_compression_toolkit/gptq/pytorch/quantizer/gumbel_rounding/base_gumbel_weights_quantizer.py +0 -157
- model_compression_toolkit/gptq/pytorch/quantizer/gumbel_rounding/sym_gumbel_weights_quantizer.py +0 -150
- model_compression_toolkit/gptq/pytorch/quantizer/gumbel_rounding/uniform_gumbel_weights_quantizer.py +0 -143
- model_compression_toolkit/gptq/pytorch/quantizer/quantizer_wrapper.py +0 -103
- model_compression_toolkit/gptq/pytorch/quantizer/ste_rounding/ste_weights_quantizer.py +0 -103
- model_compression_toolkit/qat/keras/qat_model_builder.py +0 -105
- model_compression_toolkit/qat/keras/quantizer/quantization_dispatcher_builder.py +0 -56
- model_compression_toolkit/qat/keras/quantizer/ste_rounding/symmetirc_ste.py +0 -145
- model_compression_toolkit/qunatizers_infrastructure/__init__.py +0 -8
- model_compression_toolkit/qunatizers_infrastructure/common/__init__.py +0 -14
- model_compression_toolkit/qunatizers_infrastructure/common/base_quantizer.py +0 -123
- model_compression_toolkit/qunatizers_infrastructure/common/node_quantization_dispatcher.py +0 -65
- model_compression_toolkit/qunatizers_infrastructure/keras/__init__.py +0 -14
- model_compression_toolkit/qunatizers_infrastructure/keras/base_keras_quantizer.py +0 -75
- model_compression_toolkit/qunatizers_infrastructure/keras/config_serialization.py +0 -83
- model_compression_toolkit/qunatizers_infrastructure/keras/keras_node_quantization_dispatcher.py +0 -74
- model_compression_toolkit/qunatizers_infrastructure/keras/quantize_wrapper.py +0 -194
- model_compression_toolkit/qunatizers_infrastructure/pytorch/__init__.py +0 -0
- {mct_nightly-1.7.1.31122022.post351.dist-info → mct_nightly-1.8.0.1042023.post423.dist-info}/LICENSE.md +0 -0
- {mct_nightly-1.7.1.31122022.post351.dist-info → mct_nightly-1.8.0.1042023.post423.dist-info}/top_level.txt +0 -0
- /model_compression_toolkit/{exporter/model_wrapper/pytorch/wrappers_quantize_configs → qat/pytorch/quantizer/ste_rounding}/__init__.py +0 -0
model_compression_toolkit/core/keras/back2framework/keras_model_builder.py:

@@ -16,11 +16,13 @@
 from abc import abstractmethod
 
 import tensorflow as tf
-from keras.
+from keras.engine.input_layer import InputLayer
+from keras.models import Model, clone_model
 from packaging import version
 
 from model_compression_toolkit.core.common.back2framework.base_model_builder import BaseModelBuilder
 from model_compression_toolkit.core.common.user_info import UserInformation
+from model_compression_toolkit.core.common.constants import INPUT_BASE_NAME
 
 # As from Tensorflow 2.6, keras is a separate package and some classes should be imported differently.
 if version.parse(tf.__version__) < version.parse("2.6"):
@@ -42,7 +44,7 @@ from model_compression_toolkit.core.common.framework_info import FrameworkInfo
 from model_compression_toolkit.core.keras.default_framework_info import DEFAULT_KERAS_INFO
 from model_compression_toolkit.core.common import BaseNode
 from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX
-from model_compression_toolkit.core.keras.back2framework.instance_builder import OperationHandler
+from model_compression_toolkit.core.keras.back2framework.instance_builder import OperationHandler
 from model_compression_toolkit.core.keras.reader.connectivity_handler import OutTensor
 
 # In tf2.3 fake quant node is implemented as TensorFlowOpLayer, while in tf2.4 as TFOpLambda.
@@ -93,7 +95,7 @@ class KerasModelBuilder(BaseModelBuilder):
                  append2output=None,
                  fw_info: FrameworkInfo = DEFAULT_KERAS_INFO,
                  return_float_outputs: bool = False,
-                 wrapper: Callable =
+                 wrapper: Callable = None):
         """
 
         Args:
@@ -101,6 +103,7 @@ class KerasModelBuilder(BaseModelBuilder):
             append2output: Nodes to append to model's output.
             fw_info: Information about the specific framework of the model that is built.
             return_float_outputs: Whether the model returns float tensors or not.
+            wrapper: A function wrapper keras Layers.
         """
 
         super().__init__(graph,
@@ -109,9 +112,9 @@ class KerasModelBuilder(BaseModelBuilder):
                          return_float_outputs)
 
         # Build an OperationHandler to handle conversions from graph nodes to Keras operators.
-        self.oh = OperationHandler(self.graph
+        self.oh = OperationHandler(self.graph)
+        self.wrapper = wrapper
 
-    @abstractmethod
     def _quantize_node_activations(self,
                                    node: BaseNode,
                                    input_tensors: List[TFReference]) -> List[TFReference]:
@@ -126,7 +129,8 @@ class KerasModelBuilder(BaseModelBuilder):
             Output of the node.
 
         """
-        raise NotImplemented(f'{self.__class__.__name__}
+        raise NotImplemented(f'{self.__class__.__name__} did not implement a method for quantizating '
+                             f'activation nodes.')  # pragma: no cover
 
     def build_model(self) -> Tuple[Model, UserInformation]:
         """
@@ -149,10 +153,17 @@ class KerasModelBuilder(BaseModelBuilder):
         # Hold a dictionary from an input node to its corresponding input tensor. It is needed for when
         # building the model. Initially input nodes with input tensors are added to the dictionary,
         # as they're not added later.
-        input_nodes_to_input_tensors = {inode: Input(inode.framework_attr[BATCH_INPUT_SHAPE][1:],
+        input_nodes_to_input_tensors = {inode: Input(inode.framework_attr[BATCH_INPUT_SHAPE][1:],
+                                                     name=f'{inode.name}_{INPUT_BASE_NAME}')
                                         for
                                         inode in self.graph.get_inputs()}
 
+        # Support adding Layer after input layers require us to store it in layer_to_node_dict
+        # dict offline (unlike other layers which stored during running).
+        for node, layer in self.oh.node_to_fw_op_dict.items():
+            if node.type == InputLayer:
+                self.oh.layer_to_node_dict[layer] = node
+
         # Build a list of the model's input tensors. Switching from a dictionary to a list
         # to keep the tensors input order, since inputs in Graph are ordered by their indices.
         inputs_list = []
@@ -198,6 +209,20 @@ class KerasModelBuilder(BaseModelBuilder):
 
         # Build the model.
         model = tf.keras.Model(inputs=inputs_list, outputs=model_output_tensors)
+
+        if self.wrapper is not None:
+            def _wrap(layer):
+                _node = self.oh.layer_to_node_dict.get(layer)
+                if _node is not None:
+                    return self.wrapper(_node, layer)
+                elif is_layer_fake_quant(layer):
+                    return layer
+                raise Exception(  # pragma: no cover
+                    f'Mismatch between keras model and graph cant find node named: '
+                    f'{get_node_name_from_layer(layer)}')
+
+            model = clone_model(model, clone_function=_wrap)
+
         return model, self.graph.user_info
 
     def _convert_node2name(self, in_node_to_output_tensors_dict):
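The hunk above is the heart of the new wrapper flow: the float model is built first, then rebuilt with Keras' clone_model, whose clone_function is invoked once per layer. A minimal standalone sketch of that pattern (the IdentityWrapper class and the toy model are illustrative, not part of MCT):

    import tensorflow as tf

    class IdentityWrapper(tf.keras.layers.Wrapper):
        # Stand-in for a quantize wrapper: holds the inner layer and calls it unchanged.
        def call(self, inputs, **kwargs):
            return self.layer(inputs)

    def _wrap(layer):
        # clone_function: return a replacement layer, or the layer itself to keep it.
        if isinstance(layer, tf.keras.layers.Dense):
            return IdentityWrapper(layer)
        return layer

    model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
    wrapped = tf.keras.models.clone_model(model, clone_function=_wrap)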
@@ -246,19 +271,20 @@ class KerasModelBuilder(BaseModelBuilder):
             input_tensors: List of references to Keras tensors that are the layer's inputs.
             op_func: Layer to apply to the input tensors.
             input_nodes_to_input_tensors: A dictionary from a node to its input tensors.
-            mode: model quantization mode from ModelBuilderMode
 
         Returns:
             A list of references to Keras tensors. The layer's output tensors after applying the
             layer to the input tensors.
         """
-
         if len(input_tensors) == 0:  # Placeholder handling
             out_tensors_of_n_float = input_nodes_to_input_tensors[n]
-
-
+            if self.wrapper is not None:
+                # if a wrapper is defined, add an identity layer for cloning. The Identity will be warpped
+                out_tensors_of_n = op_func(out_tensors_of_n_float)
+            elif n.is_activation_quantization_enabled():
                 out_tensors_of_n = self._quantize_node_activations(n, out_tensors_of_n_float)
-
+            else:
+                out_tensors_of_n = out_tensors_of_n_float
         else:
             input_tensors = [tensor for tensor_list in input_tensors for tensor in tensor_list]  # flat list of lists
             # Build a functional node using its args
@@ -275,8 +301,8 @@ class KerasModelBuilder(BaseModelBuilder):
             out_tensors_of_n_float = op_func(input_tensors)
             out_tensors_of_n = out_tensors_of_n_float
 
-            # Add a fake quant node if the node has an activation threshold
-            if n.is_activation_quantization_enabled():
+            # Add a fake quant node if the node has an activation threshold and a wrapper isn't defined
+            if n.is_activation_quantization_enabled() and self.wrapper is None:
                 out_tensors_of_n = self._quantize_node_activations(n, out_tensors_of_n_float)
 
         # Save a mapping from the layer that created the tensor to the node (as this layer is not the
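Taken together, these hunks define the wrapper contract: a callable receiving the MCT graph node and the Keras layer built for it, and returning the layer to place in the cloned model; while a wrapper is set, the builder skips its own activation fake-quant insertion. A hypothetical no-op wrapper that satisfies the contract:

    def debug_wrapper(node, layer):
        # node: the MCT graph node; layer: the Keras layer built for it.
        # Returning the layer unchanged reproduces the float model.
        print(f'wrapping {node.name} with layer {layer.__class__.__name__}')
        return layer

    # Hypothetical usage (other constructor arguments omitted):
    # builder = KerasModelBuilder(graph, wrapper=debug_wrapper)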
model_compression_toolkit/core/keras/back2framework/model_gradients.py:

@@ -20,13 +20,13 @@ from packaging import version
 from tqdm import tqdm
 
 if version.parse(tf.__version__) < version.parse("2.6"):
-    from tensorflow.python.keras.layers import Layer
+    from tensorflow.python.keras.layers import Layer  # pragma: no cover
 else:
     from keras.engine.base_layer import Layer
 
 from typing import Any, Dict, List, Tuple
 from tensorflow.python.util.object_identity import Reference as TFReference
-from model_compression_toolkit.core.common.constants import EPS
+from model_compression_toolkit.core.common.constants import EPS, MIN_JACOBIANS_ITER, JACOBIANS_COMP_TOLERANCE
 from model_compression_toolkit.core.common.graph.functional_node import FunctionalNode
 from model_compression_toolkit.core import common
 from model_compression_toolkit.core.common import BaseNode, Graph
@@ -128,7 +128,7 @@ def keras_iterative_approx_jacobian_trace(graph_float: common.Graph,
     """
 
     if not all([images.shape[0] == 1 for node, images in model_input_tensors.items()]):
-        Logger.critical("Iterative jacobian trace computation is only supported on a single image sample")
+        Logger.critical("Iterative jacobian trace computation is only supported on a single image sample")  # pragma: no cover
 
     with tf.GradientTape(persistent=True, watch_accessed_variables=False) as g:
         outputs, interest_points_tensors = _model_outputs_computation(graph_float,
@@ -136,32 +136,56 @@ def keras_iterative_approx_jacobian_trace(graph_float: common.Graph,
                                                                       interest_points,
                                                                       output_list,
                                                                       gradient_tape=g)
-   [22 removed lines not rendered in the source diff view]
+
+    # Concat outputs
+    # First, we need to unfold all outputs that are given as list, to extract the actual output tensors
+    unfold_outputs = []
+    for output in outputs:
+        if isinstance(output, List):
+            unfold_outputs += output
+        else:
+            unfold_outputs.append(output)
+
+    r_outputs = [tf.reshape(output, shape=[output.shape[0], -1]) for output in unfold_outputs]
+
+    concat_axis_dim = [o.shape[0] for o in r_outputs]
+    if not all(d == concat_axis_dim[0] for d in concat_axis_dim):
+        Logger.critical("Can't concat model's outputs for gradients calculation since the shape of the first axis "  # pragma: no cover
+                        "is not equal in all outputs.")
+
+    output = tf.concat(r_outputs, axis=1)
+
+    ipts_jac_trace_approx = []
+    for ipt in tqdm(interest_points_tensors):  # Per Interest point activation tensor
+        trace_jv = []
+        for j in range(n_iter):  # Approximation iterations
+            # Getting a random vector with normal distribution
+            v = tf.random.normal(shape=output.shape)
+            f_v = tf.reduce_sum(v * output)
+
+            with g.stop_recording():
+                # Computing the jacobian approximation by getting the gradient of (output * v)
+                jac_v = g.gradient(f_v, ipt, unconnected_gradients=tf.UnconnectedGradients.ZERO)
+                jac_v = tf.reshape(jac_v, [jac_v.shape[0], -1])
+                jac_trace_approx = tf.reduce_mean(tf.reduce_sum(tf.pow(jac_v, 2.0)))
+
+                # If the change to the mean Jacobian approximation is insignificant we stop the calculation
+                if j > MIN_JACOBIANS_ITER:
+                    new_mean = np.mean([jac_trace_approx, *trace_jv])
+                    delta = new_mean - np.mean(trace_jv)
+                    if np.abs(delta) / (np.abs(new_mean) + 1e-6) < JACOBIANS_COMP_TOLERANCE:
+                        trace_jv.append(jac_trace_approx)
+                        break
+
+            trace_jv.append(jac_trace_approx)
+        ipts_jac_trace_approx.append(2 * tf.reduce_mean(trace_jv) / output.shape[-1])  # Get averaged squared jacobian trace approximation
+
+    ipts_jac_trace_approx = tf.reduce_mean([ipts_jac_trace_approx], axis=0)  # Just to get one tensor instead of list of tensors with single element
+
     if norm_weights:
-        return _normalize_weights(
+        return _normalize_weights(ipts_jac_trace_approx, all_outputs_indices, alpha)
     else:
-        return
+        return ipts_jac_trace_approx
 
 
 def _model_outputs_computation(graph_float: common.Graph,
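Both the Keras and PyTorch rewrites implement a Hutchinson-style estimator: for a Jacobian J and a probe v ~ N(0, I), E[||Jᵀv||²] = E[vᵀJJᵀv] = trace(JJᵀ), so averaging the squared gradients of f_v = sum(v * output) over random probes approximates the Jacobian trace without ever materializing J; the new MIN_JACOBIANS_ITER and JACOBIANS_COMP_TOLERANCE constants stop the loop early once the running mean changes by less than the tolerance. A self-contained numpy sketch of the estimator on a toy matrix (not MCT code):

    import numpy as np

    rng = np.random.default_rng(0)
    J = rng.normal(size=(16, 32))       # toy Jacobian
    exact = np.trace(J @ J.T)

    estimates = []
    for _ in range(2000):               # cf. n_iter in the diff
        v = rng.normal(size=16)         # random probe vector
        jtv = J.T @ v                   # what autograd returns for grad of sum(v * output)
        estimates.append(jtv @ jtv)     # ||J^T v||^2, an unbiased sample of trace(J J^T)

    print(exact, np.mean(estimates))    # the two agree for enough probes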
model_compression_toolkit/core/keras/graph_substitutions/substitutions/multi_head_attention_decomposition.py:

@@ -23,6 +23,7 @@ else:
     from keras.layers.core import TFOpLambda
     from keras.layers import MultiHeadAttention, Conv2D, Softmax, Concatenate, Reshape, Permute
 
+from model_compression_toolkit.core.common.logger import Logger
 from model_compression_toolkit.core import common
 from model_compression_toolkit.core.common.graph.base_graph import Graph, BaseNode, OutTensor
 from model_compression_toolkit.core.common.graph.functional_node import FunctionalNode
@@ -448,7 +449,7 @@ class MultiHeadAttentionDecomposition(common.BaseSubstitution):
         """
 
         if mha_node.reuse:
-
+            Logger.error("MCT doesn't support reuse of MultiHeadAttention layer")  # pragma: no cover
         params = MHAParams(mha_node)
 
         mha_in_edges = graph.in_edges(mha_node)
model_compression_toolkit/core/keras/kpi_data_facade.py:

@@ -156,10 +156,10 @@ else:
     def keras_kpi_data(*args, **kwargs):
         Logger.critical('Installing tensorflow and tensorflow_model_optimization is mandatory '
                         'when using keras_kpi_data. '
-                        'Could not find Tensorflow package.')
+                        'Could not find Tensorflow package.')  # pragma: no cover
 
 
     def keras_kpi_data_experimental(*args, **kwargs):
         Logger.critical('Installing tensorflow and tensorflow_model_optimization is mandatory '
                         'when using keras_kpi_data. '
-                        'Could not find Tensorflow package.')
+                        'Could not find Tensorflow package.')  # pragma: no cover
model_compression_toolkit/core/keras/quantization_facade.py:

@@ -19,7 +19,7 @@ from model_compression_toolkit.core import common
 from model_compression_toolkit.core.common import Logger
 from model_compression_toolkit.core.common.constants import TENSORFLOW
 from model_compression_toolkit.core.common.user_info import UserInformation
-from model_compression_toolkit.gptq
+from model_compression_toolkit.gptq import GradientPTQConfig, GradientPTQConfigV2
 from model_compression_toolkit.core.common.mixed_precision.kpi_tools.kpi import KPI
 from model_compression_toolkit.core.common.framework_info import FrameworkInfo
 from model_compression_toolkit.core.common.network_editors.actions import EditRule
@@ -281,10 +281,10 @@ else:
    def keras_post_training_quantization(*args, **kwargs):
        Logger.critical('Installing tensorflow and tensorflow_model_optimization is mandatory '
                        'when using keras_post_training_quantization. '
-                        'Could not find Tensorflow package.')
+                        'Could not find Tensorflow package.')  # pragma: no cover
 
 
    def keras_post_training_quantization_mixed_precision(*args, **kwargs):
        Logger.critical('Installing tensorflow and tensorflow_model_optimization is mandatory '
                        'when using keras_post_training_quantization_mixed_precision. '
-                        'Could not find Tensorflow package.')
+                        'Could not find Tensorflow package.')  # pragma: no cover
model_compression_toolkit/core/keras/quantizer/fake_quant_builder.py:

@@ -20,7 +20,7 @@ import tensorflow as tf
 import numpy as np
 from tensorflow.python.util.object_identity import Reference as TFReference
 
-from model_compression_toolkit.core.common import Logger
+from model_compression_toolkit.core.common.logger import Logger
 from model_compression_toolkit.core.common.constants import THRESHOLD, SIGNED, RANGE_MIN, RANGE_MAX
 from model_compression_toolkit.core.common.quantization.quantizers.uniform_quantizers import threshold_is_power_of_two
@@ -68,10 +68,12 @@ def power_of_two_quantization(activation_n_bits: int,
     activation_threshold = quantization_params.get(THRESHOLD)
     activation_is_signed = quantization_params.get(SIGNED)
 
-    if activation_threshold is None
-
+    if activation_threshold is None:
+        Logger.error("Activation threshold is None")  # pragma: no cover
+    if activation_is_signed is None:
+        Logger.error("activation_is_signed is None")  # pragma: no cover
     if not threshold_is_power_of_two(activation_threshold, per_channel=False):
-
+        Logger.error("Activation threshold is not power of two")  # pragma: no cover
 
     min_value, max_value = quantizer_min_max_calculator(activation_threshold,
                                                         activation_n_bits,
@@ -96,8 +98,10 @@ def symmetric_quantization(activation_n_bits: int,
     activation_threshold = quantization_params.get(THRESHOLD)
     activation_is_signed = quantization_params.get(SIGNED)
 
-    if activation_threshold is None
-
+    if activation_threshold is None:
+        Logger.error("Activation threshold is None")  # pragma: no cover
+    if activation_is_signed is None:
+        Logger.error("activation_is_signed is None")  # pragma: no cover
 
     min_value, max_value = quantizer_min_max_calculator(activation_threshold,
                                                         activation_n_bits,
@@ -121,8 +125,10 @@ def uniform_quantization(activation_n_bits: int,
     """
     min_value, max_value = quantization_params.get(RANGE_MIN), quantization_params.get(RANGE_MAX)
 
-    if min_value is None
-
+    if min_value is None:
+        Logger.error("Min value is None")  # pragma: no cover
+    if max_value is None:
+        Logger.error("Max value is None")  # pragma: no cover
 
     return lambda x: q(x, min_value, max_value, activation_n_bits)
@@ -141,7 +147,7 @@ def q(x: TFReference, min_value, max_value, activation_n_bits) -> TFReference:
         The fake-quantized input tensor.
     """
     if x.dtype != tf.float32:
-        x = tf.cast(x, dtype=tf.float32)
+        x = tf.cast(x, dtype=tf.float32)  # pragma: no cover
 
     # fake_quant_with_min_max_vars expects to get x of float32
     return tf.quantization.fake_quant_with_min_max_vars(x,
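All three builders above funnel into q, which wraps TensorFlow's built-in fake-quant op. A standalone call with an illustrative signed power-of-two range (the package computes min/max via quantizer_min_max_calculator; the grid arithmetic below is an assumption made for the sketch):

    import tensorflow as tf

    x = tf.constant([-1.7, -0.4, 0.0, 0.3, 1.2])
    threshold, n_bits = 2.0, 8          # power-of-two threshold: 2 ** 1

    # Assumed grid: signed uniform over [-T, T) with 2**n_bits levels, step 2*T / 2**n_bits.
    q_x = tf.quantization.fake_quant_with_min_max_vars(
        x,
        min=-threshold,
        max=threshold - 2.0 * threshold / 2 ** n_bits,
        num_bits=n_bits)
    print(q_x.numpy())                  # values snapped to the quantization grid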
model_compression_toolkit/core/keras/quantizer/input_layer_quantize_transform.py:

@@ -23,6 +23,7 @@ from tensorflow_model_optimization.python.core.quantization.keras.quantize_config
 
 
 from model_compression_toolkit.core.common import BaseNode
+from model_compression_toolkit.core.common.constants import INPUT_BASE_NAME
 
 
 class InputLayerWrapperTransform(InputLayerQuantize):
@@ -55,7 +56,7 @@ class InputLayerWrapperTransform(InputLayerQuantize):
         self.wrapper_class = wrapper_class
 
     def pattern(self):
-        return transforms.LayerPattern('InputLayer', config={'name': self.name})
+        return transforms.LayerPattern('InputLayer', config={'name': f'{self.name}_{INPUT_BASE_NAME}'})
 
     def replacement(self, match_layer):
         layer_wrapper = self.wrapper_class(InputLayer(input_shape=self.input_layer.input_shape),
model_compression_toolkit/core/keras/quantizer/lut_fake_quant.py:

@@ -60,7 +60,7 @@ class LUTFakeQuant(Layer):
 
         """
         if self.activation_is_signed is None or self.cluster_centers is None or self.threshold is None:
-            return None
+            return None  # pragma: no cover
 
         _quant_output = self.lut_kmeans_quantizer(input_data)
         return _quant_output
model_compression_toolkit/core/keras/reader/common.py:

@@ -29,6 +29,7 @@ else:
     from keras.engine.functional import Functional
     from keras.engine.sequential import Sequential
 
+from model_compression_toolkit.core.common.logger import Logger
 from model_compression_toolkit.core.common.graph.base_node import BaseNode
 
 
@@ -46,7 +47,7 @@ def is_node_an_input_layer(node: BaseNode) -> bool:
     elif isinstance(node, KerasNode):
         return isinstance(node.layer, InputLayer)
     else:
-
+        Logger.error('Node to check has to be either a graph node or a keras node')  # pragma: no cover
 
 
 def is_node_a_model(node: BaseNode) -> bool:
@@ -63,5 +64,5 @@ def is_node_a_model(node: BaseNode) -> bool:
     elif isinstance(node, KerasNode):
         return isinstance(node.layer, Functional) or isinstance(node.layer, Sequential)
     else:
-
+        Logger.error('Node to check has to be either a graph node or a keras node')  # pragma: no cover
 
model_compression_toolkit/core/pytorch/back2framework/instance_builder.py:

@@ -35,4 +35,17 @@ def node_builder(n: BaseNode) -> Module:
     node_instance = n.type(**framework_attr)
     node_instance.load_state_dict({k: torch.Tensor(v) for k, v in n.weights.items()}, strict=False)
     set_model(node_instance)
-    return node_instance
+    return node_instance
+
+
+def identity_wrapper(node: BaseNode, module: Module):
+    """
+    A function which takes a computational graph node and a pytorch module and return an identity wrapping which return the layer itself
+    Args:
+        node: A node of mct graph.
+        layer: A pytorch module
+    Returns: pytorch module
+    """
+    return module
+
+
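identity_wrapper gives callers that expect a (node, module) wrapper a do-nothing default. Its behavior in isolation (the Linear module is arbitrary; the node argument is unused by the function, so any value works here):

    import torch

    linear = torch.nn.Linear(8, 4)
    wrapped = identity_wrapper(node=None, module=linear)
    assert wrapped is linear    # identity: the very same module comes back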
model_compression_toolkit/core/pytorch/back2framework/model_gradients.py:

@@ -18,15 +18,18 @@ import torch
 import torch.autograd as autograd
 from networkx import topological_sort
 from tqdm import tqdm
+import numpy as np
 
 from model_compression_toolkit.core import common
 from model_compression_toolkit.core.common import BaseNode, Graph
-from model_compression_toolkit.core.common.constants import EPS
+from model_compression_toolkit.core.common.constants import EPS, MIN_JACOBIANS_ITER, JACOBIANS_COMP_TOLERANCE
 from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX
 from model_compression_toolkit.core.common.graph.functional_node import FunctionalNode
 from model_compression_toolkit.core.pytorch.back2framework.instance_builder import node_builder
-from model_compression_toolkit.core.pytorch.
-from model_compression_toolkit.core.pytorch.
+from model_compression_toolkit.core.pytorch.constants import BUFFER
+from model_compression_toolkit.core.pytorch.reader.node_holders import DummyPlaceHolder, BufferHolder
+from model_compression_toolkit.core.pytorch.utils import torch_tensor_to_numpy, get_working_device
+from model_compression_toolkit.core.common.logger import Logger
 
 
 def build_input_tensors_list(node: BaseNode,
@@ -95,8 +98,11 @@ def generate_outputs(
     output = []
     for n in out_nodes:
         out_tensors_of_n = node_to_output_tensors_dict.get(n)
-        if len(out_tensors_of_n) > 1:
-
+        if len(out_tensors_of_n) > 1 or isinstance(out_tensors_of_n[0], tuple):
+            if isinstance(out_tensors_of_n[0], tuple):
+                out_tensors_of_n = out_tensors_of_n[0]
+                out_tensors_of_n = [torch.cat(out_tensors_of_n)]
+            output.append(torch.concat(out_tensors_of_n))
         else:
             output += out_tensors_of_n
     return output
|
|
|
128
134
|
|
|
129
135
|
for n in self.node_sort:
|
|
130
136
|
if not isinstance(n, FunctionalNode):
|
|
131
|
-
|
|
137
|
+
if n.type == BufferHolder:
|
|
138
|
+
self.add_module(n.name, node_builder(n))
|
|
139
|
+
self.get_submodule(n.name). \
|
|
140
|
+
register_buffer(n.name,
|
|
141
|
+
torch.Tensor(n.get_weights_by_keys(BUFFER)).to(get_working_device()))
|
|
142
|
+
else:
|
|
143
|
+
self.add_module(n.name, node_builder(n))
|
|
132
144
|
|
|
133
145
|
def forward(self,
|
|
134
146
|
*args: Any) -> Any:
|
|
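BufferHolder nodes carry non-trainable state, so the rebuilt submodule gets it back via register_buffer: the tensor travels with .to(device) and state_dict but never receives gradients. In isolation:

    import torch

    m = torch.nn.Module()
    m.register_buffer('running_stat', torch.zeros(4))   # tracked state, not a Parameter
    print(list(m.named_buffers()))                      # [('running_stat', tensor([0., 0., 0., 0.]))]
    print(list(m.parameters()))                         # [] - buffers are excluded from training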
@@ -153,7 +165,7 @@ class PytorchModelGradients(torch.nn.Module):
                                                    input_tensors,
                                                    op_func=op_func)
 
-            if isinstance(out_tensors_of_n, list):
+            if isinstance(out_tensors_of_n, list) or isinstance(out_tensors_of_n, tuple):
                 output_t = []
                 for t in out_tensors_of_n:
                     if n in self.interest_points:
@@ -162,13 +174,19 @@ class PytorchModelGradients(torch.nn.Module):
                             self.interest_points_tensors.append(t)
                         else:
                             # We get here in case we have an output node, which is an interest point,
-                            # but it is not differentiable. We need to add this dummy tensor
-                            # node in the
-
+                            # but it is not differentiable. We need to add this dummy tensor in order to include this
+                            # node in the future weights computation.
+                            # Note that this call is excluded from tests coverage,
+                            # since we do not suppose to get here - there is no valid operation that is both
+                            # non-differentiable and return output as a list or a tuple
+                            self.interest_points_tensors.append(torch.tensor([0.0],  # pragma: no cover
                                                                               requires_grad=True,
                                                                               device=t.device))
-                            break
+                            break  # pragma: no cover
                     output_t.append(t)
+                if isinstance(out_tensors_of_n, tuple):
+                    # If the node's output is a Tuple, then we want to keep it as a Tuple
+                    output_t = [tuple(output_t)]
                 node_to_output_tensors_dict.update({n: output_t})
             else:
                 assert isinstance(out_tensors_of_n, torch.Tensor)
@@ -178,8 +196,8 @@ class PytorchModelGradients(torch.nn.Module):
                     self.interest_points_tensors.append(out_tensors_of_n)
                 else:
                     # We get here in case we have an output node, which is an interest point,
-                    # but it is not differentiable. We need to add this dummy tensor
-                    # node in the
+                    # but it is not differentiable. We need to add this dummy tensor in order to include this
+                    # node in the future weights computation.
                     self.interest_points_tensors.append(torch.tensor([0.0],
                                                                      requires_grad=True,
                                                                      device=out_tensors_of_n.device))
@@ -233,45 +251,69 @@ def pytorch_iterative_approx_jacobian_trace(graph_float: common.Graph,
     output_tensors = model_grads_net(model_input_tensors)
     device = output_tensors[0].device
 
-   [24 removed lines not rendered in the source diff view]
+
+    # Concat outputs
+    # First, we need to unfold all outputs that are given as list, to extract the actual output tensors
+    unfold_outputs = []
+    for output in output_tensors:
+        if isinstance(output, List):
+            unfold_outputs += output
+        else:
+            unfold_outputs.append(output)
+
+    r_outputs = [torch.reshape(output, shape=[output.shape[0], -1]) for output in unfold_outputs]
+
+    concat_axis_dim = [o.shape[0] for o in r_outputs]
+    if not all(d == concat_axis_dim[0] for d in concat_axis_dim):
+        Logger.critical("Can't concat model's outputs for gradients calculation since the shape of the first axis "  # pragma: no cover
+                        "is not equal in all outputs.")
+
+    output = torch.concat(r_outputs, dim=1)
+
+    ipts_jac_trace_approx = []
+    for ipt in tqdm(model_grads_net.interest_points_tensors):  # Per Interest point activation tensor
+        trace_jv = []
+        for j in range(n_iter):  # Approximation iterations
+            # Getting a random vector with normal distribution
+            v = torch.randn(output.shape, device=device)
+            f_v = torch.sum(v * output)
+
+            # Computing the jacobian approximation by getting the gradient of (output * v)
+            jac_v = autograd.grad(outputs=f_v,
+                                  inputs=ipt,
+                                  retain_graph=True,
+                                  allow_unused=True)[0]
+            if jac_v is None:
+                # In case we have an output node, which is an interest point, but it is not differentiable,
+                # we still want to set some weight for it. For this, we need to add this dummy tensor to the ipt
+                # jacobian traces list.
+                trace_jv.append(torch.tensor([0.0],
+                                             requires_grad=True,
+                                             device=device))
+                break
+            jac_v = torch.reshape(jac_v, [jac_v.shape[0], -1])
+            jac_trace_approx = torch.mean(torch.sum(torch.pow(jac_v, 2.0)))
+
+            # If the change to the mean Jacobian approximation is insignificant we stop the calculation
+            if j > MIN_JACOBIANS_ITER:
+                new_mean = torch.mean(torch.stack([jac_trace_approx, *trace_jv]))
+                delta = new_mean - torch.mean(torch.stack(trace_jv))
+                if torch.abs(delta) / (torch.abs(new_mean) + 1e-6) < JACOBIANS_COMP_TOLERANCE:
+                    trace_jv.append(jac_trace_approx)
                     break
-            jac_v = torch.reshape(jac_v, [jac_v.shape[0], -1])
-            jac_trace_approx = torch.mean(torch.sum(torch.pow(jac_v, 2.0)))
-            trace_jv.append(jac_trace_approx)
-        ipts_jac_trace_approx.append(2*torch.mean(torch.stack(trace_jv))/output.shape[-1])  # Get averaged jacobian trace approximation
-        outputs_jacobians_approx.append(ipts_jac_trace_approx)
 
-
+            trace_jv.append(jac_trace_approx)
+        ipts_jac_trace_approx.append(2*torch.mean(torch.stack(trace_jv))/output.shape[-1])  # Get averaged jacobian trace approximation
+
+    ipts_jac_trace_approx = torch_tensor_to_numpy(torch.Tensor(ipts_jac_trace_approx))  # Just to get one tensor instead of list of tensors with single element
+
     if norm_weights:
-        return _normalize_weights(
+        return _normalize_weights(ipts_jac_trace_approx, all_outputs_indices, alpha)
     else:
-        return
+        return ipts_jac_trace_approx
 
 
-def _normalize_weights(jacobians_traces:
+def _normalize_weights(jacobians_traces: np.ndarray,
                        all_outputs_indices: List[int],
                        alpha: float) -> List[float]:
     """
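The PyTorch estimator computes each probe's gradient with autograd.grad; allow_unused=True yields None for interest points that do not reach the output, which the code above converts into a zero-weight dummy. A reduced sketch of a single probe iteration on a toy graph (all names below are illustrative, not MCT code):

    import torch

    x = torch.randn(1, 8)
    w1 = torch.randn(8, 8, requires_grad=True)
    ipt = x @ w1                                   # an "interest point" activation
    output = (ipt @ torch.randn(8, 4)).reshape(1, -1)

    v = torch.randn(output.shape)                  # random probe vector
    f_v = torch.sum(v * output)
    jac_v = torch.autograd.grad(outputs=f_v, inputs=ipt,
                                retain_graph=True, allow_unused=True)[0]
    # ||J^T v||^2 for this probe; averaging over many probes approximates the Jacobian trace.
    jac_trace_sample = torch.sum(jac_v.reshape(jac_v.shape[0], -1) ** 2)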