PyPI - mct-nightly - Versions diffs - 2.2.0.20241203.546__py3-none-any.whl → 2.2.0.20241205.533__py3-none-any.whl - Mend

mct-nightly 2.2.0.20241203.546__py3-none-any.whl → 2.2.0.20241205.533__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

model_compression_toolkit/core/keras/data_util.py CHANGED Viewed

@@ -18,6 +18,27 @@ import tensorflow as tf
 from model_compression_toolkit.core.keras.tf_tensor_numpy import to_tf_tensor
+import tensorflow as tf
+from typing import Callable, Generator, Sequence, Any
+def get_tensor_spec(item, ignore_batch_dim=False):
+    """
+    Get the TensorFlow TensorSpec for an item, optionally ignoring the first dimension.
+    Args:
+        item: The input item, which could be a tensor, tuple, or list.
+        ignore_batch_dim (bool): Whether to ignore the first dimension of the tensor shape.
+    Returns:
+        TensorSpec or a tuple of TensorSpecs.
+    """
+    if isinstance(item, (tuple, list)):
+        return tuple(get_tensor_spec(sub_item, ignore_batch_dim) for sub_item in item)
+    shape = item.shape[1:] if ignore_batch_dim else item.shape
+    return tf.TensorSpec(shape=shape, dtype=item.dtype)
 def flat_gen_fn(data_gen_fn: Callable[[], Generator]):
     """
@@ -29,39 +50,151 @@ def flat_gen_fn(data_gen_fn: Callable[[], Generator]):
     Returns:
         A factory for a flattened data generator.
     """
     def gen():
         for inputs_batch in data_gen_fn():
             for sample in zip(*inputs_batch):
-                yield to_tf_tensor(sample)
-    return gen
+                yield tuple([tf.convert_to_tensor(s) for s in sample])
+    return gen
-# TODO in tf dataset and dataloader are combined within tf.data.Dataset. For advanced use cases such as gptq sla we
-#  need to separate dataset from dataloader similarly to torch data_util.
 class TFDatasetFromGenerator:
-    def __init__(self, data_gen, batch_size):
-        inputs = next(data_gen())
-        if not isinstance(inputs, list):
-            raise TypeError(f'Representative data generator is expected to generate a list of tensors, '
-                            f'got {type(inputs)}')  # pragma: no cover
+    """
+    TensorFlow dataset from a data generator function, batched to a specified size.
+    """
+    def __init__(self, data_gen_fn: Callable[[], Generator]):
+        """
+        Args:
+            data_gen_fn: a factory function for data generator that yields lists of tensors.
+        """
+        inputs = next(data_gen_fn())
+        if not isinstance(inputs, list):
+            raise TypeError(f'Data generator is expected to yield a list of tensors, got {type(inputs)}')
         self.orig_batch_size = inputs[0].shape[0]
-        output_signature = tuple([tf.TensorSpec(shape=t.shape[1:], dtype=t.dtype) for t in inputs])
-        dataset = tf.data.Dataset.from_generator(flat_gen_fn(data_gen), output_signature=output_signature)
-        self.dataset = dataset.batch(batch_size)
         self._size = None
+        # TFDatasetFromGenerator flattens the dataset, thus we ignore the batch dimension
+        output_signature = get_tensor_spec(inputs, ignore_batch_dim=True)
+        self.dataset = tf.data.Dataset.from_generator(flat_gen_fn(data_gen_fn), output_signature=output_signature)
     def __iter__(self):
         return iter(self.dataset)
     def __len__(self):
         """ Returns the number of batches. """
         if self._size is None:
-            self._num_batches = sum(1 for _ in self)
-        return self._num_batches
+            self._size = sum(1 for _ in self.dataset)
+        return self._size
+class FixedTFDataset:
+    """
+    Fixed dataset containing samples from a generator, stored in memory.
+    """
+    def __init__(self, data_gen_fn: Callable[[], Generator], n_samples: int = None):
+        """
+        Args:
+            data_gen_fn: data generator function.
+            n_samples: number of samples to store in the dataset. If None, uses all samples in one pass.
+        """
+        inputs = next(data_gen_fn())
+        if not isinstance(inputs, list):
+            raise TypeError(f'Data generator is expected to yield a list of tensors, got {type(inputs)}')
+        self.orig_batch_size = inputs[0].shape[0]
+        samples = []
+        for batch in data_gen_fn():
+            samples.extend(zip(*[tf.convert_to_tensor(t) for t in batch]))
+            if n_samples is not None and len(samples) >= n_samples:
+                samples = samples[:n_samples]
+                break
+        if n_samples and len(samples) < n_samples:
+            raise ValueError(f'Not enough samples to create a dataset with {n_samples} samples')
+        self.samples = samples
+    def __len__(self):
+        return len(self.samples)
+    def __getitem__(self, index):
+        return self.samples[index]
+class FixedSampleInfoDataset:
+    """
+    Dataset for samples with additional info, each element is a tuple of (sample, sample_info).
+    """
+    def __init__(self, samples: Sequence, sample_info: Sequence):
+        if not all(len(info) == len(samples) for info in sample_info):
+            raise ValueError('Sample and additional info lengths must match')
+        self.samples = samples
+        self.sample_info = sample_info
+    def __len__(self):
+        return len(self.samples)
+    def __getitem__(self, index):
+        return self.samples[index], tuple([info[index] for info in self.sample_info])
+class IterableSampleWithConstInfoDataset:
+    """
+    Augments each sample in an iterable dataset with constant additional information.
+    """
+    def __init__(self, samples_dataset: tf.data.Dataset, *info: Any):
+        self.samples_dataset = samples_dataset
+        self.info = info
+    def __iter__(self):
+        for sample in self.samples_dataset:
+            yield (sample, *self.info)
+def data_gen_to_dataloader(data_gen_fn: Callable[[], Generator], batch_size: int):
+    """Create a DataLoader based on samples yielded by data_gen."""
+    ds = TFDatasetFromGenerator(data_gen_fn)
+    return create_tf_dataloader(dataset=ds, batch_size=batch_size)
+def create_tf_dataloader(dataset, batch_size, shuffle=False, collate_fn=None):
+    """
+    Creates a tf.data.Dataset with specified loading options.
+    Args:
+        dataset: The dataset container (e.g., FixedDatasetFromGenerator or FixedSampleInfoDataset).
+        batch_size: Number of samples per batch.
+        shuffle: Whether to shuffle the dataset.
+        collate_fn: A function to apply to each batch (e.g., add extra outputs like regularization weights).
+    Returns:
+        tf.data.Dataset: Configured for batching, shuffling, and custom transformations.
+    """
+    def generator():
+        for item in dataset:
+            yield item
+    dummy_input_tensors = next(generator())
+    output_signature = get_tensor_spec(dummy_input_tensors)
+    tf_dataset = tf.data.Dataset.from_generator(
+        generator,
+        output_signature=output_signature
+    )
+    if shuffle:
+        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
+    tf_dataset = tf_dataset.batch(batch_size)
+    # Apply collate function if provided
+    if collate_fn:
+        tf_dataset = tf_dataset.map(lambda *args: collate_fn(args))
-def data_gen_to_dataloader(data_gen_fn: Callable[[], Generator], batch_size) -> TFDatasetFromGenerator:
-    """ Create DataLoader based on samples yielded by data_gen. """
-    return TFDatasetFromGenerator(data_gen_fn, batch_size)
+    return tf_dataset

model_compression_toolkit/core/keras/hessian/activation_hessian_scores_calculator_keras.py CHANGED Viewed

@@ -60,96 +60,103 @@ class ActivationHessianScoresCalculatorKeras(HessianScoresCalculatorKeras):
         Returns:
             List[np.ndarray]: Scores based on the Hessian-approximation for the requested nodes.
         """
-        if self.hessian_request.granularity == HessianScoresGranularity.PER_TENSOR:
-            model_output_nodes = [ot.node for ot in self.graph.get_outputs()]
-            if len([n for n in self.hessian_request.target_nodes if n in model_output_nodes]) > 0:
-                Logger.critical("Trying to compute activation Hessian approximation with respect to the model output. "
-                                "This operation is not supported. "
-                                "Remove the output node from the set of node targets in the Hessian request.")
-            grad_model_outputs = self.hessian_request.target_nodes + model_output_nodes
-            # Building a model to run Hessian approximation on
-            model, _ = FloatKerasModelBuilder(graph=self.graph, append2output=grad_model_outputs).build_model()
-            # Record operations for automatic differentiation
-            with tf.GradientTape(persistent=True, watch_accessed_variables=False) as g:
-                g.watch(self.input_images)
-                if len(self.input_images) > 1:
-                    outputs = model(self.input_images)
-                else:
-                    outputs = model(*self.input_images)
-                if len(outputs) != len(grad_model_outputs):  # pragma: no cover
-                    Logger.critical(
-                        f"Model for computing activation Hessian approximation expects {len(grad_model_outputs)} "
-                        f"outputs, but got {len(outputs)} output tensors.")
-                # Extracting the intermediate activation tensors and the model real output.
-                # Note that we do not allow computing Hessian for output nodes, so there shouldn't be an overlap.
-                num_target_nodes = len(self.hessian_request.target_nodes)
-                # Extract activation tensors of nodes for which we want to compute Hessian
-                target_activation_tensors = outputs[:num_target_nodes]
-                # Extract the model outputs
-                output_tensors = outputs[num_target_nodes:]
-                # Unfold and concatenate all outputs to form a single tensor
-                output = self._concat_tensors(output_tensors)
-                # List to store the Hessian-approximation scores for each interest point
-                ipts_hessian_approximations = [tf.Variable([0.0], dtype=tf.float32, trainable=True)
-                                               for _ in range(len(target_activation_tensors))]
-                # Loop through each interest point activation tensor
-                prev_mean_results = None
-                for j in tqdm(range(self.num_iterations_for_approximation)):  # Approximation iterations
-                    # Getting a random vector with normal distribution
-                    v = tf.random.normal(shape=output.shape, dtype=output.dtype)
-                    f_v = tf.reduce_sum(v * output)
-                    for i, ipt in enumerate(target_activation_tensors):  # Per Interest point activation tensor
-                        interest_point_scores = []  # List to store scores for each interest point
-                        with g.stop_recording():
-                            # Computing the approximation by getting the gradient of (output * v)
-                            hess_v = g.gradient(f_v, ipt)
-                            if hess_v is None:
-                                # In case we have an output node, which is an interest point, but it is not
-                                # differentiable, we consider its Hessian to be the initial value 0.
-                                continue  # pragma: no cover
+        model_output_nodes = [ot.node for ot in self.graph.get_outputs()]
+        if len([n for n in self.hessian_request.target_nodes if n in model_output_nodes]) > 0:
+            Logger.critical("Trying to compute activation Hessian approximation with respect to the model output. "
+                            "This operation is not supported. "
+                            "Remove the output node from the set of node targets in the Hessian request.")
+        grad_model_outputs = self.hessian_request.target_nodes + model_output_nodes
+        # Building a model to run Hessian approximation on
+        model, _ = FloatKerasModelBuilder(graph=self.graph, append2output=grad_model_outputs).build_model()
+        # Record operations for automatic differentiation
+        with tf.GradientTape(persistent=True, watch_accessed_variables=False) as g:
+            g.watch(self.input_images)
+            if len(self.input_images) > 1:
+                outputs = model(self.input_images)
+            else:
+                outputs = model(*self.input_images)
+            if len(outputs) != len(grad_model_outputs):  # pragma: no cover
+                Logger.critical(
+                    f"Model for computing activation Hessian approximation expects {len(grad_model_outputs)} "
+                    f"outputs, but got {len(outputs)} output tensors.")
+            # Extracting the intermediate activation tensors and the model real output.
+            # Note that we do not allow computing Hessian for output nodes, so there shouldn't be an overlap.
+            num_target_nodes = len(self.hessian_request.target_nodes)
+            # Extract activation tensors of nodes for which we want to compute Hessian
+            target_activation_tensors = outputs[:num_target_nodes]
+            # Extract the model outputs
+            output_tensors = outputs[num_target_nodes:]
+            # Unfold and concatenate all outputs to form a single tensor
+            output = self._concat_tensors(output_tensors)
+            # List to store the Hessian-approximation scores for each interest point
+            ipts_hessian_approximations = [tf.Variable([0.0], dtype=tf.float32, trainable=True)
+                                           for _ in range(len(target_activation_tensors))]
+            # Loop through each interest point activation tensor
+            prev_mean_results = None
+            for j in tqdm(range(self.num_iterations_for_approximation)):  # Approximation iterations
+                # Generate random tensor of 1s and -1s
+                v = self._generate_random_vectors_batch(output.shape)
+                f_v = tf.reduce_sum(v * output)
+                for i, ipt in enumerate(target_activation_tensors):  # Per Interest point activation tensor
+                    interest_point_scores = []  # List to store scores for each interest point
+                    with g.stop_recording():
+                        # Computing the approximation by getting the gradient of (output * v)
+                        hess_v = g.gradient(f_v, ipt)
+                        if hess_v is None:
+                            # In case we have an output node, which is an interest point, but it is not
+                            # differentiable, we consider its Hessian to be the initial value 0.
+                            continue  # pragma: no cover
+                        if self.hessian_request.granularity == HessianScoresGranularity.PER_TENSOR:
                             # Mean over all dims but the batch (CXHXW for conv)
                             hessian_approx = tf.reduce_sum(hess_v ** 2.0,
                                                            axis=tuple(d for d in range(1, len(hess_v.shape))))
-                            # Free gradients
-                            del hess_v
-                            # Update node Hessian approximation mean over random iterations
-                            ipts_hessian_approximations[i] = (j * ipts_hessian_approximations[i] + hessian_approx) / (j + 1)
-                    # If the change to the mean approximation is insignificant (to all outputs)
-                    # we stop the calculation.
-                    if j > MIN_HESSIAN_ITER:
-                        if prev_mean_results is not None:
-                            new_mean_res = tf.reduce_mean(tf.stack(ipts_hessian_approximations), axis=1)
-                            relative_delta_per_node = (tf.abs(new_mean_res - prev_mean_results) /
-                                                       (tf.abs(new_mean_res) + 1e-6))
-                            max_delta = tf.reduce_max(relative_delta_per_node)
-                            if max_delta < HESSIAN_COMP_TOLERANCE:
-                                break
+                        elif self.hessian_request.granularity == HessianScoresGranularity.PER_ELEMENT:
+                            hessian_approx = hess_v ** 2
+                        elif self.hessian_request.granularity == HessianScoresGranularity.PER_OUTPUT_CHANNEL:
+                            axes_to_sum = tuple(d for d in range(1, len(hess_v.shape)-1))
+                            hessian_approx = tf.reduce_sum(hess_v ** 2.0, axis=axes_to_sum)
+                        else:  # pragma: no cover
+                            Logger.critical(f"{self.hessian_request.granularity} "
+                                            f"is not supported for Keras activation hessian\'s approximation scores calculator.")
+                        # Free gradients
+                        del hess_v
+                        # Update node Hessian approximation mean over random iterations
+                        ipts_hessian_approximations[i] = (j * ipts_hessian_approximations[i] + hessian_approx) / (j + 1)
+                # If the change to the mean approximation is insignificant (to all outputs)
+                # we stop the calculation.
+                if j > MIN_HESSIAN_ITER and prev_mean_results is not None:
+                    new_mean_res = tf.reduce_mean(tf.stack(ipts_hessian_approximations), axis=1)
+                    relative_delta_per_node = (tf.abs(new_mean_res - prev_mean_results) /
+                                               (tf.abs(new_mean_res) + 1e-6))
+                    max_delta = tf.reduce_max(relative_delta_per_node)
+                    if max_delta < HESSIAN_COMP_TOLERANCE:
+                        break
+                if self.hessian_request.granularity == HessianScoresGranularity.PER_TENSOR:
                     prev_mean_results = tf.reduce_mean(tf.stack(ipts_hessian_approximations), axis=1)
-                # Convert results to list of numpy arrays
-                hessian_results = [h.numpy() for h in ipts_hessian_approximations]
-                # Extend the Hessian tensors shape to align with expected return type
-                # TODO: currently, only per-tensor Hessian is available for activation.
-                #  Once implementing per-channel or per-element, this alignment needs to be verified and handled separately.
-                hessian_results = [h[..., np.newaxis] for h in hessian_results]
+            # Convert results to list of numpy arrays
+            hessian_results = [h.numpy() for h in ipts_hessian_approximations]
+            # Extend the Hessian tensors shape to align with expected return type
+            # TODO: currently, only per-tensor Hessian is available for activation.
+            #  Once implementing per-channel or per-element, this alignment needs to be verified and handled separately.
+            hessian_results = [h[..., np.newaxis] for h in hessian_results]
-                return hessian_results
+            return hessian_results
-        else:  # pragma: no cover
-            Logger.critical(f"{self.hessian_request.granularity} "
-                            f"is not supported for Keras activation hessian\'s approximation scores calculator.")

model_compression_toolkit/core/keras/hessian/hessian_scores_calculator_keras.py CHANGED Viewed

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+from tensorflow import TensorShape
 from model_compression_toolkit.core.common.hessian.hessian_scores_calculator import HessianScoresCalculator
@@ -77,3 +78,19 @@ class HessianScoresCalculatorKeras(HessianScoresCalculator):
                 "Unable to concatenate tensors for gradient calculation due to mismatched shapes along the first axis.") # pragma: no cover
         return tf.concat(_r_tensors, axis=1)
+    def _generate_random_vectors_batch(self, shape: TensorShape) -> tf.Tensor:
+        """
+        Generate a batch of random vectors for Hutchinson estimation using Rademacher distribution.
+        Args:
+            shape: target shape.
+        Returns:
+            Random tensor.
+        """
+        v = tf.random.uniform(shape=shape, minval=0, maxval=2, dtype=tf.int32)
+        v = tf.where(v == 0, -1, 1)
+        v = tf.cast(v, tf.float32)
+        return v

model_compression_toolkit/core/keras/hessian/weights_hessian_scores_calculator_keras.py CHANGED Viewed

@@ -89,8 +89,7 @@ class WeightsHessianScoresCalculatorKeras(HessianScoresCalculatorKeras):
             prev_mean_results = None
             tensors_original_shape = []
             for j in tqdm(range(self.num_iterations_for_approximation)):  # Approximation iterations
-                # Getting a random vector with normal distribution and the same shape as the model output
-                v = tf.random.normal(shape=output.shape)
+                v = self._generate_random_vectors_batch(output.shape)
                 f_v = tf.reduce_sum(v * output)
                 for i, ipt_node in enumerate(self.hessian_request.target_nodes):  # Per Interest point weights tensor

model_compression_toolkit/core/keras/keras_implementation.py CHANGED Viewed

@@ -438,17 +438,11 @@ class KerasImplementation(FrameworkImplementation):
             node: Node to indicate whether it needs to be part of the interest points set.
         Returns: True if the node should be considered an interest point, False otherwise.
         """
-        if node.is_match_type(Activation):
-            node_type_name = node.framework_attr[keras_constants.ACTIVATION]
-            if node_type_name in [keras_constants.SOFTMAX, keras_constants.SIGMOID]:
-                return True
-        elif any([node.is_match_type(_type) for _type in [tf.nn.softmax, tf.keras.layers.Softmax, tf.nn.sigmoid, Conv2D,
-                                                          DepthwiseConv2D, Conv2DTranspose, Dense, Concatenate, tf.concat,
-                                                          Add, tf.add]]):
+        if self.is_softmax(node) or self.is_sigmoid(node):
             return True
-        return False
+        return any([node.is_match_type(_type) for _type in [Conv2D, DepthwiseConv2D, Conv2DTranspose, Dense,
+                                                            Concatenate, tf.concat, Add, tf.add]])
     def get_mp_node_distance_fn(self, n: BaseNode,
                                 compute_distance_fn: Callable = None,
@@ -466,32 +460,34 @@ class KerasImplementation(FrameworkImplementation):
         Returns: A distance function between two tensors and a axis on which the distance is computed (if exists).
         """
-        axis = n.framework_attr.get(keras_constants.AXIS) \
-            if not isinstance(n, FunctionalNode) else n.op_call_kwargs.get(keras_constants.AXIS)
-        layer_class = n.layer_class
-        framework_attrs = n.framework_attr
+        axis = n.op_call_kwargs.get(keras_constants.AXIS) if isinstance(n, FunctionalNode) else n.framework_attr.get(keras_constants.AXIS)
         if compute_distance_fn is not None:
             return compute_distance_fn, axis
-        if layer_class == Activation:
-            node_type_name = framework_attrs[ACTIVATION]
-            if node_type_name == SOFTMAX and axis is not None:
-                return compute_kl_divergence, axis
-            elif node_type_name == SIGMOID:
-                return compute_cs, axis
-        elif axis is not None and (layer_class == tf.nn.softmax or layer_class == tf.keras.layers.Softmax
-                                   or (layer_class == TFOpLambda and
-                                       SOFTMAX in framework_attrs[keras_constants.FUNCTION])):
+        # TODO should we really return mse if axis is None? Error? Fill default?
+        if self.is_softmax(n) and axis is not None:
             return compute_kl_divergence, axis
-        elif layer_class == tf.nn.sigmoid or (layer_class == TFOpLambda and
-                                              SIGMOID in framework_attrs[keras_constants.FUNCTION]):
-            return compute_cs, axis
-        elif layer_class == Dense:
+        if self.is_sigmoid(n) or n.layer_class == Dense:
             return compute_cs, axis
         return partial(compute_mse, norm=norm_mse), axis
+    @staticmethod
+    def is_sigmoid(node: BaseNode):
+        cls = node.layer_class
+        return ((cls == Activation and node.framework_attr[ACTIVATION] == SIGMOID) or
+                cls == tf.nn.sigmoid or
+                cls == TFOpLambda and SIGMOID in node.framework_attr[keras_constants.FUNCTION])
+    @staticmethod
+    def is_softmax(node: BaseNode):
+        cls = node.layer_class
+        return ((cls == Activation and node.framework_attr[ACTIVATION] == SOFTMAX) or
+                cls in [tf.nn.softmax, tf.keras.layers.Softmax] or
+                cls == TFOpLambda and SOFTMAX in node.framework_attr[keras_constants.FUNCTION])
     def get_hessian_scores_calculator(self,
                                       graph: Graph,
                                       input_images: List[Any],

model_compression_toolkit/core/pytorch/pytorch_implementation.py CHANGED Viewed

@@ -427,10 +427,8 @@ class PytorchImplementation(FrameworkImplementation):
         Returns: True if the node should be considered an interest point, False otherwise.
         """
-        if any([node.is_match_type(_type) for _type in [Conv2d, Linear, ConvTranspose2d, Sigmoid, sigmoid, Softmax,
-                                                        softmax, operator.add, add, cat, operator.concat]]):
-            return True
-        return False
+        return any(node.is_match_type(_type) for _type in [Conv2d, Linear, ConvTranspose2d, Sigmoid, sigmoid, Softmax,
+                                                           softmax, operator.add, add, cat, operator.concat])
     def get_mp_node_distance_fn(self, n: BaseNode,
                                 compute_distance_fn: Callable = None,

model_compression_toolkit/gptq/common/gptq_training.py CHANGED Viewed

@@ -27,7 +27,11 @@ from model_compression_toolkit.gptq.common.gptq_config import GradientPTQConfig
 from model_compression_toolkit.gptq.common.gptq_constants import QUANT_PARAM_LEARNING_STR
 from model_compression_toolkit.gptq.common.gptq_framework_implementation import GPTQFrameworkImplemantation
 from model_compression_toolkit.gptq.common.gptq_graph import get_compare_points
+from model_compression_toolkit.gptq.common.gradual_activation_quantization import \
+    get_gradual_activation_quantizer_wrapper_factory
+from model_compression_toolkit.gptq.common.regularization_factory import get_regularization
 from model_compression_toolkit.logger import Logger
+from model_compression_toolkit.trainable_infrastructure.common.util import get_total_grad_steps
 class GPTQTrainer(ABC):
@@ -64,6 +68,14 @@ class GPTQTrainer(ABC):
         self.fw_impl = fw_impl
         self.fw_info = fw_info
         self.representative_data_gen_fn = representative_data_gen_fn
+        def _get_total_grad_steps():
+            return get_total_grad_steps(representative_data_gen_fn) * gptq_config.n_epochs
+        self.gradual_act_quantizer_wrapper_factory = get_gradual_activation_quantizer_wrapper_factory(gptq_config,
+                                                                                                      _get_total_grad_steps,
+                                                                                                      self.fw_linear_annealing_scheduler)
         # ----------------------------------------------
         # Build two models and create compare nodes
         # ----------------------------------------------
@@ -81,6 +93,52 @@ class GPTQTrainer(ABC):
                                 f"an 'HessianInfoService' object must be provided, but received: {hessian_info_service}.")   # pragma: no cover
             self.hessian_service = hessian_info_service
+        self.reg_func = get_regularization(self.gptq_config,
+                                           _get_total_grad_steps,
+                                           self.fw_soft_quantizer_regularization,
+                                           self.fw_linear_annealing_scheduler)
+        self.loss_list = []
+        self.input_scale = 1
+        if self.float_user_info.input_scale != self.gptq_user_info.input_scale:
+            Logger.critical("Input scale mismatch between float and GPTQ networks. "
+                            "Ensure both networks have matching input scales.")  # pragma: no cover
+        else:
+            self.input_scale = self.gptq_user_info.input_scale
+        trainable_weights, trainable_bias, trainable_threshold = self.fw_get_gptq_trainable_parameters_fn(
+            self.fxp_model,
+            add_bias=self.gptq_config.train_bias)
+        self.flp_weights_list, self.fxp_weights_list = self.fw_get_weights_for_loss_fn(self.fxp_model)
+        if not (len(self.compare_points) == len(trainable_weights) == len(self.flp_weights_list) == len(
+                self.fxp_weights_list)):
+            Logger.critical("Mismatch in the number of comparison points, layers with trainable weights, "
+                            "and the number of float and quantized weights for loss calculation. "
+                            "Ensure all these elements align to proceed with GPTQ training.")
+        # In Keras we need to flatten the weights first before attaching the optimizer
+        if len(trainable_weights) > 0 and isinstance(trainable_weights[0], (list, tuple)):
+            trainable_weights = [w for layer_weights in trainable_weights for w in layer_weights]
+        if len(trainable_bias) > 0 and isinstance(trainable_bias[0], (list, tuple)):
+            trainable_bias = [w for layer_weights in trainable_bias for w in layer_weights]
+        self.optimizer_with_param = self.get_optimizer_with_param(trainable_weights,
+                                                                  trainable_bias,
+                                                                  trainable_threshold)
+        hessian_cfg = self.gptq_config.hessian_weights_config
+        self.has_params_to_train = np.sum(
+            [len(optimizer_params_tuple[1]) for optimizer_params_tuple in self.optimizer_with_param]) > 0
+        self.use_sample_layer_attention = hessian_cfg and hessian_cfg.per_sample
+        if self.use_sample_layer_attention:
+            # normalization is currently not supported, make sure the config reflects it.
+            if hessian_cfg.norm_scores or hessian_cfg.log_norm or hessian_cfg.scale_log_norm:
+                raise NotImplementedError()
+            self.train_dataloader = self._prepare_train_dataloader_sla(representative_data_gen_fn)
+        else:
+            self.train_dataloader = self._prepare_train_dataloader_for_non_sla(representative_data_gen_fn)
     def get_optimizer_with_param(self,
                                  flattened_trainable_weights: List[Any],
                                  flattened_bias_weights: List[Any],

model_compression_toolkit/gptq/keras/gptq_loss.py CHANGED Viewed

@@ -13,9 +13,8 @@
 # limitations under the License.
 # ==============================================================================
-from typing import Any, Tuple, List
 import tensorflow as tf
+from typing import List, Tuple
 def mse_loss(y: tf.Tensor, x: tf.Tensor, normalized: bool = True) -> tf.Tensor:
@@ -67,6 +66,40 @@ def multiple_tensors_mse_loss(y_list: List[tf.Tensor],
     else:
         return tf.reduce_mean(tf.stack(loss_values_list))
+def sample_layer_attention_loss(y_list: List[tf.Tensor],
+                                x_list: List[tf.Tensor],
+                                fxp_w_list,
+                                flp_w_list,
+                                act_bn_mean,
+                                act_bn_std,
+                                loss_weights: Tuple[tf.Tensor]) -> tf.Tensor:
+    """
+    Compute Sample Layer Attention loss between two lists of tensors using TensorFlow.
+    Args:
+        y_list: First list of tensors.
+        x_list: Second list of tensors.
+        fxp_w_list, flp_w_list, act_bn_mean, act_bn_std: unused (needed to comply with the interface).
+        loss_weights: layer-sample attention scores (tuplle by the same length as the number of layers, where each element is a tf.Tensor vector of length of number of samples).
+    Returns:
+        Sample Layer Attention loss (a scalar).
+    """
+    loss = 0
+    layers_mean_w = []
+    loss_weights = tf.stack(loss_weights, axis=1)
+    for i, (y, x) in enumerate(zip(y_list, x_list)):
+        norm = tf.reduce_sum(tf.square(y - x), axis=1)
+        if len(norm.shape) > 1:
+            norm = tf.reduce_mean(tf.reshape(norm, [norm.shape[0], -1]), axis=1)
+        w = loss_weights[:, i]
+        loss += tf.reduce_mean(w * norm)
+        layers_mean_w.append(tf.reduce_mean(w))
+    loss = loss / tf.reduce_max(tf.stack(layers_mean_w))
+    return loss
 def mse_loss_per_tensor(y: tf.Tensor,
                         x: tf.Tensor,

mct-nightly 2.2.0.20241203.546__py3-none-any.whl → 2.2.0.20241205.533__py3-none-any.whl

mct-nightly 2.2.0.20241203.546__py3-none-any.whl → 2.2.0.20241205.533__py3-none-any.whl