PyPI - likelihood - Versions diffs - 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

likelihood 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

likelihood/graph/nn.py +8 -2
likelihood/models/deep/autoencoders.py +312 -109
likelihood/tools/figures.py +348 -0
likelihood/tools/models_tools.py +161 -9
likelihood/tools/tools.py +26 -84
{likelihood-1.4.1.dist-info → likelihood-1.5.0.dist-info}/METADATA +1 -1
{likelihood-1.4.1.dist-info → likelihood-1.5.0.dist-info}/RECORD +10 -9
{likelihood-1.4.1.dist-info → likelihood-1.5.0.dist-info}/WHEEL +1 -1
{likelihood-1.4.1.dist-info → likelihood-1.5.0.dist-info}/LICENSE +0 -0
{likelihood-1.4.1.dist-info → likelihood-1.5.0.dist-info}/top_level.txt +0 -0

likelihood/graph/nn.py CHANGED Viewed

@@ -61,6 +61,8 @@ def cal_adjacency_matrix(
     ----------
     similarity: `int`
         The minimum number of features that must be the same in both arrays to be considered similar.
+    threshold : `float`
+        The threshold value used in the `compare_similarity` function. Default is 0.05.
     Returns
     -------
@@ -79,6 +81,7 @@ def cal_adjacency_matrix(
     assert len(df_) > 0
     similarity = kwargs.get("similarity", len(df_.columns) - 1)
+    threshold = kwargs.get("threshold", 0.05)
     assert similarity <= df_.shape[1]
     adj_dict = {index: row.tolist() for index, row in df_.iterrows()}
@@ -87,7 +90,7 @@ def cal_adjacency_matrix(
     for i in range(len(df_)):
         for j in range(len(df_)):
-            if compare_similarity(adj_dict[i], adj_dict[j]) >= similarity:
+            if compare_similarity(adj_dict[i], adj_dict[j], threshold=threshold) >= similarity:
                 adjacency_matrix[i][j] = 1
     if sparse:
@@ -114,7 +117,10 @@ class Data:
         **kwargs,
     ):
         sparse = kwargs.get("sparse", True)
-        _, adjacency = cal_adjacency_matrix(df, exclude_subset=exclude_subset, sparse=sparse)
+        threshold = kwargs.get("threshold", 0.05)
+        _, adjacency = cal_adjacency_matrix(
+            df, exclude_subset=exclude_subset, sparse=sparse, threshold=threshold
+        )
         if target is not None:
             X = df.drop(columns=[target] + exclude_subset)
         else:

likelihood/models/deep/autoencoders.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
 import os
 import random
+import warnings
 from functools import partial
 from shutil import rmtree
@@ -14,8 +15,8 @@ from pandas.plotting import radviz
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
-import warnings
-from functools import wraps
+from typing import List
 import keras_tuner
 import tensorflow as tf
@@ -24,21 +25,11 @@ from sklearn.manifold import TSNE
 from tensorflow.keras.layers import InputLayer
 from tensorflow.keras.regularizers import l2
-from likelihood.tools import LoRALayer, OneHotEncoder
+from likelihood.tools import LoRALayer, OneHotEncoder, suppress_warnings
 tf.get_logger().setLevel("ERROR")
-def suppress_warnings(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            return func(*args, **kwargs)
-    return wrapper
 class EarlyStopping:
     def __init__(self, patience=10, min_delta=0.001):
         self.patience = patience
@@ -246,7 +237,7 @@ class AutoClassifier(tf.keras.Model):
     Additional keyword arguments to pass to the model.
     classifier_activation : `str`
-        The activation function to use for the classifier layer. Default is "softmax". If the activation function is not a classification function, the model can be used in regression problems.
+        The activation function to use for the classifier layer. Default is `softmax`. If the activation function is not a classification function, the model can be used in regression problems.
     num_layers : `int`
         The number of hidden layers in the classifier. Default is 1.
     dropout : `float`
@@ -373,7 +364,6 @@ class AutoClassifier(tf.keras.Model):
         else:
             self.build_encoder_decoder(input_shape)
-        # Classifier with L2 regularization
         self.classifier = tf.keras.Sequential()
         if self.num_layers > 1 and not self.lora_mode:
             for _ in range(self.num_layers - 1):
@@ -527,7 +517,6 @@ class AutoClassifier(tf.keras.Model):
         if not isinstance(source_model, AutoClassifier):
             raise ValueError("Source model must be an instance of AutoClassifier.")
-        # Check compatibility in input shape and units
         if self.input_shape_parm != source_model.input_shape_parm:
             raise ValueError(
                 f"Incompatible input shape. Expected {self.input_shape_parm}, got {source_model.input_shape_parm}."
@@ -537,9 +526,8 @@ class AutoClassifier(tf.keras.Model):
                 f"Incompatible number of units. Expected {self.units}, got {source_model.units}."
             )
         self.encoder, self.decoder = tf.keras.Sequential(), tf.keras.Sequential()
-        # Copy the encoder layers
         for i, layer in enumerate(source_model.encoder.layers):
-            if isinstance(layer, tf.keras.layers.Dense):  # Make sure it's a Dense layer
+            if isinstance(layer, tf.keras.layers.Dense):
                 dummy_input = tf.convert_to_tensor(tf.random.normal([1, layer.input_shape[1]]))
                 dense_layer = tf.keras.layers.Dense(
                     units=layer.units,
@@ -548,14 +536,12 @@ class AutoClassifier(tf.keras.Model):
                 )
                 dense_layer.build(dummy_input.shape)
                 self.encoder.add(dense_layer)
-                # Set the weights correctly
                 self.encoder.layers[i].set_weights(layer.get_weights())
             elif not isinstance(layer, InputLayer):
                 raise ValueError(f"Layer type {type(layer)} not supported for copying.")
-        # Copy the decoder layers
         for i, layer in enumerate(source_model.decoder.layers):
-            if isinstance(layer, tf.keras.layers.Dense):  # Ensure it's a Dense layer
+            if isinstance(layer, tf.keras.layers.Dense):
                 dummy_input = tf.convert_to_tensor(tf.random.normal([1, layer.input_shape[1]]))
                 dense_layer = tf.keras.layers.Dense(
                     units=layer.units,
@@ -564,7 +550,6 @@ class AutoClassifier(tf.keras.Model):
                 )
                 dense_layer.build(dummy_input.shape)
                 self.decoder.add(dense_layer)
-                # Set the weights correctly
                 self.decoder.layers[i].set_weights(layer.get_weights())
             elif not isinstance(layer, InputLayer):
                 raise ValueError(f"Layer type {type(layer)} not supported for copying.")
@@ -907,62 +892,220 @@ def setup_model(
 class GetInsights:
+    """
+    A class to analyze the output of a neural network model, including visualizations
+    of the weights, t-SNE representation, and feature statistics.
+    Parameters
+    ----------
+    model : `AutoClassifier`
+        The trained model to analyze.
+    inputs : `np.ndarray`
+        The input data for analysis.
+    """
     def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
+        """
+        Initializes the GetInsights class.
+        Parameters
+        ----------
+        model : `AutoClassifier`
+            The trained model to analyze.
+        inputs : `np.ndarray`
+            The input data for analysis.
+        """
         self.inputs = inputs
         self.model = model
-        if isinstance(self.model.encoder.layers[0], InputLayer):
-            self.encoder_layer = self.model.encoder.layers[1]
-        else:
-            self.encoder_layer = self.model.encoder.layers[0]
+        self.encoder_layer = (
+            self.model.encoder.layers[1]
+            if isinstance(self.model.encoder.layers[0], InputLayer)
+            else self.model.encoder.layers[0]
+        )
         self.decoder_layer = self.model.decoder.layers[0]
         self.encoder_weights = self.encoder_layer.get_weights()[0]
         self.decoder_weights = self.decoder_layer.get_weights()[0]
-        colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
+        self.sorted_names = self._generate_sorted_color_names()
+    def _generate_sorted_color_names(self) -> list:
+        """
+        Generate sorted color names based on their HSV values.
+        Parameters
+        ----------
+        `None`
+        Returns
+        -------
+        `list` : Sorted color names.
+        """
+        colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
         by_hsv = sorted(
             (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
             for name, color in colors.items()
         )
-        self.sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
-        random.shuffle(self.sorted_names)
+        sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
+        random.shuffle(sorted_names)
+        return sorted_names
     def predictor_analyzer(
         self,
-        frac=None,
+        frac: float = None,
         cmap: str = "viridis",
         aspect: str = "auto",
         highlight: bool = True,
         **kwargs,
     ) -> None:
+        """
+        Analyze the model's predictions and visualize data.
+        Parameters
+        ----------
+        frac : `float`, optional
+            Fraction of data to use for analysis (default is `None`).
+        cmap : `str`, optional
+            The colormap for visualization (default is `"viridis"`).
+        aspect : `str`, optional
+            Aspect ratio for the visualization (default is `"auto"`).
+        highlight : `bool`, optional
+            Whether to highlight the maximum weights (default is `True`).
+        **kwargs : `dict`, optional
+            Additional keyword arguments for customization.
+        Returns
+        -------
+        `DataFrame` : The statistical summary of the input data.
+        """
         self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
         inputs = self.inputs.copy()
+        inputs = self._prepare_inputs(inputs, frac)
         y_labels = kwargs.get("y_labels", None)
+        encoded, reconstructed = self._encode_decode(inputs)
+        self._visualize_data(inputs, reconstructed, cmap, aspect)
+        self._prepare_data_for_analysis(inputs, reconstructed, encoded, y_labels)
+        try:
+            self._get_tsne_repr(inputs, frac)
+            self._viz_tsne_repr(c=self.classification)
+            self._viz_radviz(self.data, "class", "Radviz Visualization of Latent Space")
+            self._viz_radviz(self.data_input, "class", "Radviz Visualization of Input Data")
+        except ValueError:
+            warnings.warn(
+                "Some functions or processes will not be executed for regression problems.",
+                UserWarning,
+            )
+        return self._statistics(self.data_input)
+    def _prepare_inputs(self, inputs: np.ndarray, frac: float) -> np.ndarray:
+        """
+        Prepare the input data, possibly selecting a fraction of it.
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        frac : `float`
+            Fraction of data to use.
+        Returns
+        -------
+        `np.ndarray` : The prepared input data.
+        """
         if frac:
             n = int(frac * self.inputs.shape[0])
             indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
             inputs = inputs[indexes]
         inputs[np.isnan(inputs)] = 0.0
-        # check if self.model.encoder(inputs) has two outputs
+        return inputs
+    def _encode_decode(self, inputs: np.ndarray) -> tuple:
+        """
+        Perform encoding and decoding on the input data.
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        Returns
+        -------
+        `tuple` : The encoded and reconstructed data.
+        """
         try:
             mean, log_var = self.model.encoder(inputs)
             encoded = sampling(mean, log_var)
         except:
             encoded = self.model.encoder(inputs)
         reconstructed = self.model.decoder(encoded)
-        combined = tf.concat([reconstructed, encoded], axis=1)
-        self.classification = self.model.classifier(combined).numpy().argmax(axis=1)
+        return encoded, reconstructed
+    def _visualize_data(
+        self, inputs: np.ndarray, reconstructed: np.ndarray, cmap: str, aspect: str
+    ) -> None:
+        """
+        Visualize the original data and the reconstructed data.
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        reconstructed : `np.ndarray`
+            The reconstructed data.
+        cmap : `str`
+            The colormap for visualization.
+        aspect : `str`
+            Aspect ratio for the visualization.
+        Returns
+        -------
+        `None`
+        """
         ax = plt.subplot(1, 2, 1)
-        plt.imshow(self.inputs, cmap=cmap, aspect=aspect)
+        plt.imshow(inputs, cmap=cmap, aspect=aspect)
         plt.colorbar()
         plt.title("Original Data")
         plt.subplot(1, 2, 2, sharex=ax, sharey=ax)
         plt.imshow(reconstructed, cmap=cmap, aspect=aspect)
         plt.colorbar()
         plt.title("Decoder Layer Reconstruction")
         plt.show()
-        self._get_tsne_repr(inputs=inputs, frac=frac)
-        self._viz_tsne_repr(c=self.classification)
+    def _prepare_data_for_analysis(
+        self,
+        inputs: np.ndarray,
+        reconstructed: np.ndarray,
+        encoded: np.ndarray,
+        y_labels: List[str],
+    ) -> None:
+        """
+        Prepare data for statistical analysis.
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        reconstructed : `np.ndarray`
+            The reconstructed data.
+        encoded : `np.ndarray`
+            The encoded data.
+        y_labels : `List[str]`
+            The labels of features.
+        Returns
+        -------
+        `None`
+        """
+        self.classification = (
+            self.model.classifier(tf.concat([reconstructed, encoded], axis=1))
+            .numpy()
+            .argmax(axis=1)
+        )
         self.data = pd.DataFrame(encoded, columns=[f"Feature {i}" for i in range(encoded.shape[1])])
         self.data_input = pd.DataFrame(
@@ -971,84 +1114,25 @@ class GetInsights:
                 [f"Feature {i}" for i in range(inputs.shape[1])] if y_labels is None else y_labels
             ),
         )
         self.data["class"] = self.classification
         self.data_input["class"] = self.classification
-        self.data_normalized = self.data.copy(deep=True)
-        self.data_normalized.iloc[:, :-1] = (
-            2.0
-            * (self.data_normalized.iloc[:, :-1] - self.data_normalized.iloc[:, :-1].min())
-            / (self.data_normalized.iloc[:, :-1].max() - self.data_normalized.iloc[:, :-1].min())
-            - 1
-        )
-        radviz(self.data_normalized, "class", color=self.colors)
-        plt.title("Radviz Visualization of Latent Space")
-        plt.show()
-        self.data_input_normalized = self.data_input.copy(deep=True)
-        self.data_input_normalized.iloc[:, :-1] = (
-            2.0
-            * (
-                self.data_input_normalized.iloc[:, :-1]
-                - self.data_input_normalized.iloc[:, :-1].min()
-            )
-            / (
-                self.data_input_normalized.iloc[:, :-1].max()
-                - self.data_input_normalized.iloc[:, :-1].min()
-            )
-            - 1
-        )
-        radviz(self.data_input_normalized, "class", color=self.colors)
-        plt.title("Radviz Visualization of Input Data")
-        plt.show()
-        return self._statistics(self.data_input)
-    def _statistics(self, data_input: DataFrame, **kwargs) -> DataFrame:
-        data = data_input.copy(deep=True)
-        if not pd.api.types.is_string_dtype(data["class"]):
-            data["class"] = data["class"].astype(str)
-        data.ffill(inplace=True)
-        grouped_data = data.groupby("class")
-        numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
-        numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
-        def get_mode(x):
-            mode_series = x.mode()
-            return mode_series.iloc[0] if not mode_series.empty else None
-        mode_stats = grouped_data.apply(get_mode, include_groups=False)
-        mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
-        combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
-        return combined_stats.T
-    def _viz_weights(
-        self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
-    ) -> None:
-        title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
-        y_labels = kwargs.get("y_labels", None)
-        cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
-        highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
+    def _get_tsne_repr(self, inputs: np.ndarray = None, frac: float = None) -> None:
+        """
+        Perform t-SNE dimensionality reduction on the input data.
-        plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
-        plt.colorbar()
-        plt.title(title)
-        if y_labels is not None:
-            plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
-        if highlight:
-            for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
-                highlight_mask[i, j] = True
-            plt.imshow(
-                np.ma.masked_where(~highlight_mask, self.encoder_weights),
-                cmap=cmap_highlight,
-                alpha=0.5,
-                aspect=aspect,
-            )
-        plt.show()
+        Parameters
+        ----------
+        inputs : `np.ndarray`
+            The input data.
+        frac : `float`
+            Fraction of data to use.
-    def _get_tsne_repr(self, inputs=None, frac=None) -> None:
+        Returns
+        -------
+        `None`
+        """
         if inputs is None:
             inputs = self.inputs.copy()
             if frac:
@@ -1062,26 +1146,145 @@ class GetInsights:
         self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)
     def _viz_tsne_repr(self, **kwargs) -> None:
+        """
+        Visualize the t-SNE representation of the latent space.
+        Parameters
+        ----------
+        **kwargs : `dict`
+            Additional keyword arguments for customization.
+        Returns
+        -------
+        `None`
+        """
         c = kwargs.get("c", None)
         self.colors = (
             kwargs.get("colors", self.sorted_names[: len(np.unique(c))]) if c is not None else None
         )
         plt.scatter(
             self.reduced_data_tsne[:, 0],
             self.reduced_data_tsne[:, 1],
             cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
             c=c,
         )
         if c is not None:
             cb = plt.colorbar()
             loc = np.arange(0, max(c), max(c) / float(len(self.colors)))
             cb.set_ticks(loc)
             cb.set_ticklabels(np.unique(c))
         plt.title("t-SNE Visualization of Latent Space")
         plt.xlabel("t-SNE 1")
         plt.ylabel("t-SNE 2")
         plt.show()
+    def _viz_radviz(self, data: pd.DataFrame, color_column: str, title: str) -> None:
+        """
+        Visualize the data using RadViz.
+        Parameters
+        ----------
+        data : `pd.DataFrame`
+            The data to visualize.
+        color_column : `str`
+            The column to use for coloring.
+        title : `str`
+            The title of the plot.
+        Returns
+        -------
+        `None`
+        """
+        data_normalized = data.copy(deep=True)
+        data_normalized.iloc[:, :-1] = (
+            2.0
+            * (data_normalized.iloc[:, :-1] - data_normalized.iloc[:, :-1].min())
+            / (data_normalized.iloc[:, :-1].max() - data_normalized.iloc[:, :-1].min())
+            - 1
+        )
+        radviz(data_normalized, color_column, color=self.colors)
+        plt.title(title)
+        plt.show()
+    def _viz_weights(
+        self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
+    ) -> None:
+        """
+        Visualize the encoder layer weights of the model.
+        Parameters
+        ----------
+        cmap : `str`, optional
+            The colormap for visualization (default is `"viridis"`).
+        aspect : `str`, optional
+            Aspect ratio for the visualization (default is `"auto"`).
+        highlight : `bool`, optional
+            Whether to highlight the maximum weights (default is `True`).
+        **kwargs : `dict`, optional
+            Additional keyword arguments for customization.
+        Returns
+        -------
+        `None`
+        """
+        title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
+        y_labels = kwargs.get("y_labels", None)
+        cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
+        highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
+        plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
+        plt.colorbar()
+        plt.title(title)
+        if y_labels is not None:
+            plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
+        if highlight:
+            for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
+                highlight_mask[i, j] = True
+            plt.imshow(
+                np.ma.masked_where(~highlight_mask, self.encoder_weights),
+                cmap=cmap_highlight,
+                alpha=0.5,
+                aspect=aspect,
+            )
+        plt.show()
+    def _statistics(self, data_input: DataFrame) -> DataFrame:
+        """
+        Compute statistical summaries of the input data.
+        Parameters
+        ----------
+        data_input : `DataFrame`
+            The data to compute statistics for.
+        Returns
+        -------
+        `DataFrame` : The statistical summary of the input data.
+        """
+        data = data_input.copy(deep=True)
+        if not pd.api.types.is_string_dtype(data["class"]):
+            data["class"] = data["class"].astype(str)
+        data.ffill(inplace=True)
+        grouped_data = data.groupby("class")
+        numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
+        numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
+        def get_mode(x):
+            mode_series = x.mode()
+            return mode_series.iloc[0] if not mode_series.empty else None
+        mode_stats = grouped_data.apply(get_mode, include_groups=False)
+        mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
+        combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
+        return combined_stats.T
 ########################################################################################

likelihood 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

likelihood 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl