PyPI - likelihood - Versions diffs - 1.2.23__py3-none-any.whl → 1.2.25__py3-none-any.whl - Mend

likelihood 1.2.23__py3-none-any.whl → 1.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

likelihood/graph/nn.py CHANGED Viewed

@@ -1,7 +1,9 @@
+import logging
 import os
-os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
-import logging
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+logging.getLogger("tensorflow").setLevel(logging.ERROR)
 import warnings
 from typing import List, Tuple
@@ -9,19 +11,16 @@ import numpy as np
 import pandas as pd
 import tensorflow as tf
 from IPython.display import clear_output
-from numpy import ndarray
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
 from likelihood.tools import generate_feature_yaml
-logging.getLogger("tensorflow").setLevel(logging.ERROR)
-tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+tf.get_logger().setLevel("ERROR")
-def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
+def compare_similarity(arr1: np.ndarray, arr2: np.ndarray) -> int:
     """Compares the similarity between two arrays of categories.
     Parameters
@@ -44,9 +43,9 @@ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
     return count
-def cal_adjency_matrix(
+def cal_adjacency_matrix(
     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
-) -> Tuple[dict, ndarray]:
+) -> Tuple[dict, np.ndarray]:
     """Calculates the adjacency matrix for a given DataFrame.
     The adjacency matrix is a matrix that represents the similarity between each pair of categories.
     The similarity is calculated using the `compare_similarity` function.
@@ -133,7 +132,7 @@ class Data:
         target: str | None = None,
         exclude_subset: List[str] = [],
     ):
-        _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+        _, adjacency = cal_adjacency_matrix(df, exclude_subset=exclude_subset, sparse=True)
         if target is not None:
             X = df.drop(columns=[target] + exclude_subset)
         else:

likelihood/models/deep/autoencoders.py CHANGED Viewed

@@ -1,19 +1,40 @@
 import logging
 import os
+import random
 from functools import partial
 from shutil import rmtree
-import keras_tuner
+import matplotlib
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from pandas.plotting import radviz
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+logging.getLogger("tensorflow").setLevel(logging.ERROR)
+import warnings
+from functools import wraps
+import keras_tuner
 import tensorflow as tf
 from pandas.core.frame import DataFrame
+from sklearn.manifold import TSNE
 from likelihood.tools import OneHotEncoder
-logging.getLogger("tensorflow").setLevel(logging.ERROR)
+tf.get_logger().setLevel("ERROR")
+def suppress_warnings(func):
+    """Decorator that calls ``func`` with all Python warnings ignored.
+    Parameters
+    ----------
+    func : `callable`
+        The function to wrap.
+    Returns
+    -------
+    `callable`
+        A wrapper that forwards every positional and keyword argument to ``func`` and returns its result.
+    """
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        # The "ignore" filter is installed inside catch_warnings, so it only applies for the duration of this call.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            return func(*args, **kwargs)
-tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+    return wrapper
 @tf.keras.utils.register_keras_serializable(package="Custom", name="AutoClassifier")
@@ -35,7 +56,7 @@ class AutoClassifier(tf.keras.Model):
         from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
     """
-    def __init__(self, input_shape_parm, num_classes, units, activation):
+    def __init__(self, input_shape_parm, num_classes, units, activation, **kwargs):
         """
         Initializes an AutoClassifier instance with the given parameters.
@@ -49,6 +70,15 @@ class AutoClassifier(tf.keras.Model):
             The number of neurons in each hidden layer.
         activation : `str`
             The type of activation function to use for the neural network layers.
+        Keyword Arguments:
+        ----------
+        Additional keyword arguments to pass to the model.
+        classifier_activation : `str`
+            The activation function to use for the classifier layer. Default is "softmax". If the activation function is not a classification function, the model can be used in regression problems.
+        num_layers : `int`
+            The number of hidden layers in the classifier. Default is 1.
         """
         super(AutoClassifier, self).__init__()
         self.input_shape_parm = input_shape_parm
@@ -59,6 +89,8 @@ class AutoClassifier(tf.keras.Model):
         self.encoder = None
         self.decoder = None
         self.classifier = None
+        self.classifier_activation = kwargs.get("classifier_activation", "softmax")
+        self.num_layers = kwargs.get("num_layers", 1)
     def build(self, input_shape):
         self.encoder = tf.keras.Sequential(
@@ -75,8 +107,14 @@ class AutoClassifier(tf.keras.Model):
             ]
         )
-        self.classifier = tf.keras.Sequential(
-            [tf.keras.layers.Dense(self.num_classes, activation="softmax")]
+        self.classifier = tf.keras.Sequential()
+        if self.num_layers > 1:
+            for _ in range(self.num_layers - 1):
+                self.classifier.add(
+                    tf.keras.layers.Dense(units=self.units, activation=self.activation)
+                )
+        self.classifier.add(
+            tf.keras.layers.Dense(units=self.num_classes, activation=self.classifier_activation)
         )
     def call(self, x):
@@ -92,6 +130,8 @@ class AutoClassifier(tf.keras.Model):
             "num_classes": self.num_classes,
             "units": self.units,
             "activation": self.activation,
+            "classifier_activation": self.classifier_activation,
+            "num_layers": self.num_layers,
         }
         base_config = super(AutoClassifier, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
@@ -103,6 +143,8 @@ class AutoClassifier(tf.keras.Model):
             num_classes=config["num_classes"],
             units=config["units"],
             activation=config["activation"],
+            classifier_activation=config["classifier_activation"],
+            num_layers=config["num_layers"],
         )
@@ -113,6 +155,7 @@ def call_existing_code(
     optimizer: str,
     input_shape_parm: None | int = None,
     num_classes: None | int = None,
+    num_layers: int = 1,
 ) -> AutoClassifier:
     """
     Calls an existing AutoClassifier instance.
@@ -142,6 +185,7 @@ def call_existing_code(
         num_classes=num_classes,
         units=units,
         activation=activation,
+        num_layers=num_layers,
     )
     model.compile(
         optimizer=optimizer,
@@ -151,7 +195,9 @@ def call_existing_code(
     return model
-def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
+def build_model(
+    hp, input_shape_parm: None | int, num_classes: None | int, **kwargs
+) -> AutoClassifier:
     """Builds a neural network model using Keras Tuner's search algorithm.
     Parameters
@@ -163,17 +209,51 @@ def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> Au
     num_classes : `int`
         The number of classes in the dataset.
+    Keyword Arguments:
+    ----------
+    Additional keyword arguments to pass to the model.
+    hyperparameters : `dict`
+        The hyperparameters to set.
     Returns
     -------
     `keras.Model`
         The neural network model.
     """
-    units = hp.Int(
-        "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
+    hyperparameters = kwargs.get("hyperparameters", None)
+    hyperparameters_keys = hyperparameters.keys() if hyperparameters is not None else []
+    units = (
+        hp.Int(
+            "units",
+            min_value=int(input_shape_parm * 0.2),
+            max_value=int(input_shape_parm * 1.5),
+            step=2,
+        )
+        if "units" not in hyperparameters_keys
+        else hyperparameters["units"]
+    )
+    activation = (
+        hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus", "softsign"])
+        if "activation" not in hyperparameters_keys
+        else hyperparameters["activation"]
+    )
+    optimizer = (
+        hp.Choice("optimizer", ["sgd", "adam", "adadelta", "rmsprop", "adamax", "adagrad"])
+        if "optimizer" not in hyperparameters_keys
+        else hyperparameters["optimizer"]
+    )
+    threshold = (
+        hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
+        if "threshold" not in hyperparameters_keys
+        else hyperparameters["threshold"]
+    )
+    num_layers = (
+        hp.Int("num_layers", min_value=1, max_value=10, step=1)
+        if "num_layers" not in hyperparameters_keys
+        else hyperparameters["num_layers"]
     )
-    activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
-    optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
-    threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
     model = call_existing_code(
         units=units,
@@ -182,10 +262,12 @@ def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> Au
         optimizer=optimizer,
         input_shape_parm=input_shape_parm,
         num_classes=num_classes,
+        num_layers=num_layers,
     )
     return model
+@suppress_warnings
 def setup_model(
     data: DataFrame,
     target: str,
@@ -194,6 +276,7 @@ def setup_model(
     seed=None,
     train_mode: bool = True,
     filepath: str = "./my_dir/best_model",
+    method: str = "Hyperband",
     **kwargs,
 ) -> AutoClassifier:
     """Setup model for training and tuning.
@@ -214,6 +297,8 @@ def setup_model(
         Whether to train the model or not.
     filepath : `str`
         The path to save the best model to.
+    method : `str`
+        The method to use for hyperparameter tuning. Options are "Hyperband" and "RandomSearch".
     Keyword Arguments:
     ----------
@@ -229,30 +314,30 @@ def setup_model(
         The objective to optimize.
     verbose : `bool`
         Whether to print verbose output.
+    hyperparameters : `dict`
+        The hyperparameters to set.
     Returns
     -------
     model : `AutoClassifier`
         The trained model.
     """
-    max_trials = kwargs["max_trials"] if "max_trials" in kwargs else 10
-    directory = kwargs["directory"] if "directory" in kwargs else "./my_dir"
-    project_name = kwargs["project_name"] if "project_name" in kwargs else "get_best"
-    objective = kwargs["objective"] if "objective" in kwargs else "val_loss"
-    verbose = kwargs["verbose"] if "verbose" in kwargs else True
+    max_trials = kwargs.get("max_trials", 10)
+    directory = kwargs.get("directory", "./my_dir")
+    project_name = kwargs.get("project_name", "get_best")
+    objective = kwargs.get("objective", "val_loss")
+    verbose = kwargs.get("verbose", True)
+    hyperparameters = kwargs.get("hyperparameters", None)
     X = data.drop(columns=target)
     input_sample = X.sample(1)
     y = data[target]
-    # Verify if there are categorical columns in the dataframe
     assert (
         X.select_dtypes(include=["object"]).empty == True
     ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
     validation_split = 1.0 - train_size
-    # Create my_dir path if it does not exist
     if train_mode:
-        # Create a new directory if it does not exist
         try:
             if (not os.path.exists(directory)) and directory != "./":
                 os.makedirs(directory)
@@ -263,7 +348,6 @@ def setup_model(
         except:
             print("Warning: unable to create directory")
-        # Create a Classifier instance
         y_encoder = OneHotEncoder()
         y = y_encoder.encode(y.to_list())
         X = X.to_numpy()
@@ -276,34 +360,239 @@ def setup_model(
         num_classes = y.shape[1]
         global build_model
         build_model = partial(
-            build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
+            build_model,
+            input_shape_parm=input_shape_parm,
+            num_classes=num_classes,
+            hyperparameters=hyperparameters,
         )
-        # Create the AutoKeras model
-        tuner = keras_tuner.RandomSearch(
-            hypermodel=build_model,
-            objective=objective,
-            max_trials=max_trials,
-            directory=directory,
-            project_name=project_name,
-            seed=seed,
-        )
-        tuner.search(X, y, epochs=epochs, validation_split=validation_split)
+        if method == "Hyperband":
+            tuner = keras_tuner.Hyperband(
+                hypermodel=build_model,
+                objective=objective,
+                max_epochs=epochs,
+                factor=3,
+                directory=directory,
+                project_name=project_name,
+                seed=seed,
+            )
+        elif method == "RandomSearch":
+            tuner = keras_tuner.RandomSearch(
+                hypermodel=build_model,
+                objective=objective,
+                max_trials=max_trials,
+                directory=directory,
+                project_name=project_name,
+                seed=seed,
+            )
+        tuner.search(X, y, epochs=epochs, validation_split=validation_split, verbose=verbose)
         models = tuner.get_best_models(num_models=2)
         best_model = models[0]
         best_model(input_sample)
-        # save model
         best_model.save(filepath, save_format="tf")
         if verbose:
             tuner.results_summary()
     else:
-        # Load the best model from the directory
         best_model = tf.keras.models.load_model(filepath)
-    return best_model
+    best_hps = tuner.get_best_hyperparameters(1)[0].values
+    return best_model, pd.DataFrame(best_hps, index=["Value"])
+class GetInsights:
+    """Plotting and summary helpers for inspecting a trained ``AutoClassifier``.
+    The constructor reads the kernel of the first encoder layer, the first decoder layer and one
+    classifier layer from the model, and prepares a shuffled list of saturated matplotlib color
+    names that the plotting methods use to color classes.
+    """
+    def __init__(self, model: AutoClassifier, inputs: np.ndarray) -> None:
+        """Store the model and data and extract the layer weights used by the plots.
+        Parameters
+        ----------
+        model : `AutoClassifier`
+            A model whose ``encoder``, ``decoder`` and ``classifier`` sub-models are already built.
+        inputs : `np.ndarray`
+            The data to analyze. Presumably shaped (samples, features) - TODO confirm with callers.
+        """
+        self.inputs = inputs
+        self.model = model
+        self.encoder_layer = self.model.encoder.layers[0]
+        self.decoder_layer = self.model.decoder.layers[0]
+        classifier_layers = self.model.classifier.layers
+        # The classifier holds `num_layers` Dense layers and `num_layers` defaults to 1, so
+        # layers[-2] does not always exist; fall back to the only layer instead of raising IndexError.
+        self.classifier_layer = (
+            classifier_layers[-2] if len(classifier_layers) > 1 else classifier_layers[-1]
+        )
+        # get_weights()[0] is the kernel matrix of a Dense layer (index 1 would be the bias).
+        self.encoder_weights = self.encoder_layer.get_weights()[0]
+        self.decoder_weights = self.decoder_layer.get_weights()[0]
+        self.classifier_weights = self.classifier_layer.get_weights()[0]
+        colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
+        by_hsv = sorted(
+            (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
+            for name, color in colors.items()
+        )
+        # Keep only reasonably saturated and bright colors, then shuffle them so that the class
+        # colors are not always neighbouring hues.
+        self.sorted_names = [name for hsv, name in by_hsv if hsv[1] > 0.4 and hsv[2] >= 0.4]
+        random.shuffle(self.sorted_names)
+    def predictor_analyzer(
+        self,
+        frac=None,
+        cmap: str = "viridis",
+        aspect: str = "auto",
+        highlight: bool = True,
+        **kwargs,
+    ) -> DataFrame:
+        """Run the full set of plots and return per-class statistics of the inputs.
+        Shows, in order: the encoder weights, the analyzed data next to its decoder reconstruction,
+        a t-SNE scatter colored by predicted class, and radviz plots of the latent space and of
+        the inputs.
+        Parameters
+        ----------
+        frac : `float` or `None`
+            If truthy, analyze a random sample of ``int(frac * n_rows)`` rows drawn without replacement.
+        cmap : `str`
+            Colormap for the image plots.
+        aspect : `str`
+            Aspect setting for the image plots.
+        highlight : `bool`
+            Forwarded to the weight plot; highlights the largest weight of every row.
+        Keyword Arguments:
+        ----------
+        y_labels : sequence of `str`
+            Names used for the input features. The remaining keyword arguments are forwarded to the weight plot.
+        Returns
+        -------
+        `DataFrame`
+            The transposed table of per-class statistics computed by ``_statistics``.
+        """
+        self._viz_weights(cmap=cmap, aspect=aspect, highlight=highlight, **kwargs)
+        inputs = self.inputs.copy()
+        y_labels = kwargs.get("y_labels", None)
+        if frac:
+            n = int(frac * self.inputs.shape[0])
+            indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
+            inputs = inputs[indexes]
+        inputs[np.isnan(inputs)] = 0.0
+        encoded = self.model.encoder(inputs)
+        reconstructed = self.model.decoder(encoded)
+        combined = tf.concat([reconstructed, encoded], axis=1)
+        self.classification = self.model.classifier(combined).numpy().argmax(axis=1)
+        ax = plt.subplot(1, 2, 1)
+        # Plot the rows that were actually encoded (sampled, NaN-filled) so that both panels, which
+        # share their axes, show the same rows in the same order.
+        plt.imshow(inputs, cmap=cmap, aspect=aspect)
+        plt.colorbar()
+        plt.title("Original Data")
+        plt.subplot(1, 2, 2, sharex=ax, sharey=ax)
+        plt.imshow(reconstructed, cmap=cmap, aspect=aspect)
+        plt.colorbar()
+        plt.title("Decoder Layer Reconstruction")
+        plt.show()
+        self._get_tsne_repr(inputs=inputs, frac=frac)
+        # This call also sets self.colors, which the radviz plots below rely on.
+        self._viz_tsne_repr(c=self.classification)
+        self.data = pd.DataFrame(encoded, columns=[f"Feature {i}" for i in range(encoded.shape[1])])
+        self.data_input = pd.DataFrame(
+            inputs,
+            columns=(
+                [f"Feature {i}" for i in range(inputs.shape[1])] if y_labels is None else y_labels
+            ),
+        )
+        self.data["class"] = self.classification
+        self.data_input["class"] = self.classification
+        radviz(self.data, "class", color=self.colors)
+        plt.title("Radviz Visualization of Latent Space")
+        plt.show()
+        radviz(self.data_input, "class", color=self.colors)
+        plt.title("Radviz Visualization of Input Data")
+        plt.show()
+        return self._statistics(self.data_input)
+    def _statistics(self, data_input: DataFrame, **kwargs) -> DataFrame:
+        """Return mean, min, max, std, median and mode of every column, grouped by ``class``.
+        The input is copied, the ``class`` column is cast to string when it is not one already, and
+        missing values are forward-filled before grouping. The result is transposed, so statistics
+        are rows and classes are columns.
+        """
+        data = data_input.copy(deep=True)
+        if not pd.api.types.is_string_dtype(data["class"]):
+            data["class"] = data["class"].astype(str)
+        data.ffill(inplace=True)
+        grouped_data = data.groupby("class")
+        numerical_stats = grouped_data.agg(["mean", "min", "max", "std", "median"])
+        # Flatten the (column, statistic) MultiIndex into "column_statistic" names.
+        numerical_stats.columns = ["_".join(col).strip() for col in numerical_stats.columns.values]
+        def get_mode(x):
+            mode_series = x.mode()
+            return mode_series.iloc[0] if not mode_series.empty else None
+        mode_stats = grouped_data.apply(get_mode, include_groups=False)
+        mode_stats.columns = [f"{col}_mode" for col in mode_stats.columns]
+        combined_stats = pd.concat([numerical_stats, mode_stats], axis=1)
+        return combined_stats.T
+    def _viz_weights(
+        self, cmap: str = "viridis", aspect: str = "auto", highlight: bool = True, **kwargs
+    ) -> None:
+        """Show the encoder kernel as an image, optionally highlighting each row's largest weight.
+        Keyword Arguments:
+        ----------
+        title : `str`
+            Plot title. Default is "Encoder Layer Weights (Dense Layer)".
+        y_labels : sequence of `str`
+            Tick labels for the rows of the kernel.
+        cmap_highlight : `str`
+            Colormap of the highlight overlay. Default is "Pastel1".
+        """
+        title = kwargs.get("title", "Encoder Layer Weights (Dense Layer)")
+        y_labels = kwargs.get("y_labels", None)
+        cmap_highlight = kwargs.get("cmap_highlight", "Pastel1")
+        highlight_mask = np.zeros_like(self.encoder_weights, dtype=bool)
+        plt.imshow(self.encoder_weights, cmap=cmap, aspect=aspect)
+        plt.colorbar()
+        plt.title(title)
+        if y_labels is not None:
+            plt.yticks(ticks=np.arange(self.encoder_weights.shape[0]), labels=y_labels)
+        if highlight:
+            for i, j in enumerate(self.encoder_weights.argmax(axis=1)):
+                highlight_mask[i, j] = True
+            # Everything except the per-row maxima is masked out, so the overlay only tints those cells.
+            plt.imshow(
+                np.ma.masked_where(~highlight_mask, self.encoder_weights),
+                cmap=cmap_highlight,
+                alpha=0.5,
+                aspect=aspect,
+            )
+        plt.show()
+    def _get_tsne_repr(self, inputs=None, frac=None) -> None:
+        """Project the inputs through the encoder kernel and reduce the result to 2-D with t-SNE.
+        When ``inputs`` is None the stored inputs are used, optionally sampled by ``frac`` and with
+        NaN replaced by 0.0; explicitly passed ``inputs`` are used as given and ``frac`` is ignored.
+        Only the kernel is applied (``inputs @ encoder_weights``), not the layer bias or activation.
+        Results are stored in ``self.latent_representations`` and ``self.reduced_data_tsne``.
+        """
+        if inputs is None:
+            inputs = self.inputs.copy()
+            if frac:
+                n = int(frac * self.inputs.shape[0])
+                indexes = np.random.choice(np.arange(inputs.shape[0]), n, replace=False)
+                inputs = inputs[indexes]
+            inputs[np.isnan(inputs)] = 0.0
+        self.latent_representations = inputs @ self.encoder_weights
+        tsne = TSNE(n_components=2)
+        self.reduced_data_tsne = tsne.fit_transform(self.latent_representations)
+    def _viz_tsne_repr(self, **kwargs) -> None:
+        """Scatter-plot the stored t-SNE embedding, optionally colored by class.
+        Keyword Arguments:
+        ----------
+        c : array-like
+            Class value of every plotted point. When omitted the points are not colored.
+        colors : `list` of `str`
+            Color names, one per class. Defaults to the first entries of the shuffled color list.
+        """
+        c = kwargs.get("c", None)
+        self.colors = (
+            kwargs.get("colors", self.sorted_names[: len(np.unique(c))]) if c is not None else None
+        )
+        plt.scatter(
+            self.reduced_data_tsne[:, 0],
+            self.reduced_data_tsne[:, 1],
+            cmap=matplotlib.colors.ListedColormap(self.colors) if c is not None else None,
+            c=c,
+        )
+        if c is not None:
+            cb = plt.colorbar()
+            labels = np.unique(c)
+            # linspace yields exactly one tick per label; a float-step arange could yield one tick
+            # too many and cannot be built at all when max(c) is 0.
+            loc = np.linspace(0, max(c), len(labels), endpoint=False)
+            cb.set_ticks(loc)
+            cb.set_ticklabels(labels)
+        plt.title("t-SNE Visualization of Latent Space")
+        plt.xlabel("t-SNE 1")
+        plt.ylabel("t-SNE 2")
+        plt.show()
 ########################################################################################
+if __name__ == "__main__":
+    # Example usage: train an AutoClassifier on the iris data and inspect it with GetInsights.
+    import pandas as pd
+    from sklearn.datasets import load_iris
+    # NOTE(review): this rebinds the module-level OneHotEncoder imported from likelihood.tools
+    # for the rest of the script.
+    from sklearn.preprocessing import OneHotEncoder
+    # Load the dataset
+    iris = load_iris()
+    # Convert to a DataFrame for easy exploration
+    iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+    iris_df["species"] = iris.target
+    # Split into the feature matrix and the target; keep the feature names for plot labels.
+    X = iris_df.drop(columns="species")
+    y_labels = X.columns
+    X = X.values
+    y = iris_df["species"].values
+    X = np.asarray(X).astype(np.float32)
+    # One-hot encode the target so it matches the categorical cross-entropy loss used below.
+    encoder = OneHotEncoder()
+    y = encoder.fit_transform(y.reshape(-1, 1)).toarray()
+    y = np.asarray(y).astype(np.float32)
+    # num_layers=2 gives the classifier one hidden Dense layer in front of the output layer.
+    model = AutoClassifier(
+        input_shape_parm=X.shape[1], num_classes=3, units=27, activation="selu", num_layers=2
+    )
+    model.compile(
+        optimizer="adam",
+        loss=tf.keras.losses.CategoricalCrossentropy(),
+        metrics=[tf.keras.metrics.F1Score(threshold=0.5)],
+    )
+    model.fit(X, y, epochs=50, validation_split=0.2)
+    insights = GetInsights(model, X)
+    # frac=1.0 analyzes every row, drawn in random order without replacement.
+    summary = insights.predictor_analyzer(frac=1.0, y_labels=y_labels)
+    # Recompute the embedding on the stored inputs in their original order, then plot it without
+    # colors and colored by the true species.
+    insights._get_tsne_repr()
+    insights._viz_tsne_repr()
+    insights._viz_tsne_repr(c=iris_df["species"])
+    insights._viz_weights()
+    print(summary)

likelihood/models/hmm.py ADDED Viewed

@@ -0,0 +1,163 @@
+import logging
+import os
+import pickle
+from typing import List, Tuple
+import numpy as np
+from IPython.display import clear_output
+class HMM:
+    """Hidden Markov model with discrete hidden states and discrete observations.
+    Attributes
+    ----------
+    n_states : `int`
+        Number of hidden states.
+    n_observations : `int`
+        Number of distinct observation symbols; sequences hold integers in ``[0, n_observations)``.
+    pi : `np.ndarray`
+        Initial state distribution, shape ``(n_states,)``.
+    A : `np.ndarray`
+        Transition matrix, ``A[i, j]`` is the probability of moving from state ``i`` to ``j``.
+    B : `np.ndarray`
+        Emission matrix, ``B[i, k]`` is the probability of observing symbol ``k`` in state ``i``.
+    """
+    def __init__(self, n_states: int, n_observations: int):
+        """Create a model whose parameters are random draws from flat Dirichlet distributions."""
+        self.n_states = n_states
+        self.n_observations = n_observations
+        # Every Dirichlet draw sums to one, so pi and each row of A and B start as valid distributions.
+        self.pi = np.random.dirichlet(np.ones(n_states), size=1)[0]
+        self.A = np.random.dirichlet(np.ones(n_states), size=n_states)
+        self.B = np.random.dirichlet(np.ones(n_observations), size=n_states)
+    def save_model(self, filename: str = "./hmm") -> None:
+        """Pickle the whole model to ``filename``; ".pkl" is appended when it is missing."""
+        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+        with open(filename, "wb") as f:
+            pickle.dump(self, f)
+    @staticmethod
+    def load_model(filename: str = "./hmm") -> "HMM":
+        """Load a model written by `save_model`; ".pkl" is appended when it is missing."""
+        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+        # NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
+        with open(filename, "rb") as f:
+            return pickle.load(f)
+    def forward(self, sequence: List[int]) -> np.ndarray:
+        """Run the forward pass and return per-step normalised log-probabilities.
+        Returns
+        -------
+        `np.ndarray`
+            Array of shape ``(len(sequence), n_states)``. Row ``t`` holds the LOG of the state
+            distribution given the observations up to ``t``, so ``np.exp(row)`` sums to one.
+            An empty sequence yields an empty ``(0, n_states)`` array.
+        """
+        T = len(sequence)
+        alpha = np.zeros((T, self.n_states))
+        if T == 0:
+            return alpha
+        # Small constant added before every log to avoid log(0).
+        epsilon = 1e-10
+        transition = self.A + epsilon
+        log_emission = np.log(self.B + epsilon)
+        alpha[0] = np.log(self.pi + epsilon) + log_emission[:, sequence[0]]
+        alpha[0] -= np.log(np.sum(np.exp(alpha[0])))
+        for t in range(1, T):
+            # exp(alpha[t - 1]) is a normalised distribution, so the product cannot underflow.
+            alpha[t] = np.log(np.exp(alpha[t - 1]) @ transition) + log_emission[:, sequence[t]]
+            alpha[t] -= np.log(np.sum(np.exp(alpha[t])))
+        return alpha
+    def backward(self, sequence: List[int], normalize: bool = False) -> np.ndarray:
+        """Run the backward pass in linear (non-log) space.
+        Parameters
+        ----------
+        sequence : `List[int]`
+            The observation sequence.
+        normalize : `bool`
+            When True every row is rescaled to sum to one, which prevents underflow on long
+            sequences. The default returns the unscaled values.
+        Returns
+        -------
+        `np.ndarray`
+            Array of shape ``(len(sequence), n_states)``; the last row is all ones.
+        """
+        T = len(sequence)
+        beta = np.ones((T, self.n_states))
+        for t in range(T - 2, -1, -1):
+            beta[t] = self.A @ (self.B[:, sequence[t + 1]] * beta[t + 1])
+            if normalize:
+                total = beta[t].sum()
+                if total > 0:
+                    beta[t] /= total
+        return beta
+    def _posteriors(self, sequence: List[int]) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """Return linear-space alpha, row-normalised beta and the smoothed state posteriors."""
+        # forward() returns logs; they must be exponentiated before being combined with beta.
+        alpha = np.exp(self.forward(sequence))
+        beta = self.backward(sequence, normalize=True)
+        joint = alpha * beta
+        totals = joint.sum(axis=1, keepdims=True)
+        safe_totals = np.where(totals > 0, totals, 1.0)
+        # If the future observations are impossible under the model, fall back to the filtered estimate.
+        gamma = np.where(totals > 0, joint / safe_totals, alpha)
+        return alpha, beta, gamma
+    @staticmethod
+    def _normalize_rows(counts: np.ndarray, fallback: np.ndarray) -> np.ndarray:
+        """Divide every row by its sum; rows that sum to zero keep the matching ``fallback`` row."""
+        totals = counts.sum(axis=1, keepdims=True)
+        safe_totals = np.where(totals > 0, totals, 1.0)
+        return np.where(totals > 0, counts / safe_totals, fallback)
+    def viterbi(self, sequence: List[int]) -> np.ndarray:
+        """Return the most probable hidden state path as an integer array of ``len(sequence)``."""
+        T = len(sequence)
+        state_sequence = np.zeros(T, dtype=int)
+        if T == 0:
+            return state_sequence
+        # Work with log-probabilities so that long sequences do not underflow to zero;
+        # log(0) = -inf is the intended value for impossible events.
+        with np.errstate(divide="ignore"):
+            log_pi = np.log(self.pi)
+            log_A = np.log(self.A)
+            log_B = np.log(self.B)
+        delta = np.zeros((T, self.n_states))
+        psi = np.zeros((T, self.n_states), dtype=int)
+        delta[0] = log_pi + log_B[:, sequence[0]]
+        for t in range(1, T):
+            # scores[k, i]: best log-probability of being in state i at t having come from state k.
+            scores = delta[t - 1][:, None] + log_A
+            psi[t] = scores.argmax(axis=0)
+            delta[t] = scores.max(axis=0) + log_B[:, sequence[t]]
+        # Follow the stored back-pointers from the best final state.
+        state_sequence[T - 1] = np.argmax(delta[T - 1])
+        for t in range(T - 2, -1, -1):
+            state_sequence[t] = psi[t + 1, state_sequence[t + 1]]
+        return state_sequence
+    def baum_welch(
+        self, sequences: List[List[int]], n_iterations: int, verbose: bool = False
+    ) -> None:
+        """Re-estimate ``pi``, ``A`` and ``B`` in place with the Baum-Welch (EM) algorithm.
+        Parameters
+        ----------
+        sequences : `List[List[int]]`
+            Training observation sequences. Empty sequences are ignored; if none remain the
+            model is left unchanged.
+        n_iterations : `int`
+            Number of EM iterations.
+        verbose : `bool`
+            When True, clears the screen and logs the parameters every 10 iterations.
+        """
+        usable = [sequence for sequence in sequences if len(sequence) > 0]
+        if not usable:
+            return
+        for iteration in range(n_iterations):
+            # Expected counts accumulated over all sequences.
+            A_num = np.zeros((self.n_states, self.n_states))
+            B_num = np.zeros((self.n_states, self.n_observations))
+            pi_num = np.zeros(self.n_states)
+            for sequence in usable:
+                alpha, beta, gamma = self._posteriors(sequence)
+                pi_num += gamma[0]
+                for t, observation in enumerate(sequence):
+                    B_num[:, observation] += gamma[t]
+                for t in range(len(sequence) - 1):
+                    # xi[i, j] is proportional to alpha_t(i) * A[i, j] * B[j, o_{t+1}] * beta_{t+1}(j);
+                    # alpha is broadcast over rows (source state) and the rest over columns.
+                    xi = alpha[t][:, None] * self.A * (self.B[:, sequence[t + 1]] * beta[t + 1])
+                    total = xi.sum()
+                    if total > 0:
+                        A_num += xi / total
+            self.pi = pi_num / len(usable)
+            self.A = self._normalize_rows(A_num, self.A)
+            self.B = self._normalize_rows(B_num, self.B)
+            # Logging parameters every 10 iterations
+            if iteration % 10 == 0 and verbose:
+                os.system("cls" if os.name == "nt" else "clear")
+                clear_output(wait=True)
+                logging.info("Iteration %d:", iteration)
+                logging.info("Pi: %s", self.pi)
+                logging.info("A:\n%s", self.A)
+                logging.info("B:\n%s", self.B)
+    def decoding_accuracy(self, sequences: List[List[int]], true_states: List[List[int]]) -> float:
+        """Return the percentage of time steps whose Viterbi state equals the true state.
+        Returns 0.0 when there is nothing to score.
+        """
+        correct_predictions = 0
+        total_predictions = 0
+        for sequence, true_state in zip(sequences, true_states):
+            predicted_states = self.viterbi(sequence)
+            correct_predictions += np.sum(predicted_states == np.asarray(true_state))
+            total_predictions += len(sequence)
+        if total_predictions == 0:
+            return 0.0
+        accuracy = (correct_predictions / total_predictions) * 100
+        return accuracy
+    def state_probabilities(self, sequence: List[int]) -> np.ndarray:
+        """
+        Returns the smoothed probabilities of the hidden states at each time step.
+        This is done by using both forward and backward probabilities.
+        Every row of the returned ``(len(sequence), n_states)`` array sums to one.
+        """
+        return self._posteriors(sequence)[2]
+    def sequence_probability(self, sequence: List[int]) -> np.ndarray:
+        """Return the state distribution at the final time step of ``sequence``.
+        NOTE(review): despite the name this is not the likelihood of the sequence; it is the last
+        row of `state_probabilities`.
+        """
+        return self.state_probabilities(sequence)[-1]

likelihood/models/simulation.py CHANGED Viewed

@@ -5,7 +5,6 @@ from typing import List, Tuple, Union
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from numpy import ndarray
 from pandas.core.frame import DataFrame
 from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, cdf, check_nan_inf
@@ -66,12 +65,12 @@ class SimulationEngine(FeatureSelection):
         super().__init__(**kwargs)
-    def predict(self, df: DataFrame, column: str) -> ndarray | list:
+    def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
         # Let us assign the dictionary entries corresponding to the column
         w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
         df = df[names_cols].copy()
-        # Change the scale of the dataframe
+        # Change the scale of the DataFrame
         dataset = self.df.copy()
         dataset.drop(columns=column, inplace=True)
         numeric_df = dataset.select_dtypes(include="number")
@@ -85,7 +84,7 @@ class SimulationEngine(FeatureSelection):
             for col in numeric_df.columns:
                 df[col] = numeric_df[col].values
-        # Encoding the datadrame
+        # Encoding the DataFrame
         for num, colname in enumerate(dfe._encode_columns):
             if df[colname].dtype == "object":
                 encode_dict = dfe.encoding_list[num]
@@ -93,7 +92,7 @@ class SimulationEngine(FeatureSelection):
                     dfe._code_transformation_to, dictionary_list=encode_dict
                 )
-        # PREDICTION
+        # Prediction
         y = df.to_numpy() @ w
         # Categorical column
@@ -113,7 +112,7 @@ class SimulationEngine(FeatureSelection):
         return y[:]
-    def _encode(self, df: DataFrame) -> ndarray | list:
+    def _encode(self, df: DataFrame) -> np.ndarray | list:
         df = df.copy()
         column = df.columns[0]
         frec = df[column].value_counts() / len(df)

{likelihood-1.2.23.dist-info → likelihood-1.2.25.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: likelihood
-Version: 1.2.23
+Version: 1.2.25
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -13,7 +13,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: black[jupyter]==24.1.1
+Requires-Dist: black[jupyter]>=24.3.0
 Requires-Dist: mypy-extensions==1.0.0
 Requires-Dist: types-openpyxl==3.1.0.15
 Requires-Dist: pydocstyle==6.3.0
@@ -31,6 +31,18 @@ Requires-Dist: pyvis; extra == "full"
 Requires-Dist: tensorflow==2.15.0; extra == "full"
 Requires-Dist: keras-tuner; extra == "full"
 Requires-Dist: scikit-learn; extra == "full"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: maintainer
+Dynamic: maintainer-email
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 ![likelihood](https://raw.githubusercontent.com/RodolfoFerro/likelihood/main/likelihood.png)

{likelihood-1.2.23.dist-info → likelihood-1.2.25.dist-info}/RECORD RENAMED Viewed

@@ -2,18 +2,19 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
 likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
 likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
 likelihood/graph/graph.py,sha256=hGWCznxaRQ8BfY2aLjrvwriZkAIsz5ydKXF4x_7b0EQ,3359
-likelihood/graph/nn.py,sha256=jBgb2SMUwM5OBatkIxH2I-_hH1ok5aw2fwXq5a1VAEg,12306
+likelihood/graph/nn.py,sha256=WuK66hRTN5hdVIArgfSweqtE098tb6QFd2ZMFaHvnZA,12263
 likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
+likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
 likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
-likelihood/models/simulation.py,sha256=Y4RXkeYHmQCve-EpEYVmzh6tm5pkJa_Pbx0iYJmptU8,8852
+likelihood/models/simulation.py,sha256=L_9Mihcca7i_AnvWWrZilFV8VEhz_Z8fDLepmwBGSi8,8832
 likelihood/models/utils.py,sha256=VtEj07lV-GRoWraQgpfjU0jTt1Ntf9MXgYwe6XYQh20,1552
 likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
-likelihood/models/deep/autoencoders.py,sha256=2P--nS96XwMi44q0OIxvIp6Mdbt-B4LqwCSXTn2jYrY,10070
+likelihood/models/deep/autoencoders.py,sha256=seE1rb1t1gbbKRyEzfi01BqMsV4MU6yakVTLcukAMkg,20591
 likelihood/tools/__init__.py,sha256=MCjsCWfBNKE2uMN0VizDN1uFzZ_md0X2WZeBdWhrCR8,50
 likelihood/tools/numeric_tools.py,sha256=FA44kbiAcxcquz1el_g3Pqsp5ii8XFkAIrsMs5bGkj0,11445
 likelihood/tools/tools.py,sha256=iZBC7IHTFpAyxooyel7ZFi-5-G0nCotNLLtxenPw9T8,44303
-likelihood-1.2.23.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
-likelihood-1.2.23.dist-info/METADATA,sha256=sdJRNVLSm5SNwfQkolcusGvkFnlf_dNcMzeRmb4JUyQ,2504
-likelihood-1.2.23.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-likelihood-1.2.23.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
-likelihood-1.2.23.dist-info/RECORD,,
+likelihood-1.2.25.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-1.2.25.dist-info/METADATA,sha256=hUsmkghXP8m4z3FtWcM64gwBEW74HIOTNJifK26OOkw,2771
+likelihood-1.2.25.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+likelihood-1.2.25.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-1.2.25.dist-info/RECORD,,

{likelihood-1.2.23.dist-info → likelihood-1.2.25.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{likelihood-1.2.23.dist-info → likelihood-1.2.25.dist-info}/LICENSE RENAMED Viewed

File without changes

{likelihood-1.2.23.dist-info → likelihood-1.2.25.dist-info}/top_level.txt RENAMED Viewed

File without changes

likelihood 1.2.23__py3-none-any.whl → 1.2.25__py3-none-any.whl

likelihood 1.2.23__py3-none-any.whl → 1.2.25__py3-none-any.whl