likelihood-1.2.16-py3-none-any.whl → likelihood-1.2.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/nn.py ADDED
@@ -0,0 +1,344 @@
+ import warnings
+ from typing import List, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ from numpy import ndarray
+ from pandas.core.frame import DataFrame
+ from sklearn.metrics import f1_score
+ from sklearn.model_selection import train_test_split
+
+ from likelihood.tools import generate_feature_yaml
+
+
+ def compare_similarity(arr1: ndarray, arr2: ndarray) -> int:
+     """Compares the similarity between two arrays of categories.
+
+     Parameters
+     ----------
+     arr1 : `ndarray`
+         The first array of categories.
+     arr2 : `ndarray`
+         The second array of categories.
+
+     Returns
+     -------
+     count: `int`
+         The number of categories that are the same in both arrays.
+     """
+
+     count = 0
+     for i in range(len(arr1)):
+         if arr1[i] == arr2[i]:
+             count += 1
+     return count
+
+
+ def cal_adjency_matrix(
+     df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+ ) -> Tuple[dict, ndarray]:
+     """Calculates the adjacency matrix for a given DataFrame.
+     The adjacency matrix is a matrix that represents the similarity between each pair of categories.
+     The similarity is calculated using the `compare_similarity` function.
+     The resulting matrix is a square matrix with the same number of rows and columns as the input DataFrame.
+
+     Parameters
+     ----------
+     df : `DataFrame`
+         The input DataFrame containing the categories.
+     exclude_subset : `List[str]`, optional
+         A list of categories to exclude from the calculation of the adjacency matrix.
+     sparse : `bool`, optional
+         Whether to return a sparse matrix or a dense matrix.
+     **kwargs : `dict`
+         Additional keyword arguments to pass to the `compare_similarity` function.
+
+     Keyword Arguments:
+     ----------
+     similarity: `int`
+         The minimum number of categories that must be the same in both arrays to be considered similar.
+
+     Returns
+     -------
+     adj_dict : `dict`
+         A dictionary containing the categories.
+     adjacency_matrix : `ndarray`
+         The adjacency matrix.
+     """
+
+     yaml_ = generate_feature_yaml(df)
+     categorical_columns = yaml_["categorical_features"]
+     if len(exclude_subset) > 0:
+         categorical_columns = [col for col in categorical_columns if col not in exclude_subset]
+
+     if len(categorical_columns) > 1:
+         df_categorical = df[categorical_columns].copy()
+     else:
+         categorical_columns = [
+             col
+             for col in df.columns
+             if (
+                 col not in exclude_subset
+                 and pd.api.types.is_integer_dtype(df[col])
+                 and len(df[col].unique()) > 2
+             )
+         ]
+         df_categorical = df[categorical_columns].copy()
+
+     assert len(df_categorical) > 0
+
+     similarity = kwargs["similarity"] if "similarity" in kwargs else len(df_categorical.columns) - 1
+     assert similarity <= df_categorical.shape[1]
+
+     adj_dict = {}
+     for index, row in df_categorical.iterrows():
+         adj_dict[index] = row.to_list()
+
+     adjacency_matrix = np.zeros((len(df_categorical), len(df_categorical)))
+
+     for i in range(len(df_categorical)):
+         for j in range(len(df_categorical)):
+             if compare_similarity(adj_dict[i], adj_dict[j]) >= similarity:
+                 adjacency_matrix[i][j] = 1
+
+     if sparse:
+         num_nodes = adjacency_matrix.shape[0]
+
+         indices = np.argwhere(adjacency_matrix != 0.0)
+         indices = tf.constant(indices, dtype=tf.int64)
+         values = tf.constant(adjacency_matrix[indices[:, 0], indices[:, 1]], dtype=tf.float32)
+         adjacency_matrix = tf.sparse.SparseTensor(
+             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
+         )
+
+         return adj_dict, adjacency_matrix
+     else:
+         return adj_dict, adjacency_matrix
+
+
+ class Data:
+     def __init__(
+         self,
+         df: DataFrame,
+         target: str | None = None,
+         exclude_subset: List[str] = [],
+     ):
+         _, adjacency = cal_adjency_matrix(df, exclude_subset=exclude_subset, sparse=True)
+         if target is not None:
+             X = df.drop(columns=[target] + exclude_subset)
+         else:
+             X = df.drop(columns=exclude_subset)
+         self.columns = X.columns
+         X = X.to_numpy()
+         self.x = np.asarray(X).astype(np.float32)
+         self.adjacency = adjacency
+         if target is not None:
+             self.y = np.asarray(df[target].values).astype(np.int32)
+
+
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
+ class VanillaGNNLayer(tf.keras.layers.Layer):
+     def __init__(self, dim_in, dim_out, kernel_initializer="glorot_uniform", **kwargs):
+         super(VanillaGNNLayer, self).__init__(**kwargs)
+         self.dim_out = dim_out
+         self.kernel_initializer = kernel_initializer
+         self.linear = None
+
+     def build(self, input_shape):
+         self.linear = tf.keras.layers.Dense(
+             self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+         )
+         super(VanillaGNNLayer, self).build(input_shape)
+
+     def call(self, x, adjacency):
+         x = self.linear(x)
+         x = tf.sparse.sparse_dense_matmul(adjacency, x)
+         return x
+
+     def get_config(self):
+         config = super(VanillaGNNLayer, self).get_config()
+         config.update(
+             {
+                 "dim_out": self.dim_out,
+                 "kernel_initializer": tf.keras.initializers.serialize(
+                     self.linear.kernel_initializer
+                 ),
+             }
+         )
+         return config
+
+
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNN")
+ class VanillaGNN(tf.keras.Model):
+     def __init__(self, dim_in, dim_h, dim_out, **kwargs):
+         super(VanillaGNN, self).__init__(**kwargs)
+         self.dim_in = dim_in
+         self.dim_h = dim_h
+         self.dim_out = dim_out
+         self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h)
+         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h)
+         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out)
+
+     def build(self, input_shape):
+         super(VanillaGNN, self).build(input_shape)
+         dummy_input = tf.keras.Input(shape=input_shape[1:])
+         dummy_adjacency = tf.sparse.SparseTensor(
+             indices=[[0, 0]], values=[1.0], dense_shape=[input_shape[0], input_shape[0]]
+         )
+         _ = self(dummy_input, dummy_adjacency)
+
+     def call(self, x, adjacency):
+         h = self.gnn1(x, adjacency)
+         h = tf.nn.tanh(h)
+         h = self.gnn2(h, adjacency)
+         h = self.gnn3(h, adjacency)
+         return tf.nn.softmax(h, axis=1)
+
+     def f1_macro(self, y_true, y_pred):
+         return f1_score(y_true, y_pred, average="macro")
+
+     def compute_f1_score(self, logits, labels):
+         predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
+         true_labels = tf.cast(labels, tf.int32)
+         return self.f1_macro(true_labels.numpy(), predictions.numpy())
+
+     def evaluate(self, x, adjacency, y):
+         y = tf.cast(y, tf.int32)
+         out = self(x, adjacency)
+         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=out)
+         loss = tf.reduce_mean(loss)
+         f1 = self.compute_f1_score(out, y)
+         return loss.numpy(), f1
+
+     def test(self, data):
+         out = self(data.x, data.adjacency)
+         test_f1 = self.compute_f1_score(out, data.y)
+         return test_f1
+
+     def predict(self, data):
+         out = self(data.x, data.adjacency)
+         return tf.argmax(out, axis=1, output_type=tf.int32).numpy()
+
+     def get_config(self):
+         config = {
+             "dim_in": self.dim_in,
+             "dim_h": self.dim_h,
+             "dim_out": self.dim_out,
+         }
+         base_config = super(VanillaGNN, self).get_config()
+         return dict(list(base_config.items()) + list(config.items()))
+
+     @classmethod
+     def from_config(cls, config):
+         return cls(
+             dim_in=config["dim_in"],
+             dim_h=config["dim_h"],
+             dim_out=config["dim_out"],
+         )
+
+     @tf.function
+     def train_step(self, batch_x, batch_adjacency, batch_y, optimizer):
+         with tf.GradientTape() as tape:
+             out = self(batch_x, batch_adjacency)
+             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=batch_y, logits=out)
+             loss = tf.reduce_mean(loss)
+         gradients = tape.gradient(loss, self.trainable_variables)
+         optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+         return loss
+
+     def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
+         warnings.warn(
+             "It is normal for validation metrics to underperform. Use the test method to validate after training.",
+             UserWarning,
+         )
+         optimizers = {
+             "sgd": tf.keras.optimizers.SGD(),
+             "adam": tf.keras.optimizers.Adam(),
+             "adamw": tf.keras.optimizers.AdamW(),
+             "adadelta": tf.keras.optimizers.Adadelta(),
+             "rmsprop": tf.keras.optimizers.RMSprop(),
+         }
+         optimizer = optimizers[optimizer]
+         train_losses = []
+         train_f1_scores = []
+         val_losses = []
+         val_f1_scores = []
+
+         X_train, X_test, y_train, y_test = train_test_split(
+             data.x, data.y, test_size=test_size, shuffle=False
+         )
+         adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [len(X_train), len(X_train)])
+         adjacency_test = tf.sparse.slice(
+             data.adjacency, [len(X_train), 0], [len(X_test), len(X_test)]
+         )
+
+         batch_starts = np.arange(0, len(X_train), batch_size)
+         for epoch in range(epochs):
+             np.random.shuffle(batch_starts)
+             for start in batch_starts:
+                 end = start + batch_size
+                 batch_x = X_train[start:end, :]
+                 batch_adjacency = tf.sparse.slice(
+                     adjacency_train, [start, start], [batch_size, batch_size]
+                 )
+                 batch_y = y_train[start:end]
+                 train_loss = self.train_step(batch_x, batch_adjacency, batch_y, optimizer)
+
+             train_loss, train_f1 = self.evaluate(X_train, adjacency_train, y_train)
+             train_losses.append(train_loss)
+             train_f1_scores.append(train_f1)
+
+             if epoch % 2 == 0:
+                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
+                 val_losses.append(val_loss)
+                 val_f1_scores.append(val_f1)
+                 print(
+                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train F1: {train_f1:.3f} | Val Loss: {val_loss:.3f} | Val F1: {val_f1:.3f}"
+                 )
+
+         return train_losses, train_f1_scores, val_losses, val_f1_scores
+
+
+ if __name__ == "__main__":
+     # Example usage
+     import pandas as pd
+     from sklearn.datasets import load_iris
+
+     # Load the dataset
+     iris = load_iris()
+
+     # Convert to a DataFrame for easy exploration
+     iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
+     iris_df["species"] = iris.target
+
+     iris_df["sepal length (cm)"] = iris_df["sepal length (cm)"].astype("category")
+     iris_df["sepal width (cm)"] = iris_df["sepal width (cm)"].astype("category")
+     iris_df["petal length (cm)"] = iris_df["petal length (cm)"].astype("category")
+     iris_df["petal width (cm)"] = iris_df["petal width (cm)"].astype("category")
+
+     # Display the first few rows of the dataset
+     print(iris_df.head())
+
+     iris_df = iris_df.sample(frac=1, replace=False).reset_index(drop=True)
+
+     data = Data(iris_df, "species")
+
+     model = VanillaGNN(dim_in=data.x.shape[1], dim_h=8, dim_out=len(iris_df["species"].unique()))
+     print("Before training F1:", model.test(data))
+     model.fit(data, epochs=200, batch_size=32, test_size=0.5)
+     model.save("./best_model.keras")
+     print("After training F1:", model.test(data))
+     best_model = tf.keras.models.load_model("./best_model.keras")
+
+     print("After loading F1:", best_model.test(data))
+     df_results = pd.DataFrame()
+
+     # Suppose we have a new dataset without the target variable
+     iris_df = iris_df.drop(columns=["species"])
+     data_new = Data(iris_df)
+     print("Predictions:", best_model.predict(data_new))
+     df_results["predicted"] = list(model.predict(data))
+     df_results["actual"] = list(data.y)
+     # df_results.to_csv("results.csv", index=False)
+     breakpoint()
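
Note on the new module: edges in the graph come from row-level category matches — two rows are connected when `compare_similarity` counts at least `similarity` equal entries (by default one less than the number of categorical columns). A minimal, self-contained sketch of that thresholding rule on toy data (it re-implements the counting loop rather than calling the package, so the names below are illustrative only):

import numpy as np

# Three toy rows of already-encoded categories (hypothetical values).
rows = [np.array([1, 0, 2]), np.array([1, 0, 3]), np.array([0, 1, 3])]

def count_matches(a, b):
    # Same counting rule as compare_similarity above.
    return sum(int(x == y) for x, y in zip(a, b))

similarity = 2  # minimum number of matching categories needed for an edge
n = len(rows)
adjacency = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        if count_matches(rows[i], rows[j]) >= similarity:
            adjacency[i][j] = 1

print(adjacency)  # rows 0 and 1 are linked; row 2 only connects to itself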
likelihood/models/deep/autoencoders.py CHANGED
@@ -1,5 +1,6 @@
  import os
  from functools import partial
+ from shutil import rmtree

  import keras_tuner
  import numpy as np
@@ -15,26 +16,26 @@ class AutoClassifier(tf.keras.Model):
      An auto-classifier model that automatically determines the best classification strategy based on the input data.

      Attributes:
-     - input_shape: The shape of the input data.
+     - input_shape_parm: The shape of the input data.
      - num_classes: The number of classes in the dataset.
      - units: The number of neurons in each hidden layer.
      - activation: The type of activation function to use for the neural network layers.

      Methods:
-     __init__(self, input_shape, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
-     build(self, input_shape): Builds the model architecture based on input_shape.
+     __init__(self, input_shape_parm, num_classes, units, activation): Initializes an AutoClassifier instance with the given parameters.
+     build(self, input_shape_parm): Builds the model architecture based on input_shape_parm.
      call(self, x): Defines the forward pass of the model.
      get_config(self): Returns the configuration of the model.
      from_config(cls, config): Recreates an instance of AutoClassifier from its configuration.
      """

-     def __init__(self, input_shape, num_classes, units, activation):
+     def __init__(self, input_shape_parm, num_classes, units, activation):
          """
          Initializes an AutoClassifier instance with the given parameters.

          Parameters
          ----------
-         input_shape : `int`
+         input_shape_parm : `int`
              The shape of the input data.
          num_classes : `int`
              The number of classes in the dataset.
@@ -44,7 +45,7 @@ class AutoClassifier(tf.keras.Model):
              The type of activation function to use for the neural network layers.
          """
          super(AutoClassifier, self).__init__()
-         self.input_shape = input_shape
+         self.input_shape_parm = input_shape_parm
          self.num_classes = num_classes
          self.units = units
          self.activation = activation
@@ -53,7 +54,7 @@ class AutoClassifier(tf.keras.Model):
          self.decoder = None
          self.classifier = None

-     def build(self, input_shape):
+     def build(self, input_shape_parm):
          self.encoder = tf.keras.Sequential(
              [
                  tf.keras.layers.Dense(units=self.units, activation=self.activation),
@@ -64,7 +65,7 @@ class AutoClassifier(tf.keras.Model):
          self.decoder = tf.keras.Sequential(
              [
                  tf.keras.layers.Dense(units=self.units, activation=self.activation),
-                 tf.keras.layers.Dense(units=self.input_shape, activation=self.activation),
+                 tf.keras.layers.Dense(units=self.input_shape_parm, activation=self.activation),
              ]
          )

@@ -81,7 +82,7 @@ class AutoClassifier(tf.keras.Model):

      def get_config(self):
          config = {
-             "input_shape": self.input_shape,
+             "input_shape_parm": self.input_shape_parm,
              "num_classes": self.num_classes,
              "units": self.units,
              "activation": self.activation,
@@ -92,7 +93,7 @@ class AutoClassifier(tf.keras.Model):
      @classmethod
      def from_config(cls, config):
          return cls(
-             input_shape=config["input_shape"],
+             input_shape_parm=config["input_shape_parm"],
              num_classes=config["num_classes"],
              units=config["units"],
              activation=config["activation"],
@@ -104,7 +105,7 @@ def call_existing_code(
      activation: str,
      threshold: float,
      optimizer: str,
-     input_shape: None | int = None,
+     input_shape_parm: None | int = None,
      num_classes: None | int = None,
  ) -> AutoClassifier:
      """
@@ -120,7 +121,7 @@ def call_existing_code(
          The threshold for the classifier.
      optimizer : `str`
          The type of optimizer to use for the neural network layers.
-     input_shape : `None` | `int`
+     input_shape_parm : `None` | `int`
          The shape of the input data.
      num_classes : `int`
          The number of classes in the dataset.
@@ -131,7 +132,10 @@ def call_existing_code(
          The AutoClassifier instance.
      """
      model = AutoClassifier(
-         input_shape=input_shape, num_classes=num_classes, units=units, activation=activation
+         input_shape_parm=input_shape_parm,
+         num_classes=num_classes,
+         units=units,
+         activation=activation,
      )
      model.compile(
          optimizer=optimizer,
@@ -141,14 +145,14 @@ def call_existing_code(
      return model


- def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoClassifier:
+ def build_model(hp, input_shape_parm: None | int, num_classes: None | int) -> AutoClassifier:
      """Builds a neural network model using Keras Tuner's search algorithm.

      Parameters
      ----------
      hp : `keras_tuner.HyperParameters`
          The hyperparameters to tune.
-     input_shape : `None` | `int`
+     input_shape_parm : `None` | `int`
          The shape of the input data.
      num_classes : `int`
          The number of classes in the dataset.
@@ -158,7 +162,9 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
      `keras.Model`
          The neural network model.
      """
-     units = hp.Int("units", min_value=int(input_shape * 0.2), max_value=input_shape, step=2)
+     units = hp.Int(
+         "units", min_value=int(input_shape_parm * 0.2), max_value=input_shape_parm, step=2
+     )
      activation = hp.Choice("activation", ["sigmoid", "relu", "tanh", "selu", "softplus"])
      optimizer = hp.Choice("optimizer", ["sgd", "adam", "adadelta"])
      threshold = hp.Float("threshold", min_value=0.1, max_value=0.9, sampling="log")
@@ -168,14 +174,21 @@ def build_model(hp, input_shape: None | int, num_classes: None | int) -> AutoCla
          activation=activation,
          threshold=threshold,
          optimizer=optimizer,
-         input_shape=input_shape,
+         input_shape_parm=input_shape_parm,
          num_classes=num_classes,
      )
      return model


  def setup_model(
-     data: DataFrame, target: str, epochs: int, train_size: float = 0.7, seed=None, **kwargs
+     data: DataFrame,
+     target: str,
+     epochs: int,
+     train_size: float = 0.7,
+     seed=None,
+     train_mode: bool = True,
+     filepath: str = "./my_dir/best_model.keras",
+     **kwargs,
  ) -> AutoClassifier:
      """Setup model for training and tuning.

@@ -191,6 +204,10 @@ def setup_model(
          The proportion of the dataset to use for training.
      seed : `Any` | `int`
          The random seed to use for reproducibility.
+     train_mode : `bool`
+         Whether to train the model or not.
+     filepath : `str`
+         The path to save the best model to.

      Keyword Arguments:
      ----------
@@ -226,8 +243,18 @@ def setup_model(
      ), "Categorical variables within the DataFrame must be encoded, this is done by using the DataFrameEncoder from likelihood."
      validation_split = 1.0 - train_size
      # Create my_dir path if it does not exist
-     if not os.path.exists(directory):
-         os.makedirs(directory)
+
+     if train_mode:
+         # Create a new directory if it does not exist
+         try:
+             if not os.path.exists(directory):
+                 os.makedirs(directory)
+             else:
+                 print(f"Directory {directory} already exists, it will be deleted.")
+                 rmtree(directory)
+                 os.makedirs(directory)
+         except:
+             print("Warning: unable to create directory")

      # Create a Classifier instance
      y_encoder = OneHotEncoder()
@@ -237,10 +264,12 @@ def setup_model(

      y = np.asarray(y).astype(np.float32)

-     input_shape = X.shape[1]
+     input_shape_parm = X.shape[1]
      num_classes = y.shape[1]
      global build_model
-     build_model = partial(build_model, input_shape=input_shape, num_classes=num_classes)
+     build_model = partial(
+         build_model, input_shape_parm=input_shape_parm, num_classes=num_classes
+     )

      # Create the AutoKeras model
      tuner = keras_tuner.RandomSearch(
@@ -257,13 +286,13 @@ def setup_model(
      best_model = models[0]

      # save model
-     best_model.save("./my_dir/best_model.keras")
+     best_model.save(filepath)

      if verbose:
          tuner.results_summary()
      else:
          # Load the best model from the directory
-         best_model = tf.keras.models.load_model("./my_dir/best_model.keras")
+         best_model = tf.keras.models.load_model(filepath)

      return best_model

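For orientation, a hedged sketch of how the reworked `setup_model` signature would be called; the DataFrame `df` and target name "label" are assumptions, and the data must already be numerically encoded as the function's assertion requires:

# First run: tune, train and save the best AutoClassifier to `filepath`.
best_model = setup_model(
    df, target="label", epochs=20, train_size=0.8,
    train_mode=True, filepath="./my_dir/best_model.keras",
)

# Later runs: skip the search and reload the saved model from `filepath`.
best_model = setup_model(
    df, target="label", epochs=20, train_size=0.8,
    train_mode=False, filepath="./my_dir/best_model.keras",
)
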
likelihood/models/simulation.py CHANGED
@@ -10,53 +10,65 @@ from likelihood.tools import DataScaler, FeatureSelection, OneHotEncoder, check_


  class SimulationEngine(FeatureSelection):
+     """
+     This class implements a predictive model that utilizes multiple linear regression for numerical target variables
+     and multiple logistic regression for categorical target variables.

-     def __init__(self, df: DataFrame, n_importances: int, **kwargs):
+     The class provides methods for training the model on a given dataset, making predictions,
+     and evaluating the model's performance.
+
+     Key features:
+     - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method.
+     - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics.
+     - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
+
+     Usage:
+     - Instantiate the class with the training data and target variable.
+     - Call the fit method to train the model.
+     - Use the predict method to generate predictions on new data.
+     - Evaluate the model using built-in metrics for accuracy and error.
+
+     This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques
+     for both numerical and categorical outcomes efficiently.
+     """
+
+     def __init__(self, df: DataFrame, n_importances: int, use_scaler: bool = False, **kwargs):

          self.df = df
          self.n_importances = n_importances
+         self.use_scaler = use_scaler

          super().__init__(**kwargs)

-     def predict(self, df: DataFrame, column: str, n: int = None) -> ndarray | list:
-
-         # We clean the data set
-         df = self._clean_data(df)
-
+     def predict(self, df: DataFrame, column: str) -> ndarray | list:
          # Let us assign the dictionary entries corresponding to the column
          w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]

-         try:
-             df = df[names_cols].copy()
-             # Change the scale of the dataframe
-             numeric_df = df.select_dtypes(include="number")
+         df = df[names_cols].copy()
+         # Change the scale of the dataframe
+         dataset = self.df.copy()
+         dataset.drop(columns=column, inplace=True)
+         numeric_df = dataset.select_dtypes(include="number")
+         if self.use_scaler:
              scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-             numeric_scaled = scaler.rescale()
+             _ = scaler.rescale()
+             dataset_ = df.copy()
+             numeric_df = dataset_.select_dtypes(include="number")
+             numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
              numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-             df[numeric_df.columns] = numeric_df
-
-             # Encoding the datadrame
-             for num, colname in enumerate(dfe._encode_columns):
-                 if df[colname].dtype == "object":
-                     encode_dict = dfe.encoding_list[num]
-                     df[colname] = df[colname].apply(
-                         dfe._code_transformation_to, dictionary_list=encode_dict
-                     )
-
-         except:
-             print("The dataframe provided does not have the same columns as in the fit method.")
-
-         # Assign value to n if n is None
-         n = n if n != None else len(df)
-
-         # Generation of assertion
-         assert n > 0 and n <= len(df), '"n" must be interger or "<= len(df)".'
+             for col in numeric_df.columns:
+                 df[col] = numeric_df[col].values

-         # Sample dataframe
-         df_aux = df.sample(n)
+         # Encoding the datadrame
+         for num, colname in enumerate(dfe._encode_columns):
+             if df[colname].dtype == "object":
+                 encode_dict = dfe.encoding_list[num]
+                 df[colname] = df[colname].apply(
+                     dfe._code_transformation_to, dictionary_list=encode_dict
+                 )

          # PREDICTION
-         y = df_aux.to_numpy() @ w
+         y = df.to_numpy() @ w

          # Categorical column
          if quick_encoder != None:
@@ -67,18 +79,18 @@ class SimulationEngine(FeatureSelection):
              y = [encoding_dic[item] for item in y]
          # Numeric column
          else:
-             # scale output
-             i = numeric_dict[column]
-             y += 1
-             y /= 2
-             y = y * self.scaler.values[1][i]
+             if self.use_scaler:
+                 # scale output
+                 y += 1
+                 y /= 2
+                 y = y * (self.df[column].max() - self.df[column].min())

-         return y
+         return y[:]

      def fit(self, **kwargs) -> None:

          # We run the feature selection algorithm
-         self.get_digraph(self.df, self.n_importances)
+         self.get_digraph(self.df, self.n_importances, self.use_scaler)

      def _clean_data(self, df: DataFrame) -> DataFrame:

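A short sketch of the revised `SimulationEngine` interface, assuming an already-encoded DataFrame `df` with a target column "y"; `predict` no longer takes a sample-size argument `n`, and rescaling is now opt-in through `use_scaler`:

engine = SimulationEngine(df, n_importances=3, use_scaler=True)
engine.fit()                              # runs the feature-selection pass (get_digraph)
y_hat = engine.predict(df, column="y")    # predicts for every row of the supplied frame
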
likelihood/tools/numeric_tools.py CHANGED
@@ -1,14 +1,14 @@
  from typing import Dict

  import numpy as np
+ import pandas as pd
  from numpy import arange, array, ndarray, random
  from numpy.linalg import solve
  from pandas.core.frame import DataFrame

- # -------------------------------------------------------------------------
-

- def xi_corr(df: DataFrame) -> DataFrame:
+ # -------------------------------------------------------------------------
+ def xi_corr(df: pd.DataFrame) -> pd.DataFrame:
      """Calculate new coefficient of correlation for all pairs of columns in a `DataFrame`.

      Parameters
@@ -19,11 +19,15 @@ def xi_corr(df: DataFrame) -> DataFrame:
      Returns
      -------
      `DataFrame`
-         A dataframe with variable names as keys and their corresponding
-         correlation coefficients as values.
+         A square dataframe with variable names as both index and columns,
+         containing their corresponding correlation coefficients.
      """
-     correlations = {}
-     columns = df.columns
+
+     columns = df.select_dtypes(include="number").columns
+     n = len(columns)
+
+     # Initialize a square matrix for the correlations
+     correlations = pd.DataFrame(1.0, index=columns, columns=columns)

      for i, col1 in enumerate(columns):
          for j, col2 in enumerate(columns):
@@ -32,9 +36,9 @@ def xi_corr(df: DataFrame) -> DataFrame:
              y = df[col2].values

              correlation = xicor(x, y)
-             correlations[(col1, col2)] = round(correlation, 8)
-     # dictionary to dataframe
-     correlations = DataFrame(list(correlations.items()), columns=["Variables", "Xi Correlation"])
+             correlations.loc[col1, col2] = round(correlation, 8)
+             correlations.loc[col2, col1] = round(correlation, 8)  # Mirror the correlation
+
      return correlations


@@ -51,10 +55,11 @@ def xi_corr(df: DataFrame) -> DataFrame:
      """


- def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
-     """Calculate a new coefficient of correlation between two variables.
+ def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None) -> float:
+     """
+     Calculate a generalized coefficient of correlation between two variables.

-     The new coefficient of correlation is a generalization of Pearson's correlation.
+     This coefficient is an extension of Pearson's correlation, accounting for ties with optional randomization.

      Parameters
      ----------
@@ -62,30 +67,52 @@ def xicor(X: ndarray, Y: ndarray, ties: bool = True) -> float:
          The first variable to be correlated. Must have at least one dimension.
      Y : `np.ndarray`
          The second variable to be correlated. Must have at least one dimension.
+     ties : bool
+         Whether to handle ties using randomization.
+     random_seed : int, optional
+         Seed for the random number generator for reproducibility.

      Returns
      -------
      xi : `float`
          The estimated value of the new coefficient of correlation.
      """
-     random.seed(42)
+
+     # Early return for identical arrays
+     if np.array_equal(X, Y):
+         return 1.0
+
      n = len(X)
-     order = array([i[0] for i in sorted(enumerate(X), key=lambda x: x[1])])
+
+     # Early return for cases with less than 2 elements
+     if n < 2:
+         return 0.0
+
+     # Flatten the input arrays if they are multidimensional
+     X = X.flatten()
+     Y = Y.flatten()
+
+     # Get the sorted order of X
+     order = np.argsort(X)
+
      if ties:
-         l = array([sum(y >= Y[order]) for y in Y[order]])
-         r = l.copy()
-         for j in range(n):
-             if sum([r[j] == r[i] for i in range(n)]) > 1:
-                 tie_index = array([r[j] == r[i] for i in range(n)])
-                 r[tie_index] = random.choice(
-                     r[tie_index] - arange(0, sum([r[j] == r[i] for i in range(n)])),
-                     sum(tie_index),
-                     replace=False,
-                 )
-         return 1 - n * sum(abs(r[1:] - r[: n - 1])) / (2 * sum(l * (n - l)))
+         np.random.seed(random_seed)  # Set seed for reproducibility if needed
+         ranks = np.argsort(np.argsort(Y[order]))  # Get ranks
+         unique_ranks, counts = np.unique(ranks, return_counts=True)
+
+         # Adjust ranks for ties by shuffling
+         for rank, count in zip(unique_ranks, counts):
+             if count > 1:
+                 tie_indices = np.where(ranks == rank)[0]
+                 np.random.shuffle(ranks[tie_indices])  # Randomize ties
+
+         cumulative_counts = np.array([np.sum(y >= Y[order]) for y in Y[order]])
+         return 1 - n * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (
+             2 * np.sum(cumulative_counts * (n - cumulative_counts))
+         )
      else:
-         r = array([sum(y >= Y[order]) for y in Y[order]])
-         return 1 - 3 * sum(abs(r[1:] - r[: n - 1])) / (n**2 - 1)
+         ranks = np.argsort(np.argsort(Y[order]))  # Get ranks without randomization
+         return 1 - 3 * np.sum(np.abs(ranks[1:] - ranks[: n - 1])) / (n**2 - 1)


  # -------------------------------------------------------------------------
@@ -257,8 +284,8 @@ if __name__ == "__main__":
      print("New correlation coefficient test")
      X = np.random.rand(100, 1)
      Y = X * X
-     print("coefficient for Y = X * X : ", xicor(X, Y))
-
+     print("coefficient for Y = X * X : ", xicor(X, Y, False))
+     df["index"] = ["A", "B", "C", "D"]
      print("New correlation coefficient test for pandas DataFrame")
      values_df = xi_corr(df)
      breakpoint()
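
As a sanity check on the reworked `xicor`, the coefficient should sit near 1 for a noiseless monotone relationship and near 0 for independent noise; `random_seed` only matters when `ties=True`. A small sketch, assuming the functions are importable from `likelihood.tools.numeric_tools`:

import numpy as np
from likelihood.tools.numeric_tools import xicor

x = np.random.rand(200)
print(xicor(x, x**2, ties=False))                 # close to 1: y is a monotone function of x
print(xicor(x, np.random.rand(200), ties=False))  # close to 0: unrelated noise
print(xicor(x, x**2, ties=True, random_seed=0))   # reproducible handling of any ties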
likelihood/tools/tools.py CHANGED
@@ -640,14 +640,14 @@ def cal_average(y: ndarray, alpha: float = 1):
  class DataScaler:
      """numpy array `scaler` and `rescaler`"""

-     __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose"]
+     __slots__ = ["dataset_", "_n", "data_scaled", "values", "transpose", "inv_fitting"]

      def __init__(self, dataset: ndarray, n: int = 1) -> None:
          """Initializes the parameters required for scaling the data"""
          self.dataset_ = dataset.copy()
          self._n = n

-     def rescale(self) -> ndarray:
+     def rescale(self, dataset_: ndarray | None = None) -> ndarray:
          """Perform a standard rescaling of the data

          Returns
@@ -655,11 +655,26 @@ class DataScaler:
          data_scaled : `np.array`
              An array containing the scaled data.
          """
+         if isinstance(dataset_, ndarray):
+             data_scaled = np.copy(dataset_)
+             mu = self.values[0]
+             sigma = self.values[1]
+             f = self.values[2]
+             data_scaled = data_scaled.reshape((self.dataset_.shape[0], -1))
+             for i in range(self.dataset_.shape[0]):
+                 if self._n != None:
+                     poly = f[i](self.inv_fitting[i](data_scaled[i]))
+                     data_scaled[i] += -poly
+                 data_scaled[i] = 2 * ((data_scaled[i] - mu[i]) / sigma[i]) - 1
+             return data_scaled
+         else:
+             self.data_scaled = np.copy(self.dataset_.copy())

          mu = []
          sigma = []
          fitting = []
-         self.data_scaled = np.copy(self.dataset_)
+         self.inv_fitting = []
+
          try:
              xaxis = range(self.dataset_.shape[1])
          except:
@@ -675,12 +690,15 @@ class DataScaler:
          for i in range(self.dataset_.shape[0]):
              if self._n != None:
                  fit = np.polyfit(xaxis, self.dataset_[i, :], self._n)
+                 inv_fit = np.polyfit(self.dataset_[i, :], xaxis, self._n)
                  f = np.poly1d(fit)
                  poly = f(xaxis)
                  fitting.append(f)
+                 self.inv_fitting.append(inv_fit)
                  self.data_scaled[i, :] += -poly
              else:
                  fitting.append(0.0)
+                 self.inv_fitting.append(0.0)
              mu.append(np.min(self.data_scaled[i, :]))
              if np.max(self.data_scaled[i, :]) != 0:
                  sigma.append(np.max(self.data_scaled[i, :]) - mu[i])
@@ -1064,7 +1082,7 @@ class FeatureSelection:
          self.all_features_imp_graph: List[Tuple] = []
          self.w_dict = dict()

-     def get_digraph(self, dataset: DataFrame, n_importances: int) -> str:
+     def get_digraph(self, dataset: DataFrame, n_importances: int, use_scaler: bool = False) -> str:
          """
          Get directed graph showing importance of features.

@@ -1092,10 +1110,11 @@ class FeatureSelection:
          feature_string += column + "; "

          numeric_df = curr_dataset.select_dtypes(include="number")
-         self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
-         numeric_scaled = self.scaler.rescale()
-         numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
-         curr_dataset[numeric_df.columns] = numeric_df
+         if use_scaler:
+             self.scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
+             numeric_scaled = self.scaler.rescale()
+             numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
+             curr_dataset[numeric_df.columns] = numeric_df

          # We construct dictionary to save index for scaling
          numeric_dict = dict(zip(list(numeric_df.columns), range(len(list(numeric_df.columns)))))
@@ -1119,7 +1138,6 @@ class FeatureSelection:
          dfe = DataFrameEncoder(X_aux)
          encoded_df = dfe.encode(save_mode=False)
          # We train
-
          Model.fit(encoded_df.to_numpy().T, Y.to_numpy().T)
          # We obtain importance
          importance = Model.get_importances()
@@ -1202,7 +1220,7 @@ class FeatureSelection:


  def check_nan_inf(df: DataFrame) -> DataFrame:
-     """Check for `NaN` and `Inf` values in the `DataFrame`. If any are found removes them."""
+     """Checks for `NaN` and `Inf` values in the `DataFrame`. If any are found they will be removed."""
      nan_values = df.isnull().values.any()
      count = np.isinf(df.select_dtypes(include="number")).values.sum()
      print("There are null values : ", nan_values)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: likelihood
- Version: 1.2.16
+ Version: 1.2.18
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -28,8 +28,9 @@ Requires-Dist: corner
  Provides-Extra: full
  Requires-Dist: networkx ; extra == 'full'
  Requires-Dist: pyvis ; extra == 'full'
- Requires-Dist: tensorflow ; extra == 'full'
+ Requires-Dist: tensorflow ==2.15.0 ; extra == 'full'
  Requires-Dist: keras-tuner ; extra == 'full'
+ Requires-Dist: scikit-learn ; extra == 'full'

  ![likelihood](https://raw.githubusercontent.com/RodolfoFerro/likelihood/main/likelihood.png)

@@ -2,17 +2,18 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
  likelihood/main.py,sha256=prqT9egu3B2rcbsVMqYxuosNbe7NhDBCmmZtQ21aSlQ,8591
  likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
  likelihood/graph/graph.py,sha256=wKJqgxXiSbnvzyW3SjhQVrqp00yKMHf3ph6CIDNVhNM,2891
+ likelihood/graph/nn.py,sha256=XqTnAHzXP0jSdLd0IOFjVZUZTcQU-XppsZLmJrG2GMo,12372
  likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
  likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
- likelihood/models/simulation.py,sha256=KYdVjt2PaLo04g8kBsRGQJ5AKMBaQVUH3orZE_TXTy8,2960
+ likelihood/models/simulation.py,sha256=mdgQPg_LEY5svPaF4TFv-DoQRE2oP2ig_uXnwINtewM,4039
  likelihood/models/utils.py,sha256=VtEj07lV-GRoWraQgpfjU0jTt1Ntf9MXgYwe6XYQh20,1552
  likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
- likelihood/models/deep/autoencoders.py,sha256=wgra29Wjyh4KOMdOVEhWLtfqTFvjKeOVf1GthomB7PE,8857
+ likelihood/models/deep/autoencoders.py,sha256=lUvFQ7lbjvIPR_IKFnK5VCrSa419P5dOaTL3qSHntJk,9623
  likelihood/tools/__init__.py,sha256=MCjsCWfBNKE2uMN0VizDN1uFzZ_md0X2WZeBdWhrCR8,50
- likelihood/tools/numeric_tools.py,sha256=EQD959b56aovi4PI_og0BITgyUONgDUU9LG9YqNgX70,7554
- likelihood/tools/tools.py,sha256=B1_xRZeO2fUSCVUvdkhlB6zO9dGzIglSknydLv7VCEc,41627
- likelihood-1.2.16.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
- likelihood-1.2.16.dist-info/METADATA,sha256=5htpwpnzwy5Y0sU103sm_K8Yt5xhMjjmLf1a3rx_40s,2463
- likelihood-1.2.16.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
- likelihood-1.2.16.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
- likelihood-1.2.16.dist-info/RECORD,,
+ likelihood/tools/numeric_tools.py,sha256=cPTPgdww2ofxfyhJDomqvtXDgsSDs9iRQ7GHLt5Vl6M,8457
+ likelihood/tools/tools.py,sha256=O39aPxTNsaBVSJFIkNsUESNSkfG4C7GG77wcR51a8IQ,42543
+ likelihood-1.2.18.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+ likelihood-1.2.18.dist-info/METADATA,sha256=8nAjAwwqCDw8K9IBzKG2cgBU5DOLAA-N-RIlr02eyjU,2518
+ likelihood-1.2.18.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ likelihood-1.2.18.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+ likelihood-1.2.18.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (70.1.1)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
