likelihood 1.5.0__py3-none-any.whl → 1.5.2__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
likelihood/models/simulation.py CHANGED
@@ -1,6 +1,6 @@
  import pickle
  import warnings
- from typing import List, Tuple, Union
+ from typing import Dict, List, Tuple, Union

  import numpy as np
  import pandas as pd
@@ -106,7 +106,7 @@ class SimulationEngine(FeatureSelection):

          return y[:]

-     def _encode(self, df: DataFrame) -> np.ndarray | list:
+     def _encode(self, df: DataFrame) -> Dict[str, float]:
          df = df.copy()
          column = df.columns[0]
          frec = df[column].value_counts() / len(df)
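
The tightened return annotation reflects what `_encode` builds: a frequency encoding that maps each category to its relative frequency in the column. A minimal standalone sketch of that mapping, using a hypothetical `color` column and assuming the counts end up converted with `to_dict()` (the conversion step is not shown in the hunk):

import pandas as pd

# Hypothetical one-column frame; `frec` mirrors the line shown in the hunk above.
df = pd.DataFrame({"color": ["red", "red", "blue", "green"]})
column = df.columns[0]
frec = df[column].value_counts() / len(df)
encoding = frec.to_dict()  # assumed final dict form
print(encoding)  # {'red': 0.5, 'blue': 0.25, 'green': 0.25}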
@@ -132,9 +132,9 @@ class SimulationEngine(FeatureSelection):
          plot = kwargs.get("plot", False)
          if not x[1]:
              media = self.df[key].mean()
-             desviacion_estandar = self.df[key].std()
-             cota_inferior = media - 1.5 * desviacion_estandar
-             cota_superior = media + 1.5 * desviacion_estandar
+             standard_deviation = self.df[key].std()
+             lower_limit = media - 1.5 * standard_deviation
+             upper_limit = media + 1.5 * standard_deviation
              if plot:
                  print(f"Cumulative Distribution Function ({key})")
              f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
@@ -143,14 +143,14 @@ class SimulationEngine(FeatureSelection):
              least_frequent_category, most_frequent_category = categories_by_quartile(
                  self.df[[key]], key
              )
-             cota_inferior = x[1].get(least_frequent_category, 0)
-             cota_superior = x[1].get(most_frequent_category, 0)
+             lower_limit = x[1].get(least_frequent_category, 0)
+             upper_limit = x[1].get(most_frequent_category, 0)
          self.proba_dict[key] = (
              f if f else None,
              x[1],
              (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
-             f(cota_inferior) if f else cota_inferior,
-             f(cota_superior) if f else cota_superior,
+             f(lower_limit) if f else lower_limit,
+             f(upper_limit) if f else upper_limit,
          )

      def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
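
These two hunks are pure Spanish-to-English renames (`desviacion_estandar` → `standard_deviation`, `cota_inferior`/`cota_superior` → `lower_limit`/`upper_limit`; `media`, the mean, keeps its original name). The logic is untouched: numeric columns get bounds at the mean ± 1.5 standard deviations, categorical columns at the relative frequencies of their least and most frequent categories. A standalone sketch of the numeric branch, with made-up data:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
media = s.mean()  # identifier kept from the source
standard_deviation = s.std()
lower_limit = media - 1.5 * standard_deviation
upper_limit = media + 1.5 * standard_deviation
print(lower_limit, upper_limit)  # ~0.63, ~5.37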
likelihood/tools/impute.py ADDED
@@ -0,0 +1,279 @@
+ import pickle
+ import warnings
+ from typing import Union
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+
+ from likelihood.models import SimulationEngine
+ from likelihood.tools.numeric_tools import find_multiples
+
+ warnings.simplefilter(action="ignore", category=FutureWarning)
+
+
+ class SimpleImputer:
+     """Multiple imputation using simulation engine."""
+
+     def __init__(self, n_features: int | None = None, use_scaler: bool = False):
+         """
+         Initialize the imputer.
+
+         Parameters
+         ----------
+         n_features: int | None
+             Number of features to be used in the imputer. Default is None.
+         use_scaler: bool
+             Whether to use a scaler. Default is False.
+         """
+         self.n_features = n_features
+         self.sim = SimulationEngine(use_scaler=use_scaler)
+         self.params = {}
+         self.cols_transf = pd.Series([])
+
+     def fit(self, X: pd.DataFrame) -> None:
+         """
+         Fit the imputer to the data.
+
+         Parameters
+         ----------
+         X: pd.DataFrame
+             Dataframe to fit the imputer to.
+         """
+         X_impute = X.copy()
+         self.params = self._get_dict_params(X_impute)
+         X_impute = self.sim._clean_data(X_impute)
+
+         if X_impute.empty:
+             raise ValueError(
+                 "The dataframe is empty after cleaning, it is not possible to train the imputer."
+             )
+         self.n_features = self.n_features or X_impute.shape[1] - 1
+         self.sim.fit(X_impute, self.n_features)
+
+     def transform(
+         self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True
+     ) -> pd.DataFrame:
+         """
+         Impute missing values in the data.
+
+         Parameters
+         -----------
+         X: pd.DataFrame
+             Dataframe to impute missing values.
+         boundary: bool
+             Whether to use the boundaries of the data to impute missing values. Default is True.
+         inplace: bool
+             Whether to modify the columns of the original dataframe or return new ones. Default is True.
+         """
+         X_impute = X.copy()
+         self.cols_transf = X_impute.columns
+         for column in X_impute.columns:
+             if X_impute[column].isnull().sum() > 0:
+
+                 if not X_impute[column].dtype == "object":
+                     min_value = self.params[column]["min"]
+                     max_value = self.params[column]["max"]
+                     to_compare = self.params[column]["to_compare"]
+                 for row in X_impute.index:
+                     if pd.isnull(X_impute.loc[row, column]):
+                         value_impute = self._check_dtype_convert(
+                             self.sim.predict(
+                                 self._set_zero(X_impute.loc[row, :], column),
+                                 column,
+                             )[0],
+                             to_compare,
+                         )
+                         if not X_impute[column].dtype == "object" and boundary:
+                             if value_impute < min_value:
+                                 value_impute = min_value
+                             if value_impute > max_value:
+                                 value_impute = max_value
+                         X_impute.loc[row, column] = value_impute
+             else:
+                 self.cols_transf = self.cols_transf.drop(column)
+         if not inplace:
+             X_impute = X_impute[self.cols_transf].copy()
+             X_impute = X_impute.rename(
+                 columns={column: column + "_imputed" for column in self.cols_transf}
+             )
+             X_impute = X.join(X_impute, rsuffix="_imputed")
+             order_cols = []
+             for column in X.columns:
+                 if column + "_imputed" in X_impute.columns:
+                     order_cols.append(column)
+                     order_cols.append(column + "_imputed")
+                 else:
+                     order_cols.append(column)
+             X_impute = X_impute[order_cols]
+         return X_impute
+
+     def fit_transform(
+         self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True
+     ) -> pd.DataFrame:
+         """
+         Fit and transform the data.
+
+         Parameters
+         -----------
+         X: pd.DataFrame
+             Dataframe to fit and transform.
+         boundary: bool
+             Whether to use the boundaries of the data to impute missing values. Default is True.
+         inplace: bool
+             Whether to modify the columns of the original dataframe or return new ones. Default is True.
+         """
+         X_train = X.copy()
+         self.fit(X_train)
+         return self.transform(X, boundary, inplace)
+
+     def _set_zero(self, X: pd.Series, column_exception) -> pd.DataFrame:
+         """
+         Set missing values to zero, except for `column_exception`.
+
+         Parameters
+         -----------
+         X: pd.Series
+             Series to set missing values to zero.
+         """
+         X = X.copy()
+         for column in X.index:
+             if pd.isnull(X[column]) and column != column_exception:
+                 X[column] = 0
+         data = X.to_frame().T
+         return data
+
+     def _check_dtype_convert(self, value: Union[int, float], to_compare: Union[int, float]) -> Union[int, float]:
+         """
+         Cast or round `value` so that its type matches that of `to_compare`.
+
+         Parameters
+         -----------
+         value: Union[int, float]
+             Value to check and convert.
+         to_compare: Union[int, float]
+             Value to compare to.
+         """
+         if isinstance(to_compare, int) and isinstance(value, float):
+             value = int(round(value, 0))
+
+         if isinstance(to_compare, float) and isinstance(value, float):
+             value = round(value, len(str(to_compare).split(".")[1]))
+         return value
+
+     def _get_dict_params(self, df: pd.DataFrame) -> dict:
+         """
+         Get the parameters for the imputer.
+
+         Parameters
+         -----------
+         df: pd.DataFrame
+             Dataframe to get the parameters from.
+         """
+         params = {}
+         for column in df.columns:
+             if df[column].isnull().sum() > 0:
+                 if not df[column].dtype == "object":
+                     to_compare = df[column].dropna().sample().values[0]
+                     params[column] = {
+                         "min": df[column].min(),
+                         "to_compare": to_compare,
+                         "max": df[column].max(),
+                     }
+         return params
+
+     def eval(self, X: pd.DataFrame) -> None:
+         """
+         Create a histogram of the imputed values.
+
+         Parameters
+         -----------
+         X: pd.DataFrame
+             Dataframe to create the histogram from.
+         """
+
+         if not isinstance(X, pd.DataFrame):
+             raise ValueError("Input X must be a pandas DataFrame.")
+
+         df = X.copy()
+
+         imputed_cols = [col for col in df.columns if col.endswith("_imputed")]
+         num_impute = len(imputed_cols)
+
+         if num_impute == 0:
+             print("No imputed columns found in the DataFrame.")
+             return
+
+         try:
+             ncols, nrows = find_multiples(num_impute)
+         except ValueError as e:
+             print(f"Error finding multiples for {num_impute}: {e}")
+             ncols = 1
+             nrows = num_impute
+
+         _, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 5 * nrows))
+         axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]
+
+         for i, col in enumerate(imputed_cols):
+             original_col = col.replace("_imputed", "")
+
+             if original_col in df.columns:
+                 original_col_data = df[original_col].dropna()
+                 ax = axes[i]
+
+                 # Plot the original data
+                 sns.histplot(
+                     original_col_data,
+                     kde=True,
+                     color="blue",
+                     label="Original",
+                     bins=10,
+                     ax=ax,
+                 )
+
+                 # Plot the imputed data
+                 sns.histplot(
+                     df[col],
+                     kde=True,
+                     color="red",
+                     label="Imputed",
+                     bins=10,
+                     ax=ax,
+                 )
+
+                 ax.set_xlabel(original_col)
+                 ax.set_ylabel("Frequency" if i % ncols == 0 else "")
+                 ax.legend(loc="upper right")
+
+         plt.suptitle("Histogram Comparison", fontsize=16, fontweight="bold")
+         plt.tight_layout()
+         plt.subplots_adjust(top=0.9)
+         plt.show()
+
+     def save(self, filename: str = "./imputer") -> None:
+         """
+         Save the state of the SimpleImputer to a file.
+
+         Parameters
+         -----------
+         filename: str
+             Name of the file to save the imputer to. Default is "./imputer".
+         """
+         filename = filename if filename.endswith(".pkl") else filename + ".pkl"
+         with open(filename, "wb") as f:
+             pickle.dump(self, f)
+
+     @staticmethod
+     def load(filename: str = "./imputer"):
+         """
+         Load the state of a SimpleImputer from a file.
+
+         Parameters
+         -----------
+         filename: str
+             Name of the file to load the imputer from. Default is "./imputer".
+         """
+         filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
+         with open(filename, "rb") as f:
+             return pickle.load(f)
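
A minimal usage sketch for the new class. It assumes only what the file itself shows: `SimpleImputer` lives at the module path listed in RECORD, and the wrapped `SimulationEngine` can be fitted on a frame this small. Column names and values are hypothetical:

import numpy as np
import pandas as pd

from likelihood.tools.impute import SimpleImputer

# Hypothetical toy frame with one missing value per column.
df = pd.DataFrame(
    {
        "a": [1.0, 2.0, np.nan, 4.0, 5.0, 6.0],
        "b": [10.0, 20.0, 30.0, np.nan, 50.0, 60.0],
    }
)

imputer = SimpleImputer()
df_imputed = imputer.fit_transform(df, boundary=True, inplace=False)

imputer.eval(df_imputed)   # side-by-side histograms, original vs. imputed
imputer.save("./imputer")  # writes ./imputer.pkl
restored = SimpleImputer.load("./imputer")

With `inplace=False`, the original columns are preserved and imputed copies are appended with an `_imputed` suffix, which is also the marker `eval` keys on when selecting columns to plot.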
likelihood/tools/numeric_tools.py CHANGED
@@ -345,6 +345,27 @@ def gauss_elimination(A: ndarray | list, pr: int = 2) -> ndarray:
      return X


+ def find_multiples(target: int) -> tuple[int, int] | None:
+     """Find two factors of a given target number.
+
+     Parameters
+     ----------
+     target : int
+         The target number to find factors for.
+
+     Returns
+     -------
+     tuple[int, int] | None
+         A tuple containing two factors of the target number.
+         Returns None if no factors are found.
+     """
+     for i in range(2, target + 1):
+         if target % i == 0:
+             factor = target // i
+             return i, factor
+     return None
+
+
  # Example usage:
  if __name__ == "__main__":
      import pandas as pd
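
One behavioral note on `find_multiples`: the docstring's "Returns None if no factors are found" only applies to `target < 2`. For any `target >= 2` the loop eventually reaches `i == target`, so a prime `p` comes back as `(p, 1)`. (`SimpleImputer.eval` unpacks the result inside a `try/except ValueError`, so the `None` case for a single imputed column would actually surface as a `TypeError`.) A quick check, with the body copied from the hunk above:

def find_multiples(target: int) -> tuple[int, int] | None:
    # Smallest factor >= 2, paired with its cofactor.
    for i in range(2, target + 1):
        if target % i == 0:
            return i, target // i
    return None

print(find_multiples(12))  # (2, 6)
print(find_multiples(7))   # (7, 1) -- primes fall through to i == target
print(find_multiples(1))   # None  -- the only way to get None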
likelihood/tools/tools.py CHANGED
@@ -1167,7 +1167,7 @@ class FeatureSelection:
          self.X = self.X.drop(columns=["index"])


- def check_nan_inf(df: DataFrame) -> DataFrame:
+ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
      """
      Checks for NaN and Inf values in the DataFrame. If any are found, they will be removed.

@@ -1185,20 +1185,32 @@ def check_nan_inf(df: DataFrame) -> DataFrame:
      nan_values = df.isnull().values.any()
      inf_values = np.isinf(df.select_dtypes(include="number")).values.any()

+     nan_count = df.isnull().values.sum()
+     inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
+
      if nan_values:
-         print("UserWarning: Some rows may have been deleted due to the existence of NaN values.")
+         (
+             print(
+                 "UserWarning: Some rows may have been deleted due to the existence of NaN values."
+             )
+             if verbose
+             else None
+         )
          df.dropna(inplace=True)

      if inf_values:
-         print("UserWarning: Some rows may have been deleted due to the existence of Inf values.")
+         (
+             print(
+                 "UserWarning: Some rows may have been deleted due to the existence of Inf values."
+             )
+             if verbose
+             else None
+         )
          df.replace([np.inf, -np.inf], np.nan, inplace=True)
          df.dropna(inplace=True)

-     nan_count = df.isnull().values.sum()
-     inf_count = np.isinf(df.select_dtypes(include="number")).values.sum()
-
-     print(f"NaN values removed: {nan_count}")
-     print(f"Infinite values removed: {inf_count}")
+     print(f"NaN values removed: ", "{:,}".format(nan_count))
+     print(f"Infinite values removed: ", "{:,}".format(inf_count))

      return df

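Two things change here: the warnings now print only when the new `verbose` flag is set, and the NaN/Inf counts are computed before rows are dropped, so they report what was actually removed (previously they were taken after cleanup and were therefore always zero). A minimal call, assuming `check_nan_inf` is imported from its defining module:

import numpy as np
import pandas as pd

from likelihood.tools.tools import check_nan_inf

df = pd.DataFrame({"x": [1.0, np.nan, np.inf, 4.0]})
clean = check_nan_inf(df, verbose=True)  # verbose=True also prints the UserWarning lines
print(len(clean))  # 2 -- the NaN row and the Inf row are gone
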
likelihood-1.5.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: likelihood
- Version: 1.5.0
+ Version: 1.5.2
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -24,6 +24,7 @@ Requires-Dist: numpy<2.0.0
  Requires-Dist: pydot==2.0.0
  Requires-Dist: matplotlib
  Requires-Dist: graphviz
+ Requires-Dist: seaborn
  Requires-Dist: pyyaml
  Requires-Dist: pandas
  Requires-Dist: corner
@@ -39,6 +40,7 @@ Dynamic: classifier
  Dynamic: description
  Dynamic: description-content-type
  Dynamic: home-page
+ Dynamic: license-file
  Dynamic: maintainer
  Dynamic: maintainer-email
  Dynamic: provides-extra
likelihood-1.5.2.dist-info/RECORD CHANGED
@@ -6,17 +6,18 @@ likelihood/graph/nn.py,sha256=EaMmboKriCFnkP48_HLGRAsOZSWxwUlMG0WDGZ4ey1o,11035
  likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
  likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
  likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
- likelihood/models/simulation.py,sha256=LFyE_szo7sDukviMLeg_6RoyAaI7yMXUy8f4mDOrGoc,8460
+ likelihood/models/simulation.py,sha256=IkYGA6-L1LvSnIlyrVWTzQQu-JnfXml5Tewt-GC05PY,8446
  likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
  likelihood/models/deep/__init__.py,sha256=-KIPippVaMqgG8mEgYjNxYQdqOUcFhUuKhbVe8TTCfo,28
  likelihood/models/deep/autoencoders.py,sha256=0EIZwDNlZ9NCfQbhQ_KdXkkRwIjUEU-jk0l0u-J1wmA,44212
  likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
  likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
+ likelihood/tools/impute.py,sha256=BwBVFSQkG3uWsZEk1THTmqZc3YhHlDhMXgKIV3sx5Lg,9486
  likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
- likelihood/tools/numeric_tools.py,sha256=FA44kbiAcxcquz1el_g3Pqsp5ii8XFkAIrsMs5bGkj0,11445
- likelihood/tools/tools.py,sha256=SePaBg-gP29rt5SR2xhqNNQLu7_m0Wner5y_XzdSdpc,42031
- likelihood-1.5.0.dist-info/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
- likelihood-1.5.0.dist-info/METADATA,sha256=zTpqZ3w7y_vWY2dqQH7JSfROIkC8dbRcLn2LSCAQGc4,2822
- likelihood-1.5.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
- likelihood-1.5.0.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
- likelihood-1.5.0.dist-info/RECORD,,
+ likelihood/tools/numeric_tools.py,sha256=OelCF45QO-zhanX3GmfcdYMfUZxYt353oJ8_gPEdWss,11959
+ likelihood/tools/tools.py,sha256=vlQ-peK_z5-MLVnStxlBdl-NfmF6ILxZ6LhBd4K77JI,42282
+ likelihood-1.5.2.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+ likelihood-1.5.2.dist-info/METADATA,sha256=ioc6f7SQTASnslCzc4N-dJ4xvnGZTn3llC0Q0OX7nP8,2867
+ likelihood-1.5.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ likelihood-1.5.2.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+ likelihood-1.5.2.dist-info/RECORD,,
likelihood-1.5.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (76.0.0)
+ Generator: setuptools (78.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

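The remaining metadata churn is mechanical and appears to follow from the newer setuptools generator recorded in WHEEL (78.1.0, up from 76.0.0): it emits Metadata-Version 2.4, declares `license-file` as a Dynamic metadata field, and relocates LICENSE under the dist-info `licenses/` directory, which is why those RECORD paths changed alongside the version bump.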