likelihood 2.0.0__tar.gz → 2.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {likelihood-2.0.0 → likelihood-2.0.2}/PKG-INFO +1 -1
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/__init__.py +1 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/_autoencoders.py +2 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/rl.py +36 -36
- likelihood-2.0.2/likelihood/pipes.py +355 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/__init__.py +1 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/models_tools.py +219 -7
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/numeric_tools.py +4 -4
- likelihood-2.0.2/likelihood/tools/reports.py +195 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/tools.py +19 -17
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/PKG-INFO +1 -1
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/SOURCES.txt +2 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/LICENSE +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/README.md +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/__init__.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/_nn.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/graph.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/graph/nn.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/main.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/__init__.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/__init__.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/_predictor.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/autoencoders.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/gan.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/deep/predictor.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/hmm.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/regression.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/simulation.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/models/utils.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/cat_embed.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/figures.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood/tools/impute.py +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/dependency_links.txt +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/requires.txt +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/likelihood.egg-info/top_level.txt +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/setup.cfg +0 -0
- {likelihood-2.0.0 → likelihood-2.0.2}/setup.py +0 -0
likelihood/models/deep/rl.py
@@ -27,12 +27,12 @@ class Env:

         Parameters
         ----------
-
-
-
-
-
-
+        model : Any
+            Model with `.predict()` method (e.g., Keras model).
+        maxlen : int
+            Maximum length of deque. By default it is set to `100`.
+        name : str
+            The name of the environment. By default it is set to `likenasium`.
         """
         self.model = model
         self.maxlen = maxlen
@@ -49,14 +49,14 @@ class Env:

         Parameters
         ----------
-
-
-
-
+        state : `np.ndarray`
+            Current state to process (input to the model).
+        action : `int`
+            Expected action to process.

         Returns
         -------
-
+        tuple : (current_state, action_pred, reward, next_action, done)
         """
         if self.done:
             return None, None, 0, None, True
@@ -120,9 +120,9 @@ class AutoQL:

         Parameters
         ----------
-        env : Any
+        env : `Any`
             The environment to interact with
-        model : tf.keras.Model
+        model : `tf.keras.Model`
             The Q-network model
         """

@@ -137,16 +137,16 @@ class AutoQL:

         Parameters
         ----------
-
-
-
-
-
-
+        state : `np.ndarray`
+            Current state.
+        action : `int`
+            Expected action to process.
+        epsilon : `float`
+            Exploration probability. By default it is set to `0`

         Returns
         -------
-
+        tuple : (state, action, reward, next_action, done)
         """
         current_state, value, reward, next_action, done = self.env.step(state, action)

@@ -164,17 +164,17 @@ class AutoQL:

         Parameters
         ----------
-
-
-
-
+        state : `np.ndarray`
+            Current state
+        action : `int`
+            Expected action to process.

-
-
+        epsilon : `float`
+            Exploration probability.

         Returns
         -------
-
+        tuple : (state, action, reward, next_action, done)
         """
         current_state, greedy_action, reward, next_action, done = self.epsilon_greedy_policy(
             state, action, epsilon
@@ -202,7 +202,7 @@ class AutoQL:

         Returns
         -------
-
+        float : Training loss
         """

         batch_ = random.sample(self.replay_buffer, self.batch_size)
@@ -250,21 +250,21 @@ class AutoQL:

         Parameters
         ----------
-        optimizer : str
+        optimizer : `str`
             The optimizer for training (e.g., `sgd`). By default it is set to `adam`.
-        loss_fn : str
+        loss_fn : `str`
             The loss function. By default it is set to `mse`.
-        num_episodes : int
+        num_episodes : `int`
             Total number of episodes to train. By default it is set to `50`.
-        num_steps : int
+        num_steps : `int`
             Steps per episode. By default it is set to `100`. If `num_steps` is less than `self.env.maxlen`, then the second will be chosen.
-        gamma : float
+        gamma : `float`
             Discount factor. By default it is set to `0.7`.
-        batch_size : int
+        batch_size : `int`
             Size of training batches. By default it is set to `32`.
-        patience : int
+        patience : `int`
             How many episodes to wait for improvement.
-        alpha : float
+        alpha : `float`
             Trade-off factor between loss and reward.
         """
         rewards = []
likelihood/pipes.py (new file)
@@ -0,0 +1,355 @@
+import json
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from likelihood.tools import generate_html_pipeline
+from likelihood.tools.impute import SimpleImputer
+from likelihood.tools.models_tools import TransformRange, remove_collinearity
+from likelihood.tools.tools import DataFrameEncoder, DataScaler, LinearRegression, OneHotEncoder
+
+
+class Pipeline:
+    def __init__(self, config_path: str):
+        """
+        Initialize the pipeline with a JSON configuration file.
+
+        Parameters
+        ----------
+        config_path : str
+            Path to the JSON config defining target column and preprocessing steps.
+        """
+        self.config = self._load_config(config_path)
+        self.target_col = self.config["target_column"]
+        self.steps = self.config["preprocessing_steps"]
+        self.compute_importance = self.config.get("compute_feature_importance", False)
+        self.fitted_components: Dict[str, object] = {}
+        self.columns_bin_sizes: Dict[str, int] | None = None
+
+    def _load_config(self, config_path: str) -> Dict:
+        """Load and validate the JSON configuration."""
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        assert "target_column" in config, "Config must specify 'target_column'"
+        assert "preprocessing_steps" in config, "Config must specify 'preprocessing_steps'"
+        return config
+
+    def fit(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, Optional[np.ndarray]]:
+        """
+        Fit preprocessing components on the input DataFrame and return cleaned X/y.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Input data with features + target column.
+
+        Returns
+        -------
+        X : pd.DataFrame
+            Cleaned feature matrix.
+        y : np.ndarray
+            Target vector (from self.target_col).
+        importances : Optional[np.ndarray]
+            Feature importance scores (if compute_feature_importance=True).
+        """
+        y = df[self.target_col].values
+        X = df.drop(columns=[self.target_col]).copy()
+
+        initial_info = {
+            "shape": X.shape,
+            "columns": list(X.columns),
+            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+            "missing_values": X.isnull().sum().to_dict(),
+        }
+
+        steps_info = []
+        for step in self.steps:
+            step_name = step["name"]
+            params = step.get("params", {})
+            step_info = {
+                "step_name": step_name,
+                "parameters": params,
+                "description": self._get_step_description(step_name),
+            }
+            step_info["input_columns"] = list(X.columns)
+
+            X = self._apply_step(step_name, X, fit=True, **params)
+
+            step_info["output_shape"] = X.shape
+            step_info["output_columns"] = list(X.columns)
+            step_info["output_dtypes"] = X.dtypes.apply(lambda x: x.name).to_dict()
+
+            steps_info.append(step_info)
+
+        final_info = {
+            "shape": X.shape,
+            "columns": list(X.columns),
+            "dtypes": X.dtypes.apply(lambda x: x.name).to_dict(),
+            "missing_values": X.isnull().sum().to_dict(),
+        }
+
+        self.documentation = {
+            "initial_dataset": initial_info,
+            "processing_steps": steps_info,
+            "final_dataset": final_info,
+        }
+
+        importances = None
+        if self.compute_importance:
+            numeric_X = X.select_dtypes(include=["float"])
+            numeric_columns = numeric_X.columns.tolist()
+            model = LinearRegression()
+            model.fit(numeric_X.T.values, y)
+            importances = model.get_importances()
+            df_scores = pd.DataFrame([importances], columns=numeric_columns)
+            df_scores_abs = df_scores.abs()
+            df_scores_norm = (
+                df_scores_abs / df_scores_abs.to_numpy().sum()
+                if isinstance(importances, np.ndarray)
+                else pd.DataFrame()
+            )
+        return X, y, df_scores_norm
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Apply fitted preprocessing steps to new data (no target column needed).
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            New data to transform.
+
+        Returns
+        -------
+        X_transformed : pd.DataFrame
+            Cleaned feature matrix.
+        """
+        X = df.copy()
+        for step_name, _ in self.fitted_components.items():
+            X = self._apply_step(step_name, X, fit=False)
+
+        return X
+
+    def get_doc(
+        self, save_to_file: bool = True, file_name: str = "data_processing_report.html"
+    ) -> None:
+        """
+        Generate an HTML report from `self.documentation` for pipeline documentation.
+
+        Parameters
+        ----------
+        save_to_file : bool, optional
+            Whether to save generated HTML content to a file. Default is True.
+        file_name : str, optional
+            Filename for output when `save_to_file` is True. Default is "data_processing_report.html".
+        """
+
+        generate_html_pipeline(self.documentation, save_to_file=save_to_file, file_name=file_name)
+
+    def _apply_step(self, step_name: str, X: pd.DataFrame, fit: bool, **params) -> pd.DataFrame:
+        """Dispatch to the correct handler for a preprocessing step."""
+        handlers = {
+            "DataScaler": self._handle_datascaler,
+            "DataFrameEncoder": self._handle_dataframeencoder,
+            "remove_collinearity": self._handle_remove_collinearity,
+            "TransformRange": self._handle_transformrange,
+            "OneHotEncoder": self._handle_onehotencoder,
+            "SimpleImputer": self._handle_simpleimputer,
+        }
+
+        if step_name not in handlers:
+            raise ValueError(
+                f"Step '{step_name}' not supported. Supported steps: {list(handlers.keys())}"
+            )
+
+        return handlers[step_name](X, fit=fit, **params)
+
+    def _get_step_description(self, step_name: str) -> str:
+        """Return a description of what each preprocessing step does."""
+        descriptions = {
+            "DataScaler": "Scales numerical features using normalization",
+            "DataFrameEncoder": "Encodes categorical variables and normalizes to numerical features",
+            "remove_collinearity": "Removes highly correlated features to reduce multicollinearity",
+            "TransformRange": "Bins continuous features into discrete ranges",
+            "OneHotEncoder": "Converts categorical variables into binary variables",
+            "SimpleImputer": "Handles missing values by imputing with multiple linear regression strategies",
+        }
+
+        return descriptions.get(step_name, f"Unknown preprocessing step: {step_name}")
+
+    # ------------------------------ Step Handlers ------------------------------
+    def _handle_datascaler(self, X: pd.DataFrame, fit: bool, n: int = 1) -> pd.DataFrame:
+        """Handle DataScaler (fits on training data, applies to all)."""
+        numeric_X = X.select_dtypes(include=["float"])
+        numeric_columns = numeric_X.columns.tolist()
+        n = None if n == 0 else n
+        if fit:
+            scaler = DataScaler(numeric_X.values.T, n=n)
+            self.fitted_components["DataScaler"] = scaler
+            numeric_X = pd.DataFrame(scaler.rescale().T, columns=numeric_X.columns)
+        else:
+            scaler = self.fitted_components["DataScaler"]
+            numeric_X = pd.DataFrame(
+                scaler.rescale(numeric_X.values.T).T, columns=numeric_X.columns
+            )
+        for col in numeric_columns:
+            X[col] = numeric_X[col]
+        return X
+
+    def _handle_dataframeencoder(
+        self, X: pd.DataFrame, fit: bool, norm_method: str = "mean"
+    ) -> pd.DataFrame:
+        """Handle DataFrameEncoder (fits encoders/normalizers)."""
+        if fit:
+            encoder = DataFrameEncoder(X)
+            encoded_X = encoder.encode(norm_method=norm_method)
+            self.fitted_components["DataFrameEncoder"] = encoder
+            return encoded_X
+        else:
+            encoder = self.fitted_components["DataFrameEncoder"]
+            encoder._df = X
+            return encoder.encode()
+
+    def _handle_remove_collinearity(
+        self, X: pd.DataFrame, fit: bool, threshold: float = 0.9
+    ) -> pd.DataFrame:
+        """Handle collinearity removal (fits by selecting columns to drop)."""
+        numeric_X = X.select_dtypes(include=["float"])
+        numeric_columns = numeric_X.columns.tolist()
+        categorical_columns = set(X.columns) - set(numeric_columns)
+        if fit:
+            cleaned_X = remove_collinearity(numeric_X, threshold=threshold)
+            dropped_cols = set(X.columns) - set(cleaned_X.columns) - categorical_columns
+            self.fitted_components["remove_collinearity"] = dropped_cols
+            return X.drop(columns=dropped_cols)
+        else:
+            dropped_cols = self.fitted_components["remove_collinearity"]
+            return X.drop(columns=dropped_cols)
+
+    def _handle_transformrange(
+        self, X: pd.DataFrame, fit: bool, columns_bin_sizes: Dict[str, int] | None = None
+    ) -> pd.DataFrame:
+        """Handle TransformRange (bin numerical features into ranges)."""
+        if fit:
+            transformer = TransformRange(columns_bin_sizes)
+            cleaned_X = transformer.transform(X)
+            self.fitted_components["TransformRange"] = transformer
+            self.columns_bin_sizes = columns_bin_sizes
+            return cleaned_X
+        else:
+            transformer = self.fitted_components["TransformRange"]
+            return transformer.transform(X, fit=False)
+
+    def _handle_onehotencoder(
+        self, X: pd.DataFrame, fit: bool, columns: List[str] | None = None
+    ) -> pd.DataFrame:
+        """Handle OneHotEncoder (fits on categorical columns)."""
+        if fit:
+            tmp_df = X.drop(columns=columns)
+            encoder = OneHotEncoder()
+            category_to_indices = {}
+            for col in columns:
+                unique_values = X[col].unique()
+                category_to_indices[col] = {value: i for i, value in enumerate(unique_values)}
+                encoded_X = encoder.encode(
+                    X[col].values
+                    if isinstance(unique_values[0], int)
+                    else X[col].map(category_to_indices[col])
+                )
+                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+            self.fitted_components["OneHotEncoder"] = (encoder, columns, category_to_indices)
+        else:
+            encoder, columns, category_to_indices = self.fitted_components["OneHotEncoder"]
+            tmp_df = X.drop(columns=columns)
+            for col in columns:
+                unique_values = list(category_to_indices[col].keys())
+                encoded_X = encoder.encode(
+                    (
+                        X[col].values
+                        if isinstance(unique_values[0], int)
+                        else X[col].map(category_to_indices[col])
+                    ),
+                    fit=False,
+                )
+                tmp_df = pd.concat([tmp_df, pd.DataFrame(encoded_X, columns=unique_values)], axis=1)
+        return tmp_df
+
+    def _handle_simpleimputer(
+        self,
+        X: pd.DataFrame,
+        fit: bool,
+        use_scaler: bool = False,
+        boundary: bool = True,
+    ) -> pd.DataFrame:
+        "Handle SimpleImputer (fit on numerical and categorical columns)."
+        if fit:
+            use_scaler = True if use_scaler == 1 else False
+            imputer = SimpleImputer(use_scaler=use_scaler)
+            tmp_df = imputer.fit_transform(X, boundary=boundary)
+            self.fitted_components["SimpleImputer"] = imputer
+            return tmp_df
+        else:
+            imputer = self.fitted_components["SimpleImputer"]
+            return imputer.transform(X, boundary=boundary)
+
+    def save(self, filepath: str) -> None:
+        """
+        Save the fitted pipeline state to a file using pickle.
+
+        Parameters
+        ----------
+        filepath : str
+            Path where the serialized pipeline will be saved.
+        """
+        import pickle
+
+        save_dict = {
+            "config": self.config,
+            "fitted_components": self.fitted_components,
+            "target_col": self.target_col,
+            "steps": self.steps,
+            "compute_importance": self.compute_importance,
+            "columns_bin_sizes": self.columns_bin_sizes,
+            "documentation": self.documentation,
+        }
+
+        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+        with open(filepath, "wb") as f:
+            pickle.dump(save_dict, f)
+
+    @classmethod
+    def load(cls, filepath: str) -> "Pipeline":
+        """
+        Load a fitted pipeline from a file.
+
+        Parameters
+        ----------
+        filepath : str
+            Path to the serialized pipeline file.
+
+        Returns
+        -------
+        pipeline : Pipeline
+            Reconstructed pipeline instance with fitted components.
+        """
+        import pickle
+
+        filepath = filepath + ".pkl" if not filepath.endswith(".pkl") else filepath
+
+        with open(filepath, "rb") as f:
+            save_dict = pickle.load(f)
+
+        pipeline = cls.__new__(cls)
+
+        pipeline.config = save_dict["config"]
+        pipeline.fitted_components = save_dict["fitted_components"]
+        pipeline.target_col = save_dict["target_col"]
+        pipeline.steps = save_dict["steps"]
+        pipeline.compute_importance = save_dict["compute_importance"]
+        pipeline.columns_bin_sizes = save_dict["columns_bin_sizes"]
+        pipeline.documentation = save_dict["documentation"]
+
+        return pipeline
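For orientation, a minimal usage sketch of the new Pipeline class follows. The config keys and step names come straight from _load_config and _apply_step above; the column names, file path, and step parameters are hypothetical. Note that, as published, fit() returns df_scores_norm, which is only assigned when compute_feature_importance is enabled, so the sketch turns that flag on.

import json

import numpy as np
import pandas as pd

from likelihood.pipes import Pipeline

# Hypothetical config; every "name" must be a handler registered in _apply_step.
config = {
    "target_column": "price",
    "preprocessing_steps": [
        {"name": "SimpleImputer", "params": {"use_scaler": False}},
        {"name": "remove_collinearity", "params": {"threshold": 0.9}},
        {"name": "DataScaler", "params": {"n": 1}},
    ],
    "compute_feature_importance": True,
}
with open("pipeline_config.json", "w") as f:
    json.dump(config, f)

df = pd.DataFrame(
    {
        "rooms": [2.0, 3.0, np.nan, 4.0],
        "area": [50.0, 75.0, 80.0, 100.0],
        "price": [120.0, 180.0, 190.0, 250.0],
    }
)

pipe = Pipeline("pipeline_config.json")
X, y, scores = pipe.fit(df)                 # fits each step, documents it, returns cleaned X/y
pipe.get_doc(save_to_file=True)             # writes data_processing_report.html
X_new = pipe.transform(df.drop(columns=["price"]))  # replays fitted steps on new data
pipe.save("my_pipeline")                    # pickles config + fitted components
restored = Pipeline.load("my_pipeline")     # rebuilds the pipeline without re-fitting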
likelihood/tools/models_tools.py
@@ -11,7 +11,7 @@ logging.getLogger("tensorflow").setLevel(logging.ERROR)
 import sys
 import warnings
 from functools import wraps
-from typing import Dict
+from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
 import tensorflow as tf
@@ -40,6 +40,214 @@ def suppress_warnings(func):
     return wrapper


+class TransformRange:
+    """
+    Generates a new DataFrame with ranges represented as strings.
+
+    Transforms numerical columns into categorical range bins with descriptive labels.
+    """
+
+    def __init__(self, columns_bin_sizes: Dict[str, int]) -> None:
+        """Initializes the class with the original DataFrame.
+
+        Parameters
+        ----------
+        columns_bin_sizes : `dict`
+            A dictionary where the keys are column names and the values are the bin sizes.
+
+        Raises
+        ------
+        TypeError
+            If df is not a pandas DataFrame.
+        """
+        self.info = {}
+        self.columns_bin_sizes = columns_bin_sizes
+
+    def _create_bins_and_labels(
+        self, min_val: Union[int, float], max_val: Union[int, float], bin_size: int
+    ) -> Tuple[np.ndarray, List[str]]:
+        """
+        Creates the bin edges and their labels.
+
+        Parameters
+        ----------
+        min_val : `int` or `float`
+            The minimum value for the range.
+        max_val : `int` or `float`
+            The maximum value for the range.
+        bin_size : `int`
+            The size of each bin.
+
+        Returns
+        -------
+        bins : `np.ndarray`
+            The bin edges.
+        labels : `list`
+            The labels for the bins.
+
+        Raises
+        ------
+        ValueError
+            If bin_size is not positive or if min_val >= max_val.
+        """
+        if bin_size <= 0:
+            raise ValueError("bin_size must be positive")
+        if min_val >= max_val:
+            raise ValueError("min_val must be less than max_val")
+
+        start = int(min_val)
+        end = int(max_val) + bin_size
+
+        bins = np.arange(start, end + 1, bin_size)
+
+        if bins[-1] <= max_val:
+            bins = np.append(bins, max_val + 1)
+
+        lower_bin_edge = -np.inf
+        upper_bin_edge = np.inf
+
+        labels = [f"{int(bins[i])}-{int(bins[i+1] - 1)}" for i in range(len(bins) - 1)]
+        end = int(bins[-1] - 1)
+        bins = bins.tolist()
+        bins.insert(0, lower_bin_edge)
+        bins.append(upper_bin_edge)
+        labels.insert(0, f"< {start}")
+        labels.append(f"> {end}")
+        return bins, labels
+
+    def _transform_column_to_ranges(
+        self, df: pd.DataFrame, column: str, bin_size: int, fit: bool = True
+    ) -> pd.Series:
+        """
+        Transforms a column in the DataFrame into range bins.
+
+        Parameters
+        ----------
+        df : `pd.DataFrame`
+            The original DataFrame to transform.
+        column : `str`
+            The name of the column to transform.
+        bin_size : `int`
+            The size of each bin.
+
+        Returns
+        -------
+        `pd.Series`
+            A Series with the range labels.
+
+        Raises
+        ------
+        KeyError
+            If column is not found in the DataFrame.
+        ValueError
+            If bin_size is not positive or if column contains non-numeric data.
+        """
+        if not isinstance(df, pd.DataFrame):
+            raise TypeError("df must be a pandas DataFrame")
+        df_ = df.copy()  # Create a copy to avoid modifying the original
+        numeric_series = pd.to_numeric(df_[column], errors="coerce")
+        if fit:
+            self.df = df_.copy()
+            if column not in df_.columns:
+                raise KeyError(f"Column '{column}' not found in DataFrame")
+
+            if bin_size <= 0:
+                raise ValueError("bin_size must be positive")
+
+            if numeric_series.isna().all():
+                raise ValueError(f"Column '{column}' contains no valid numeric data")
+
+            min_val = numeric_series.min()
+            max_val = numeric_series.max()
+
+            if min_val == max_val:
+                return pd.Series(
+                    [f"{int(min_val)}-{int(max_val)}"] * len(df_), name=f"{column}_range"
+                )
+            self.info[column] = {"min_value": min_val, "max_value": max_val, "range": bin_size}
+        else:
+            min_val = self.info[column]["min_value"]
+            max_val = self.info[column]["max_value"]
+            bin_size = self.info[column]["range"]
+
+        bins, labels = self._create_bins_and_labels(min_val, max_val, bin_size)
+        return pd.cut(numeric_series, bins=bins, labels=labels, right=False, include_lowest=True)
+
+    def transform(
+        self, df: pd.DataFrame, drop_original: bool = False, fit: bool = True
+    ) -> pd.DataFrame:
+        """
+        Creates a new DataFrame with range columns.
+
+        Parameters
+        ----------
+        df : `pd.DataFrame`
+            The original DataFrame to transform.
+        drop_original : `bool`, optional
+            If True, drops original columns from the result, by default False
+        fit : `bool`, default=True
+            Whether to compute bin edges based on the data (True) or use predefined binning (False).
+
+        Returns
+        -------
+        `pd.DataFrame`
+            A DataFrame with the transformed range columns.
+
+        Raises
+        ------
+        TypeError
+            If columns_bin_sizes is not a dictionary.
+        """
+        if not isinstance(self.columns_bin_sizes, dict):
+            raise TypeError("columns_bin_sizes must be a dictionary")
+
+        if not self.columns_bin_sizes:
+            return pd.DataFrame()
+
+        range_columns = {}
+        for column, bin_size in self.columns_bin_sizes.items():
+            range_columns[f"{column}_range"] = self._transform_column_to_ranges(
+                df, column, bin_size, fit
+            )
+
+        result_df = pd.DataFrame(range_columns)
+
+        if not drop_original:
+            original_cols = [col for col in df.columns if col not in self.columns_bin_sizes]
+            if original_cols:
+                result_df = pd.concat([df[original_cols], result_df], axis=1)
+
+        return result_df
+
+    def get_range_info(self, column: str) -> Dict[str, Union[int, float, List[str]]]:
+        """
+        Get information about the range transformation for a specific column.
+
+        Parameters
+        ----------
+        column : `str`
+            The name of the column to analyze.
+
+        Returns
+        -------
+        `dict`
+            Dictionary containing min_val, max_val, bin_size, and labels.
+        """
+        if column not in self.df.columns:
+            raise KeyError(f"Column '{column}' not found in DataFrame")
+
+        numeric_series = pd.to_numeric(self.df[column], errors="coerce")
+        min_val = numeric_series.min()
+        max_val = numeric_series.max()
+
+        return {
+            "min_value": min_val,
+            "max_value": max_val,
+            "range": max_val - min_val,
+            "column": column,
+        }
+
+
 def remove_collinearity(df: DataFrame, threshold: float = 0.9):
     """
     Removes highly collinear features from the DataFrame based on a correlation threshold.
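To make the new TransformRange concrete, a short sketch follows; the DataFrame and bin sizes are hypothetical. transform() adds a "<column>_range" column of string bins, with open-ended "< start" and "> end" buckets produced by _create_bins_and_labels, and fit=False replays the edges stored in self.info.

import pandas as pd

from likelihood.tools.models_tools import TransformRange

df = pd.DataFrame({"age": [5, 23, 41, 67], "city": ["A", "B", "A", "C"]})

tr = TransformRange({"age": 20})   # bin the "age" column in steps of 20
binned = tr.transform(df)          # adds "age_range" (e.g. "5-24"), keeps "city"
print(binned["age_range"].tolist())

# Replay the fitted bin edges on unseen data without re-computing min/max.
new = tr.transform(pd.DataFrame({"age": [30]}), fit=False)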
@@ -56,8 +264,8 @@ def remove_collinearity(df: DataFrame, threshold: float = 0.9):
         The correlation threshold above which features will be removed. Default is `0.9`.

     Returns
-
-        DataFrame: A DataFrame with highly collinear features removed.
+    -------
+    DataFrame : A DataFrame with highly collinear features removed.
     """
     corr_matrix = df.corr().abs()
     upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
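remove_collinearity itself is unchanged apart from the docstring; for reference, a small sketch with a hypothetical frame in which two features are almost perfectly correlated:

import numpy as np
import pandas as pd

from likelihood.tools.models_tools import remove_collinearity

rng = np.random.default_rng(0)
x = np.linspace(0.0, 1.0, 100)
df = pd.DataFrame({"a": x, "b": 2.0 * x + 0.01, "c": rng.random(100)})

cleaned = remove_collinearity(df, threshold=0.9)
print(cleaned.columns.tolist())  # one of the correlated pair "a"/"b" is dropped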
@@ -97,11 +305,11 @@ def train_and_insights(
         Fraction of data to use (default is 1.0).

     Keyword Arguments:
-
+    ------------------
     Additional keyword arguments passed to the `model.fit` function, such as validation split and callbacks.

     Returns
-
+    -------
     `tf.keras.Model`
         The trained model after fitting.
     """
@@ -207,7 +415,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
         A threshold for the eigenvector centrality calculation, used to determine the cutoff for small eigenvalues. Default is `1e-6`.

     Returns
-
+    -------
     DataFrame : A DataFrame containing the following graph metrics as columns.
         - `Degree Centrality`: Degree centrality values for each node, indicating the number of direct connections each node has.
        - `Clustering Coefficient`: Clustering coefficient values for each node, representing the degree to which nodes cluster together.
@@ -218,7 +426,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
         - `Assortativity`: The assortativity coefficient of the graph, measuring the tendency of nodes to connect to similar nodes.

     Notes
-
+    -----
     The returned DataFrame will have one row for each node and one column for each of the computed metrics.
     """
     adj_matrix = adj_matrix.astype(int)
@@ -251,3 +459,7 @@ def graph_metrics(adj_matrix: np.ndarray, eigenvector_threshold: float = 1e-6) -
     metrics_df["Assortativity"] = assortativity

     return metrics_df
+
+
+if __name__ == "__main__":
+    pass
likelihood/tools/numeric_tools.py
@@ -154,7 +154,7 @@ def xicor(X: np.ndarray, Y: np.ndarray, ties: bool = True, random_seed: int = None
         The first variable to be correlated. Must have at least one dimension.
     Y : `np.ndarray`
         The second variable to be correlated. Must have at least one dimension.
-    ties : bool
+    ties : `bool`
         Whether to handle ties using randomization.
     random_seed : int, optional
         Seed for the random number generator for reproducibility.
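A usage sketch for xicor with synthetic arrays; the dependence below is monotone, so the coefficient should come out high, and random_seed pins down the tie-breaking randomization documented above. The arrays here are hypothetical.

import numpy as np

from likelihood.tools.numeric_tools import xicor

rng = np.random.default_rng(0)
X = rng.normal(size=200)
Y = X**3 + 0.1 * rng.normal(size=200)  # noisy but monotone relationship

xi = xicor(X, Y, ties=True, random_seed=42)
print(xi)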
@@ -356,9 +356,9 @@ def find_multiples(target: int) -> tuple[int, int] | None:
     Returns
     -------
     tuple[int, int] | None
-        If i and i+1 both divide target, returns (i, i+1).
-        Otherwise, returns (i, target // i)
-        Returns None if no factors are found.
+        If `i` and `i+1` both divide target, returns (i, i+1).
+        Otherwise, returns `(i, target // i)`.
+        Returns `None` if no factors are found.
     """
     for i in range(2, target + 1):
         if target % i == 0:
likelihood/tools/reports.py (new file)
@@ -0,0 +1,195 @@
+from html import escape
+from IPython.display import HTML, display
+
+
+def generate_html_pipeline(data_dict, save_to_file=False, file_name="data_processing_report.html"):
+    css_js = """
+    <style>
+    :root {
+        --primary: #0d9488;
+        --primary-dark: #0f766e;
+        --success: #10b981;
+        --accent: #3b82f6;
+        --card-bg: #ffffff;
+        --shadow-sm: 0 2px 6px rgba(0, 0, 0, 0.03);
+        --border-radius-md: 6px;
+    }
+
+    * {
+        box-sizing: border-box;
+    }
+
+    body {
+        font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
+        background: #f8fafc;
+        color: #1e293b;
+        margin: 0;
+        padding: 1rem;
+        font-size: 14px;
+    }
+
+    h2 {
+        background: linear-gradient(135deg, var(--primary), var(--primary-dark));
+        color: white;
+        text-align: center;
+        padding: 1rem;
+        border-radius: var(--border-radius-md);
+        font-weight: 600;
+        font-size: 1.5rem;
+        margin-bottom: 1.5rem;
+    }
+
+    section {
+        background: var(--card-bg);
+        border-radius: var(--border-radius-md);
+        padding: 1rem;
+        box-shadow: var(--shadow-sm);
+        margin-bottom: 1.2rem;
+    }
+
+    h3 {
+        color: var(--primary-dark);
+        font-weight: 600;
+        font-size: 1.2rem;
+        border-left: 4px solid var(--success);
+        padding-left: 0.8rem;
+        margin: 1rem 0 0.8rem;
+    }
+
+    table {
+        width: 100%;
+        border-collapse: collapse;
+        font-size: 13px;
+        margin: 0.5rem 0 1rem;
+    }
+
+    th, td {
+        padding: 0.5rem 0.75rem;
+        text-align: left;
+        border-bottom: 1px solid #e2e8f0;
+        vertical-align: top;
+    }
+
+    thead {
+        background-color: #f0fdf4;
+    }
+
+    tbody tr:nth-child(odd) {
+        background-color: #f9fafb;
+    }
+
+    tbody tr:hover {
+        background-color: #e0f2fe;
+    }
+
+    .nested-table {
+        font-size: 12px;
+        margin-top: 0.5rem;
+    }
+
+    details {
+        margin-bottom: 0.8rem;
+        padding: 0.5rem 0.8rem;
+        background: #f9f9f9;
+        border-radius: var(--border-radius-md);
+    }
+
+    summary {
+        font-weight: 600;
+        font-size: 1rem;
+        color: var(--primary-dark);
+        cursor: pointer;
+    }
+
+    summary::before {
+        content: "▶";
+        margin-right: 6px;
+        color: var(--success);
+        font-size: 0.9rem;
+    }
+
+    @media (max-width: 768px) {
+        body {
+            font-size: 13px;
+        }
+
+        h2 {
+            font-size: 1.3rem;
+            padding: 0.8rem;
+        }
+
+        h3 {
+            font-size: 1.1rem;
+        }
+
+        table, .nested-table {
+            font-size: 12px;
+        }
+    }
+    </style>
+    """
+
+    def render_value(val):
+        if isinstance(val, dict):
+            return dict_to_table(val, nested=True)
+        elif isinstance(val, list):
+            if all(isinstance(item, (str, int, float)) for item in val):
+                return ", ".join(escape(str(x)) for x in val)
+            else:
+                return "<ul>" + "".join(f"<li>{render_value(v)}</li>" for v in val) + "</ul>"
+        else:
+            return escape(str(val))
+
+    def dict_to_table(d, title=None, nested=False):
+        html = ""
+        if title and not nested:
+            html += f"<h4>{escape(title)}</h4>"
+        table_class = "nested-table" if nested else "table"
+        html += f"<table class='{table_class}'>"
+        html += "<thead><tr><th>Key</th><th>Value</th></tr></thead><tbody>"
+        for key, val in d.items():
+            key_html = escape(str(key))
+            val_html = render_value(val)
+            html += f"<tr><td>{key_html}</td><td>{val_html}</td></tr>"
+        html += "</tbody></table>"
+        return html
+
+    html_content = css_js
+    html_content += "<h2>📈 Data Processing Report</h2>"
+
+    html_content += "<section>"
+    html_content += "<h3>📁 Initial Dataset</h3>"
+    html_content += dict_to_table(data_dict["initial_dataset"])
+    html_content += "</section>"
+
+    html_content += "<section>"
+    html_content += "<h3>🔧 Processing Steps</h3>"
+    for i, step in enumerate(data_dict["processing_steps"]):
+        html_content += "<details open>"
+        html_content += f"<summary>Step {i + 1}: {escape(step['step_name'])}</summary>"
+        html_content += f"<p><strong>Description:</strong> {escape(step['description'])}</p>"
+        html_content += dict_to_table(step["parameters"], title="Parameters", nested=True)
+        html_content += dict_to_table(
+            {
+                "Output Shape": step["output_shape"],
+                "Input Columns": step["input_columns"],
+                "Output Columns": step["output_columns"],
+                "Output Dtypes": step["output_dtypes"],
+            },
+            title="Output Info",
+            nested=True,
+        )
+        html_content += "</details>"
+    html_content += "</section>"
+
+    html_content += "<section>"
+    html_content += "<h3>✅ Final Dataset</h3>"
+    html_content += dict_to_table(data_dict["final_dataset"])
+    html_content += "</section>"
+
+    if save_to_file:
+        with open(file_name, "w", encoding="utf-8") as f:
+            f.write(html_content)
+        print(f"✅ Report saved to '{file_name}'")
+    else:
+        display(HTML(html_content))
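A minimal sketch of calling generate_html_pipeline directly. The dictionary shape mirrors the keys the function reads ("initial_dataset", "processing_steps", "final_dataset", plus the per-step fields); all values below are hypothetical.

from likelihood.tools.reports import generate_html_pipeline

doc = {
    "initial_dataset": {"shape": (4, 2), "columns": ["rooms", "area"]},
    "processing_steps": [
        {
            "step_name": "DataScaler",
            "description": "Scales numerical features using normalization",
            "parameters": {"n": 1},
            "output_shape": (4, 2),
            "input_columns": ["rooms", "area"],
            "output_columns": ["rooms", "area"],
            "output_dtypes": {"rooms": "float64", "area": "float64"},
        }
    ],
    "final_dataset": {"shape": (4, 2), "columns": ["rooms", "area"]},
}

# save_to_file=True writes the HTML report; False displays it in a notebook.
generate_html_pipeline(doc, save_to_file=True, file_name="report.html")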
likelihood/tools/tools.py
@@ -2,7 +2,7 @@ import math
 import os
 import pickle
 import warnings
-from typing import Callable, Dict, List, Tuple, Union
+from typing import Callable, Dict, Generator, List, Tuple, Union

 import matplotlib.pyplot as plt
 import numpy as np
@@ -25,7 +25,7 @@ Data Science from Scratch, Second Edition, by Joel Grus (O'Reilly).Copyright 201
 """


-def minibatches(dataset: List, batch_size: int, shuffle: bool = True) ->
+def minibatches(dataset: List, batch_size: int, shuffle: bool = True) -> Generator:
     """Generates 'batch_size'-sized minibatches from the dataset

     Parameters
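The corrected annotation makes the generator contract explicit; a small sketch, assuming the generator yields batches of at most batch_size items from a hypothetical dataset:

from likelihood.tools.tools import minibatches

dataset = list(range(10))
for batch in minibatches(dataset, batch_size=4, shuffle=False):
    print(batch)  # 'batch_size'-sized minibatches drawn from the dataset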
@@ -660,7 +660,7 @@ class DataScaler:

     __slots__ = ["dataset_", "_n", "data_scaled", "values", "inv_fitting"]

-    def __init__(self, dataset: np.ndarray, n: int = 1) -> None:
+    def __init__(self, dataset: np.ndarray, n: int | None = 1) -> None:
         """Initializes the parameters required for scaling the data"""
         self.dataset_ = dataset.copy()
         self._n = n
@@ -861,7 +861,7 @@ class DataFrameEncoder:
         """Encodes the `object` type columns of the dataframe

         Keyword Arguments:
-
+        ------------------
         - save_mode (`bool`): An optional integer parameter. By default it is set to `True`
         - dictionary_name (`str`): An optional string parameter. By default it is set to `labelencoder_dictionary`
         - norm_method (`str`): An optional string parameter to perform normalization. By default it is set to `None`
@@ -1024,20 +1024,21 @@ class OneHotEncoder:
     It receives an array of integers and returns a binary array using the one-hot encoding method.
     """

-    __slots__ = ["
+    __slots__ = ["num_categories"]

     def __init__(self) -> None:
         pass

-    def encode(self, x: np.ndarray | list):
-
-
-
-
+    def encode(self, x: np.ndarray | list, fit: bool = True):
+        if not isinstance(x, np.ndarray):
+            x = np.array(x)
+        x = x.astype(int)
+        if fit:
+            self.num_categories = x.max() + 1

-        y = np.zeros((
+        y = np.zeros((x.size, self.num_categories))

-        y[np.arange(
+        y[np.arange(x.size), x] = 1

         return y

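The rewritten encode is now fully visible in the diff: with fit=True it learns num_categories from x.max() + 1, and with fit=False it reuses the stored width. A brief sketch with hypothetical integer labels:

from likelihood.tools.tools import OneHotEncoder

enc = OneHotEncoder()
train = enc.encode([0, 2, 1])          # learns num_categories = 3
test = enc.encode([1, 0], fit=False)   # encodes with the already-fitted width
print(train)                           # 3x3 binary matrix with a single 1 per row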
@@ -1189,7 +1190,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     if nan_values:
         (
             print(
-                "UserWarning: Some rows may have been deleted due to the existence of NaN values."
+                "UserWarning: Some rows may have been deleted due to the existence of NaN values.",
+                f"NaN values removed: ",
+                "{:,}".format(nan_count),
             )
             if verbose
             else None
@@ -1199,7 +1202,9 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     if inf_values:
         (
             print(
-                "UserWarning: Some rows may have been deleted due to the existence of Inf values."
+                "UserWarning: Some rows may have been deleted due to the existence of Inf values.",
+                f"Infinite values removed: ",
+                "{:,}".format(inf_count),
             )
             if verbose
             else None
@@ -1207,9 +1212,6 @@ def check_nan_inf(df: DataFrame, verbose: bool = False) -> DataFrame:
     df.replace([np.inf, -np.inf], np.nan, inplace=True)
     df.dropna(inplace=True)

-    print(f"NaN values removed: ", "{:,}".format(nan_count))
-    print(f"Infinite values removed: ", "{:,}".format(inf_count))
-
     return df

likelihood.egg-info/SOURCES.txt
@@ -3,6 +3,7 @@ README.md
 setup.py
 likelihood/__init__.py
 likelihood/main.py
+likelihood/pipes.py
 likelihood.egg-info/PKG-INFO
 likelihood.egg-info/SOURCES.txt
 likelihood.egg-info/dependency_links.txt
@@ -30,4 +31,5 @@ likelihood/tools/figures.py
 likelihood/tools/impute.py
 likelihood/tools/models_tools.py
 likelihood/tools/numeric_tools.py
+likelihood/tools/reports.py
 likelihood/tools/tools.py