dragon-ml-toolbox 1.4.1__tar.gz → 1.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (24)
  1. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/LICENSE-THIRD-PARTY.md +6 -1
  2. {dragon_ml_toolbox-1.4.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.4.3}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  4. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/MICE_imputation.py +22 -14
  5. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/data_exploration.py +41 -8
  6. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/ensemble_learning.py +446 -187
  7. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/particle_swarm_optimization.py +43 -52
  8. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/utilities.py +44 -8
  9. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/pyproject.toml +1 -1
  10. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/LICENSE +0 -0
  11. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/README.md +0 -0
  12. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  13. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  14. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  15. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  16. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/VIF_factor.py +0 -0
  17. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/__init__.py +0 -0
  18. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/datasetmaster.py +0 -0
  19. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/handle_excel.py +0 -0
  20. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/logger.py +0 -0
  21. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/pytorch_models.py +0 -0
  22. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/trainer.py +0 -0
  23. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/ml_tools/vision_helpers.py +0 -0
  24. {dragon_ml_toolbox-1.4.1 → dragon_ml_toolbox-1.4.3}/setup.cfg +0 -0
@@ -8,7 +8,12 @@ This project depends on the following third-party packages. Each is governed by
 - [seaborn](https://github.com/mwaskom/seaborn/blob/main/LICENSE)
 - [statsmodels](https://github.com/statsmodels/statsmodels/blob/main/LICENSE.txt)
 - [ipython](https://github.com/ipython/ipython/blob/main/COPYING.rst)
+- [ipykernel](https://github.com/ipython/ipykernel/blob/main/COPYING.rst)
+- [notebook](https://github.com/jupyter/notebook/blob/main/LICENSE)
+- [jupyterlab](https://github.com/jupyterlab/jupyterlab/blob/main/LICENSE)
+- [ipywidgets](https://github.com/jupyter-widgets/ipywidgets/blob/main/LICENSE)
 - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
+- [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
 - [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 - [imblearn](https://github.com/scikit-learn-contrib/imbalanced-learn/blob/main/LICENSE)
 - [Pillow](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
@@ -19,5 +24,5 @@ This project depends on the following third-party packages. Each is governed by
 - [openpyxl](https://github.com/chronossc/openpyxl/blob/main/LICENSE)
 - [miceforest](https://github.com/AnotherSamWilson/miceforest/blob/main/LICENSE)
 - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
-- [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
+- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE.txt)
 - [pyswarm](https://pythonhosted.org/pyswarm/#license)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.1
+Version: 1.4.3
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.4.1
+Version: 1.4.3
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -3,7 +3,7 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
 
 
@@ -49,15 +49,11 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     return kernel, imputed_datasets, imputed_dataset_names
 
 
-def save_imputed_datasets(save_dir: str, imputed_datasets: list, imputed_dataset_names: list[str]):
-    # Check path
-    os.makedirs(save_dir, exist_ok=True)
-
+def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-        output_path = os.path.join(save_dir, subname + ".csv")
-        imputed_df.to_csv(output_path, index=False, encoding='utf-8')
-        print(f"\tSaved {subname} with shape {imputed_df.shape}")
-
+        merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
+        save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
+
 
 #Get names of features that had missing values before imputation
 def get_na_column_names(df: pd.DataFrame):
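For orientation, a minimal usage sketch of the new save_imputed_datasets signature. The frames, dataset name, and the imputed_output directory are hypothetical, and the horizontal-concatenation behavior of merge_dataframes is assumed from its keyword arguments in the hunk above:

    # Hypothetical usage sketch; assumes the function is importable as below.
    from ml_tools.MICE_imputation import save_imputed_datasets
    import pandas as pd

    # One imputed feature frame (e.g. produced by apply_mice) and the
    # target columns that were held out of the imputation.
    imputed_datasets = [pd.DataFrame({"feat_a": [1.0, 2.0, 3.0]})]
    df_targets = pd.DataFrame({"target": [0.5, 1.5, 2.5]})

    # Each imputed frame is merged side-by-side with the targets, then saved.
    save_imputed_datasets(
        save_dir="imputed_output",
        imputed_datasets=imputed_datasets,
        df_targets=df_targets,
        imputed_dataset_names=["dataset_MICE_1"],
    )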
@@ -119,7 +115,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         plt.savefig(save_path, bbox_inches='tight', format="svg")
         plt.close()
 
-    print(f"\t{dataset_file_dir} completed.")
+    print(f"{dataset_file_dir} completed.")
 
 
 # Imputed distributions
@@ -131,7 +127,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     '''
     # Check path
     os.makedirs(root_dir, exist_ok=True)
-    local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
+    local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
+    local_save_dir = os.path.join(root_dir, local_dir_name)
     if not os.path.isdir(local_save_dir):
         os.makedirs(local_save_dir)
 
@@ -202,10 +199,10 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         fig = kernel.plot_imputed_distributions(variables=[feature])
         _process_figure(fig, feature)
 
-    print("\tImputed distributions saved successfully.")
+    print(f"{local_dir_name} completed.")
 
 
-def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
+def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str], save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
     """
     Call functions in sequence for each dataset in the provided path or directory:
     1. Load dataframe
@@ -213,6 +210,8 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     3. Save imputed dataset(s)
     4. Save convergence metrics
     5. Save distribution metrics
+
+    Target columns are excluded from the imputation and merged back when saving.
     """
     # Check paths
     os.makedirs(save_datasets_dir, exist_ok=True)
@@ -228,9 +227,11 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)
 
+        df, df_targets = _skip_targets(df, target_columns)
+
         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
 
-        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, imputed_dataset_names=imputed_dataset_names)
+        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
 
         imputed_column_names = get_na_column_names(df=df)
 
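A sketch of calling the updated pipeline entry point with the new target_columns parameter; every path and column name below is a placeholder, and the remaining keyword values simply echo the defaults shown in the signature above:

    # Hypothetical call; assumes the function is importable as below.
    from ml_tools.MICE_imputation import run_mice_pipeline

    run_mice_pipeline(
        df_path_or_dir="data/raw",              # placeholder: a CSV file or a directory of CSVs
        target_columns=["target_1", "target_2"],
        save_datasets_dir="data/imputed",
        save_metrics_dir="data/mice_metrics",
        resulting_datasets=3,
        iterations=20,
        random_state=101,
    )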
@@ -239,5 +240,12 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
         get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
 
 
+def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
+    valid_targets = [col for col in target_cols if col in df.columns]
+    df_targets = df[valid_targets]
+    df_feats = df.drop(columns=valid_targets)
+    return df_feats, df_targets
+
+
 def info():
     _script_info(__all__)
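The new helper's behavior can be seen in a small, self-contained example (data invented for illustration); note that target names absent from the frame are silently ignored:

    # Illustrative only; _skip_targets is a private helper of the module above.
    from ml_tools.MICE_imputation import _skip_targets
    import pandas as pd

    df = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "y": [0, 1]})

    # 'not_a_column' is filtered out because it is not in df.columns.
    df_feats, df_targets = _skip_targets(df, target_cols=["y", "not_a_column"])
    print(list(df_feats.columns))    # ['f1', 'f2']
    print(list(df_targets.columns))  # ['y']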
@@ -5,10 +5,8 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple
+from typing import Union, Literal, Dict, Tuple, Iterator
 import os
-import sys
-import textwrap
 from ml_tools.utilities import sanitize_filename, _script_info
 
 
@@ -24,7 +22,8 @@ __all__ = [
     "check_value_distributions",
     "plot_value_distributions",
     "clip_outliers_single",
-    "clip_outliers_multi"
+    "clip_outliers_multi",
+    "distribute_datasets_by_target"
 ]
 
 
@@ -113,7 +112,7 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         round_digits (int): Number of decimal places for the percentage.
-        
+
     Returns:
         pd.DataFrame: A DataFrame summarizing missing values in each column.
     """
@@ -133,13 +132,14 @@ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
     return null_summary
 
 
-def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True) -> pd.DataFrame:
     """
     Drops columns with more than `threshold` fraction of missing values.
 
     Parameters:
         df (pd.DataFrame): The input DataFrame.
         threshold (float): Fraction of missing values above which columns are dropped.
+        show_nulls_after (bool): Show the null summary (`show_null_columns`) after dropping columns.
 
     Returns:
         pd.DataFrame: A new DataFrame without the dropped columns.
@@ -150,10 +150,15 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) ->
     if len(cols_to_drop) > 0:
         print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
         print(list(cols_to_drop))
+
+        result_df = df.drop(columns=cols_to_drop)
+        if show_nulls_after:
+            show_null_columns(df=result_df).head(20)
+
+        return result_df
     else:
         print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
-    return df.drop(columns=cols_to_drop)
+        return df
 
 
 def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
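A minimal sketch of the reworked function (example data invented for illustration): with 3 of 4 values missing, mostly_missing exceeds the 0.7 threshold and is dropped, and show_nulls_after=False skips the follow-up null summary:

    # Illustrative only; assumes the function is importable as below.
    from ml_tools.data_exploration import drop_columns_with_missing_data
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "mostly_missing": [np.nan, np.nan, np.nan, 1.0],  # 75% missing
        "complete": [1, 2, 3, 4],
    })

    # 0.75 > 0.7, so 'mostly_missing' is dropped.
    cleaned = drop_columns_with_missing_data(df, threshold=0.7, show_nulls_after=False)
    print(list(cleaned.columns))  # ['complete']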
@@ -514,6 +519,34 @@ def clip_outliers_multi(
     return new_df
 
 
+def distribute_datasets_by_target(
+    df: pd.DataFrame,
+    target_columns: list[str]
+) -> Iterator[Tuple[str, pd.DataFrame]]:
+    """
+    Yields cleaned DataFrames for each target column, where rows with missing
+    target values are removed. The target column is placed at the end.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Preprocessed dataframe with all feature and target columns ready to train.
+    target_columns : List[str]
+        List of target column names to generate per-target DataFrames.
+
+    Yields
+    ------
+    Tuple[str, pd.DataFrame]
+        * First element is the target column name.
+        * Second element is the corresponding cleaned DataFrame.
+    """
+    feature_columns = [col for col in df.columns if col not in target_columns]
+
+    for target in target_columns:
+        subset = df[feature_columns + [target]].dropna(subset=[target])
+        yield target, subset
+
+
 def _is_notebook():
     return get_ipython() is not None
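A small illustrative run of the new generator (data invented for illustration), showing one per-target frame yielded for each entry in target_columns:

    # Illustrative only; assumes the function is importable as below.
    from ml_tools.data_exploration import distribute_datasets_by_target
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "f1": [1, 2, 3],
        "t1": [0.5, np.nan, 1.5],
        "t2": [np.nan, 2.0, 3.0],
    })

    # Each subset keeps all features plus one target, with NaN-target rows dropped.
    for target_name, subset in distribute_datasets_by_target(df, target_columns=["t1", "t2"]):
        print(target_name, subset.shape)  # t1 (2, 2) then t2 (2, 2)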