PyPI - dragon-ml-toolbox - Versions diffs - 1.3.1__tar.gz → 1.3.2__tar.gz - Mend

dragon-ml-toolbox 1.3.1.tar.gz → 1.3.2.tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (23)

{dragon_ml_toolbox-1.3.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-1.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.3.1
+Version: 1.3.2
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -15,7 +15,8 @@ License-File: LICENSE-THIRD-PARTY.md
 Requires-Dist: numpy<2.0
 Requires-Dist: scikit-learn
 Requires-Dist: openpyxl
-Requires-Dist: miceforest
+Requires-Dist: miceforest<7.0.0,>=6.0.0
+Requires-Dist: plotnine<0.13,>=0.12
 Requires-Dist: matplotlib
 Requires-Dist: seaborn
 Requires-Dist: pandas
@@ -23,9 +24,12 @@ Requires-Dist: polars
 Requires-Dist: imbalanced-learn
 Requires-Dist: statsmodels
 Requires-Dist: ipython
+Requires-Dist: ipykernel
+Requires-Dist: notebook
+Requires-Dist: jupyterlab
 Requires-Dist: joblib
 Requires-Dist: xgboost
-Requires-Dist: lightgbm
+Requires-Dist: lightgbm<=4.5.0
 Requires-Dist: shap
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"

{dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2/dragon_ml_toolbox.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.3.1
+Version: 1.3.2
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -15,7 +15,8 @@ License-File: LICENSE-THIRD-PARTY.md
 Requires-Dist: numpy<2.0
 Requires-Dist: scikit-learn
 Requires-Dist: openpyxl
-Requires-Dist: miceforest
+Requires-Dist: miceforest<7.0.0,>=6.0.0
+Requires-Dist: plotnine<0.13,>=0.12
 Requires-Dist: matplotlib
 Requires-Dist: seaborn
 Requires-Dist: pandas
@@ -23,9 +24,12 @@ Requires-Dist: polars
 Requires-Dist: imbalanced-learn
 Requires-Dist: statsmodels
 Requires-Dist: ipython
+Requires-Dist: ipykernel
+Requires-Dist: notebook
+Requires-Dist: jupyterlab
 Requires-Dist: joblib
 Requires-Dist: xgboost
-Requires-Dist: lightgbm
+Requires-Dist: lightgbm<=4.5.0
 Requires-Dist: shap
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"

{dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/dragon_ml_toolbox.egg-info/requires.txt RENAMED Viewed

@@ -1,7 +1,8 @@
 numpy<2.0
 scikit-learn
 openpyxl
-miceforest
+miceforest<7.0.0,>=6.0.0
+plotnine<0.13,>=0.12
 matplotlib
 seaborn
 pandas
@@ -9,9 +10,12 @@ polars
 imbalanced-learn
 statsmodels
 ipython
+ipykernel
+notebook
+jupyterlab
 joblib
 xgboost
-lightgbm
+lightgbm<=4.5.0
 shap
 [pytorch]

{dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/ml_tools/MICE_imputation.py RENAMED Viewed

@@ -3,15 +3,15 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
+from plotnine import ggplot, labs, theme, element_blank # type: ignore
 def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
     # Initialize kernel with number of imputed datasets to generate
     kernel = mf.ImputationKernel(
         data=df,
-        datasets=resulting_datasets,
+        num_datasets=resulting_datasets,
         random_state=random_state
     )
@@ -21,6 +21,9 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     # Retrieve the imputed datasets
     imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]
+    if imputed_datasets is None or len(imputed_datasets) == 0:
+        raise ValueError("No imputed datasets were generated. Check the MICE process.")
     if resulting_datasets == 1:
         imputed_dataset_names = [f"{df_name}_imputed"]
     else:
@@ -28,8 +31,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterat
     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-        assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}"
-        assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}"
+        assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
+        assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
     # print("✅ All imputed datasets match the original DataFrame indexes.")
     return kernel, imputed_datasets, imputed_dataset_names
@@ -51,46 +54,65 @@ def get_na_column_names(df: pd.DataFrame):
 #Convergence diagnostic
-def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str):
+def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str, fontsize: int=16):
+    """
+    Generate and save convergence diagnostic plots for imputed variables.
+    Parameters:
+    - kernel: Trained miceforest.ImputationKernel.
+    - imputed_dataset_names: Names assigned to each imputed dataset.
+    - column_names: List of feature names to track over iterations.
+    - root_dir: Directory to save convergence plots.
+    """
     # get number of iterations used
     iterations_cap = kernel.iteration_count()
+    dataset_count = kernel.num_datasets
+    if dataset_count != len(imputed_dataset_names):
+        raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
     # Check path
     os.makedirs(root_dir, exist_ok=True)
+    # Styling parameters
+    label_font = {'size': fontsize, 'weight': 'bold'}
     # iterate over each imputed dataset
-    for dataset_id, imputed_dataset_name in zip(range(kernel.num_datasets), imputed_dataset_names):
+    for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
         #Check directory for current dataset
         dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
         local_save_dir = os.path.join(root_dir, dataset_file_dir)
-        if not os.path.isdir(local_save_dir):
-            os.makedirs(local_save_dir)
+        os.makedirs(local_save_dir, exist_ok=True)
         for feature_name in column_names:
             means_per_iteration = []
             for iteration in range(iterations_cap):
                 current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
-                means_per_iteration.append(np.mean(current_imputed[feature_name]))
+                means_per_iteration.append(np.mean(current_imputed[feature_name])) # type: ignore
+            plt.figure(figsize=(10, 8))
             plt.plot(means_per_iteration, marker='o')
-            plt.xlabel("Iteration")
-            plt.ylabel("Mean of Imputed Values")
-            plt.title(f"Mean Convergence for '{feature_name}'")
+            plt.xlabel("Iteration", **label_font)
+            plt.ylabel("Mean of Imputed Values", **label_font)
+            plt.title(f"Mean Convergence for '{feature_name}'", **label_font)
             # Adjust plot display for the X axis
             _ticks = np.arange(iterations_cap)
             _labels = np.arange(1, iterations_cap + 1)
             plt.xticks(ticks=_ticks, labels=_labels)
+            plt.grid(True)
-            save_path = os.path.join(local_save_dir, feature_name + ".svg")
+            feature_save_name = sanitize_filename(feature_name)
+            save_path = os.path.join(local_save_dir, feature_save_name + ".svg")
             plt.savefig(save_path, bbox_inches='tight', format="svg")
             plt.close()
-        print(f"{dataset_file_dir} completed.")
+        print(f"\t{dataset_file_dir} completed.")
 # Imputed distributions
-def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=18):
+def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=14):
     '''
     It works using miceforest's authors implementation of the method `.plot_imputed_distributions()`.
@@ -106,12 +128,35 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
     label_font = {'size': fontsize, 'weight': 'bold'}
-    def _process_figure(fig, filename):
+    def _process_figure(fig, filename: str):
         """Helper function to add labels and legends to a figure"""
-        for ax in fig.axes:
+        if not isinstance(fig, ggplot):
+            raise TypeError("Expected a plotnine.ggplot object")
+        # Edit labels and title
+        fig = fig + theme(
+                plot_title=element_blank(),  # removes labs(title=...)
+                strip_text=element_blank()   # removes facet_wrap labels
+            )
+        fig = fig + labs(y="", x="")
+        # Render to matplotlib figure
+        fig = fig.draw()
+        if not hasattr(fig, 'axes') or len(fig.axes) == 0:
+            raise RuntimeError("Rendered figure has no axes to modify")
+        if filename == "Combined_Distributions":
+            custom_xlabel = "Feature Values"
+        else:
+            custom_xlabel = filename
+        for ax in fig.axes:
             # Set axis labels
-            ax.set_xlabel('Value', **label_font)
-            ax.set_ylabel('Density', **label_font)
+            ax.set_xlabel(custom_xlabel, **label_font)
+            ax.set_ylabel('Distribution', **label_font)
             # Add legend based on line colors
             lines = ax.get_lines()
@@ -122,26 +167,28 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
                 ax.legend(**legend_kwargs)
         # Adjust layout and save
-        fig.tight_layout()
+        # fig.tight_layout()
+        # fig.subplots_adjust(bottom=0.2, left=0.2)  # Optional, depending on overflow
         fig.savefig(
-            os.path.join(local_save_dir, filename),
+            os.path.join(local_save_dir, filename + ".svg"),
             format='svg',
             bbox_inches='tight',
-            pad_inches=0
+            pad_inches=0.1
         )
         plt.close(fig)
     if one_plot:
         # Generate combined plot
         fig = kernel.plot_imputed_distributions(variables=column_names)
-        _process_figure(fig, "Combined_Distributions.svg")
+        _process_figure(fig, "Combined_Distributions")
         # Generate individual plots per feature
     else:
         for feature in column_names:
             fig = kernel.plot_imputed_distributions(variables=[feature])
-            _process_figure(fig, f"{feature}.svg")
+            feature_save_name = sanitize_filename(feature)
+            _process_figure(fig, feature_save_name)
-    print("Imputed distributions saved successfully.")
+    print("\tImputed distributions saved successfully.")
 def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_dir: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):

{dragon_ml_toolbox-1.3.1 → dragon_ml_toolbox-1.3.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "1.3.1"
+version = "1.3.2"
 description = "A collection of tools for data science and machine learning projects"
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -16,7 +16,8 @@ dependencies = [
     "numpy<2.0",
     "scikit-learn",
     "openpyxl",
-    "miceforest",
+    "miceforest>=6.0.0,<7.0.0",
+    "plotnine>=0.12,<0.13",
     "matplotlib",
     "seaborn",
     "pandas",
@@ -24,9 +25,12 @@ dependencies = [
     "imbalanced-learn",
     "statsmodels",
     "ipython",
+    "ipykernel",
+    "notebook",
+    "jupyterlab",
     "joblib",
     "xgboost",
-    "lightgbm",
+    "lightgbm<=4.5.0",
     "shap"
 ]