dragon-ml-toolbox 5.2.2-py3-none-any.whl → 5.3.1-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Note: this release of dragon-ml-toolbox has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 5.2.2
+ Version: 5.3.1
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -18,7 +18,7 @@ Requires-Dist: numpy; extra == "base"
  Requires-Dist: polars; extra == "base"
  Requires-Dist: joblib; extra == "base"
  Provides-Extra: ml
- Requires-Dist: numpy; extra == "ml"
+ Requires-Dist: numpy>=2.0; extra == "ml"
  Requires-Dist: pandas; extra == "ml"
  Requires-Dist: polars; extra == "ml"
  Requires-Dist: joblib; extra == "ml"
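
The `ml` extra now pins `numpy>=2.0`, which lines up with the `np.Inf` to `np.inf` replacements further down in this diff (NumPy 2.0 removed the capitalized aliases). A minimal sketch of what that means for downstream code:

```python
# Minimal sketch: NumPy 2.0 removed the capitalized aliases (np.Inf, np.NaN),
# so environments installed through the "ml" extra must use the lowercase names.
import numpy as np

best_loss = np.inf      # works on NumPy 1.x and 2.x
# best_loss = np.Inf    # AttributeError on NumPy >= 2.0
print(np.__version__, best_loss)
```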
@@ -1,15 +1,15 @@
- dragon_ml_toolbox-5.2.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-5.2.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ dragon_ml_toolbox-5.3.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-5.3.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
  ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
  ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
- ml_tools/MICE_imputation.py,sha256=b6ZTs8RedXFifOpuMCzr68xM16mCBVh1Ua6kcGfiVtg,11462
- ml_tools/ML_callbacks.py,sha256=xiJ6NnoVwF_TVak6sYzwWFk4CI3vRJGjxvGI1Yq6euw,13332
- ml_tools/ML_datasetmaster.py,sha256=IzT2v1o71PgYCFi9RXccBnmH-t-ExzX8sn9cCD2gz-Y,33603
- ml_tools/ML_evaluation.py,sha256=4dVqe6JF1Ukmk1sAcY8E5EG1oB1_oy2HXE5OT-pZwCs,10273
+ ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
+ ml_tools/ML_callbacks.py,sha256=hOGWYM6ndaH0ibaHgM14j74MtWFalToY-oTnB2jsQ4A,13268
+ ml_tools/ML_datasetmaster.py,sha256=bbKCNA_b_uDIfxP9YIYKZm-VSfUSD15LvegFxpE9DIQ,34315
+ ml_tools/ML_evaluation.py,sha256=LX6UkUC80y43lYKBkw03CptZ3PJGkZXfmZZHL-2kd1s,11590
  ml_tools/ML_inference.py,sha256=Fh-X2UQn3AznWBjf-7iPSxwE-EzkGQm1VEIRUAkURmE,5336
  ml_tools/ML_models.py,sha256=SJhKHGAN2VTBqzcHUOpFWuVZ2Y7U1M4P_axG_LNYWcI,6460
- ml_tools/ML_optimization.py,sha256=2L9BSUzgLOEwBU84TN1qDh1KAOJ4R6C6NYSe7jmE4RI,9656
- ml_tools/ML_trainer.py,sha256=t58Ka6ryaYm0Fi5xje-e-fkmz9DwDLIeJLbh04n_gDg,15034
+ ml_tools/ML_optimization.py,sha256=zGKpWW4SL1-3iiHglDP-dkuADL73T0kxs3Dc-Lyishs,9671
+ ml_tools/ML_trainer.py,sha256=ENOxTq07kWYn7ZolMfXYLSy-cLZOdty0dRmutA84SV4,15146
  ml_tools/PSO_optimization.py,sha256=stH2Ux1sftQgX5EwLc85kHcoT4Rmz6zv7sH2yzf4Zrw,22710
  ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
  ml_tools/SQL.py,sha256=9zzS6AFEJM9aj6nE31hDe8S9TqLonk-J1amwZoiHNbk,10468
@@ -18,15 +18,15 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
  ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
  ml_tools/custom_logger.py,sha256=njM_0XPbQ1S-x5LeSQAaTo2if-XVOR_pQSGg4EDeiTU,4603
- ml_tools/data_exploration.py,sha256=qc_Oolxco2x9IhlYu5zPIuVBGiBw65HnypuGm8cQOOM,23677
+ ml_tools/data_exploration.py,sha256=P4f8OpRa7Q4i-11nkppxXw5Lx2lwlpn20GwWBbN_xbM,23901
  ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
  ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
  ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,13007
  ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
  ml_tools/optimization_tools.py,sha256=MuT4OG7_r1QqLUti-yYix7QeCpglezD0oe9BDCq0QXk,5086
  ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
- ml_tools/utilities.py,sha256=mz-M351DzxWxnYVcLX-7ZQ6c-RGoCV9g4VTS9Qif2Es,18348
- dragon_ml_toolbox-5.2.2.dist-info/METADATA,sha256=1xc1_iWoGsLxwEFcyLRRSJCJJNdQZNsVHCSykfaVKGQ,6638
- dragon_ml_toolbox-5.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-5.2.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-5.2.2.dist-info/RECORD,,
+ ml_tools/utilities.py,sha256=T5xbxzBr14odUj7KncSeg-tJzqjmSDLOOmxEaGYLLi4,18447
+ dragon_ml_toolbox-5.3.1.dist-info/METADATA,sha256=XMn0E2Bh_6X97SScFy08jxJvo_KYeS5yuApaHTDPeqY,6643
+ dragon_ml_toolbox-5.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-5.3.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-5.3.1.dist-info/RECORD,,
ml_tools/MICE_imputation.py CHANGED
@@ -29,6 +29,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
  random_state=random_state
  )

+ _LOGGER.info("➡️ MICE imputation running...")
+
  # Perform MICE with n iterations per dataset
  kernel.mice(iterations)

@@ -61,6 +63,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
  assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}" # type: ignore
  # print("✅ All imputed datasets match the original DataFrame indexes.")

+ _LOGGER.info("✅ MICE imputation complete.")
+
  return kernel, imputed_datasets, imputed_dataset_names


ml_tools/ML_callbacks.py CHANGED
@@ -124,7 +124,7 @@ class EarlyStopping(Callback):
  inferred from the name of the monitored quantity.
  verbose (int): Verbosity mode.
  """
- def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta=0.0, patience=3, mode: Literal['auto', 'min', 'max']='auto', verbose: int=1):
+ def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta: float=0.0, patience: int=5, mode: Literal['auto', 'min', 'max']='auto', verbose: int=0):
  super().__init__()
  self.monitor = monitor
  self.patience = patience
@@ -148,13 +148,13 @@ class EarlyStopping(Callback):
  else: # Default to min mode for loss or other metrics
  self.monitor_op = np.less

- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf

  def on_train_begin(self, logs=None):
  # Reset state at the beginning of training
  self.wait = 0
  self.stopped_epoch = 0
- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf

  def on_epoch_end(self, epoch, logs=None):
  current = logs.get(self.monitor) # type: ignore
@@ -202,7 +202,7 @@ class ModelCheckpoint(Callback):
  verbose (int): Verbosity mode.
  """
  def __init__(self, save_dir: Union[str,Path], monitor: str = LogKeys.VAL_LOSS,
- save_best_only: bool = False, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 1):
+ save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
  super().__init__()
  self.save_dir = make_fullpath(save_dir, make=True, enforce="directory")
  if not self.save_dir.is_dir():
@@ -228,11 +228,11 @@ class ModelCheckpoint(Callback):
  else:
  self.monitor_op = np.less if 'loss' in self.monitor else np.greater

- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf

  def on_train_begin(self, logs=None):
  """Reset state when training starts."""
- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf
  self.saved_checkpoints = []
  self.last_best_filepath = None

@@ -251,7 +251,7 @@ class ModelCheckpoint(Callback):
  return

  if self.monitor_op(current, self.best):
- old_best_str = f"{self.best:.4f}" if self.best not in [np.Inf, -np.Inf] else "inf" # type: ignore
+ old_best_str = f"{self.best:.4f}" if self.best not in [np.inf, -np.inf] else "inf"

  # Create a descriptive filename
  filename = f"epoch_{epoch}-{self.monitor}_{current:.4f}.pth"
ml_tools/ML_datasetmaster.py CHANGED
@@ -128,17 +128,18 @@ class DatasetMaker(_BaseMaker):
  - Automated (single call):
  ```python
  maker = DatasetMaker(df, label_col='target')
- maker.process() # uses simplified arguments
+ maker.auto_process() # uses simplified arguments
  train_ds, test_ds = maker.get_datasets()
  ```
  """
- def __init__(self, pandas_df: pandas.DataFrame, label_col: str):
+ def __init__(self, pandas_df: pandas.DataFrame, label_col: str, kind: Literal["regression", "classification"]):
  super().__init__()
  if not isinstance(pandas_df, pandas.DataFrame):
  raise TypeError("Input must be a pandas.DataFrame.")
  if label_col not in pandas_df.columns:
  raise ValueError(f"Label column '{label_col}' not found in DataFrame.")
-
+
+ self.kind = kind
  self.labels = pandas_df[label_col]
  self.features = pandas_df.drop(columns=label_col)
  self.labels_map = None
@@ -277,7 +278,7 @@ class DatasetMaker(_BaseMaker):
  _LOGGER.info(f"Balancing complete. New training set size: {len(self.features_train)} samples.")
  return self

- def process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
+ def auto_process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
  balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
  """Runs a standard, fully automated preprocessing pipeline."""
  _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
@@ -334,8 +335,10 @@ class DatasetMaker(_BaseMaker):
  if not self._is_split:
  raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")

- self._train_dataset = _PytorchDataset(self.features_train, self.labels_train) # type: ignore
- self._test_dataset = _PytorchDataset(self.features_test, self.labels_test) # type: ignore
+ label_dtype = torch.float32 if self.kind == "regression" else torch.int64
+
+ self._train_dataset = _PytorchDataset(self.features_train, self.labels_train, labels_dtype=label_dtype) # type: ignore
+ self._test_dataset = _PytorchDataset(self.features_test, self.labels_test, labels_dtype=label_dtype) # type: ignore

  return self._train_dataset, self._test_dataset

@@ -382,12 +385,13 @@ class SimpleDatasetMaker:

  Args:
  pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+ kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
  test_size (float): The proportion of the dataset to allocate to the
  test split.
  random_state (int): The seed for the random number generator for
  reproducibility.
  """
- def __init__(self, pandas_df: pandas.DataFrame, test_size: float = 0.2, random_state: int = 42):
+ def __init__(self, pandas_df: pandas.DataFrame, kind: Literal["regression", "classification"], test_size: float = 0.2, random_state: int = 42):
  """
  Attributes:
  `train_dataset` -> PyTorch Dataset
@@ -398,9 +402,11 @@ class SimpleDatasetMaker:

  The ID can be manually set to any string if needed, it is `None` by default.
  """
-
+ # Validation
  if not isinstance(pandas_df, pandas.DataFrame):
- raise TypeError("Input must be a pandas.DataFrame.")
+ raise TypeError("Input must be a pandas.DataFrame.")
+ if kind not in ["regression", "classification"]:
+ raise ValueError("`kind` must be 'regression' or 'classification'.")

  # 1. Identify features and target
  features = pandas_df.iloc[:, :-1]
@@ -422,9 +428,11 @@ class SimpleDatasetMaker:
  self._y_train_shape = y_train.shape
  self._y_test_shape = y_test.shape

- # 3. Convert to PyTorch Datasets
- self._train_ds = _PytorchDataset(X_train.values, y_train.values)
- self._test_ds = _PytorchDataset(X_test.values, y_test.values)
+ # 3. Convert to PyTorch Datasets with the correct label dtype
+ label_dtype = torch.float32 if kind == "regression" else torch.int64
+
+ self._train_ds = _PytorchDataset(X_train.values, y_train.values, labels_dtype=label_dtype)
+ self._test_ds = _PytorchDataset(X_test.values, y_test.values, labels_dtype=label_dtype)

  @property
  def train_dataset(self) -> Dataset:
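
Both dataset makers now require a `kind` argument, which selects the label dtype (`torch.float32` for regression, `torch.int64` for classification), and `DatasetMaker.process()` was renamed to `auto_process()`. A hedged usage sketch under the 5.3.1 API; only names visible in the diff are used, and the toy DataFrame is illustrative:

```python
# Hedged sketch of the 5.3.1 API for DatasetMaker and SimpleDatasetMaker.
import pandas as pd
from ml_tools.ML_datasetmaster import DatasetMaker, SimpleDatasetMaker

df = pd.DataFrame({
    "x1": [0.1, 0.9, 0.4, 0.7, 0.2, 0.8],
    "x2": [1.0, 0.2, 0.5, 0.3, 0.9, 0.1],
    "target": [0, 1, 0, 1, 0, 1],
})

maker = DatasetMaker(df, label_col="target", kind="classification")  # labels become torch.int64
maker.auto_process()                                                  # formerly maker.process()
train_ds, test_ds = maker.get_datasets()

# SimpleDatasetMaker treats the last column as the target and now also takes `kind`.
simple = SimpleDatasetMaker(df, kind="classification", test_size=0.25, random_state=42)
```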
ml_tools/ML_evaluation.py CHANGED
@@ -195,7 +195,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  plt.close(fig_tvp)


- def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain: torch.Tensor,
+ def shap_summary_plot(model, background_data: Union[torch.Tensor,np.ndarray], instances_to_explain: Union[torch.Tensor,np.ndarray],
  feature_names: Optional[list[str]]=None, save_dir: Optional[Union[str, Path]] = None):
  """
  Calculates SHAP values and saves summary plots and data.
@@ -207,24 +207,54 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  feature_names (list of str | None): Names of the features for plot labeling.
  save_dir (str | Path | None): Directory to save SHAP artifacts. If None, dot plot is shown.
  """
+ # everything to numpy
+ if isinstance(background_data, np.ndarray):
+ background_data_np = background_data
+ else:
+ background_data_np = background_data.numpy()
+
+ if isinstance(instances_to_explain, np.ndarray):
+ instances_to_explain_np = instances_to_explain
+ else:
+ instances_to_explain_np = instances_to_explain.numpy()
+
+ # --- Data Validation Step ---
+ if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
+ _LOGGER.error("❌ Input data for SHAP contains NaN values. Aborting explanation.")
+ return
+
  print("\n--- SHAP Value Explanation ---")
- print("Calculating SHAP values... ")

  model.eval()
  model.cpu()

- explainer = shap.DeepExplainer(model, background_data)
- shap_values = explainer.shap_values(instances_to_explain)
-
- shap_values_for_plot = shap_values[1] if isinstance(shap_values, list) else shap_values
- if isinstance(shap_values, list):
- _LOGGER.info("Using SHAP values for the positive class (class 1) for plots.")
+ # 1. Summarize the background data.
+ # Summarize the background data using k-means. 10-50 clusters is a good starting point.
+ background_summary = shap.kmeans(background_data_np, 30)
+
+ # 2. Define a prediction function wrapper that SHAP can use. It must take a numpy array and return a numpy array.
+ def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
+ # Convert numpy data to torch tensor
+ x_torch = torch.from_numpy(x_np).float()
+ with torch.no_grad():
+ # Get model output
+ output = model(x_torch)
+ # Return as numpy array
+ return output.cpu().numpy().flatten()

+ # 3. Create the KernelExplainer
+ explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
+
+ print("Calculating SHAP values with KernelExplainer...")
+ shap_values = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+
  if save_dir:
  save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
+ plt.ioff()
+
  # Save Bar Plot
  bar_path = save_dir_path / "shap_bar_plot.svg"
- shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="bar", show=False)
+ shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="bar", show=False)
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(bar_path)
@@ -233,7 +263,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain

  # Save Dot Plot
  dot_path = save_dir_path / "shap_dot_plot.svg"
- shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot", show=False)
+ shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(dot_path)
@@ -242,18 +272,25 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain

  # Save Summary Data to CSV
  summary_path = save_dir_path / "shap_summary.csv"
- mean_abs_shap = np.abs(shap_values_for_plot).mean(axis=0)
+ # Ensure the array is 1D before creating the DataFrame
+ mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()
+
  if feature_names is None:
  feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
  summary_df = pd.DataFrame({
  'feature': feature_names,
  'mean_abs_shap_value': mean_abs_shap
  }).sort_values('mean_abs_shap_value', ascending=False)
+
  summary_df.to_csv(summary_path, index=False)
+
  _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
+ plt.ion()
+
  else:
  _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
- shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot")
+ shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot")

  def info():
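
`shap_summary_plot` switches from `shap.DeepExplainer` to the model-agnostic `shap.KernelExplainer`, summarizing the background set with k-means and wrapping the model in a numpy-in, numpy-out prediction function. A hedged, standalone sketch of that pattern on a plain PyTorch model; the model and data here are illustrative, while `shap.kmeans` and `shap.KernelExplainer` are standard SHAP APIs:

```python
# Hedged sketch of the KernelExplainer pattern adopted in this release.
import numpy as np
import shap
import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 1))
model.eval()

background = np.random.rand(100, 4).astype(np.float32)   # illustrative background data
to_explain = np.random.rand(5, 4).astype(np.float32)     # illustrative instances to explain

def predict(x_np: np.ndarray) -> np.ndarray:
    # KernelExplainer is model-agnostic: it only needs numpy in, numpy out.
    with torch.no_grad():
        return model(torch.from_numpy(x_np).float()).cpu().numpy().flatten()

background_summary = shap.kmeans(background, 30)          # compress the background to 30 centroids
explainer = shap.KernelExplainer(predict, background_summary)
shap_values = explainer.shap_values(to_explain, l1_reg="aic")

print(np.abs(shap_values).mean(axis=0))                   # mean |SHAP| per feature
```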
ml_tools/ML_optimization.py CHANGED
@@ -49,6 +49,7 @@ def create_pytorch_problem(
  selected search algorithm's constructor (e.g., stdev_init=0.5 for CMAES).

  Returns:
+ Tuple:
  A tuple containing the configured evotorch.Problem and evotorch.Searcher.
  """
  lower_bounds, upper_bounds = bounds
ml_tools/ML_trainer.py CHANGED
@@ -95,14 +95,16 @@ class MyTrainer:
  batch_size=batch_size,
  shuffle=shuffle,
  num_workers=loader_workers,
- pin_memory=(self.device.type == "cuda")
+ pin_memory=("cuda" in self.device.type),
+ drop_last=True # Drops the last batch if incomplete, selecting a good batch size is key.
  )
+
  self.test_loader = DataLoader(
  dataset=self.test_dataset,
  batch_size=batch_size,
  shuffle=False,
  num_workers=loader_workers,
- pin_memory=(self.device.type == "cuda")
+ pin_memory=("cuda" in self.device.type)
  )

  def fit(self, epochs: int = 10, batch_size: int = 10, shuffle: bool = True):
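
The training loader now drops incomplete final batches, and the `pin_memory` check matches any CUDA-like device string. A hedged sketch of the equivalent settings on a plain `torch.utils.data.DataLoader`:

```python
# Hedged sketch of the DataLoader settings MyTrainer now applies, using plain PyTorch.
import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset = TensorDataset(torch.randn(64, 4), torch.randn(64, 1))  # toy data

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    pin_memory=("cuda" in device.type),  # substring check, like the new code
    drop_last=True,                      # incomplete final batches are dropped for training
)
```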
ml_tools/data_exploration.py CHANGED
@@ -7,6 +7,7 @@ from typing import Union, Literal, Dict, Tuple, List, Optional
  from pathlib import Path
  from .path_manager import sanitize_filename, make_fullpath
  from ._script_info import _script_info
+ from ._logger import _LOGGER
  import re


@@ -55,7 +56,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
  ].round(round_digits)
  summary = summary.join(summary_numeric, how='left')

- print(f"Shape: {df.shape}")
+ print(f"DataFrame Shape: {df.shape}")
  return summary


@@ -98,7 +99,7 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram

  dropped_columns = original_columns - set(cols_to_keep)
  if verbose:
- print(f"🧹 Dropped {len(dropped_columns)} constant columns.")
+ _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
  if dropped_columns:
  for dropped_column in dropped_columns:
  print(f" {dropped_column}")
@@ -129,10 +130,10 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
  valid_targets = _validate_columns(df_clean, targets)
  target_na = df_clean[valid_targets].isnull().all(axis=1)
  if target_na.any():
- print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
+ _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
  df_clean = df_clean[~target_na]
  else:
- print("✅ No rows with all targets missing.")
+ _LOGGER.info("✅ No rows with all targets missing.")
  else:
  valid_targets = []

@@ -142,12 +143,12 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
  feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
  rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
  if len(rows_to_drop) > 0:
- print(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+ _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
  df_clean = df_clean.drop(index=rows_to_drop)
  else:
- print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
+ _LOGGER.info(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
  else:
- print("⚠️ No feature columns available to evaluate.")
+ _LOGGER.warning("⚠️ No feature columns available to evaluate.")

  return df_clean

@@ -207,7 +208,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
  cols_to_drop = missing_fraction[missing_fraction > threshold].index

  if len(cols_to_drop) > 0:
- print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+ _LOGGER.info(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
  print(list(cols_to_drop))

  result_df = df.drop(columns=cols_to_drop)
@@ -216,7 +217,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho

  return result_df
  else:
- print(f"No columns have more than {threshold*100:.0f}% missing data.")
+ _LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
  return df


@@ -311,7 +312,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
  """
  numeric_df = df.select_dtypes(include='number')
  if numeric_df.empty:
- print("No numeric columns found. Heatmap not generated.")
+ _LOGGER.warning("⚠️ No numeric columns found. Heatmap not generated.")
  return

  corr = numeric_df.corr(method=method)
@@ -348,7 +349,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
  full_path = save_path / plot_title

  plt.savefig(full_path, bbox_inches="tight", format='svg')
- print(f"Saved correlation heatmap: '{plot_title}'")
+ _LOGGER.info(f"Saved correlation heatmap: '{plot_title}'")

  plt.show()
  plt.close()
@@ -454,7 +455,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: Union[str, Path], bin_t
  _plot_helper(dict_=dict_to_plot_std, target_dir=std_dir, ylabel="Counts")
  _plot_helper(dict_=dict_to_plot_freq, target_dir=freq_dir, ylabel="Frequency")

- print(f"Saved {saved_plots} plot(s)")
+ _LOGGER.info(f"Saved {saved_plots} value distribution plots.")


  def clip_outliers_single(
@@ -479,17 +480,17 @@ def clip_outliers_single(
  None: if a problem with the dataframe column occurred.
  """
  if column not in df.columns:
- print(f"Column '{column}' not found in DataFrame.")
+ _LOGGER.warning(f"⚠️ Column '{column}' not found in DataFrame.")
  return None

  if not pd.api.types.is_numeric_dtype(df[column]):
- print(f"Column '{column}' must be numeric.")
+ _LOGGER.warning(f"⚠️ Column '{column}' must be numeric.")
  return None

  new_df = df.copy(deep=True)
  new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)

- print(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
+ _LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
  return new_df


@@ -539,10 +540,10 @@ def clip_outliers_multi(
  skipped_columns.append((col, str(e)))
  continue

- print(f"Clipped {clipped_columns} columns.")
+ _LOGGER.info(f"Clipped {clipped_columns} columns.")

  if skipped_columns:
- print("\n⚠️ Skipped columns:")
+ _LOGGER.warning("⚠️ Skipped columns:")
  for col, msg in skipped_columns:
  print(f" - {col}: {msg}")

@@ -574,7 +575,7 @@ def match_and_filter_columns_by_regex(
  matched_columns = df.columns[mask].to_list()
  filtered_df = df.loc[:, mask]

- print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+ _LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{pattern}'.")

  return filtered_df, matched_columns

@@ -628,11 +629,11 @@ def standardize_percentages(
  for col in columns:
  # --- Robustness Checks ---
  if col not in df_copy.columns:
- print(f"Warning: Column '{col}' not found. Skipping.")
+ _LOGGER.warning(f"⚠️ Column '{col}' not found. Skipping.")
  continue

  if not is_numeric_dtype(df_copy[col]):
- print(f"Warning: Column '{col}' is not numeric. Skipping.")
+ _LOGGER.warning(f"⚠️ Column '{col}' is not numeric. Skipping.")
  continue

  # --- Applying the Logic ---
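
Most user-facing messages in `data_exploration.py` move from `print` to the shared `_LOGGER`. The internals of `ml_tools/_logger.py` are not shown in this diff, so the sketch below uses Python's standard `logging` module purely to illustrate the pattern:

```python
# Hedged sketch of the print-to-logger migration; the real _LOGGER lives in
# ml_tools/_logger.py, which is not part of this diff.
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
_LOGGER = logging.getLogger("ml_tools")

dropped = 3
_LOGGER.info(f"🧹 Dropped {dropped} constant columns.")                  # was: print(...)
_LOGGER.warning("⚠️ No numeric columns found. Heatmap not generated.")   # was: print(...)
```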
ml_tools/utilities.py CHANGED
@@ -8,6 +8,7 @@ import joblib
  from joblib.externals.loky.process_executor import TerminatedWorkerError
  from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
  from ._script_info import _script_info
+ from ._logger import _LOGGER


  # Keep track of available tools
@@ -81,7 +82,7 @@ def load_dataframe(
  raise ValueError(f"❌ DataFrame '{df_name}' loaded from '{path}' is empty.")

  if verbose:
- print(f"\n💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")
+ _LOGGER.info(f"💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")

  return df, df_name

@@ -166,7 +167,7 @@ def merge_dataframes(
  merged_df = merged_df.reset_index(drop=True)

  if verbose:
- print(f"\n✅ Merged DataFrame shape: {merged_df.shape}")
+ _LOGGER.info(f"✅ Merged DataFrame shape: {merged_df.shape}")

  return merged_df

@@ -185,7 +186,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
  """
  # This check works for both pandas and polars
  if df.shape[0] == 0:
- print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
+ _LOGGER.warning(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
  return

  # Create the directory if it doesn't exist
@@ -207,7 +208,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
  # This error handles cases where an unsupported type is passed
  raise TypeError(f"❌ Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")

- print(f"\n✅ Saved dataset: '{filename}' with shape: {df.shape}")
+ _LOGGER.info(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")


  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -382,11 +383,11 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
  if raise_on_error:
  raise Exception(message)
  else:
- print(message)
+ _LOGGER.warning(message)
  return None
  else:
  if verbose:
- print(f"\n✅ Object of type '{type(obj)}' saved to '{full_path}'")
+ _LOGGER.info(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
  return None


@@ -409,11 +410,11 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
  if raise_on_error:
  raise Exception(message)
  else:
- print(message)
+ _LOGGER.warning(message)
  return None
  else:
  if verbose:
- print(f"\n✅ Loaded object of type '{type(obj)}'")
+ _LOGGER.info(f"✅ Loaded object of type '{type(obj)}'")
  return obj


@@ -500,10 +501,10 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
  save_dataframe(df=df, save_dir=save_dir, filename=filename)
  total_saved += 1
  except Exception as e:
- print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
+ _LOGGER.warning(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
  continue

- print(f"\n✅ {total_saved} single-target datasets were created.")
+ _LOGGER.info(f"✅ {total_saved} single-target datasets were created.")


  def info():
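
`utilities.py` gets the same treatment, with `serialize_object` and `deserialize_object` now reporting through `_LOGGER`. A hedged round-trip sketch using only the parameters visible in the hunk headers; the output path and file extension are assumptions, since the naming scheme inside `serialize_object` is not shown in this diff:

```python
# Hedged round-trip sketch for serialize_object / deserialize_object.
from ml_tools.utilities import serialize_object, deserialize_object

params = {"lr": 1e-3, "epochs": 50}
serialize_object(params, save_dir="artifacts", filename="params", verbose=True)

# The exact filename produced by serialize_object is not shown in the diff;
# "artifacts/params.joblib" is assumed here for illustration only.
restored = deserialize_object("artifacts/params.joblib", verbose=True)
print(restored)
```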