dragon-ml-toolbox 5.2.2-py3-none-any.whl → 5.3.1-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Note: this release of dragon-ml-toolbox has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 5.2.2
+ Version: 5.3.1
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -18,7 +18,7 @@ Requires-Dist: numpy; extra == "base"
  Requires-Dist: polars; extra == "base"
  Requires-Dist: joblib; extra == "base"
  Provides-Extra: ml
- Requires-Dist: numpy; extra == "ml"
+ Requires-Dist: numpy>=2.0; extra == "ml"
  Requires-Dist: pandas; extra == "ml"
  Requires-Dist: polars; extra == "ml"
  Requires-Dist: joblib; extra == "ml"
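
The `ml` extra now pins `numpy>=2.0`, which lines up with the `np.Inf` to `np.inf` replacements further down in this diff (NumPy 2.0 removed the capitalized aliases). A minimal sketch of what that means for downstream code:

```python
# Minimal sketch: NumPy 2.0 removed the capitalized aliases (np.Inf, np.NaN),
# so environments installed through the "ml" extra must use the lowercase names.
import numpy as np

best_loss = np.inf      # works on NumPy 1.x and 2.x
# best_loss = np.Inf    # AttributeError on NumPy >= 2.0
print(np.__version__, best_loss)
```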
@@ -1,15 +1,15 @@
- dragon_ml_toolbox-5.2.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-5.2.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ dragon_ml_toolbox-5.3.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-5.3.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
  ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
  ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
- ml_tools/MICE_imputation.py,sha256=b6ZTs8RedXFifOpuMCzr68xM16mCBVh1Ua6kcGfiVtg,11462
- ml_tools/ML_callbacks.py,sha256=xiJ6NnoVwF_TVak6sYzwWFk4CI3vRJGjxvGI1Yq6euw,13332
- ml_tools/ML_datasetmaster.py,sha256=IzT2v1o71PgYCFi9RXccBnmH-t-ExzX8sn9cCD2gz-Y,33603
- ml_tools/ML_evaluation.py,sha256=4dVqe6JF1Ukmk1sAcY8E5EG1oB1_oy2HXE5OT-pZwCs,10273
+ ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
+ ml_tools/ML_callbacks.py,sha256=hOGWYM6ndaH0ibaHgM14j74MtWFalToY-oTnB2jsQ4A,13268
+ ml_tools/ML_datasetmaster.py,sha256=bbKCNA_b_uDIfxP9YIYKZm-VSfUSD15LvegFxpE9DIQ,34315
+ ml_tools/ML_evaluation.py,sha256=LX6UkUC80y43lYKBkw03CptZ3PJGkZXfmZZHL-2kd1s,11590
  ml_tools/ML_inference.py,sha256=Fh-X2UQn3AznWBjf-7iPSxwE-EzkGQm1VEIRUAkURmE,5336
  ml_tools/ML_models.py,sha256=SJhKHGAN2VTBqzcHUOpFWuVZ2Y7U1M4P_axG_LNYWcI,6460
- ml_tools/ML_optimization.py,sha256=2L9BSUzgLOEwBU84TN1qDh1KAOJ4R6C6NYSe7jmE4RI,9656
- ml_tools/ML_trainer.py,sha256=t58Ka6ryaYm0Fi5xje-e-fkmz9DwDLIeJLbh04n_gDg,15034
+ ml_tools/ML_optimization.py,sha256=zGKpWW4SL1-3iiHglDP-dkuADL73T0kxs3Dc-Lyishs,9671
+ ml_tools/ML_trainer.py,sha256=ENOxTq07kWYn7ZolMfXYLSy-cLZOdty0dRmutA84SV4,15146
  ml_tools/PSO_optimization.py,sha256=stH2Ux1sftQgX5EwLc85kHcoT4Rmz6zv7sH2yzf4Zrw,22710
  ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
  ml_tools/SQL.py,sha256=9zzS6AFEJM9aj6nE31hDe8S9TqLonk-J1amwZoiHNbk,10468
@@ -18,15 +18,15 @@ ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
  ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
  ml_tools/custom_logger.py,sha256=njM_0XPbQ1S-x5LeSQAaTo2if-XVOR_pQSGg4EDeiTU,4603
- ml_tools/data_exploration.py,sha256=qc_Oolxco2x9IhlYu5zPIuVBGiBw65HnypuGm8cQOOM,23677
+ ml_tools/data_exploration.py,sha256=P4f8OpRa7Q4i-11nkppxXw5Lx2lwlpn20GwWBbN_xbM,23901
  ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
  ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
  ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,13007
  ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
  ml_tools/optimization_tools.py,sha256=MuT4OG7_r1QqLUti-yYix7QeCpglezD0oe9BDCq0QXk,5086
  ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
- ml_tools/utilities.py,sha256=mz-M351DzxWxnYVcLX-7ZQ6c-RGoCV9g4VTS9Qif2Es,18348
- dragon_ml_toolbox-5.2.2.dist-info/METADATA,sha256=1xc1_iWoGsLxwEFcyLRRSJCJJNdQZNsVHCSykfaVKGQ,6638
- dragon_ml_toolbox-5.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-5.2.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-5.2.2.dist-info/RECORD,,
+ ml_tools/utilities.py,sha256=T5xbxzBr14odUj7KncSeg-tJzqjmSDLOOmxEaGYLLi4,18447
+ dragon_ml_toolbox-5.3.1.dist-info/METADATA,sha256=XMn0E2Bh_6X97SScFy08jxJvo_KYeS5yuApaHTDPeqY,6643
+ dragon_ml_toolbox-5.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-5.3.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-5.3.1.dist-info/RECORD,,
ml_tools/MICE_imputation.py CHANGED
@@ -29,6 +29,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
  random_state=random_state
  )

+ _LOGGER.info("➡️ MICE imputation running...")
+
  # Perform MICE with n iterations per dataset
  kernel.mice(iterations)

@@ -61,6 +63,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
  assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}" # type: ignore
  # print("✅ All imputed datasets match the original DataFrame indexes.")

+ _LOGGER.info("✅ MICE imputation complete.")
+
  return kernel, imputed_datasets, imputed_dataset_names


ml_tools/ML_callbacks.py CHANGED
@@ -124,7 +124,7 @@ class EarlyStopping(Callback):
  inferred from the name of the monitored quantity.
  verbose (int): Verbosity mode.
  """
- def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta=0.0, patience=3, mode: Literal['auto', 'min', 'max']='auto', verbose: int=1):
+ def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta: float=0.0, patience: int=5, mode: Literal['auto', 'min', 'max']='auto', verbose: int=0):
  super().__init__()
  self.monitor = monitor
  self.patience = patience
@@ -148,13 +148,13 @@ class EarlyStopping(Callback):
  else: # Default to min mode for loss or other metrics
  self.monitor_op = np.less

- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf

  def on_train_begin(self, logs=None):
  # Reset state at the beginning of training
  self.wait = 0
  self.stopped_epoch = 0
- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf

  def on_epoch_end(self, epoch, logs=None):
  current = logs.get(self.monitor) # type: ignore
@@ -202,7 +202,7 @@ class ModelCheckpoint(Callback):
  verbose (int): Verbosity mode.
  """
  def __init__(self, save_dir: Union[str,Path], monitor: str = LogKeys.VAL_LOSS,
- save_best_only: bool = False, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 1):
+ save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
  super().__init__()
  self.save_dir = make_fullpath(save_dir, make=True, enforce="directory")
  if not self.save_dir.is_dir():
@@ -228,11 +228,11 @@ class ModelCheckpoint(Callback):
  else:
  self.monitor_op = np.less if 'loss' in self.monitor else np.greater

- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf

  def on_train_begin(self, logs=None):
  """Reset state when training starts."""
- self.best = np.Inf if self.monitor_op == np.less else -np.Inf # type: ignore
+ self.best = np.inf if self.monitor_op == np.less else -np.inf
  self.saved_checkpoints = []
  self.last_best_filepath = None

@@ -251,7 +251,7 @@ class ModelCheckpoint(Callback):
  return

  if self.monitor_op(current, self.best):
- old_best_str = f"{self.best:.4f}" if self.best not in [np.Inf, -np.Inf] else "inf" # type: ignore
+ old_best_str = f"{self.best:.4f}" if self.best not in [np.inf, -np.inf] else "inf"

  # Create a descriptive filename
  filename = f"epoch_{epoch}-{self.monitor}_{current:.4f}.pth"
ml_tools/ML_datasetmaster.py CHANGED
@@ -128,17 +128,18 @@ class DatasetMaker(_BaseMaker):
  - Automated (single call):
  ```python
  maker = DatasetMaker(df, label_col='target')
- maker.process() # uses simplified arguments
+ maker.auto_process() # uses simplified arguments
  train_ds, test_ds = maker.get_datasets()
  ```
  """
- def __init__(self, pandas_df: pandas.DataFrame, label_col: str):
+ def __init__(self, pandas_df: pandas.DataFrame, label_col: str, kind: Literal["regression", "classification"]):
  super().__init__()
  if not isinstance(pandas_df, pandas.DataFrame):
  raise TypeError("Input must be a pandas.DataFrame.")
  if label_col not in pandas_df.columns:
  raise ValueError(f"Label column '{label_col}' not found in DataFrame.")
-
+
+ self.kind = kind
  self.labels = pandas_df[label_col]
  self.features = pandas_df.drop(columns=label_col)
  self.labels_map = None
@@ -277,7 +278,7 @@ class DatasetMaker(_BaseMaker):
  _LOGGER.info(f"Balancing complete. New training set size: {len(self.features_train)} samples.")
  return self

- def process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
+ def auto_process(self, test_size: float = 0.2, cat_method: Literal["one-hot", "embed"] = "one-hot", normalize_method: Literal["standard", "minmax"] = "standard",
  balance: bool = False, random_state: Optional[int] = None) -> 'DatasetMaker':
  """Runs a standard, fully automated preprocessing pipeline."""
  _LOGGER.info("--- 🤖 Running Automated Processing Pipeline ---")
@@ -334,8 +335,10 @@ class DatasetMaker(_BaseMaker):
  if not self._is_split:
  raise RuntimeError("Data has not been split yet. Call .split_data() or .process() first.")

- self._train_dataset = _PytorchDataset(self.features_train, self.labels_train) # type: ignore
- self._test_dataset = _PytorchDataset(self.features_test, self.labels_test) # type: ignore
+ label_dtype = torch.float32 if self.kind == "regression" else torch.int64
+
+ self._train_dataset = _PytorchDataset(self.features_train, self.labels_train, labels_dtype=label_dtype) # type: ignore
+ self._test_dataset = _PytorchDataset(self.features_test, self.labels_test, labels_dtype=label_dtype) # type: ignore

  return self._train_dataset, self._test_dataset

@@ -382,12 +385,13 @@ class SimpleDatasetMaker:

  Args:
  pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
+ kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
  test_size (float): The proportion of the dataset to allocate to the
  test split.
  random_state (int): The seed for the random number generator for
  reproducibility.
  """
- def __init__(self, pandas_df: pandas.DataFrame, test_size: float = 0.2, random_state: int = 42):
+ def __init__(self, pandas_df: pandas.DataFrame, kind: Literal["regression", "classification"], test_size: float = 0.2, random_state: int = 42):
  """
  Attributes:
  `train_dataset` -> PyTorch Dataset
@@ -398,9 +402,11 @@ class SimpleDatasetMaker:

  The ID can be manually set to any string if needed, it is `None` by default.
  """
-
+ # Validation
  if not isinstance(pandas_df, pandas.DataFrame):
- raise TypeError("Input must be a pandas.DataFrame.")
+ raise TypeError("Input must be a pandas.DataFrame.")
+ if kind not in ["regression", "classification"]:
+ raise ValueError("`kind` must be 'regression' or 'classification'.")

  # 1. Identify features and target
  features = pandas_df.iloc[:, :-1]
@@ -422,9 +428,11 @@ class SimpleDatasetMaker:
  self._y_train_shape = y_train.shape
  self._y_test_shape = y_test.shape

- # 3. Convert to PyTorch Datasets
- self._train_ds = _PytorchDataset(X_train.values, y_train.values)
- self._test_ds = _PytorchDataset(X_test.values, y_test.values)
+ # 3. Convert to PyTorch Datasets with the correct label dtype
+ label_dtype = torch.float32 if kind == "regression" else torch.int64
+
+ self._train_ds = _PytorchDataset(X_train.values, y_train.values, labels_dtype=label_dtype)
+ self._test_ds = _PytorchDataset(X_test.values, y_test.values, labels_dtype=label_dtype)

  @property
  def train_dataset(self) -> Dataset:
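
Both dataset makers now require a `kind` argument, which selects the label dtype (`torch.float32` for regression, `torch.int64` for classification), and `DatasetMaker.process()` was renamed to `auto_process()`. A hedged usage sketch under the 5.3.1 API; only names visible in the diff are used, and the toy DataFrame is illustrative:

```python
# Hedged sketch of the 5.3.1 API for DatasetMaker and SimpleDatasetMaker.
import pandas as pd
from ml_tools.ML_datasetmaster import DatasetMaker, SimpleDatasetMaker

df = pd.DataFrame({
    "x1": [0.1, 0.9, 0.4, 0.7, 0.2, 0.8],
    "x2": [1.0, 0.2, 0.5, 0.3, 0.9, 0.1],
    "target": [0, 1, 0, 1, 0, 1],
})

maker = DatasetMaker(df, label_col="target", kind="classification")  # labels become torch.int64
maker.auto_process()                                                  # formerly maker.process()
train_ds, test_ds = maker.get_datasets()

# SimpleDatasetMaker treats the last column as the target and now also takes `kind`.
simple = SimpleDatasetMaker(df, kind="classification", test_size=0.25, random_state=42)
```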
ml_tools/ML_evaluation.py CHANGED
@@ -195,7 +195,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
  plt.close(fig_tvp)


- def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain: torch.Tensor,
+ def shap_summary_plot(model, background_data: Union[torch.Tensor,np.ndarray], instances_to_explain: Union[torch.Tensor,np.ndarray],
  feature_names: Optional[list[str]]=None, save_dir: Optional[Union[str, Path]] = None):
  """
  Calculates SHAP values and saves summary plots and data.
@@ -207,24 +207,54 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
  feature_names (list of str | None): Names of the features for plot labeling.
  save_dir (str | Path | None): Directory to save SHAP artifacts. If None, dot plot is shown.
  """
+ # everything to numpy
+ if isinstance(background_data, np.ndarray):
+ background_data_np = background_data
+ else:
+ background_data_np = background_data.numpy()
+
+ if isinstance(instances_to_explain, np.ndarray):
+ instances_to_explain_np = instances_to_explain
+ else:
+ instances_to_explain_np = instances_to_explain.numpy()
+
+ # --- Data Validation Step ---
+ if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
+ _LOGGER.error("❌ Input data for SHAP contains NaN values. Aborting explanation.")
+ return
+
  print("\n--- SHAP Value Explanation ---")
- print("Calculating SHAP values... ")

  model.eval()
  model.cpu()

- explainer = shap.DeepExplainer(model, background_data)
- shap_values = explainer.shap_values(instances_to_explain)
-
- shap_values_for_plot = shap_values[1] if isinstance(shap_values, list) else shap_values
- if isinstance(shap_values, list):
- _LOGGER.info("Using SHAP values for the positive class (class 1) for plots.")
+ # 1. Summarize the background data.
+ # Summarize the background data using k-means. 10-50 clusters is a good starting point.
+ background_summary = shap.kmeans(background_data_np, 30)
+
+ # 2. Define a prediction function wrapper that SHAP can use. It must take a numpy array and return a numpy array.
+ def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
+ # Convert numpy data to torch tensor
+ x_torch = torch.from_numpy(x_np).float()
+ with torch.no_grad():
+ # Get model output
+ output = model(x_torch)
+ # Return as numpy array
+ return output.cpu().numpy().flatten()

+ # 3. Create the KernelExplainer
+ explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
+
+ print("Calculating SHAP values with KernelExplainer...")
+ shap_values = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+
  if save_dir:
  save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
+ plt.ioff()
+
  # Save Bar Plot
  bar_path = save_dir_path / "shap_bar_plot.svg"
- shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="bar", show=False)
+ shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="bar", show=False)
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(bar_path)
@@ -233,7 +263,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain

  # Save Dot Plot
  dot_path = save_dir_path / "shap_dot_plot.svg"
- shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot", show=False)
+ shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
  plt.title("SHAP Feature Importance")
  plt.tight_layout()
  plt.savefig(dot_path)
@@ -242,18 +272,25 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain

  # Save Summary Data to CSV
  summary_path = save_dir_path / "shap_summary.csv"
- mean_abs_shap = np.abs(shap_values_for_plot).mean(axis=0)
+ # Ensure the array is 1D before creating the DataFrame
+ mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()
+
  if feature_names is None:
  feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
  summary_df = pd.DataFrame({
  'feature': feature_names,
  'mean_abs_shap_value': mean_abs_shap
  }).sort_values('mean_abs_shap_value', ascending=False)
+
  summary_df.to_csv(summary_path, index=False)
+
  _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
+ plt.ion()
+
  else:
  _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
- shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot")
+ shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot")

  def info():
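
`shap_summary_plot` switches from `shap.DeepExplainer` to the model-agnostic `shap.KernelExplainer`, summarizing the background set with k-means and wrapping the model in a numpy-in, numpy-out prediction function. A hedged, standalone sketch of that pattern on a plain PyTorch model; the model and data here are illustrative, while `shap.kmeans` and `shap.KernelExplainer` are standard SHAP APIs:

```python
# Hedged sketch of the KernelExplainer pattern adopted in this release.
import numpy as np
import shap
import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 1))
model.eval()

background = np.random.rand(100, 4).astype(np.float32)   # illustrative background data
to_explain = np.random.rand(5, 4).astype(np.float32)     # illustrative instances to explain

def predict(x_np: np.ndarray) -> np.ndarray:
    # KernelExplainer is model-agnostic: it only needs numpy in, numpy out.
    with torch.no_grad():
        return model(torch.from_numpy(x_np).float()).cpu().numpy().flatten()

background_summary = shap.kmeans(background, 30)          # compress the background to 30 centroids
explainer = shap.KernelExplainer(predict, background_summary)
shap_values = explainer.shap_values(to_explain, l1_reg="aic")

print(np.abs(shap_values).mean(axis=0))                   # mean |SHAP| per feature
```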
ml_tools/ML_optimization.py CHANGED
@@ -49,6 +49,7 @@ def create_pytorch_problem(
  selected search algorithm's constructor (e.g., stdev_init=0.5 for CMAES).

  Returns:
+ Tuple:
  A tuple containing the configured evotorch.Problem and evotorch.Searcher.
  """
  lower_bounds, upper_bounds = bounds
ml_tools/ML_trainer.py CHANGED
@@ -95,14 +95,16 @@ class MyTrainer:
  batch_size=batch_size,
  shuffle=shuffle,
  num_workers=loader_workers,
- pin_memory=(self.device.type == "cuda")
+ pin_memory=("cuda" in self.device.type),
+ drop_last=True # Drops the last batch if incomplete, selecting a good batch size is key.
  )
+
  self.test_loader = DataLoader(
  dataset=self.test_dataset,
  batch_size=batch_size,
  shuffle=False,
  num_workers=loader_workers,
- pin_memory=(self.device.type == "cuda")
+ pin_memory=("cuda" in self.device.type)
  )

  def fit(self, epochs: int = 10, batch_size: int = 10, shuffle: bool = True):
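
The training loader now drops incomplete final batches, and the `pin_memory` check matches any CUDA-like device string. A hedged sketch of the equivalent settings on a plain `torch.utils.data.DataLoader`:

```python
# Hedged sketch of the DataLoader settings MyTrainer now applies, using plain PyTorch.
import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset = TensorDataset(torch.randn(64, 4), torch.randn(64, 1))  # toy data

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    pin_memory=("cuda" in device.type),  # substring check, like the new code
    drop_last=True,                      # incomplete final batches are dropped for training
)
```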
ml_tools/data_exploration.py CHANGED
@@ -7,6 +7,7 @@ from typing import Union, Literal, Dict, Tuple, List, Optional
  from pathlib import Path
  from .path_manager import sanitize_filename, make_fullpath
  from ._script_info import _script_info
+ from ._logger import _LOGGER
  import re


@@ -55,7 +56,7 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
  ].round(round_digits)
  summary = summary.join(summary_numeric, how='left')

- print(f"Shape: {df.shape}")
+ print(f"DataFrame Shape: {df.shape}")
  return summary


@@ -98,7 +99,7 @@ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFram

  dropped_columns = original_columns - set(cols_to_keep)
  if verbose:
- print(f"🧹 Dropped {len(dropped_columns)} constant columns.")
+ _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns.")
  if dropped_columns:
  for dropped_column in dropped_columns:
  print(f" {dropped_column}")
@@ -129,10 +130,10 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
  valid_targets = _validate_columns(df_clean, targets)
  target_na = df_clean[valid_targets].isnull().all(axis=1)
  if target_na.any():
- print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
+ _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
  df_clean = df_clean[~target_na]
  else:
- print("✅ No rows with all targets missing.")
+ _LOGGER.info("✅ No rows with all targets missing.")
  else:
  valid_targets = []

@@ -142,12 +143,12 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
  feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
  rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
  if len(rows_to_drop) > 0:
- print(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+ _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
  df_clean = df_clean.drop(index=rows_to_drop)
  else:
- print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
+ _LOGGER.info(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
  else:
- print("⚠️ No feature columns available to evaluate.")
+ _LOGGER.warning("⚠️ No feature columns available to evaluate.")

  return df_clean

@@ -207,7 +208,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho
  cols_to_drop = missing_fraction[missing_fraction > threshold].index

  if len(cols_to_drop) > 0:
- print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+ _LOGGER.info(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
  print(list(cols_to_drop))

  result_df = df.drop(columns=cols_to_drop)
@@ -216,7 +217,7 @@ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, sho

  return result_df
  else:
- print(f"No columns have more than {threshold*100:.0f}% missing data.")
+ _LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
  return df


@@ -311,7 +312,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
  """
  numeric_df = df.select_dtypes(include='number')
  if numeric_df.empty:
- print("No numeric columns found. Heatmap not generated.")
+ _LOGGER.warning("⚠️ No numeric columns found. Heatmap not generated.")
  return

  corr = numeric_df.corr(method=method)
@@ -348,7 +349,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
  full_path = save_path / plot_title

  plt.savefig(full_path, bbox_inches="tight", format='svg')
- print(f"Saved correlation heatmap: '{plot_title}'")
+ _LOGGER.info(f"Saved correlation heatmap: '{plot_title}'")

  plt.show()
  plt.close()
@@ -454,7 +455,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: Union[str, Path], bin_t
  _plot_helper(dict_=dict_to_plot_std, target_dir=std_dir, ylabel="Counts")
  _plot_helper(dict_=dict_to_plot_freq, target_dir=freq_dir, ylabel="Frequency")

- print(f"Saved {saved_plots} plot(s)")
+ _LOGGER.info(f"Saved {saved_plots} value distribution plots.")


  def clip_outliers_single(
@@ -479,17 +480,17 @@ def clip_outliers_single(
  None: if a problem with the dataframe column occurred.
  """
  if column not in df.columns:
- print(f"Column '{column}' not found in DataFrame.")
+ _LOGGER.warning(f"⚠️ Column '{column}' not found in DataFrame.")
  return None

  if not pd.api.types.is_numeric_dtype(df[column]):
- print(f"Column '{column}' must be numeric.")
+ _LOGGER.warning(f"⚠️ Column '{column}' must be numeric.")
  return None

  new_df = df.copy(deep=True)
  new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)

- print(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
+ _LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
  return new_df


@@ -539,10 +540,10 @@ def clip_outliers_multi(
  skipped_columns.append((col, str(e)))
  continue

- print(f"Clipped {clipped_columns} columns.")
+ _LOGGER.info(f"Clipped {clipped_columns} columns.")

  if skipped_columns:
- print("\n⚠️ Skipped columns:")
+ _LOGGER.warning("⚠️ Skipped columns:")
  for col, msg in skipped_columns:
  print(f" - {col}: {msg}")

@@ -574,7 +575,7 @@ def match_and_filter_columns_by_regex(
  matched_columns = df.columns[mask].to_list()
  filtered_df = df.loc[:, mask]

- print(f"{len(matched_columns)} column(s) match the regex pattern '{pattern}'.")
+ _LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{pattern}'.")

  return filtered_df, matched_columns

@@ -628,11 +629,11 @@ def standardize_percentages(
  for col in columns:
  # --- Robustness Checks ---
  if col not in df_copy.columns:
- print(f"Warning: Column '{col}' not found. Skipping.")
+ _LOGGER.warning(f"⚠️ Column '{col}' not found. Skipping.")
  continue

  if not is_numeric_dtype(df_copy[col]):
- print(f"Warning: Column '{col}' is not numeric. Skipping.")
+ _LOGGER.warning(f"⚠️ Column '{col}' is not numeric. Skipping.")
  continue

  # --- Applying the Logic ---
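
Most user-facing messages in `data_exploration.py` move from `print` to the shared `_LOGGER`. The internals of `ml_tools/_logger.py` are not shown in this diff, so the sketch below uses Python's standard `logging` module purely to illustrate the pattern:

```python
# Hedged sketch of the print-to-logger migration; the real _LOGGER lives in
# ml_tools/_logger.py, which is not part of this diff.
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
_LOGGER = logging.getLogger("ml_tools")

dropped = 3
_LOGGER.info(f"🧹 Dropped {dropped} constant columns.")                  # was: print(...)
_LOGGER.warning("⚠️ No numeric columns found. Heatmap not generated.")   # was: print(...)
```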
ml_tools/utilities.py CHANGED
@@ -8,6 +8,7 @@ import joblib
  from joblib.externals.loky.process_executor import TerminatedWorkerError
  from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
  from ._script_info import _script_info
+ from ._logger import _LOGGER


  # Keep track of available tools
@@ -81,7 +82,7 @@ def load_dataframe(
  raise ValueError(f"❌ DataFrame '{df_name}' loaded from '{path}' is empty.")

  if verbose:
- print(f"\n💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")
+ _LOGGER.info(f"💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")

  return df, df_name

@@ -166,7 +167,7 @@ def merge_dataframes(
  merged_df = merged_df.reset_index(drop=True)

  if verbose:
- print(f"\n✅ Merged DataFrame shape: {merged_df.shape}")
+ _LOGGER.info(f"✅ Merged DataFrame shape: {merged_df.shape}")

  return merged_df

@@ -185,7 +186,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
  """
  # This check works for both pandas and polars
  if df.shape[0] == 0:
- print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
+ _LOGGER.warning(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
  return

  # Create the directory if it doesn't exist
@@ -207,7 +208,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
  # This error handles cases where an unsupported type is passed
  raise TypeError(f"❌ Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")

- print(f"\n✅ Saved dataset: '{filename}' with shape: {df.shape}")
+ _LOGGER.info(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")


  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -382,11 +383,11 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
  if raise_on_error:
  raise Exception(message)
  else:
- print(message)
+ _LOGGER.warning(message)
  return None
  else:
  if verbose:
- print(f"\n✅ Object of type '{type(obj)}' saved to '{full_path}'")
+ _LOGGER.info(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
  return None


@@ -409,11 +410,11 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
  if raise_on_error:
  raise Exception(message)
  else:
- print(message)
+ _LOGGER.warning(message)
  return None
  else:
  if verbose:
- print(f"\n✅ Loaded object of type '{type(obj)}'")
+ _LOGGER.info(f"✅ Loaded object of type '{type(obj)}'")
  return obj


@@ -500,10 +501,10 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
  save_dataframe(df=df, save_dir=save_dir, filename=filename)
  total_saved += 1
  except Exception as e:
- print(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
+ _LOGGER.warning(f"⚠️ Failed to process file '{df_path}'. Reason: {e}")
  continue

- print(f"\n✅ {total_saved} single-target datasets were created.")
+ _LOGGER.info(f"✅ {total_saved} single-target datasets were created.")


  def info():
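
`utilities.py` gets the same treatment, with `serialize_object` and `deserialize_object` now reporting through `_LOGGER`. A hedged round-trip sketch using only the parameters visible in the hunk headers; the output path and file extension are assumptions, since the naming scheme inside `serialize_object` is not shown in this diff:

```python
# Hedged round-trip sketch for serialize_object / deserialize_object.
from ml_tools.utilities import serialize_object, deserialize_object

params = {"lr": 1e-3, "epochs": 50}
serialize_object(params, save_dir="artifacts", filename="params", verbose=True)

# The exact filename produced by serialize_object is not shown in the diff;
# "artifacts/params.joblib" is assumed here for illustration only.
restored = deserialize_object("artifacts/params.joblib", verbose=True)
print(restored)
```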