dragon-ml-toolbox 5.3.0__py3-none-any.whl → 5.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

dragon_ml_toolbox-5.3.0.dist-info/METADATA → dragon_ml_toolbox-5.3.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 5.3.0
+Version: 5.3.1
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox-5.3.0.dist-info/RECORD → dragon_ml_toolbox-5.3.1.dist-info/RECORD CHANGED
@@ -1,15 +1,15 @@
-dragon_ml_toolbox-5.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-5.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+dragon_ml_toolbox-5.3.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-5.3.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
 ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
 ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
-ml_tools/ML_callbacks.py,sha256=eOCSc-1_e5vC2dQN1ydHGKDLeJ3DqB-eLRLuXp2DpFM,13257
+ml_tools/ML_callbacks.py,sha256=hOGWYM6ndaH0ibaHgM14j74MtWFalToY-oTnB2jsQ4A,13268
 ml_tools/ML_datasetmaster.py,sha256=bbKCNA_b_uDIfxP9YIYKZm-VSfUSD15LvegFxpE9DIQ,34315
-ml_tools/ML_evaluation.py,sha256=4dVqe6JF1Ukmk1sAcY8E5EG1oB1_oy2HXE5OT-pZwCs,10273
+ml_tools/ML_evaluation.py,sha256=LX6UkUC80y43lYKBkw03CptZ3PJGkZXfmZZHL-2kd1s,11590
 ml_tools/ML_inference.py,sha256=Fh-X2UQn3AznWBjf-7iPSxwE-EzkGQm1VEIRUAkURmE,5336
 ml_tools/ML_models.py,sha256=SJhKHGAN2VTBqzcHUOpFWuVZ2Y7U1M4P_axG_LNYWcI,6460
 ml_tools/ML_optimization.py,sha256=zGKpWW4SL1-3iiHglDP-dkuADL73T0kxs3Dc-Lyishs,9671
-ml_tools/ML_trainer.py,sha256=t58Ka6ryaYm0Fi5xje-e-fkmz9DwDLIeJLbh04n_gDg,15034
+ml_tools/ML_trainer.py,sha256=ENOxTq07kWYn7ZolMfXYLSy-cLZOdty0dRmutA84SV4,15146
 ml_tools/PSO_optimization.py,sha256=stH2Ux1sftQgX5EwLc85kHcoT4Rmz6zv7sH2yzf4Zrw,22710
 ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
 ml_tools/SQL.py,sha256=9zzS6AFEJM9aj6nE31hDe8S9TqLonk-J1amwZoiHNbk,10468
@@ -26,7 +26,7 @@ ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
 ml_tools/optimization_tools.py,sha256=MuT4OG7_r1QqLUti-yYix7QeCpglezD0oe9BDCq0QXk,5086
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
 ml_tools/utilities.py,sha256=T5xbxzBr14odUj7KncSeg-tJzqjmSDLOOmxEaGYLLi4,18447
-dragon_ml_toolbox-5.3.0.dist-info/METADATA,sha256=Lu_JBMfkCPssLk-a2v4b-oZu86cFK1OIB4HtHspVRIk,6643
-dragon_ml_toolbox-5.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-5.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-5.3.0.dist-info/RECORD,,
+dragon_ml_toolbox-5.3.1.dist-info/METADATA,sha256=XMn0E2Bh_6X97SScFy08jxJvo_KYeS5yuApaHTDPeqY,6643
+dragon_ml_toolbox-5.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-5.3.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-5.3.1.dist-info/RECORD,,
ml_tools/ML_callbacks.py CHANGED
@@ -124,7 +124,7 @@ class EarlyStopping(Callback):
             inferred from the name of the monitored quantity.
         verbose (int): Verbosity mode.
     """
-    def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta=0.0, patience=3, mode: Literal['auto', 'min', 'max']='auto', verbose: int=1):
+    def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta: float=0.0, patience: int=5, mode: Literal['auto', 'min', 'max']='auto', verbose: int=0):
         super().__init__()
         self.monitor = monitor
         self.patience = patience
@@ -202,7 +202,7 @@ class ModelCheckpoint(Callback):
         verbose (int): Verbosity mode.
     """
     def __init__(self, save_dir: Union[str,Path], monitor: str = LogKeys.VAL_LOSS,
-                 save_best_only: bool = False, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 1):
+                 save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
         super().__init__()
         self.save_dir = make_fullpath(save_dir, make=True, enforce="directory")
         if not self.save_dir.is_dir():
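Reviewer note: the defaults changed here are user-facing. EarlyStopping now waits 5 epochs (was 3) and is silent by default, and ModelCheckpoint now keeps only the best weights instead of saving every time. A minimal sketch of the new defaults, assuming only the constructor signatures visible in this diff; the surrounding trainer wiring is not shown here:

    # Minimal sketch; only the constructor signatures are taken from this diff.
    from pathlib import Path
    from ml_tools.ML_callbacks import EarlyStopping, ModelCheckpoint

    # 5.3.1 defaults: patience=5, verbose=0 (previously patience=3, verbose=1).
    early_stop = EarlyStopping(min_delta=0.001)

    # 5.3.1 defaults: save_best_only=True, verbose=0 (previously False and 1).
    # Pass save_best_only=False explicitly to restore the old save-every-improvement behavior.
    checkpoint = ModelCheckpoint(save_dir=Path("checkpoints"))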
ml_tools/ML_evaluation.py CHANGED
@@ -195,7 +195,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optiona
     plt.close(fig_tvp)
 
 
-def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain: torch.Tensor,
+def shap_summary_plot(model, background_data: Union[torch.Tensor,np.ndarray], instances_to_explain: Union[torch.Tensor,np.ndarray],
                       feature_names: Optional[list[str]]=None, save_dir: Optional[Union[str, Path]] = None):
     """
     Calculates SHAP values and saves summary plots and data.
@@ -207,24 +207,54 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
         feature_names (list of str | None): Names of the features for plot labeling.
         save_dir (str | Path | None): Directory to save SHAP artifacts. If None, dot plot is shown.
     """
+    # everything to numpy
+    if isinstance(background_data, np.ndarray):
+        background_data_np = background_data
+    else:
+        background_data_np = background_data.numpy()
+
+    if isinstance(instances_to_explain, np.ndarray):
+        instances_to_explain_np = instances_to_explain
+    else:
+        instances_to_explain_np = instances_to_explain.numpy()
+
+    # --- Data Validation Step ---
+    if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
+        _LOGGER.error("❌ Input data for SHAP contains NaN values. Aborting explanation.")
+        return
+
     print("\n--- SHAP Value Explanation ---")
-    print("Calculating SHAP values... ")
 
     model.eval()
     model.cpu()
 
-    explainer = shap.DeepExplainer(model, background_data)
-    shap_values = explainer.shap_values(instances_to_explain)
-
-    shap_values_for_plot = shap_values[1] if isinstance(shap_values, list) else shap_values
-    if isinstance(shap_values, list):
-        _LOGGER.info("Using SHAP values for the positive class (class 1) for plots.")
+    # 1. Summarize the background data.
+    # Summarize the background data using k-means. 10-50 clusters is a good starting point.
+    background_summary = shap.kmeans(background_data_np, 30)
+
+    # 2. Define a prediction function wrapper that SHAP can use. It must take a numpy array and return a numpy array.
+    def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
+        # Convert numpy data to torch tensor
+        x_torch = torch.from_numpy(x_np).float()
+        with torch.no_grad():
+            # Get model output
+            output = model(x_torch)
+        # Return as numpy array
+        return output.cpu().numpy().flatten()
 
+    # 3. Create the KernelExplainer
+    explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
+
+    print("Calculating SHAP values with KernelExplainer...")
+    shap_values = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+
     if save_dir:
         save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
+        plt.ioff()
+
         # Save Bar Plot
         bar_path = save_dir_path / "shap_bar_plot.svg"
-        shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="bar", show=False)
+        shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="bar", show=False)
         plt.title("SHAP Feature Importance")
         plt.tight_layout()
         plt.savefig(bar_path)
@@ -233,7 +263,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
 
         # Save Dot Plot
         dot_path = save_dir_path / "shap_dot_plot.svg"
-        shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot", show=False)
+        shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
         plt.title("SHAP Feature Importance")
         plt.tight_layout()
         plt.savefig(dot_path)
@@ -242,18 +272,25 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
 
         # Save Summary Data to CSV
         summary_path = save_dir_path / "shap_summary.csv"
-        mean_abs_shap = np.abs(shap_values_for_plot).mean(axis=0)
+        # Ensure the array is 1D before creating the DataFrame
+        mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()
+
         if feature_names is None:
             feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
         summary_df = pd.DataFrame({
             'feature': feature_names,
             'mean_abs_shap_value': mean_abs_shap
         }).sort_values('mean_abs_shap_value', ascending=False)
+
        summary_df.to_csv(summary_path, index=False)
+
         _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
+        plt.ion()
+
     else:
         _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
-        shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot")
+        shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot")
 
 
 def info():
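Reviewer note: shap_summary_plot has been reworked from shap.DeepExplainer to shap.KernelExplainer driven through a NumPy prediction wrapper, so it now accepts torch tensors or NumPy arrays and aborts early on NaN inputs. A usage sketch against a toy model; the model, data, and save path below are illustrative stand-ins, not part of the package:

    # Illustrative call only; the toy model and random data are stand-ins.
    import numpy as np
    import torch
    from ml_tools.ML_evaluation import shap_summary_plot

    # Toy single-output model standing in for a trained network.
    model = torch.nn.Sequential(
        torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 1)
    )

    # As of 5.3.1, plain NumPy arrays are accepted alongside torch tensors.
    background = np.random.rand(100, 8).astype(np.float32)  # summarized to 30 k-means clusters internally
    instances = np.random.rand(25, 8).astype(np.float32)

    shap_summary_plot(
        model,
        background_data=background,
        instances_to_explain=instances,
        feature_names=[f"feature_{i}" for i in range(8)],
        save_dir="shap_artifacts",  # writes bar/dot SVG plots plus shap_summary.csv
    )

KernelExplainer is model-agnostic but considerably slower than DeepExplainer, which is the usual trade-off for this swap; summarizing the background with shap.kmeans keeps the cost manageable.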
ml_tools/ML_trainer.py CHANGED
@@ -95,14 +95,16 @@ class MyTrainer:
             batch_size=batch_size,
             shuffle=shuffle,
             num_workers=loader_workers,
-            pin_memory=(self.device.type == "cuda")
+            pin_memory=("cuda" in self.device.type),
+            drop_last=True # Drops the last batch if incomplete, selecting a good batch size is key.
         )
+
         self.test_loader = DataLoader(
             dataset=self.test_dataset,
             batch_size=batch_size,
             shuffle=False,
             num_workers=loader_workers,
-            pin_memory=(self.device.type == "cuda")
+            pin_memory=("cuda" in self.device.type)
         )
 
     def fit(self, epochs: int = 10, batch_size: int = 10, shuffle: bool = True):
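Reviewer note: the train loader now sets drop_last=True, so a trailing batch smaller than batch_size is silently skipped each epoch (the test loader keeps every sample). A standalone PyTorch sketch of that behavior, independent of MyTrainer:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # 10 samples with batch_size=4: two full batches, and the final batch of 2 is dropped.
    dataset = TensorDataset(torch.arange(10).float().unsqueeze(1))
    loader = DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)
    print(len(loader))  # -> 2

Choosing a batch_size that divides (or nearly divides) the training-set size avoids discarding data.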