dragon-ml-toolbox 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -17,11 +17,10 @@ import xgboost as xgb
17
17
  import lightgbm as lgb
18
18
 
19
19
  from sklearn.model_selection import train_test_split
20
- from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
21
20
  from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
22
21
  import shap
23
22
 
24
- from .utilities import yield_dataframes_from_dir
23
+ from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info
25
24
 
26
25
  import warnings # Ignore warnings
27
26
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -29,9 +28,21 @@ warnings.filterwarnings('ignore', category=FutureWarning)
29
28
  warnings.filterwarnings('ignore', category=UserWarning)
30
29
 
31
30
 
31
+ __all__ = [
32
+ "get_models",
33
+ "dataset_pipeline",
34
+ "evaluate_model_classification",
35
+ "plot_roc_curve",
36
+ "evaluate_model_regression",
37
+ "get_shap_values",
38
+ "train_test_pipeline",
39
+ "run_ensemble_pipeline"
40
+ ]
41
+
42
+
32
43
  ###### 1. Dataset Loader ######
33
44
  #Split a dataset into features and targets datasets
34
- def dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
45
+ def _dataset_yielder(df: pd.DataFrame, target_cols: list[str]):
35
46
  '''
36
47
  Yields one Tuple at a time: `(df_features, df_target, feature_names, target_name)`
37
48
  '''
@@ -144,22 +155,8 @@ def _split_data(features, target, test_size, random_state, task):
144
155
  stratify=target if task=="classification" else None)
145
156
  return X_train, X_test, y_train, y_test
146
157
 
147
- # function to standardize the data
148
- def _standardize_data(train_features, test_features, scaler_code):
149
- if scaler_code == "standard":
150
- scaler = StandardScaler()
151
- elif scaler_code == "minmax":
152
- scaler = MinMaxScaler()
153
- elif scaler_code == "maxabs":
154
- scaler = MaxAbsScaler()
155
- else:
156
- raise ValueError(f"Unrecognized scaler {scaler_code}")
157
- train_scaled = scaler.fit_transform(train_features)
158
- test_scaled = scaler.transform(test_features)
159
- return train_scaled, test_scaled, scaler
160
-
161
158
  # Over-sample minority class (Positive cases) and return several single target datasets (Classification)
162
- def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
159
+ def _resample(X_train: np.ndarray, y_train: pd.Series,
163
160
  strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], random_state):
164
161
  '''
165
162
  Oversample minority class or undersample majority class.
@@ -177,21 +174,20 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
177
174
  else:
178
175
  raise ValueError(f"Invalid resampling strategy: {strategy}")
179
176
 
180
- X_res, y_res, *_ = resample_algorithm.fit_resample(X_train_scaled, y_train)
177
+ X_res, y_res, *_ = resample_algorithm.fit_resample(X_train, y_train)
181
178
  return X_res, y_res
182
179
 
183
180
  # DATASET PIPELINE
184
181
  def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Literal["classification", "regression"],
185
- resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None], scaler: Literal["standard", "minmax", "maxabs"],
182
+ resample_strategy: Union[Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE'], None],
186
183
  test_size: float=0.2, debug: bool=False, random_state: int=101):
187
184
  '''
188
185
  1. Make Train/Test splits
189
- 2. Standardize Train and Test Features
190
- 3. Oversample imbalanced classes (classification)
186
+ 2. Oversample imbalanced classes (classification)
191
187
 
192
- Return a processed Tuple: (X_train, y_train, X_test, y_test, Scaler)
188
+ Return a processed Tuple: (X_train, y_train, X_test, y_test)
193
189
 
194
- `(nD-array, 1D-array, nD-array, Series, Scaler)`
190
+ `(nD-array, 1D-array, nD-array, Series)`
195
191
  '''
196
192
  #DEBUG
197
193
  if debug:
@@ -206,24 +202,18 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
206
202
  if debug:
207
203
  print(f"Shapes after train test split - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
208
204
 
209
- # Standardize
210
- X_train_scaled, X_test_scaled, scaler_object = _standardize_data(train_features=X_train, test_features=X_test, scaler_code=scaler)
211
-
212
- #DEBUG
213
- if debug:
214
- print(f"Shapes after scaling features - X_train: {X_train_scaled.shape}, y_train: {y_train.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
215
205
 
216
- # Scale
206
+ # Resample
217
207
  if resample_strategy is None or task == "regression":
218
- X_train_oversampled, y_train_oversampled = X_train_scaled, y_train
208
+ X_train_oversampled, y_train_oversampled = X_train, y_train
219
209
  else:
220
- X_train_oversampled, y_train_oversampled = _resample(X_train_scaled=X_train_scaled, y_train=y_train, strategy=resample_strategy, random_state=random_state)
210
+ X_train_oversampled, y_train_oversampled = _resample(X_train=X_train, y_train=y_train, strategy=resample_strategy, random_state=random_state)
221
211
 
222
212
  #DEBUG
223
213
  if debug:
224
- print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test_scaled.shape}, y_test: {y_test.shape}")
214
+ print(f"Shapes after resampling - X_train: {X_train_oversampled.shape}, y_train: {y_train_oversampled.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")
225
215
 
226
- return X_train_oversampled, y_train_oversampled, X_test_scaled, y_test, scaler_object
216
+ return X_train_oversampled, y_train_oversampled, X_test, y_test
227
217
 
228
218
  ###### 4. Train and Evaluation ######
229
219
  # Trainer function
@@ -244,9 +234,11 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
244
234
  return model_dir
245
235
 
246
236
  # save model
247
- def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
248
- full_path = os.path.join(save_directory, f"{model_name}_{target_name}.joblib")
249
- joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)
237
+ def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
238
+ #Sanitize filenames to save
239
+ sanitized_target_name = sanitize_filename(target_name)
240
+ full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
241
+ joblib.dump({'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}, full_path)
250
242
 
251
243
  # function to evaluate the model and save metrics (Classification)
252
244
  def evaluate_model_classification(
@@ -255,10 +247,9 @@ def evaluate_model_classification(
255
247
  save_dir: str,
256
248
  x_test_scaled: np.ndarray,
257
249
  single_y_test: np.ndarray,
258
- target_id: str,
250
+ target_name: str,
259
251
  figsize: tuple = (10, 8),
260
- title_fontsize: int = 24,
261
- label_fontsize: int = 24,
252
+ base_fontsize: int = 24,
262
253
  cmap: Colormap = plt.cm.Blues # type: ignore
263
254
  ) -> np.ndarray:
264
255
  """
@@ -269,8 +260,8 @@ def evaluate_model_classification(
269
260
  model_name: Identifier for the model
270
261
  save_dir: Directory where results are saved
271
262
  x_test_scaled: Feature matrix for test set
272
- single_y_test: True binary labels
273
- target_id: Suffix for naming output files
263
+ single_y_test: True targets
264
+ target_name: Target name
274
265
  figsize: Size of the confusion matrix figure (width, height)
275
266
  fontsize: Font size used for title, axis labels and ticks
276
267
  cmap: Color map for the confusion matrix. Examples include:
@@ -298,9 +289,10 @@ def evaluate_model_classification(
298
289
  )
299
290
 
300
291
  # Save text report
301
- report_path = os.path.join(save_dir, f"Classification_Report_{target_id}.txt")
292
+ sanitized_target_name = sanitize_filename(target_name)
293
+ report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
302
294
  with open(report_path, "w") as f:
303
- f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
295
+ f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
304
296
  f.write("Classification Report:\n")
305
297
  f.write(report) # type: ignore
306
298
 
@@ -315,20 +307,20 @@ def evaluate_model_classification(
315
307
  ax=ax
316
308
  )
317
309
 
318
- ax.set_title(f"{model_name} - {target_id}", fontsize=title_fontsize)
319
- ax.tick_params(axis='both', labelsize=label_fontsize)
320
- ax.set_xlabel("Predicted label", fontsize=label_fontsize)
321
- ax.set_ylabel("True label", fontsize=label_fontsize)
310
+ ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
311
+ ax.tick_params(axis='both', labelsize=base_fontsize)
312
+ ax.set_xlabel("Predicted label", fontsize=base_fontsize)
313
+ ax.set_ylabel("True label", fontsize=base_fontsize)
322
314
 
323
315
  # Turn off gridlines
324
316
  ax.grid(False)
325
317
 
326
318
  # Manually update font size of cell texts
327
319
  for text in ax.texts:
328
- text.set_fontsize(title_fontsize+4)
320
+ text.set_fontsize(base_fontsize+4)
329
321
 
330
322
  fig.tight_layout()
331
- fig_path = os.path.join(save_dir, f"Confusion_Matrix_{target_id}.svg")
323
+ fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
332
324
  fig.savefig(fig_path, format="svg", bbox_inches="tight")
333
325
  plt.close(fig)
334
326
 
@@ -353,7 +345,7 @@ def plot_roc_curve(
353
345
  Parameters:
354
346
  true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
355
347
  probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
356
- target_name: str, used for figure title and filename.
348
+ target_name: str, Target name.
357
349
  save_directory: str, path to directory where figure is saved.
358
350
  color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
359
351
  - Named colors: "darkorange", "blue", "red", "green", "black"
@@ -411,7 +403,8 @@ def plot_roc_curve(
411
403
 
412
404
  # Save figure
413
405
  os.makedirs(save_directory, exist_ok=True)
414
- save_path = os.path.join(save_directory, f"ROC_{target_name}.svg")
406
+ sanitized_target_name = sanitize_filename(target_name)
407
+ save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
415
408
  fig.savefig(save_path, bbox_inches="tight", format="svg")
416
409
 
417
410
  return fig
@@ -421,7 +414,7 @@ def plot_roc_curve(
421
414
  def evaluate_model_regression(model, model_name: str,
422
415
  save_dir: str,
423
416
  x_test_scaled: np.ndarray, single_y_test: np.ndarray,
424
- target_id: str,
417
+ target_name: str,
425
418
  figure_size: tuple = (12, 8),
426
419
  alpha_transparency: float = 0.5,
427
420
  base_fontsize: int = 24):
@@ -435,9 +428,10 @@ def evaluate_model_regression(model, model_name: str,
435
428
  r2 = r2_score(single_y_test, y_pred)
436
429
 
437
430
  # Create formatted report
438
- report_path = os.path.join(save_dir, f"Regression_Report_{target_id}.txt")
431
+ sanitized_target_name = sanitize_filename(target_name)
432
+ report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
439
433
  with open(report_path, "w") as f:
440
- f.write(f"{model_name} - {target_id} Regression Performance\n")
434
+ f.write(f"{model_name} - {target_name} Regression Performance\n")
441
435
  f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
442
436
  f.write(f"Mean Squared Error (MSE): {mse:.4f}\n")
443
437
  f.write(f"Root Mean Squared Error (RMSE): {rmse:.4f}\n")
@@ -450,10 +444,10 @@ def evaluate_model_regression(model, model_name: str,
450
444
  plt.axhline(0, color='red', linestyle='--')
451
445
  plt.xlabel("Predicted Values", fontsize=base_fontsize)
452
446
  plt.ylabel("Residuals", fontsize=base_fontsize)
453
- plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
447
+ plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
454
448
  plt.grid(True)
455
449
  plt.tight_layout()
456
- plt.savefig(os.path.join(save_dir, f"Residual_Plot_{target_id}.svg"), bbox_inches='tight', format="svg")
450
+ plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
457
451
  plt.close()
458
452
 
459
453
  # Create true vs predicted values plot
@@ -464,14 +458,15 @@ def evaluate_model_regression(model, model_name: str,
464
458
  'k--', lw=2)
465
459
  plt.xlabel('True Values', fontsize=base_fontsize)
466
460
  plt.ylabel('Predictions', fontsize=base_fontsize)
467
- plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
461
+ plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
468
462
  plt.grid(True)
469
- plot_path = os.path.join(save_dir, f"Regression_Plot_{target_id}.svg")
463
+ plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
470
464
  plt.savefig(plot_path, bbox_inches='tight', format="svg")
471
465
  plt.close()
472
466
 
473
467
  return y_pred
474
468
 
469
+
475
470
  # Get SHAP values
476
471
  def get_shap_values(
477
472
  model,
@@ -479,7 +474,7 @@ def get_shap_values(
479
474
  save_dir: str,
480
475
  features_to_explain: np.ndarray,
481
476
  feature_names: list[str],
482
- target_id: str,
477
+ target_name: str,
483
478
  task: Literal["classification", "regression"],
484
479
  max_display_features: int = 10,
485
480
  figsize: tuple = (16, 20),
@@ -498,7 +493,8 @@ def get_shap_values(
498
493
  features_to_explain: Should match the model's training data format, including scaling.
499
494
  save_dir: Directory to save visualizations
500
495
  """
501
-
496
+ sanitized_target_name = sanitize_filename(target_name)
497
+
502
498
  def _apply_plot_style():
503
499
  styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
504
500
  for style in styles:
@@ -560,9 +556,9 @@ def get_shap_values(
560
556
  _create_shap_plot(
561
557
  shap_values=class_shap,
562
558
  features=features_to_explain,
563
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_Class{class_name}_{plot_type}.svg"),
559
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
564
560
  plot_type=plot_type,
565
- title=f"{model_name} - {target_id} (Class {class_name})"
561
+ title=f"{model_name} - {target_name} (Class {class_name})"
566
562
  )
567
563
  else:
568
564
  values = shap_values[1] if isinstance(shap_values, list) else shap_values
@@ -570,9 +566,9 @@ def get_shap_values(
570
566
  _create_shap_plot(
571
567
  shap_values=values,
572
568
  features=features_to_explain,
573
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
569
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
574
570
  plot_type=plot_type,
575
- title=f"{model_name} - {target_id}"
571
+ title=f"{model_name} - {target_name}"
576
572
  )
577
573
 
578
574
  def _plot_for_regression(shap_values):
@@ -580,10 +576,11 @@ def get_shap_values(
580
576
  _create_shap_plot(
581
577
  shap_values=shap_values,
582
578
  features=features_to_explain,
583
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
579
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
584
580
  plot_type=plot_type,
585
- title=f"{model_name} - {target_id}"
581
+ title=f"{model_name} - {target_name}"
586
582
  )
583
+ #START_O
587
584
 
588
585
  explainer = shap.TreeExplainer(model)
589
586
  shap_values = explainer.shap_values(features_to_explain)
@@ -602,7 +599,7 @@ def get_shap_values(
602
599
  def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
603
600
  train_features: np.ndarray, train_target: np.ndarray,
604
601
  test_features: np.ndarray, test_target: np.ndarray,
605
- feature_names: list[str], target_id: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler],
602
+ feature_names: list[str], target_name: str,
606
603
  save_dir: str,
607
604
  debug: bool=False, save_model: bool=False):
608
605
  '''
@@ -612,7 +609,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
612
609
 
613
610
  Returns: Tuple(Trained model, Test-set Predictions)
614
611
  '''
615
- print(f"\tModel: {model_name} for Target: {target_id}...")
612
+ print(f"\tModel: {model_name} for Target: {target_name}...")
616
613
  trained_model = _train_model(model=model, train_features=train_features, train_target=train_target)
617
614
  if debug:
618
615
  print(f"Trained model object: {type(trained_model)}")
@@ -620,42 +617,42 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
620
617
 
621
618
  if save_model:
622
619
  _save_model(trained_model=trained_model, model_name=model_name,
623
- target_name=target_id, feature_names=feature_names,
624
- save_directory=local_save_directory, scaler_object=scaler_object)
620
+ target_name=target_name, feature_names=feature_names,
621
+ save_directory=local_save_directory)
625
622
 
626
623
  if task == "classification":
627
624
  y_pred = evaluate_model_classification(model=trained_model, model_name=model_name, save_dir=local_save_directory,
628
- x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
625
+ x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
629
626
  plot_roc_curve(true_labels=test_target,
630
627
  probabilities_or_model=trained_model, model_name=model_name,
631
- target_name=target_id, save_directory=local_save_directory,
628
+ target_name=target_name, save_directory=local_save_directory,
632
629
  input_features=test_features)
633
630
  elif task == "regression":
634
631
  y_pred = evaluate_model_regression(model=trained_model, model_name=model_name, save_dir=local_save_directory,
635
- x_test_scaled=test_features, single_y_test=test_target, target_id=target_id)
632
+ x_test_scaled=test_features, single_y_test=test_target, target_name=target_name)
636
633
  else:
637
634
  raise ValueError(f"Unrecognized task '{task}' for model training,")
638
635
  if debug:
639
636
  print(f"Predicted vector: {type(y_pred)} with shape: {y_pred.shape}")
640
637
 
641
638
  get_shap_values(model=trained_model, model_name=model_name, save_dir=local_save_directory,
642
- features_to_explain=train_features, feature_names=feature_names, target_id=target_id, task=task)
639
+ features_to_explain=train_features, feature_names=feature_names, target_name=target_name, task=task)
643
640
  print("\t...done.")
644
641
  return trained_model, y_pred
645
642
 
646
643
  ###### 5. Execution ######
647
644
  def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
648
- resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, scaler: Literal["standard", "minmax", "maxabs"]="minmax", save_model: bool=False,
645
+ resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, save_model: bool=False,
649
646
  test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
650
647
  #Check paths
651
648
  _check_paths(datasets_dir, save_dir)
652
649
  #Yield imputed dataset
653
650
  for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
654
651
  #Yield features dataframe and target dataframe
655
- for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
652
+ for df_features, df_target, feature_names, target_name in _dataset_yielder(df=dataframe, target_cols=target_columns):
656
653
  #Dataset pipeline
657
- X_train, y_train, X_test, y_test, scaler_object = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
658
- resample_strategy=resample_strategy, scaler=scaler,
654
+ X_train, y_train, X_test, y_test = dataset_pipeline(df_features=df_features, df_target=df_target, task=task,
655
+ resample_strategy=resample_strategy,
659
656
  test_size=test_size, debug=debug, random_state=random_state)
660
657
  #Get models
661
658
  models_dict = get_models(task=task, is_balanced=False if resample_strategy is None else True,
@@ -665,13 +662,17 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
665
662
  train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
666
663
  train_features=X_train, train_target=y_train, # type: ignore
667
664
  test_features=X_test, test_target=y_test,
668
- feature_names=feature_names,target_id=target_name, scaler_object=scaler_object,
665
+ feature_names=feature_names,target_name=target_name,
669
666
  debug=debug, save_dir=save_dir, save_model=save_model)
670
667
  print("\n✅ Training and evaluation complete.")
671
668
 
672
669
 
673
670
  def _check_paths(datasets_dir: str, save_dir:str):
674
671
  if not os.path.isdir(save_dir):
675
- os.makedirs(save_dir)
672
+ os.makedirs(save_dir)
676
673
  if not os.path.isdir(datasets_dir):
677
674
  raise IOError(f"Datasets directory '{datasets_dir}' not found.")
675
+
676
+
677
+ def info():
678
+ _script_info(__all__)
ml_tools/handle_excel.py CHANGED
@@ -2,6 +2,16 @@ import os
2
2
  from openpyxl import load_workbook, Workbook
3
3
  import pandas as pd
4
4
  from typing import List, Optional
5
+ from utilities import _script_info, sanitize_filename
6
+
7
+
8
+ __all__ = [
9
+ "unmerge_and_split_excel",
10
+ "unmerge_and_split_from_directory",
11
+ "validate_excel_schema",
12
+ "vertical_merge_transform_excel",
13
+ "horizontal_merge_transform_excel"
14
+ ]
5
15
 
6
16
 
7
17
  def unmerge_and_split_excel(filepath: str) -> None:
@@ -25,12 +35,12 @@ def unmerge_and_split_excel(filepath: str) -> None:
25
35
  ws = wb[sheet_name]
26
36
  new_wb = Workbook()
27
37
  new_ws = new_wb.active
28
- new_ws.title = sheet_name
38
+ new_ws.title = sheet_name # type: ignore
29
39
 
30
40
  # Copy all cell values
31
41
  for row in ws.iter_rows():
32
42
  for cell in row:
33
- new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
43
+ new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
34
44
 
35
45
  # Fill and unmerge merged regions
36
46
  for merged_range in list(ws.merged_cells.ranges):
@@ -41,10 +51,10 @@ def unmerge_and_split_excel(filepath: str) -> None:
41
51
  value = ws.cell(row=min_row, column=min_col).value
42
52
  for row in range(min_row, max_row + 1):
43
53
  for col in range(min_col, max_col + 1):
44
- new_ws.cell(row=row, column=col, value=value)
54
+ new_ws.cell(row=row, column=col, value=value) # type: ignore
45
55
 
46
56
  # Construct flat output file name
47
- sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
57
+ sanitized_sheet_name = sanitize_filename(sheet_name)
48
58
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
49
59
  output_path = os.path.join(base_dir, output_filename)
50
60
  new_wb.save(output_path)
@@ -85,12 +95,12 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
85
95
  ws = wb[sheet_name]
86
96
  new_wb = Workbook()
87
97
  new_ws = new_wb.active
88
- new_ws.title = sheet_name
98
+ new_ws.title = sheet_name # type: ignore
89
99
 
90
100
  # Copy all cell values
91
101
  for row in ws.iter_rows():
92
102
  for cell in row:
93
- new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
103
+ new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
94
104
 
95
105
  # Fill and unmerge merged regions
96
106
  for merged_range in list(ws.merged_cells.ranges):
@@ -101,10 +111,10 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
101
111
  value = ws.cell(row=min_row, column=min_col).value
102
112
  for row in range(min_row, max_row + 1):
103
113
  for col in range(min_col, max_col + 1):
104
- new_ws.cell(row=row, column=col, value=value)
114
+ new_ws.cell(row=row, column=col, value=value) # type: ignore
105
115
 
106
116
  # Construct flat output file name
107
- sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
117
+ sanitized_sheet_name = sanitize_filename(sheet_name)
108
118
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
109
119
  output_path = os.path.join(output_dir, output_filename)
110
120
  new_wb.save(output_path)
@@ -151,7 +161,7 @@ def validate_excel_schema(
151
161
  wb = load_workbook(file_path, read_only=True)
152
162
  ws = wb.active # Only check the first worksheet
153
163
 
154
- header = [cell.value for cell in next(ws.iter_rows(max_row=1))]
164
+ header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
155
165
 
156
166
  if strict:
157
167
  if header != expected_columns:
@@ -202,6 +212,11 @@ def vertical_merge_transform_excel(
202
212
 
203
213
  if not excel_files:
204
214
  raise ValueError("No Excel files found in the target directory.")
215
+
216
+ # sanitize filename
217
+ csv_filename = sanitize_filename(csv_filename)
218
+ # make directory
219
+ os.makedirs(output_dir, exist_ok=True)
205
220
 
206
221
  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
207
222
  csv_path = os.path.join(output_dir, csv_filename)
@@ -260,6 +275,11 @@ def horizontal_merge_transform_excel(
260
275
  excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
261
276
  if not excel_files:
262
277
  raise ValueError("No Excel files found in the target directory.")
278
+
279
+ # sanitize filename
280
+ csv_filename = sanitize_filename(csv_filename)
281
+ # make directory
282
+ os.makedirs(output_dir, exist_ok=True)
263
283
 
264
284
  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
265
285
  csv_path = os.path.join(output_dir, csv_filename)
@@ -308,3 +328,6 @@ def horizontal_merge_transform_excel(
308
328
  if duplicate_columns:
309
329
  print(f"⚠️ Duplicate columns: {duplicate_columns}")
310
330
 
331
+
332
+ def info():
333
+ _script_info(__all__)
ml_tools/logger.py CHANGED
@@ -5,7 +5,12 @@ import pandas as pd
5
5
  from openpyxl.styles import Font, PatternFill
6
6
  import traceback
7
7
  import json
8
- from ml_tools.utilities import sanitize_filename
8
+ from ml_tools.utilities import sanitize_filename, _script_info
9
+
10
+
11
+ __all__ = [
12
+ "custom_logger"
13
+ ]
9
14
 
10
15
 
11
16
  def custom_logger(
@@ -143,3 +148,7 @@ def _log_exception_to_log(exc: BaseException, path: str) -> None:
143
148
  def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
144
149
  with open(path, 'w', encoding='utf-8') as f:
145
150
  json.dump(data, f, indent=4, ensure_ascii=False)
151
+
152
+
153
+ def info():
154
+ _script_info(__all__)