dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.


@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
 from matplotlib.colors import Colormap
 from matplotlib import rcdefaults
 
-import os
+from pathlib import Path
 from typing import Literal, Union, Optional, Iterator, Tuple
 
 from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -19,7 +19,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap
 
-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object, make_fullpath
 
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
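Both files in this release route all filesystem handling through a new `make_fullpath` utility. Its implementation is not part of this diff; judging from the call sites (`make_fullpath(dir)` to resolve a path, `make_fullpath(dir, make=True)` to also create the directory, returning a `Path` that supports the `/` operator), a minimal sketch might look like the following. Only the signature is attested by the calls; the body is an assumption.

```python
from pathlib import Path
from typing import Union

def make_fullpath(path: Union[str, Path], make: bool = False) -> Path:
    """Hypothetical sketch: resolve to an absolute Path, optionally creating it."""
    full_path = Path(path).expanduser().resolve()
    if make:
        # Assumption: the helper is only used for directories in this diff.
        full_path.mkdir(parents=True, exist_ok=True)
    return full_path
```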
@@ -469,30 +469,31 @@ def _train_model(model, train_features, train_target):
     return model
 
 # handle local directories
-def _local_directories(model_name: str, dataset_id: str, save_dir: str):
-    dataset_dir = os.path.join(save_dir, dataset_id)
-    if not os.path.isdir(dataset_dir):
-        os.makedirs(dataset_dir)
+def _local_directories(model_name: str, dataset_id: str, save_dir: Union[str,Path]):
+    save_path = make_fullpath(save_dir, make=True)
 
-    model_dir = os.path.join(dataset_dir, model_name)
-    if not os.path.isdir(model_dir):
-        os.makedirs(model_dir)
+    dataset_dir = save_path / dataset_id
+    dataset_dir.mkdir(parents=True, exist_ok=True)
+
+    model_dir = dataset_dir / model_name
+    model_dir.mkdir(parents=True, exist_ok=True)
 
     return model_dir
 
 # save model
-def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
+def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: Union[str,Path]):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
     filename = f"{model_name}_{sanitized_target_name}"
     to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+
     serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str,Path],
     x_test_scaled: np.ndarray,
     single_y_test: np.ndarray,
     target_name: str,
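The `_local_directories` rewrite is the standard os-to-pathlib migration: the `/` operator replaces `os.path.join`, and one idempotent `mkdir` call replaces the three-line isdir/makedirs dance. The two idioms below are equivalent:

```python
import os
from pathlib import Path

# Old style: string joins plus an explicit existence check.
dataset_dir = os.path.join("results", "dataset_A")
os.makedirs(dataset_dir, exist_ok=True)

# New style: the / operator builds the path; mkdir creates it idempotently.
dataset_path = Path("results") / "dataset_A"
dataset_path.mkdir(parents=True, exist_ok=True)
```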
@@ -524,7 +525,7 @@ evaluate_model_classification(
     Returns:
         y_pred: Predicted class labels
     """
-    os.makedirs(save_dir, exist_ok=True)
+    save_path = make_fullpath(save_dir, make=True)
 
     y_pred = model.predict(x_test_scaled)
     accuracy = accuracy_score(single_y_test, y_pred)
@@ -538,7 +539,7 @@ evaluate_model_classification(
 
     # Save text report
     sanitized_target_name = sanitize_filename(target_name)
-    report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
+    report_path = save_path / f"Classification_Report_{sanitized_target_name}.txt"
     with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
         f.write("Classification Report:\n")
@@ -568,7 +569,7 @@ evaluate_model_classification(
             text.set_fontsize(base_fontsize+4)
 
     fig.tight_layout()
-    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
+    fig_path = save_path / f"Confusion_Matrix_{sanitized_target_name}.svg"
     fig.savefig(fig_path, format="svg", bbox_inches="tight")
     plt.close(fig)
 
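A hedged usage sketch for the updated `evaluate_model_classification`. Only the parameters visible in these hunks are shown; the module path and the toy scaffold around the call are assumptions.

```python
from pathlib import Path
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ml_tools.ensemble_learning import evaluate_model_classification  # module path assumed; this file's name is cropped from the diff

# Toy scaffold: a small classification problem with scaled features.
X, y = make_classification(n_samples=200, n_features=8, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
scaler = StandardScaler().fit(X_train)
model = LogisticRegression().fit(scaler.transform(X_train), y_train)

# Writes Classification_Report_*.txt and Confusion_Matrix_*.svg into save_dir.
y_pred = evaluate_model_classification(
    model=model,
    model_name="LogisticRegression",
    save_dir=Path("results") / "demo",  # str or Path both accepted after this change
    x_test_scaled=scaler.transform(X_test),
    single_y_test=y_test,
    target_name="demo_target",
)
```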
@@ -580,7 +581,7 @@ plot_roc_curve(
     probabilities_or_model: Union[np.ndarray, xgb.XGBClassifier, lgb.LGBMClassifier, object],
     model_name: str,
     target_name: str,
-    save_directory: str,
+    save_directory: Union[str,Path],
     color: str = "darkorange",
     figure_size: tuple = (10, 10),
     linewidth: int = 2,
@@ -594,7 +595,7 @@ plot_roc_curve(
     true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
     probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
     target_name: str, Target name.
-    save_directory: str, path to directory where figure is saved.
+    save_directory: str or Path, path to directory where figure is saved.
     color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
         - Named colors: "darkorange", "blue", "red", "green", "black"
         - Hex codes: "#1f77b4", "#ff7f0e"
@@ -650,17 +651,17 @@ plot_roc_curve(
     ax.grid(True)
 
     # Save figure
-    os.makedirs(save_directory, exist_ok=True)
+    save_path = make_fullpath(save_directory, make=True)
     sanitized_target_name = sanitize_filename(target_name)
-    save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
-    fig.savefig(save_path, bbox_inches="tight", format="svg")
+    full_save_path = save_path / f"ROC_{sanitized_target_name}.svg"
+    fig.savefig(full_save_path, bbox_inches="tight", format="svg")
 
     return fig
 
 
 # function to evaluate the model and save metrics (Regression)
 def evaluate_model_regression(model, model_name: str,
-                              save_dir: str,
+                              save_dir: Union[str,Path],
                               x_test_scaled: np.ndarray, single_y_test: np.ndarray,
                               target_name: str,
                               figure_size: tuple = (12, 8),
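`plot_roc_curve` now also accepts a `Path` for `save_directory`. A usage sketch with synthetic binary labels and probabilities, per the docstring (the inputs are hypothetical):

```python
import numpy as np
from pathlib import Path

# Hypothetical inputs: ground-truth binary labels and predicted probabilities.
true_labels = np.array([0, 1, 1, 0, 1, 0, 1, 1])
probabilities = np.array([0.1, 0.8, 0.7, 0.3, 0.9, 0.2, 0.6, 0.95])

fig = plot_roc_curve(
    true_labels=true_labels,
    probabilities_or_model=probabilities,  # an ndarray; a fitted model with .predict_proba() also works
    model_name="LightGBM",
    target_name="defect_class",
    save_directory=Path("results") / "roc",  # saved as ROC_defect_class.svg
)
```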
@@ -677,7 +678,8 @@ evaluate_model_regression(model, model_name: str,
 
     # Create formatted report
     sanitized_target_name = sanitize_filename(target_name)
-    report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
+    save_path = make_fullpath(save_dir, make=True)
+    report_path = save_path / f"Regression_Report_{sanitized_target_name}.txt"
     with open(report_path, "w") as f:
         f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -695,7 +697,8 @@ evaluate_model_regression(model, model_name: str,
     plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
-    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
+    residual_path = save_path / f"Residual_Plot_{sanitized_target_name}.svg"
+    plt.savefig(residual_path, bbox_inches='tight', format="svg")
     plt.close()
 
     # Create true vs predicted values plot
@@ -708,7 +711,7 @@ evaluate_model_regression(model, model_name: str,
     plt.ylabel('Predictions', fontsize=base_fontsize)
     plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
-    plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
+    plot_path = save_path / f"Regression_Plot_{sanitized_target_name}.svg"
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
     plt.close()
 
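The regression counterpart follows the same pattern and writes a text report plus two SVG plots. A sketch with a toy regressor (module path assumed, as above):

```python
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from ml_tools.ensemble_learning import evaluate_model_regression  # module path assumed

X, y = make_regression(n_samples=200, n_features=6, noise=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# Writes Regression_Report_*.txt, Residual_Plot_*.svg and Regression_Plot_*.svg into save_dir.
evaluate_model_regression(
    model=model,
    model_name="RandomForest",
    save_dir=Path("results") / "regression_demo",
    x_test_scaled=X_test,  # tree models tolerate unscaled features in this toy sketch
    single_y_test=y_test,
    target_name="demo_target",
)
```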
@@ -719,7 +722,7 @@ evaluate_model_regression(model, model_name: str,
 def get_shap_values(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str, Path],
     features_to_explain: np.ndarray,
     feature_names: list[str],
     target_name: str,
@@ -737,11 +740,12 @@ get_shap_values(
     * Use the entire dataset to get the global view.
 
     Parameters:
-    task: 'regression' or 'classification'
+    task: 'regression' or 'classification'.
     features_to_explain: Should match the model's training data format, including scaling.
-    save_dir: Directory to save visualizations
+    save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
+    global_save_path = make_fullpath(save_dir, make=True)
 
     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -759,7 +763,7 @@ get_shap_values(
         plt.rc('legend', fontsize=base_fontsize)
         plt.rc('figure', titlesize=base_fontsize)
 
-    def _create_shap_plot(shap_values, features, save_path: str, plot_type: str, title: str):
+    def _create_shap_plot(shap_values, features, save_path: Path, plot_type: str, title: str):
         _apply_plot_style()
         _configure_rcparams()
         plt.figure(figsize=figsize)
@@ -804,7 +808,7 @@ get_shap_values(
             _create_shap_plot(
                 shap_values=class_shap,
                 features=features_to_explain,
-                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
+                save_path=global_save_path / f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg",
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name} (Class {class_name})"
             )
@@ -814,7 +818,7 @@ get_shap_values(
             _create_shap_plot(
                 shap_values=values,
                 features=features_to_explain,
-                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
+                save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
            )
@@ -824,7 +828,7 @@ get_shap_values(
         _create_shap_plot(
             shap_values=shap_values,
             features=features_to_explain,
-            save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
+            save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
             plot_type=plot_type,
             title=f"{model_name} - {target_name}"
         )
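A sketch of calling `get_shap_values` after the change, continuing the regression scaffold above (`model` and `X_test` as defined there). The diff shows these parameters plus a `task` flag in the docstring; the full signature is not visible, so the keyword call below is an assumption.

```python
from pathlib import Path

feature_names = [f"feat_{i}" for i in range(X_test.shape[1])]

get_shap_values(
    model=model,
    model_name="RandomForest",
    save_dir=Path("results") / "shap",
    features_to_explain=X_test,   # docstring: must match the training format, incl. scaling
    feature_names=feature_names,
    target_name="demo_target",
    task="regression",            # per the docstring; exact parameter order not shown in this diff
)
```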
@@ -848,7 +852,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
                         train_features: np.ndarray, train_target: np.ndarray,
                         test_features: np.ndarray, test_target: np.ndarray,
                         feature_names: list[str], target_name: str,
-                        save_dir: str,
+                        save_dir: Union[str,Path],
                         debug: bool=False, save_model: bool=False):
     '''
     1. Train model.
@@ -889,7 +893,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
     return trained_model, y_pred
 
 ###### 5. Execution ######
-def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
                           handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
                           test_size: float=0.2, debug:bool=False):
     #Check models
@@ -907,10 +911,11 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
         raise TypeError(f"Unrecognized model {type(model_object)}")
 
     #Check paths
-    _check_paths(datasets_dir, save_dir)
+    datasets_path = make_fullpath(datasets_dir)
+    save_path = make_fullpath(save_dir, make=True)
 
     #Yield imputed dataset
-    for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
+    for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
         #Yield features dataframe and target dataframe
         for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
             #Dataset pipeline
@@ -925,15 +930,8 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
                                train_features=X_train, train_target=y_train, # type: ignore
                                test_features=X_test, test_target=y_test,
                                feature_names=feature_names,target_name=target_name,
-                               debug=debug, save_dir=save_dir, save_model=save_model)
+                               debug=debug, save_dir=save_path, save_model=save_model)
     print("\n✅ Training and evaluation complete.")
-
-
-def _check_paths(datasets_dir: str, save_dir:str):
-    if not os.path.isdir(save_dir):
-        os.makedirs(save_dir)
-    if not os.path.isdir(datasets_dir):
-        raise IOError(f"Datasets directory '{datasets_dir}' not found.")
 
 
 def info():
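With `_check_paths` removed, `run_ensemble_pipeline` now validates and creates its directories through `make_fullpath` directly, and both directory arguments accept `Path` objects. A hypothetical end-to-end call; the `RegressionTreeModels` construction options are not shown in this diff, so default construction is assumed:

```python
from pathlib import Path
from ml_tools.ensemble_learning import run_ensemble_pipeline, RegressionTreeModels  # module path assumed

# Hypothetical layout: a directory of imputed CSV datasets and an output directory.
run_ensemble_pipeline(
    datasets_dir=Path("data") / "imputed",
    save_dir=Path("results") / "ensembles",  # created if missing via make_fullpath(make=True)
    target_columns=["target_a", "target_b"],
    model_object=RegressionTreeModels(),     # assumed default construction
    save_model=True,
)
```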
ml_tools/handle_excel.py CHANGED
@@ -1,11 +1,12 @@
-import os
+from pathlib import Path
 from openpyxl import load_workbook, Workbook
 import pandas as pd
-from typing import List, Optional
-from .utilities import _script_info, sanitize_filename
+from typing import List, Optional, Union
+from .utilities import _script_info, sanitize_filename, make_fullpath
 
 
 __all__ = [
+    "find_excel_files",
     "unmerge_and_split_excel",
     "unmerge_and_split_from_directory",
     "validate_excel_schema",
@@ -14,20 +15,55 @@ __all__ = [
 ]
 
 
-def unmerge_and_split_excel(filepath: str) -> None:
+def find_excel_files(
+    directory: Union[str, Path],
+    *,
+    extensions: tuple[str, ...] = (".xlsx", ".xls"),
+    exclude_temp: bool = True
+) -> list[Path]:
+    """
+    Returns a list of Excel file Paths in the specified directory.
+
+    Parameters:
+        directory (str | Path): Directory to search.
+        extensions (tuple[str, ...]): Valid Excel file extensions (default: .xlsx, .xls).
+        exclude_temp (bool): Whether to exclude files that start with '~'.
+
+    Returns:
+        list[Path]: List of Excel file paths matching criteria.
+    """
+    input_path = make_fullpath(directory)
+
+    if not input_path.is_dir():
+        raise NotADirectoryError(f"Directory not found: {input_path}")
+
+    excel_files = [
+        f for f in input_path.iterdir()
+        if f.is_file()
+        and f.suffix.lower() in extensions
+        and (not f.name.startswith('~') if exclude_temp else True)
+    ]
+
+    if not excel_files:
+        raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+
+    return excel_files
+
+
+def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
     """
     Processes a single Excel file:
-    - Unmerges all merged cells (vertical and horizontal),
-    - Fills each merged region with the top-left cell value,
-    - Splits each sheet into a separate Excel file,
+    - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value.
+    - Splits each sheet into a separate Excel file.
    - Saves all results in the same directory as the input file.
 
     Parameters:
-        filepath (str): Full path to the Excel file to process.
+        filepath (str | Path): Full path to the Excel file to process.
     """
-    wb = load_workbook(filepath)
-    base_dir = os.path.dirname(os.path.abspath(filepath))
-    base_name = os.path.splitext(os.path.basename(filepath))[0]
+    file_path = make_fullpath(filepath)
+    wb = load_workbook(file_path)
+    base_dir = file_path.parent
+    base_name = file_path.stem
 
     total_output_files = 0
 
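The new `find_excel_files` helper centralizes the directory listing previously duplicated across three functions, and raises instead of silently returning an empty list. Its full signature is shown above, so only the paths below are hypothetical:

```python
from pathlib import Path
from ml_tools.handle_excel import find_excel_files  # new in this release

# Temp files like '~$report.xlsx' are skipped by default.
files = find_excel_files(Path("data") / "workbooks")
for f in files:
    print(f.name)

# Narrow the search to .xlsx only; keyword-only arguments per the signature.
xlsx_only = find_excel_files("data/workbooks", extensions=(".xlsx",))
```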
@@ -56,40 +92,37 @@ def unmerge_and_split_excel(filepath: str) -> None:
             # Construct flat output file name
             sanitized_sheet_name = sanitize_filename(sheet_name)
             output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
-            output_path = os.path.join(base_dir, output_filename)
+            output_path = base_dir / output_filename
             new_wb.save(output_path)
 
             # print(f"Saved: {output_path}")
             total_output_files += 1
 
-    print(f"✅ Processed file: {filepath} into {total_output_files} output file(s).")
+    print(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
     return None
 
 
-def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
+def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Union[str,Path]) -> None:
     """
     Processes all Excel files in the input directory:
-    - Unmerges all merged cells (vertical and horizontal),
-    - Fills each merged region with the top-left cell value,
-    - Splits each sheet into separate Excel files,
+    - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value,
+    - Splits each sheet into separate Excel files.
     - Saves all results into the output directory.
 
     Parameters:
-        input_dir (str): Directory containing Excel files to process.
-        output_dir (str): Directory to save processed Excel files.
+        input_dir (str | Path): Directory containing Excel files to process.
+        output_dir (str | Path): Directory to save processed Excel files.
     """
-    raw_files = [f for f in os.listdir(input_dir) if f.endswith(('.xlsx', '.xls'))]
-    excel_files = [os.path.join(input_dir, f) for f in raw_files if not f.startswith('~')]
-
-    if not excel_files:
-        raise FileNotFoundError(f"No valid Excel files found in directory: {input_dir}")
+    global_input_path = make_fullpath(input_dir)
+    global_output_path = make_fullpath(output_dir, make=True)
+
+    excel_files = find_excel_files(global_input_path)
 
-    os.makedirs(output_dir, exist_ok=True)
     total_output_files = 0
 
     for file_path in excel_files:
         wb = load_workbook(file_path)
-        base_name = os.path.splitext(os.path.basename(file_path))[0]
+        base_name = file_path.stem
 
         for sheet_name in wb.sheetnames:
             ws = wb[sheet_name]
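A hypothetical batch invocation of the updated `unmerge_and_split_from_directory`:

```python
from pathlib import Path
from ml_tools.handle_excel import unmerge_and_split_from_directory

# Every sheet of every workbook in raw/ is unmerged, filled,
# and written to processed/ as its own single-sheet .xlsx file.
unmerge_and_split_from_directory(
    input_dir=Path("data") / "raw",
    output_dir=Path("data") / "processed",  # created if missing
)
```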
@@ -116,7 +149,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
             # Construct flat output file name
             sanitized_sheet_name = sanitize_filename(sheet_name)
             output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
-            output_path = os.path.join(output_dir, output_filename)
+            output_path = global_output_path / output_filename
             new_wb.save(output_path)
 
             # print(f"Saved: {output_path}")
@@ -127,7 +160,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
 
 
 def validate_excel_schema(
-    target_dir: str,
+    target_dir: Union[str,Path],
     expected_columns: List[str],
     strict: bool = False
 ) -> None:
@@ -135,7 +168,7 @@ def validate_excel_schema(
     Validates that each Excel file in a directory conforms to the expected column schema.
 
     Parameters:
-        target_dir (str): Path to the directory containing Excel files.
+        target_dir (str | Path): Path to the directory containing Excel files.
         expected_columns (list[str]): List of expected column names.
         strict (bool): If True, columns must match exactly (names and order).
                        If False, columns must contain at least all expected names.
@@ -143,52 +176,46 @@ def validate_excel_schema(
     Returns:
         List[str]: List of file paths that failed the schema validation.
     """
-    invalid_files = []
+    invalid_files: list[Path] = []
     expected_set = set(expected_columns)
 
-    excel_seen = 0
-
-    for filename in os.listdir(target_dir):
-        if not filename.lower().endswith(".xlsx"):
-            continue  # Skip non-Excel files
-
-        if filename.startswith("~"):  # Skip temporary files
-            continue
-
-        file_path = os.path.join(target_dir, filename)
-        excel_seen += 1
+    target_path = make_fullpath(target_dir)
+
+    excel_paths = find_excel_files(target_path)
+
+    for file in excel_paths:
         try:
-            wb = load_workbook(file_path, read_only=True)
+            wb = load_workbook(file, read_only=True)
             ws = wb.active  # Only check the first worksheet
 
             header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
 
             if strict:
                 if header != expected_columns:
-                    invalid_files.append(file_path)
+                    invalid_files.append(file)
             else:
                 header_set = set(header)
                 if not expected_set.issubset(header_set):
-                    invalid_files.append(file_path)
+                    invalid_files.append(file)
 
         except Exception as e:
-            print(f"Error processing '{file_path}': {e}")
-            invalid_files.append(file_path)
+            print(f"Error processing '{file}': {e}")
+            invalid_files.append(file)
 
-    valid_excel_number = excel_seen - len(invalid_files)
-    print(f"{valid_excel_number} out of {excel_seen} excel files conform to the schema.")
+    valid_excel_number = len(excel_paths) - len(invalid_files)
+    print(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
     if invalid_files:
         print(f"⚠️ {len(invalid_files)} excel files are invalid:")
-        for file in invalid_files:
-            print(f" - {file}")
+        for in_file in invalid_files:
+            print(f" - {in_file.name}")
 
     return None
 
 
 def vertical_merge_transform_excel(
-    target_dir: str,
+    target_dir: Union[str,Path],
     csv_filename: str,
-    output_dir: str,
+    output_dir: Union[str,Path],
     target_columns: Optional[List[str]] = None,
     rename_columns: Optional[List[str]] = None
 ) -> None:
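A usage sketch for `validate_excel_schema` (hypothetical directory and column names):

```python
from pathlib import Path
from ml_tools.handle_excel import validate_excel_schema

# Non-strict mode only requires that the expected columns are present;
# strict mode also enforces exact names and order.
validate_excel_schema(
    target_dir=Path("data") / "processed",
    expected_columns=["id", "temperature", "pressure"],
    strict=False,
)
```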
@@ -201,35 +228,31 @@ def vertical_merge_transform_excel(
     - If `rename_columns` is provided, it must match the length of `target_columns` (if used) or the original columns. Names match by position.
 
     Parameters:
-        target_dir (str): Directory containing Excel files.
+        target_dir (str | Path): Directory containing Excel files.
         csv_filename (str): Output CSV filename.
-        output_dir (str): Directory to save the output CSV file.
+        output_dir (str | Path): Directory to save the output CSV file.
         target_columns (list[str] | None): Columns to select from each Excel file.
         rename_columns (list[str] | None): Optional renaming for columns. Position-based matching.
     """
-    raw_excel_files = [f for f in os.listdir(target_dir) if f.endswith(('.xlsx', '.xls'))]
-    excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
-
-    if not excel_files:
-        raise ValueError("No Excel files found in the target directory.")
+    target_path = make_fullpath(target_dir)
+    excel_files = find_excel_files(target_path)
 
     # sanitize filename
     csv_filename = sanitize_filename(csv_filename)
-    # make directory
-    os.makedirs(output_dir, exist_ok=True)
+    # make output directory
+    output_path = make_fullpath(output_dir, make=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
-    csv_path = os.path.join(output_dir, csv_filename)
+    csv_path = output_path / csv_filename
 
     dataframes = []
     for file in excel_files:
-        file_path = os.path.join(target_dir, file)
-        df = pd.read_excel(file_path, engine='openpyxl')
+        df = pd.read_excel(file, engine='openpyxl')
 
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-                raise ValueError(f"Missing columns in {file}: {missing}")
+                raise ValueError(f"Invalid columns in {file.name}: {missing}")
             df = df[target_columns]
 
         dataframes.append(df)
@@ -239,7 +262,7 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-            raise ValueError("Length of rename_columns must match the selected columns")
+            raise ValueError("Length of 'rename_columns' must match the selected columns")
         merged_df.columns = rename_columns
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
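A hypothetical call to the updated `vertical_merge_transform_excel`, which stacks rows from every workbook in the directory into one CSV:

```python
from pathlib import Path
from ml_tools.handle_excel import vertical_merge_transform_excel

vertical_merge_transform_excel(
    target_dir=Path("data") / "monthly_reports",
    csv_filename="combined",                 # '.csv' is appended automatically
    output_dir=Path("data") / "merged",
    target_columns=["Temp (C)", "Pressure (kPa)"],
    rename_columns=["temperature", "pressure"],  # position-based renaming
)
```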
@@ -247,9 +270,9 @@
 
 
 def horizontal_merge_transform_excel(
-    target_dir: str,
+    target_dir: Union[str,Path],
     csv_filename: str,
-    output_dir: str,
+    output_dir: Union[str,Path],
     drop_columns: Optional[list[str]] = None,
     skip_duplicates: bool = False
 ) -> None:
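And the horizontal counterpart, which joins columns side by side. A hypothetical call:

```python
from pathlib import Path
from ml_tools.handle_excel import horizontal_merge_transform_excel

horizontal_merge_transform_excel(
    target_dir=Path("data") / "sensor_exports",
    csv_filename="wide_table",
    output_dir=Path("data") / "merged",
    drop_columns=["notes"],     # removed from each file before merging
    skip_duplicates=True,       # keep only the first occurrence of each column name
)
```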
@@ -265,31 +288,28 @@ def horizontal_merge_transform_excel(
       If True, only the first occurrence of each column name is kept.
 
     Parameters:
-        target_dir (str): Directory containing Excel files.
+        target_dir (str | Path): Directory containing Excel files.
         csv_filename (str): Name of the output CSV file.
-        output_dir (str): Directory to save the output CSV file.
+        output_dir (str | Path): Directory to save the output CSV file.
         drop_columns (list[str] | None): Columns to exclude from each file before merging.
         skip_duplicates (bool): Whether to skip duplicate columns or rename them.
     """
-    raw_excel_files = [f for f in os.listdir(target_dir) if f.endswith(('.xlsx', '.xls'))]
-    excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
-    if not excel_files:
-        raise ValueError("No Excel files found in the target directory.")
+    target_path = make_fullpath(target_dir)
+    excel_files = find_excel_files(target_path)
 
     # sanitize filename
     csv_filename = sanitize_filename(csv_filename)
     # make directory
-    os.makedirs(output_dir, exist_ok=True)
+    output_path = make_fullpath(output_dir, make=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
-    csv_path = os.path.join(output_dir, csv_filename)
+    csv_path = output_path / csv_filename
 
     dataframes = []
     max_rows = 0
 
     for file in excel_files:
-        file_path = os.path.join(target_dir, file)
-        df = pd.read_excel(file_path, engine='openpyxl')
+        df = pd.read_excel(file, engine='openpyxl')
 
         if drop_columns is not None:
             df = df.drop(columns=[col for col in drop_columns if col in df.columns])