dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-2.2.0.dist-info/RECORD +21 -0
- ml_tools/ETL_engineering.py +543 -0
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +15 -15
- ml_tools/VIF_factor.py +20 -17
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +165 -60
- dragon_ml_toolbox-2.0.0.dist-info/RECORD +0 -20
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_learning.py
CHANGED
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
 from matplotlib.colors import Colormap
 from matplotlib import rcdefaults
 
-import os
+from pathlib import Path
 from typing import Literal, Union, Optional, Iterator, Tuple
 
 from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -19,7 +19,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap
 
-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object, make_fullpath
 
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -469,30 +469,31 @@ def _train_model(model, train_features, train_target):
     return model
 
 # handle local directories
-def _local_directories(model_name: str, dataset_id: str, save_dir: str):
-
-    if not os.path.isdir(dataset_dir):
-        os.makedirs(dataset_dir)
+def _local_directories(model_name: str, dataset_id: str, save_dir: Union[str,Path]):
+    save_path = make_fullpath(save_dir, make=True)
 
-
-
-
+    dataset_dir = save_path / dataset_id
+    dataset_dir.mkdir(parents=True, exist_ok=True)
+
+    model_dir = dataset_dir / model_name
+    model_dir.mkdir(parents=True, exist_ok=True)
 
     return model_dir
 
 # save model
-def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
+def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: Union[str,Path]):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
     filename = f"{model_name}_{sanitized_target_name}"
     to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+
     serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str,Path],
     x_test_scaled: np.ndarray,
     single_y_test: np.ndarray,
     target_name: str,
@@ -524,7 +525,7 @@ def evaluate_model_classification(
     Returns:
         y_pred: Predicted class labels
     """
-
+    save_path = make_fullpath(save_dir, make=True)
 
     y_pred = model.predict(x_test_scaled)
     accuracy = accuracy_score(single_y_test, y_pred)
@@ -538,7 +539,7 @@ def evaluate_model_classification(
 
     # Save text report
     sanitized_target_name = sanitize_filename(target_name)
-    report_path =
+    report_path = save_path / f"Classification_Report_{sanitized_target_name}.txt"
    with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
         f.write("Classification Report:\n")
@@ -568,7 +569,7 @@ def evaluate_model_classification(
         text.set_fontsize(base_fontsize+4)
 
     fig.tight_layout()
-    fig_path =
+    fig_path = save_path / f"Confusion_Matrix_{sanitized_target_name}.svg"
     fig.savefig(fig_path, format="svg", bbox_inches="tight")
     plt.close(fig)
 
@@ -580,7 +581,7 @@ def plot_roc_curve(
     probabilities_or_model: Union[np.ndarray, xgb.XGBClassifier, lgb.LGBMClassifier, object],
     model_name: str,
     target_name: str,
-    save_directory: str,
+    save_directory: Union[str,Path],
     color: str = "darkorange",
     figure_size: tuple = (10, 10),
     linewidth: int = 2,
@@ -594,7 +595,7 @@ def plot_roc_curve(
         true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
         probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
         target_name: str, Target name.
-        save_directory: str, path to directory where figure is saved.
+        save_directory: str or Path, path to directory where figure is saved.
         color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
             - Named colors: "darkorange", "blue", "red", "green", "black"
             - Hex codes: "#1f77b4", "#ff7f0e"
@@ -650,17 +651,17 @@ def plot_roc_curve(
     ax.grid(True)
 
     # Save figure
-
+    save_path = make_fullpath(save_directory, make=True)
     sanitized_target_name = sanitize_filename(target_name)
-
-    fig.savefig(
+    full_save_path = save_path / f"ROC_{sanitized_target_name}.svg"
+    fig.savefig(full_save_path, bbox_inches="tight", format="svg")
 
     return fig
 
 
 # function to evaluate the model and save metrics (Regression)
 def evaluate_model_regression(model, model_name: str,
-                              save_dir: str,
+                              save_dir: Union[str,Path],
                               x_test_scaled: np.ndarray, single_y_test: np.ndarray,
                               target_name: str,
                               figure_size: tuple = (12, 8),
@@ -677,7 +678,8 @@ def evaluate_model_regression(model, model_name: str,
 
     # Create formatted report
     sanitized_target_name = sanitize_filename(target_name)
-
+    save_path = make_fullpath(save_dir, make=True)
+    report_path = save_path / f"Regression_Report_{sanitized_target_name}.txt"
     with open(report_path, "w") as f:
         f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -695,7 +697,8 @@ def evaluate_model_regression(model, model_name: str,
     plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
-
+    residual_path = save_path / f"Residual_Plot_{sanitized_target_name}.svg"
+    plt.savefig(residual_path, bbox_inches='tight', format="svg")
     plt.close()
 
     # Create true vs predicted values plot
@@ -708,7 +711,7 @@ def evaluate_model_regression(model, model_name: str,
     plt.ylabel('Predictions', fontsize=base_fontsize)
     plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
-    plot_path =
+    plot_path = save_path / f"Regression_Plot_{sanitized_target_name}.svg"
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
     plt.close()
 
@@ -719,7 +722,7 @@ def evaluate_model_regression(model, model_name: str,
 def get_shap_values(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str, Path],
     features_to_explain: np.ndarray,
     feature_names: list[str],
     target_name: str,
@@ -737,11 +740,12 @@ def get_shap_values(
     * Use the entire dataset to get the global view.
 
     Parameters:
-        task: 'regression' or 'classification'
+        task: 'regression' or 'classification'.
         features_to_explain: Should match the model's training data format, including scaling.
-        save_dir: Directory to save visualizations
+        save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
+    global_save_path = make_fullpath(save_dir, make=True)
 
     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -759,7 +763,7 @@ def get_shap_values(
         plt.rc('legend', fontsize=base_fontsize)
         plt.rc('figure', titlesize=base_fontsize)
 
-    def _create_shap_plot(shap_values, features, save_path:
+    def _create_shap_plot(shap_values, features, save_path: Path, plot_type: str, title: str):
         _apply_plot_style()
         _configure_rcparams()
         plt.figure(figsize=figsize)
@@ -804,7 +808,7 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=class_shap,
                 features=features_to_explain,
-                save_path=
+                save_path=global_save_path / f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg",
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name} (Class {class_name})"
             )
@@ -814,7 +818,7 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=values,
                 features=features_to_explain,
-                save_path=
+                save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
@@ -824,7 +828,7 @@ def get_shap_values(
         _create_shap_plot(
             shap_values=shap_values,
             features=features_to_explain,
-            save_path=
+            save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
             plot_type=plot_type,
             title=f"{model_name} - {target_name}"
         )
@@ -848,7 +852,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
                         train_features: np.ndarray, train_target: np.ndarray,
                         test_features: np.ndarray, test_target: np.ndarray,
                         feature_names: list[str], target_name: str,
-                        save_dir: str,
+                        save_dir: Union[str,Path],
                         debug: bool=False, save_model: bool=False):
     '''
    1. Train model.
@@ -889,7 +893,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
     return trained_model, y_pred
 
 ###### 5. Execution ######
-def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
                           handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
                          test_size: float=0.2, debug:bool=False):
     #Check models
@@ -907,10 +911,11 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
         raise TypeError(f"Unrecognized model {type(model_object)}")
 
     #Check paths
-
+    datasets_path = make_fullpath(datasets_dir)
+    save_path = make_fullpath(save_dir, make=True)
 
     #Yield imputed dataset
-    for dataframe, dataframe_name in yield_dataframes_from_dir(
+    for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
         #Yield features dataframe and target dataframe
         for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
             #Dataset pipeline
@@ -925,15 +930,8 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
                 train_features=X_train, train_target=y_train, # type: ignore
                 test_features=X_test, test_target=y_test,
                 feature_names=feature_names,target_name=target_name,
-                debug=debug, save_dir=
+                debug=debug, save_dir=save_path, save_model=save_model)
     print("\n✅ Training and evaluation complete.")
-
-
-def _check_paths(datasets_dir: str, save_dir:str):
-    if not os.path.isdir(save_dir):
-        os.makedirs(save_dir)
-    if not os.path.isdir(datasets_dir):
-        raise IOError(f"Datasets directory '{datasets_dir}' not found.")
 
 
 def info():
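The recurring change in this module is the move from `os.path` string handling to `pathlib` via a `make_fullpath` helper imported from `.utilities`. Its implementation belongs to ml_tools/utilities.py and is not part of this diff; the sketch below is an assumption inferred only from the call sites above (it returns a resolved `Path`, and `make=True` appears to create the directory), not the library's actual code.

from pathlib import Path
from typing import Union

def make_fullpath(input_path: Union[str, Path], make: bool = False) -> Path:
    # Hypothetical sketch: the real helper lives in ml_tools/utilities.py and is not shown in this diff.
    path = Path(input_path).expanduser().resolve()
    if make:
        # Inferred from call sites such as make_fullpath(save_dir, make=True): ensure the directory exists.
        path.mkdir(parents=True, exist_ok=True)
    return path

# Usage mirroring the hunks above (the directory name is a placeholder):
save_path = make_fullpath("results/run_01", make=True)
report_path = save_path / "Classification_Report_target.txt"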
ml_tools/handle_excel.py
CHANGED
@@ -1,11 +1,12 @@
-import os
+from pathlib import Path
 from openpyxl import load_workbook, Workbook
 import pandas as pd
-from typing import List, Optional
-from .utilities import _script_info, sanitize_filename
+from typing import List, Optional, Union
+from .utilities import _script_info, sanitize_filename, make_fullpath
 
 
 __all__ = [
+    "find_excel_files",
     "unmerge_and_split_excel",
     "unmerge_and_split_from_directory",
     "validate_excel_schema",
@@ -14,20 +15,55 @@ __all__ = [
 ]
 
 
-def
+def find_excel_files(
+    directory: Union[str, Path],
+    *,
+    extensions: tuple[str, ...] = (".xlsx", ".xls"),
+    exclude_temp: bool = True
+) -> list[Path]:
+    """
+    Returns a list of Excel file Paths in the specified directory.
+
+    Parameters:
+        directory (str | Path): Directory to search.
+        extensions (tuple[str, ...]): Valid Excel file extensions (default: .xlsx, .xls).
+        exclude_temp (bool): Whether to exclude files that start with '~'.
+
+    Returns:
+        list[Path]: List of Excel file paths matching criteria.
+    """
+    input_path = make_fullpath(directory)
+
+    if not input_path.is_dir():
+        raise NotADirectoryError(f"Directory not found: {input_path}")
+
+    excel_files = [
+        f for f in input_path.iterdir()
+        if f.is_file()
+        and f.suffix.lower() in extensions
+        and (not f.name.startswith('~') if exclude_temp else True)
+    ]
+
+    if not excel_files:
+        raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+
+    return excel_files
+
+
+def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
     """
     Processes a single Excel file:
-    - Unmerges all merged cells (vertical and horizontal),
-    -
-    - Splits each sheet into a separate Excel file,
+    - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value.
+    - Splits each sheet into a separate Excel file.
     - Saves all results in the same directory as the input file.
 
     Parameters:
-        filepath (str): Full path to the Excel file to process.
+        filepath (str | Path): Full path to the Excel file to process.
     """
-
-
-
+    file_path = make_fullpath(filepath)
+    wb = load_workbook(file_path)
+    base_dir = file_path.parent
+    base_name = file_path.stem
 
     total_output_files = 0
 
@@ -56,40 +92,37 @@ def unmerge_and_split_excel(filepath: str) -> None:
         # Construct flat output file name
         sanitized_sheet_name = sanitize_filename(sheet_name)
         output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
-        output_path =
+        output_path = base_dir / output_filename
         new_wb.save(output_path)
 
         # print(f"Saved: {output_path}")
         total_output_files += 1
 
-    print(f"✅ Processed file: {
+    print(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
     return None
 
 
-def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
+def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Union[str,Path]) -> None:
     """
     Processes all Excel files in the input directory:
-    - Unmerges all merged cells (vertical and horizontal),
-    -
-    - Splits each sheet into separate Excel files,
+    - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value,
+    - Splits each sheet into separate Excel files.
     - Saves all results into the output directory.
 
     Parameters:
-        input_dir (str): Directory containing Excel files to process.
-        output_dir (str): Directory to save processed Excel files.
+        input_dir (str | Path): Directory containing Excel files to process.
+        output_dir (str | Path): Directory to save processed Excel files.
     """
-
-
-
-
-        raise FileNotFoundError(f"No valid Excel files found in directory: {input_dir}")
+    global_input_path = make_fullpath(input_dir)
+    global_output_path = make_fullpath(output_dir, make=True)
+
+    excel_files = find_excel_files(global_input_path)
 
-    os.makedirs(output_dir, exist_ok=True)
     total_output_files = 0
 
     for file_path in excel_files:
         wb = load_workbook(file_path)
-        base_name =
+        base_name = file_path.stem
 
         for sheet_name in wb.sheetnames:
             ws = wb[sheet_name]
@@ -116,7 +149,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
             # Construct flat output file name
             sanitized_sheet_name = sanitize_filename(sheet_name)
             output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
-            output_path =
+            output_path = global_output_path / output_filename
             new_wb.save(output_path)
 
             # print(f"Saved: {output_path}")
@@ -127,7 +160,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
 
 
 def validate_excel_schema(
-    target_dir: str,
+    target_dir: Union[str,Path],
     expected_columns: List[str],
     strict: bool = False
 ) -> None:
@@ -135,7 +168,7 @@ def validate_excel_schema(
     Validates that each Excel file in a directory conforms to the expected column schema.
 
     Parameters:
-        target_dir (str): Path to the directory containing Excel files.
+        target_dir (str | Path): Path to the directory containing Excel files.
         expected_columns (list[str]): List of expected column names.
         strict (bool): If True, columns must match exactly (names and order).
                        If False, columns must contain at least all expected names.
@@ -143,52 +176,46 @@ def validate_excel_schema(
     Returns:
         List[str]: List of file paths that failed the schema validation.
     """
-    invalid_files = []
+    invalid_files: list[Path] = []
     expected_set = set(expected_columns)
 
-
-
-
-
-
-
-        if filename.startswith("~"): # Skip temporary files
-            continue
-
-        file_path = os.path.join(target_dir, filename)
-        excel_seen += 1
+    target_path = make_fullpath(target_dir)
+
+    excel_paths = find_excel_files(target_path)
+
+    for file in excel_paths:
         try:
-            wb = load_workbook(
+            wb = load_workbook(file, read_only=True)
             ws = wb.active # Only check the first worksheet
 
             header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
 
             if strict:
                 if header != expected_columns:
-                    invalid_files.append(
+                    invalid_files.append(file)
             else:
                 header_set = set(header)
                 if not expected_set.issubset(header_set):
-                    invalid_files.append(
+                    invalid_files.append(file)
 
         except Exception as e:
-            print(f"Error processing '{
-            invalid_files.append(
+            print(f"Error processing '{file}': {e}")
+            invalid_files.append(file)
 
-    valid_excel_number =
-    print(f"{valid_excel_number} out of {
+    valid_excel_number = len(excel_paths) - len(invalid_files)
+    print(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
     if invalid_files:
         print(f"⚠️ {len(invalid_files)} excel files are invalid:")
-        for
-            print(f" - {
+        for in_file in invalid_files:
+            print(f"  - {in_file.name}")
 
     return None
 
 
 def vertical_merge_transform_excel(
-    target_dir: str,
+    target_dir: Union[str,Path],
     csv_filename: str,
-    output_dir: str,
+    output_dir: Union[str,Path],
     target_columns: Optional[List[str]] = None,
     rename_columns: Optional[List[str]] = None
 ) -> None:
@@ -201,35 +228,31 @@ def vertical_merge_transform_excel(
     - If `rename_columns` is provided, it must match the length of `target_columns` (if used) or the original columns. Names match by position.
 
     Parameters:
-        target_dir (str): Directory containing Excel files.
+        target_dir (str | Path): Directory containing Excel files.
         csv_filename (str): Output CSV filename.
-        output_dir (str): Directory to save the output CSV file.
+        output_dir (str | Path): Directory to save the output CSV file.
         target_columns (list[str] | None): Columns to select from each Excel file.
         rename_columns (list[str] | None): Optional renaming for columns. Position-based matching.
     """
-
-    excel_files =
-
-    if not excel_files:
-        raise ValueError("No Excel files found in the target directory.")
+    target_path = make_fullpath(target_dir)
+    excel_files = find_excel_files(target_path)
 
     # sanitize filename
     csv_filename = sanitize_filename(csv_filename)
-    # make directory
-
+    # make output directory
+    output_path = make_fullpath(output_dir, make=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
-    csv_path =
+    csv_path = output_path / csv_filename
 
     dataframes = []
     for file in excel_files:
-
-        df = pd.read_excel(file_path, engine='openpyxl')
+        df = pd.read_excel(file, engine='openpyxl')
 
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-                raise ValueError(f"
+                raise ValueError(f"Invalid columns in {file.name}: {missing}")
             df = df[target_columns]
 
         dataframes.append(df)
@@ -239,7 +262,7 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-            raise ValueError("Length of rename_columns must match the selected columns")
+            raise ValueError("Length of 'rename_columns' must match the selected columns")
         merged_df.columns = rename_columns
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -247,9 +270,9 @@
 
 
 def horizontal_merge_transform_excel(
-    target_dir: str,
+    target_dir: Union[str,Path],
     csv_filename: str,
-    output_dir: str,
+    output_dir: Union[str,Path],
     drop_columns: Optional[list[str]] = None,
     skip_duplicates: bool = False
 ) -> None:
@@ -265,31 +288,28 @@ def horizontal_merge_transform_excel(
         If True, only the first occurrence of each column name is kept.
 
     Parameters:
-        target_dir (str): Directory containing Excel files.
+        target_dir (str | Path): Directory containing Excel files.
         csv_filename (str): Name of the output CSV file.
-        output_dir (str): Directory to save the output CSV file.
+        output_dir (str | Path): Directory to save the output CSV file.
         drop_columns (list[str] | None): Columns to exclude from each file before merging.
         skip_duplicates (bool): Whether to skip duplicate columns or rename them.
     """
-
-    excel_files =
-    if not excel_files:
-        raise ValueError("No Excel files found in the target directory.")
+    target_path = make_fullpath(target_dir)
+    excel_files = find_excel_files(target_path)
 
     # sanitize filename
     csv_filename = sanitize_filename(csv_filename)
     # make directory
-
+    output_path = make_fullpath(output_dir, make=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
-    csv_path =
+    csv_path = output_path / csv_filename
 
     dataframes = []
     max_rows = 0
 
     for file in excel_files:
-
-        df = pd.read_excel(file_path, engine='openpyxl')
+        df = pd.read_excel(file, engine='openpyxl')
 
         if drop_columns is not None:
             df = df.drop(columns=[col for col in drop_columns if col in df.columns])