dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-2.2.0.dist-info/RECORD +21 -0
- ml_tools/ETL_engineering.py +543 -0
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +15 -15
- ml_tools/VIF_factor.py +20 -17
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +165 -60
- dragon_ml_toolbox-2.0.0.dist-info/RECORD +0 -20
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_learning.py
CHANGED
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
 from matplotlib.colors import Colormap
 from matplotlib import rcdefaults
 
-import os
+from pathlib import Path
 from typing import Literal, Union, Optional, Iterator, Tuple
 
 from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -19,7 +19,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap
 
-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object, make_fullpath
 
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -469,30 +469,31 @@ def _train_model(model, train_features, train_target):
     return model
 
 # handle local directories
-def _local_directories(model_name: str, dataset_id: str, save_dir: str):
-
-    if not os.path.isdir(dataset_dir):
-        os.makedirs(dataset_dir)
+def _local_directories(model_name: str, dataset_id: str, save_dir: Union[str,Path]):
+    save_path = make_fullpath(save_dir, make=True)
 
-
-
-
+    dataset_dir = save_path / dataset_id
+    dataset_dir.mkdir(parents=True, exist_ok=True)
+
+    model_dir = dataset_dir / model_name
+    model_dir.mkdir(parents=True, exist_ok=True)
 
     return model_dir
 
 # save model
-def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
+def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: Union[str,Path]):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
     filename = f"{model_name}_{sanitized_target_name}"
     to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
+
     serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str,Path],
     x_test_scaled: np.ndarray,
     single_y_test: np.ndarray,
     target_name: str,
@@ -524,7 +525,7 @@ def evaluate_model_classification(
     Returns:
         y_pred: Predicted class labels
     """
-
+    save_path = make_fullpath(save_dir, make=True)
 
     y_pred = model.predict(x_test_scaled)
     accuracy = accuracy_score(single_y_test, y_pred)
@@ -538,7 +539,7 @@ def evaluate_model_classification(
 
     # Save text report
     sanitized_target_name = sanitize_filename(target_name)
-    report_path =
+    report_path = save_path / f"Classification_Report_{sanitized_target_name}.txt"
    with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
         f.write("Classification Report:\n")
@@ -568,7 +569,7 @@ def evaluate_model_classification(
         text.set_fontsize(base_fontsize+4)
 
     fig.tight_layout()
-    fig_path =
+    fig_path = save_path / f"Confusion_Matrix_{sanitized_target_name}.svg"
     fig.savefig(fig_path, format="svg", bbox_inches="tight")
     plt.close(fig)
 
@@ -580,7 +581,7 @@ def plot_roc_curve(
     probabilities_or_model: Union[np.ndarray, xgb.XGBClassifier, lgb.LGBMClassifier, object],
     model_name: str,
     target_name: str,
-    save_directory: str,
+    save_directory: Union[str,Path],
     color: str = "darkorange",
     figure_size: tuple = (10, 10),
     linewidth: int = 2,
@@ -594,7 +595,7 @@ def plot_roc_curve(
         true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
         probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
         target_name: str, Target name.
-        save_directory: str, path to directory where figure is saved.
+        save_directory: str or Path, path to directory where figure is saved.
         color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
             - Named colors: "darkorange", "blue", "red", "green", "black"
             - Hex codes: "#1f77b4", "#ff7f0e"
@@ -650,17 +651,17 @@ def plot_roc_curve(
     ax.grid(True)
 
     # Save figure
-
+    save_path = make_fullpath(save_directory, make=True)
     sanitized_target_name = sanitize_filename(target_name)
-
-    fig.savefig(
+    full_save_path = save_path / f"ROC_{sanitized_target_name}.svg"
+    fig.savefig(full_save_path, bbox_inches="tight", format="svg")
 
     return fig
 
 
 # function to evaluate the model and save metrics (Regression)
 def evaluate_model_regression(model, model_name: str,
-                              save_dir: str,
+                              save_dir: Union[str,Path],
                               x_test_scaled: np.ndarray, single_y_test: np.ndarray,
                               target_name: str,
                               figure_size: tuple = (12, 8),
@@ -677,7 +678,8 @@ def evaluate_model_regression(model, model_name: str,
 
     # Create formatted report
     sanitized_target_name = sanitize_filename(target_name)
-
+    save_path = make_fullpath(save_dir, make=True)
+    report_path = save_path / f"Regression_Report_{sanitized_target_name}.txt"
     with open(report_path, "w") as f:
         f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -695,7 +697,8 @@ def evaluate_model_regression(model, model_name: str,
     plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
-
+    residual_path = save_path / f"Residual_Plot_{sanitized_target_name}.svg"
+    plt.savefig(residual_path, bbox_inches='tight', format="svg")
     plt.close()
 
     # Create true vs predicted values plot
@@ -708,7 +711,7 @@ def evaluate_model_regression(model, model_name: str,
     plt.ylabel('Predictions', fontsize=base_fontsize)
     plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
-    plot_path =
+    plot_path = save_path / f"Regression_Plot_{sanitized_target_name}.svg"
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
     plt.close()
 
@@ -719,7 +722,7 @@ def evaluate_model_regression(model, model_name: str,
 def get_shap_values(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str, Path],
     features_to_explain: np.ndarray,
     feature_names: list[str],
     target_name: str,
@@ -737,11 +740,12 @@ def get_shap_values(
     * Use the entire dataset to get the global view.
 
     Parameters:
-        task: 'regression' or 'classification'
+        task: 'regression' or 'classification'.
         features_to_explain: Should match the model's training data format, including scaling.
-        save_dir: Directory to save visualizations
+        save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
+    global_save_path = make_fullpath(save_dir, make=True)
 
     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -759,7 +763,7 @@ def get_shap_values(
         plt.rc('legend', fontsize=base_fontsize)
         plt.rc('figure', titlesize=base_fontsize)
 
-    def _create_shap_plot(shap_values, features, save_path:
+    def _create_shap_plot(shap_values, features, save_path: Path, plot_type: str, title: str):
         _apply_plot_style()
         _configure_rcparams()
         plt.figure(figsize=figsize)
@@ -804,7 +808,7 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=class_shap,
                 features=features_to_explain,
-                save_path=
+                save_path=global_save_path / f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg",
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name} (Class {class_name})"
             )
@@ -814,7 +818,7 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=values,
                 features=features_to_explain,
-                save_path=
+                save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
@@ -824,7 +828,7 @@ def get_shap_values(
         _create_shap_plot(
             shap_values=shap_values,
             features=features_to_explain,
-            save_path=
+            save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
             plot_type=plot_type,
             title=f"{model_name} - {target_name}"
         )
@@ -848,7 +852,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
                         train_features: np.ndarray, train_target: np.ndarray,
                         test_features: np.ndarray, test_target: np.ndarray,
                         feature_names: list[str], target_name: str,
-                        save_dir: str,
+                        save_dir: Union[str,Path],
                         debug: bool=False, save_model: bool=False):
     '''
    1. Train model.
@@ -889,7 +893,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
     return trained_model, y_pred
 
 ###### 5. Execution ######
-def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
                           handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
                          test_size: float=0.2, debug:bool=False):
     #Check models
@@ -907,10 +911,11 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
         raise TypeError(f"Unrecognized model {type(model_object)}")
 
     #Check paths
-
+    datasets_path = make_fullpath(datasets_dir)
+    save_path = make_fullpath(save_dir, make=True)
 
     #Yield imputed dataset
-    for dataframe, dataframe_name in yield_dataframes_from_dir(
+    for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
         #Yield features dataframe and target dataframe
         for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
             #Dataset pipeline
@@ -925,15 +930,8 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
                 train_features=X_train, train_target=y_train, # type: ignore
                 test_features=X_test, test_target=y_test,
                 feature_names=feature_names,target_name=target_name,
-                debug=debug, save_dir=
+                debug=debug, save_dir=save_path, save_model=save_model)
     print("\n✅ Training and evaluation complete.")
-
-
-def _check_paths(datasets_dir: str, save_dir:str):
-    if not os.path.isdir(save_dir):
-        os.makedirs(save_dir)
-    if not os.path.isdir(datasets_dir):
-        raise IOError(f"Datasets directory '{datasets_dir}' not found.")
 
 
 def info():
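The recurring change in this module is the move from `os.path` string handling to `pathlib` via a `make_fullpath` helper imported from `.utilities`. Its implementation belongs to ml_tools/utilities.py and is not part of this diff; the sketch below is an assumption inferred only from the call sites above (it returns a resolved `Path`, and `make=True` appears to create the directory), not the library's actual code.

from pathlib import Path
from typing import Union

def make_fullpath(input_path: Union[str, Path], make: bool = False) -> Path:
    # Hypothetical sketch: the real helper lives in ml_tools/utilities.py and is not shown in this diff.
    path = Path(input_path).expanduser().resolve()
    if make:
        # Inferred from call sites such as make_fullpath(save_dir, make=True): ensure the directory exists.
        path.mkdir(parents=True, exist_ok=True)
    return path

# Usage mirroring the hunks above (the directory name is a placeholder):
save_path = make_fullpath("results/run_01", make=True)
report_path = save_path / "Classification_Report_target.txt"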
ml_tools/handle_excel.py
CHANGED
@@ -1,11 +1,12 @@
-import os
+from pathlib import Path
 from openpyxl import load_workbook, Workbook
 import pandas as pd
-from typing import List, Optional
-from .utilities import _script_info, sanitize_filename
+from typing import List, Optional, Union
+from .utilities import _script_info, sanitize_filename, make_fullpath
 
 
 __all__ = [
+    "find_excel_files",
     "unmerge_and_split_excel",
     "unmerge_and_split_from_directory",
     "validate_excel_schema",
@@ -14,20 +15,55 @@ __all__ = [
 ]
 
 
-def
+def find_excel_files(
+    directory: Union[str, Path],
+    *,
+    extensions: tuple[str, ...] = (".xlsx", ".xls"),
+    exclude_temp: bool = True
+) -> list[Path]:
+    """
+    Returns a list of Excel file Paths in the specified directory.
+
+    Parameters:
+        directory (str | Path): Directory to search.
+        extensions (tuple[str, ...]): Valid Excel file extensions (default: .xlsx, .xls).
+        exclude_temp (bool): Whether to exclude files that start with '~'.
+
+    Returns:
+        list[Path]: List of Excel file paths matching criteria.
+    """
+    input_path = make_fullpath(directory)
+
+    if not input_path.is_dir():
+        raise NotADirectoryError(f"Directory not found: {input_path}")
+
+    excel_files = [
+        f for f in input_path.iterdir()
+        if f.is_file()
+        and f.suffix.lower() in extensions
+        and (not f.name.startswith('~') if exclude_temp else True)
+    ]
+
+    if not excel_files:
+        raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+
+    return excel_files
+
+
+def unmerge_and_split_excel(filepath: Union[str,Path]) -> None:
     """
     Processes a single Excel file:
-    - Unmerges all merged cells (vertical and horizontal),
-    -
-    - Splits each sheet into a separate Excel file,
+    - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value.
+    - Splits each sheet into a separate Excel file.
     - Saves all results in the same directory as the input file.
 
     Parameters:
-        filepath (str): Full path to the Excel file to process.
+        filepath (str | Path): Full path to the Excel file to process.
     """
-
-
-
+    file_path = make_fullpath(filepath)
+    wb = load_workbook(file_path)
+    base_dir = file_path.parent
+    base_name = file_path.stem
 
     total_output_files = 0
 
@@ -56,40 +92,37 @@ def unmerge_and_split_excel(filepath: str) -> None:
         # Construct flat output file name
         sanitized_sheet_name = sanitize_filename(sheet_name)
         output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
-        output_path =
+        output_path = base_dir / output_filename
         new_wb.save(output_path)
 
         # print(f"Saved: {output_path}")
         total_output_files += 1
 
-    print(f"✅ Processed file: {
+    print(f"✅ Processed file: {file_path} into {total_output_files} output file(s).")
     return None
 
 
-def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
+def unmerge_and_split_from_directory(input_dir: Union[str,Path], output_dir: Union[str,Path]) -> None:
     """
     Processes all Excel files in the input directory:
-    - Unmerges all merged cells (vertical and horizontal),
-    -
-    - Splits each sheet into separate Excel files,
+    - Unmerges all merged cells (vertical and horizontal), fills each merged region with the top-left cell value,
+    - Splits each sheet into separate Excel files.
     - Saves all results into the output directory.
 
     Parameters:
-        input_dir (str): Directory containing Excel files to process.
-        output_dir (str): Directory to save processed Excel files.
+        input_dir (str | Path): Directory containing Excel files to process.
+        output_dir (str | Path): Directory to save processed Excel files.
     """
-
-
-
-
-        raise FileNotFoundError(f"No valid Excel files found in directory: {input_dir}")
+    global_input_path = make_fullpath(input_dir)
+    global_output_path = make_fullpath(output_dir, make=True)
+
+    excel_files = find_excel_files(global_input_path)
 
-    os.makedirs(output_dir, exist_ok=True)
     total_output_files = 0
 
     for file_path in excel_files:
         wb = load_workbook(file_path)
-        base_name =
+        base_name = file_path.stem
 
         for sheet_name in wb.sheetnames:
             ws = wb[sheet_name]
@@ -116,7 +149,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
             # Construct flat output file name
             sanitized_sheet_name = sanitize_filename(sheet_name)
             output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
-            output_path =
+            output_path = global_output_path / output_filename
             new_wb.save(output_path)
 
             # print(f"Saved: {output_path}")
@@ -127,7 +160,7 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
 
 
 def validate_excel_schema(
-    target_dir: str,
+    target_dir: Union[str,Path],
     expected_columns: List[str],
     strict: bool = False
 ) -> None:
@@ -135,7 +168,7 @@ def validate_excel_schema(
     Validates that each Excel file in a directory conforms to the expected column schema.
 
     Parameters:
-        target_dir (str): Path to the directory containing Excel files.
+        target_dir (str | Path): Path to the directory containing Excel files.
         expected_columns (list[str]): List of expected column names.
         strict (bool): If True, columns must match exactly (names and order).
                        If False, columns must contain at least all expected names.
@@ -143,52 +176,46 @@ def validate_excel_schema(
     Returns:
         List[str]: List of file paths that failed the schema validation.
     """
-    invalid_files = []
+    invalid_files: list[Path] = []
     expected_set = set(expected_columns)
 
-
-
-
-
-
-
-        if filename.startswith("~"): # Skip temporary files
-            continue
-
-        file_path = os.path.join(target_dir, filename)
-        excel_seen += 1
+    target_path = make_fullpath(target_dir)
+
+    excel_paths = find_excel_files(target_path)
+
+    for file in excel_paths:
         try:
-            wb = load_workbook(
+            wb = load_workbook(file, read_only=True)
             ws = wb.active # Only check the first worksheet
 
             header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
 
             if strict:
                 if header != expected_columns:
-                    invalid_files.append(
+                    invalid_files.append(file)
             else:
                 header_set = set(header)
                 if not expected_set.issubset(header_set):
-                    invalid_files.append(
+                    invalid_files.append(file)
 
         except Exception as e:
-            print(f"Error processing '{
-            invalid_files.append(
+            print(f"Error processing '{file}': {e}")
+            invalid_files.append(file)
 
-    valid_excel_number =
-    print(f"{valid_excel_number} out of {
+    valid_excel_number = len(excel_paths) - len(invalid_files)
+    print(f"{valid_excel_number} out of {len(excel_paths)} excel files conform to the schema.")
     if invalid_files:
         print(f"⚠️ {len(invalid_files)} excel files are invalid:")
-        for
-            print(f" - {
+        for in_file in invalid_files:
+            print(f"  - {in_file.name}")
 
     return None
 
 
 def vertical_merge_transform_excel(
-    target_dir: str,
+    target_dir: Union[str,Path],
     csv_filename: str,
-    output_dir: str,
+    output_dir: Union[str,Path],
     target_columns: Optional[List[str]] = None,
     rename_columns: Optional[List[str]] = None
 ) -> None:
@@ -201,35 +228,31 @@ def vertical_merge_transform_excel(
     - If `rename_columns` is provided, it must match the length of `target_columns` (if used) or the original columns. Names match by position.
 
     Parameters:
-        target_dir (str): Directory containing Excel files.
+        target_dir (str | Path): Directory containing Excel files.
         csv_filename (str): Output CSV filename.
-        output_dir (str): Directory to save the output CSV file.
+        output_dir (str | Path): Directory to save the output CSV file.
         target_columns (list[str] | None): Columns to select from each Excel file.
         rename_columns (list[str] | None): Optional renaming for columns. Position-based matching.
     """
-
-    excel_files =
-
-    if not excel_files:
-        raise ValueError("No Excel files found in the target directory.")
+    target_path = make_fullpath(target_dir)
+    excel_files = find_excel_files(target_path)
 
     # sanitize filename
     csv_filename = sanitize_filename(csv_filename)
-    # make directory
-
+    # make output directory
+    output_path = make_fullpath(output_dir, make=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
-    csv_path =
+    csv_path = output_path / csv_filename
 
     dataframes = []
     for file in excel_files:
-
-        df = pd.read_excel(file_path, engine='openpyxl')
+        df = pd.read_excel(file, engine='openpyxl')
 
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-                raise ValueError(f"
+                raise ValueError(f"Invalid columns in {file.name}: {missing}")
             df = df[target_columns]
 
         dataframes.append(df)
@@ -239,7 +262,7 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-            raise ValueError("Length of rename_columns must match the selected columns")
+            raise ValueError("Length of 'rename_columns' must match the selected columns")
         merged_df.columns = rename_columns
 
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -247,9 +270,9 @@
 
 
 def horizontal_merge_transform_excel(
-    target_dir: str,
+    target_dir: Union[str,Path],
     csv_filename: str,
-    output_dir: str,
+    output_dir: Union[str,Path],
     drop_columns: Optional[list[str]] = None,
     skip_duplicates: bool = False
 ) -> None:
@@ -265,31 +288,28 @@ def horizontal_merge_transform_excel(
         If True, only the first occurrence of each column name is kept.
 
     Parameters:
-        target_dir (str): Directory containing Excel files.
+        target_dir (str | Path): Directory containing Excel files.
         csv_filename (str): Name of the output CSV file.
-        output_dir (str): Directory to save the output CSV file.
+        output_dir (str | Path): Directory to save the output CSV file.
         drop_columns (list[str] | None): Columns to exclude from each file before merging.
         skip_duplicates (bool): Whether to skip duplicate columns or rename them.
     """
-
-    excel_files =
-    if not excel_files:
-        raise ValueError("No Excel files found in the target directory.")
+    target_path = make_fullpath(target_dir)
+    excel_files = find_excel_files(target_path)
 
     # sanitize filename
     csv_filename = sanitize_filename(csv_filename)
     # make directory
-
+    output_path = make_fullpath(output_dir, make=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
-    csv_path =
+    csv_path = output_path / csv_filename
 
     dataframes = []
     max_rows = 0
 
     for file in excel_files:
-
-        df = pd.read_excel(file_path, engine='openpyxl')
+        df = pd.read_excel(file, engine='openpyxl')
 
         if drop_columns is not None:
             df = df.drop(columns=[col for col in drop_columns if col in df.columns])