dragon-ml-toolbox 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/METADATA +19 -2
- dragon_ml_toolbox-1.4.1.dist-info/RECORD +19 -0
- ml_tools/MICE_imputation.py +24 -6
- ml_tools/VIF_factor.py +224 -0
- ml_tools/data_exploration.py +74 -286
- ml_tools/datasetmaster.py +13 -1
- ml_tools/ensemble_learning.py +128 -129
- ml_tools/handle_excel.py +32 -9
- ml_tools/logger.py +10 -1
- ml_tools/particle_swarm_optimization.py +71 -34
- ml_tools/pytorch_models.py +13 -1
- ml_tools/trainer.py +10 -30
- ml_tools/utilities.py +122 -14
- ml_tools/vision_helpers.py +14 -1
- dragon_ml_toolbox-1.3.2.dist-info/RECORD +0 -18
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_learning.py
CHANGED
@@ -21,7 +21,7 @@ from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap
 
-from .utilities import yield_dataframes_from_dir
+from .utilities import yield_dataframes_from_dir, sanitize_filename
 
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -139,8 +139,9 @@ def get_models(task: Literal["classification", "regression"], random_state: int=
 
 ###### 3. Process Dataset ######
 # function to split data into train and test
-def _split_data(features, target, test_size, random_state):
-    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state,
+def _split_data(features, target, test_size, random_state, task):
+    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state,
+                                                         stratify=target if task=="classification" else None)
     return X_train, X_test, y_train, y_test
 
 # function to standardize the data
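Note: `_split_data` now stratifies classification splits so that train and test sets keep the original class proportions. A minimal standalone sketch of the behavior, with toy data:

```python
# Minimal sketch of the stratified split now used by _split_data (toy data).
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({"x": range(10), "y": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]})
X_train, X_test, y_train, y_test = train_test_split(
    df[["x"]], df["y"],
    test_size=0.2,
    random_state=101,
    stratify=df["y"],  # keep the 50/50 class ratio in both splits
)
print(y_test.value_counts())  # one sample of each class
```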
@@ -176,7 +177,7 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
     else:
         raise ValueError(f"Invalid resampling strategy: {strategy}")
 
-    X_res, y_res = resample_algorithm.fit_resample(X_train_scaled, y_train)
+    X_res, y_res, *_ = resample_algorithm.fit_resample(X_train_scaled, y_train)
     return X_res, y_res
 
 # DATASET PIPELINE
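Note: the starred unpacking makes `_resample` tolerant of resamplers whose `fit_resample` returns more than the `(X, y)` pair. A minimal sketch, assuming imbalanced-learn's SMOTE:

```python
# Minimal sketch: fit_resample returns (X_resampled, y_resampled); *_ absorbs
# any extra values a resampler variant might append.
import numpy as np
from imblearn.over_sampling import SMOTE

X = np.random.rand(100, 3)
y = np.array([0] * 90 + [1] * 10)  # imbalanced toy labels

X_res, y_res, *_ = SMOTE(random_state=101).fit_resample(X, y)
print(np.bincount(y_res))  # balanced, e.g. [90 90]
```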
@@ -199,7 +200,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
     print(f"\tUnique values for '{df_target.name}': {unique_values}")
 
     #Train test split
-    X_train, X_test, y_train, y_test = _split_data(features=df_features, target=df_target, test_size=test_size, random_state=random_state)
+    X_train, X_test, y_train, y_test = _split_data(features=df_features, target=df_target, test_size=test_size, random_state=random_state, task=task)
 
     #DEBUG
     if debug:
@@ -244,7 +245,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):
 
 # save model
 def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
-
+    #Sanitize filenames to save
+    sanitized_target_name = sanitize_filename(target_name)
+    full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
     joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)
 
 # function to evaluate the model and save metrics (Classification)
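Note: `sanitize_filename` comes from `ml_tools.utilities` and its body is not part of this diff. A plausible minimal version (an assumption, shown only to illustrate why target names need cleaning before being embedded in `.joblib` and report paths):

```python
# Hypothetical sketch of a sanitize_filename helper -- NOT the package's code.
import re

def sanitize_filename(name: str) -> str:
    """Replace path separators, reserved characters, and whitespace with '_'."""
    return re.sub(r'[\\/:"*?<>|\s]+', "_", name.strip())

print(sanitize_filename("Yield (g/L)"))  # -> Yield_(g_L)
```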
@@ -297,7 +300,8 @@ def evaluate_model_classification(
     )
 
     # Save text report
-
+    sanitized_target_id = sanitize_filename(target_id)
+    report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_id}.txt")
     with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
         f.write("Classification Report:\n")
@@ -327,7 +331,7 @@ def evaluate_model_classification(
         text.set_fontsize(title_fontsize+4)
 
     fig.tight_layout()
-    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{
+    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_id}.svg")
     fig.savefig(fig_path, format="svg", bbox_inches="tight")
     plt.close(fig)
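Note: the confusion-matrix figure is saved as SVG under the sanitized target name. A minimal standalone sketch of that save path using sklearn's `ConfusionMatrixDisplay` (toy labels):

```python
# Minimal sketch: render a confusion matrix and save it as SVG (toy labels).
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

y_true = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 1]

fig, ax = plt.subplots(figsize=(10, 10))
ConfusionMatrixDisplay.from_predictions(y_true, y_pred, ax=ax)
fig.tight_layout()
fig.savefig("Confusion_Matrix_example.svg", format="svg", bbox_inches="tight")
plt.close(fig)
```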
@@ -343,8 +347,7 @@ def plot_roc_curve(
     color: str = "darkorange",
     figure_size: tuple = (10, 10),
     linewidth: int = 2,
-
-    label_fontsize: int = 24,
+    base_fontsize: int = 24,
     input_features: Optional[np.ndarray] = None,
 ) -> plt.Figure: # type: ignore
     """
@@ -402,20 +405,22 @@ def plot_roc_curve(
     ax.plot(fpr, tpr, color=color, lw=linewidth, label=f"AUC = {auc_score:.2f}")
     ax.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=1)
 
-    ax.set_title(f"{model_name} - {target_name}", fontsize=
-    ax.set_xlabel("False Positive Rate", fontsize=
-    ax.set_ylabel("True Positive Rate", fontsize=
-    ax.tick_params(axis='both', labelsize=
-    ax.legend(loc="lower right", fontsize=
+    ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
+    ax.set_xlabel("False Positive Rate", fontsize=base_fontsize)
+    ax.set_ylabel("True Positive Rate", fontsize=base_fontsize)
+    ax.tick_params(axis='both', labelsize=base_fontsize)
+    ax.legend(loc="lower right", fontsize=base_fontsize)
     ax.grid(True)
 
     # Save figure
     os.makedirs(save_directory, exist_ok=True)
-
+    sanitized_target_name = sanitize_filename(target_name)
+    save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
     fig.savefig(save_path, bbox_inches="tight", format="svg")
 
     return fig
 
+
 # function to evaluate the model and save metrics (Regression)
 def evaluate_model_regression(model, model_name: str,
                               save_dir: str,
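Note: `plot_roc_curve` is built on sklearn's `roc_curve` and `roc_auc_score`. A minimal standalone sketch of the computation it wraps (toy scores):

```python
# Minimal sketch of the ROC/AUC computation behind plot_roc_curve (toy data).
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

y_true = [0, 0, 1, 1]
scores = [0.1, 0.4, 0.35, 0.8]  # positive-class probabilities

fpr, tpr, _ = roc_curve(y_true, scores)
auc_score = roc_auc_score(y_true, scores)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(fpr, tpr, color="darkorange", lw=2, label=f"AUC = {auc_score:.2f}")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=1)
ax.legend(loc="lower right")
fig.savefig("ROC_example.svg", bbox_inches="tight", format="svg")
plt.close(fig)
```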
@@ -423,8 +428,7 @@ def evaluate_model_regression(model, model_name: str,
     target_id: str,
     figure_size: tuple = (12, 8),
     alpha_transparency: float = 0.5,
-
-    normal_fontsize: int = 24):
+    base_fontsize: int = 24):
     # Generate predictions
     y_pred = model.predict(x_test_scaled)
 
@@ -435,7 +439,8 @@ def evaluate_model_regression(model, model_name: str,
     r2 = r2_score(single_y_test, y_pred)
 
     # Create formatted report
-
+    sanitized_target_id = sanitize_filename(target_id)
+    report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_id}.txt")
     with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_id} Regression Performance\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -448,12 +453,12 @@ def evaluate_model_regression(model, model_name: str,
     plt.figure(figsize=figure_size)
     plt.scatter(y_pred, residuals, alpha=alpha_transparency)
     plt.axhline(0, color='red', linestyle='--')
-    plt.xlabel("Predicted Values", fontsize=
-    plt.ylabel("Residuals", fontsize=
-    plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=
+    plt.xlabel("Predicted Values", fontsize=base_fontsize)
+    plt.ylabel("Residuals", fontsize=base_fontsize)
+    plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
-    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{
+    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_id}.svg"), bbox_inches='tight', format="svg")
    plt.close()
 
     # Create true vs predicted values plot
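Note: the residual plot charts prediction error against predicted value; points scattered evenly around the zero line suggest an unbiased fit. A minimal standalone sketch with toy numbers:

```python
# Minimal sketch of the residual plot produced above (toy numbers).
import numpy as np
import matplotlib.pyplot as plt

y_test = np.array([3.0, 2.5, 4.1, 5.0])
y_pred = np.array([2.8, 2.7, 4.4, 4.6])
residuals = y_test - y_pred

plt.figure(figsize=(12, 8))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(0, color="red", linestyle="--")  # zero-error reference line
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.tight_layout()
plt.savefig("Residual_Plot_example.svg", bbox_inches="tight", format="svg")
plt.close()
```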
@@ -462,63 +467,66 @@ def evaluate_model_regression(model, model_name: str,
     plt.plot([single_y_test.min(), single_y_test.max()],
              [single_y_test.min(), single_y_test.max()],
              'k--', lw=2)
-    plt.xlabel('True Values', fontsize=
-    plt.ylabel('Predictions', fontsize=
-    plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=
+    plt.xlabel('True Values', fontsize=base_fontsize)
+    plt.ylabel('Predictions', fontsize=base_fontsize)
+    plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
     plt.grid(True)
-    plot_path = os.path.join(save_dir, f"Regression_Plot_{
+    plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_id}.svg")
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
     plt.close()
 
     return y_pred
 
+
 # Get SHAP values
-def get_shap_values(
-
-
-
-
-
-
-
-
-
-
-
+def get_shap_values(
+    model,
+    model_name: str,
+    save_dir: str,
+    features_to_explain: np.ndarray,
+    feature_names: list[str],
+    target_id: str,
+    task: Literal["classification", "regression"],
+    max_display_features: int = 10,
+    figsize: tuple = (16, 20),
+    base_fontsize: int = 38,
+    ):
     """
     Universal SHAP explainer for regression and classification.
-
-
-
+    * Use `X_train` (or a subsample of it) to see how the model explains the data it was trained on.
+
+    * Use `X_test` (or a hold-out set) to see how the model explains unseen data.
+
+    * Use the entire dataset to get the global view.
 
     Parameters:
-
-
-
+        task: 'regression' or 'classification'
+        features_to_explain: Should match the model's training data format, including scaling.
+        save_dir: Directory to save visualizations
     """
-
-
-
-
-
-    preferred_styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
-    for style in preferred_styles:
+    sanitized_target_id = sanitize_filename(target_id)
+
+    def _apply_plot_style():
+        styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
+        for style in styles:
             if style in plt.style.available or style == 'default':
                 plt.style.use(style)
                 break
-
+
+    def _configure_rcparams():
+        plt.rc('font', size=base_fontsize)
+        plt.rc('axes', titlesize=base_fontsize)
+        plt.rc('axes', labelsize=base_fontsize)
+        plt.rc('xtick', labelsize=base_fontsize)
+        plt.rc('ytick', labelsize=base_fontsize + 2)
+        plt.rc('legend', fontsize=base_fontsize)
+        plt.rc('figure', titlesize=base_fontsize)
+
+    def _create_shap_plot(shap_values, features, save_path: str, plot_type: str, title: str):
+        _apply_plot_style()
+        _configure_rcparams()
         plt.figure(figsize=figsize)
-
-        #set rc parameters for better readability
-        plt.rc('font', size=label_fontsize)
-        plt.rc('axes', titlesize=title_fontsize)
-        plt.rc('axes', labelsize=label_fontsize)
-        plt.rc('xtick', labelsize=label_fontsize)
-        plt.rc('ytick', labelsize=label_fontsize)
-        plt.rc('legend', fontsize=label_fontsize)
-        plt.rc('figure', titlesize=title_fontsize)
-
-        # Create the SHAP plot
+
         shap.summary_plot(
             shap_values=shap_values,
             features=features,
@@ -528,85 +536,76 @@ def get_shap_values(model, model_name: str,
             plot_size=figsize,
             max_display=max_display_features,
             alpha=0.7,
-            color=
+            # color='viridis'
         )
-
-        # Add professional styling
+
         ax = plt.gca()
-        ax.set_xlabel("SHAP Value Impact", fontsize=
-
-
-
-        # Manually fix tick fonts
+        ax.set_xlabel("SHAP Value Impact", fontsize=base_fontsize + 2, weight='bold', labelpad=20)
+        plt.title(title, fontsize=base_fontsize + 2, pad=20, weight='bold')
+
         for tick in ax.get_xticklabels():
-            tick.set_fontsize(
-            tick.set_rotation(
+            tick.set_fontsize(base_fontsize)
+            tick.set_rotation(30)
         for tick in ax.get_yticklabels():
-            tick.set_fontsize(
+            tick.set_fontsize(base_fontsize + 2)
 
-        # Handle colorbar for dot plots
         if plot_type == "dot":
             cb = plt.gcf().axes[-1]
-            # cb.set_ylabel("Feature Value", size=label_fontsize)
             cb.set_ylabel("", size=1)
-            cb.tick_params(labelsize=
-
-
-        plt.savefig(
-            full_save_path,
-            bbox_inches='tight',
-            facecolor='white',
-            format="svg"
-        )
+            cb.tick_params(labelsize=base_fontsize - 2)
+
+        plt.savefig(save_path, bbox_inches='tight', facecolor='white', format="svg")
         plt.close()
-        rcdefaults()
-
-
-
-
-
-        # Handle different model types
-        if task == 'classification':
-            # Determine if multiclass
-            try:
-                is_multiclass = len(model.classes_) > 2
-                class_names = model.classes_
-            except AttributeError:
-                is_multiclass = isinstance(shap_values, list) and len(shap_values) > 1
-                class_names = list(range(len(shap_values))) if is_multiclass else [0, 1]
-
+        rcdefaults()
+
+    def _plot_for_classification(shap_values, class_names):
+        is_multiclass = isinstance(shap_values, list) and len(shap_values) > 1
+
         if is_multiclass:
-            for
+            for class_shap, class_name in zip(shap_values, class_names):
+                for plot_type in ["bar", "dot"]:
+                    _create_shap_plot(
+                        shap_values=class_shap,
+                        features=features_to_explain,
+                        save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_Class{class_name}_{plot_type}.svg"),
+                        plot_type=plot_type,
+                        title=f"{model_name} - {target_id} (Class {class_name})"
+                    )
+        else:
+            values = shap_values[1] if isinstance(shap_values, list) else shap_values
+            for plot_type in ["bar", "dot"]:
                 _create_shap_plot(
-                    shap_values=
+                    shap_values=values,
                    features=features_to_explain,
-
-                    full_save_path=os.path.join(save_dir, f"SHAP_{target_id}_Class{class_name}.svg"),
+                    save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
                     plot_type=plot_type,
-                    title=f"{model_name} - {target_id}
+                    title=f"{model_name} - {target_id}"
                 )
-
-
-
+
+    def _plot_for_regression(shap_values):
+        for plot_type in ["bar", "dot"]:
             _create_shap_plot(
-                shap_values=
+                shap_values=shap_values,
                 features=features_to_explain,
-
-                full_save_path=os.path.join(save_dir, f"SHAP_{target_id}.svg"),
+                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
                 plot_type=plot_type,
                 title=f"{model_name} - {target_id}"
             )
-
-
-
-
-
-
-
-
-
-
-
+    #START_O
+
+    explainer = shap.TreeExplainer(model)
+    shap_values = explainer.shap_values(features_to_explain)
+
+    if task == 'classification':
+        try:
+            class_names = model.classes_ if hasattr(model, 'classes_') else list(range(len(shap_values)))
+        except Exception:
+            class_names = list(range(len(shap_values)))
+        _plot_for_classification(shap_values, class_names)
+    else:
+        _plot_for_regression(shap_values)
+
+
 # TRAIN TEST PIPELINE
 def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
                         train_features: np.ndarray, train_target: np.ndarray,
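Note: the refactored `get_shap_values` drives everything through `shap.TreeExplainer`. A minimal standalone sketch of that flow (toy model and data; with some shap versions, classification output is a per-class list, which the `isinstance(..., list)` checks above accommodate):

```python
# Minimal sketch of the TreeExplainer flow used by get_shap_values (toy model).
import shap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=101)
model = RandomForestClassifier(random_state=101).fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Depending on the shap version, classification may yield a list of per-class
# arrays; mirror the module's own fallback when it does.
values = shap_values[1] if isinstance(shap_values, list) else shap_values
shap.summary_plot(values, features=X, plot_type="bar", show=False)
```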
@@ -653,7 +652,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
     return trained_model, y_pred
 
 ###### 5. Execution ######
-def
+def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
                          resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, scaler: Literal["standard", "minmax", "maxabs"]="minmax", save_model: bool=False,
                          test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
     #Check paths
@@ -672,15 +671,15 @@ def run_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], ta
     #Train models
     for model_name, model in models_dict.items():
         train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
-                            train_features=X_train, train_target=y_train,
+                            train_features=X_train, train_target=y_train, # type: ignore
                             test_features=X_test, test_target=y_test,
                             feature_names=feature_names,target_id=target_name, scaler_object=scaler_object,
                             debug=debug, save_dir=save_dir, save_model=save_model)
-    print("\
+    print("\n✅ Training and evaluation complete.")
 
 
 def _check_paths(datasets_dir: str, save_dir:str):
     if not os.path.isdir(save_dir):
-        os.makedirs(save_dir)
+        os.makedirs(save_dir)
     if not os.path.isdir(datasets_dir):
-        raise IOError(f"Datasets directory '{datasets_dir}' not found
+        raise IOError(f"Datasets directory '{datasets_dir}' not found.")
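Note: 1.4.1 renames the old `run_pipeline` entry point to `run_ensemble_pipeline` (the second hunk header above still shows the old name). A minimal usage sketch with placeholder paths:

```python
# Minimal usage sketch for the renamed entry point (placeholder paths).
from ml_tools.ensemble_learning import run_ensemble_pipeline

run_ensemble_pipeline(
    datasets_dir="data/datasets",   # directory of CSV datasets
    save_dir="results/ensemble",
    target_columns=["target"],
    task="classification",
    resample_strategy="SMOTE",
    scaler="minmax",
    save_model=True,
)
```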
ml_tools/handle_excel.py
CHANGED
@@ -2,6 +2,16 @@ import os
 from openpyxl import load_workbook, Workbook
 import pandas as pd
 from typing import List, Optional
+from utilities import _script_info, sanitize_filename
+
+
+__all__ = [
+    "unmerge_and_split_excel",
+    "unmerge_and_split_from_directory",
+    "validate_excel_schema",
+    "vertical_merge_transform_excel",
+    "horizontal_merge_transform_excel"
+]
 
 
 def unmerge_and_split_excel(filepath: str) -> None:
@@ -25,12 +35,12 @@ def unmerge_and_split_excel(filepath: str) -> None:
         ws = wb[sheet_name]
         new_wb = Workbook()
         new_ws = new_wb.active
-        new_ws.title = sheet_name
+        new_ws.title = sheet_name # type: ignore
 
         # Copy all cell values
         for row in ws.iter_rows():
             for cell in row:
-                new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+                new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
 
         # Fill and unmerge merged regions
         for merged_range in list(ws.merged_cells.ranges):
@@ -41,10 +51,10 @@ def unmerge_and_split_excel(filepath: str) -> None:
             value = ws.cell(row=min_row, column=min_col).value
             for row in range(min_row, max_row + 1):
                 for col in range(min_col, max_col + 1):
-                    new_ws.cell(row=row, column=col, value=value)
+                    new_ws.cell(row=row, column=col, value=value) # type: ignore
 
         # Construct flat output file name
-        sanitized_sheet_name = sheet_name
+        sanitized_sheet_name = sanitize_filename(sheet_name)
         output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
         output_path = os.path.join(base_dir, output_filename)
         new_wb.save(output_path)
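Note: both unmerge helpers share one idea: every cell covered by a merged range receives the value of the range's top-left cell. A minimal in-place sketch with openpyxl (the package functions copy into a new workbook instead; the path is a placeholder):

```python
# Minimal sketch: fill every cell of each merged range with its top-left value.
from openpyxl import load_workbook

wb = load_workbook("example.xlsx")  # placeholder path
ws = wb.active

for merged_range in list(ws.merged_cells.ranges):
    min_col, min_row, max_col, max_row = merged_range.bounds
    value = ws.cell(row=min_row, column=min_col).value
    ws.unmerge_cells(str(merged_range))  # variant: unmerge in place
    for row in range(min_row, max_row + 1):
        for col in range(min_col, max_col + 1):
            ws.cell(row=row, column=col, value=value)

wb.save("example_unmerged.xlsx")
```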
@@ -85,12 +95,12 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
         ws = wb[sheet_name]
         new_wb = Workbook()
         new_ws = new_wb.active
-        new_ws.title = sheet_name
+        new_ws.title = sheet_name # type: ignore
 
         # Copy all cell values
         for row in ws.iter_rows():
             for cell in row:
-                new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+                new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore
 
         # Fill and unmerge merged regions
         for merged_range in list(ws.merged_cells.ranges):
@@ -101,10 +111,10 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
             value = ws.cell(row=min_row, column=min_col).value
             for row in range(min_row, max_row + 1):
                 for col in range(min_col, max_col + 1):
-                    new_ws.cell(row=row, column=col, value=value)
+                    new_ws.cell(row=row, column=col, value=value) # type: ignore
 
         # Construct flat output file name
-        sanitized_sheet_name = sheet_name
+        sanitized_sheet_name = sanitize_filename(sheet_name)
         output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
         output_path = os.path.join(output_dir, output_filename)
         new_wb.save(output_path)
@@ -151,7 +161,7 @@ def validate_excel_schema(
     wb = load_workbook(file_path, read_only=True)
     ws = wb.active # Only check the first worksheet
 
-    header = [cell.value for cell in next(ws.iter_rows(max_row=1))]
+    header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore
 
     if strict:
         if header != expected_columns:
@@ -202,6 +212,11 @@ def vertical_merge_transform_excel(
 
     if not excel_files:
         raise ValueError("No Excel files found in the target directory.")
+
+    # sanitize filename
+    csv_filename = sanitize_filename(csv_filename)
+    # make directory
+    os.makedirs(output_dir, exist_ok=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
     csv_path = os.path.join(output_dir, csv_filename)
@@ -260,6 +275,11 @@ def horizontal_merge_transform_excel(
     excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
     if not excel_files:
         raise ValueError("No Excel files found in the target directory.")
+
+    # sanitize filename
+    csv_filename = sanitize_filename(csv_filename)
+    # make directory
+    os.makedirs(output_dir, exist_ok=True)
 
     csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
     csv_path = os.path.join(output_dir, csv_filename)
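Note: the merge functions' bodies fall mostly outside this diff. A plausible sketch of the vertical case (an assumption, not the package's exact code): read every workbook in a directory, stack the rows, and write one CSV:

```python
# Hypothetical sketch of a vertical merge -- NOT the package's exact code.
import os
import pandas as pd

def vertical_merge_sketch(target_dir: str, output_dir: str, csv_filename: str) -> None:
    excel_files = [f for f in os.listdir(target_dir)
                   if f.endswith(".xlsx") and not f.startswith("~")]
    if not excel_files:
        raise ValueError("No Excel files found in the target directory.")
    os.makedirs(output_dir, exist_ok=True)
    frames = [pd.read_excel(os.path.join(target_dir, f)) for f in excel_files]
    merged = pd.concat(frames, ignore_index=True)  # stack rows vertically
    merged.to_csv(os.path.join(output_dir, csv_filename), index=False)
```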
@@ -308,3 +328,6 @@ def horizontal_merge_transform_excel(
     if duplicate_columns:
         print(f"⚠️ Duplicate columns: {duplicate_columns}")
 
+
+def info():
+    _script_info(__all__)
ml_tools/logger.py
CHANGED
@@ -5,7 +5,12 @@ import pandas as pd
 from openpyxl.styles import Font, PatternFill
 import traceback
 import json
-from ml_tools.utilities import sanitize_filename
+from ml_tools.utilities import sanitize_filename, _script_info
+
+
+__all__ = [
+    "custom_logger"
+]
 
 
 def custom_logger(
@@ -143,3 +148,7 @@ def _log_exception_to_log(exc: BaseException, path: str) -> None:
 def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(data, f, indent=4, ensure_ascii=False)
+
+
+def info():
+    _script_info(__all__)
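Note: both modules now expose an `info()` helper backed by `_script_info` from `ml_tools.utilities`, whose body is not in this diff. A plausible minimal version (an assumption):

```python
# Hypothetical sketch of the _script_info helper -- NOT the package's code.
def _script_info(all_names: list[str]) -> None:
    """Print the public names a module exports via __all__."""
    print("Available functions:")
    for name in all_names:
        print(f"  - {name}")

# e.g. ml_tools.logger.info() would print: custom_logger
```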