dragon-ml-toolbox 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 1.4.0
+ Version: 1.4.1
  Summary: A collection of tools for data science and machine learning projects
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -80,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
  ```bash
  git clone https://github.com/DrAg0n-BoRn/ML_tools.git
  cd ML_tools
- pip install -e '.[pytorch]'
+ pip install -e .
  ```

  ## Usage
@@ -91,3 +91,19 @@ After installation, import modules like this:
  from ml_tools.utilities import sanitize_filename
  from ml_tools.logger import custom_logger
  ```
+
+ ## Available modules
+
+ ```bash
+ data_exploration
+ datasetmaster
+ ensemble_learning
+ handle_excel
+ logger
+ MICE_imputation
+ particle_swarm_optimization
+ trainer
+ utilities
+ VIF_factor
+ vision_helpers
+ ```
@@ -0,0 +1,19 @@
+ dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
+ ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
+ ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
+ ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
+ ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ ml_tools/ensemble_learning.py,sha256=khXXRiR7boWwI4CAvb2bxzS3fhLADNETMOiRe3ihZ4Y,28821
+ ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
+ ml_tools/particle_swarm_optimization.py,sha256=714kZo6lvUvRaPTtj6kJGecZwHcehcSkLysokXAf3No,20706
+ ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ ml_tools/utilities.py,sha256=z2JPy4GM2YBLUC0sPq7aNLuesPFAQu5KNcsgmuOywdU,8738
+ ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+ dragon_ml_toolbox-1.4.1.dist-info/METADATA,sha256=0XdPwNWe81rCvJLJfSS5XvB2ZdJKpBLLoqMU5uxYLMc,2516
+ dragon_ml_toolbox-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-1.4.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-1.4.1.dist-info/RECORD,,
ml_tools/MICE_imputation.py CHANGED
@@ -3,9 +3,20 @@ import miceforest as mf
  import os
  import matplotlib.pyplot as plt
  import numpy as np
- from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
+ from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
  from plotnine import ggplot, labs, theme, element_blank # type: ignore

+
+ __all__ = [
+ "apply_mice",
+ "save_imputed_datasets",
+ "get_na_column_names",
+ "get_convergence_diagnostic",
+ "get_imputed_distributions",
+ "run_mice_pipeline"
+ ]
+
+
  def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):

  # Initialize kernel with number of imputed datasets to generate
@@ -210,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  if os.path.isfile(df_path_or_dir):
  all_file_paths = [df_path_or_dir]
  elif os.path.isdir(df_path_or_dir):
- all_file_paths = list_csv_paths(df_path_or_dir).values()
+ all_file_paths = list(list_csv_paths(df_path_or_dir).values())
  else:
  raise ValueError(f"Invalid path or directory: {df_path_or_dir}")

@@ -226,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
  get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)

  get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
+
+
+ def info():
+ _script_info(__all__)
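Every module in 1.4.1 gains a module-level `info()` helper backed by `_script_info` from `ml_tools.utilities` (its implementation appears in the utilities diff below), which simply enumerates the module's `__all__`. A minimal usage sketch, assuming the 1.4.1 package is installed:

```python
# Illustrative only: list the public tools of a module via the new info() helper.
from ml_tools import MICE_imputation

MICE_imputation.info()
# Prints, per _script_info:
# Available functions and objects:
# 1 - apply_mice
# 2 - save_imputed_datasets
# ... and so on through __all__
```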
ml_tools/VIF_factor.py CHANGED
@@ -7,12 +7,19 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
  from statsmodels.tools.tools import add_constant
  import warnings
  import os
- from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe
+ from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+
+
+ __all__ = [
+ "compute_vif",
+ "drop_vif_based",
+ "compute_vif_multi"
+ ]


  def compute_vif(
  df: pd.DataFrame,
- target_columns: Optional[list[str]] = None,
+ use_columns: Optional[list[str]] = None,
  ignore_columns: Optional[list[str]] = None,
  max_features_to_plot: int = 20,
  save_dir: Optional[str] = None,
@@ -25,7 +32,7 @@ def compute_vif(

  Args:
  df (pd.DataFrame): The input DataFrame.
- target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
  ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
  max_features_to_plot (int): Adjust the number of features shown in the plot.
  save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
@@ -42,20 +49,20 @@ def compute_vif(
  A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
  """
  ground_truth_cols = df.columns.to_list()
- if target_columns is None:
+ if use_columns is None:
  sanitized_columns = df.select_dtypes(include='number').columns.tolist()
  missing_features = set(ground_truth_cols) - set(sanitized_columns)
  if missing_features:
  print(f"⚠️ These columns are not Numeric:\n{missing_features}")
  else:
  sanitized_columns = list()
- for feature in target_columns:
+ for feature in use_columns:
  if feature not in ground_truth_cols:
  print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
  else:
  sanitized_columns.append(feature)

- if ignore_columns is not None and target_columns is None:
+ if ignore_columns is not None and use_columns is None:
  missing_ignore = set(ignore_columns) - set(ground_truth_cols)
  if missing_ignore:
  print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
@@ -137,7 +144,7 @@ def compute_vif(
  return vif_data.drop(columns="color")


- def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
+ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
  """
  Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.

@@ -147,7 +154,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  threshold (float): VIF threshold above which columns will be dropped.

  Returns:
- pd.DataFrame: A new DataFrame with high-VIF columns removed.
+ (tuple[pd.DataFrame, list[str]]):
+ - A new DataFrame with high-VIF columns removed.
+ - A list with dropped column names.
  """
  # Ensure expected structure
  if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
@@ -162,13 +171,13 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  if result_df.empty:
  print(f"\t⚠️ Warning: All columns were dropped.")

- return result_df
+ return result_df, to_drop


  def compute_vif_multi(input_directory: str,
  output_plot_directory: str,
  output_dataset_directory: Optional[str] = None,
- target_columns: Optional[list[str]] = None,
+ use_columns: Optional[list[str]] = None,
  ignore_columns: Optional[list[str]] = None,
  max_features_to_plot: int = 20,
  fontsize: int = 14):
@@ -180,7 +189,7 @@ def compute_vif_multi(input_directory: str,
  input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
  output_plot_directory (str): Save plots to this directory.
  output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
- target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+ use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
  ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
  max_features_to_plot (int): Adjust the number of features shown in the plot.
  fontsize (int): Base fontsize to scale title and labels on hte plot.
@@ -195,7 +204,7 @@ def compute_vif_multi(input_directory: str,

  for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
  vif_dataframe = compute_vif(df=df,
- target_columns=target_columns,
+ use_columns=use_columns,
  ignore_columns=ignore_columns,
  max_features_to_plot=max_features_to_plot,
  fontsize=fontsize,
@@ -205,5 +214,11 @@ def compute_vif_multi(input_directory: str,

  if output_dataset_directory is not None:
  new_filename = 'VIF_' + df_name
- result_df = drop_vif_based(df=df, vif_df=vif_dataframe)
- save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+ result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
+
+ if len(dropped_cols) > 0:
+ save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+
+
+ def info():
+ _script_info(__all__)
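Two call-site-visible changes in this file: `compute_vif` takes `use_columns` instead of `target_columns`, and `drop_vif_based` now returns a `(DataFrame, dropped_columns)` tuple. A hedged usage sketch; the toy data and column names are made up:

```python
# Illustrative sketch of the 1.4.1 VIF API; values are fabricated for demonstration.
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0],
    "x2": [2.1, 3.9, 6.2, 8.1],   # nearly collinear with x1
    "x3": [0.3, 1.7, 0.9, 2.2],
})

vif_df = compute_vif(df, use_columns=["x1", "x2", "x3"])        # renamed parameter
reduced_df, dropped_cols = drop_vif_based(df, vif_df, threshold=10.0)  # tuple return
print(dropped_cols)   # column names whose VIF exceeded the threshold
```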
ml_tools/data_exploration.py CHANGED
@@ -9,22 +9,23 @@ from typing import Union, Literal, Dict, Tuple
  import os
  import sys
  import textwrap
- from ml_tools.utilities import sanitize_filename
+ from ml_tools.utilities import sanitize_filename, _script_info


- # Keep track of all available functions, show using `info()`
- __all__ = ["summarize_dataframe",
- "drop_rows_with_missing_data",
- "split_features_targets",
- "show_null_columns",
- "drop_columns_with_missing_data",
- "split_continuous_binary",
- "plot_correlation_heatmap",
- "check_value_distributions",
- "plot_value_distributions",
- "clip_outliers_single",
- "clip_outliers_multi",
- "merge_dataframes"]
+ # Keep track of all available tools, show using `info()`
+ __all__ = [
+ "summarize_dataframe",
+ "drop_rows_with_missing_data",
+ "split_features_targets",
+ "show_null_columns",
+ "drop_columns_with_missing_data",
+ "split_continuous_binary",
+ "plot_correlation_heatmap",
+ "check_value_distributions",
+ "plot_value_distributions",
+ "clip_outliers_single",
+ "clip_outliers_multi"
+ ]


  def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
@@ -58,34 +59,6 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
  return summary


- def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
- """
- Displays a table of columns with missing values, showing both the count and
- percentage of missing entries per column.
-
- Parameters:
- df (pd.DataFrame): The input DataFrame.
- round_digits (int): Number of decimal places for the percentage.
-
- Returns:
- pd.DataFrame: A DataFrame summarizing missing values in each column.
- """
- null_counts = df.isnull().sum()
- null_percent = df.isnull().mean() * 100
-
- # Filter only columns with at least one null
- mask = null_counts > 0
- null_summary = pd.DataFrame({
- 'Missing Count': null_counts[mask],
- 'Missing %': null_percent[mask].round(round_digits)
- })
-
- # Sort by descending percentage of missing values
- null_summary = null_summary.sort_values(by='Missing %', ascending=False)
- # print(null_summary)
- return null_summary
-
-
  def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
  """
  Drops rows with more than `threshold` fraction of missing values.
@@ -132,6 +105,57 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
  return df_targets, df_features


+ def show_null_columns(df: pd.DataFrame, round_digits: int = 2):
+ """
+ Displays a table of columns with missing values, showing both the count and
+ percentage of missing entries per column.
+
+ Parameters:
+ df (pd.DataFrame): The input DataFrame.
+ round_digits (int): Number of decimal places for the percentage.
+
+ Returns:
+ pd.DataFrame: A DataFrame summarizing missing values in each column.
+ """
+ null_counts = df.isnull().sum()
+ null_percent = df.isnull().mean() * 100
+
+ # Filter only columns with at least one null
+ mask = null_counts > 0
+ null_summary = pd.DataFrame({
+ 'Missing Count': null_counts[mask],
+ 'Missing %': null_percent[mask].round(round_digits)
+ })
+
+ # Sort by descending percentage of missing values
+ null_summary = null_summary.sort_values(by='Missing %', ascending=False)
+ # print(null_summary)
+ return null_summary
+
+
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+ """
+ Drops columns with more than `threshold` fraction of missing values.
+
+ Parameters:
+ df (pd.DataFrame): The input DataFrame.
+ threshold (float): Fraction of missing values above which columns are dropped.
+
+ Returns:
+ pd.DataFrame: A new DataFrame without the dropped columns.
+ """
+ missing_fraction = df.isnull().mean()
+ cols_to_drop = missing_fraction[missing_fraction > threshold].index
+
+ if len(cols_to_drop) > 0:
+ print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
+ print(list(cols_to_drop))
+ else:
+ print(f"No columns have more than {threshold*100:.0f}% missing data.")
+
+ return df.drop(columns=cols_to_drop)
+
+
  def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
  """
  Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
@@ -174,29 +198,6 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram

  return df_cont, df_bin # type: ignore

-
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
- """
- Drops columns with more than `threshold` fraction of missing values.
-
- Parameters:
- df (pd.DataFrame): The input DataFrame.
- threshold (float): Fraction of missing values above which columns are dropped.
-
- Returns:
- pd.DataFrame: A new DataFrame without the dropped columns.
- """
- missing_fraction = df.isnull().mean()
- cols_to_drop = missing_fraction[missing_fraction > threshold].index
-
- if len(cols_to_drop) > 0:
- print(f"Dropping columns with more than {threshold*100:.0f}% missing data:")
- print(list(cols_to_drop))
- else:
- print(f"No columns have more than {threshold*100:.0f}% missing data.")
-
- return df.drop(columns=cols_to_drop)
-

  def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None, method: Literal["pearson", "kendall", "spearman"]="pearson", plot_title: str="Correlation Heatmap"):
  """
@@ -513,83 +514,10 @@ def clip_outliers_multi(
  return new_df


- def merge_dataframes(
- *dfs: pd.DataFrame,
- reset_index: bool = False,
- direction: Literal["horizontal", "vertical"] = "horizontal"
- ) -> pd.DataFrame:
- """
- Merges multiple DataFrames either horizontally or vertically.
-
- Parameters:
- *dfs (pd.DataFrame): Variable number of DataFrames to merge.
- reset_index (bool): Whether to reset index in the final merged DataFrame.
- direction (["horizontal" | "vertical"]):
- - "horizontal": Merge on index, adding columns.
- - "vertical": Append rows; all DataFrames must have identical columns.
-
- Returns:
- pd.DataFrame: A single merged DataFrame.
-
- Raises:
- ValueError:
- - If fewer than 2 DataFrames are provided.
- - If indexes do not match for horizontal merge.
- - If column names or order differ for vertical merge.
- """
- if len(dfs) < 2:
- raise ValueError("At least 2 DataFrames must be provided.")
-
- for i, df in enumerate(dfs, start=1):
- print(f"DataFrame {i} shape: {df.shape}")
-
-
- if direction == "horizontal":
- reference_index = dfs[0].index
- for i, df in enumerate(dfs, start=1):
- if not df.index.equals(reference_index):
- raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
- merged_df = pd.concat(dfs, axis=1)
-
- elif direction == "vertical":
- reference_columns = dfs[0].columns
- for i, df in enumerate(dfs, start=1):
- if not df.columns.equals(reference_columns):
- raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
- merged_df = pd.concat(dfs, axis=0)
-
- else:
- raise ValueError(f"Invalid merge direction: {direction}")
-
- if reset_index:
- merged_df = merged_df.reset_index(drop=True)
-
- print(f"Merged DataFrame shape: {merged_df.shape}")
-
- return merged_df
-
-
  def _is_notebook():
  return get_ipython() is not None


- def info(full_info: bool=True):
- """
- List available functions and their descriptions.
- """
- print("Available functions for data exploration:")
- if full_info:
- module = sys.modules[__name__]
- for name in __all__:
- obj = getattr(module, name, None)
- if callable(obj):
- doc = obj.__doc__ or "No docstring provided."
- formatted_doc = textwrap.indent(textwrap.dedent(doc.strip()), prefix=" ")
- print(f"\n{name}:\n{formatted_doc}")
- else:
- for i, name in enumerate(__all__, start=1):
- print(f"{i} - {name}")
-
+ def info():
+ _script_info(__all__)

- if __name__ == "__main__":
- info()
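The two relocated helpers above (`show_null_columns`, `drop_columns_with_missing_data`) are unchanged in behavior. A short usage sketch with a toy DataFrame, assuming the package and its dependencies are installed:

```python
# Illustrative sketch of the missing-data helpers shown in the diff above.
import numpy as np
import pandas as pd
from ml_tools.data_exploration import show_null_columns, drop_columns_with_missing_data

df = pd.DataFrame({
    "a": [1.0, np.nan, 3.0, np.nan],     # 50% missing -> kept at threshold 0.7
    "b": [np.nan, np.nan, np.nan, 4.0],  # 75% missing -> dropped at threshold 0.7
    "c": [1, 2, 3, 4],
})

print(show_null_columns(df))                         # per-column missing count and %
cleaned = drop_columns_with_missing_data(df, threshold=0.7)
print(cleaned.columns.tolist())                      # ['a', 'c']
```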
ml_tools/datasetmaster.py CHANGED
@@ -11,6 +11,15 @@ from PIL import Image
  from torchvision.datasets import ImageFolder
  from torchvision import transforms
  import matplotlib.pyplot as plt
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "DatasetMaker",
+ "PytorchDataset",
+ "make_vision_dataset",
+ "SequenceDataset",
+ ]


  class DatasetMaker():
@@ -592,4 +601,7 @@ class SequenceDataset():

  def __len__(self):
  return f"Train: {len(self.train_dataset)}, Test: {len(self.test_dataset)}"
-
+
+
+ def info():
+ _script_info(__all__)
ml_tools/ensemble_learning.py CHANGED
@@ -21,7 +21,7 @@ from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
  from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
  import shap

- from .utilities import yield_dataframes_from_dir
+ from .utilities import yield_dataframes_from_dir, sanitize_filename

  import warnings # Ignore warnings
  warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -245,7 +245,9 @@ def _local_directories(model_name: str, dataset_id: str, save_dir: str):

  # save model
  def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str, scaler_object: Union[StandardScaler, MinMaxScaler, MaxAbsScaler]):
- full_path = os.path.join(save_directory, f"{model_name}_{target_name}.joblib")
+ #Sanitize filenames to save
+ sanitized_target_name = sanitize_filename(target_name)
+ full_path = os.path.join(save_directory, f"{model_name}_{sanitized_target_name}.joblib")
  joblib.dump({'model': trained_model, 'scaler':scaler_object, 'feature_names': feature_names, 'target_name':target_name}, full_path)

  # function to evaluate the model and save metrics (Classification)
@@ -298,7 +300,8 @@ def evaluate_model_classification(
  )

  # Save text report
- report_path = os.path.join(save_dir, f"Classification_Report_{target_id}.txt")
+ sanitized_target_id = sanitize_filename(target_id)
+ report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_id}.txt")
  with open(report_path, "w") as f:
  f.write(f"{model_name} - {target_id}\t\tAccuracy: {accuracy:.2f}\n")
  f.write("Classification Report:\n")
@@ -328,7 +331,7 @@ def evaluate_model_classification(
  text.set_fontsize(title_fontsize+4)

  fig.tight_layout()
- fig_path = os.path.join(save_dir, f"Confusion_Matrix_{target_id}.svg")
+ fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_id}.svg")
  fig.savefig(fig_path, format="svg", bbox_inches="tight")
  plt.close(fig)

@@ -411,7 +414,8 @@ def plot_roc_curve(

  # Save figure
  os.makedirs(save_directory, exist_ok=True)
- save_path = os.path.join(save_directory, f"ROC_{target_name}.svg")
+ sanitized_target_name = sanitize_filename(target_name)
+ save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
  fig.savefig(save_path, bbox_inches="tight", format="svg")

  return fig
@@ -435,7 +439,8 @@ def evaluate_model_regression(model, model_name: str,
  r2 = r2_score(single_y_test, y_pred)

  # Create formatted report
- report_path = os.path.join(save_dir, f"Regression_Report_{target_id}.txt")
+ sanitized_target_id = sanitize_filename(target_id)
+ report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_id}.txt")
  with open(report_path, "w") as f:
  f.write(f"{model_name} - {target_id} Regression Performance\n")
  f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -453,7 +458,7 @@ def evaluate_model_regression(model, model_name: str,
  plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
  plt.grid(True)
  plt.tight_layout()
- plt.savefig(os.path.join(save_dir, f"Residual_Plot_{target_id}.svg"), bbox_inches='tight', format="svg")
+ plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_id}.svg"), bbox_inches='tight', format="svg")
  plt.close()

  # Create true vs predicted values plot
@@ -466,12 +471,13 @@ def evaluate_model_regression(model, model_name: str,
  plt.ylabel('Predictions', fontsize=base_fontsize)
  plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
  plt.grid(True)
- plot_path = os.path.join(save_dir, f"Regression_Plot_{target_id}.svg")
+ plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_id}.svg")
  plt.savefig(plot_path, bbox_inches='tight', format="svg")
  plt.close()

  return y_pred

+
  # Get SHAP values
  def get_shap_values(
  model,
@@ -498,7 +504,8 @@ def get_shap_values(
  features_to_explain: Should match the model's training data format, including scaling.
  save_dir: Directory to save visualizations
  """
-
+ sanitized_target_id = sanitize_filename(target_id)
+
  def _apply_plot_style():
  styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
  for style in styles:
@@ -560,7 +567,7 @@ def get_shap_values(
  _create_shap_plot(
  shap_values=class_shap,
  features=features_to_explain,
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_Class{class_name}_{plot_type}.svg"),
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_Class{class_name}_{plot_type}.svg"),
  plot_type=plot_type,
  title=f"{model_name} - {target_id} (Class {class_name})"
  )
@@ -570,7 +577,7 @@ def get_shap_values(
  _create_shap_plot(
  shap_values=values,
  features=features_to_explain,
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
  plot_type=plot_type,
  title=f"{model_name} - {target_id}"
  )
@@ -580,10 +587,11 @@ def get_shap_values(
  _create_shap_plot(
  shap_values=shap_values,
  features=features_to_explain,
- save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
+ save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_id}_{plot_type}.svg"),
  plot_type=plot_type,
  title=f"{model_name} - {target_id}"
  )
+ #START_O

  explainer = shap.TreeExplainer(model)
  shap_values = explainer.shap_values(features_to_explain)
@@ -672,6 +680,6 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list

  def _check_paths(datasets_dir: str, save_dir:str):
  if not os.path.isdir(save_dir):
- os.makedirs(save_dir)
+ os.makedirs(save_dir)
  if not os.path.isdir(datasets_dir):
  raise IOError(f"Datasets directory '{datasets_dir}' not found.")
ml_tools/handle_excel.py CHANGED
@@ -2,6 +2,16 @@ import os
  from openpyxl import load_workbook, Workbook
  import pandas as pd
  from typing import List, Optional
+ from utilities import _script_info, sanitize_filename
+
+
+ __all__ = [
+ "unmerge_and_split_excel",
+ "unmerge_and_split_from_directory",
+ "validate_excel_schema",
+ "vertical_merge_transform_excel",
+ "horizontal_merge_transform_excel"
+ ]


  def unmerge_and_split_excel(filepath: str) -> None:
@@ -25,12 +35,12 @@ def unmerge_and_split_excel(filepath: str) -> None:
  ws = wb[sheet_name]
  new_wb = Workbook()
  new_ws = new_wb.active
- new_ws.title = sheet_name
+ new_ws.title = sheet_name # type: ignore

  # Copy all cell values
  for row in ws.iter_rows():
  for cell in row:
- new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+ new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore

  # Fill and unmerge merged regions
  for merged_range in list(ws.merged_cells.ranges):
@@ -41,10 +51,10 @@ def unmerge_and_split_excel(filepath: str) -> None:
  value = ws.cell(row=min_row, column=min_col).value
  for row in range(min_row, max_row + 1):
  for col in range(min_col, max_col + 1):
- new_ws.cell(row=row, column=col, value=value)
+ new_ws.cell(row=row, column=col, value=value) # type: ignore

  # Construct flat output file name
- sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
+ sanitized_sheet_name = sanitize_filename(sheet_name)
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
  output_path = os.path.join(base_dir, output_filename)
  new_wb.save(output_path)
@@ -85,12 +95,12 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
  ws = wb[sheet_name]
  new_wb = Workbook()
  new_ws = new_wb.active
- new_ws.title = sheet_name
+ new_ws.title = sheet_name # type: ignore

  # Copy all cell values
  for row in ws.iter_rows():
  for cell in row:
- new_ws.cell(row=cell.row, column=cell.column, value=cell.value)
+ new_ws.cell(row=cell.row, column=cell.column, value=cell.value) # type: ignore

  # Fill and unmerge merged regions
  for merged_range in list(ws.merged_cells.ranges):
@@ -101,10 +111,10 @@ def unmerge_and_split_from_directory(input_dir: str, output_dir: str) -> None:
  value = ws.cell(row=min_row, column=min_col).value
  for row in range(min_row, max_row + 1):
  for col in range(min_col, max_col + 1):
- new_ws.cell(row=row, column=col, value=value)
+ new_ws.cell(row=row, column=col, value=value) # type: ignore

  # Construct flat output file name
- sanitized_sheet_name = sheet_name.replace("/", "_").replace("\\", "_")
+ sanitized_sheet_name = sanitize_filename(sheet_name)
  output_filename = f"{base_name}_{sanitized_sheet_name}.xlsx"
  output_path = os.path.join(output_dir, output_filename)
  new_wb.save(output_path)
@@ -151,7 +161,7 @@ def validate_excel_schema(
  wb = load_workbook(file_path, read_only=True)
  ws = wb.active # Only check the first worksheet

- header = [cell.value for cell in next(ws.iter_rows(max_row=1))]
+ header = [cell.value for cell in next(ws.iter_rows(max_row=1))] # type: ignore

  if strict:
  if header != expected_columns:
@@ -202,6 +212,11 @@ def vertical_merge_transform_excel(

  if not excel_files:
  raise ValueError("No Excel files found in the target directory.")
+
+ # sanitize filename
+ csv_filename = sanitize_filename(csv_filename)
+ # make directory
+ os.makedirs(output_dir, exist_ok=True)

  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
  csv_path = os.path.join(output_dir, csv_filename)
@@ -260,6 +275,11 @@ def horizontal_merge_transform_excel(
  excel_files = [f for f in raw_excel_files if not f.startswith('~')] # Exclude temporary files
  if not excel_files:
  raise ValueError("No Excel files found in the target directory.")
+
+ # sanitize filename
+ csv_filename = sanitize_filename(csv_filename)
+ # make directory
+ os.makedirs(output_dir, exist_ok=True)

  csv_filename = csv_filename if csv_filename.endswith('.csv') else f"{csv_filename}.csv"
  csv_path = os.path.join(output_dir, csv_filename)
@@ -308,3 +328,6 @@ def horizontal_merge_transform_excel(
  if duplicate_columns:
  print(f"⚠️ Duplicate columns: {duplicate_columns}")

+
+ def info():
+ _script_info(__all__)
ml_tools/logger.py CHANGED
@@ -5,7 +5,12 @@ import pandas as pd
  from openpyxl.styles import Font, PatternFill
  import traceback
  import json
- from ml_tools.utilities import sanitize_filename
+ from ml_tools.utilities import sanitize_filename, _script_info
+
+
+ __all__ = [
+ "custom_logger"
+ ]


  def custom_logger(
@@ -143,3 +148,7 @@ def _log_exception_to_log(exc: BaseException, path: str) -> None:
  def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
  with open(path, 'w', encoding='utf-8') as f:
  json.dump(data, f, indent=4, ensure_ascii=False)
+
+
+ def info():
+ _script_info(__all__)
ml_tools/particle_swarm_optimization.py CHANGED
@@ -5,23 +5,29 @@ import xgboost as xgb
  import lightgbm as lgb
  from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
  from sklearn.base import ClassifierMixin
- from sklearn.preprocessing import StandardScaler
+ from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
  from typing import Literal, Union, Tuple, Dict
- from collections.abc import Sequence
  import polars as pl
  from functools import partial
+ from .utilities import sanitize_filename, _script_info
+
+
+ __all__ = [
+ "ObjectiveFunction",
+ "run_pso"
+ ]


  class ObjectiveFunction():
  """
  Callable objective function designed for optimizing continuous outputs from regression models.

- The trained model must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.
+ The target serialized file (joblib) must include a 'model' and a 'scaler'. Additionally 'feature_names' and 'target_name' will be parsed if present.

  Parameters
  ----------
  trained_model_path : str
- Path to a serialized model (joblib) compatible with scikit-learn-like `.predict`.
+ Path to a serialized model and its scaler (joblib) compatible with scikit-learn-like `.predict`.
  add_noise : bool
  Whether to apply multiplicative noise to the input features during evaluation.
  binary_features : int, default=0
@@ -67,8 +73,18 @@ class ObjectiveFunction():
  return new_feature_values

  def _handle_hybrid(self, features_array):
- feat_continuous = features_array[:self.binary_features]
- feat_binary = (features_array[self.binary_features:] > 0.5).astype(int) #threshold binary values
+ total_features = features_array.shape[0]
+ if self.binary_features > total_features:
+ raise ValueError("self.binary_features exceeds total number of features.")
+
+ # Handle corner case where all features are binary
+ if self.binary_features == total_features:
+ feat_binary = (features_array > 0.5).astype(int)
+ return feat_binary
+
+ # Normal case: split into continuous and binary parts
+ feat_continuous = features_array[:-self.binary_features]
+ feat_binary = (features_array[-self.binary_features:] > 0.5).astype(int) #threshold binary values
  new_feature_values = np.concatenate([feat_continuous, feat_binary])
  return new_feature_values

@@ -92,7 +108,7 @@ class ObjectiveFunction():
  return (f"<ObjectiveFunction(model={type(self.model).__name__}, scaler={type(self.scaler).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


- def _set_boundaries(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]):
+ def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
  assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
  assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
  lower = np.array(lower_boundaries)
@@ -112,31 +128,40 @@ def _save_results(*dicts, save_dir: str, target_name: str):
  combined_dict = dict()
  for single_dict in dicts:
  combined_dict.update(single_dict)
-
- full_path = os.path.join(save_dir, f"results_{target_name}.csv")
+
+ sanitized_target_name = sanitize_filename(target_name)
+
+ full_path = os.path.join(save_dir, f"Optimization_{sanitized_target_name}.csv")
  pl.DataFrame(combined_dict).write_csv(full_path)


- def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float], objective_function: ObjectiveFunction,
- save_results_dir: str,
+ def run_pso(lower_boundaries: list[float],
+ upper_boundaries: list[float],
+ objective_function: ObjectiveFunction,
+ save_results_dir: str,
+ auto_binary_boundaries: bool=True,
  target_name: Union[str, None]=None,
  feature_names: Union[list[str], None]=None,
- swarm_size: int=100, max_iterations: int=100,
+ swarm_size: int=100,
+ max_iterations: int=100,
  inequality_constrain_function=None,
- post_hoc_analysis: Union[int, None]=None) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
+ post_hoc_analysis: Union[int, None]=None,
+ workers: int=5) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
  """
- Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results.
+ Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

  Parameters
  ----------
- lower_boundaries : Sequence[float]
- Lower bounds for each feature in the search space.
- upper_boundaries : Sequence[float]
- Upper bounds for each feature in the search space.
+ lower_boundaries : list[float]
+ Lower bounds for each feature in the search space (as many as features expected by the model).
+ upper_boundaries : list[float]
+ Upper bounds for each feature in the search space (as many as features expected by the model).
  objective_function : ObjectiveFunction
  A callable object encapsulating a regression model and its scaler.
  save_results_dir : str
  Directory path to save the results CSV file.
+ auto_binary_boundaries : bool
+ Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
  target_name : str or None, optional
  Name of the target variable. If None, attempts to retrieve from the ObjectiveFunction object.
  feature_names : list[str] or None, optional
@@ -149,30 +174,38 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  Optional function defining inequality constraints to be respected by the optimization.
  post_hoc_analysis : int or None, optional
  If specified, runs the optimization multiple times to perform post hoc analysis. The value indicates the number of repetitions.
+ workers : int
+ Number of parallel processes to use.

  Returns
  -------
  Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]
  If `post_hoc_analysis` is None, returns two dictionaries:
- - best_features_named: Feature values (after inverse scaling) that yield the best result.
- - best_target_named: Best result obtained for the target variable.
+ - feature_names: Feature values (after inverse scaling) that yield the best result.
+ - target_name: Best result obtained for the target variable.

  If `post_hoc_analysis` is an integer, returns two dictionaries:
- - all_best_features_named: Lists of best feature values (after inverse scaling) for each repetition.
- - all_best_targets_named: List of best target values across repetitions.
+ - feature_names: Lists of best feature values (after inverse scaling) for each repetition.
+ - target_name: List of best target values across repetitions.

  Notes
  -----
  - PSO minimizes the objective function by default; if maximization is desired, it should be handled inside the ObjectiveFunction.
  - Feature values are scaled before being passed to the model and inverse-transformed before result saving.
  """
+ # Append binary boundaries
+ binary_number = objective_function.binary_features
+ if auto_binary_boundaries and binary_number > 0:
+ lower_boundaries.extend([0] * binary_number)
+ upper_boundaries.extend([1] * binary_number)
+
  lower, upper = _set_boundaries(lower_boundaries, upper_boundaries)
-
+
  # feature names
  if feature_names is None and objective_function.feature_names is not None:
  feature_names = objective_function.feature_names
  names = _set_feature_names(size=len(lower_boundaries), names=feature_names)
-
+
  # target name
  if target_name is None and objective_function.target_name is not None:
  target_name = objective_function.target_name
@@ -186,13 +219,15 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  "f_ieqcons": inequality_constrain_function,
  "swarmsize": swarm_size,
  "maxiter": max_iterations,
- "processes": 1,
- "particle_output": True
+ "processes": workers,
+ "particle_output": False
  }

- if post_hoc_analysis is None:
- # best_features, best_target = pso(**arguments)
- best_features, best_target, _particle_positions, _target_values_per_position = pso(**arguments)
+ os.makedirs(save_results_dir, exist_ok=True)
+
+ if post_hoc_analysis is None or post_hoc_analysis == 1:
+ best_features, best_target, *_ = _pso(**arguments)
+ # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)

  # inverse transformation
  best_features = np.array(best_features).reshape(1, -1)
@@ -209,9 +244,9 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  else:
  all_best_targets = list()
  all_best_features = [[] for _ in range(len(lower_boundaries))]
- for _ in range(post_hoc_analysis):
- # best_features, best_target = pso(**arguments)
- best_features, best_target, _particle_positions, _target_values_per_position = pso(**arguments)
+ for _ in range(post_hoc_analysis):
+ best_features, best_target, *_ = _pso(**arguments)
+ # best_features, best_target, _particle_positions, _target_values_per_position = _pso(**arguments)

  # inverse transformation
  best_features = np.array(best_features).reshape(1, -1)
@@ -231,6 +266,8 @@ def run_pso(lower_boundaries: Sequence[float], upper_boundaries: Sequence[float]
  return all_best_features_named, all_best_targets_named # type: ignore


+ def info():
+ _script_info(__all__)


  ### SOURCE CODE FOR PSO ###
@@ -249,7 +286,7 @@ def _cons_ieqcons_wrapper(ieqcons, args, kwargs, x):
  def _cons_f_ieqcons_wrapper(f_ieqcons, args, kwargs, x):
  return np.array(f_ieqcons(x, *args, **kwargs))

- def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
+ def _pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
  swarmsize=100, omega=0.5, phip=0.5, phig=0.5, maxiter=100,
  minstep=1e-8, minfunc=1e-8, debug=False, processes=1,
  particle_output=False):
@@ -377,7 +414,7 @@ def pso(func, lb, ub, ieqcons=[], f_ieqcons=None, args=(), kwargs={},
  for i in range(S):
  fx[i] = obj(x[i, :])
  fs[i] = is_feasible(x[i, :])
-
+
  # Store particle's best position (if constraints are satisfied)
  i_update = np.logical_and((fx < fp), fs)
  p[i_update, :] = x[i_update, :].copy()
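Two behavioral changes above are easy to miss: `_handle_hybrid` now treats the *last* `binary_features` entries of the feature vector as the binary block, and `run_pso(auto_binary_boundaries=True)` appends 0/1 bounds for those features automatically. A standalone sketch of just that logic (NumPy only, no model involved; the numbers are illustrative):

```python
# Standalone sketch of the hybrid-feature handling and automatic binary bounds in 1.4.1.
import numpy as np

binary_features = 2                          # number of trailing binary features
features = np.array([0.7, 3.2, 0.4, 0.9])    # 2 continuous values followed by 2 binary candidates

# _handle_hybrid-style split: continuous head kept, binary tail thresholded at 0.5
continuous = features[:-binary_features]
binary = (features[-binary_features:] > 0.5).astype(int)
print(np.concatenate([continuous, binary]))  # continuous unchanged, tail becomes 0/1

# auto_binary_boundaries-style extension of the search space
lower, upper = [0.0, 1.0], [1.0, 5.0]        # bounds for the continuous features only
lower.extend([0] * binary_features)
upper.extend([1] * binary_features)
print(lower, upper)                          # [0.0, 1.0, 0, 0] [1.0, 5.0, 1, 1]
```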
ml_tools/pytorch_models.py CHANGED
@@ -1,5 +1,12 @@
  import torch
  from torch import nn
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "MyNeuralNetwork",
+ "MyLSTMNetwork"
+ ]


  class MyNeuralNetwork(nn.Module):
@@ -73,9 +80,11 @@ class MyNeuralNetwork(nn.Module):
  return X


- class MyConvolutionalNetwork(nn.Module):
+ class _MyConvolutionalNetwork(nn.Module):
  def __init__(self, outputs: int, color_channels: int=3, img_size: int=256, drop_out: float=0.2):
  """
+ - EDUCATIONAL PURPOSES ONLY, not optimized and requires lots of memory.
+
  Create a basic Convolutional Neural Network with two convolution layers with a pooling layer after each convolution.

  Args:
@@ -225,3 +234,6 @@ class MyLSTMNetwork(nn.Module):
  else:
  return output

+
+ def info():
+ _script_info(__all__)
ml_tools/trainer.py CHANGED
@@ -6,6 +6,12 @@ import matplotlib.pyplot as plt
  import torch
  from torch import nn
  from sklearn.metrics import mean_squared_error, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, r2_score, median_absolute_error
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "MyTrainer"
+ ]


  class MyTrainer():
@@ -288,36 +294,6 @@ class MyTrainer():
  print(f"Area under the curve score: {area_under_curve:4.2f}")
  else:
  print("Error encountered while retrieving 'model.kind' attribute.")
-
-
- def forecast(self, samples_list: list[torch.Tensor], view_as: tuple[int,int]=(1,-1)):
- """
- DEPRECATED - Use `helpers.model_predict()` instead
-
- Returns a list containing lists of predicted values, one for each sample.
-
- Each sample must be a tensor and have the same shape and normalization expected by the model
- (this method will add the batch dimension automatically).
-
- Args:
- `samples_list`: list of tensors.
-
- `view_as`: reshape each output, default is (1,-1).
-
- Returns: List of lists.
- """
- self.model.eval()
- results = list()
- with torch.no_grad():
- for data_point in samples_list:
- data_point = data_point.unsqueeze(0).to(self.device)
- output = self.model(data_point)
- if self.kind == "classification":
- results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
- else: #regression
- results.append(output.view(view_as).cpu().tolist())
-
- return results


  def rnn_forecast(self, sequence: torch.Tensor, steps: int):
@@ -364,3 +340,7 @@ class MyTrainer():
  # Cast to array and return
  predictions = numpy.array(predictions)
  return predictions
+
+
+ def info():
+ _script_info(__all__)
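`MyTrainer.forecast()` was removed; its own docstring already marked it deprecated in favor of `model_predict()`, which `ml_tools.vision_helpers` now exports (see its diff below). For readers migrating, a sketch mirroring the removed method body as a free function; `model`, `device`, and `kind` are placeholders for your own setup:

```python
# Sketch of the logic the removed MyTrainer.forecast() implemented, written as a free function.
import torch

def forecast(model: torch.nn.Module, samples_list: list[torch.Tensor], device: str = "cpu",
             kind: str = "regression", view_as: tuple[int, int] = (1, -1)) -> list:
    model.eval()
    results = []
    with torch.no_grad():
        for data_point in samples_list:
            data_point = data_point.unsqueeze(0).to(device)  # add the batch dimension
            output = model(data_point)
            if kind == "classification":
                results.append(output.argmax(dim=1).view(view_as).cpu().tolist())
            else:  # regression
                results.append(output.view(view_as).cpu().tolist())
    return results
```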
ml_tools/utilities.py CHANGED
@@ -4,6 +4,19 @@ import pandas as pd
  import os
  from pathlib import Path
  import re
+ from typing import Literal
+
+
+ # Keep track of available tools
+ __all__ = [
+ "list_csv_paths",
+ "load_dataframe",
+ "yield_dataframes_from_dir",
+ "merge_dataframes",
+ "save_dataframe",
+ "normalize_mixed_list",
+ "sanitize_filename"
+ ]


  def list_csv_paths(directory: str) -> dict[str, str]:
@@ -76,11 +89,93 @@ def yield_dataframes_from_dir(datasets_dir: str):
  for df_name, df_path in list_csv_paths(datasets_dir).items():
  df, _ = load_dataframe(df_path)
  yield df, df_name
+
+
+ def merge_dataframes(
+ *dfs: pd.DataFrame,
+ reset_index: bool = False,
+ direction: Literal["horizontal", "vertical"] = "horizontal"
+ ) -> pd.DataFrame:
+ """
+ Merges multiple DataFrames either horizontally or vertically.
+
+ Parameters:
+ *dfs (pd.DataFrame): Variable number of DataFrames to merge.
+ reset_index (bool): Whether to reset index in the final merged DataFrame.
+ direction (["horizontal" | "vertical"]):
+ - "horizontal": Merge on index, adding columns.
+ - "vertical": Append rows; all DataFrames must have identical columns.
+
+ Returns:
+ pd.DataFrame: A single merged DataFrame.
+
+ Raises:
+ ValueError:
+ - If fewer than 2 DataFrames are provided.
+ - If indexes do not match for horizontal merge.
+ - If column names or order differ for vertical merge.
+ """
+ if len(dfs) < 2:
+ raise ValueError("At least 2 DataFrames must be provided.")
+
+ for i, df in enumerate(dfs, start=1):
+ print(f"DataFrame {i} shape: {df.shape}")
+
+
+ if direction == "horizontal":
+ reference_index = dfs[0].index
+ for i, df in enumerate(dfs, start=1):
+ if not df.index.equals(reference_index):
+ raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+ merged_df = pd.concat(dfs, axis=1)
+
+ elif direction == "vertical":
+ reference_columns = dfs[0].columns
+ for i, df in enumerate(dfs, start=1):
+ if not df.columns.equals(reference_columns):
+ raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+ merged_df = pd.concat(dfs, axis=0)
+
+ else:
+ raise ValueError(f"Invalid merge direction: {direction}")
+
+ if reset_index:
+ merged_df = merged_df.reset_index(drop=True)
+
+ print(f"Merged DataFrame shape: {merged_df.shape}")
+
+ return merged_df
+
+
+ def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+ """
+ Save a pandas DataFrame to a CSV file.
+
+ Parameters:
+ df: pandas.DataFrame to save
+ save_dir: str, directory where the CSV file will be saved.
+ filename: str, CSV filename, extension will be added if missing.
+ """
+ if df.empty:
+ print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
+ return
+
+ os.makedirs(save_dir, exist_ok=True)
+
+ filename = sanitize_filename(filename)
+
+ if not filename.endswith('.csv'):
+ filename += '.csv'

+ output_path = os.path.join(save_dir, filename)

+ df.to_csv(output_path, index=False, encoding='utf-8')
+ print(f"✅ Saved file: '{filename}'")
+
+
  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
  """
- Normalize a mixed list of numeric values and strings so that the sum of the values equals 1.0,
+ Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
  applying heuristic adjustments to correct for potential data entry scale mismatches.

  Parameters:
@@ -168,27 +263,14 @@ def sanitize_filename(filename: str) -> str:
  return sanitized


- def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+ def _script_info(all_data: list[str]):
  """
- Save a pandas DataFrame to a CSV file.
-
- Parameters:
- df: pandas.DataFrame to save
- save_dir: str, directory where the CSV file will be saved.
- filename: str, CSV filename, extension will be added if missing.
+ List available names.
  """
- if df.empty:
- print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
- return
-
- os.makedirs(save_dir, exist_ok=True)
-
- filename = sanitize_filename(filename)
-
- if not filename.endswith('.csv'):
- filename += '.csv'
-
- output_path = os.path.join(save_dir, filename)
-
- df.to_csv(output_path, index=False, encoding='utf-8')
- print(f"✅ Saved file: '{filename}'")
+ print("Available functions and objects:")
+ for i, name in enumerate(all_data, start=1):
+ print(f"{i} - {name}")
+
+
+ def info():
+ _script_info(__all__)
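`merge_dataframes` moved here from `data_exploration`, and `save_dataframe` was relocated ahead of `normalize_mixed_list`. A small usage sketch with toy data; the output directory and filename are made up:

```python
# Illustrative sketch of the relocated utilities; data and paths are fabricated.
import pandas as pd
from ml_tools.utilities import merge_dataframes, save_dataframe

features = pd.DataFrame({"x1": [1, 2, 3], "x2": [4, 5, 6]})
targets = pd.DataFrame({"y": [0, 1, 0]})

merged = merge_dataframes(features, targets, direction="horizontal", reset_index=True)
save_dataframe(merged, save_dir="output", filename="merged dataset")  # name sanitized, '.csv' appended
```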
ml_tools/vision_helpers.py CHANGED
@@ -4,9 +4,18 @@ from PIL import Image, ImageOps
  from typing import Literal
  from torchvision import transforms
  import torch
+ from .utilities import _script_info
+
+
+ __all__ = [
+ "inspect_images",
+ "image_augmentation",
+ "ResizeAspectFill",
+ "is_image",
+ "model_predict"
+ ]


- # --- Helper Functions ---
  def inspect_images(path: str):
  """
  Prints out the types, sizes and channels of image files found in the directory and its subdirectories.
@@ -216,3 +225,7 @@ def model_predict(model: torch.nn.Module, kind: Literal["regression", "classific
  results.append(output.view(view_as).cpu().tolist())

  return results
+
+
+ def info():
+ _script_info(__all__)
@@ -1,19 +0,0 @@
- dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
- ml_tools/MICE_imputation.py,sha256=4kqZiesk8vyh4MBLnNE9grflG4fDusqzuYBElsbk4LY,9484
- ml_tools/VIF_factor.py,sha256=rHSAxQcXLrG8dIjCXBAvETsSkCBfYus9NqimOnm2Bvk,9559
- ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ml_tools/data_exploration.py,sha256=qtkGumckC2PmTpj3brVFi072ewX0OI6dwUF4Or7Yikg,21341
- ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
- ml_tools/ensemble_learning.py,sha256=wK6mtOE4v9AWlxkcWhJj5XZjREChxb46kE0i2IxS-OE,28372
- ml_tools/handle_excel.py,sha256=IR0VQc3hYdmjwC31E5YxDnRcWig4jSIx7Y_7to-KZz4,11969
- ml_tools/logger.py,sha256=XwSpCUzw2Le24fJHyljBxNLgw63SwjZ0pMjTJqf0ylI,4622
- ml_tools/particle_swarm_optimization.py,sha256=jpkje4OETC9fyISxxUTx4XGrImSU6gDEcwz46ZDs2bQ,19250
- ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,9908
- ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
- ml_tools/utilities.py,sha256=gr1cyRUfZcRo9fjWpCaQkrvWY0-xJnDJdrE8JEsOi8o,6309
- ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
- dragon_ml_toolbox-1.4.0.dist-info/METADATA,sha256=V7Y96iAbgX6Xl6RWzEt4nGfKMZe4cuLs0BrFQghXxX8,2335
- dragon_ml_toolbox-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-1.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-1.4.0.dist-info/RECORD,,