dragon-ml-toolbox 3.12.0__py3-none-any.whl → 3.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 3.12.0
+ Version: 3.12.1
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,26 +1,26 @@
- dragon_ml_toolbox-3.12.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-3.12.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ dragon_ml_toolbox-3.12.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-3.12.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
  ml_tools/ETL_engineering.py,sha256=yeZsW_7zRvEcuMZbM4E2GV1dxwBoWIeJAcFFk2AK0fY,39502
  ml_tools/GUI_tools.py,sha256=VonZEizPS0ncm8HWU-ik-SgcXKryJU8eSG7NN0QN9cc,42222
- ml_tools/MICE_imputation.py,sha256=rYqvwQDVtoAJJ0agXWoGzoZEHedWiA6QzcEKEIkiZ08,11388
+ ml_tools/MICE_imputation.py,sha256=7CDsIQxx5Jb_DwPAmWmz3FXcn85sUyH7g9UcZ1_E07s,11412
  ml_tools/ML_callbacks.py,sha256=g_9nSzoA22UJOQZCPKeDz-Ayh0ECFZLzRd6rZ8SokrE,13080
  ml_tools/ML_evaluation.py,sha256=oiDV6HItQloUUKCUpltV-2pogubWLBieGpc-VUwosAQ,10106
  ml_tools/ML_trainer.py,sha256=gGXAu65v_5yYCqKqmHpSLJ3yY0M_Scr_nJ6qHBHSK1k,14487
  ml_tools/ML_tutorial.py,sha256=m5mZPULhO4mOpfp32fM_mUNVduv-S2hoKNbsZObNI4k,12233
- ml_tools/PSO_optimization.py,sha256=64sQCavw8ecFr318-fugnax8LhjSWiR4aiH6aYiVD2k,24839
+ ml_tools/PSO_optimization.py,sha256=1wRM-goZSwCji5LQVDP1VjF0LyGN5-QWBvofbwfjQRQ,24780
  ml_tools/RNN_forecast.py,sha256=IZLcPs3by0Chei7ill_Grjxs7BBUnzau0Oavi3dWiyE,1886
- ml_tools/VIF_factor.py,sha256=BeP4ig3l7b1Igwgte9z8rEwHdSZvVT7W_9mcBHGoNJw,10299
+ ml_tools/VIF_factor.py,sha256=gD3sZ9HBdTHlf4gbvUvx6kKczO_JFxMZKTXw1h0KVCg,10365
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
- ml_tools/data_exploration.py,sha256=M7bn2q5XN9zJZJGAmMMFSFFZh8LGzC2arFelrXw3N6Q,25241
+ ml_tools/data_exploration.py,sha256=ZpjK_lN5mDhjf9iQpvyYNA2SF7M5q4D5m09saln7YFI,25241
  ml_tools/datasetmaster.py,sha256=S3PKHNQZ9cyAOck8xQltVLZhaD1gFLfgHFL-aRjz4JU,30077
  ml_tools/ensemble_learning.py,sha256=D-9IbOKtCvyAB-LbPu3sdSRtdp0RZIcQEZcyMnarHmQ,45758
- ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
+ ml_tools/handle_excel.py,sha256=2Q_MBArss4emPQ8p-Uj9x_e7wGg3OoYM2AU_HG59UCY,12978
  ml_tools/keys.py,sha256=A3mLrtLZrxL27whAs2F1GPqZ1KzJpxBp6QbhxY5ioPI,636
  ml_tools/logger.py,sha256=UkbiU9ihBhw9VKyn3rZzisdClWV94EBV6B09_D0iUU0,6026
- ml_tools/path_manager.py,sha256=OCpESgdftbi6mOxetDMIaHhazt4N-W8pJx11X3-yNOs,8305
- ml_tools/utilities.py,sha256=FW97hMTLLxjDR1so-C-_yDm_iz2z_YfirRXjG_IwSLo,22843
- dragon_ml_toolbox-3.12.0.dist-info/METADATA,sha256=JD5pg6MBVM3stGknoD2vwec1pKgykEwNVtRmanRV2sw,3274
- dragon_ml_toolbox-3.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-3.12.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-3.12.0.dist-info/RECORD,,
+ ml_tools/path_manager.py,sha256=1LD9JFzqVyJQl2kTA7tK930_IV3qxfiV4cMIBzItytY,8309
+ ml_tools/utilities.py,sha256=Vh4ZdI03g8EpgQL7KDwnAw2vtBlHtx6KxCuAATxLvT4,24208
+ dragon_ml_toolbox-3.12.1.dist-info/METADATA,sha256=vwEN95BhK71LrhuuTuZbxdyfdq_X5VljuP89uXNguok,3274
+ dragon_ml_toolbox-3.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-3.12.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-3.12.1.dist-info/RECORD,,
@@ -35,7 +35,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
  imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]

  if imputed_datasets is None or len(imputed_datasets) == 0:
- raise ValueError("No imputed datasets were generated. Check the MICE process.")
+ raise ValueError("No imputed datasets were generated. Check the MICE process.")

  # threshold binary columns
  if binary_columns is not None:
@@ -56,8 +56,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str

  # Ensure indexes match
  for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
- assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
- assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
+ assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
+ assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
  # print("✅ All imputed datasets match the original DataFrame indexes.")

  return kernel, imputed_datasets, imputed_dataset_names
@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
  dataset_count = kernel.num_datasets

  if dataset_count != len(imputed_dataset_names):
- raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
+ raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")

  # Check path
  root_path = make_fullpath(root_dir, make=True)
@@ -152,7 +152,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  """Helper function to add labels and legends to a figure"""

  if not isinstance(fig, ggplot):
- raise TypeError("Expected a plotnine.ggplot object")
+ raise TypeError("Expected a plotnine.ggplot object")

  # Edit labels and title
  fig = fig + theme(
@@ -166,7 +166,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  fig = fig.draw()

  if not hasattr(fig, 'axes') or len(fig.axes) == 0:
- raise RuntimeError("Rendered figure has no axes to modify")
+ raise RuntimeError("Rendered figure has no axes to modify")

  if filename == "Combined_Distributions":
  custom_xlabel = "Feature Values"
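The hunks above only re-indent error messages inside `apply_mice` and its helpers; judging by the function names and the RECORD entry above, they belong to ml_tools/MICE_imputation.py. For orientation, a minimal usage sketch, assuming `miceforest` is installed and using only the parameters visible in the hunk headers (the DataFrame is made up):

```python
import numpy as np
import pandas as pd
from ml_tools.MICE_imputation import apply_mice  # assumed import path

df = pd.DataFrame({
    "age": [34.0, np.nan, 51.0, 42.0, np.nan],
    "income": [52_000.0, 61_000.0, np.nan, 58_000.0, 49_000.0],
})

# Per the hunks above, apply_mice returns the fitted kernel plus the completed
# datasets and their names.
kernel, imputed_datasets, imputed_dataset_names = apply_mice(
    df=df, df_name="toy_dataset", binary_columns=None
)

for imputed_df, name in zip(imputed_datasets, imputed_dataset_names):
    print(name, imputed_df.isna().sum().sum())  # expect no remaining missing values
```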
@@ -530,10 +530,8 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
  results_path = make_fullpath(results_dir)
  output_path = make_fullpath(save_dir, make=True)

- all_csvs = list_csv_paths(results_path)
- if not all_csvs:
- _LOGGER.warning("⚠️ No data found. No plots will be generated.")
- return
+ # Check that the directory contains csv files
+ list_csv_paths(results_path, verbose=False)

  # --- Data Loading and Preparation ---
  _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
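This hunk (by elimination from the RECORD sizes above, most likely ml_tools/PSO_optimization.py) replaces a warn-and-return guard with a call to `list_csv_paths(..., verbose=False)`, which raises `IOError` when the directory holds no CSV files (see the utilities.py hunks below). A hedged sketch of the new failure mode, with an illustrative directory name and an assumed import path:

```python
from pathlib import Path
from ml_tools.PSO_optimization import plot_optimal_feature_distributions  # assumed import path

results_dir = Path("optimization_results")  # hypothetical directory of result CSVs

try:
    plot_optimal_feature_distributions(results_dir=results_dir, save_dir="plots")
except IOError as exc:
    # 3.12.0 logged a warning and returned on an empty directory;
    # 3.12.1 lets list_csv_paths raise instead.
    print(f"No CSV results to plot: {exc}")
```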
ml_tools/VIF_factor.py CHANGED
@@ -26,8 +26,7 @@ def compute_vif(
  save_dir: Optional[Union[str,Path]] = None,
  filename: Optional[str] = None,
  fontsize: int = 14,
- show_plot: bool = True,
- verbose: bool = True
+ show_plot: bool = True
  ) -> pd.DataFrame:
  """
  Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
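The `verbose` parameter is removed from `compute_vif`; its column warnings now always go through `_LOGGER`. A minimal call sketch under that assumption (the data is illustrative):

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif

df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0, 5.0],
    "x2": [2.1, 3.9, 6.2, 8.1, 9.9],   # nearly collinear with x1
    "noise": [0.3, 0.1, 0.4, 0.2, 0.5],
})

# 3.12.0: compute_vif(df, show_plot=False, verbose=False)
# 3.12.1: the verbose keyword is gone; passing it now raises TypeError.
vif_df = compute_vif(df, show_plot=False)
print(vif_df)
```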
@@ -54,21 +53,20 @@ def compute_vif(
  if use_columns is None:
  sanitized_columns = df.select_dtypes(include='number').columns.tolist()
  missing_features = set(ground_truth_cols) - set(sanitized_columns)
- if missing_features and verbose:
+ if missing_features:
  _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
  else:
  sanitized_columns = list()
  for feature in use_columns:
  if feature not in ground_truth_cols:
- if verbose:
- _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+ _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
  else:
  sanitized_columns.append(feature)

  if ignore_columns is not None and use_columns is None:
  missing_ignore = set(ignore_columns) - set(ground_truth_cols)
- if missing_ignore and verbose:
- _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
+ if missing_ignore:
+ _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not found in the Dataframe:\n{missing_ignore}")
  sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

  X = df[sanitized_columns].copy()
@@ -139,7 +137,7 @@ def compute_vif(
  filename += ".svg"
  full_save_path = save_path / filename
  plt.savefig(full_save_path, format='svg', bbox_inches='tight')
- print(f"\tSaved VIF plot: '{filename}'")
+ _LOGGER.info(f" Saved VIF plot: '{filename}'")

  if show_plot:
  plt.show()
@@ -164,11 +162,16 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  """
  # Ensure expected structure
  if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
- raise ValueError("`vif_df` must contain 'feature' and 'VIF' columns.")
+ raise ValueError("'vif_df' must contain 'feature' and 'VIF' columns.")

  # Identify features to drop
  to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
- _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
+ if len(to_drop) > 0:
+ _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}:")
+ for dropped_column in to_drop:
+ print(f"\t{dropped_column}")
+ else:
+ _LOGGER.info(f"No columns exceed the VIF threshold of '{threshold}'.")

  result_df = df.drop(columns=to_drop)

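`drop_vif_based` now logs each dropped column on its own line and reports when nothing exceeds the threshold. A sketch of the two-step workflow these hunks imply; the exact return value of `drop_vif_based` is not shown in the diff, so treating it as the reduced DataFrame is an assumption:

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

df = pd.DataFrame({
    "a": [1.0, 2.0, 3.0, 4.0, 5.0],
    "b": [2.0, 4.1, 5.9, 8.2, 9.8],   # highly correlated with "a"
    "c": [5.0, 3.0, 4.0, 2.0, 6.0],
})

vif_df = compute_vif(df, show_plot=False)             # expected to hold 'feature' and 'VIF' columns
reduced = drop_vif_based(df, vif_df, threshold=10.0)  # assumed to return the reduced DataFrame
print(reduced.columns.tolist())
```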
@@ -186,7 +189,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
  max_features_to_plot: int = 20,
  fontsize: int = 14):
  """
- Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
+ Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots will be displayed inline.
  Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

  Args:
@@ -216,8 +219,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
  fontsize=fontsize,
  save_dir=output_plot_directory,
  filename=df_name,
- show_plot=False,
- verbose=False)
+ show_plot=False)

  if output_dataset_path is not None:
  new_filename = df_name + '_VIF'
@@ -143,7 +143,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
  feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
  rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
  if len(rows_to_drop) > 0:
- print(f"📉 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+ print(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
  df_clean = df_clean.drop(index=rows_to_drop)
  else:
  print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
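Only the log emoji changes in this hunk, which by elimination (unchanged size but new hash in the RECORD above) appears to come from ml_tools/data_exploration.py. A hedged usage sketch; the `threshold` keyword and the returned cleaned DataFrame are inferred from the hunk body rather than shown:

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_rows_with_missing_data  # assumed module, see note above

df = pd.DataFrame({
    "target": [1.0, 0.0, 1.0, np.nan],
    "f1": [0.2, np.nan, 0.5, np.nan],
    "f2": [np.nan, np.nan, 0.7, np.nan],
})

# Rows whose feature columns are more than 70% missing get dropped.
clean_df = drop_rows_with_missing_data(df, targets=["target"], threshold=0.7)
print(clean_df.shape)
```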
ml_tools/handle_excel.py CHANGED
@@ -36,7 +36,7 @@ def find_excel_files(
  input_path = make_fullpath(directory)

  if not input_path.is_dir():
- raise NotADirectoryError(f"Directory not found: {input_path}")
+ raise NotADirectoryError(f"Directory not found: {input_path}")

  excel_files = [
  f for f in input_path.iterdir()
@@ -46,7 +46,7 @@ def find_excel_files(
  ]

  if not excel_files:
- raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+ raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")

  return excel_files

@@ -198,7 +198,7 @@ def validate_excel_schema(
  invalid_files.append(file)

  except Exception as e:
- _LOGGER.error(f"Error processing '{file}': {e}")
+ _LOGGER.error(f"Error processing '{file}': {e}")
  invalid_files.append(file)

  valid_excel_number = len(excel_paths) - len(invalid_files)
@@ -251,7 +251,7 @@ def vertical_merge_transform_excel(
  if target_columns is not None:
  missing = [col for col in target_columns if col not in df.columns]
  if missing:
- raise ValueError(f"Invalid columns in {file.name}: {missing}")
+ raise ValueError(f"Invalid columns in {file.name}: {missing}")
  df = df[target_columns]

  dataframes.append(df)
@@ -261,7 +261,7 @@ def vertical_merge_transform_excel(
  if rename_columns is not None:
  expected_len = len(target_columns if target_columns is not None else merged_df.columns)
  if len(rename_columns) != expected_len:
- raise ValueError("Length of 'rename_columns' must match the selected columns")
+ raise ValueError("Length of 'rename_columns' must match the selected columns")
  merged_df.columns = rename_columns

  merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -324,6 +324,9 @@ def horizontal_merge_transform_excel(
  merged_df = pd.concat(padded_dataframes, axis=1)

  duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()
+
+ if duplicate_columns:
+ _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")

  if skip_duplicates:
  merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
@@ -344,9 +347,7 @@ def horizontal_merge_transform_excel(
  merged_df.to_csv(csv_path, index=False, encoding='utf-8')

  _LOGGER.info(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
- if duplicate_columns:
- _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
-
+

  def info():
  _script_info(__all__)
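The functional change in `horizontal_merge_transform_excel` is that the duplicate-column warning now fires right after the concat instead of after the CSV is written. The underlying pandas pattern, shown standalone (plain pandas, not this package's API):

```python
import pandas as pd

left = pd.DataFrame({"id": [1, 2], "value": [10, 20]})
right = pd.DataFrame({"id": [1, 2], "score": [0.5, 0.9]})

merged = pd.concat([left, right], axis=1)

# Same duplicate check the diff moves earlier: column labels repeated after the merge.
duplicate_columns = merged.columns[merged.columns.duplicated()].tolist()
if duplicate_columns:
    print(f"Duplicate columns: {duplicate_columns}")

# skip_duplicates-style cleanup, keeping the first occurrence of each label.
deduped = merged.loc[:, ~merged.columns.duplicated()]
print(deduped.columns.tolist())
```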
ml_tools/path_manager.py CHANGED
@@ -102,7 +102,7 @@ class PathManager:
  for key in new_paths:
  if key in self._paths:
  raise KeyError(
- f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
+ f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
  )

  # Resolve any string paths to Path objects before storing
ml_tools/utilities.py CHANGED
@@ -32,28 +32,42 @@ __all__ = [
  def make_fullpath(
  input_path: Union[str, Path],
  make: bool = False,
- verbose: bool = False
+ verbose: bool = False,
+ enforce: Optional[Literal["directory", "file"]] = None
  ) -> Path:
  """
- Resolves a string or Path into an absolute Path.
+ Resolves a string or Path into an absolute Path, optionally creating it.

  - If the path exists, it is returned.
  - If the path does not exist and `make=True`, it will:
- - Create the file if the path has a suffix (i.e., is treated as a file)
+ - Create the file if the path has a suffix
  - Create the directory if it has no suffix
  - If `make=False` and the path does not exist, an error is raised.
+ - If `enforce`, raises an error if the resolved path is not what was enforced.
  - Optionally prints whether the resolved path is a file or directory.

  Parameters:
- input_path (str | Path): Path to resolve.
- make (bool): If True, attempt to create file or directory.
- verbose (bool): Print classification after resolution.
+ input_path (str | Path):
+ Path to resolve.
+ make (bool):
+ If True, attempt to create file or directory.
+ verbose (bool):
+ Print classification after resolution.
+ enforce ("directory" | "file" | None):
+ Raises an error if the resolved path is not what was enforced.

  Returns:
  Path: Resolved absolute path.

  Raises:
  ValueError: If the path doesn't exist and can't be created.
+ TypeError: If the final path does not match the `enforce` parameter.
+
+ ## 🗒️ Note:
+
+ Directories with dots will be treated as files.
+
+ Files without extension will be treated as directories.
  """
  path = Path(input_path).expanduser()

@@ -75,6 +89,12 @@ def make_fullpath(
  resolved = path.resolve(strict=True)
  except Exception as e:
  raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+ if enforce == "file" and not resolved.is_file():
+ raise TypeError(f"❌ Path was enforced as a file, but it is not: '{resolved}'")
+
+ if enforce == "directory" and not resolved.is_dir():
+ raise TypeError(f"❌ Path was enforced as a directory, but it is not: '{resolved}'")
  if verbose:
  if resolved.is_file():
  if resolved.is_file():
@@ -87,7 +107,7 @@ def make_fullpath(
  return resolved


- def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
+ def list_csv_paths(directory: Union[str,Path], verbose: bool=True) -> dict[str, Path]:
  """
  Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.

@@ -101,19 +121,20 @@ def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:

  csv_paths = list(dir_path.glob("*.csv"))
  if not csv_paths:
- raise IOError(f"No CSV files found in directory: {dir_path.name}")
+ raise IOError(f"No CSV files found in directory: {dir_path.name}")

  # make a dictionary of paths and names
  name_path_dict = {p.stem: p for p in csv_paths}

- print("\n🗂️ CSV files found:")
- for name in name_path_dict.keys():
- print(f"\t{name}")
+ if verbose:
+ print("\n🗂️ CSV files found:")
+ for name in name_path_dict.keys():
+ print(f"\t{name}")

  return name_path_dict


- def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
+ def list_files_by_extension(directory: Union[str,Path], extension: str, verbose: bool=True) -> dict[str, Path]:
  """
  Lists all files with the specified extension in the given directory and returns a mapping:
  filenames (without extensions) to their absolute paths.
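Two additions shown above: `make_fullpath` gains an `enforce` guard that raises `TypeError` on a mismatch, and the listing helpers gain a `verbose` switch. A small sketch combining both (the paths are hypothetical):

```python
from ml_tools.utilities import make_fullpath, list_csv_paths

# Resolve an existing data directory; enforce="directory" raises TypeError
# if the resolved path is actually a file.
data_dir = make_fullpath("datasets", enforce="directory")  # hypothetical folder

# verbose=False suppresses the "CSV files found" listing added behind the new flag.
# Note: list_csv_paths still raises IOError if the directory holds no CSV files.
csv_map = list_csv_paths(data_dir, verbose=False)  # {file stem: absolute Path}

for name, path in csv_map.items():
    print(name, path)
```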
@@ -133,13 +154,14 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[

  matched_paths = list(dir_path.glob(pattern))
  if not matched_paths:
- raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+ raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")

  name_path_dict = {p.stem: p for p in matched_paths}

- print(f"\n📂 '{normalized_ext.upper()}' files found:")
- for name in name_path_dict:
- print(f"\t{name}")
+ if verbose:
+ print(f"\n📂 '{normalized_ext.upper()}' files found:")
+ for name in name_path_dict:
+ print(f"\t{name}")

  return name_path_dict

@@ -147,7 +169,8 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
  def load_dataframe(
  df_path: Union[str, Path],
  kind: Literal["pandas", "polars"] = "pandas",
- all_strings: bool = False
+ all_strings: bool = False,
+ verbose: bool = True
  ) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
  """
  Load a CSV file into a DataFrame and extract its base name.
@@ -191,20 +214,21 @@ def load_dataframe(
  df = pl.read_csv(path, infer_schema_length=1000)

  else:
- raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+ raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")

  # This check works for both pandas and polars DataFrames
  if df.shape[0] == 0:
- raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+ raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")

- print(f"\n💿 Loaded {kind} dataset: '{df_name}' with shape: {df.shape}")
+ if verbose:
+ print(f"\n💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")

  return df, df_name


- def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
+ def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
  """
- Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.
+ Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.

  Parameters:
  datasets_dir (str | Path):
@@ -221,9 +245,10 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
  - Output is streamed via a generator to support lazy loading of multiple datasets.
  """
  datasets_path = make_fullpath(datasets_dir)
- for df_name, df_path in list_csv_paths(datasets_path).items():
+ files_dict = list_csv_paths(datasets_path, verbose=verbose)
+ for df_name, df_path in files_dict.items():
  df: pd.DataFrame
- df, _ = load_dataframe(df_path, kind="pandas") # type: ignore
+ df, _ = load_dataframe(df_path, kind="pandas", verbose=verbose) # type: ignore
  yield df, df_name

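`load_dataframe` and `yield_dataframes_from_dir` gain the same `verbose` pass-through, so batch loading can run silently. A sketch with hypothetical paths:

```python
from ml_tools.utilities import load_dataframe, yield_dataframes_from_dir

# Single file, silenced: no "Loaded PANDAS dataset ..." banner in 3.12.1.
df, df_name = load_dataframe("datasets/housing.csv", kind="pandas", verbose=False)

# Whole directory: verbose is forwarded to both list_csv_paths and load_dataframe.
for frame, name in yield_dataframes_from_dir("datasets", verbose=False):
    print(name, frame.shape)
```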
@@ -253,35 +278,35 @@ def merge_dataframes(
  - If column names or order differ for vertical merge.
  """
  if len(dfs) < 2:
- raise ValueError("At least 2 DataFrames must be provided.")
+ raise ValueError("At least 2 DataFrames must be provided.")

  if verbose:
  for i, df in enumerate(dfs, start=1):
- print(f"DataFrame {i} shape: {df.shape}")
+ print(f"➡️ DataFrame {i} shape: {df.shape}")


  if direction == "horizontal":
  reference_index = dfs[0].index
  for i, df in enumerate(dfs, start=1):
  if not df.index.equals(reference_index):
- raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+ raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
  merged_df = pd.concat(dfs, axis=1)

  elif direction == "vertical":
  reference_columns = dfs[0].columns
  for i, df in enumerate(dfs, start=1):
  if not df.columns.equals(reference_columns):
- raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+ raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
  merged_df = pd.concat(dfs, axis=0)

  else:
- raise ValueError(f"Invalid merge direction: {direction}")
+ raise ValueError(f"Invalid merge direction: {direction}")

  if reset_index:
  merged_df = merged_df.reset_index(drop=True)

  if verbose:
- print(f"Merged DataFrame shape: {merged_df.shape}")
+ print(f"\n✅ Merged DataFrame shape: {merged_df.shape}")

  return merged_df

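`merge_dataframes` only changes its log formatting; the validation logic is untouched. A hedged sketch — the hunk does not show the full signature, so passing the DataFrames positionally with the `direction`, `reset_index`, and `verbose` keywords is an assumption:

```python
import pandas as pd
from ml_tools.utilities import merge_dataframes

df_a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
df_b = pd.DataFrame({"x": [5, 6], "y": [7, 8]})

# Vertical merge requires identical column names/order, per the checks above.
merged = merge_dataframes(df_a, df_b, direction="vertical", reset_index=True, verbose=True)
print(merged.shape)  # expected (4, 2)
```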
@@ -320,9 +345,9 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
  df.write_csv(output_path) # Polars defaults to utf8 and no index
  else:
  # This error handles cases where an unsupported type is passed
- raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+ raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")

- print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
+ print(f"\n✅ Saved dataset: '{filename}' with shape: {df.shape}")


  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -356,7 +381,7 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:

  # Raise for negative values
  if any(x < 0 for x in float_list):
- raise ValueError("Negative values are not allowed in the input list.")
+ raise ValueError("Negative values are not allowed in the input list.")

  # Step 2: Compute log10 of non-zero values
  nonzero = [x for x in float_list if x > 0]
@@ -395,7 +420,7 @@ def sanitize_filename(filename: str) -> str:
  - Removing or replacing characters invalid in filenames.

  Args:
- name (str): Base filename.
+ filename (str): Base filename.

  Returns:
  str: A sanitized string suitable to use as a filename.
@@ -408,6 +433,10 @@ def sanitize_filename(filename: str) -> str:

  # Conservative filter to keep filenames safe across platforms
  sanitized = re.sub(r'[^\w\-.]', '', sanitized)
+
+ # Check for empty string after sanitization
+ if not sanitized:
+ raise ValueError("The sanitized filename is empty. The original input may have contained only invalid characters.")

  return sanitized

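`sanitize_filename` now raises instead of silently returning an empty string. A quick sketch of both outcomes (inputs are arbitrary, and the exact sanitized output depends on replacement steps not shown in this hunk):

```python
from ml_tools.utilities import sanitize_filename

print(sanitize_filename("report 2024: draft?.csv"))  # invalid characters removed or replaced

try:
    sanitize_filename("????")  # nothing survives the conservative filter
except ValueError as exc:
    print(f"Rejected: {exc}")  # new in 3.12.1
```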
@@ -418,6 +447,8 @@ def threshold_binary_values(
  ) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
  """
  Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
+
+ Binary elements are converted to 0 or 1 using a 0.5 threshold.

  Parameters:
  input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
@@ -426,7 +457,8 @@ def threshold_binary_values(
  - If `int`, only this many last `binary_values` are thresholded.

  Returns:
- Same type as input, with binary elements binarized to 0 or 1 using a 0.5 threshold.
+ Any:
+ Same type as input
  """
  original_type = type(input_array)

@@ -437,14 +469,14 @@ def threshold_binary_values(
  elif isinstance(input_array, (list, tuple)):
  array = np.array(input_array)
  else:
- raise TypeError("Unsupported input type")
+ raise TypeError("Unsupported input type")

  array = array.flatten()
  total = array.shape[0]

  bin_count = total if binary_values is None else binary_values
  if not (0 <= bin_count <= total):
- raise ValueError("binary_values must be between 0 and the total number of elements")
+ raise ValueError("binary_values must be between 0 and the total number of elements")

  if bin_count == 0:
  result = array
@@ -484,9 +516,9 @@ def threshold_binary_values_batch(
  np.ndarray
  Thresholded array, same shape as input.
  """
- assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
+ assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
  batch_size, total_features = input_array.shape
- assert 0 <= binary_values <= total_features, "binary_values out of valid range"
+ assert 0 <= binary_values <= total_features, "binary_values out of valid range"

  if binary_values == 0:
  return input_array.copy()
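Both thresholding helpers only had docstrings and error-message indentation touched; behaviour is unchanged: the last `binary_values` entries are snapped to 0/1 at a 0.5 cutoff. Sketch:

```python
import numpy as np
from ml_tools.utilities import threshold_binary_values, threshold_binary_values_batch

# 1D: leave the leading continuous values alone, snap the trailing two to 0/1.
mixed = [3.7, 0.42, 0.81, 0.12]
print(threshold_binary_values(mixed, binary_values=2))  # last two entries become 1 and 0

# 2D batch variant: the last column of every row is treated as binary.
batch = np.array([[1.5, 0.7],
                  [2.2, 0.3]])
print(threshold_binary_values_batch(batch, binary_values=1))
```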
@@ -523,7 +555,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
  return None
  else:
  if verbose:
- print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+ print(f"\n✅ Object of type '{type(obj)}' saved to '{full_path}'")
  return None


@@ -550,7 +582,7 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
  return None
  else:
  if verbose:
- print(f"✅ Loaded object of type '{type(obj)}'")
+ print(f"\n✅ Loaded object of type '{type(obj)}'")
  return obj

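The serialization helpers only gain a leading newline in their success messages. A round-trip sketch; the object, directory, and filename are arbitrary, and the file extension written by `serialize_object` is not visible in this diff:

```python
from ml_tools.utilities import serialize_object, deserialize_object

model_params = {"n_estimators": 200, "max_depth": 8}

# Prints "✅ Object of type ... saved to ..." (now with a leading newline) unless verbose=False.
serialize_object(model_params, save_dir="artifacts", filename="model_params")

saved_path = "artifacts/model_params.joblib"  # placeholder; match whatever extension serialize_object used
restored = deserialize_object(saved_path, verbose=True)
print(restored)
```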