dragon-ml-toolbox 10.1.1__py3-none-any.whl → 14.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (48)
  1. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
  2. dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
  3. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
  4. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
  5. ml_tools/ETL_cleaning.py +175 -59
  6. ml_tools/ETL_engineering.py +506 -70
  7. ml_tools/GUI_tools.py +2 -1
  8. ml_tools/MICE_imputation.py +212 -7
  9. ml_tools/ML_callbacks.py +73 -40
  10. ml_tools/ML_datasetmaster.py +267 -284
  11. ml_tools/ML_evaluation.py +119 -58
  12. ml_tools/ML_evaluation_multi.py +107 -32
  13. ml_tools/ML_inference.py +15 -5
  14. ml_tools/ML_models.py +234 -170
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +321 -97
  17. ml_tools/ML_scaler.py +10 -5
  18. ml_tools/ML_trainer.py +585 -40
  19. ml_tools/ML_utilities.py +528 -0
  20. ml_tools/ML_vision_datasetmaster.py +1315 -0
  21. ml_tools/ML_vision_evaluation.py +260 -0
  22. ml_tools/ML_vision_inference.py +428 -0
  23. ml_tools/ML_vision_models.py +627 -0
  24. ml_tools/ML_vision_transformers.py +58 -0
  25. ml_tools/PSO_optimization.py +10 -7
  26. ml_tools/RNN_forecast.py +2 -0
  27. ml_tools/SQL.py +22 -9
  28. ml_tools/VIF_factor.py +4 -3
  29. ml_tools/_ML_vision_recipe.py +88 -0
  30. ml_tools/__init__.py +1 -0
  31. ml_tools/_logger.py +0 -2
  32. ml_tools/_schema.py +96 -0
  33. ml_tools/constants.py +79 -0
  34. ml_tools/custom_logger.py +164 -16
  35. ml_tools/data_exploration.py +1092 -109
  36. ml_tools/ensemble_evaluation.py +48 -1
  37. ml_tools/ensemble_inference.py +6 -7
  38. ml_tools/ensemble_learning.py +4 -3
  39. ml_tools/handle_excel.py +1 -0
  40. ml_tools/keys.py +80 -0
  41. ml_tools/math_utilities.py +259 -0
  42. ml_tools/optimization_tools.py +198 -24
  43. ml_tools/path_manager.py +144 -45
  44. ml_tools/serde.py +192 -0
  45. ml_tools/utilities.py +287 -227
  46. dragon_ml_toolbox-10.1.1.dist-info/RECORD +0 -36
  47. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
  48. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA CHANGED
@@ -1,23 +1,17 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 10.1.1
+ Version: 14.2.0
  Summary: A collection of tools for data science and machine learning projects.
- Author-email: Karl Loza <luigiloza@gmail.com>
+ Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT
  Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
  Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
  Classifier: Programming Language :: Python :: 3
  Classifier: Operating System :: OS Independent
- Requires-Python: >=3.10
+ Requires-Python: >=3.12
  Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: LICENSE-THIRD-PARTY.md
- Provides-Extra: base
- Requires-Dist: pandas; extra == "base"
- Requires-Dist: numpy; extra == "base"
- Requires-Dist: polars; extra == "base"
- Requires-Dist: joblib; extra == "base"
- Requires-Dist: colorlog; extra == "base"
  Provides-Extra: ml
  Requires-Dist: numpy>=2.0; extra == "ml"
  Requires-Dist: pandas; extra == "ml"
@@ -38,7 +32,12 @@ Requires-Dist: shap; extra == "ml"
  Requires-Dist: tqdm; extra == "ml"
  Requires-Dist: Pillow; extra == "ml"
  Requires-Dist: evotorch; extra == "ml"
+ Requires-Dist: pyarrow; extra == "ml"
  Requires-Dist: colorlog; extra == "ml"
+ Requires-Dist: torchmetrics; extra == "ml"
+ Provides-Extra: py-tab
+ Requires-Dist: pytorch_tabular; extra == "py-tab"
+ Requires-Dist: omegaconf; extra == "py-tab"
  Provides-Extra: mice
  Requires-Dist: numpy<2.0; extra == "mice"
  Requires-Dist: pandas; extra == "mice"
@@ -51,9 +50,7 @@ Requires-Dist: statsmodels; extra == "mice"
  Requires-Dist: lightgbm<=4.5.0; extra == "mice"
  Requires-Dist: shap; extra == "mice"
  Requires-Dist: colorlog; extra == "mice"
- Provides-Extra: pytorch
- Requires-Dist: torch; extra == "pytorch"
- Requires-Dist: torchvision; extra == "pytorch"
+ Requires-Dist: pyarrow; extra == "mice"
  Provides-Extra: excel
  Requires-Dist: pandas; extra == "excel"
  Requires-Dist: openpyxl; extra == "excel"
@@ -72,9 +69,6 @@ Requires-Dist: lightgbm; extra == "gui-boost"
  Provides-Extra: gui-torch
  Requires-Dist: numpy; extra == "gui-torch"
  Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui-torch"
- Provides-Extra: plot
- Requires-Dist: matplotlib; extra == "plot"
- Requires-Dist: seaborn; extra == "plot"
  Provides-Extra: pyinstaller
  Requires-Dist: pyinstaller; extra == "pyinstaller"
  Provides-Extra: nuitka
@@ -94,7 +88,7 @@ A collection of Python utilities for data science and machine learning, structur

  ## Installation

- **Python 3.10+**
+ **Python 3.12**

  ### Via PyPI

@@ -104,22 +98,22 @@ Install the latest stable release from PyPI:
  pip install dragon-ml-toolbox
  ```

- ### Via GitHub (Editable)
+ ### Via conda-forge

- Clone the repository and install in editable mode with optional dependencies:
+ Install from the conda-forge channel:

  ```bash
- git clone https://github.com/DrAg0n-BoRn/ML_tools.git
- cd ML_tools
- pip install -e .
+ conda install -c conda-forge dragon-ml-toolbox
  ```

- ### Via conda-forge
+ ### Via GitHub (Editable)

- Install from the conda-forge channel:
+ Clone the repository and install in editable mode:

  ```bash
- conda install -c conda-forge dragon-ml-toolbox
+ git clone https://github.com/DrAg0n-BoRn/ML_tools.git
+ cd ML_tools
+ pip install -e .
  ```

  ## Modular Installation
@@ -132,17 +126,12 @@ Installs a comprehensive set of tools for typical data science workflows, includ
  pip install "dragon-ml-toolbox[ML]"
  ```

- To install the standard CPU-only versions of Torch and Torchvision:
-
- ```Bash
- pip install "dragon-ml-toolbox[pytorch]"
- ```
-
- ⚠️ To make use of GPU acceleration (highly recommended), follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
+ ⚠️ PyTorch required, follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)

  #### Modules:

  ```bash
+ constants
  custom_logger
  data_exploration
  ensemble_evaluation
@@ -150,19 +139,28 @@ ensemble_inference
  ensemble_learning
  ETL_cleaning
  ETL_engineering
+ math_utilities
  ML_callbacks
  ML_datasetmaster
  ML_evaluation_multi
  ML_evaluation
  ML_inference
  ML_models
+ ML_models_advanced # Requires the extra flag [py-tab]
  ML_optimization
  ML_scaler
  ML_trainer
+ ML_utilities
+ ML_vision_datasetmaster
+ ML_vision_evaluation
+ ML_vision_inference
+ ML_vision_models
+ ML_vision_transformers
  optimization_tools
  path_manager
  PSO_optimization
  RNN_forecast
+ serde
  SQL
  utilities
  ```
@@ -180,8 +178,11 @@ pip install "dragon-ml-toolbox[mice]"
  #### Modules:

  ```Bash
+ constants
  custom_logger
+ math_utilities
  MICE_imputation
+ serde
  VIF_factor
  path_manager
  utilities
@@ -209,42 +210,37 @@ path_manager

  ### 🎰 GUI for Boosting Algorithms (XGBoost, LightGBM) [gui-boost]

- For GUIs that include plotting functionality, you must also install the [plot] extra.
+ GUI tools compatible with XGBoost and LightGBM models used for inference.

  ```Bash
  pip install "dragon-ml-toolbox[gui-boost]"
  ```

- ```Bash
- pip install "dragon-ml-toolbox[gui-boost,plot]"
- ```
-
  #### Modules:

  ```Bash
+ constants
  custom_logger
  GUI_tools
  ensemble_inference
  path_manager
+ serde
  ```

  ---

  ### 🤖 GUI for PyTorch Models [gui-torch]

- For GUIs that include plotting functionality, you must also install the [plot] extra.
+ GUI tools compatible with PyTorch models used for inference.

  ```Bash
  pip install "dragon-ml-toolbox[gui-torch]"
  ```

- ```Bash
- pip install "dragon-ml-toolbox[gui-torch,plot]"
- ```
-
  #### Modules:

  ```Bash
+ constants
  custom_logger
  GUI_tools
  ML_models
@@ -255,27 +251,6 @@ path_manager

  ---

- ### 🎫 Base Tools [base]
-
- General purpose functions and classes.
-
- ```Bash
- pip install "dragon-ml-toolbox[base]"
- ```
-
- #### Modules:
-
- ```Bash
- ETL_cleaning
- ETL_engineering
- custom_logger
- SQL
- utilities
- path_manager
- ```
-
- ---
-
  ### ⚒️ APP bundlers

  Choose one if needed.
@@ -293,6 +268,6 @@ pip install "dragon-ml-toolbox[nuitka]"
  After installation, import modules like this:

  ```python
- from ml_tools.utilities import serialize_object, deserialize_object
+ from ml_tools.serde import serialize_object, deserialize_object
  from ml_tools import custom_logger
  ```
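
For downstream code, the most visible breaking change in this metadata is the relocation of the serialization helpers out of `ml_tools.utilities` into the new `serde` module. A minimal migration sketch; only the import lines themselves are taken from this diff:

```python
# dragon-ml-toolbox 10.1.1 (removed in 14.x):
# from ml_tools.utilities import serialize_object, deserialize_object

# dragon-ml-toolbox 14.2.0:
from ml_tools.serde import serialize_object, deserialize_object
from ml_tools import custom_logger
```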
dragon_ml_toolbox-14.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,48 @@
+ dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+ dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
+ ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
+ ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
+ ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
+ ml_tools/MICE_imputation.py,sha256=KLJXGQLKJ6AuWWttAG-LCCaxpS-ygM4dXPiguHDaL6Y,20815
+ ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
+ ml_tools/ML_datasetmaster.py,sha256=rsJgZEGBJmfeKF6cR8CQZzfEx4T7Y-p1wUnR15_nNw0,28400
+ ml_tools/ML_evaluation.py,sha256=4GU86rUWMIGbkXrvN6PyjfGwKtWvXKE7pMlWpWeBq14,18988
+ ml_tools/ML_evaluation_multi.py,sha256=rJKdgtq-9I7oaI7PRzq7aIZ84XdNV0xzlVePZW4nj0k,16095
+ ml_tools/ML_inference.py,sha256=YJ953bhNWsdlPRtJQh3h2ACfMIgp8dQ9KtL9Azar-5s,23489
+ ml_tools/ML_models.py,sha256=PqOcNlws7vCJMbiVCKqlPuktxvskZVUHG3VfU-Yshf8,31415
+ ml_tools/ML_models_advanced.py,sha256=vk3PZBSu3DVso2S1rKTxxdS43XG8Q5FnasIL3-rMajc,12410
+ ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
+ ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
+ ml_tools/ML_trainer.py,sha256=ZWI4MbUcLeBxyfoUTL96l5tjHHMp9I64h4SdXnjYmBE,49795
+ ml_tools/ML_utilities.py,sha256=z6LbpbZwhn8F__fWlKi-g-cAJQXSxwg1NHfC5FBoAyc,21139
+ ml_tools/ML_vision_datasetmaster.py,sha256=tOrdatuq_AP8-GDiTrtARvSJdpc8h7dT-OhDJtRQnsk,54433
+ ml_tools/ML_vision_evaluation.py,sha256=t12R7i1RkOCt9zu1_lxSBr8OH6A6Get0k8ftDLctn6I,10486
+ ml_tools/ML_vision_inference.py,sha256=He3KV3VJAm8PwO-fOq4b9VO8UXFr-GmpuCnoHXf4VZI,20588
+ ml_tools/ML_vision_models.py,sha256=G3S4jB9AE9wMpU9ZygOgOx9q1K6t6LAXBYcJ-U2XQ1M,25600
+ ml_tools/ML_vision_transformers.py,sha256=95e0aBkHY5VDGE8i5xy57COU7NvSNIgFknnhBubwE40,1832
+ ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
+ ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
+ ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
+ ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
+ ml_tools/_ML_vision_recipe.py,sha256=zrgxFUvTJqQVuwR7jWlbIC2FD29u6eNFPkTRoJ7yEZI,3178
+ ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
+ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ ml_tools/_schema.py,sha256=yu6aWmn_2Z4_AxAtJGDDCIa96y6JcUp-vgnCS013Qmw,3908
+ ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
+ ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
+ ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
+ ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
+ ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
+ ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
+ ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
+ ml_tools/keys.py,sha256=wZOBuEnnHc54vlOZiimnrxfk-sZh6f6suPppJW8rbPQ,3326
+ ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
+ ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
+ ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
+ ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
+ ml_tools/utilities.py,sha256=aWqvYzmxlD74PD5Yqu1VuTekDJeYLQrmPIU_VeVyRp0,22526
+ dragon_ml_toolbox-14.2.0.dist-info/METADATA,sha256=T0eIxD-eO3cbAIzJ1HskJbog6RUYgXwXQQ2OU8Z-GQM,6475
+ dragon_ml_toolbox-14.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-14.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-14.2.0.dist-info/RECORD,,
{dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE CHANGED
@@ -1,6 +1,6 @@
  MIT License

- Copyright (c) 2025 Karl Loza
+ Copyright (c) 2025 Karl Luigi Loza Vidaurre

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
{dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md CHANGED
@@ -26,3 +26,14 @@ This project depends on the following third-party packages. Each is governed by
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
  - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
  - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
+ - [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
+ - [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
+ - [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
+ - [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
+ - [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
+ - [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
+ - [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
+ - [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
+ - [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
+ - [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
+ - [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
ml_tools/ETL_cleaning.py CHANGED
@@ -2,8 +2,10 @@ import polars as pl
  import pandas as pd
  from pathlib import Path
  from typing import Union, List, Dict
+
  from .path_manager import sanitize_filename, make_fullpath
- from .utilities import save_dataframe, load_dataframe
+ from .data_exploration import drop_macro
+ from .utilities import save_dataframe_filename, load_dataframe
  from ._script_info import _script_info
  from ._logger import _LOGGER

@@ -11,26 +13,33 @@ from ._logger import _LOGGER
  __all__ = [
      "save_unique_values",
      "basic_clean",
+     "basic_clean_drop",
      "ColumnCleaner",
      "DataFrameCleaner"
  ]


  ################ Unique Values per column #################
- def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
+ def save_unique_values(csv_path: Union[str, Path],
+                        output_dir: Union[str, Path],
+                        verbose: bool=False,
+                        keep_column_order: bool = True) -> None:
      """
      Loads a CSV file, then analyzes it and saves the unique non-null values
      from each column into a separate text file exactly as they appear.

      This is useful for understanding the raw categories or range of values
-     within a dataset before cleaning.
+     within a dataset before and after cleaning.

      Args:
-         csv_path (Union[str, Path]):
+         csv_path (str | Path):
              The file path to the input CSV file.
-         output_dir (Union[str, Path]):
+         output_dir (str | Path):
              The path to the directory where the .txt files will be saved.
              The directory will be created if it does not exist.
+         keep_column_order (bool):
+             If True, prepends a numeric prefix (e.g., '1_', '2_') to each
+             output filename to maintain the original column order.
      """
      # --- 1. Input Validation ---
      csv_path = make_fullpath(input_path=csv_path, enforce="file")
@@ -72,7 +81,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
          sanitized_name = sanitize_filename(column_name)
          if not sanitized_name.strip('_'):
              sanitized_name = f'column_{i}'
-         file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+         # --- create filename prefix ---
+         # If keep_column_order is True, create a prefix like "1_", "2_", etc.
+         prefix = f"{i + 1}_" if keep_column_order else ''
+
+         file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"

          # --- Write to file ---
          try:
@@ -93,39 +107,8 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
      _LOGGER.info(f"{counter} files of unique values created.")


- ########## Basic df cleaner #############
- def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path,None]=None):
-     """
-     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
-
-     The cleaning process includes:
-     - Normalizing full-width and typographical punctuation to standard equivalents.
-     - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
-     - Stripping any leading or trailing whitespace.
-     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
-     - Converting strings that become empty after cleaning into true null values.
-     - Normalizing all text to lowercase.
-
-     Args:
-         input_filepath (Union[str, Path]):
-             The path to the source CSV file to be cleaned.
-         output_filepath (Union[str, Path, None], optional):
-             The path to save the cleaned CSV file. If None (default),
-             the original input file will be overwritten.
-     """
-     # Handle paths
-     input_path = make_fullpath(input_filepath, enforce="file")
-
-     # Unless explicitly defined, overwrite file.
-     if output_filepath is not None:
-         parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
-         output_path = parent_dir / Path(output_filepath).name
-     else:
-         output_path = input_path
-
-     # load polars df
-     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
-
+ ########## Basic df cleaners #############
+ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
      # Cleaning rules
      cleaning_rules = {
          # 1. Comprehensive Punctuation & Symbol Normalization
@@ -141,6 +124,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
          '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
          '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
          '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
+         '⁺': '', '⁻': '', '₊': '', '₋': '',
          # Uppercase Alphabet
          'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F',
          'G': 'G', 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L',
@@ -154,26 +138,44 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
          's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
          'y': 'y', 'z': 'z',
          # Punctuation
-         '》': '>', '《': '<', ':': ':', ',': ',', '。': '.', ';': ';', '【': '[', '】': ']',
+         '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
          '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
-         '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '\\', '|': '|', '':',', '':'=',
+         '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '':'=', '·': '', '': '',
+         '¯': '-',
+
+         # Commas (avoid commas in entries)
+         ',': ';',
+         ',': ';',
+         '、': ';',

          # Others
+         'σ': '',
+         '□': '',
          '©': '',
          '®': '',
          '™': '',
+         r'[°˚]': '',
+
+         # Replace special characters in entries
+         r'\\': '_',
+
+         # Typographical standardization
+         # Unify various dashes and hyphens to a standard hyphen
+         r'[—–―]': '-',
+         r'−': '-',
+         # remove various quote types
+         r'[“”"]': '',
+         r"[‘’′']": '',

          # Collapse repeating punctuation
          r'\.{2,}': '.', # Replace two or more dots with a single dot
          r'\?{2,}': '?', # Replace two or more question marks with a single question mark
          r'!{2,}': '!', # Replace two or more exclamation marks with a single one
-
-         # Typographical standardization
-         # Unify various dashes and hyphens to a standard hyphen-minus
-         r'[—–―]': '-',
-         # Unify various quote types to standard quotes
-         r'[“”]': "'",
-         r'[‘’′]': "'",
+         r';{2,}': ';',
+         r'-{2,}': '-',
+         r'/{2,}': '/',
+         r'%{2,}': '%',
+         r'&{2,}': '&',

          # 2. Internal Whitespace Consolidation
          # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -184,36 +186,150 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
          r'^\s+|\s+$': '',

          # 4. Textual Null Standardization (New Step)
-         # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
-         r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
+         # Convert common null-like text to actual nulls.
+         r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,

          # 5. Final Nullification of Empty Strings
          # After all cleaning, if a string is now empty, convert it to a null
-         r'^$': None
+         r'^\s*$': None,
+         r'^$': None,
      }

      # Clean data
      try:
          # Create a cleaner for every column in the dataframe
-         all_columns = df.columns
+         all_columns = df_in.columns
          column_cleaners = [
              ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
          ]

          # Instantiate and run the main dataframe cleaner
          df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
-         df_cleaned = df_cleaner.clean(df, clone_df=False) # Use clone_df=False for efficiency
+         df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency

          # apply lowercase to all string columns
-         df_final = df_cleaned.with_columns(
-             pl.col(pl.String).str.to_lowercase()
-         )
+         if all_lowercase:
+             df_final = df_cleaned.with_columns(
+                 pl.col(pl.String).str.to_lowercase()
+             )
+         else:
+             df_final = df_cleaned
+
      except Exception as e:
-         _LOGGER.error(f"An error occurred during the cleaning process for '{input_path.name}'.")
+         _LOGGER.error(f"An error occurred during the cleaning process.")
          raise e
+     else:
+         return df_final
+
+
+ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
+     # Handle paths
+     input_path = make_fullpath(path_in, enforce="file")
+
+     parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
+     output_path = parent_dir / Path(path_out).name
+
+     return input_path, output_path
+
+
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
+     """
+     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
+
+     The cleaning process includes:
+     - Normalizing full-width and typographical punctuation to standard equivalents.
+     - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
+     - Stripping any leading or trailing whitespace.
+     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
+     - Converting strings that become empty after cleaning into true null values.
+     - Normalizing all text to lowercase (Optional).
+
+     Args:
+         input_filepath (str | Path):
+             The path to the source CSV file to be cleaned.
+         output_filepath (str | Path):
+             The path to save the cleaned CSV file.
+         all_lowercase (bool):
+             Whether to normalize all text to lowercase.
+
+     """
+     # Handle paths
+     input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+     # load polars df
+     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+     # CLEAN
+     df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
+
+     # Save cleaned dataframe
+     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+     _LOGGER.info(f"Data successfully cleaned.")
+
+
+ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
+                      skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
+     """
+     Performs standardized cleaning followed by iterative removal of rows and
+     columns with excessive missing data.
+
+     This function combines the functionality of `basic_clean` and `drop_macro`. It first
+     applies a comprehensive normalization process to all columns in the input CSV file,
+     ensuring consistent formatting and proper null value handling. The cleaned data is then
+     converted to a pandas DataFrame, where iterative row and column dropping is applied
+     to remove redundant or incomplete data.
+
+     The iterative dropping cycle continues until no further rows or columns meet the
+     removal criteria, ensuring that dependencies between row and column deletions are
+     fully resolved. Logs documenting the missing data profile before and after the
+     dropping process are saved to the specified log directory.
+
+     Args:
+         input_filepath (str | Path):
+             The path to the source CSV file to be cleaned.
+         output_filepath (str | Path):
+             The path to save the fully cleaned CSV file after cleaning
+             and missing-data-based pruning.
+         log_directory (str | Path):
+             Path to the directory where missing data reports will be stored.
+         targets (list[str]):
+             A list of column names to be treated as target variables.
+             This list guides the row-dropping logic.
+         skip_targets (bool):
+             If True, the columns listed in `targets` will be exempt from being dropped,
+             even if they exceed the missing data threshold.
+         threshold (float):
+             The proportion of missing data required to drop a row or column.
+             For example, 0.8 means a row/column will be dropped if 80% or more
+             of its data is missing.
+         all_lowercase (bool):
+             Whether to normalize all text to lowercase.
+     """
+     # handle log path
+     log_path = make_fullpath(log_directory, make=True, enforce="directory")
+
+     # Handle df paths
+     input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+     # load polars df
+     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+     # CLEAN
+     df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
+
+     # switch to pandas
+     df_cleaned_pandas = df_cleaned.to_pandas()
+
+     # Drop macro
+     df_final = drop_macro(df=df_cleaned_pandas,
+                           log_directory=log_path,
+                           targets=targets,
+                           skip_targets=skip_targets,
+                           threshold=threshold)

      # Save cleaned dataframe
-     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)

      _LOGGER.info(f"Data successfully cleaned.")

@@ -378,7 +494,7 @@ class DataFrameCleaner:
          if isinstance(output_filepath, str):
              output_filepath = make_fullpath(input_path=output_filepath, enforce="file")

-         save_dataframe(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
+         save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)

          return None
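
Taken together, the `ETL_cleaning` changes replace the overwrite-in-place `basic_clean` with an explicit output path and add `basic_clean_drop` for missing-data pruning. A minimal usage sketch based only on the signatures visible in this diff; the file paths and the `"target"` column name are illustrative, not from the package:

```python
from ml_tools.ETL_cleaning import save_unique_values, basic_clean, basic_clean_drop

# Inspect raw categories first. With keep_column_order=True (the default),
# output files get a numeric prefix: 1_<col>_unique_values.txt, 2_..., etc.
save_unique_values(csv_path="data/raw.csv", output_dir="reports/unique_values")

# Normalize punctuation, whitespace, and textual nulls. Note that in 14.x
# output_filepath is required; the 10.x default of overwriting the input is gone.
basic_clean(
    input_filepath="data/raw.csv",
    output_filepath="data/clean.csv",
    all_lowercase=True,
)

# Clean, then iteratively drop rows/columns that are >= 80% missing,
# writing before/after missing-data reports to the log directory.
basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean_pruned.csv",
    log_directory="reports/cleaning_logs",
    targets=["target"],   # illustrative target column name
    skip_targets=False,
    threshold=0.8,
    all_lowercase=True,
)
```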