dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox has been flagged as potentially problematic; consult the package's registry page for details.
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +72 -34
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,23 +1,17 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version:
|
|
3
|
+
Version: 14.2.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
|
-
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
5
|
+
Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
|
|
8
8
|
Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python: >=3.
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
License-File: LICENSE-THIRD-PARTY.md
|
|
15
|
-
Provides-Extra: base
|
|
16
|
-
Requires-Dist: pandas; extra == "base"
|
|
17
|
-
Requires-Dist: numpy; extra == "base"
|
|
18
|
-
Requires-Dist: polars; extra == "base"
|
|
19
|
-
Requires-Dist: joblib; extra == "base"
|
|
20
|
-
Requires-Dist: colorlog; extra == "base"
|
|
21
15
|
Provides-Extra: ml
|
|
22
16
|
Requires-Dist: numpy>=2.0; extra == "ml"
|
|
23
17
|
Requires-Dist: pandas; extra == "ml"
|
|
@@ -38,7 +32,12 @@ Requires-Dist: shap; extra == "ml"
|
|
|
38
32
|
Requires-Dist: tqdm; extra == "ml"
|
|
39
33
|
Requires-Dist: Pillow; extra == "ml"
|
|
40
34
|
Requires-Dist: evotorch; extra == "ml"
|
|
35
|
+
Requires-Dist: pyarrow; extra == "ml"
|
|
41
36
|
Requires-Dist: colorlog; extra == "ml"
|
|
37
|
+
Requires-Dist: torchmetrics; extra == "ml"
|
|
38
|
+
Provides-Extra: py-tab
|
|
39
|
+
Requires-Dist: pytorch_tabular; extra == "py-tab"
|
|
40
|
+
Requires-Dist: omegaconf; extra == "py-tab"
|
|
42
41
|
Provides-Extra: mice
|
|
43
42
|
Requires-Dist: numpy<2.0; extra == "mice"
|
|
44
43
|
Requires-Dist: pandas; extra == "mice"
|
|
@@ -51,9 +50,7 @@ Requires-Dist: statsmodels; extra == "mice"
|
|
|
51
50
|
Requires-Dist: lightgbm<=4.5.0; extra == "mice"
|
|
52
51
|
Requires-Dist: shap; extra == "mice"
|
|
53
52
|
Requires-Dist: colorlog; extra == "mice"
|
|
54
|
-
|
|
55
|
-
Requires-Dist: torch; extra == "pytorch"
|
|
56
|
-
Requires-Dist: torchvision; extra == "pytorch"
|
|
53
|
+
Requires-Dist: pyarrow; extra == "mice"
|
|
57
54
|
Provides-Extra: excel
|
|
58
55
|
Requires-Dist: pandas; extra == "excel"
|
|
59
56
|
Requires-Dist: openpyxl; extra == "excel"
|
|
@@ -72,9 +69,6 @@ Requires-Dist: lightgbm; extra == "gui-boost"
|
|
|
72
69
|
Provides-Extra: gui-torch
|
|
73
70
|
Requires-Dist: numpy; extra == "gui-torch"
|
|
74
71
|
Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui-torch"
|
|
75
|
-
Provides-Extra: plot
|
|
76
|
-
Requires-Dist: matplotlib; extra == "plot"
|
|
77
|
-
Requires-Dist: seaborn; extra == "plot"
|
|
78
72
|
Provides-Extra: pyinstaller
|
|
79
73
|
Requires-Dist: pyinstaller; extra == "pyinstaller"
|
|
80
74
|
Provides-Extra: nuitka
|
|
@@ -94,7 +88,7 @@ A collection of Python utilities for data science and machine learning, structur
|
|
|
94
88
|
|
|
95
89
|
## Installation
|
|
96
90
|
|
|
97
|
-
**Python 3.
|
|
91
|
+
**Python 3.12**
|
|
98
92
|
|
|
99
93
|
### Via PyPI
|
|
100
94
|
|
|
@@ -104,22 +98,22 @@ Install the latest stable release from PyPI:
|
|
|
104
98
|
pip install dragon-ml-toolbox
|
|
105
99
|
```
|
|
106
100
|
|
|
107
|
-
### Via
|
|
101
|
+
### Via conda-forge
|
|
108
102
|
|
|
109
|
-
|
|
103
|
+
Install from the conda-forge channel:
|
|
110
104
|
|
|
111
105
|
```bash
|
|
112
|
-
|
|
113
|
-
cd ML_tools
|
|
114
|
-
pip install -e .
|
|
106
|
+
conda install -c conda-forge dragon-ml-toolbox
|
|
115
107
|
```
|
|
116
108
|
|
|
117
|
-
### Via
|
|
109
|
+
### Via GitHub (Editable)
|
|
118
110
|
|
|
119
|
-
|
|
111
|
+
Clone the repository and install in editable mode:
|
|
120
112
|
|
|
121
113
|
```bash
|
|
122
|
-
|
|
114
|
+
git clone https://github.com/DrAg0n-BoRn/ML_tools.git
|
|
115
|
+
cd ML_tools
|
|
116
|
+
pip install -e .
|
|
123
117
|
```
|
|
124
118
|
|
|
125
119
|
## Modular Installation
|
|
@@ -132,17 +126,12 @@ Installs a comprehensive set of tools for typical data science workflows, includ
|
|
|
132
126
|
pip install "dragon-ml-toolbox[ML]"
|
|
133
127
|
```
|
|
134
128
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
```Bash
|
|
138
|
-
pip install "dragon-ml-toolbox[pytorch]"
|
|
139
|
-
```
|
|
140
|
-
|
|
141
|
-
⚠️ To make use of GPU acceleration (highly recommended), follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
|
|
129
|
+
⚠️ PyTorch required, follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
|
|
142
130
|
|
|
143
131
|
#### Modules:
|
|
144
132
|
|
|
145
133
|
```bash
|
|
134
|
+
constants
|
|
146
135
|
custom_logger
|
|
147
136
|
data_exploration
|
|
148
137
|
ensemble_evaluation
|
|
@@ -150,19 +139,28 @@ ensemble_inference
|
|
|
150
139
|
ensemble_learning
|
|
151
140
|
ETL_cleaning
|
|
152
141
|
ETL_engineering
|
|
142
|
+
math_utilities
|
|
153
143
|
ML_callbacks
|
|
154
144
|
ML_datasetmaster
|
|
155
145
|
ML_evaluation_multi
|
|
156
146
|
ML_evaluation
|
|
157
147
|
ML_inference
|
|
158
148
|
ML_models
|
|
149
|
+
ML_models_advanced # Requires the extra flag [py-tab]
|
|
159
150
|
ML_optimization
|
|
160
151
|
ML_scaler
|
|
161
152
|
ML_trainer
|
|
153
|
+
ML_utilities
|
|
154
|
+
ML_vision_datasetmaster
|
|
155
|
+
ML_vision_evaluation
|
|
156
|
+
ML_vision_inference
|
|
157
|
+
ML_vision_models
|
|
158
|
+
ML_vision_transformers
|
|
162
159
|
optimization_tools
|
|
163
160
|
path_manager
|
|
164
161
|
PSO_optimization
|
|
165
162
|
RNN_forecast
|
|
163
|
+
serde
|
|
166
164
|
SQL
|
|
167
165
|
utilities
|
|
168
166
|
```
|
|
@@ -180,8 +178,11 @@ pip install "dragon-ml-toolbox[mice]"
|
|
|
180
178
|
#### Modules:
|
|
181
179
|
|
|
182
180
|
```Bash
|
|
181
|
+
constants
|
|
183
182
|
custom_logger
|
|
183
|
+
math_utilities
|
|
184
184
|
MICE_imputation
|
|
185
|
+
serde
|
|
185
186
|
VIF_factor
|
|
186
187
|
path_manager
|
|
187
188
|
utilities
|
|
@@ -209,42 +210,37 @@ path_manager
|
|
|
209
210
|
|
|
210
211
|
### 🎰 GUI for Boosting Algorithms (XGBoost, LightGBM) [gui-boost]
|
|
211
212
|
|
|
212
|
-
|
|
213
|
+
GUI tools compatible with XGBoost and LightGBM models used for inference.
|
|
213
214
|
|
|
214
215
|
```Bash
|
|
215
216
|
pip install "dragon-ml-toolbox[gui-boost]"
|
|
216
217
|
```
|
|
217
218
|
|
|
218
|
-
```Bash
|
|
219
|
-
pip install "dragon-ml-toolbox[gui-boost,plot]"
|
|
220
|
-
```
|
|
221
|
-
|
|
222
219
|
#### Modules:
|
|
223
220
|
|
|
224
221
|
```Bash
|
|
222
|
+
constants
|
|
225
223
|
custom_logger
|
|
226
224
|
GUI_tools
|
|
227
225
|
ensemble_inference
|
|
228
226
|
path_manager
|
|
227
|
+
serde
|
|
229
228
|
```
|
|
230
229
|
|
|
231
230
|
---
|
|
232
231
|
|
|
233
232
|
### 🤖 GUI for PyTorch Models [gui-torch]
|
|
234
233
|
|
|
235
|
-
|
|
234
|
+
GUI tools compatible with PyTorch models used for inference.
|
|
236
235
|
|
|
237
236
|
```Bash
|
|
238
237
|
pip install "dragon-ml-toolbox[gui-torch]"
|
|
239
238
|
```
|
|
240
239
|
|
|
241
|
-
```Bash
|
|
242
|
-
pip install "dragon-ml-toolbox[gui-torch,plot]"
|
|
243
|
-
```
|
|
244
|
-
|
|
245
240
|
#### Modules:
|
|
246
241
|
|
|
247
242
|
```Bash
|
|
243
|
+
constants
|
|
248
244
|
custom_logger
|
|
249
245
|
GUI_tools
|
|
250
246
|
ML_models
|
|
@@ -255,27 +251,6 @@ path_manager
|
|
|
255
251
|
|
|
256
252
|
---
|
|
257
253
|
|
|
258
|
-
### 🎫 Base Tools [base]
|
|
259
|
-
|
|
260
|
-
General purpose functions and classes.
|
|
261
|
-
|
|
262
|
-
```Bash
|
|
263
|
-
pip install "dragon-ml-toolbox[base]"
|
|
264
|
-
```
|
|
265
|
-
|
|
266
|
-
#### Modules:
|
|
267
|
-
|
|
268
|
-
```Bash
|
|
269
|
-
ETL_cleaning
|
|
270
|
-
ETL_engineering
|
|
271
|
-
custom_logger
|
|
272
|
-
SQL
|
|
273
|
-
utilities
|
|
274
|
-
path_manager
|
|
275
|
-
```
|
|
276
|
-
|
|
277
|
-
---
|
|
278
|
-
|
|
279
254
|
### ⚒️ APP bundlers
|
|
280
255
|
|
|
281
256
|
Choose one if needed.
|
|
@@ -293,6 +268,6 @@ pip install "dragon-ml-toolbox[nuitka]"
|
|
|
293
268
|
After installation, import modules like this:
|
|
294
269
|
|
|
295
270
|
```python
|
|
296
|
-
from ml_tools.
|
|
271
|
+
from ml_tools.serde import serialize_object, deserialize_object
|
|
297
272
|
from ml_tools import custom_logger
|
|
298
273
|
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
|
|
3
|
+
ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
|
|
4
|
+
ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
|
|
5
|
+
ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
|
|
6
|
+
ml_tools/MICE_imputation.py,sha256=KLJXGQLKJ6AuWWttAG-LCCaxpS-ygM4dXPiguHDaL6Y,20815
|
|
7
|
+
ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
|
|
8
|
+
ml_tools/ML_datasetmaster.py,sha256=rsJgZEGBJmfeKF6cR8CQZzfEx4T7Y-p1wUnR15_nNw0,28400
|
|
9
|
+
ml_tools/ML_evaluation.py,sha256=4GU86rUWMIGbkXrvN6PyjfGwKtWvXKE7pMlWpWeBq14,18988
|
|
10
|
+
ml_tools/ML_evaluation_multi.py,sha256=rJKdgtq-9I7oaI7PRzq7aIZ84XdNV0xzlVePZW4nj0k,16095
|
|
11
|
+
ml_tools/ML_inference.py,sha256=YJ953bhNWsdlPRtJQh3h2ACfMIgp8dQ9KtL9Azar-5s,23489
|
|
12
|
+
ml_tools/ML_models.py,sha256=PqOcNlws7vCJMbiVCKqlPuktxvskZVUHG3VfU-Yshf8,31415
|
|
13
|
+
ml_tools/ML_models_advanced.py,sha256=vk3PZBSu3DVso2S1rKTxxdS43XG8Q5FnasIL3-rMajc,12410
|
|
14
|
+
ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
|
|
15
|
+
ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
|
|
16
|
+
ml_tools/ML_trainer.py,sha256=ZWI4MbUcLeBxyfoUTL96l5tjHHMp9I64h4SdXnjYmBE,49795
|
|
17
|
+
ml_tools/ML_utilities.py,sha256=z6LbpbZwhn8F__fWlKi-g-cAJQXSxwg1NHfC5FBoAyc,21139
|
|
18
|
+
ml_tools/ML_vision_datasetmaster.py,sha256=tOrdatuq_AP8-GDiTrtARvSJdpc8h7dT-OhDJtRQnsk,54433
|
|
19
|
+
ml_tools/ML_vision_evaluation.py,sha256=t12R7i1RkOCt9zu1_lxSBr8OH6A6Get0k8ftDLctn6I,10486
|
|
20
|
+
ml_tools/ML_vision_inference.py,sha256=He3KV3VJAm8PwO-fOq4b9VO8UXFr-GmpuCnoHXf4VZI,20588
|
|
21
|
+
ml_tools/ML_vision_models.py,sha256=G3S4jB9AE9wMpU9ZygOgOx9q1K6t6LAXBYcJ-U2XQ1M,25600
|
|
22
|
+
ml_tools/ML_vision_transformers.py,sha256=95e0aBkHY5VDGE8i5xy57COU7NvSNIgFknnhBubwE40,1832
|
|
23
|
+
ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
|
|
24
|
+
ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
|
|
25
|
+
ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
|
|
26
|
+
ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
|
|
27
|
+
ml_tools/_ML_vision_recipe.py,sha256=zrgxFUvTJqQVuwR7jWlbIC2FD29u6eNFPkTRoJ7yEZI,3178
|
|
28
|
+
ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
|
|
29
|
+
ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
|
|
30
|
+
ml_tools/_schema.py,sha256=yu6aWmn_2Z4_AxAtJGDDCIa96y6JcUp-vgnCS013Qmw,3908
|
|
31
|
+
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
32
|
+
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
33
|
+
ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
|
|
34
|
+
ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
|
|
35
|
+
ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
|
|
36
|
+
ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
|
|
37
|
+
ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
|
|
38
|
+
ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
|
|
39
|
+
ml_tools/keys.py,sha256=wZOBuEnnHc54vlOZiimnrxfk-sZh6f6suPppJW8rbPQ,3326
|
|
40
|
+
ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
|
|
41
|
+
ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
|
|
42
|
+
ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
|
|
43
|
+
ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
|
|
44
|
+
ml_tools/utilities.py,sha256=aWqvYzmxlD74PD5Yqu1VuTekDJeYLQrmPIU_VeVyRp0,22526
|
|
45
|
+
dragon_ml_toolbox-14.2.0.dist-info/METADATA,sha256=T0eIxD-eO3cbAIzJ1HskJbog6RUYgXwXQQ2OU8Z-GQM,6475
|
|
46
|
+
dragon_ml_toolbox-14.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
47
|
+
dragon_ml_toolbox-14.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
48
|
+
dragon_ml_toolbox-14.2.0.dist-info/RECORD,,
|
|
@@ -26,3 +26,14 @@ This project depends on the following third-party packages. Each is governed by
|
|
|
26
26
|
- [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
|
|
27
27
|
- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
|
|
28
28
|
- [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
|
|
29
|
+
- [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
|
|
30
|
+
- [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
|
|
31
|
+
- [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
|
|
32
|
+
- [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
|
|
33
|
+
- [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
|
|
34
|
+
- [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
|
|
35
|
+
- [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
|
|
36
|
+
- [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
|
|
37
|
+
- [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
|
|
38
|
+
- [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
|
|
39
|
+
- [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
|
ml_tools/ETL_cleaning.py
CHANGED
|
@@ -2,9 +2,10 @@ import polars as pl
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Union, List, Dict
|
|
5
|
+
|
|
5
6
|
from .path_manager import sanitize_filename, make_fullpath
|
|
6
7
|
from .data_exploration import drop_macro
|
|
7
|
-
from .utilities import
|
|
8
|
+
from .utilities import save_dataframe_filename, load_dataframe
|
|
8
9
|
from ._script_info import _script_info
|
|
9
10
|
from ._logger import _LOGGER
|
|
10
11
|
|
|
@@ -19,20 +20,26 @@ __all__ = [
|
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
################ Unique Values per column #################
|
|
22
|
-
def save_unique_values(csv_path: Union[str, Path],
|
|
23
|
+
def save_unique_values(csv_path: Union[str, Path],
|
|
24
|
+
output_dir: Union[str, Path],
|
|
25
|
+
verbose: bool=False,
|
|
26
|
+
keep_column_order: bool = True) -> None:
|
|
23
27
|
"""
|
|
24
28
|
Loads a CSV file, then analyzes it and saves the unique non-null values
|
|
25
29
|
from each column into a separate text file exactly as they appear.
|
|
26
30
|
|
|
27
31
|
This is useful for understanding the raw categories or range of values
|
|
28
|
-
within a dataset before cleaning.
|
|
32
|
+
within a dataset before and after cleaning.
|
|
29
33
|
|
|
30
34
|
Args:
|
|
31
|
-
csv_path (
|
|
35
|
+
csv_path (str | Path):
|
|
32
36
|
The file path to the input CSV file.
|
|
33
|
-
output_dir (
|
|
37
|
+
output_dir (str | Path):
|
|
34
38
|
The path to the directory where the .txt files will be saved.
|
|
35
39
|
The directory will be created if it does not exist.
|
|
40
|
+
keep_column_order (bool):
|
|
41
|
+
If True, prepends a numeric prefix (e.g., '1_', '2_') to each
|
|
42
|
+
output filename to maintain the original column order.
|
|
36
43
|
"""
|
|
37
44
|
# --- 1. Input Validation ---
|
|
38
45
|
csv_path = make_fullpath(input_path=csv_path, enforce="file")
|
|
@@ -74,7 +81,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
|
|
|
74
81
|
sanitized_name = sanitize_filename(column_name)
|
|
75
82
|
if not sanitized_name.strip('_'):
|
|
76
83
|
sanitized_name = f'column_{i}'
|
|
77
|
-
|
|
84
|
+
|
|
85
|
+
# --- create filename prefix ---
|
|
86
|
+
# If keep_column_order is True, create a prefix like "1_", "2_", etc.
|
|
87
|
+
prefix = f"{i + 1}_" if keep_column_order else ''
|
|
88
|
+
|
|
89
|
+
file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
|
|
78
90
|
|
|
79
91
|
# --- Write to file ---
|
|
80
92
|
try:
|
|
@@ -96,7 +108,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
|
|
|
96
108
|
|
|
97
109
|
|
|
98
110
|
########## Basic df cleaners #############
|
|
99
|
-
def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
111
|
+
def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
|
|
100
112
|
# Cleaning rules
|
|
101
113
|
cleaning_rules = {
|
|
102
114
|
# 1. Comprehensive Punctuation & Symbol Normalization
|
|
@@ -126,27 +138,44 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
|
126
138
|
's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
|
|
127
139
|
'y': 'y', 'z': 'z',
|
|
128
140
|
# Punctuation
|
|
129
|
-
'》': '>', '《': '<', ':': ':', '
|
|
141
|
+
'》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
|
|
130
142
|
'(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
|
|
131
|
-
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '
|
|
143
|
+
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '', '⋅': '',
|
|
144
|
+
'¯': '-',
|
|
145
|
+
|
|
146
|
+
# Commas (avoid commas in entries)
|
|
147
|
+
',': ';',
|
|
148
|
+
',': ';',
|
|
149
|
+
'、':';',
|
|
132
150
|
|
|
133
151
|
# Others
|
|
152
|
+
'σ': '',
|
|
153
|
+
'□': '',
|
|
134
154
|
'©': '',
|
|
135
155
|
'®': '',
|
|
136
156
|
'™': '',
|
|
157
|
+
r'[°˚]': '',
|
|
137
158
|
|
|
138
|
-
#
|
|
139
|
-
r'
|
|
140
|
-
r'\?{2,}': '?', # Replace two or more question marks with a single question mark
|
|
141
|
-
r'!{2,}': '!', # Replace two or more exclamation marks with a single one
|
|
159
|
+
# Replace special characters in entries
|
|
160
|
+
r'\\': '_',
|
|
142
161
|
|
|
143
162
|
# Typographical standardization
|
|
144
|
-
# Unify various dashes and hyphens to a standard hyphen
|
|
163
|
+
# Unify various dashes and hyphens to a standard hyphen
|
|
145
164
|
r'[—–―]': '-',
|
|
146
165
|
r'−': '-',
|
|
147
|
-
#
|
|
148
|
-
r'[“”]':
|
|
149
|
-
r
|
|
166
|
+
# remove various quote types
|
|
167
|
+
r'[“”"]': '',
|
|
168
|
+
r"[‘’′']": '',
|
|
169
|
+
|
|
170
|
+
# Collapse repeating punctuation
|
|
171
|
+
r'\.{2,}': '.', # Replace two or more dots with a single dot
|
|
172
|
+
r'\?{2,}': '?', # Replace two or more question marks with a single question mark
|
|
173
|
+
r'!{2,}': '!', # Replace two or more exclamation marks with a single one
|
|
174
|
+
r';{2,}': ';',
|
|
175
|
+
r'-{2,}': '-',
|
|
176
|
+
r'/{2,}': '/',
|
|
177
|
+
r'%{2,}': '%',
|
|
178
|
+
r'&{2,}': '&',
|
|
150
179
|
|
|
151
180
|
# 2. Internal Whitespace Consolidation
|
|
152
181
|
# Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
|
|
@@ -158,7 +187,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
|
158
187
|
|
|
159
188
|
# 4. Textual Null Standardization (New Step)
|
|
160
189
|
# Convert common null-like text to actual nulls.
|
|
161
|
-
r'^(N/A|无|NA|NULL|NONE|NIL
|
|
190
|
+
r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
|
|
162
191
|
|
|
163
192
|
# 5. Final Nullification of Empty Strings
|
|
164
193
|
# After all cleaning, if a string is now empty, convert it to a null
|
|
@@ -179,9 +208,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
|
179
208
|
df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
|
|
180
209
|
|
|
181
210
|
# apply lowercase to all string columns
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
211
|
+
if all_lowercase:
|
|
212
|
+
df_final = df_cleaned.with_columns(
|
|
213
|
+
pl.col(pl.String).str.to_lowercase()
|
|
214
|
+
)
|
|
215
|
+
else:
|
|
216
|
+
df_final = df_cleaned
|
|
217
|
+
|
|
185
218
|
except Exception as e:
|
|
186
219
|
_LOGGER.error(f"An error occurred during the cleaning process.")
|
|
187
220
|
raise e
|
|
@@ -199,7 +232,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
|
|
|
199
232
|
return input_path, output_path
|
|
200
233
|
|
|
201
234
|
|
|
202
|
-
def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
|
|
235
|
+
def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
|
|
203
236
|
"""
|
|
204
237
|
Performs a comprehensive, standardized cleaning on all columns of a CSV file.
|
|
205
238
|
|
|
@@ -209,13 +242,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
209
242
|
- Stripping any leading or trailing whitespace.
|
|
210
243
|
- Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
|
|
211
244
|
- Converting strings that become empty after cleaning into true null values.
|
|
212
|
-
- Normalizing all text to lowercase.
|
|
245
|
+
- Normalizing all text to lowercase (Optional).
|
|
213
246
|
|
|
214
247
|
Args:
|
|
215
|
-
input_filepath (
|
|
248
|
+
input_filepath (str | Path):
|
|
216
249
|
The path to the source CSV file to be cleaned.
|
|
217
|
-
output_filepath (
|
|
250
|
+
output_filepath (str | Path):
|
|
218
251
|
The path to save the cleaned CSV file.
|
|
252
|
+
all_lowercase (bool):
|
|
253
|
+
Whether to normalize all text to lowercase.
|
|
254
|
+
|
|
219
255
|
"""
|
|
220
256
|
# Handle paths
|
|
221
257
|
input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
|
|
@@ -224,16 +260,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
224
260
|
df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
|
|
225
261
|
|
|
226
262
|
# CLEAN
|
|
227
|
-
df_final = _cleaner_core(df)
|
|
263
|
+
df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
|
|
228
264
|
|
|
229
265
|
# Save cleaned dataframe
|
|
230
|
-
|
|
266
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
231
267
|
|
|
232
268
|
_LOGGER.info(f"Data successfully cleaned.")
|
|
233
269
|
|
|
234
270
|
|
|
235
271
|
def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
|
|
236
|
-
skip_targets: bool=False, threshold: float=0.8):
|
|
272
|
+
skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
|
|
237
273
|
"""
|
|
238
274
|
Performs standardized cleaning followed by iterative removal of rows and
|
|
239
275
|
columns with excessive missing data.
|
|
@@ -250,12 +286,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
250
286
|
dropping process are saved to the specified log directory.
|
|
251
287
|
|
|
252
288
|
Args:
|
|
253
|
-
input_filepath (str
|
|
289
|
+
input_filepath (str | Path):
|
|
254
290
|
The path to the source CSV file to be cleaned.
|
|
255
|
-
output_filepath (str
|
|
291
|
+
output_filepath (str | Path):
|
|
256
292
|
The path to save the fully cleaned CSV file after cleaning
|
|
257
293
|
and missing-data-based pruning.
|
|
258
|
-
log_directory (str
|
|
294
|
+
log_directory (str | Path):
|
|
259
295
|
Path to the directory where missing data reports will be stored.
|
|
260
296
|
targets (list[str]):
|
|
261
297
|
A list of column names to be treated as target variables.
|
|
@@ -267,6 +303,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
267
303
|
The proportion of missing data required to drop a row or column.
|
|
268
304
|
For example, 0.8 means a row/column will be dropped if 80% or more
|
|
269
305
|
of its data is missing.
|
|
306
|
+
all_lowercase (bool):
|
|
307
|
+
Whether to normalize all text to lowercase.
|
|
270
308
|
"""
|
|
271
309
|
# handle log path
|
|
272
310
|
log_path = make_fullpath(log_directory, make=True, enforce="directory")
|
|
@@ -278,7 +316,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
278
316
|
df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
|
|
279
317
|
|
|
280
318
|
# CLEAN
|
|
281
|
-
df_cleaned = _cleaner_core(df)
|
|
319
|
+
df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
|
|
282
320
|
|
|
283
321
|
# switch to pandas
|
|
284
322
|
df_cleaned_pandas = df_cleaned.to_pandas()
|
|
@@ -291,7 +329,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
291
329
|
threshold=threshold)
|
|
292
330
|
|
|
293
331
|
# Save cleaned dataframe
|
|
294
|
-
|
|
332
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
295
333
|
|
|
296
334
|
_LOGGER.info(f"Data successfully cleaned.")
|
|
297
335
|
|
|
@@ -456,7 +494,7 @@ class DataFrameCleaner:
|
|
|
456
494
|
if isinstance(output_filepath, str):
|
|
457
495
|
output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
|
|
458
496
|
|
|
459
|
-
|
|
497
|
+
save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
|
|
460
498
|
|
|
461
499
|
return None
|
|
462
500
|
|