PyPI - dragon-ml-toolbox - Versions diffs - 10.12.1__tar.gz → 10.14.0__tar.gz - Mend

dragon-ml-toolbox 10.12.1.tar.gz → 10.14.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (41)

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/LICENSE RENAMED Viewed

@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2025 Karl Loza
+Copyright (c) 2025 Karl Luigi Loza Vidaurre
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/LICENSE-THIRD-PARTY.md RENAMED Viewed

@@ -26,3 +26,4 @@ This project depends on the following third-party packages. Each is governed by
 - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
 - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
 - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
+- [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)

{dragon_ml_toolbox-10.12.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.14.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.12.1
+Version: 10.14.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -12,12 +12,6 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE-THIRD-PARTY.md
-Provides-Extra: base
-Requires-Dist: pandas; extra == "base"
-Requires-Dist: numpy; extra == "base"
-Requires-Dist: polars; extra == "base"
-Requires-Dist: joblib; extra == "base"
-Requires-Dist: colorlog; extra == "base"
 Provides-Extra: ml
 Requires-Dist: numpy>=2.0; extra == "ml"
 Requires-Dist: pandas; extra == "ml"
@@ -38,6 +32,7 @@ Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
+Requires-Dist: pyarrow; extra == "ml"
 Requires-Dist: colorlog; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
@@ -51,6 +46,7 @@ Requires-Dist: statsmodels; extra == "mice"
 Requires-Dist: lightgbm<=4.5.0; extra == "mice"
 Requires-Dist: shap; extra == "mice"
 Requires-Dist: colorlog; extra == "mice"
+Requires-Dist: pyarrow; extra == "mice"
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"
 Requires-Dist: torchvision; extra == "pytorch"
@@ -255,27 +251,6 @@ path_manager
 ---
-### 🎫 Base Tools [base]
-General purpose functions and classes.
-```Bash
-pip install "dragon-ml-toolbox[base]"
-```
-#### Modules:
-```Bash
-ETL_cleaning
-ETL_engineering
-custom_logger
-SQL
-utilities
-path_manager
-```
----
 ### ⚒️ APP bundlers
 Choose one if needed.

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/README.md RENAMED Viewed

@@ -170,27 +170,6 @@ path_manager
 ---
-### 🎫 Base Tools [base]
-General purpose functions and classes.
-```Bash
-pip install "dragon-ml-toolbox[base]"
-```
-#### Modules:
-```Bash
-ETL_cleaning
-ETL_engineering
-custom_logger
-SQL
-utilities
-path_manager
-```
----
 ### ⚒️ APP bundlers
 Choose one if needed.

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0/dragon_ml_toolbox.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.12.1
+Version: 10.14.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -12,12 +12,6 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE-THIRD-PARTY.md
-Provides-Extra: base
-Requires-Dist: pandas; extra == "base"
-Requires-Dist: numpy; extra == "base"
-Requires-Dist: polars; extra == "base"
-Requires-Dist: joblib; extra == "base"
-Requires-Dist: colorlog; extra == "base"
 Provides-Extra: ml
 Requires-Dist: numpy>=2.0; extra == "ml"
 Requires-Dist: pandas; extra == "ml"
@@ -38,6 +32,7 @@ Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
+Requires-Dist: pyarrow; extra == "ml"
 Requires-Dist: colorlog; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
@@ -51,6 +46,7 @@ Requires-Dist: statsmodels; extra == "mice"
 Requires-Dist: lightgbm<=4.5.0; extra == "mice"
 Requires-Dist: shap; extra == "mice"
 Requires-Dist: colorlog; extra == "mice"
+Requires-Dist: pyarrow; extra == "mice"
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"
 Requires-Dist: torchvision; extra == "pytorch"
@@ -255,27 +251,6 @@ path_manager
 ---
-### 🎫 Base Tools [base]
-General purpose functions and classes.
-```Bash
-pip install "dragon-ml-toolbox[base]"
-```
-#### Modules:
-```Bash
-ETL_cleaning
-ETL_engineering
-custom_logger
-SQL
-utilities
-path_manager
-```
----
 ### ⚒️ APP bundlers
 Choose one if needed.

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/requires.txt RENAMED Viewed

@@ -19,13 +19,7 @@ shap
 tqdm
 Pillow
 evotorch
-colorlog
-[base]
-pandas
-numpy
-polars
-joblib
+pyarrow
 colorlog
 [excel]
@@ -61,6 +55,7 @@ statsmodels
 lightgbm<=4.5.0
 shap
 colorlog
+pyarrow
 [nuitka]
 nuitka

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ETL_cleaning.py RENAMED Viewed

@@ -96,7 +96,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
 ########## Basic df cleaners #############
-def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
+def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
     # Cleaning rules
     cleaning_rules = {
         # 1. Comprehensive Punctuation & Symbol Normalization
@@ -128,7 +128,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         # Punctuation
         '》': '>', '《': '<', '：': ':', '。': '.', '；': ';', '【': '[', '】': ']',
         '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
-        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=',
+        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=', '·': '-',
         # Commas (avoid commas in entries)
         '，': ';',
@@ -159,6 +159,9 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         r'!{2,}': '!',      # Replace two or more exclamation marks with a single one
         r';{2,}': ';',
         r'-{2,}': '-',
+        r'/{2,}': '/',
+        r'%{2,}': '%',
+        r'&{2,}': '&',
         # 2. Internal Whitespace Consolidation
         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -170,7 +173,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         # 4. Textual Null Standardization (New Step)
         # Convert common null-like text to actual nulls.
-        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;)$': None,
+        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
         # 5. Final Nullification of Empty Strings
         # After all cleaning, if a string is now empty, convert it to a null
@@ -191,9 +194,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
         # apply lowercase to all string columns
-        df_final = df_cleaned.with_columns(
-            pl.col(pl.String).str.to_lowercase()
-        )
+        if all_lowercase:
+            df_final = df_cleaned.with_columns(
+                pl.col(pl.String).str.to_lowercase()
+            )
+        else:
+            df_final = df_cleaned
     except Exception as e:
         _LOGGER.error(f"An error occurred during the cleaning process.")
         raise e
@@ -211,7 +218,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
     return input_path, output_path
-def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
     """
     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
@@ -221,13 +228,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     - Stripping any leading or trailing whitespace.
     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
     - Converting strings that become empty after cleaning into true null values.
-    - Normalizing all text to lowercase.
+    - Normalizing all text to lowercase (Optional).
     Args:
-        input_filepath (Union[str, Path]):
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (Union[str, Path, None], optional):
+        output_filepath (str | Path):
             The path to save the cleaned CSV file.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
     """
     # Handle paths
     input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
@@ -236,7 +246,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
     # CLEAN
-    df_final = _cleaner_core(df)
+    df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
     # Save cleaned dataframe
     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
@@ -245,7 +255,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
 def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
-                     skip_targets: bool=False, threshold: float=0.8):
+                     skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
     """
     Performs standardized cleaning followed by iterative removal of rows and
     columns with excessive missing data.
@@ -262,12 +272,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     dropping process are saved to the specified log directory.
     Args:
-        input_filepath (str, Path):
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (str, Path):
+        output_filepath (str | Path):
             The path to save the fully cleaned CSV file after cleaning
             and missing-data-based pruning.
-        log_directory (str, Path):
+        log_directory (str | Path):
             Path to the directory where missing data reports will be stored.
         targets (list[str]):
             A list of column names to be treated as target variables.
@@ -279,6 +289,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
             The proportion of missing data required to drop a row or column.
             For example, 0.8 means a row/column will be dropped if 80% or more
             of its data is missing.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
     """
     # handle log path
     log_path = make_fullpath(log_directory, make=True, enforce="directory")
@@ -290,7 +302,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
     # CLEAN
-    df_cleaned = _cleaner_core(df)
+    df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
     # switch to pandas
     df_cleaned_pandas = df_cleaned.to_pandas()

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ensemble_inference.py RENAMED Viewed

@@ -219,7 +219,7 @@ def model_report(
     return report_data
-# Local implementation to avoid calling utilities' dependencies
+# Local implementation to avoid calling utilities dependencies
 def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.

{dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "10.12.1"
+version = "10.14.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -19,15 +19,6 @@ Homepage = "https://github.com/DrAg0n-BoRn/ML_tools"
 Changelog = "https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md"
 [project.optional-dependencies]
-# Base all purpose tools
-base = [
-    "pandas",
-    "numpy",
-    "polars",
-    "joblib",
-    "colorlog"
-]
 # Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support
 ML = [
     "numpy>=2.0",
@@ -48,7 +39,8 @@ ML = [
     "shap",
     "tqdm",
     "Pillow",
-    "evotorch",
+    "evotorch",
+    "pyarrow",
     "colorlog"
 ]
@@ -64,7 +56,8 @@ mice = [
     "statsmodels",
     "lightgbm<=4.5.0",
     "shap",
-    "colorlog"
+    "colorlog",
+    "pyarrow"
 ]
 # pytorch base CPU installations - not recommended