PyPI - dragon-ml-toolbox - Versions diffs - 10.0.0__tar.gz → 10.1.0__tar.gz - Mend

dragon-ml-toolbox 10.0.0.tar.gz → 10.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (41)

{dragon_ml_toolbox-10.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.0.0
+Version: 10.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -148,6 +148,7 @@ data_exploration
 ensemble_evaluation
 ensemble_inference
 ensemble_learning
+ETL_cleaning
 ETL_engineering
 ML_callbacks
 ML_datasetmaster
@@ -265,7 +266,8 @@ pip install "dragon-ml-toolbox[base]"
 #### Modules:
 ```Bash
-ETL_Engineering
+ETL_cleaning
+ETL_engineering
 custom_logger
 SQL
 utilities

{dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/README.md RENAMED Viewed

@@ -63,6 +63,7 @@ data_exploration
 ensemble_evaluation
 ensemble_inference
 ensemble_learning
+ETL_cleaning
 ETL_engineering
 ML_callbacks
 ML_datasetmaster
@@ -180,7 +181,8 @@ pip install "dragon-ml-toolbox[base]"
 #### Modules:
 ```Bash
-ETL_Engineering
+ETL_cleaning
+ETL_engineering
 custom_logger
 SQL
 utilities

{dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.0.0
+Version: 10.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -148,6 +148,7 @@ data_exploration
 ensemble_evaluation
 ensemble_inference
 ensemble_learning
+ETL_cleaning
 ETL_engineering
 ML_callbacks
 ML_datasetmaster
@@ -265,7 +266,8 @@ pip install "dragon-ml-toolbox[base]"
 #### Modules:
 ```Bash
-ETL_Engineering
+ETL_cleaning
+ETL_engineering
 custom_logger
 SQL
 utilities

{dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ETL_cleaning.py RENAMED Viewed

@@ -1,8 +1,7 @@
 import polars as pl
 import pandas as pd
-import re
 from pathlib import Path
-from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
+from typing import Union, List, Dict
 from .path_manager import sanitize_filename, make_fullpath
 from .utilities import save_dataframe, load_dataframe
 from ._script_info import _script_info
@@ -131,16 +130,37 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         r'\p{C}+': '',
         # Full-width to half-width
+        # Numbers
+        '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
+        '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
+        # Superscripts & Subscripts
+        '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
+        '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
+        '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
+        '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
+        # Uppercase Alphabet
+        'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
+        'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
+        'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R',
+        'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X',
+        'Ｙ': 'Y', 'Ｚ': 'Z',
+        # Lowercase Alphabet
+        'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
+        'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
+        'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
+        'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
+        'ｙ': 'y', 'ｚ': 'z',
+        # Punctuation
         '》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
-        '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#',
-        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|',
+        '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
+        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|', '、':',', '≈':'=',
         # Others
         '©': '',
         '®': '',
         '™': '',
-        # Collapse repeating punctuation (explicit method)
+        # Collapse repeating punctuation
         r'\.{2,}': '.',      # Replace two or more dots with a single dot
         r'\?{2,}': '?',      # Replace two or more question marks with a single question mark
         r'!{2,}': '!',      # Replace two or more exclamation marks with a single one
@@ -148,9 +168,9 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         # Typographical standardization
         # Unify various dashes and hyphens to a standard hyphen-minus
         r'[—–―]': '-',
-        # Unify various quote types to standard single quotes
+        # Unify various quote types to standard quotes
         r'[“”]': "'",
-        r'[‘’]': "'",
+        r'[‘’′]': "'",
         # 2. Internal Whitespace Consolidation
         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -162,7 +182,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         # 4. Textual Null Standardization (New Step)
         # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
-        r'^(N/A|NA|NULL|NONE|NIL|)$': None,
+        r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
         # 5. Final Nullification of Empty Strings
         # After all cleaning, if a string is now empty, convert it to a null
@@ -192,7 +212,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     # Save cleaned dataframe
     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
-    _LOGGER.info(f"Successfully cleaned and saved data to '{output_path.name}'.")
+    _LOGGER.info(f"Data successfully cleaned.")
 ########## EXTRACT and CLEAN ##########
@@ -238,14 +258,6 @@ class ColumnCleaner:
             _LOGGER.error("The 'rules' argument must be a dictionary.")
             raise TypeError()
-        # Validate each regex pattern for correctness
-        for pattern in rules.keys():
-            try:
-                re.compile(pattern)
-            except re.error:
-                _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
-                raise
         self.column_name = column_name
         self.rules = rules
         self.case_insensitive = case_insensitive

{dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "10.0.0"
+version = "10.1.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }