dragon-ml-toolbox 10.0.1__py3-none-any.whl → 10.1.0__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.0.1
3
+ Version: 10.1.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
- dragon_ml_toolbox-10.0.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-10.0.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
3
- ml_tools/ETL_cleaning.py,sha256=g_6BH0amK4aQwX8aEM2z4JYyskjbSg5ktu8n0YbrM3w,14905
1
+ dragon_ml_toolbox-10.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-10.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
3
+ ml_tools/ETL_cleaning.py,sha256=khr7g_9McEiniMKOyyVT2sWXQBwg99NmML1Lc4FRkx0,15878
4
4
  ml_tools/ETL_engineering.py,sha256=sgpIhlFIeId4eSJ-a33MnVuPNXs50msxFWa8-kw2hOI,36369
5
5
  ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
6
6
  ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
@@ -30,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
30
30
  ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
31
31
  ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
32
32
  ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
33
- dragon_ml_toolbox-10.0.1.dist-info/METADATA,sha256=aWKOlsr3Ru2rUeadnl_uhKNbjFTPTtYDHv4zg7kcM9c,6968
34
- dragon_ml_toolbox-10.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
35
- dragon_ml_toolbox-10.0.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
36
- dragon_ml_toolbox-10.0.1.dist-info/RECORD,,
33
+ dragon_ml_toolbox-10.1.0.dist-info/METADATA,sha256=N1SOO1xHhKMj3_ontR98YV9S2BhaRLRWFxywzDkuJbc,6968
34
+ dragon_ml_toolbox-10.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
35
+ dragon_ml_toolbox-10.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
36
+ dragon_ml_toolbox-10.1.0.dist-info/RECORD,,
ml_tools/ETL_cleaning.py CHANGED
@@ -1,8 +1,7 @@
1
1
  import polars as pl
2
2
  import pandas as pd
3
- import re
4
3
  from pathlib import Path
5
- from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
4
+ from typing import Union, List, Dict
6
5
  from .path_manager import sanitize_filename, make_fullpath
7
6
  from .utilities import save_dataframe, load_dataframe
8
7
  from ._script_info import _script_info
@@ -131,9 +130,30 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
131
130
  r'\p{C}+': '',
132
131
 
133
132
  # Full-width to half-width
133
+ # Numbers
134
+ '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
135
+ '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
136
+ # Superscripts & Subscripts
137
+ '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
138
+ '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
139
+ '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
140
+ '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
141
+ # Uppercase Alphabet
142
+ 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
143
+ 'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
144
+ 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R',
145
+ 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X',
146
+ 'Ｙ': 'Y', 'Ｚ': 'Z',
147
+ # Lowercase Alphabet
148
+ 'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
149
+ 'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
150
+ 'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
151
+ 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
152
+ 'ｙ': 'y', 'ｚ': 'z',
153
+ # Punctuation
134
154
  '》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
135
- '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#',
136
- '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|',
155
+ '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
156
+ '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|', '、':',', '≈':'=',
137
157
 
138
158
  # Others
139
159
  '©': '',
@@ -148,9 +168,9 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
148
168
  # Typographical standardization
149
169
  # Unify various dashes and hyphens to a standard hyphen-minus
150
170
  r'[—–―]': '-',
151
- # Unify various quote types to standard single quotes
171
+ # Unify various quote types to standard quotes
152
172
  r'[“”]': "'",
153
- r'[‘’]': "'",
173
+ r'[‘’′]': "'",
154
174
 
155
175
  # 2. Internal Whitespace Consolidation
156
176
  # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -162,7 +182,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
162
182
 
163
183
  # 4. Textual Null Standardization (New Step)
164
184
  # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
165
- r'^(N/A|NA|NULL|NONE|NIL|)$': None,
185
+ r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
166
186
 
167
187
  # 5. Final Nullification of Empty Strings
168
188
  # After all cleaning, if a string is now empty, convert it to a null
@@ -238,14 +258,6 @@ class ColumnCleaner:
238
258
  _LOGGER.error("The 'rules' argument must be a dictionary.")
239
259
  raise TypeError()
240
260
 
241
- # Validate each regex pattern for correctness
242
- for pattern in rules.keys():
243
- try:
244
- re.compile(pattern)
245
- except re.error:
246
- _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
247
- raise
248
-
249
261
  self.column_name = column_name
250
262
  self.rules = rules
251
263
  self.case_insensitive = case_insensitive