dragon-ml-toolbox 10.0.1__py3-none-any.whl → 10.1.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.0.1
3
+ Version: 10.1.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
- dragon_ml_toolbox-10.0.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
- dragon_ml_toolbox-10.0.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
3
- ml_tools/ETL_cleaning.py,sha256=g_6BH0amK4aQwX8aEM2z4JYyskjbSg5ktu8n0YbrM3w,14905
1
+ dragon_ml_toolbox-10.1.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
2
+ dragon_ml_toolbox-10.1.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
3
+ ml_tools/ETL_cleaning.py,sha256=i-hrafaAivg8wprcCmwHA5MkXFsUmHNR9RRGbIyw4ZE,15981
4
4
  ml_tools/ETL_engineering.py,sha256=sgpIhlFIeId4eSJ-a33MnVuPNXs50msxFWa8-kw2hOI,36369
5
5
  ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
6
6
  ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
@@ -30,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
30
30
  ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
31
31
  ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
32
32
  ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
33
- dragon_ml_toolbox-10.0.1.dist-info/METADATA,sha256=aWKOlsr3Ru2rUeadnl_uhKNbjFTPTtYDHv4zg7kcM9c,6968
34
- dragon_ml_toolbox-10.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
35
- dragon_ml_toolbox-10.0.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
36
- dragon_ml_toolbox-10.0.1.dist-info/RECORD,,
33
+ dragon_ml_toolbox-10.1.1.dist-info/METADATA,sha256=wJ2byoP5azuIBrLRpUUQ96DkDAQuxVtgf2lFPafBUUQ,6968
34
+ dragon_ml_toolbox-10.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
35
+ dragon_ml_toolbox-10.1.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
36
+ dragon_ml_toolbox-10.1.1.dist-info/RECORD,,
ml_tools/ETL_cleaning.py CHANGED
@@ -1,8 +1,7 @@
1
1
  import polars as pl
2
2
  import pandas as pd
3
- import re
4
3
  from pathlib import Path
5
- from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
4
+ from typing import Union, List, Dict
6
5
  from .path_manager import sanitize_filename, make_fullpath
7
6
  from .utilities import save_dataframe, load_dataframe
8
7
  from ._script_info import _script_info
@@ -18,7 +17,7 @@ __all__ = [
18
17
 
19
18
 
20
19
  ################ Unique Values per column #################
21
- def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
20
+ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
22
21
  """
23
22
  Loads a CSV file, then analyzes it and saves the unique non-null values
24
23
  from each column into a separate text file exactly as they appear.
@@ -51,6 +50,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
51
50
  _LOGGER.info(f"Data loaded from '{csv_path}'")
52
51
 
53
52
  # --- 3. Process Each Column ---
53
+ counter = 0
54
54
  for i, column_name in enumerate(df.columns):
55
55
  # _LOGGER.info(f"Processing column: '{column_name}'...")
56
56
 
@@ -86,9 +86,11 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
86
86
  except IOError:
87
87
  _LOGGER.exception(f"Error writing to file {file_path}.")
88
88
  else:
89
- _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
90
-
91
- _LOGGER.info("Process complete.")
89
+ if verbose:
90
+ _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
91
+ counter += 1
92
+
93
+ _LOGGER.info(f"{counter} files of unique values created.")
92
94
 
93
95
 
94
96
  ########## Basic df cleaner #############
@@ -131,9 +133,30 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
131
133
  r'\p{C}+': '',
132
134
 
133
135
  # Full-width to half-width
136
+ # Numbers
137
+ '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
138
+ '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
139
+ # Superscripts & Subscripts
140
+ '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
141
+ '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
142
+ '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
143
+ '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
144
+ # Uppercase Alphabet
145
+ 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
146
+ 'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
147
+ 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R',
148
+ 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X',
149
+ 'Ｙ': 'Y', 'Ｚ': 'Z',
150
+ # Lowercase Alphabet
151
+ 'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
152
+ 'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
153
+ 'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
154
+ 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
155
+ 'ｙ': 'y', 'ｚ': 'z',
156
+ # Punctuation
134
157
  '》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
135
- '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#',
136
- '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|',
158
+ '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
159
+ '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|', '、':',', '≈':'=',
137
160
 
138
161
  # Others
139
162
  '©': '',
@@ -148,9 +171,9 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
148
171
  # Typographical standardization
149
172
  # Unify various dashes and hyphens to a standard hyphen-minus
150
173
  r'[—–―]': '-',
151
- # Unify various quote types to standard single quotes
174
+ # Unify various quote types to standard quotes
152
175
  r'[“”]': "'",
153
- r'[‘’]': "'",
176
+ r'[‘’′]': "'",
154
177
 
155
178
  # 2. Internal Whitespace Consolidation
156
179
  # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -162,7 +185,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
162
185
 
163
186
  # 4. Textual Null Standardization (New Step)
164
187
  # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
165
- r'^(N/A|NA|NULL|NONE|NIL|)$': None,
188
+ r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
166
189
 
167
190
  # 5. Final Nullification of Empty Strings
168
191
  # After all cleaning, if a string is now empty, convert it to a null
@@ -238,14 +261,6 @@ class ColumnCleaner:
238
261
  _LOGGER.error("The 'rules' argument must be a dictionary.")
239
262
  raise TypeError()
240
263
 
241
- # Validate each regex pattern for correctness
242
- for pattern in rules.keys():
243
- try:
244
- re.compile(pattern)
245
- except re.error:
246
- _LOGGER.error(f"Invalid regex pattern '{pattern}'.")
247
- raise
248
-
249
264
  self.column_name = column_name
250
265
  self.rules = rules
251
266
  self.case_insensitive = case_insensitive