dragon-ml-toolbox 10.0.1__tar.gz → 10.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-10.0.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.1.1}/PKG-INFO +1 -1
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ETL_cleaning.py +34 -19
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/pyproject.toml +1 -1
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/LICENSE +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/README.md +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/custom_logger.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/setup.cfg +0 -0
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import polars as pl
|
|
2
2
|
import pandas as pd
|
|
3
|
-
import re
|
|
4
3
|
from pathlib import Path
|
|
5
|
-
from typing import
|
|
4
|
+
from typing import Union, List, Dict
|
|
6
5
|
from .path_manager import sanitize_filename, make_fullpath
|
|
7
6
|
from .utilities import save_dataframe, load_dataframe
|
|
8
7
|
from ._script_info import _script_info
|
|
@@ -18,7 +17,7 @@ __all__ = [
|
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
################ Unique Values per column #################
|
|
21
|
-
def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
|
|
20
|
+
def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
|
|
22
21
|
"""
|
|
23
22
|
Loads a CSV file, then analyzes it and saves the unique non-null values
|
|
24
23
|
from each column into a separate text file exactly as they appear.
|
|
@@ -51,6 +50,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
|
|
|
51
50
|
_LOGGER.info(f"Data loaded from '{csv_path}'")
|
|
52
51
|
|
|
53
52
|
# --- 3. Process Each Column ---
|
|
53
|
+
counter = 0
|
|
54
54
|
for i, column_name in enumerate(df.columns):
|
|
55
55
|
# _LOGGER.info(f"Processing column: '{column_name}'...")
|
|
56
56
|
|
|
@@ -86,9 +86,11 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
|
|
|
86
86
|
except IOError:
|
|
87
87
|
_LOGGER.exception(f"Error writing to file {file_path}.")
|
|
88
88
|
else:
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
89
|
+
if verbose:
|
|
90
|
+
_LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
|
|
91
|
+
counter += 1
|
|
92
|
+
|
|
93
|
+
_LOGGER.info(f"{counter} files of unique values created.")
|
|
92
94
|
|
|
93
95
|
|
|
94
96
|
########## Basic df cleaner #############
|
|
@@ -131,9 +133,30 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
131
133
|
r'\p{C}+': '',
|
|
132
134
|
|
|
133
135
|
# Full-width to half-width
|
|
136
|
+
# Numbers
|
|
137
|
+
'０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
|
|
138
|
+
'５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
|
|
139
|
+
# Superscripts & Subscripts
|
|
140
|
+
'¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
|
|
141
|
+
'⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
|
|
142
|
+
'₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
|
|
143
|
+
'₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
|
|
144
|
+
# Uppercase Alphabet
|
|
145
|
+
'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
|
|
146
|
+
'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
|
|
147
|
+
'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O', 'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R',
|
|
148
|
+
'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X',
|
|
149
|
+
'Ｙ': 'Y', 'Ｚ': 'Z',
|
|
150
|
+
# Lowercase Alphabet
|
|
151
|
+
'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f',
|
|
152
|
+
'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l',
|
|
153
|
+
'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r',
|
|
154
|
+
'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
|
|
155
|
+
'ｙ': 'y', 'ｚ': 'z',
|
|
156
|
+
# Punctuation
|
|
134
157
|
'》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
|
|
135
|
-
'（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#',
|
|
136
|
-
'＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|',
|
|
158
|
+
'（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
|
|
159
|
+
'＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|', '、':',', '≈':'=',
|
|
137
160
|
|
|
138
161
|
# Others
|
|
139
162
|
'©': '',
|
|
@@ -148,9 +171,9 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
148
171
|
# Typographical standardization
|
|
149
172
|
# Unify various dashes and hyphens to a standard hyphen-minus
|
|
150
173
|
r'[—–―]': '-',
|
|
151
|
-
# Unify various quote types to standard
|
|
174
|
+
# Unify various quote types to standard quotes
|
|
152
175
|
r'[“”]': "'",
|
|
153
|
-
r'[
|
|
176
|
+
r'[‘’′]': "'",
|
|
154
177
|
|
|
155
178
|
# 2. Internal Whitespace Consolidation
|
|
156
179
|
# Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
|
|
@@ -162,7 +185,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
162
185
|
|
|
163
186
|
# 4. Textual Null Standardization (New Step)
|
|
164
187
|
# Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
|
|
165
|
-
r'^(N/A
|
|
188
|
+
r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
|
|
166
189
|
|
|
167
190
|
# 5. Final Nullification of Empty Strings
|
|
168
191
|
# After all cleaning, if a string is now empty, convert it to a null
|
|
@@ -238,14 +261,6 @@ class ColumnCleaner:
|
|
|
238
261
|
_LOGGER.error("The 'rules' argument must be a dictionary.")
|
|
239
262
|
raise TypeError()
|
|
240
263
|
|
|
241
|
-
# Validate each regex pattern for correctness
|
|
242
|
-
for pattern in rules.keys():
|
|
243
|
-
try:
|
|
244
|
-
re.compile(pattern)
|
|
245
|
-
except re.error:
|
|
246
|
-
_LOGGER.error(f"Invalid regex pattern '{pattern}'.")
|
|
247
|
-
raise
|
|
248
|
-
|
|
249
264
|
self.column_name = column_name
|
|
250
265
|
self.rules = rules
|
|
251
266
|
self.case_insensitive = case_insensitive
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-10.0.1 → dragon_ml_toolbox-10.1.1}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|