dragon-ml-toolbox 10.0.0__tar.gz → 10.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-10.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.1.0}/PKG-INFO +4 -2
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/README.md +3 -1
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +4 -2
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ETL_cleaning.py +29 -17
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/LICENSE +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/custom_logger.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 10.0.0
|
|
3
|
+
Version: 10.1.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -148,6 +148,7 @@ data_exploration
|
|
|
148
148
|
ensemble_evaluation
|
|
149
149
|
ensemble_inference
|
|
150
150
|
ensemble_learning
|
|
151
|
+
ETL_cleaning
|
|
151
152
|
ETL_engineering
|
|
152
153
|
ML_callbacks
|
|
153
154
|
ML_datasetmaster
|
|
@@ -265,7 +266,8 @@ pip install "dragon-ml-toolbox[base]"
|
|
|
265
266
|
#### Modules:
|
|
266
267
|
|
|
267
268
|
```Bash
|
|
268
|
-
|
|
269
|
+
ETL_cleaning
|
|
270
|
+
ETL_engineering
|
|
269
271
|
custom_logger
|
|
270
272
|
SQL
|
|
271
273
|
utilities
|
|
@@ -63,6 +63,7 @@ data_exploration
|
|
|
63
63
|
ensemble_evaluation
|
|
64
64
|
ensemble_inference
|
|
65
65
|
ensemble_learning
|
|
66
|
+
ETL_cleaning
|
|
66
67
|
ETL_engineering
|
|
67
68
|
ML_callbacks
|
|
68
69
|
ML_datasetmaster
|
|
@@ -180,7 +181,8 @@ pip install "dragon-ml-toolbox[base]"
|
|
|
180
181
|
#### Modules:
|
|
181
182
|
|
|
182
183
|
```Bash
|
|
183
|
-
|
|
184
|
+
ETL_cleaning
|
|
185
|
+
ETL_engineering
|
|
184
186
|
custom_logger
|
|
185
187
|
SQL
|
|
186
188
|
utilities
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 10.0.0
|
|
3
|
+
Version: 10.1.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -148,6 +148,7 @@ data_exploration
|
|
|
148
148
|
ensemble_evaluation
|
|
149
149
|
ensemble_inference
|
|
150
150
|
ensemble_learning
|
|
151
|
+
ETL_cleaning
|
|
151
152
|
ETL_engineering
|
|
152
153
|
ML_callbacks
|
|
153
154
|
ML_datasetmaster
|
|
@@ -265,7 +266,8 @@ pip install "dragon-ml-toolbox[base]"
|
|
|
265
266
|
#### Modules:
|
|
266
267
|
|
|
267
268
|
```Bash
|
|
268
|
-
|
|
269
|
+
ETL_cleaning
|
|
270
|
+
ETL_engineering
|
|
269
271
|
custom_logger
|
|
270
272
|
SQL
|
|
271
273
|
utilities
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import polars as pl
|
|
2
2
|
import pandas as pd
|
|
3
|
-
import re
|
|
4
3
|
from pathlib import Path
|
|
5
|
-
from typing import
|
|
4
|
+
from typing import Union, List, Dict
|
|
6
5
|
from .path_manager import sanitize_filename, make_fullpath
|
|
7
6
|
from .utilities import save_dataframe, load_dataframe
|
|
8
7
|
from ._script_info import _script_info
|
|
@@ -131,16 +130,37 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
131
130
|
r'\p{C}+': '',
|
|
132
131
|
|
|
133
132
|
# Full-width to half-width
|
|
133
|
+
# Numbers
|
|
134
|
+
'0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
|
|
135
|
+
'5': '5', '6': '6', '7': '7', '8': '8', '9': '9',
|
|
136
|
+
# Superscripts & Subscripts
|
|
137
|
+
'¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5',
|
|
138
|
+
'⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
|
|
139
|
+
'₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
|
|
140
|
+
'₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
|
|
141
|
+
# Uppercase Alphabet
|
|
142
|
+
'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F',
|
|
143
|
+
'G': 'G', 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L',
|
|
144
|
+
'M': 'M', 'N': 'N', 'O': 'O', 'P': 'P', 'Q': 'Q', 'R': 'R',
|
|
145
|
+
'S': 'S', 'T': 'T', 'U': 'U', 'V': 'V', 'W': 'W', 'X': 'X',
|
|
146
|
+
'Y': 'Y', 'Z': 'Z',
|
|
147
|
+
# Lowercase Alphabet
|
|
148
|
+
'a': 'a', 'b': 'b', 'c': 'c', 'd': 'd', 'e': 'e', 'f': 'f',
|
|
149
|
+
'g': 'g', 'h': 'h', 'i': 'i', 'j': 'j', 'k': 'k', 'l': 'l',
|
|
150
|
+
'm': 'm', 'n': 'n', 'o': 'o', 'p': 'p', 'q': 'q', 'r': 'r',
|
|
151
|
+
's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
|
|
152
|
+
'y': 'y', 'z': 'z',
|
|
153
|
+
# Punctuation
|
|
134
154
|
'》': '>', '《': '<', ':': ':', ',': ',', '。': '.', ';': ';', '【': '[', '】': ']',
|
|
135
|
-
'(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#',
|
|
136
|
-
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '\\', '|': '|',
|
|
155
|
+
'(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
|
|
156
|
+
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '\\', '|': '|', '、':',', '≈':'=',
|
|
137
157
|
|
|
138
158
|
# Others
|
|
139
159
|
'©': '',
|
|
140
160
|
'®': '',
|
|
141
161
|
'™': '',
|
|
142
162
|
|
|
143
|
-
# Collapse repeating punctuation
|
|
163
|
+
# Collapse repeating punctuation
|
|
144
164
|
r'\.{2,}': '.', # Replace two or more dots with a single dot
|
|
145
165
|
r'\?{2,}': '?', # Replace two or more question marks with a single question mark
|
|
146
166
|
r'!{2,}': '!', # Replace two or more exclamation marks with a single one
|
|
@@ -148,9 +168,9 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
148
168
|
# Typographical standardization
|
|
149
169
|
# Unify various dashes and hyphens to a standard hyphen-minus
|
|
150
170
|
r'[—–―]': '-',
|
|
151
|
-
# Unify various quote types to standard
|
|
171
|
+
# Unify various quote types to standard quotes
|
|
152
172
|
r'[“”]': "'",
|
|
153
|
-
r'[
|
|
173
|
+
r'[‘’′]': "'",
|
|
154
174
|
|
|
155
175
|
# 2. Internal Whitespace Consolidation
|
|
156
176
|
# Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
|
|
@@ -162,7 +182,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
162
182
|
|
|
163
183
|
# 4. Textual Null Standardization (New Step)
|
|
164
184
|
# Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
|
|
165
|
-
r'^(N/A
|
|
185
|
+
r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
|
|
166
186
|
|
|
167
187
|
# 5. Final Nullification of Empty Strings
|
|
168
188
|
# After all cleaning, if a string is now empty, convert it to a null
|
|
@@ -192,7 +212,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
192
212
|
# Save cleaned dataframe
|
|
193
213
|
save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
194
214
|
|
|
195
|
-
_LOGGER.info(f"
|
|
215
|
+
_LOGGER.info(f"Data successfully cleaned.")
|
|
196
216
|
|
|
197
217
|
|
|
198
218
|
########## EXTRACT and CLEAN ##########
|
|
@@ -238,14 +258,6 @@ class ColumnCleaner:
|
|
|
238
258
|
_LOGGER.error("The 'rules' argument must be a dictionary.")
|
|
239
259
|
raise TypeError()
|
|
240
260
|
|
|
241
|
-
# Validate each regex pattern for correctness
|
|
242
|
-
for pattern in rules.keys():
|
|
243
|
-
try:
|
|
244
|
-
re.compile(pattern)
|
|
245
|
-
except re.error:
|
|
246
|
-
_LOGGER.error(f"Invalid regex pattern '{pattern}'.")
|
|
247
|
-
raise
|
|
248
|
-
|
|
249
261
|
self.column_name = column_name
|
|
250
262
|
self.rules = rules
|
|
251
263
|
self.case_insensitive = case_insensitive
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-10.0.0 → dragon_ml_toolbox-10.1.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|