dragon-ml-toolbox 10.1.1__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +175 -59
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.1.1.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,23 +1,17 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version:
|
|
3
|
+
Version: 14.2.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
|
-
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
5
|
+
Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
|
|
8
8
|
Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python: >=3.
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
License-File: LICENSE-THIRD-PARTY.md
|
|
15
|
-
Provides-Extra: base
|
|
16
|
-
Requires-Dist: pandas; extra == "base"
|
|
17
|
-
Requires-Dist: numpy; extra == "base"
|
|
18
|
-
Requires-Dist: polars; extra == "base"
|
|
19
|
-
Requires-Dist: joblib; extra == "base"
|
|
20
|
-
Requires-Dist: colorlog; extra == "base"
|
|
21
15
|
Provides-Extra: ml
|
|
22
16
|
Requires-Dist: numpy>=2.0; extra == "ml"
|
|
23
17
|
Requires-Dist: pandas; extra == "ml"
|
|
@@ -38,7 +32,12 @@ Requires-Dist: shap; extra == "ml"
|
|
|
38
32
|
Requires-Dist: tqdm; extra == "ml"
|
|
39
33
|
Requires-Dist: Pillow; extra == "ml"
|
|
40
34
|
Requires-Dist: evotorch; extra == "ml"
|
|
35
|
+
Requires-Dist: pyarrow; extra == "ml"
|
|
41
36
|
Requires-Dist: colorlog; extra == "ml"
|
|
37
|
+
Requires-Dist: torchmetrics; extra == "ml"
|
|
38
|
+
Provides-Extra: py-tab
|
|
39
|
+
Requires-Dist: pytorch_tabular; extra == "py-tab"
|
|
40
|
+
Requires-Dist: omegaconf; extra == "py-tab"
|
|
42
41
|
Provides-Extra: mice
|
|
43
42
|
Requires-Dist: numpy<2.0; extra == "mice"
|
|
44
43
|
Requires-Dist: pandas; extra == "mice"
|
|
@@ -51,9 +50,7 @@ Requires-Dist: statsmodels; extra == "mice"
|
|
|
51
50
|
Requires-Dist: lightgbm<=4.5.0; extra == "mice"
|
|
52
51
|
Requires-Dist: shap; extra == "mice"
|
|
53
52
|
Requires-Dist: colorlog; extra == "mice"
|
|
54
|
-
|
|
55
|
-
Requires-Dist: torch; extra == "pytorch"
|
|
56
|
-
Requires-Dist: torchvision; extra == "pytorch"
|
|
53
|
+
Requires-Dist: pyarrow; extra == "mice"
|
|
57
54
|
Provides-Extra: excel
|
|
58
55
|
Requires-Dist: pandas; extra == "excel"
|
|
59
56
|
Requires-Dist: openpyxl; extra == "excel"
|
|
@@ -72,9 +69,6 @@ Requires-Dist: lightgbm; extra == "gui-boost"
|
|
|
72
69
|
Provides-Extra: gui-torch
|
|
73
70
|
Requires-Dist: numpy; extra == "gui-torch"
|
|
74
71
|
Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui-torch"
|
|
75
|
-
Provides-Extra: plot
|
|
76
|
-
Requires-Dist: matplotlib; extra == "plot"
|
|
77
|
-
Requires-Dist: seaborn; extra == "plot"
|
|
78
72
|
Provides-Extra: pyinstaller
|
|
79
73
|
Requires-Dist: pyinstaller; extra == "pyinstaller"
|
|
80
74
|
Provides-Extra: nuitka
|
|
@@ -94,7 +88,7 @@ A collection of Python utilities for data science and machine learning, structur
|
|
|
94
88
|
|
|
95
89
|
## Installation
|
|
96
90
|
|
|
97
|
-
**Python 3.
|
|
91
|
+
**Python 3.12**
|
|
98
92
|
|
|
99
93
|
### Via PyPI
|
|
100
94
|
|
|
@@ -104,22 +98,22 @@ Install the latest stable release from PyPI:
|
|
|
104
98
|
pip install dragon-ml-toolbox
|
|
105
99
|
```
|
|
106
100
|
|
|
107
|
-
### Via
|
|
101
|
+
### Via conda-forge
|
|
108
102
|
|
|
109
|
-
|
|
103
|
+
Install from the conda-forge channel:
|
|
110
104
|
|
|
111
105
|
```bash
|
|
112
|
-
|
|
113
|
-
cd ML_tools
|
|
114
|
-
pip install -e .
|
|
106
|
+
conda install -c conda-forge dragon-ml-toolbox
|
|
115
107
|
```
|
|
116
108
|
|
|
117
|
-
### Via
|
|
109
|
+
### Via GitHub (Editable)
|
|
118
110
|
|
|
119
|
-
|
|
111
|
+
Clone the repository and install in editable mode:
|
|
120
112
|
|
|
121
113
|
```bash
|
|
122
|
-
|
|
114
|
+
git clone https://github.com/DrAg0n-BoRn/ML_tools.git
|
|
115
|
+
cd ML_tools
|
|
116
|
+
pip install -e .
|
|
123
117
|
```
|
|
124
118
|
|
|
125
119
|
## Modular Installation
|
|
@@ -132,17 +126,12 @@ Installs a comprehensive set of tools for typical data science workflows, includ
|
|
|
132
126
|
pip install "dragon-ml-toolbox[ML]"
|
|
133
127
|
```
|
|
134
128
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
```Bash
|
|
138
|
-
pip install "dragon-ml-toolbox[pytorch]"
|
|
139
|
-
```
|
|
140
|
-
|
|
141
|
-
⚠️ To make use of GPU acceleration (highly recommended), follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
|
|
129
|
+
⚠️ PyTorch required, follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
|
|
142
130
|
|
|
143
131
|
#### Modules:
|
|
144
132
|
|
|
145
133
|
```bash
|
|
134
|
+
constants
|
|
146
135
|
custom_logger
|
|
147
136
|
data_exploration
|
|
148
137
|
ensemble_evaluation
|
|
@@ -150,19 +139,28 @@ ensemble_inference
|
|
|
150
139
|
ensemble_learning
|
|
151
140
|
ETL_cleaning
|
|
152
141
|
ETL_engineering
|
|
142
|
+
math_utilities
|
|
153
143
|
ML_callbacks
|
|
154
144
|
ML_datasetmaster
|
|
155
145
|
ML_evaluation_multi
|
|
156
146
|
ML_evaluation
|
|
157
147
|
ML_inference
|
|
158
148
|
ML_models
|
|
149
|
+
ML_models_advanced # Requires the extra flag [py-tab]
|
|
159
150
|
ML_optimization
|
|
160
151
|
ML_scaler
|
|
161
152
|
ML_trainer
|
|
153
|
+
ML_utilities
|
|
154
|
+
ML_vision_datasetmaster
|
|
155
|
+
ML_vision_evaluation
|
|
156
|
+
ML_vision_inference
|
|
157
|
+
ML_vision_models
|
|
158
|
+
ML_vision_transformers
|
|
162
159
|
optimization_tools
|
|
163
160
|
path_manager
|
|
164
161
|
PSO_optimization
|
|
165
162
|
RNN_forecast
|
|
163
|
+
serde
|
|
166
164
|
SQL
|
|
167
165
|
utilities
|
|
168
166
|
```
|
|
@@ -180,8 +178,11 @@ pip install "dragon-ml-toolbox[mice]"
|
|
|
180
178
|
#### Modules:
|
|
181
179
|
|
|
182
180
|
```Bash
|
|
181
|
+
constants
|
|
183
182
|
custom_logger
|
|
183
|
+
math_utilities
|
|
184
184
|
MICE_imputation
|
|
185
|
+
serde
|
|
185
186
|
VIF_factor
|
|
186
187
|
path_manager
|
|
187
188
|
utilities
|
|
@@ -209,42 +210,37 @@ path_manager
|
|
|
209
210
|
|
|
210
211
|
### 🎰 GUI for Boosting Algorithms (XGBoost, LightGBM) [gui-boost]
|
|
211
212
|
|
|
212
|
-
|
|
213
|
+
GUI tools compatible with XGBoost and LightGBM models used for inference.
|
|
213
214
|
|
|
214
215
|
```Bash
|
|
215
216
|
pip install "dragon-ml-toolbox[gui-boost]"
|
|
216
217
|
```
|
|
217
218
|
|
|
218
|
-
```Bash
|
|
219
|
-
pip install "dragon-ml-toolbox[gui-boost,plot]"
|
|
220
|
-
```
|
|
221
|
-
|
|
222
219
|
#### Modules:
|
|
223
220
|
|
|
224
221
|
```Bash
|
|
222
|
+
constants
|
|
225
223
|
custom_logger
|
|
226
224
|
GUI_tools
|
|
227
225
|
ensemble_inference
|
|
228
226
|
path_manager
|
|
227
|
+
serde
|
|
229
228
|
```
|
|
230
229
|
|
|
231
230
|
---
|
|
232
231
|
|
|
233
232
|
### 🤖 GUI for PyTorch Models [gui-torch]
|
|
234
233
|
|
|
235
|
-
|
|
234
|
+
GUI tools compatible with PyTorch models used for inference.
|
|
236
235
|
|
|
237
236
|
```Bash
|
|
238
237
|
pip install "dragon-ml-toolbox[gui-torch]"
|
|
239
238
|
```
|
|
240
239
|
|
|
241
|
-
```Bash
|
|
242
|
-
pip install "dragon-ml-toolbox[gui-torch,plot]"
|
|
243
|
-
```
|
|
244
|
-
|
|
245
240
|
#### Modules:
|
|
246
241
|
|
|
247
242
|
```Bash
|
|
243
|
+
constants
|
|
248
244
|
custom_logger
|
|
249
245
|
GUI_tools
|
|
250
246
|
ML_models
|
|
@@ -255,27 +251,6 @@ path_manager
|
|
|
255
251
|
|
|
256
252
|
---
|
|
257
253
|
|
|
258
|
-
### 🎫 Base Tools [base]
|
|
259
|
-
|
|
260
|
-
General purpose functions and classes.
|
|
261
|
-
|
|
262
|
-
```Bash
|
|
263
|
-
pip install "dragon-ml-toolbox[base]"
|
|
264
|
-
```
|
|
265
|
-
|
|
266
|
-
#### Modules:
|
|
267
|
-
|
|
268
|
-
```Bash
|
|
269
|
-
ETL_cleaning
|
|
270
|
-
ETL_engineering
|
|
271
|
-
custom_logger
|
|
272
|
-
SQL
|
|
273
|
-
utilities
|
|
274
|
-
path_manager
|
|
275
|
-
```
|
|
276
|
-
|
|
277
|
-
---
|
|
278
|
-
|
|
279
254
|
### ⚒️ APP bundlers
|
|
280
255
|
|
|
281
256
|
Choose one if needed.
|
|
@@ -293,6 +268,6 @@ pip install "dragon-ml-toolbox[nuitka]"
|
|
|
293
268
|
After installation, import modules like this:
|
|
294
269
|
|
|
295
270
|
```python
|
|
296
|
-
from ml_tools.
|
|
271
|
+
from ml_tools.serde import serialize_object, deserialize_object
|
|
297
272
|
from ml_tools import custom_logger
|
|
298
273
|
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
|
|
3
|
+
ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
|
|
4
|
+
ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
|
|
5
|
+
ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
|
|
6
|
+
ml_tools/MICE_imputation.py,sha256=KLJXGQLKJ6AuWWttAG-LCCaxpS-ygM4dXPiguHDaL6Y,20815
|
|
7
|
+
ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
|
|
8
|
+
ml_tools/ML_datasetmaster.py,sha256=rsJgZEGBJmfeKF6cR8CQZzfEx4T7Y-p1wUnR15_nNw0,28400
|
|
9
|
+
ml_tools/ML_evaluation.py,sha256=4GU86rUWMIGbkXrvN6PyjfGwKtWvXKE7pMlWpWeBq14,18988
|
|
10
|
+
ml_tools/ML_evaluation_multi.py,sha256=rJKdgtq-9I7oaI7PRzq7aIZ84XdNV0xzlVePZW4nj0k,16095
|
|
11
|
+
ml_tools/ML_inference.py,sha256=YJ953bhNWsdlPRtJQh3h2ACfMIgp8dQ9KtL9Azar-5s,23489
|
|
12
|
+
ml_tools/ML_models.py,sha256=PqOcNlws7vCJMbiVCKqlPuktxvskZVUHG3VfU-Yshf8,31415
|
|
13
|
+
ml_tools/ML_models_advanced.py,sha256=vk3PZBSu3DVso2S1rKTxxdS43XG8Q5FnasIL3-rMajc,12410
|
|
14
|
+
ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
|
|
15
|
+
ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
|
|
16
|
+
ml_tools/ML_trainer.py,sha256=ZWI4MbUcLeBxyfoUTL96l5tjHHMp9I64h4SdXnjYmBE,49795
|
|
17
|
+
ml_tools/ML_utilities.py,sha256=z6LbpbZwhn8F__fWlKi-g-cAJQXSxwg1NHfC5FBoAyc,21139
|
|
18
|
+
ml_tools/ML_vision_datasetmaster.py,sha256=tOrdatuq_AP8-GDiTrtARvSJdpc8h7dT-OhDJtRQnsk,54433
|
|
19
|
+
ml_tools/ML_vision_evaluation.py,sha256=t12R7i1RkOCt9zu1_lxSBr8OH6A6Get0k8ftDLctn6I,10486
|
|
20
|
+
ml_tools/ML_vision_inference.py,sha256=He3KV3VJAm8PwO-fOq4b9VO8UXFr-GmpuCnoHXf4VZI,20588
|
|
21
|
+
ml_tools/ML_vision_models.py,sha256=G3S4jB9AE9wMpU9ZygOgOx9q1K6t6LAXBYcJ-U2XQ1M,25600
|
|
22
|
+
ml_tools/ML_vision_transformers.py,sha256=95e0aBkHY5VDGE8i5xy57COU7NvSNIgFknnhBubwE40,1832
|
|
23
|
+
ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
|
|
24
|
+
ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
|
|
25
|
+
ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
|
|
26
|
+
ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
|
|
27
|
+
ml_tools/_ML_vision_recipe.py,sha256=zrgxFUvTJqQVuwR7jWlbIC2FD29u6eNFPkTRoJ7yEZI,3178
|
|
28
|
+
ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
|
|
29
|
+
ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
|
|
30
|
+
ml_tools/_schema.py,sha256=yu6aWmn_2Z4_AxAtJGDDCIa96y6JcUp-vgnCS013Qmw,3908
|
|
31
|
+
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
32
|
+
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
33
|
+
ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
|
|
34
|
+
ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
|
|
35
|
+
ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
|
|
36
|
+
ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
|
|
37
|
+
ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
|
|
38
|
+
ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
|
|
39
|
+
ml_tools/keys.py,sha256=wZOBuEnnHc54vlOZiimnrxfk-sZh6f6suPppJW8rbPQ,3326
|
|
40
|
+
ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
|
|
41
|
+
ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
|
|
42
|
+
ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
|
|
43
|
+
ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
|
|
44
|
+
ml_tools/utilities.py,sha256=aWqvYzmxlD74PD5Yqu1VuTekDJeYLQrmPIU_VeVyRp0,22526
|
|
45
|
+
dragon_ml_toolbox-14.2.0.dist-info/METADATA,sha256=T0eIxD-eO3cbAIzJ1HskJbog6RUYgXwXQQ2OU8Z-GQM,6475
|
|
46
|
+
dragon_ml_toolbox-14.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
47
|
+
dragon_ml_toolbox-14.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
48
|
+
dragon_ml_toolbox-14.2.0.dist-info/RECORD,,
|
|
@@ -26,3 +26,14 @@ This project depends on the following third-party packages. Each is governed by
|
|
|
26
26
|
- [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
|
|
27
27
|
- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
|
|
28
28
|
- [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
|
|
29
|
+
- [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
|
|
30
|
+
- [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
|
|
31
|
+
- [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
|
|
32
|
+
- [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
|
|
33
|
+
- [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
|
|
34
|
+
- [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
|
|
35
|
+
- [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
|
|
36
|
+
- [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
|
|
37
|
+
- [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
|
|
38
|
+
- [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
|
|
39
|
+
- [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
|
ml_tools/ETL_cleaning.py
CHANGED
|
@@ -2,8 +2,10 @@ import polars as pl
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Union, List, Dict
|
|
5
|
+
|
|
5
6
|
from .path_manager import sanitize_filename, make_fullpath
|
|
6
|
-
from .
|
|
7
|
+
from .data_exploration import drop_macro
|
|
8
|
+
from .utilities import save_dataframe_filename, load_dataframe
|
|
7
9
|
from ._script_info import _script_info
|
|
8
10
|
from ._logger import _LOGGER
|
|
9
11
|
|
|
@@ -11,26 +13,33 @@ from ._logger import _LOGGER
|
|
|
11
13
|
__all__ = [
|
|
12
14
|
"save_unique_values",
|
|
13
15
|
"basic_clean",
|
|
16
|
+
"basic_clean_drop",
|
|
14
17
|
"ColumnCleaner",
|
|
15
18
|
"DataFrameCleaner"
|
|
16
19
|
]
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
################ Unique Values per column #################
|
|
20
|
-
def save_unique_values(csv_path: Union[str, Path],
|
|
23
|
+
def save_unique_values(csv_path: Union[str, Path],
|
|
24
|
+
output_dir: Union[str, Path],
|
|
25
|
+
verbose: bool=False,
|
|
26
|
+
keep_column_order: bool = True) -> None:
|
|
21
27
|
"""
|
|
22
28
|
Loads a CSV file, then analyzes it and saves the unique non-null values
|
|
23
29
|
from each column into a separate text file exactly as they appear.
|
|
24
30
|
|
|
25
31
|
This is useful for understanding the raw categories or range of values
|
|
26
|
-
within a dataset before cleaning.
|
|
32
|
+
within a dataset before and after cleaning.
|
|
27
33
|
|
|
28
34
|
Args:
|
|
29
|
-
csv_path (
|
|
35
|
+
csv_path (str | Path):
|
|
30
36
|
The file path to the input CSV file.
|
|
31
|
-
output_dir (
|
|
37
|
+
output_dir (str | Path):
|
|
32
38
|
The path to the directory where the .txt files will be saved.
|
|
33
39
|
The directory will be created if it does not exist.
|
|
40
|
+
keep_column_order (bool):
|
|
41
|
+
If True, prepends a numeric prefix (e.g., '1_', '2_') to each
|
|
42
|
+
output filename to maintain the original column order.
|
|
34
43
|
"""
|
|
35
44
|
# --- 1. Input Validation ---
|
|
36
45
|
csv_path = make_fullpath(input_path=csv_path, enforce="file")
|
|
@@ -72,7 +81,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
|
|
|
72
81
|
sanitized_name = sanitize_filename(column_name)
|
|
73
82
|
if not sanitized_name.strip('_'):
|
|
74
83
|
sanitized_name = f'column_{i}'
|
|
75
|
-
|
|
84
|
+
|
|
85
|
+
# --- create filename prefix ---
|
|
86
|
+
# If keep_column_order is True, create a prefix like "1_", "2_", etc.
|
|
87
|
+
prefix = f"{i + 1}_" if keep_column_order else ''
|
|
88
|
+
|
|
89
|
+
file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
|
|
76
90
|
|
|
77
91
|
# --- Write to file ---
|
|
78
92
|
try:
|
|
@@ -93,39 +107,8 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
|
|
|
93
107
|
_LOGGER.info(f"{counter} files of unique values created.")
|
|
94
108
|
|
|
95
109
|
|
|
96
|
-
########## Basic df
|
|
97
|
-
def
|
|
98
|
-
"""
|
|
99
|
-
Performs a comprehensive, standardized cleaning on all columns of a CSV file.
|
|
100
|
-
|
|
101
|
-
The cleaning process includes:
|
|
102
|
-
- Normalizing full-width and typographical punctuation to standard equivalents.
|
|
103
|
-
- Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
|
|
104
|
-
- Stripping any leading or trailing whitespace.
|
|
105
|
-
- Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
|
|
106
|
-
- Converting strings that become empty after cleaning into true null values.
|
|
107
|
-
- Normalizing all text to lowercase.
|
|
108
|
-
|
|
109
|
-
Args:
|
|
110
|
-
input_filepath (Union[str, Path]):
|
|
111
|
-
The path to the source CSV file to be cleaned.
|
|
112
|
-
output_filepath (Union[str, Path, None], optional):
|
|
113
|
-
The path to save the cleaned CSV file. If None (default),
|
|
114
|
-
the original input file will be overwritten.
|
|
115
|
-
"""
|
|
116
|
-
# Handle paths
|
|
117
|
-
input_path = make_fullpath(input_filepath, enforce="file")
|
|
118
|
-
|
|
119
|
-
# Unless explicitly defined, overwrite file.
|
|
120
|
-
if output_filepath is not None:
|
|
121
|
-
parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
|
|
122
|
-
output_path = parent_dir / Path(output_filepath).name
|
|
123
|
-
else:
|
|
124
|
-
output_path = input_path
|
|
125
|
-
|
|
126
|
-
# load polars df
|
|
127
|
-
df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
|
|
128
|
-
|
|
110
|
+
########## Basic df cleaners #############
|
|
111
|
+
def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
|
|
129
112
|
# Cleaning rules
|
|
130
113
|
cleaning_rules = {
|
|
131
114
|
# 1. Comprehensive Punctuation & Symbol Normalization
|
|
@@ -141,6 +124,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
141
124
|
'⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
|
|
142
125
|
'₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
|
|
143
126
|
'₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
|
|
127
|
+
'⁺': '', '⁻': '', '₊': '', '₋': '',
|
|
144
128
|
# Uppercase Alphabet
|
|
145
129
|
'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F',
|
|
146
130
|
'G': 'G', 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L',
|
|
@@ -154,26 +138,44 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
154
138
|
's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
|
|
155
139
|
'y': 'y', 'z': 'z',
|
|
156
140
|
# Punctuation
|
|
157
|
-
'》': '>', '《': '<', ':': ':', '
|
|
141
|
+
'》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
|
|
158
142
|
'(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
|
|
159
|
-
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '
|
|
143
|
+
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '', '⋅': '',
|
|
144
|
+
'¯': '-',
|
|
145
|
+
|
|
146
|
+
# Commas (avoid commas in entries)
|
|
147
|
+
',': ';',
|
|
148
|
+
',': ';',
|
|
149
|
+
'、':';',
|
|
160
150
|
|
|
161
151
|
# Others
|
|
152
|
+
'σ': '',
|
|
153
|
+
'□': '',
|
|
162
154
|
'©': '',
|
|
163
155
|
'®': '',
|
|
164
156
|
'™': '',
|
|
157
|
+
r'[°˚]': '',
|
|
158
|
+
|
|
159
|
+
# Replace special characters in entries
|
|
160
|
+
r'\\': '_',
|
|
161
|
+
|
|
162
|
+
# Typographical standardization
|
|
163
|
+
# Unify various dashes and hyphens to a standard hyphen
|
|
164
|
+
r'[—–―]': '-',
|
|
165
|
+
r'−': '-',
|
|
166
|
+
# remove various quote types
|
|
167
|
+
r'[“”"]': '',
|
|
168
|
+
r"[‘’′']": '',
|
|
165
169
|
|
|
166
170
|
# Collapse repeating punctuation
|
|
167
171
|
r'\.{2,}': '.', # Replace two or more dots with a single dot
|
|
168
172
|
r'\?{2,}': '?', # Replace two or more question marks with a single question mark
|
|
169
173
|
r'!{2,}': '!', # Replace two or more exclamation marks with a single one
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
r'
|
|
174
|
-
|
|
175
|
-
r'[“”]': "'",
|
|
176
|
-
r'[‘’′]': "'",
|
|
174
|
+
r';{2,}': ';',
|
|
175
|
+
r'-{2,}': '-',
|
|
176
|
+
r'/{2,}': '/',
|
|
177
|
+
r'%{2,}': '%',
|
|
178
|
+
r'&{2,}': '&',
|
|
177
179
|
|
|
178
180
|
# 2. Internal Whitespace Consolidation
|
|
179
181
|
# Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
|
|
@@ -184,36 +186,150 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
184
186
|
r'^\s+|\s+$': '',
|
|
185
187
|
|
|
186
188
|
# 4. Textual Null Standardization (New Step)
|
|
187
|
-
# Convert common null-like text to actual nulls.
|
|
188
|
-
r'^(N/A|无|NA|NULL|NONE|NIL
|
|
189
|
+
# Convert common null-like text to actual nulls.
|
|
190
|
+
r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
|
|
189
191
|
|
|
190
192
|
# 5. Final Nullification of Empty Strings
|
|
191
193
|
# After all cleaning, if a string is now empty, convert it to a null
|
|
192
|
-
r'
|
|
194
|
+
r'^\s*$': None,
|
|
195
|
+
r'^$': None,
|
|
193
196
|
}
|
|
194
197
|
|
|
195
198
|
# Clean data
|
|
196
199
|
try:
|
|
197
200
|
# Create a cleaner for every column in the dataframe
|
|
198
|
-
all_columns =
|
|
201
|
+
all_columns = df_in.columns
|
|
199
202
|
column_cleaners = [
|
|
200
203
|
ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
|
|
201
204
|
]
|
|
202
205
|
|
|
203
206
|
# Instantiate and run the main dataframe cleaner
|
|
204
207
|
df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
|
|
205
|
-
df_cleaned = df_cleaner.clean(
|
|
208
|
+
df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
|
|
206
209
|
|
|
207
210
|
# apply lowercase to all string columns
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
+
if all_lowercase:
|
|
212
|
+
df_final = df_cleaned.with_columns(
|
|
213
|
+
pl.col(pl.String).str.to_lowercase()
|
|
214
|
+
)
|
|
215
|
+
else:
|
|
216
|
+
df_final = df_cleaned
|
|
217
|
+
|
|
211
218
|
except Exception as e:
|
|
212
|
-
_LOGGER.error(f"An error occurred during the cleaning process
|
|
219
|
+
_LOGGER.error(f"An error occurred during the cleaning process.")
|
|
213
220
|
raise e
|
|
221
|
+
else:
|
|
222
|
+
return df_final
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
|
|
226
|
+
# Handle paths
|
|
227
|
+
input_path = make_fullpath(path_in, enforce="file")
|
|
228
|
+
|
|
229
|
+
parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
|
|
230
|
+
output_path = parent_dir / Path(path_out).name
|
|
231
|
+
|
|
232
|
+
return input_path, output_path
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
|
|
236
|
+
"""
|
|
237
|
+
Performs a comprehensive, standardized cleaning on all columns of a CSV file.
|
|
238
|
+
|
|
239
|
+
The cleaning process includes:
|
|
240
|
+
- Normalizing full-width and typographical punctuation to standard equivalents.
|
|
241
|
+
- Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
|
|
242
|
+
- Stripping any leading or trailing whitespace.
|
|
243
|
+
- Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
|
|
244
|
+
- Converting strings that become empty after cleaning into true null values.
|
|
245
|
+
- Normalizing all text to lowercase (Optional).
|
|
246
|
+
|
|
247
|
+
Args:
|
|
248
|
+
input_filepath (str | Path):
|
|
249
|
+
The path to the source CSV file to be cleaned.
|
|
250
|
+
output_filepath (str | Path):
|
|
251
|
+
The path to save the cleaned CSV file.
|
|
252
|
+
all_lowercase (bool):
|
|
253
|
+
Whether to normalize all text to lowercase.
|
|
254
|
+
|
|
255
|
+
"""
|
|
256
|
+
# Handle paths
|
|
257
|
+
input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
|
|
258
|
+
|
|
259
|
+
# load polars df
|
|
260
|
+
df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
|
|
261
|
+
|
|
262
|
+
# CLEAN
|
|
263
|
+
df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
|
|
264
|
+
|
|
265
|
+
# Save cleaned dataframe
|
|
266
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
267
|
+
|
|
268
|
+
_LOGGER.info(f"Data successfully cleaned.")
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
|
|
272
|
+
skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
|
|
273
|
+
"""
|
|
274
|
+
Performs standardized cleaning followed by iterative removal of rows and
|
|
275
|
+
columns with excessive missing data.
|
|
276
|
+
|
|
277
|
+
This function combines the functionality of `basic_clean` and `drop_macro`. It first
|
|
278
|
+
applies a comprehensive normalization process to all columns in the input CSV file,
|
|
279
|
+
ensuring consistent formatting and proper null value handling. The cleaned data is then
|
|
280
|
+
converted to a pandas DataFrame, where iterative row and column dropping is applied
|
|
281
|
+
to remove redundant or incomplete data.
|
|
282
|
+
|
|
283
|
+
The iterative dropping cycle continues until no further rows or columns meet the
|
|
284
|
+
removal criteria, ensuring that dependencies between row and column deletions are
|
|
285
|
+
fully resolved. Logs documenting the missing data profile before and after the
|
|
286
|
+
dropping process are saved to the specified log directory.
|
|
287
|
+
|
|
288
|
+
Args:
|
|
289
|
+
input_filepath (str | Path):
|
|
290
|
+
The path to the source CSV file to be cleaned.
|
|
291
|
+
output_filepath (str | Path):
|
|
292
|
+
The path to save the fully cleaned CSV file after cleaning
|
|
293
|
+
and missing-data-based pruning.
|
|
294
|
+
log_directory (str | Path):
|
|
295
|
+
Path to the directory where missing data reports will be stored.
|
|
296
|
+
targets (list[str]):
|
|
297
|
+
A list of column names to be treated as target variables.
|
|
298
|
+
This list guides the row-dropping logic.
|
|
299
|
+
skip_targets (bool):
|
|
300
|
+
If True, the columns listed in `targets` will be exempt from being dropped,
|
|
301
|
+
even if they exceed the missing data threshold.
|
|
302
|
+
threshold (float):
|
|
303
|
+
The proportion of missing data required to drop a row or column.
|
|
304
|
+
For example, 0.8 means a row/column will be dropped if 80% or more
|
|
305
|
+
of its data is missing.
|
|
306
|
+
all_lowercase (bool):
|
|
307
|
+
Whether to normalize all text to lowercase.
|
|
308
|
+
"""
|
|
309
|
+
# handle log path
|
|
310
|
+
log_path = make_fullpath(log_directory, make=True, enforce="directory")
|
|
311
|
+
|
|
312
|
+
# Handle df paths
|
|
313
|
+
input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
|
|
314
|
+
|
|
315
|
+
# load polars df
|
|
316
|
+
df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
|
|
317
|
+
|
|
318
|
+
# CLEAN
|
|
319
|
+
df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
|
|
320
|
+
|
|
321
|
+
# switch to pandas
|
|
322
|
+
df_cleaned_pandas = df_cleaned.to_pandas()
|
|
323
|
+
|
|
324
|
+
# Drop macro
|
|
325
|
+
df_final = drop_macro(df=df_cleaned_pandas,
|
|
326
|
+
log_directory=log_path,
|
|
327
|
+
targets=targets,
|
|
328
|
+
skip_targets=skip_targets,
|
|
329
|
+
threshold=threshold)
|
|
214
330
|
|
|
215
331
|
# Save cleaned dataframe
|
|
216
|
-
|
|
332
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
217
333
|
|
|
218
334
|
_LOGGER.info(f"Data successfully cleaned.")
|
|
219
335
|
|
|
@@ -378,7 +494,7 @@ class DataFrameCleaner:
|
|
|
378
494
|
if isinstance(output_filepath, str):
|
|
379
495
|
output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
|
|
380
496
|
|
|
381
|
-
|
|
497
|
+
save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
|
|
382
498
|
|
|
383
499
|
return None
|
|
384
500
|
|