dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox has been flagged as potentially problematic; consult the package's registry page for details.
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +72 -34
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,23 +1,17 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version:
|
|
3
|
+
Version: 14.2.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
|
-
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
5
|
+
Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
|
|
8
8
|
Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python: >=3.
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
License-File: LICENSE-THIRD-PARTY.md
|
|
15
|
-
Provides-Extra: base
|
|
16
|
-
Requires-Dist: pandas; extra == "base"
|
|
17
|
-
Requires-Dist: numpy; extra == "base"
|
|
18
|
-
Requires-Dist: polars; extra == "base"
|
|
19
|
-
Requires-Dist: joblib; extra == "base"
|
|
20
|
-
Requires-Dist: colorlog; extra == "base"
|
|
21
15
|
Provides-Extra: ml
|
|
22
16
|
Requires-Dist: numpy>=2.0; extra == "ml"
|
|
23
17
|
Requires-Dist: pandas; extra == "ml"
|
|
@@ -38,7 +32,12 @@ Requires-Dist: shap; extra == "ml"
|
|
|
38
32
|
Requires-Dist: tqdm; extra == "ml"
|
|
39
33
|
Requires-Dist: Pillow; extra == "ml"
|
|
40
34
|
Requires-Dist: evotorch; extra == "ml"
|
|
35
|
+
Requires-Dist: pyarrow; extra == "ml"
|
|
41
36
|
Requires-Dist: colorlog; extra == "ml"
|
|
37
|
+
Requires-Dist: torchmetrics; extra == "ml"
|
|
38
|
+
Provides-Extra: py-tab
|
|
39
|
+
Requires-Dist: pytorch_tabular; extra == "py-tab"
|
|
40
|
+
Requires-Dist: omegaconf; extra == "py-tab"
|
|
42
41
|
Provides-Extra: mice
|
|
43
42
|
Requires-Dist: numpy<2.0; extra == "mice"
|
|
44
43
|
Requires-Dist: pandas; extra == "mice"
|
|
@@ -51,9 +50,7 @@ Requires-Dist: statsmodels; extra == "mice"
|
|
|
51
50
|
Requires-Dist: lightgbm<=4.5.0; extra == "mice"
|
|
52
51
|
Requires-Dist: shap; extra == "mice"
|
|
53
52
|
Requires-Dist: colorlog; extra == "mice"
|
|
54
|
-
|
|
55
|
-
Requires-Dist: torch; extra == "pytorch"
|
|
56
|
-
Requires-Dist: torchvision; extra == "pytorch"
|
|
53
|
+
Requires-Dist: pyarrow; extra == "mice"
|
|
57
54
|
Provides-Extra: excel
|
|
58
55
|
Requires-Dist: pandas; extra == "excel"
|
|
59
56
|
Requires-Dist: openpyxl; extra == "excel"
|
|
@@ -72,9 +69,6 @@ Requires-Dist: lightgbm; extra == "gui-boost"
|
|
|
72
69
|
Provides-Extra: gui-torch
|
|
73
70
|
Requires-Dist: numpy; extra == "gui-torch"
|
|
74
71
|
Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui-torch"
|
|
75
|
-
Provides-Extra: plot
|
|
76
|
-
Requires-Dist: matplotlib; extra == "plot"
|
|
77
|
-
Requires-Dist: seaborn; extra == "plot"
|
|
78
72
|
Provides-Extra: pyinstaller
|
|
79
73
|
Requires-Dist: pyinstaller; extra == "pyinstaller"
|
|
80
74
|
Provides-Extra: nuitka
|
|
@@ -94,7 +88,7 @@ A collection of Python utilities for data science and machine learning, structur
|
|
|
94
88
|
|
|
95
89
|
## Installation
|
|
96
90
|
|
|
97
|
-
**Python 3.
|
|
91
|
+
**Python 3.12**
|
|
98
92
|
|
|
99
93
|
### Via PyPI
|
|
100
94
|
|
|
@@ -104,22 +98,22 @@ Install the latest stable release from PyPI:
|
|
|
104
98
|
pip install dragon-ml-toolbox
|
|
105
99
|
```
|
|
106
100
|
|
|
107
|
-
### Via
|
|
101
|
+
### Via conda-forge
|
|
108
102
|
|
|
109
|
-
|
|
103
|
+
Install from the conda-forge channel:
|
|
110
104
|
|
|
111
105
|
```bash
|
|
112
|
-
|
|
113
|
-
cd ML_tools
|
|
114
|
-
pip install -e .
|
|
106
|
+
conda install -c conda-forge dragon-ml-toolbox
|
|
115
107
|
```
|
|
116
108
|
|
|
117
|
-
### Via
|
|
109
|
+
### Via GitHub (Editable)
|
|
118
110
|
|
|
119
|
-
|
|
111
|
+
Clone the repository and install in editable mode:
|
|
120
112
|
|
|
121
113
|
```bash
|
|
122
|
-
|
|
114
|
+
git clone https://github.com/DrAg0n-BoRn/ML_tools.git
|
|
115
|
+
cd ML_tools
|
|
116
|
+
pip install -e .
|
|
123
117
|
```
|
|
124
118
|
|
|
125
119
|
## Modular Installation
|
|
@@ -132,17 +126,12 @@ Installs a comprehensive set of tools for typical data science workflows, includ
|
|
|
132
126
|
pip install "dragon-ml-toolbox[ML]"
|
|
133
127
|
```
|
|
134
128
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
```Bash
|
|
138
|
-
pip install "dragon-ml-toolbox[pytorch]"
|
|
139
|
-
```
|
|
140
|
-
|
|
141
|
-
⚠️ To make use of GPU acceleration (highly recommended), follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
|
|
129
|
+
⚠️ PyTorch required, follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
|
|
142
130
|
|
|
143
131
|
#### Modules:
|
|
144
132
|
|
|
145
133
|
```bash
|
|
134
|
+
constants
|
|
146
135
|
custom_logger
|
|
147
136
|
data_exploration
|
|
148
137
|
ensemble_evaluation
|
|
@@ -150,19 +139,28 @@ ensemble_inference
|
|
|
150
139
|
ensemble_learning
|
|
151
140
|
ETL_cleaning
|
|
152
141
|
ETL_engineering
|
|
142
|
+
math_utilities
|
|
153
143
|
ML_callbacks
|
|
154
144
|
ML_datasetmaster
|
|
155
145
|
ML_evaluation_multi
|
|
156
146
|
ML_evaluation
|
|
157
147
|
ML_inference
|
|
158
148
|
ML_models
|
|
149
|
+
ML_models_advanced # Requires the extra flag [py-tab]
|
|
159
150
|
ML_optimization
|
|
160
151
|
ML_scaler
|
|
161
152
|
ML_trainer
|
|
153
|
+
ML_utilities
|
|
154
|
+
ML_vision_datasetmaster
|
|
155
|
+
ML_vision_evaluation
|
|
156
|
+
ML_vision_inference
|
|
157
|
+
ML_vision_models
|
|
158
|
+
ML_vision_transformers
|
|
162
159
|
optimization_tools
|
|
163
160
|
path_manager
|
|
164
161
|
PSO_optimization
|
|
165
162
|
RNN_forecast
|
|
163
|
+
serde
|
|
166
164
|
SQL
|
|
167
165
|
utilities
|
|
168
166
|
```
|
|
@@ -180,8 +178,11 @@ pip install "dragon-ml-toolbox[mice]"
|
|
|
180
178
|
#### Modules:
|
|
181
179
|
|
|
182
180
|
```Bash
|
|
181
|
+
constants
|
|
183
182
|
custom_logger
|
|
183
|
+
math_utilities
|
|
184
184
|
MICE_imputation
|
|
185
|
+
serde
|
|
185
186
|
VIF_factor
|
|
186
187
|
path_manager
|
|
187
188
|
utilities
|
|
@@ -209,42 +210,37 @@ path_manager
|
|
|
209
210
|
|
|
210
211
|
### 🎰 GUI for Boosting Algorithms (XGBoost, LightGBM) [gui-boost]
|
|
211
212
|
|
|
212
|
-
|
|
213
|
+
GUI tools compatible with XGBoost and LightGBM models used for inference.
|
|
213
214
|
|
|
214
215
|
```Bash
|
|
215
216
|
pip install "dragon-ml-toolbox[gui-boost]"
|
|
216
217
|
```
|
|
217
218
|
|
|
218
|
-
```Bash
|
|
219
|
-
pip install "dragon-ml-toolbox[gui-boost,plot]"
|
|
220
|
-
```
|
|
221
|
-
|
|
222
219
|
#### Modules:
|
|
223
220
|
|
|
224
221
|
```Bash
|
|
222
|
+
constants
|
|
225
223
|
custom_logger
|
|
226
224
|
GUI_tools
|
|
227
225
|
ensemble_inference
|
|
228
226
|
path_manager
|
|
227
|
+
serde
|
|
229
228
|
```
|
|
230
229
|
|
|
231
230
|
---
|
|
232
231
|
|
|
233
232
|
### 🤖 GUI for PyTorch Models [gui-torch]
|
|
234
233
|
|
|
235
|
-
|
|
234
|
+
GUI tools compatible with PyTorch models used for inference.
|
|
236
235
|
|
|
237
236
|
```Bash
|
|
238
237
|
pip install "dragon-ml-toolbox[gui-torch]"
|
|
239
238
|
```
|
|
240
239
|
|
|
241
|
-
```Bash
|
|
242
|
-
pip install "dragon-ml-toolbox[gui-torch,plot]"
|
|
243
|
-
```
|
|
244
|
-
|
|
245
240
|
#### Modules:
|
|
246
241
|
|
|
247
242
|
```Bash
|
|
243
|
+
constants
|
|
248
244
|
custom_logger
|
|
249
245
|
GUI_tools
|
|
250
246
|
ML_models
|
|
@@ -255,27 +251,6 @@ path_manager
|
|
|
255
251
|
|
|
256
252
|
---
|
|
257
253
|
|
|
258
|
-
### 🎫 Base Tools [base]
|
|
259
|
-
|
|
260
|
-
General purpose functions and classes.
|
|
261
|
-
|
|
262
|
-
```Bash
|
|
263
|
-
pip install "dragon-ml-toolbox[base]"
|
|
264
|
-
```
|
|
265
|
-
|
|
266
|
-
#### Modules:
|
|
267
|
-
|
|
268
|
-
```Bash
|
|
269
|
-
ETL_cleaning
|
|
270
|
-
ETL_engineering
|
|
271
|
-
custom_logger
|
|
272
|
-
SQL
|
|
273
|
-
utilities
|
|
274
|
-
path_manager
|
|
275
|
-
```
|
|
276
|
-
|
|
277
|
-
---
|
|
278
|
-
|
|
279
254
|
### ⚒️ APP bundlers
|
|
280
255
|
|
|
281
256
|
Choose one if needed.
|
|
@@ -293,6 +268,6 @@ pip install "dragon-ml-toolbox[nuitka]"
|
|
|
293
268
|
After installation, import modules like this:
|
|
294
269
|
|
|
295
270
|
```python
|
|
296
|
-
from ml_tools.
|
|
271
|
+
from ml_tools.serde import serialize_object, deserialize_object
|
|
297
272
|
from ml_tools import custom_logger
|
|
298
273
|
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
|
|
3
|
+
ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
|
|
4
|
+
ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
|
|
5
|
+
ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
|
|
6
|
+
ml_tools/MICE_imputation.py,sha256=KLJXGQLKJ6AuWWttAG-LCCaxpS-ygM4dXPiguHDaL6Y,20815
|
|
7
|
+
ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
|
|
8
|
+
ml_tools/ML_datasetmaster.py,sha256=rsJgZEGBJmfeKF6cR8CQZzfEx4T7Y-p1wUnR15_nNw0,28400
|
|
9
|
+
ml_tools/ML_evaluation.py,sha256=4GU86rUWMIGbkXrvN6PyjfGwKtWvXKE7pMlWpWeBq14,18988
|
|
10
|
+
ml_tools/ML_evaluation_multi.py,sha256=rJKdgtq-9I7oaI7PRzq7aIZ84XdNV0xzlVePZW4nj0k,16095
|
|
11
|
+
ml_tools/ML_inference.py,sha256=YJ953bhNWsdlPRtJQh3h2ACfMIgp8dQ9KtL9Azar-5s,23489
|
|
12
|
+
ml_tools/ML_models.py,sha256=PqOcNlws7vCJMbiVCKqlPuktxvskZVUHG3VfU-Yshf8,31415
|
|
13
|
+
ml_tools/ML_models_advanced.py,sha256=vk3PZBSu3DVso2S1rKTxxdS43XG8Q5FnasIL3-rMajc,12410
|
|
14
|
+
ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
|
|
15
|
+
ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
|
|
16
|
+
ml_tools/ML_trainer.py,sha256=ZWI4MbUcLeBxyfoUTL96l5tjHHMp9I64h4SdXnjYmBE,49795
|
|
17
|
+
ml_tools/ML_utilities.py,sha256=z6LbpbZwhn8F__fWlKi-g-cAJQXSxwg1NHfC5FBoAyc,21139
|
|
18
|
+
ml_tools/ML_vision_datasetmaster.py,sha256=tOrdatuq_AP8-GDiTrtARvSJdpc8h7dT-OhDJtRQnsk,54433
|
|
19
|
+
ml_tools/ML_vision_evaluation.py,sha256=t12R7i1RkOCt9zu1_lxSBr8OH6A6Get0k8ftDLctn6I,10486
|
|
20
|
+
ml_tools/ML_vision_inference.py,sha256=He3KV3VJAm8PwO-fOq4b9VO8UXFr-GmpuCnoHXf4VZI,20588
|
|
21
|
+
ml_tools/ML_vision_models.py,sha256=G3S4jB9AE9wMpU9ZygOgOx9q1K6t6LAXBYcJ-U2XQ1M,25600
|
|
22
|
+
ml_tools/ML_vision_transformers.py,sha256=95e0aBkHY5VDGE8i5xy57COU7NvSNIgFknnhBubwE40,1832
|
|
23
|
+
ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
|
|
24
|
+
ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
|
|
25
|
+
ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
|
|
26
|
+
ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
|
|
27
|
+
ml_tools/_ML_vision_recipe.py,sha256=zrgxFUvTJqQVuwR7jWlbIC2FD29u6eNFPkTRoJ7yEZI,3178
|
|
28
|
+
ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
|
|
29
|
+
ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
|
|
30
|
+
ml_tools/_schema.py,sha256=yu6aWmn_2Z4_AxAtJGDDCIa96y6JcUp-vgnCS013Qmw,3908
|
|
31
|
+
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
32
|
+
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
33
|
+
ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
|
|
34
|
+
ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
|
|
35
|
+
ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
|
|
36
|
+
ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
|
|
37
|
+
ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
|
|
38
|
+
ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
|
|
39
|
+
ml_tools/keys.py,sha256=wZOBuEnnHc54vlOZiimnrxfk-sZh6f6suPppJW8rbPQ,3326
|
|
40
|
+
ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
|
|
41
|
+
ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
|
|
42
|
+
ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
|
|
43
|
+
ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
|
|
44
|
+
ml_tools/utilities.py,sha256=aWqvYzmxlD74PD5Yqu1VuTekDJeYLQrmPIU_VeVyRp0,22526
|
|
45
|
+
dragon_ml_toolbox-14.2.0.dist-info/METADATA,sha256=T0eIxD-eO3cbAIzJ1HskJbog6RUYgXwXQQ2OU8Z-GQM,6475
|
|
46
|
+
dragon_ml_toolbox-14.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
47
|
+
dragon_ml_toolbox-14.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
48
|
+
dragon_ml_toolbox-14.2.0.dist-info/RECORD,,
|
|
@@ -26,3 +26,14 @@ This project depends on the following third-party packages. Each is governed by
|
|
|
26
26
|
- [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
|
|
27
27
|
- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
|
|
28
28
|
- [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
|
|
29
|
+
- [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
|
|
30
|
+
- [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
|
|
31
|
+
- [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
|
|
32
|
+
- [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
|
|
33
|
+
- [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
|
|
34
|
+
- [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
|
|
35
|
+
- [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
|
|
36
|
+
- [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
|
|
37
|
+
- [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
|
|
38
|
+
- [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
|
|
39
|
+
- [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
|
ml_tools/ETL_cleaning.py
CHANGED
|
@@ -2,9 +2,10 @@ import polars as pl
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Union, List, Dict
|
|
5
|
+
|
|
5
6
|
from .path_manager import sanitize_filename, make_fullpath
|
|
6
7
|
from .data_exploration import drop_macro
|
|
7
|
-
from .utilities import
|
|
8
|
+
from .utilities import save_dataframe_filename, load_dataframe
|
|
8
9
|
from ._script_info import _script_info
|
|
9
10
|
from ._logger import _LOGGER
|
|
10
11
|
|
|
@@ -19,20 +20,26 @@ __all__ = [
|
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
################ Unique Values per column #################
|
|
22
|
-
def save_unique_values(csv_path: Union[str, Path],
|
|
23
|
+
def save_unique_values(csv_path: Union[str, Path],
|
|
24
|
+
output_dir: Union[str, Path],
|
|
25
|
+
verbose: bool=False,
|
|
26
|
+
keep_column_order: bool = True) -> None:
|
|
23
27
|
"""
|
|
24
28
|
Loads a CSV file, then analyzes it and saves the unique non-null values
|
|
25
29
|
from each column into a separate text file exactly as they appear.
|
|
26
30
|
|
|
27
31
|
This is useful for understanding the raw categories or range of values
|
|
28
|
-
within a dataset before cleaning.
|
|
32
|
+
within a dataset before and after cleaning.
|
|
29
33
|
|
|
30
34
|
Args:
|
|
31
|
-
csv_path (
|
|
35
|
+
csv_path (str | Path):
|
|
32
36
|
The file path to the input CSV file.
|
|
33
|
-
output_dir (
|
|
37
|
+
output_dir (str | Path):
|
|
34
38
|
The path to the directory where the .txt files will be saved.
|
|
35
39
|
The directory will be created if it does not exist.
|
|
40
|
+
keep_column_order (bool):
|
|
41
|
+
If True, prepends a numeric prefix (e.g., '1_', '2_') to each
|
|
42
|
+
output filename to maintain the original column order.
|
|
36
43
|
"""
|
|
37
44
|
# --- 1. Input Validation ---
|
|
38
45
|
csv_path = make_fullpath(input_path=csv_path, enforce="file")
|
|
@@ -74,7 +81,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
|
|
|
74
81
|
sanitized_name = sanitize_filename(column_name)
|
|
75
82
|
if not sanitized_name.strip('_'):
|
|
76
83
|
sanitized_name = f'column_{i}'
|
|
77
|
-
|
|
84
|
+
|
|
85
|
+
# --- create filename prefix ---
|
|
86
|
+
# If keep_column_order is True, create a prefix like "1_", "2_", etc.
|
|
87
|
+
prefix = f"{i + 1}_" if keep_column_order else ''
|
|
88
|
+
|
|
89
|
+
file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
|
|
78
90
|
|
|
79
91
|
# --- Write to file ---
|
|
80
92
|
try:
|
|
@@ -96,7 +108,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
|
|
|
96
108
|
|
|
97
109
|
|
|
98
110
|
########## Basic df cleaners #############
|
|
99
|
-
def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
111
|
+
def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
|
|
100
112
|
# Cleaning rules
|
|
101
113
|
cleaning_rules = {
|
|
102
114
|
# 1. Comprehensive Punctuation & Symbol Normalization
|
|
@@ -126,27 +138,44 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
|
126
138
|
's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
|
|
127
139
|
'y': 'y', 'z': 'z',
|
|
128
140
|
# Punctuation
|
|
129
|
-
'》': '>', '《': '<', ':': ':', '
|
|
141
|
+
'》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
|
|
130
142
|
'(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
|
|
131
|
-
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '
|
|
143
|
+
'$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '', '⋅': '',
|
|
144
|
+
'¯': '-',
|
|
145
|
+
|
|
146
|
+
# Commas (avoid commas in entries)
|
|
147
|
+
',': ';',
|
|
148
|
+
',': ';',
|
|
149
|
+
'、':';',
|
|
132
150
|
|
|
133
151
|
# Others
|
|
152
|
+
'σ': '',
|
|
153
|
+
'□': '',
|
|
134
154
|
'©': '',
|
|
135
155
|
'®': '',
|
|
136
156
|
'™': '',
|
|
157
|
+
r'[°˚]': '',
|
|
137
158
|
|
|
138
|
-
#
|
|
139
|
-
r'
|
|
140
|
-
r'\?{2,}': '?', # Replace two or more question marks with a single question mark
|
|
141
|
-
r'!{2,}': '!', # Replace two or more exclamation marks with a single one
|
|
159
|
+
# Replace special characters in entries
|
|
160
|
+
r'\\': '_',
|
|
142
161
|
|
|
143
162
|
# Typographical standardization
|
|
144
|
-
# Unify various dashes and hyphens to a standard hyphen
|
|
163
|
+
# Unify various dashes and hyphens to a standard hyphen
|
|
145
164
|
r'[—–―]': '-',
|
|
146
165
|
r'−': '-',
|
|
147
|
-
#
|
|
148
|
-
r'[“”]':
|
|
149
|
-
r
|
|
166
|
+
# remove various quote types
|
|
167
|
+
r'[“”"]': '',
|
|
168
|
+
r"[‘’′']": '',
|
|
169
|
+
|
|
170
|
+
# Collapse repeating punctuation
|
|
171
|
+
r'\.{2,}': '.', # Replace two or more dots with a single dot
|
|
172
|
+
r'\?{2,}': '?', # Replace two or more question marks with a single question mark
|
|
173
|
+
r'!{2,}': '!', # Replace two or more exclamation marks with a single one
|
|
174
|
+
r';{2,}': ';',
|
|
175
|
+
r'-{2,}': '-',
|
|
176
|
+
r'/{2,}': '/',
|
|
177
|
+
r'%{2,}': '%',
|
|
178
|
+
r'&{2,}': '&',
|
|
150
179
|
|
|
151
180
|
# 2. Internal Whitespace Consolidation
|
|
152
181
|
# Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
|
|
@@ -158,7 +187,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
|
158
187
|
|
|
159
188
|
# 4. Textual Null Standardization (New Step)
|
|
160
189
|
# Convert common null-like text to actual nulls.
|
|
161
|
-
r'^(N/A|无|NA|NULL|NONE|NIL
|
|
190
|
+
r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
|
|
162
191
|
|
|
163
192
|
# 5. Final Nullification of Empty Strings
|
|
164
193
|
# After all cleaning, if a string is now empty, convert it to a null
|
|
@@ -179,9 +208,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
|
|
|
179
208
|
df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
|
|
180
209
|
|
|
181
210
|
# apply lowercase to all string columns
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
211
|
+
if all_lowercase:
|
|
212
|
+
df_final = df_cleaned.with_columns(
|
|
213
|
+
pl.col(pl.String).str.to_lowercase()
|
|
214
|
+
)
|
|
215
|
+
else:
|
|
216
|
+
df_final = df_cleaned
|
|
217
|
+
|
|
185
218
|
except Exception as e:
|
|
186
219
|
_LOGGER.error(f"An error occurred during the cleaning process.")
|
|
187
220
|
raise e
|
|
@@ -199,7 +232,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
|
|
|
199
232
|
return input_path, output_path
|
|
200
233
|
|
|
201
234
|
|
|
202
|
-
def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
|
|
235
|
+
def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
|
|
203
236
|
"""
|
|
204
237
|
Performs a comprehensive, standardized cleaning on all columns of a CSV file.
|
|
205
238
|
|
|
@@ -209,13 +242,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
209
242
|
- Stripping any leading or trailing whitespace.
|
|
210
243
|
- Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
|
|
211
244
|
- Converting strings that become empty after cleaning into true null values.
|
|
212
|
-
- Normalizing all text to lowercase.
|
|
245
|
+
- Normalizing all text to lowercase (Optional).
|
|
213
246
|
|
|
214
247
|
Args:
|
|
215
|
-
input_filepath (
|
|
248
|
+
input_filepath (str | Path):
|
|
216
249
|
The path to the source CSV file to be cleaned.
|
|
217
|
-
output_filepath (
|
|
250
|
+
output_filepath (str | Path):
|
|
218
251
|
The path to save the cleaned CSV file.
|
|
252
|
+
all_lowercase (bool):
|
|
253
|
+
Whether to normalize all text to lowercase.
|
|
254
|
+
|
|
219
255
|
"""
|
|
220
256
|
# Handle paths
|
|
221
257
|
input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
|
|
@@ -224,16 +260,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
224
260
|
df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
|
|
225
261
|
|
|
226
262
|
# CLEAN
|
|
227
|
-
df_final = _cleaner_core(df)
|
|
263
|
+
df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
|
|
228
264
|
|
|
229
265
|
# Save cleaned dataframe
|
|
230
|
-
|
|
266
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
231
267
|
|
|
232
268
|
_LOGGER.info(f"Data successfully cleaned.")
|
|
233
269
|
|
|
234
270
|
|
|
235
271
|
def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
|
|
236
|
-
skip_targets: bool=False, threshold: float=0.8):
|
|
272
|
+
skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
|
|
237
273
|
"""
|
|
238
274
|
Performs standardized cleaning followed by iterative removal of rows and
|
|
239
275
|
columns with excessive missing data.
|
|
@@ -250,12 +286,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
250
286
|
dropping process are saved to the specified log directory.
|
|
251
287
|
|
|
252
288
|
Args:
|
|
253
|
-
input_filepath (str
|
|
289
|
+
input_filepath (str | Path):
|
|
254
290
|
The path to the source CSV file to be cleaned.
|
|
255
|
-
output_filepath (str
|
|
291
|
+
output_filepath (str | Path):
|
|
256
292
|
The path to save the fully cleaned CSV file after cleaning
|
|
257
293
|
and missing-data-based pruning.
|
|
258
|
-
log_directory (str
|
|
294
|
+
log_directory (str | Path):
|
|
259
295
|
Path to the directory where missing data reports will be stored.
|
|
260
296
|
targets (list[str]):
|
|
261
297
|
A list of column names to be treated as target variables.
|
|
@@ -267,6 +303,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
267
303
|
The proportion of missing data required to drop a row or column.
|
|
268
304
|
For example, 0.8 means a row/column will be dropped if 80% or more
|
|
269
305
|
of its data is missing.
|
|
306
|
+
all_lowercase (bool):
|
|
307
|
+
Whether to normalize all text to lowercase.
|
|
270
308
|
"""
|
|
271
309
|
# handle log path
|
|
272
310
|
log_path = make_fullpath(log_directory, make=True, enforce="directory")
|
|
@@ -278,7 +316,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
278
316
|
df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
|
|
279
317
|
|
|
280
318
|
# CLEAN
|
|
281
|
-
df_cleaned = _cleaner_core(df)
|
|
319
|
+
df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
|
|
282
320
|
|
|
283
321
|
# switch to pandas
|
|
284
322
|
df_cleaned_pandas = df_cleaned.to_pandas()
|
|
@@ -291,7 +329,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
291
329
|
threshold=threshold)
|
|
292
330
|
|
|
293
331
|
# Save cleaned dataframe
|
|
294
|
-
|
|
332
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
295
333
|
|
|
296
334
|
_LOGGER.info(f"Data successfully cleaned.")
|
|
297
335
|
|
|
@@ -456,7 +494,7 @@ class DataFrameCleaner:
|
|
|
456
494
|
if isinstance(output_filepath, str):
|
|
457
495
|
output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
|
|
458
496
|
|
|
459
|
-
|
|
497
|
+
save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
|
|
460
498
|
|
|
461
499
|
return None
|
|
462
500
|
|