dragon-ml-toolbox 13.3.0__py3-none-any.whl → 16.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +20 -6
- dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
- ml_tools/ETL_cleaning.py +20 -20
- ml_tools/ETL_engineering.py +23 -25
- ml_tools/GUI_tools.py +20 -20
- ml_tools/MICE_imputation.py +207 -5
- ml_tools/ML_callbacks.py +43 -26
- ml_tools/ML_configuration.py +788 -0
- ml_tools/ML_datasetmaster.py +303 -448
- ml_tools/ML_evaluation.py +351 -93
- ml_tools/ML_evaluation_multi.py +139 -42
- ml_tools/ML_inference.py +290 -209
- ml_tools/ML_models.py +33 -106
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +12 -12
- ml_tools/ML_scaler.py +11 -11
- ml_tools/ML_sequence_datasetmaster.py +341 -0
- ml_tools/ML_sequence_evaluation.py +219 -0
- ml_tools/ML_sequence_inference.py +391 -0
- ml_tools/ML_sequence_models.py +139 -0
- ml_tools/ML_trainer.py +1604 -179
- ml_tools/ML_utilities.py +351 -4
- ml_tools/ML_vision_datasetmaster.py +1540 -0
- ml_tools/ML_vision_evaluation.py +284 -0
- ml_tools/ML_vision_inference.py +405 -0
- ml_tools/ML_vision_models.py +641 -0
- ml_tools/ML_vision_transformers.py +284 -0
- ml_tools/PSO_optimization.py +6 -6
- ml_tools/SQL.py +4 -4
- ml_tools/_keys.py +171 -0
- ml_tools/_schema.py +1 -1
- ml_tools/custom_logger.py +37 -14
- ml_tools/data_exploration.py +502 -93
- ml_tools/ensemble_evaluation.py +54 -11
- ml_tools/ensemble_inference.py +7 -33
- ml_tools/ensemble_learning.py +1 -1
- ml_tools/math_utilities.py +1 -1
- ml_tools/optimization_tools.py +2 -2
- ml_tools/path_manager.py +5 -5
- ml_tools/serde.py +2 -2
- ml_tools/utilities.py +192 -4
- dragon_ml_toolbox-13.3.0.dist-info/RECORD +0 -41
- ml_tools/RNN_forecast.py +0 -56
- ml_tools/keys.py +0 -87
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 13.3.0
|
|
4
|
-
Summary:
|
|
5
|
-
Author-email:
|
|
3
|
+
Version: 16.2.0
|
|
4
|
+
Summary: Complete pipelines and helper tools for data science and machine learning projects.
|
|
5
|
+
Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
|
|
8
8
|
Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
|
|
@@ -34,6 +34,10 @@ Requires-Dist: Pillow; extra == "ml"
|
|
|
34
34
|
Requires-Dist: evotorch; extra == "ml"
|
|
35
35
|
Requires-Dist: pyarrow; extra == "ml"
|
|
36
36
|
Requires-Dist: colorlog; extra == "ml"
|
|
37
|
+
Requires-Dist: torchmetrics; extra == "ml"
|
|
38
|
+
Provides-Extra: py-tab
|
|
39
|
+
Requires-Dist: pytorch_tabular; extra == "py-tab"
|
|
40
|
+
Requires-Dist: omegaconf; extra == "py-tab"
|
|
37
41
|
Provides-Extra: mice
|
|
38
42
|
Requires-Dist: numpy<2.0; extra == "mice"
|
|
39
43
|
Requires-Dist: pandas; extra == "mice"
|
|
@@ -75,7 +79,7 @@ Dynamic: license-file
|
|
|
75
79
|
|
|
76
80
|
# dragon-ml-toolbox
|
|
77
81
|
|
|
78
|
-
A collection of
|
|
82
|
+
A collection of machine learning pipelines and utilities, structured as modular packages for easy reuse and installation. This package has no base dependencies, allowing for lightweight and customized virtual environments.
|
|
79
83
|
|
|
80
84
|
### Features:
|
|
81
85
|
|
|
@@ -137,19 +141,29 @@ ETL_cleaning
|
|
|
137
141
|
ETL_engineering
|
|
138
142
|
math_utilities
|
|
139
143
|
ML_callbacks
|
|
144
|
+
ML_configuration
|
|
140
145
|
ML_datasetmaster
|
|
141
146
|
ML_evaluation_multi
|
|
142
147
|
ML_evaluation
|
|
143
148
|
ML_inference
|
|
144
149
|
ML_models
|
|
150
|
+
ML_models_advanced # Requires the extra flag [py-tab]
|
|
145
151
|
ML_optimization
|
|
146
152
|
ML_scaler
|
|
153
|
+
ML_sequence_datasetmaster
|
|
154
|
+
ML_sequence_evaluation
|
|
155
|
+
ML_sequence_inference
|
|
156
|
+
ML_sequence_models
|
|
147
157
|
ML_trainer
|
|
148
158
|
ML_utilities
|
|
159
|
+
ML_vision_datasetmaster
|
|
160
|
+
ML_vision_evaluation
|
|
161
|
+
ML_vision_inference
|
|
162
|
+
ML_vision_models
|
|
163
|
+
ML_vision_transformers
|
|
149
164
|
optimization_tools
|
|
150
165
|
path_manager
|
|
151
166
|
PSO_optimization
|
|
152
|
-
RNN_forecast
|
|
153
167
|
serde
|
|
154
168
|
SQL
|
|
155
169
|
utilities
|
|
@@ -191,7 +205,6 @@ pip install "dragon-ml-toolbox[excel]"
|
|
|
191
205
|
#### Modules:
|
|
192
206
|
|
|
193
207
|
```Bash
|
|
194
|
-
constants
|
|
195
208
|
custom_logger
|
|
196
209
|
handle_excel
|
|
197
210
|
path_manager
|
|
@@ -236,6 +249,7 @@ custom_logger
|
|
|
236
249
|
GUI_tools
|
|
237
250
|
ML_models
|
|
238
251
|
ML_inference
|
|
252
|
+
ML_sequence_inference
|
|
239
253
|
ML_scaler
|
|
240
254
|
path_manager
|
|
241
255
|
```
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
dragon_ml_toolbox-16.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-16.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
|
|
3
|
+
ml_tools/ETL_cleaning.py,sha256=Bg0nTmpNzQKDdezK3m0NjYT7N8_ANGlmD9mDXjggqkA,20522
|
|
4
|
+
ml_tools/ETL_engineering.py,sha256=PGXvlvMWa05J1rsMNXxnHzXIe2K68qhtigSn74W8kFI,54961
|
|
5
|
+
ml_tools/GUI_tools.py,sha256=QMSu-8eSNminD6A6Yg9sXo4ff6GNPThwRBVgQQwAAbY,45508
|
|
6
|
+
ml_tools/MICE_imputation.py,sha256=2MsHeKTd8MSBIYmj0q671Fm4wCBvMGjpxULp__jDNgo,20812
|
|
7
|
+
ml_tools/ML_callbacks.py,sha256=EF7Px_IV3IIJpfaT0Nwbv4-_0C6IUlJ_xjzHOekXwq0,16410
|
|
8
|
+
ml_tools/ML_configuration.py,sha256=MKuy1v53LDLX_a8TDn5DwIdv_tyS2pz44ycw0S6aQfY,31601
|
|
9
|
+
ml_tools/ML_datasetmaster.py,sha256=isvRXI8vNRTFNCFFFpGtsUA8hS6ZDNezLuDpKd9VU9c,28514
|
|
10
|
+
ml_tools/ML_evaluation.py,sha256=LrvTnrS32pFmmsmh_3KGHUREUCNSI5vKIB2JIuBq8oI,30107
|
|
11
|
+
ml_tools/ML_evaluation_multi.py,sha256=mEN8jKaU1N7UdgldEykqME0MV_yubojD1StyQC5bFEA,20416
|
|
12
|
+
ml_tools/ML_inference.py,sha256=qxoeurcqp-soapfgHUuzt-NFg0KGwg_wOIuzsRMyJqQ,29447
|
|
13
|
+
ml_tools/ML_models.py,sha256=OEiuUduu2KqsfXZIfzJHR3uop_Zo6dzdKtvaOeRt1G0,27932
|
|
14
|
+
ml_tools/ML_models_advanced.py,sha256=5Y-Kda3P972F9zyfqCS4ndqOL-XXri010nhNp_bhHvY,12411
|
|
15
|
+
ml_tools/ML_optimization.py,sha256=2EwaKHKoZPnvN02d4q0tLO7aBMXSO8cEuhLl0bx28bg,22692
|
|
16
|
+
ml_tools/ML_scaler.py,sha256=Rp6h6U013UK56XhiV_Rmj1CSMI7OSIJLqC0vn6RkiQY,7527
|
|
17
|
+
ml_tools/ML_sequence_datasetmaster.py,sha256=WLkZ_yBcT5bjnbZ1SaecSXBms9IqC596lOCq14D48bc,15569
|
|
18
|
+
ml_tools/ML_sequence_evaluation.py,sha256=TYl5g1sgBK1Eo-J9WE00cr_N3bTP5UZJtAkKggh9ZU8,7888
|
|
19
|
+
ml_tools/ML_sequence_inference.py,sha256=k9Q8nSvUGdNrmnS4uXh3DkfrxQAqJI68Zs3oaiz2daY,17876
|
|
20
|
+
ml_tools/ML_sequence_models.py,sha256=PVmk7nK-lIl2asR4r6XgT0TIYSJKY4Um4D65gsyE_Qw,5597
|
|
21
|
+
ml_tools/ML_trainer.py,sha256=0kuwjPfFdp181qu0RwRGsFWSqNcjHR3P6hkTVJNNWIE,104927
|
|
22
|
+
ml_tools/ML_utilities.py,sha256=kj4CoI7YyAj9fipzFdl9gSYIMzDAjcPqgupTBkB1BQg,22994
|
|
23
|
+
ml_tools/ML_vision_datasetmaster.py,sha256=3rT7q91t_FghiLf9LT45rTYGhoyHOz_HJUOaf-4kvUQ,64857
|
|
24
|
+
ml_tools/ML_vision_evaluation.py,sha256=dOJBy1ja4Njg215w2DyB6jwC6nWz5G3Hg7LT7oWtIpI,11538
|
|
25
|
+
ml_tools/ML_vision_inference.py,sha256=XFXVQAZhivLKU9EfvpiqXj5th41bz3bkKhpPo8zdpE8,19645
|
|
26
|
+
ml_tools/ML_vision_models.py,sha256=NojhEZcQiIZ3iKCo5eFkcxetCEVuBKbujVUNruHes-U,26175
|
|
27
|
+
ml_tools/ML_vision_transformers.py,sha256=CEHPzkonub4-s21hjhj30O01dr5sVj9EEkrqmnFl03Y,10749
|
|
28
|
+
ml_tools/PSO_optimization.py,sha256=wAi7BaY-_QoRZ8ibHD6xpyhUABofrabHV7oiryBz5D0,22931
|
|
29
|
+
ml_tools/SQL.py,sha256=hBTKC_OotSuWc0DeD8sI-u2GJS7X_4oANjmLcY1YW_w,11210
|
|
30
|
+
ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
|
|
31
|
+
ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
|
|
32
|
+
ml_tools/_keys.py,sha256=n967pTEwW935Eog7CJXChLru7374SC2Xv5fb7dX8mPc,4785
|
|
33
|
+
ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
|
|
34
|
+
ml_tools/_schema.py,sha256=bE2RhOhXZd2u8MEQLOM--01ILPDxLqQAhZ3hZpFTXAI,3909
|
|
35
|
+
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
36
|
+
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
37
|
+
ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
|
|
38
|
+
ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
|
|
39
|
+
ml_tools/ensemble_evaluation.py,sha256=-pxhmCMPjaqSjJxXxaD_asKtoamztATjXJL7YKlsvZk,28369
|
|
40
|
+
ml_tools/ensemble_inference.py,sha256=uQPJiBK1GcckmeWgZn7BzaaPKIAQIBglmbUOuK9WknY,8560
|
|
41
|
+
ml_tools/ensemble_learning.py,sha256=Bh5WupUF93yLM5IBaQBsOqTVjWKjyfz7jN9IuRaZQ_o,21965
|
|
42
|
+
ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
|
|
43
|
+
ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
|
|
44
|
+
ml_tools/optimization_tools.py,sha256=_sCLZy9LRIIqt1zkYyKNsSbDK3JjRIhC-sADq-JtegE,12751
|
|
45
|
+
ml_tools/path_manager.py,sha256=2lTnhfDNdYlrqP_LGDoP51LdUf9hlTsZKuZJoYq5W-U,18462
|
|
46
|
+
ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
|
|
47
|
+
ml_tools/utilities.py,sha256=wFwdv7xFV8Sv6kNy4_tE7RNasRs_318Zm7s65Uwu2Us,22509
|
|
48
|
+
dragon_ml_toolbox-16.2.0.dist-info/METADATA,sha256=AX3k2aBOqU4TVzZxagmFQ-NZTJLKa7hpTK7qb0YasuM,6591
|
|
49
|
+
dragon_ml_toolbox-16.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
50
|
+
dragon_ml_toolbox-16.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
51
|
+
dragon_ml_toolbox-16.2.0.dist-info/RECORD,,
|
|
@@ -27,3 +27,13 @@ This project depends on the following third-party packages. Each is governed by
|
|
|
27
27
|
- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
|
|
28
28
|
- [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
|
|
29
29
|
- [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
|
|
30
|
+
- [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
|
|
31
|
+
- [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
|
|
32
|
+
- [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
|
|
33
|
+
- [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
|
|
34
|
+
- [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
|
|
35
|
+
- [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
|
|
36
|
+
- [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
|
|
37
|
+
- [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
|
|
38
|
+
- [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
|
|
39
|
+
- [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
|
ml_tools/ETL_cleaning.py
CHANGED
|
@@ -14,8 +14,8 @@ __all__ = [
|
|
|
14
14
|
"save_unique_values",
|
|
15
15
|
"basic_clean",
|
|
16
16
|
"basic_clean_drop",
|
|
17
|
-
"
|
|
18
|
-
"
|
|
17
|
+
"DragonColumnCleaner",
|
|
18
|
+
"DragonDataFrameCleaner"
|
|
19
19
|
]
|
|
20
20
|
|
|
21
21
|
|
|
@@ -200,11 +200,11 @@ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
|
|
|
200
200
|
# Create a cleaner for every column in the dataframe
|
|
201
201
|
all_columns = df_in.columns
|
|
202
202
|
column_cleaners = [
|
|
203
|
-
|
|
203
|
+
DragonColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
|
|
204
204
|
]
|
|
205
205
|
|
|
206
206
|
# Instantiate and run the main dataframe cleaner
|
|
207
|
-
df_cleaner =
|
|
207
|
+
df_cleaner = DragonDataFrameCleaner(cleaners=column_cleaners)
|
|
208
208
|
df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
|
|
209
209
|
|
|
210
210
|
# apply lowercase to all string columns
|
|
@@ -335,12 +335,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
335
335
|
|
|
336
336
|
|
|
337
337
|
########## EXTRACT and CLEAN ##########
|
|
338
|
-
class ColumnCleaner:
|
|
338
|
+
class DragonColumnCleaner:
|
|
339
339
|
"""
|
|
340
340
|
A configuration object that defines cleaning rules for a single Polars DataFrame column.
|
|
341
341
|
|
|
342
342
|
This class holds a dictionary of regex-to-replacement rules, the target column name,
|
|
343
|
-
and the case-sensitivity setting. It is intended to be used with the
|
|
343
|
+
and the case-sensitivity setting. It is intended to be used with the DragonDataFrameCleaner.
|
|
344
344
|
|
|
345
345
|
Notes:
|
|
346
346
|
- Define rules from most specific to more general to create a fallback system.
|
|
@@ -365,8 +365,8 @@ class ColumnCleaner:
|
|
|
365
365
|
r'ID[- ](\\d+)': r'ID:$1'
|
|
366
366
|
}
|
|
367
367
|
|
|
368
|
-
id_cleaner =
|
|
369
|
-
# This object would then be passed to a
|
|
368
|
+
id_cleaner = DragonColumnCleaner(column_name='user_id', rules=id_rules)
|
|
369
|
+
# This object would then be passed to a DragonDataFrameCleaner.
|
|
370
370
|
```
|
|
371
371
|
"""
|
|
372
372
|
def __init__(self, column_name: str, rules: Dict[str, str], case_insensitive: bool = True):
|
|
@@ -382,34 +382,34 @@ class ColumnCleaner:
|
|
|
382
382
|
self.case_insensitive = case_insensitive
|
|
383
383
|
|
|
384
384
|
|
|
385
|
-
class DataFrameCleaner:
|
|
385
|
+
class DragonDataFrameCleaner:
|
|
386
386
|
"""
|
|
387
387
|
Orchestrates cleaning multiple columns in a Polars DataFrame.
|
|
388
388
|
|
|
389
|
-
This class takes a list of
|
|
389
|
+
This class takes a list of DragonColumnCleaner objects and applies their defined
|
|
390
390
|
rules to the corresponding columns of a DataFrame using high-performance
|
|
391
391
|
Polars expressions.
|
|
392
392
|
|
|
393
393
|
Args:
|
|
394
|
-
cleaners (List[
|
|
395
|
-
A list of
|
|
394
|
+
cleaners (List[DragonColumnCleaner]):
|
|
395
|
+
A list of DragonColumnCleaner configuration objects.
|
|
396
396
|
|
|
397
397
|
Raises:
|
|
398
|
-
TypeError: If 'cleaners' is not a list or contains non-
|
|
399
|
-
ValueError: If multiple
|
|
398
|
+
TypeError: If 'cleaners' is not a list or contains non-DragonColumnCleaner objects.
|
|
399
|
+
ValueError: If multiple DragonColumnCleaner objects target the same column.
|
|
400
400
|
"""
|
|
401
|
-
def __init__(self, cleaners: List[ColumnCleaner]):
|
|
401
|
+
def __init__(self, cleaners: List[DragonColumnCleaner]):
|
|
402
402
|
if not isinstance(cleaners, list):
|
|
403
|
-
_LOGGER.error("The 'cleaners' argument must be a list of
|
|
403
|
+
_LOGGER.error("The 'cleaners' argument must be a list of DragonColumnCleaner objects.")
|
|
404
404
|
raise TypeError()
|
|
405
405
|
|
|
406
406
|
seen_columns = set()
|
|
407
407
|
for cleaner in cleaners:
|
|
408
|
-
if not isinstance(cleaner, ColumnCleaner):
|
|
409
|
-
_LOGGER.error(f"All items in 'cleaners' list must be
|
|
408
|
+
if not isinstance(cleaner, DragonColumnCleaner):
|
|
409
|
+
_LOGGER.error(f"All items in 'cleaners' list must be DragonColumnCleaner objects, but found an object of type {type(cleaner).__name__}.")
|
|
410
410
|
raise TypeError()
|
|
411
411
|
if cleaner.column_name in seen_columns:
|
|
412
|
-
_LOGGER.error(f"Duplicate
|
|
412
|
+
_LOGGER.error(f"Duplicate DragonColumnCleaner found for column '{cleaner.column_name}'. Each column should only have one cleaner.")
|
|
413
413
|
raise ValueError()
|
|
414
414
|
seen_columns.add(cleaner.column_name)
|
|
415
415
|
|
|
@@ -475,7 +475,7 @@ class DataFrameCleaner:
|
|
|
475
475
|
"""
|
|
476
476
|
This convenience method encapsulates the entire cleaning process into a
|
|
477
477
|
single call. It loads a DataFrame from a specified file, applies all
|
|
478
|
-
cleaning rules configured in the `
|
|
478
|
+
cleaning rules configured in the `DragonDataFrameCleaner` instance, and saves
|
|
479
479
|
the resulting cleaned DataFrame to a new file.
|
|
480
480
|
|
|
481
481
|
The method ensures that all data is loaded as string types to prevent
|
ml_tools/ETL_engineering.py
CHANGED
|
@@ -8,11 +8,12 @@ from .path_manager import make_fullpath
|
|
|
8
8
|
from ._script_info import _script_info
|
|
9
9
|
from ._logger import _LOGGER
|
|
10
10
|
from .constants import CHEMICAL_ELEMENT_SYMBOLS
|
|
11
|
+
from ._keys import MagicWords
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
__all__ = [
|
|
14
|
-
"
|
|
15
|
-
"
|
|
15
|
+
"DragonTransformRecipe",
|
|
16
|
+
"DragonProcessor",
|
|
16
17
|
"BinaryTransformer",
|
|
17
18
|
"MultiBinaryDummifier",
|
|
18
19
|
"AutoDummifier",
|
|
@@ -32,16 +33,13 @@ __all__ = [
|
|
|
32
33
|
|
|
33
34
|
############ TRANSFORM MAIN ####################
|
|
34
35
|
|
|
35
|
-
|
|
36
|
-
_RENAME = "rename"
|
|
37
|
-
|
|
38
|
-
class TransformationRecipe:
|
|
36
|
+
class DragonTransformRecipe:
|
|
39
37
|
"""
|
|
40
38
|
A builder class for creating a data transformation recipe.
|
|
41
39
|
|
|
42
40
|
This class provides a structured way to define a series of transformation
|
|
43
41
|
steps, with validation performed at the time of addition. It is designed
|
|
44
|
-
to be passed to a `
|
|
42
|
+
to be passed to a `DragonProcessor`.
|
|
45
43
|
|
|
46
44
|
Use the method `add()` to add recipes.
|
|
47
45
|
"""
|
|
@@ -53,7 +51,7 @@ class TransformationRecipe:
|
|
|
53
51
|
input_col_name: str,
|
|
54
52
|
transform: Union[str, Callable],
|
|
55
53
|
output_col_names: Optional[Union[str, List[str]]] = None
|
|
56
|
-
) -> "TransformationRecipe":
|
|
54
|
+
) -> "DragonTransformRecipe":
|
|
57
55
|
"""
|
|
58
56
|
Adds a new transformation step to the recipe.
|
|
59
57
|
|
|
@@ -77,12 +75,12 @@ class TransformationRecipe:
|
|
|
77
75
|
_LOGGER.error("'input_col' must be a non-empty string.")
|
|
78
76
|
raise TypeError()
|
|
79
77
|
|
|
80
|
-
if transform == _RENAME:
|
|
78
|
+
if transform == MagicWords.RENAME:
|
|
81
79
|
if not isinstance(output_col_names, str):
|
|
82
80
|
_LOGGER.error("For a RENAME operation, 'output_col' must be a string.")
|
|
83
81
|
raise TypeError()
|
|
84
82
|
elif not isinstance(transform, Callable):
|
|
85
|
-
_LOGGER.error(f"'transform' must be a callable function or the string '{
|
|
83
|
+
_LOGGER.error(f"'transform' must be a callable function or the string '{MagicWords.RENAME}'.")
|
|
86
84
|
raise TypeError()
|
|
87
85
|
|
|
88
86
|
# --- Add Step ---
|
|
@@ -103,22 +101,22 @@ class TransformationRecipe:
|
|
|
103
101
|
return len(self._steps)
|
|
104
102
|
|
|
105
103
|
|
|
106
|
-
class DataProcessor:
|
|
104
|
+
class DragonProcessor:
|
|
107
105
|
"""
|
|
108
|
-
Transforms a Polars DataFrame based on a provided `
|
|
106
|
+
Transforms a Polars DataFrame based on a provided `DragonTransformRecipe` object.
|
|
109
107
|
|
|
110
108
|
Use the methods `transform()` or `load_transform_save()`.
|
|
111
109
|
"""
|
|
112
|
-
def __init__(self, recipe: TransformationRecipe):
|
|
110
|
+
def __init__(self, recipe: DragonTransformRecipe):
|
|
113
111
|
"""
|
|
114
|
-
Initializes the
|
|
112
|
+
Initializes the DragonProcessor with a transformation recipe.
|
|
115
113
|
|
|
116
114
|
Args:
|
|
117
|
-
recipe: An instance of the `
|
|
115
|
+
recipe: An instance of the `DragonTransformRecipe` class that has
|
|
118
116
|
been populated with transformation steps.
|
|
119
117
|
"""
|
|
120
|
-
if not isinstance(recipe, TransformationRecipe):
|
|
121
|
-
_LOGGER.error("The recipe must be an instance of
|
|
118
|
+
if not isinstance(recipe, DragonTransformRecipe):
|
|
119
|
+
_LOGGER.error("The recipe must be an instance of DragonTransformRecipe.")
|
|
122
120
|
raise TypeError()
|
|
123
121
|
if len(recipe) == 0:
|
|
124
122
|
_LOGGER.error("The recipe cannot be empty.")
|
|
@@ -142,7 +140,7 @@ class DataProcessor:
|
|
|
142
140
|
|
|
143
141
|
input_series = df.get_column(input_col_name)
|
|
144
142
|
|
|
145
|
-
if transform_action == _RENAME:
|
|
143
|
+
if transform_action == MagicWords.RENAME:
|
|
146
144
|
processed_columns.append(input_series.alias(output_col_spec))
|
|
147
145
|
continue
|
|
148
146
|
|
|
@@ -237,7 +235,7 @@ class DataProcessor:
|
|
|
237
235
|
Provides a detailed, human-readable string representation of the
|
|
238
236
|
entire processing pipeline.
|
|
239
237
|
"""
|
|
240
|
-
header = "
|
|
238
|
+
header = "DragonProcessor Pipeline"
|
|
241
239
|
divider = "-" * len(header)
|
|
242
240
|
num_steps = len(self._recipe)
|
|
243
241
|
|
|
@@ -255,7 +253,7 @@ class DataProcessor:
|
|
|
255
253
|
transform_action = step["transform"]
|
|
256
254
|
|
|
257
255
|
# Get a clean name for the transformation action
|
|
258
|
-
if transform_action ==
|
|
256
|
+
if transform_action == MagicWords.RENAME: # "rename"
|
|
259
257
|
transform_name = "Rename"
|
|
260
258
|
else:
|
|
261
259
|
# This works for both functions and class instances
|
|
@@ -394,7 +392,7 @@ class MultiBinaryDummifier:
|
|
|
394
392
|
|
|
395
393
|
For each keyword provided, this transformer generates a corresponding column
|
|
396
394
|
with a value of 1 if the keyword is present in the input string, and 0 otherwise.
|
|
397
|
-
It is designed to be used within the
|
|
395
|
+
It is designed to be used within the DragonProcessor pipeline.
|
|
398
396
|
|
|
399
397
|
Args:
|
|
400
398
|
keywords (List[str]):
|
|
@@ -443,7 +441,7 @@ class MultiBinaryDummifier:
|
|
|
443
441
|
.when(str_column.str.contains(pattern))
|
|
444
442
|
.then(pl.lit(1, dtype=pl.UInt8))
|
|
445
443
|
.otherwise(pl.lit(0, dtype=pl.UInt8))
|
|
446
|
-
.alias(f"{column_base_name}_{keyword}") # name for
|
|
444
|
+
.alias(f"{column_base_name}_{keyword}") # name for DragonProcessor
|
|
447
445
|
)
|
|
448
446
|
output_expressions.append(expr)
|
|
449
447
|
|
|
@@ -533,7 +531,7 @@ class NumberExtractor:
|
|
|
533
531
|
A configurable transformer that extracts a single number from a Polars string series using a regular expression.
|
|
534
532
|
|
|
535
533
|
An instance can be used as a 'transform' callable within the
|
|
536
|
-
`
|
|
534
|
+
`DragonProcessor` pipeline.
|
|
537
535
|
|
|
538
536
|
Args:
|
|
539
537
|
regex_pattern (str):
|
|
@@ -872,7 +870,7 @@ class MultiTemperatureExtractor:
|
|
|
872
870
|
pl.when(column.is_not_null())
|
|
873
871
|
.then(final_expr)
|
|
874
872
|
.otherwise(None)
|
|
875
|
-
.alias(f"{column_base_name}_{i}") # Temporary name for
|
|
873
|
+
.alias(f"{column_base_name}_{i}") # Temporary name for DragonProcessor
|
|
876
874
|
)
|
|
877
875
|
|
|
878
876
|
output_expressions.append(final_expr)
|
|
@@ -1300,7 +1298,7 @@ class MolecularFormulaTransformer:
|
|
|
1300
1298
|
each chemical element has its own column. The value in each column is the
|
|
1301
1299
|
stoichiometric quantity of that element.
|
|
1302
1300
|
|
|
1303
|
-
It is designed to be used within the
|
|
1301
|
+
It is designed to be used within the DragonProcessor pipeline.
|
|
1304
1302
|
"""
|
|
1305
1303
|
|
|
1306
1304
|
def __init__(self):
|
ml_tools/GUI_tools.py
CHANGED
|
@@ -8,15 +8,15 @@ import numpy as np
|
|
|
8
8
|
|
|
9
9
|
from ._script_info import _script_info
|
|
10
10
|
from ._logger import _LOGGER
|
|
11
|
-
from .
|
|
11
|
+
from ._keys import _OneHotOtherPlaceholder
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
__all__ = [
|
|
15
|
-
"
|
|
16
|
-
"
|
|
15
|
+
"DragonGUIConfig",
|
|
16
|
+
"DragonGUIFactory",
|
|
17
17
|
"catch_exceptions",
|
|
18
|
-
"
|
|
19
|
-
"
|
|
18
|
+
"DragonFeatureMaster",
|
|
19
|
+
"DragonGUIHandler"
|
|
20
20
|
]
|
|
21
21
|
|
|
22
22
|
# --- Configuration Management ---
|
|
@@ -55,14 +55,14 @@ class _SectionProxy:
|
|
|
55
55
|
# Fallback to the original string
|
|
56
56
|
return value_str
|
|
57
57
|
|
|
58
|
-
class ConfigManager:
|
|
58
|
+
class DragonGUIConfig:
|
|
59
59
|
"""
|
|
60
60
|
Loads a .ini file and provides access to its values as object attributes.
|
|
61
61
|
Includes a method to generate a default configuration template.
|
|
62
62
|
"""
|
|
63
63
|
def __init__(self, config_path: str | Path):
|
|
64
64
|
"""
|
|
65
|
-
Initializes the
|
|
65
|
+
Initializes the DragonGUIConfig and dynamically creates attributes
|
|
66
66
|
based on the .ini file's sections and options.
|
|
67
67
|
"""
|
|
68
68
|
config_path = Path(config_path)
|
|
@@ -78,7 +78,7 @@ class ConfigManager:
|
|
|
78
78
|
@staticmethod
|
|
79
79
|
def generate_template(file_path: str | Path):
|
|
80
80
|
"""
|
|
81
|
-
Generates a complete, commented .ini template file that works with the
|
|
81
|
+
Generates a complete, commented .ini template file that works with the DragonGUIFactory.
|
|
82
82
|
|
|
83
83
|
Args:
|
|
84
84
|
file_path (str | Path): The path where the .ini file will be saved.
|
|
@@ -155,12 +155,12 @@ class ConfigManager:
|
|
|
155
155
|
|
|
156
156
|
|
|
157
157
|
# --- GUI Factory ---
|
|
158
|
-
class
|
|
158
|
+
class DragonGUIFactory:
|
|
159
159
|
"""
|
|
160
160
|
Builds styled FreeSimpleGUI elements and layouts using a "building block"
|
|
161
|
-
approach, driven by a
|
|
161
|
+
approach, driven by a DragonGUIConfig instance.
|
|
162
162
|
"""
|
|
163
|
-
def __init__(self, config:
|
|
163
|
+
def __init__(self, config: DragonGUIConfig):
|
|
164
164
|
"""
|
|
165
165
|
Initializes the factory with a configuration object.
|
|
166
166
|
"""
|
|
@@ -456,7 +456,7 @@ def catch_exceptions(show_popup: bool = True):
|
|
|
456
456
|
|
|
457
457
|
|
|
458
458
|
# --- Feature Handler ---
|
|
459
|
-
class FeatureMaster:
|
|
459
|
+
class DragonFeatureMaster:
|
|
460
460
|
"""
|
|
461
461
|
Manages and organizes feature definitions for a machine learning model.
|
|
462
462
|
|
|
@@ -488,7 +488,7 @@ class FeatureMaster:
|
|
|
488
488
|
categorical_features: Optional[List[Tuple[str, str, Dict[str, int]]]] = None,
|
|
489
489
|
add_one_hot_other_placeholder: bool = True) -> None:
|
|
490
490
|
"""
|
|
491
|
-
Initializes the
|
|
491
|
+
Initializes the DragonFeatureMaster instance by processing feature and target definitions.
|
|
492
492
|
|
|
493
493
|
This constructor creates internal mappings to translate between GUI-friendly names and model-specific feature names. It also
|
|
494
494
|
prepares data structures needed to populate UI components.
|
|
@@ -806,17 +806,17 @@ class FeatureMaster:
|
|
|
806
806
|
|
|
807
807
|
|
|
808
808
|
# --- GUI-Model API ---
|
|
809
|
-
class GUIHandler:
|
|
809
|
+
class DragonGUIHandler:
|
|
810
810
|
"""
|
|
811
811
|
Translates data between a GUI and a machine learning model.
|
|
812
812
|
|
|
813
813
|
This class acts as the primary interface between a user-facing application
|
|
814
|
-
(FreeSimpleGUI) and the model's expected data format. It uses a `
|
|
814
|
+
(FreeSimpleGUI) and the model's expected data format. It uses a `DragonFeatureMaster` instance to correctly process
|
|
815
815
|
and encode user inputs.
|
|
816
816
|
|
|
817
817
|
Its main responsibilities are:
|
|
818
818
|
1. To take raw values from GUI elements and, using the definitions from
|
|
819
|
-
`
|
|
819
|
+
`DragonFeatureMaster`, convert them into a single, ordered `numpy.ndarray`
|
|
820
820
|
that can be fed directly into a model for inference.
|
|
821
821
|
2. To take the results of a model's inference and update the
|
|
822
822
|
corresponding target fields in the GUI to display the prediction.
|
|
@@ -824,13 +824,13 @@ class GUIHandler:
|
|
|
824
824
|
This handler ensures a clean separation of concerns, where the GUI is
|
|
825
825
|
only responsible for presentation, and the model sees correctly formatted numerical data.
|
|
826
826
|
"""
|
|
827
|
-
def __init__(self, feature_handler:
|
|
827
|
+
def __init__(self, feature_handler: DragonFeatureMaster, model_expected_features: list[str]) -> None:
|
|
828
828
|
"""
|
|
829
|
-
Initializes the
|
|
829
|
+
Initializes the DragonGUIHandler.
|
|
830
830
|
|
|
831
831
|
Args:
|
|
832
|
-
feature_handler (
|
|
833
|
-
An initialized instance of the `
|
|
832
|
+
feature_handler (DragonFeatureMaster):
|
|
833
|
+
An initialized instance of the `DragonFeatureMaster` class. This object
|
|
834
834
|
contains all the necessary mappings and definitions for the model's
|
|
835
835
|
features and targets.
|
|
836
836
|
model_expected_features (list[str]):
|