dragon-ml-toolbox 12.13.0__py3-none-any.whl → 14.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/METADATA +11 -2
- dragon_ml_toolbox-14.3.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
- ml_tools/MICE_imputation.py +207 -5
- ml_tools/ML_callbacks.py +40 -8
- ml_tools/ML_datasetmaster.py +200 -261
- ml_tools/ML_evaluation.py +29 -17
- ml_tools/ML_evaluation_multi.py +13 -10
- ml_tools/ML_inference.py +14 -5
- ml_tools/ML_models.py +135 -55
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +49 -36
- ml_tools/ML_trainer.py +560 -30
- ml_tools/ML_utilities.py +302 -4
- ml_tools/ML_vision_datasetmaster.py +1352 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_schema.py +96 -0
- ml_tools/custom_logger.py +37 -14
- ml_tools/data_exploration.py +576 -138
- ml_tools/keys.py +51 -1
- ml_tools/math_utilities.py +1 -1
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +78 -17
- ml_tools/utilities.py +192 -3
- dragon_ml_toolbox-12.13.0.dist-info/RECORD +0 -41
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.13.0.dist-info → dragon_ml_toolbox-14.3.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version:
|
|
3
|
+
Version: 14.3.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
5
|
Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -34,6 +34,10 @@ Requires-Dist: Pillow; extra == "ml"
|
|
|
34
34
|
Requires-Dist: evotorch; extra == "ml"
|
|
35
35
|
Requires-Dist: pyarrow; extra == "ml"
|
|
36
36
|
Requires-Dist: colorlog; extra == "ml"
|
|
37
|
+
Requires-Dist: torchmetrics; extra == "ml"
|
|
38
|
+
Provides-Extra: py-tab
|
|
39
|
+
Requires-Dist: pytorch_tabular; extra == "py-tab"
|
|
40
|
+
Requires-Dist: omegaconf; extra == "py-tab"
|
|
37
41
|
Provides-Extra: mice
|
|
38
42
|
Requires-Dist: numpy<2.0; extra == "mice"
|
|
39
43
|
Requires-Dist: pandas; extra == "mice"
|
|
@@ -142,10 +146,16 @@ ML_evaluation_multi
|
|
|
142
146
|
ML_evaluation
|
|
143
147
|
ML_inference
|
|
144
148
|
ML_models
|
|
149
|
+
ML_models_advanced # Requires the extra flag [py-tab]
|
|
145
150
|
ML_optimization
|
|
146
151
|
ML_scaler
|
|
147
152
|
ML_trainer
|
|
148
153
|
ML_utilities
|
|
154
|
+
ML_vision_datasetmaster
|
|
155
|
+
ML_vision_evaluation
|
|
156
|
+
ML_vision_inference
|
|
157
|
+
ML_vision_models
|
|
158
|
+
ML_vision_transformers
|
|
149
159
|
optimization_tools
|
|
150
160
|
path_manager
|
|
151
161
|
PSO_optimization
|
|
@@ -191,7 +201,6 @@ pip install "dragon-ml-toolbox[excel]"
|
|
|
191
201
|
#### Modules:
|
|
192
202
|
|
|
193
203
|
```Bash
|
|
194
|
-
constants
|
|
195
204
|
custom_logger
|
|
196
205
|
handle_excel
|
|
197
206
|
path_manager
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
dragon_ml_toolbox-14.3.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-14.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
|
|
3
|
+
ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
|
|
4
|
+
ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
|
|
5
|
+
ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
|
|
6
|
+
ml_tools/MICE_imputation.py,sha256=KLJXGQLKJ6AuWWttAG-LCCaxpS-ygM4dXPiguHDaL6Y,20815
|
|
7
|
+
ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
|
|
8
|
+
ml_tools/ML_datasetmaster.py,sha256=rsJgZEGBJmfeKF6cR8CQZzfEx4T7Y-p1wUnR15_nNw0,28400
|
|
9
|
+
ml_tools/ML_evaluation.py,sha256=4GU86rUWMIGbkXrvN6PyjfGwKtWvXKE7pMlWpWeBq14,18988
|
|
10
|
+
ml_tools/ML_evaluation_multi.py,sha256=rJKdgtq-9I7oaI7PRzq7aIZ84XdNV0xzlVePZW4nj0k,16095
|
|
11
|
+
ml_tools/ML_inference.py,sha256=YJ953bhNWsdlPRtJQh3h2ACfMIgp8dQ9KtL9Azar-5s,23489
|
|
12
|
+
ml_tools/ML_models.py,sha256=PqOcNlws7vCJMbiVCKqlPuktxvskZVUHG3VfU-Yshf8,31415
|
|
13
|
+
ml_tools/ML_models_advanced.py,sha256=vk3PZBSu3DVso2S1rKTxxdS43XG8Q5FnasIL3-rMajc,12410
|
|
14
|
+
ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
|
|
15
|
+
ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
|
|
16
|
+
ml_tools/ML_trainer.py,sha256=ZWI4MbUcLeBxyfoUTL96l5tjHHMp9I64h4SdXnjYmBE,49795
|
|
17
|
+
ml_tools/ML_utilities.py,sha256=z6LbpbZwhn8F__fWlKi-g-cAJQXSxwg1NHfC5FBoAyc,21139
|
|
18
|
+
ml_tools/ML_vision_datasetmaster.py,sha256=feFNUBjybzVJJrdyqToQ_mLV1uDJXHkNL0tmn_zofSY,56034
|
|
19
|
+
ml_tools/ML_vision_evaluation.py,sha256=t12R7i1RkOCt9zu1_lxSBr8OH6A6Get0k8ftDLctn6I,10486
|
|
20
|
+
ml_tools/ML_vision_inference.py,sha256=He3KV3VJAm8PwO-fOq4b9VO8UXFr-GmpuCnoHXf4VZI,20588
|
|
21
|
+
ml_tools/ML_vision_models.py,sha256=G3S4jB9AE9wMpU9ZygOgOx9q1K6t6LAXBYcJ-U2XQ1M,25600
|
|
22
|
+
ml_tools/ML_vision_transformers.py,sha256=95e0aBkHY5VDGE8i5xy57COU7NvSNIgFknnhBubwE40,1832
|
|
23
|
+
ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
|
|
24
|
+
ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
|
|
25
|
+
ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
|
|
26
|
+
ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
|
|
27
|
+
ml_tools/_ML_vision_recipe.py,sha256=zrgxFUvTJqQVuwR7jWlbIC2FD29u6eNFPkTRoJ7yEZI,3178
|
|
28
|
+
ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
|
|
29
|
+
ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
|
|
30
|
+
ml_tools/_schema.py,sha256=yu6aWmn_2Z4_AxAtJGDDCIa96y6JcUp-vgnCS013Qmw,3908
|
|
31
|
+
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
32
|
+
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
33
|
+
ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
|
|
34
|
+
ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
|
|
35
|
+
ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
|
|
36
|
+
ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
|
|
37
|
+
ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
|
|
38
|
+
ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
|
|
39
|
+
ml_tools/keys.py,sha256=wZOBuEnnHc54vlOZiimnrxfk-sZh6f6suPppJW8rbPQ,3326
|
|
40
|
+
ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
|
|
41
|
+
ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
|
|
42
|
+
ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
|
|
43
|
+
ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
|
|
44
|
+
ml_tools/utilities.py,sha256=aWqvYzmxlD74PD5Yqu1VuTekDJeYLQrmPIU_VeVyRp0,22526
|
|
45
|
+
dragon_ml_toolbox-14.3.0.dist-info/METADATA,sha256=TeVrfmCt4AVSweSN4Ai0yyZCJMQtSD1MHsUoEQHXLg4,6475
|
|
46
|
+
dragon_ml_toolbox-14.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
47
|
+
dragon_ml_toolbox-14.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
48
|
+
dragon_ml_toolbox-14.3.0.dist-info/RECORD,,
|
|
@@ -27,3 +27,13 @@ This project depends on the following third-party packages. Each is governed by
|
|
|
27
27
|
- [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
|
|
28
28
|
- [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
|
|
29
29
|
- [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
|
|
30
|
+
- [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
|
|
31
|
+
- [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
|
|
32
|
+
- [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
|
|
33
|
+
- [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
|
|
34
|
+
- [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
|
|
35
|
+
- [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
|
|
36
|
+
- [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
|
|
37
|
+
- [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
|
|
38
|
+
- [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
|
|
39
|
+
- [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
|
ml_tools/MICE_imputation.py
CHANGED
|
@@ -7,19 +7,20 @@ from plotnine import ggplot, labs, theme, element_blank # type: ignore
|
|
|
7
7
|
from typing import Optional, Union
|
|
8
8
|
|
|
9
9
|
from .utilities import load_dataframe, merge_dataframes, save_dataframe_filename
|
|
10
|
-
from .math_utilities import threshold_binary_values
|
|
10
|
+
from .math_utilities import threshold_binary_values, discretize_categorical_values
|
|
11
11
|
from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
|
|
12
12
|
from ._logger import _LOGGER
|
|
13
13
|
from ._script_info import _script_info
|
|
14
|
+
from ._schema import FeatureSchema
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
__all__ = [
|
|
18
|
+
"MiceImputer",
|
|
17
19
|
"apply_mice",
|
|
18
20
|
"save_imputed_datasets",
|
|
19
|
-
"get_na_column_names",
|
|
20
21
|
"get_convergence_diagnostic",
|
|
21
22
|
"get_imputed_distributions",
|
|
22
|
-
"run_mice_pipeline"
|
|
23
|
+
"run_mice_pipeline",
|
|
23
24
|
]
|
|
24
25
|
|
|
25
26
|
|
|
@@ -79,7 +80,7 @@ def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df
|
|
|
79
80
|
|
|
80
81
|
|
|
81
82
|
#Get names of features that had missing values before imputation
|
|
82
|
-
def
|
|
83
|
+
def _get_na_column_names(df: pd.DataFrame):
|
|
83
84
|
return [col for col in df.columns if df[col].isna().any()]
|
|
84
85
|
|
|
85
86
|
|
|
@@ -264,7 +265,7 @@ def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str]
|
|
|
264
265
|
|
|
265
266
|
save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
|
|
266
267
|
|
|
267
|
-
imputed_column_names =
|
|
268
|
+
imputed_column_names = _get_na_column_names(df=df)
|
|
268
269
|
|
|
269
270
|
get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
|
|
270
271
|
|
|
@@ -278,5 +279,206 @@ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
|
|
|
278
279
|
return df_feats, df_targets
|
|
279
280
|
|
|
280
281
|
|
|
282
|
+
# modern implementation
|
|
283
|
+
class MiceImputer:
|
|
284
|
+
"""
|
|
285
|
+
A modern MICE imputation pipeline that uses a FeatureSchema
|
|
286
|
+
to correctly discretize categorical features after imputation.
|
|
287
|
+
"""
|
|
288
|
+
def __init__(self,
|
|
289
|
+
schema: FeatureSchema,
|
|
290
|
+
iterations: int=20,
|
|
291
|
+
resulting_datasets: int = 1,
|
|
292
|
+
random_state: int = 101):
|
|
293
|
+
|
|
294
|
+
self.schema = schema
|
|
295
|
+
self.random_state = random_state
|
|
296
|
+
self.iterations = iterations
|
|
297
|
+
self.resulting_datasets = resulting_datasets
|
|
298
|
+
|
|
299
|
+
# --- Store schema info ---
|
|
300
|
+
|
|
301
|
+
# 1. Categorical info
|
|
302
|
+
if not self.schema.categorical_index_map:
|
|
303
|
+
_LOGGER.warning("FeatureSchema has no 'categorical_index_map'. No discretization will be applied.")
|
|
304
|
+
self.cat_info = {}
|
|
305
|
+
else:
|
|
306
|
+
self.cat_info = self.schema.categorical_index_map
|
|
307
|
+
|
|
308
|
+
# 2. Ordered feature names (critical for index mapping)
|
|
309
|
+
self.ordered_features = list(self.schema.feature_names)
|
|
310
|
+
|
|
311
|
+
# 3. Names of categorical features
|
|
312
|
+
self.categorical_features = list(self.schema.categorical_feature_names)
|
|
313
|
+
|
|
314
|
+
_LOGGER.info(f"MiceImputer initialized. Found {len(self.cat_info)} categorical features to discretize.")
|
|
315
|
+
|
|
316
|
+
def _post_process(self, imputed_df: pd.DataFrame) -> pd.DataFrame:
|
|
317
|
+
"""
|
|
318
|
+
Applies schema-based discretization to a completed dataframe.
|
|
319
|
+
|
|
320
|
+
This method works around the behavior of `discretize_categorical_values`
|
|
321
|
+
(which returns a full int32 array) by:
|
|
322
|
+
1. Calling it on the full, ordered feature array.
|
|
323
|
+
2. Extracting *only* the valid discretized categorical columns.
|
|
324
|
+
3. Updating the original float dataframe with these integer values.
|
|
325
|
+
"""
|
|
326
|
+
# If no categorical features are defined, return the df as-is.
|
|
327
|
+
if not self.cat_info:
|
|
328
|
+
return imputed_df
|
|
329
|
+
|
|
330
|
+
try:
|
|
331
|
+
# 1. Ensure DataFrame columns match the schema order
|
|
332
|
+
# This is critical for the index-based categorical_info
|
|
333
|
+
df_ordered: pd.DataFrame = imputed_df[self.ordered_features] # type: ignore
|
|
334
|
+
|
|
335
|
+
# 2. Convert to NumPy array
|
|
336
|
+
array_ordered = df_ordered.to_numpy()
|
|
337
|
+
|
|
338
|
+
# 3. Apply discretization utility (which returns a full int32 array)
|
|
339
|
+
# This array has *correct* categorical values but *truncated* continuous values.
|
|
340
|
+
discretized_array_int32 = discretize_categorical_values(
|
|
341
|
+
array_ordered,
|
|
342
|
+
self.cat_info,
|
|
343
|
+
start_at_zero=True # Assuming 0-based indexing
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
# 4. Create a new DF from the int32 array, keeping the categorical columns.
|
|
347
|
+
df_discretized_cats = pd.DataFrame(
|
|
348
|
+
discretized_array_int32,
|
|
349
|
+
columns=self.ordered_features,
|
|
350
|
+
index=df_ordered.index # <-- Critical: align index
|
|
351
|
+
)[self.categorical_features] # <-- Select only cat features
|
|
352
|
+
|
|
353
|
+
# 5. "Rejoin": Start with a fresh copy of the *original* imputed DF (which has correct continuous floats).
|
|
354
|
+
final_df = df_ordered.copy()
|
|
355
|
+
|
|
356
|
+
# 6. Use .update() to "paste" the integer categorical values
|
|
357
|
+
# over the old float categorical values. Continuous floats are unaffected.
|
|
358
|
+
final_df.update(df_discretized_cats)
|
|
359
|
+
|
|
360
|
+
return final_df
|
|
361
|
+
|
|
362
|
+
except Exception as e:
|
|
363
|
+
_LOGGER.error(f"Failed during post-processing discretization:\n\tInput DF shape: {imputed_df.shape}\n\tSchema features: {len(self.ordered_features)}\n\tCategorical info keys: {list(self.cat_info.keys())}\n{e}")
|
|
364
|
+
raise
|
|
365
|
+
|
|
366
|
+
def _run_mice(self,
|
|
367
|
+
df: pd.DataFrame,
|
|
368
|
+
df_name: str) -> tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
|
|
369
|
+
"""
|
|
370
|
+
Runs the MICE kernel and applies schema-based post-processing.
|
|
371
|
+
|
|
372
|
+
Parameters:
|
|
373
|
+
df (pd.DataFrame): The input dataframe *with NaNs*. Should only contain feature columns.
|
|
374
|
+
df_name (str): The base name for the dataset.
|
|
375
|
+
|
|
376
|
+
Returns:
|
|
377
|
+
tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
|
|
378
|
+
- The trained MICE kernel
|
|
379
|
+
- A list of imputed and processed DataFrames
|
|
380
|
+
- A list of names for the new DataFrames
|
|
381
|
+
"""
|
|
382
|
+
# Ensure input df only contains features from the schema and is in the correct order.
|
|
383
|
+
try:
|
|
384
|
+
df_feats = df[self.ordered_features]
|
|
385
|
+
except KeyError as e:
|
|
386
|
+
_LOGGER.error(f"Input DataFrame is missing required schema columns: {e}")
|
|
387
|
+
raise
|
|
388
|
+
|
|
389
|
+
# 1. Initialize kernel
|
|
390
|
+
kernel = mf.ImputationKernel(
|
|
391
|
+
data=df_feats,
|
|
392
|
+
num_datasets=self.resulting_datasets,
|
|
393
|
+
random_state=self.random_state
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
_LOGGER.info("➡️ Schema-based MICE imputation running...")
|
|
397
|
+
|
|
398
|
+
# 2. Perform MICE
|
|
399
|
+
kernel.mice(self.iterations)
|
|
400
|
+
|
|
401
|
+
# 3. Retrieve, process, and collect datasets
|
|
402
|
+
imputed_datasets = []
|
|
403
|
+
for i in range(self.resulting_datasets):
|
|
404
|
+
# complete_data returns a pd.DataFrame
|
|
405
|
+
completed_df = kernel.complete_data(dataset=i)
|
|
406
|
+
|
|
407
|
+
# Apply our new discretization and ordering
|
|
408
|
+
processed_df = self._post_process(completed_df)
|
|
409
|
+
imputed_datasets.append(processed_df)
|
|
410
|
+
|
|
411
|
+
if not imputed_datasets:
|
|
412
|
+
_LOGGER.error("No imputed datasets were generated.")
|
|
413
|
+
raise ValueError()
|
|
414
|
+
|
|
415
|
+
# 4. Generate names
|
|
416
|
+
if self.resulting_datasets == 1:
|
|
417
|
+
imputed_dataset_names = [f"{df_name}_MICE"]
|
|
418
|
+
else:
|
|
419
|
+
imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(self.resulting_datasets)]
|
|
420
|
+
|
|
421
|
+
# 5. Validate indexes
|
|
422
|
+
for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
|
|
423
|
+
assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}"
|
|
424
|
+
assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}"
|
|
425
|
+
|
|
426
|
+
_LOGGER.info("Schema-based MICE imputation complete.")
|
|
427
|
+
|
|
428
|
+
return kernel, imputed_datasets, imputed_dataset_names
|
|
429
|
+
|
|
430
|
+
def run_pipeline(self,
|
|
431
|
+
df_path_or_dir: Union[str,Path],
|
|
432
|
+
save_datasets_dir: Union[str,Path],
|
|
433
|
+
save_metrics_dir: Union[str,Path],
|
|
434
|
+
):
|
|
435
|
+
"""
|
|
436
|
+
Runs the complete MICE imputation pipeline.
|
|
437
|
+
|
|
438
|
+
This method automates the entire workflow:
|
|
439
|
+
1. Loads data from a CSV file path or a directory with CSV files.
|
|
440
|
+
2. Separates features and targets based on the `FeatureSchema`.
|
|
441
|
+
3. Runs the MICE algorithm on the feature set.
|
|
442
|
+
4. Applies schema-based post-processing to discretize categorical features.
|
|
443
|
+
5. Saves the final, processed, and imputed dataset(s) (re-joined with targets) to `save_datasets_dir`.
|
|
444
|
+
6. Generates and saves convergence and distribution plots for all imputed columns to `save_metrics_dir`.
|
|
445
|
+
|
|
446
|
+
Parameters
|
|
447
|
+
----------
|
|
448
|
+
df_path_or_dir :[str,Path]
|
|
449
|
+
Path to a single CSV file or a directory containing multiple CSV files to impute.
|
|
450
|
+
save_datasets_dir : [str,Path]
|
|
451
|
+
Directory where the final imputed and processed dataset(s) will be saved as CSVs.
|
|
452
|
+
save_metrics_dir : [str,Path]
|
|
453
|
+
Directory where convergence and distribution plots will be saved.
|
|
454
|
+
"""
|
|
455
|
+
# Check paths
|
|
456
|
+
save_datasets_path = make_fullpath(save_datasets_dir, make=True)
|
|
457
|
+
save_metrics_path = make_fullpath(save_metrics_dir, make=True)
|
|
458
|
+
|
|
459
|
+
input_path = make_fullpath(df_path_or_dir)
|
|
460
|
+
if input_path.is_file():
|
|
461
|
+
all_file_paths = [input_path]
|
|
462
|
+
else:
|
|
463
|
+
all_file_paths = list(list_csv_paths(input_path).values())
|
|
464
|
+
|
|
465
|
+
for df_path in all_file_paths:
|
|
466
|
+
|
|
467
|
+
df, df_name = load_dataframe(df_path=df_path, kind="pandas")
|
|
468
|
+
|
|
469
|
+
df_features: pd.DataFrame = df[self.schema.feature_names] # type: ignore
|
|
470
|
+
df_targets = df.drop(columns=self.schema.feature_names)
|
|
471
|
+
|
|
472
|
+
imputed_column_names = _get_na_column_names(df=df_features)
|
|
473
|
+
|
|
474
|
+
kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_features, df_name=df_name)
|
|
475
|
+
|
|
476
|
+
save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
|
|
477
|
+
|
|
478
|
+
get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
|
|
479
|
+
|
|
480
|
+
get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
|
|
481
|
+
|
|
482
|
+
|
|
281
483
|
def info():
|
|
282
484
|
_script_info(__all__)
|
ml_tools/ML_callbacks.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Union, Literal, Optional
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
|
|
7
7
|
from .path_manager import make_fullpath, sanitize_filename
|
|
8
|
-
from .keys import PyTorchLogKeys
|
|
8
|
+
from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
|
|
9
9
|
from ._logger import _LOGGER
|
|
10
10
|
from ._script_info import _script_info
|
|
11
11
|
|
|
@@ -189,7 +189,7 @@ class EarlyStopping(Callback):
|
|
|
189
189
|
|
|
190
190
|
class ModelCheckpoint(Callback):
|
|
191
191
|
"""
|
|
192
|
-
Saves the model weights to a directory with automated filename generation and rotation.
|
|
192
|
+
Saves the model weights, optimizer state, LR scheduler state (if any), and epoch number to a directory with automated filename generation and rotation.
|
|
193
193
|
"""
|
|
194
194
|
def __init__(self, save_dir: Union[str,Path], checkpoint_name: Optional[str]=None, monitor: str = PyTorchLogKeys.VAL_LOSS,
|
|
195
195
|
save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
|
|
@@ -200,7 +200,7 @@ class ModelCheckpoint(Callback):
|
|
|
200
200
|
Args:
|
|
201
201
|
save_dir (str): Directory where checkpoint files will be saved.
|
|
202
202
|
checkpoint_name (str| None): If None, the filename will include the epoch and score.
|
|
203
|
-
monitor (str): Metric to monitor
|
|
203
|
+
monitor (str): Metric to monitor.
|
|
204
204
|
save_best_only (bool): If true, save only the best model.
|
|
205
205
|
mode (str): One of {'auto', 'min', 'max'}.
|
|
206
206
|
verbose (int): Verbosity mode.
|
|
@@ -270,15 +270,29 @@ class ModelCheckpoint(Callback):
|
|
|
270
270
|
if self.verbose > 0:
|
|
271
271
|
_LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")
|
|
272
272
|
|
|
273
|
+
# Update best score *before* saving
|
|
274
|
+
self.best = current
|
|
275
|
+
|
|
276
|
+
# Create a comprehensive checkpoint dictionary
|
|
277
|
+
checkpoint_data = {
|
|
278
|
+
PyTorchCheckpointKeys.EPOCH: epoch,
|
|
279
|
+
PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
|
|
280
|
+
PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
|
|
281
|
+
PyTorchCheckpointKeys.BEST_SCORE: self.best,
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
# Check for scheduler
|
|
285
|
+
if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
|
|
286
|
+
checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
|
|
287
|
+
|
|
273
288
|
# Save the new best model
|
|
274
|
-
torch.save(
|
|
289
|
+
torch.save(checkpoint_data, new_filepath)
|
|
275
290
|
|
|
276
291
|
# Delete the old best model file
|
|
277
292
|
if self.last_best_filepath and self.last_best_filepath.exists():
|
|
278
293
|
self.last_best_filepath.unlink()
|
|
279
294
|
|
|
280
295
|
# Update state
|
|
281
|
-
self.best = current
|
|
282
296
|
self.last_best_filepath = new_filepath
|
|
283
297
|
|
|
284
298
|
def _save_rolling_checkpoints(self, epoch, logs):
|
|
@@ -292,7 +306,19 @@ class ModelCheckpoint(Callback):
|
|
|
292
306
|
|
|
293
307
|
if self.verbose > 0:
|
|
294
308
|
_LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
|
|
295
|
-
|
|
309
|
+
|
|
310
|
+
# Create a comprehensive checkpoint dictionary
|
|
311
|
+
checkpoint_data = {
|
|
312
|
+
PyTorchCheckpointKeys.EPOCH: epoch,
|
|
313
|
+
PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
|
|
314
|
+
PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
|
|
315
|
+
PyTorchCheckpointKeys.BEST_SCORE: self.best, # Save the current best score
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
|
|
319
|
+
checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
|
|
320
|
+
|
|
321
|
+
torch.save(checkpoint_data, filepath)
|
|
296
322
|
|
|
297
323
|
self.saved_checkpoints.append(filepath)
|
|
298
324
|
|
|
@@ -309,19 +335,25 @@ class LRScheduler(Callback):
|
|
|
309
335
|
"""
|
|
310
336
|
Callback to manage a PyTorch learning rate scheduler.
|
|
311
337
|
"""
|
|
312
|
-
def __init__(self, scheduler, monitor: Optional[str] =
|
|
338
|
+
def __init__(self, scheduler, monitor: Optional[str] = PyTorchLogKeys.VAL_LOSS):
|
|
313
339
|
"""
|
|
314
340
|
This callback automatically calls the scheduler's `step()` method at the
|
|
315
341
|
end of each epoch. It also logs a message when the learning rate changes.
|
|
316
342
|
|
|
317
343
|
Args:
|
|
318
344
|
scheduler: An initialized PyTorch learning rate scheduler.
|
|
319
|
-
monitor (str
|
|
345
|
+
monitor (str): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
|
|
320
346
|
"""
|
|
321
347
|
super().__init__()
|
|
322
348
|
self.scheduler = scheduler
|
|
323
349
|
self.monitor = monitor
|
|
324
350
|
self.previous_lr = None
|
|
351
|
+
|
|
352
|
+
def set_trainer(self, trainer):
|
|
353
|
+
"""This is called by the Trainer to associate itself with the callback."""
|
|
354
|
+
super().set_trainer(trainer)
|
|
355
|
+
# Register the scheduler with the trainer so it can be added to the checkpoint
|
|
356
|
+
self.trainer.scheduler = self.scheduler # type: ignore
|
|
325
357
|
|
|
326
358
|
def on_train_begin(self, logs=None):
|
|
327
359
|
"""Store the initial learning rate."""
|