dragon-ml-toolbox 20.0.0__py3-none-any.whl → 20.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-20.0.0.dist-info → dragon_ml_toolbox-20.0.1.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-20.0.0.dist-info → dragon_ml_toolbox-20.0.1.dist-info}/RECORD +10 -10
- ml_tools/MICE/_dragon_mice.py +23 -19
- ml_tools/data_exploration/__init__.py +1 -1
- ml_tools/data_exploration/_imprimir.py +1 -1
- ml_tools/schema/_feature_schema.py +1 -1
- {dragon_ml_toolbox-20.0.0.dist-info → dragon_ml_toolbox-20.0.1.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-20.0.0.dist-info → dragon_ml_toolbox-20.0.1.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-20.0.0.dist-info → dragon_ml_toolbox-20.0.1.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-20.0.0.dist-info → dragon_ml_toolbox-20.0.1.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
dragon_ml_toolbox-20.0.
|
|
2
|
-
dragon_ml_toolbox-20.0.
|
|
1
|
+
dragon_ml_toolbox-20.0.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-20.0.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
|
|
3
3
|
ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
|
|
5
5
|
ml_tools/ETL_cleaning/__init__.py,sha256=TytE8RKmtW4KQlkaTxpYKlJAbCu-VAc82eDdHwVD3Jo,427
|
|
@@ -21,7 +21,7 @@ ml_tools/IO_tools/__init__.py,sha256=ZeEM5bbZ5udgRXFAL51uRXzoCzPLO8TWZ4AiME7NNy0
|
|
|
21
21
|
ml_tools/IO_tools/_imprimir.py,sha256=eN-V60xtDNFINThuRTjXknMxtbK8Ah0MWgc8l2GTXMA,250
|
|
22
22
|
ml_tools/MICE/_MICE_imputation.py,sha256=N1cDwVYfoHvIZz7FLLcW-guZUo8iFKedtkfS7CU6TVE,5318
|
|
23
23
|
ml_tools/MICE/__init__.py,sha256=i5N_fd3rxpEgLsKKDoLbokW0rHm-ADEg8r3gBB5426E,313
|
|
24
|
-
ml_tools/MICE/_dragon_mice.py,sha256=
|
|
24
|
+
ml_tools/MICE/_dragon_mice.py,sha256=qEOy9Gx1QzVBvkvGR8790TkvKw8-fp06vCDGWM6j9os,17806
|
|
25
25
|
ml_tools/MICE/_imprimir.py,sha256=YVhgZlUQ-NrDUVhHTK3u8s1QEbZ_jvDVF7-0FptVsxs,215
|
|
26
26
|
ml_tools/ML_callbacks/__init__.py,sha256=dF37KXezy6P3VArhZbm5CI6si65GA-qVY70jvZFZYkA,427
|
|
27
27
|
ml_tools/ML_callbacks/_base.py,sha256=xLVAFOhBHjqnf8a_wKgW1F-tn2u6EqV3IHXsXKTn2NE,3269
|
|
@@ -125,11 +125,11 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
|
|
|
125
125
|
ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
|
|
126
126
|
ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
|
|
127
127
|
ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
|
|
128
|
-
ml_tools/data_exploration/__init__.py,sha256=
|
|
128
|
+
ml_tools/data_exploration/__init__.py,sha256=w9dM6wjmxfbEXQCWGFVL_cIuLHtYVP364aQvzRwfZXY,1674
|
|
129
129
|
ml_tools/data_exploration/_analysis.py,sha256=H6LryV56FFCHWjvQdkhZbtprZy6aP8EqU_hC2Cf9CLE,7832
|
|
130
130
|
ml_tools/data_exploration/_cleaning.py,sha256=LpoOHOB6HVtdObZExg-B8SxZW-JUc51tblnkCFDZxKg,20846
|
|
131
131
|
ml_tools/data_exploration/_features.py,sha256=wW-M8n2aLIy05DR2z4fI8wjpPjn3mOAnm9aSGYbMKwI,23363
|
|
132
|
-
ml_tools/data_exploration/_imprimir.py,sha256=
|
|
132
|
+
ml_tools/data_exploration/_imprimir.py,sha256=0nXu60HpeJZ8s83mpVoRtdKILK3t8EHRFVk7d9vRVUo,876
|
|
133
133
|
ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
|
|
134
134
|
ml_tools/data_exploration/_schema_ops.py,sha256=PoFeHaS9dXI9gfL0SRD-8uSP4owqmbQFbtfA-HxkLnY,7108
|
|
135
135
|
ml_tools/ensemble_evaluation/__init__.py,sha256=Xxx-F-_TvSVzMaocKXOo_tEXLibMJtf_YY85Ac3U0EI,483
|
|
@@ -162,7 +162,7 @@ ml_tools/plot_fonts/__init__.py,sha256=l-vSSpjZb6IeWjjgPTcNmEs7M-vbw0lqgEKD5jhtX
|
|
|
162
162
|
ml_tools/plot_fonts/_imprimir.py,sha256=zNi6naa5eWBFfa_yV569MhUtSAL44H0xDjMcgrJSlXk,131
|
|
163
163
|
ml_tools/plot_fonts/_plot_fonts.py,sha256=mfjXNT9P59ymHoTI85Q8CcvfxfK5BIFBWtTZH-hNIC4,2209
|
|
164
164
|
ml_tools/schema/__init__.py,sha256=9LQtKz3OO9wm-1piUgAhCJZVZT-F-YSg5QLus9pxfgA,263
|
|
165
|
-
ml_tools/schema/_feature_schema.py,sha256=
|
|
165
|
+
ml_tools/schema/_feature_schema.py,sha256=ICymTIL05n1qs61TvyY7rapDOJ9PlaOHi0F86N4tNlU,8547
|
|
166
166
|
ml_tools/schema/_gui_schema.py,sha256=IVwN4THAdFrvh2TpV4SFd_zlzMX3eioF-w-qcSVTndE,7245
|
|
167
167
|
ml_tools/schema/_imprimir.py,sha256=waNHozZmkCKKNFWSw0HFf9489FkSXogl6KuT5cn5V74,190
|
|
168
168
|
ml_tools/serde/__init__.py,sha256=Gj6B8Sgf0-ad72jFXq2W_k5pXOT2iNx5Dvzwrd7Tj1U,229
|
|
@@ -172,7 +172,7 @@ ml_tools/utilities/__init__.py,sha256=pkR2HxUIlKZMDderP2awYXVIFxkU2Xt3FkJmcmuRIp
|
|
|
172
172
|
ml_tools/utilities/_imprimir.py,sha256=sV3ASBOsTdVYvGojOTIpZYFyrnd4panS5h_4HcMzob4,432
|
|
173
173
|
ml_tools/utilities/_utility_save_load.py,sha256=7skiiuYGVLVMK_nU9uLfUZw16ePvF3i9ub7G7LMyUgs,16085
|
|
174
174
|
ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
|
|
175
|
-
dragon_ml_toolbox-20.0.
|
|
176
|
-
dragon_ml_toolbox-20.0.
|
|
177
|
-
dragon_ml_toolbox-20.0.
|
|
178
|
-
dragon_ml_toolbox-20.0.
|
|
175
|
+
dragon_ml_toolbox-20.0.1.dist-info/METADATA,sha256=ApSFj2vI7jdgUYtlYgjBpAXFQw9OKcd6em0ssSVZvGg,7866
|
|
176
|
+
dragon_ml_toolbox-20.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
177
|
+
dragon_ml_toolbox-20.0.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
178
|
+
dragon_ml_toolbox-20.0.1.dist-info/RECORD,,
|
ml_tools/MICE/_dragon_mice.py
CHANGED
|
@@ -197,7 +197,7 @@ class DragonMICE:
|
|
|
197
197
|
_LOGGER.error(f"Index mismatch in dataset {subname}")
|
|
198
198
|
raise ValueError()
|
|
199
199
|
|
|
200
|
-
_LOGGER.info("Schema-based MICE imputation complete.")
|
|
200
|
+
_LOGGER.info("⬅️ Schema-based MICE imputation complete.")
|
|
201
201
|
|
|
202
202
|
return kernel, imputed_datasets, imputed_dataset_names
|
|
203
203
|
|
|
@@ -237,9 +237,6 @@ class DragonMICE:
|
|
|
237
237
|
# We pass an empty DF as 'targets' to save_imputed_datasets to prevent duplication.
|
|
238
238
|
df_input = df
|
|
239
239
|
df_targets_to_save = pd.DataFrame(index=df.index)
|
|
240
|
-
|
|
241
|
-
# Monitor all columns that had NaNs
|
|
242
|
-
imputed_column_names = [col for col in df.columns if df[col].isna().any()]
|
|
243
240
|
else:
|
|
244
241
|
# Explicitly cast tuple to list for Pandas indexing
|
|
245
242
|
feature_cols = list(self._schema.feature_names)
|
|
@@ -253,8 +250,9 @@ class DragonMICE:
|
|
|
253
250
|
df_input = df[feature_cols]
|
|
254
251
|
# Drop features to get targets (more robust than explicit selection if targets vary)
|
|
255
252
|
df_targets_to_save = df.drop(columns=feature_cols)
|
|
256
|
-
|
|
257
|
-
|
|
253
|
+
|
|
254
|
+
# Monitor all columns that had NaNs
|
|
255
|
+
imputed_column_names = [col for col in df_input.columns if df_input[col].isna().any()]
|
|
258
256
|
|
|
259
257
|
# Run core logic
|
|
260
258
|
kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_input, df_name=df_name) # type: ignore
|
|
@@ -316,35 +314,41 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
|
|
|
316
314
|
|
|
317
315
|
# iterate over each imputed dataset
|
|
318
316
|
for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
|
|
319
|
-
#Check directory for current dataset
|
|
320
317
|
dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
|
|
321
318
|
local_save_dir = make_fullpath(input_path=root_path / dataset_file_dir, make=True)
|
|
322
319
|
|
|
323
|
-
for
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
320
|
+
# 1. Pre-calculate means for all features across all iterations
|
|
321
|
+
# Structure: {feature_name: [mean_iter_0, mean_iter_1, ...]}
|
|
322
|
+
history = {col: [] for col in column_names}
|
|
323
|
+
|
|
324
|
+
for iteration in range(iterations_cap):
|
|
325
|
+
# Resolve dataset ONLY ONCE per iteration
|
|
326
|
+
current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
|
|
327
|
+
|
|
328
|
+
for col in column_names:
|
|
329
|
+
# Fast lookup
|
|
330
|
+
val = np.mean(current_imputed[col])
|
|
331
|
+
history[col].append(val)
|
|
332
|
+
|
|
333
|
+
# 2. Plotting loop
|
|
334
|
+
for feature_name, means_per_iteration in history.items():
|
|
329
335
|
plt.figure(figsize=(10, 8))
|
|
330
336
|
plt.plot(means_per_iteration, marker='o')
|
|
331
337
|
plt.xlabel("Iteration", **label_font)
|
|
332
338
|
plt.ylabel("Mean of Imputed Values", **label_font)
|
|
333
339
|
plt.title(f"Mean Convergence for '{feature_name}'", **label_font)
|
|
334
340
|
|
|
335
|
-
# Adjust plot display for the X axis
|
|
336
341
|
_ticks = np.arange(iterations_cap)
|
|
337
342
|
_labels = np.arange(1, iterations_cap + 1)
|
|
338
|
-
plt.xticks(ticks=_ticks, labels=_labels)
|
|
343
|
+
plt.xticks(ticks=_ticks, labels=_labels)
|
|
339
344
|
plt.grid(True)
|
|
340
345
|
|
|
341
|
-
feature_save_name = sanitize_filename(feature_name)
|
|
342
|
-
feature_save_name = feature_save_name + ".svg"
|
|
346
|
+
feature_save_name = sanitize_filename(feature_name) + ".svg"
|
|
343
347
|
save_path = local_save_dir / feature_save_name
|
|
344
348
|
plt.savefig(save_path, bbox_inches='tight', format="svg")
|
|
345
349
|
plt.close()
|
|
346
350
|
|
|
347
|
-
|
|
351
|
+
_LOGGER.info(f"📉 Convergence diagnostics complete.")
|
|
348
352
|
|
|
349
353
|
|
|
350
354
|
# Imputed distributions
|
|
@@ -431,5 +435,5 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
|
|
|
431
435
|
fig = kernel.plot_imputed_distributions(variables=[feature])
|
|
432
436
|
_process_figure(fig, feature)
|
|
433
437
|
|
|
434
|
-
_LOGGER.info(f"
|
|
438
|
+
_LOGGER.info(f"📊 Imputed distributions complete.")
|
|
435
439
|
|
|
@@ -53,13 +53,13 @@ __all__ = [
|
|
|
53
53
|
"split_features_targets",
|
|
54
54
|
"split_continuous_binary",
|
|
55
55
|
"split_continuous_categorical_targets",
|
|
56
|
-
"encode_categorical_features",
|
|
57
56
|
"clip_outliers_single",
|
|
58
57
|
"clip_outliers_multi",
|
|
59
58
|
"drop_outlier_samples",
|
|
60
59
|
"plot_continuous_vs_target",
|
|
61
60
|
"plot_categorical_vs_target",
|
|
62
61
|
"plot_correlation_heatmap",
|
|
62
|
+
"encode_categorical_features",
|
|
63
63
|
"finalize_feature_schema",
|
|
64
64
|
"apply_feature_schema",
|
|
65
65
|
"match_and_filter_columns_by_regex",
|
|
@@ -12,13 +12,13 @@ _GRUPOS = [
|
|
|
12
12
|
"split_features_targets",
|
|
13
13
|
"split_continuous_binary",
|
|
14
14
|
"split_continuous_categorical_targets",
|
|
15
|
-
"encode_categorical_features",
|
|
16
15
|
"clip_outliers_single",
|
|
17
16
|
"clip_outliers_multi",
|
|
18
17
|
"drop_outlier_samples",
|
|
19
18
|
"plot_continuous_vs_target",
|
|
20
19
|
"plot_categorical_vs_target",
|
|
21
20
|
"plot_correlation_heatmap",
|
|
21
|
+
"encode_categorical_features",
|
|
22
22
|
"finalize_feature_schema",
|
|
23
23
|
"apply_feature_schema",
|
|
24
24
|
"match_and_filter_columns_by_regex",
|
|
@@ -44,7 +44,7 @@ class FeatureSchema(NamedTuple):
|
|
|
44
44
|
Handles conversion of Tuple->List and IntKeys->StrKeys automatically.
|
|
45
45
|
"""
|
|
46
46
|
# validate path
|
|
47
|
-
dir_path = make_fullpath(directory, enforce="directory")
|
|
47
|
+
dir_path = make_fullpath(directory, make=True, enforce="directory")
|
|
48
48
|
file_path = dir_path / SchemaKeys.SCHEMA_FILENAME
|
|
49
49
|
|
|
50
50
|
try:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|