dragon-ml-toolbox 20.0.0__py3-none-any.whl → 20.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 20.0.0
+ Version: 20.0.1
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,5 +1,5 @@
- dragon_ml_toolbox-20.0.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
- dragon_ml_toolbox-20.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
+ dragon_ml_toolbox-20.0.1.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+ dragon_ml_toolbox-20.0.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
  ml_tools/ETL_cleaning/__init__.py,sha256=TytE8RKmtW4KQlkaTxpYKlJAbCu-VAc82eDdHwVD3Jo,427
@@ -21,7 +21,7 @@ ml_tools/IO_tools/__init__.py,sha256=ZeEM5bbZ5udgRXFAL51uRXzoCzPLO8TWZ4AiME7NNy0
  ml_tools/IO_tools/_imprimir.py,sha256=eN-V60xtDNFINThuRTjXknMxtbK8Ah0MWgc8l2GTXMA,250
  ml_tools/MICE/_MICE_imputation.py,sha256=N1cDwVYfoHvIZz7FLLcW-guZUo8iFKedtkfS7CU6TVE,5318
  ml_tools/MICE/__init__.py,sha256=i5N_fd3rxpEgLsKKDoLbokW0rHm-ADEg8r3gBB5426E,313
- ml_tools/MICE/_dragon_mice.py,sha256=E6LyCe7JjEvDeKJfDfDd1iKJS86pDQLYGYoajahtuyg,17736
+ ml_tools/MICE/_dragon_mice.py,sha256=qEOy9Gx1QzVBvkvGR8790TkvKw8-fp06vCDGWM6j9os,17806
  ml_tools/MICE/_imprimir.py,sha256=YVhgZlUQ-NrDUVhHTK3u8s1QEbZ_jvDVF7-0FptVsxs,215
  ml_tools/ML_callbacks/__init__.py,sha256=dF37KXezy6P3VArhZbm5CI6si65GA-qVY70jvZFZYkA,427
  ml_tools/ML_callbacks/_base.py,sha256=xLVAFOhBHjqnf8a_wKgW1F-tn2u6EqV3IHXsXKTn2NE,3269
@@ -125,11 +125,11 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
  ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
  ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
  ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
- ml_tools/data_exploration/__init__.py,sha256=a4hlq6Pyc_cQjiys_2CUFd5nIvzqPc4g8asWEHJz9Es,1674
+ ml_tools/data_exploration/__init__.py,sha256=w9dM6wjmxfbEXQCWGFVL_cIuLHtYVP364aQvzRwfZXY,1674
  ml_tools/data_exploration/_analysis.py,sha256=H6LryV56FFCHWjvQdkhZbtprZy6aP8EqU_hC2Cf9CLE,7832
  ml_tools/data_exploration/_cleaning.py,sha256=LpoOHOB6HVtdObZExg-B8SxZW-JUc51tblnkCFDZxKg,20846
  ml_tools/data_exploration/_features.py,sha256=wW-M8n2aLIy05DR2z4fI8wjpPjn3mOAnm9aSGYbMKwI,23363
- ml_tools/data_exploration/_imprimir.py,sha256=PkvDvQkYTQC_KnfI1gxxUxtC-XeSRePniM1TyJj8Caw,876
+ ml_tools/data_exploration/_imprimir.py,sha256=0nXu60HpeJZ8s83mpVoRtdKILK3t8EHRFVk7d9vRVUo,876
  ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
  ml_tools/data_exploration/_schema_ops.py,sha256=PoFeHaS9dXI9gfL0SRD-8uSP4owqmbQFbtfA-HxkLnY,7108
  ml_tools/ensemble_evaluation/__init__.py,sha256=Xxx-F-_TvSVzMaocKXOo_tEXLibMJtf_YY85Ac3U0EI,483
@@ -162,7 +162,7 @@ ml_tools/plot_fonts/__init__.py,sha256=l-vSSpjZb6IeWjjgPTcNmEs7M-vbw0lqgEKD5jhtX
  ml_tools/plot_fonts/_imprimir.py,sha256=zNi6naa5eWBFfa_yV569MhUtSAL44H0xDjMcgrJSlXk,131
  ml_tools/plot_fonts/_plot_fonts.py,sha256=mfjXNT9P59ymHoTI85Q8CcvfxfK5BIFBWtTZH-hNIC4,2209
  ml_tools/schema/__init__.py,sha256=9LQtKz3OO9wm-1piUgAhCJZVZT-F-YSg5QLus9pxfgA,263
- ml_tools/schema/_feature_schema.py,sha256=QLsxBS3_CIJp4c4dknvMs7RHZl_GZDEBJQ0MxLrQo6Y,8536
+ ml_tools/schema/_feature_schema.py,sha256=ICymTIL05n1qs61TvyY7rapDOJ9PlaOHi0F86N4tNlU,8547
  ml_tools/schema/_gui_schema.py,sha256=IVwN4THAdFrvh2TpV4SFd_zlzMX3eioF-w-qcSVTndE,7245
  ml_tools/schema/_imprimir.py,sha256=waNHozZmkCKKNFWSw0HFf9489FkSXogl6KuT5cn5V74,190
  ml_tools/serde/__init__.py,sha256=Gj6B8Sgf0-ad72jFXq2W_k5pXOT2iNx5Dvzwrd7Tj1U,229
@@ -172,7 +172,7 @@ ml_tools/utilities/__init__.py,sha256=pkR2HxUIlKZMDderP2awYXVIFxkU2Xt3FkJmcmuRIp
  ml_tools/utilities/_imprimir.py,sha256=sV3ASBOsTdVYvGojOTIpZYFyrnd4panS5h_4HcMzob4,432
  ml_tools/utilities/_utility_save_load.py,sha256=7skiiuYGVLVMK_nU9uLfUZw16ePvF3i9ub7G7LMyUgs,16085
  ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
- dragon_ml_toolbox-20.0.0.dist-info/METADATA,sha256=ILeGioHn8qeLS5vaaqOs-zId8QvQxoWZcjKgHYmeuPo,7866
- dragon_ml_toolbox-20.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-20.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-20.0.0.dist-info/RECORD,,
+ dragon_ml_toolbox-20.0.1.dist-info/METADATA,sha256=ApSFj2vI7jdgUYtlYgjBpAXFQw9OKcd6em0ssSVZvGg,7866
+ dragon_ml_toolbox-20.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-20.0.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-20.0.1.dist-info/RECORD,,
@@ -197,7 +197,7 @@ class DragonMICE:
                  _LOGGER.error(f"Index mismatch in dataset {subname}")
                  raise ValueError()

-         _LOGGER.info("Schema-based MICE imputation complete.")
+         _LOGGER.info("⬅️ Schema-based MICE imputation complete.")

          return kernel, imputed_datasets, imputed_dataset_names

@@ -237,9 +237,6 @@ class DragonMICE:
              # We pass an empty DF as 'targets' to save_imputed_datasets to prevent duplication.
              df_input = df
              df_targets_to_save = pd.DataFrame(index=df.index)
-
-             # Monitor all columns that had NaNs
-             imputed_column_names = [col for col in df.columns if df[col].isna().any()]
          else:
              # Explicitly cast tuple to list for Pandas indexing
              feature_cols = list(self._schema.feature_names)
@@ -253,8 +250,9 @@ class DragonMICE:
              df_input = df[feature_cols]
              # Drop features to get targets (more robust than explicit selection if targets vary)
              df_targets_to_save = df.drop(columns=feature_cols)
-
-             imputed_column_names = _get_na_column_names(df=df_input) # type: ignore
+
+         # Monitor all columns that had NaNs
+         imputed_column_names = [col for col in df_input.columns if df_input[col].isna().any()]

          # Run core logic
          kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_input, df_name=df_name) # type: ignore
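To make the relocated NaN-monitoring logic above easy to read outside the diff, here is a minimal, self-contained sketch of the same pattern; the frame, column names, and values are illustrative and not part of the package:

```python
import numpy as np
import pandas as pd

# Illustrative frame: two feature columns and one target column.
df = pd.DataFrame({
    "feat_a": [1.0, np.nan, 3.0],
    "feat_b": [4.0, 5.0, np.nan],
    "target": [np.nan, 0.0, 1.0],
})
feature_cols = ["feat_a", "feat_b"]

# Impute only the feature block; keep the targets aside untouched.
df_input = df[feature_cols]
df_targets_to_save = df.drop(columns=feature_cols)

# Monitor NaNs on the feature block only, so target columns are never
# reported as imputed.
imputed_column_names = [col for col in df_input.columns if df_input[col].isna().any()]
print(imputed_column_names)  # ['feat_a', 'feat_b']
```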
@@ -316,35 +314,41 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name

      # iterate over each imputed dataset
      for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
-         #Check directory for current dataset
          dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
          local_save_dir = make_fullpath(input_path=root_path / dataset_file_dir, make=True)

-         for feature_name in column_names:
-             means_per_iteration = []
-             for iteration in range(iterations_cap):
-                 current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
-                 means_per_iteration.append(np.mean(current_imputed[feature_name])) # type: ignore
-
+         # 1. Pre-calculate means for all features across all iterations
+         # Structure: {feature_name: [mean_iter_0, mean_iter_1, ...]}
+         history = {col: [] for col in column_names}
+
+         for iteration in range(iterations_cap):
+             # Resolve dataset ONLY ONCE per iteration
+             current_imputed = kernel.complete_data(dataset=dataset_id, iteration=iteration)
+
+             for col in column_names:
+                 # Fast lookup
+                 val = np.mean(current_imputed[col])
+                 history[col].append(val)
+
+         # 2. Plotting loop
+         for feature_name, means_per_iteration in history.items():
              plt.figure(figsize=(10, 8))
              plt.plot(means_per_iteration, marker='o')
              plt.xlabel("Iteration", **label_font)
              plt.ylabel("Mean of Imputed Values", **label_font)
              plt.title(f"Mean Convergence for '{feature_name}'", **label_font)

-             # Adjust plot display for the X axis
              _ticks = np.arange(iterations_cap)
              _labels = np.arange(1, iterations_cap + 1)
-             plt.xticks(ticks=_ticks, labels=_labels) # type: ignore
+             plt.xticks(ticks=_ticks, labels=_labels)
              plt.grid(True)

-             feature_save_name = sanitize_filename(feature_name)
-             feature_save_name = feature_save_name + ".svg"
+             feature_save_name = sanitize_filename(feature_name) + ".svg"
              save_path = local_save_dir / feature_save_name
              plt.savefig(save_path, bbox_inches='tight', format="svg")
              plt.close()

-     _LOGGER.info(f"{dataset_file_dir} process completed.")
+     _LOGGER.info(f"📉 Convergence diagnostics complete.")


  # Imputed distributions
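The restructured loop above resolves each completed dataset once per iteration and reuses it for every monitored column, instead of calling kernel.complete_data() once per (feature, iteration) pair. A minimal sketch of that access pattern follows, using a stand-in for the expensive completion call; all names below are illustrative:

```python
import numpy as np
import pandas as pd

def complete_data(iteration: int) -> pd.DataFrame:
    # Stand-in for an expensive per-iteration materialization such as
    # kernel.complete_data(...); it just fabricates a frame for the demo.
    rng = np.random.default_rng(iteration)
    return pd.DataFrame(rng.normal(size=(50, 2)), columns=["x", "y"])

column_names = ["x", "y"]
iterations_cap = 5

# One completion per iteration, shared by every feature, instead of one
# completion per (feature, iteration) pair.
history: dict[str, list[float]] = {col: [] for col in column_names}
for iteration in range(iterations_cap):
    frame = complete_data(iteration)
    for col in column_names:
        history[col].append(float(frame[col].mean()))

for feature_name, means_per_iteration in history.items():
    print(feature_name, [round(m, 3) for m in means_per_iteration])
```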
@@ -431,5 +435,5 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
          fig = kernel.plot_imputed_distributions(variables=[feature])
          _process_figure(fig, feature)

-     _LOGGER.info(f"{local_dir_name} completed.")
+     _LOGGER.info(f"📊 Imputed distributions complete.")

@@ -53,13 +53,13 @@ __all__ = [
      "split_features_targets",
      "split_continuous_binary",
      "split_continuous_categorical_targets",
-     "encode_categorical_features",
      "clip_outliers_single",
      "clip_outliers_multi",
      "drop_outlier_samples",
      "plot_continuous_vs_target",
      "plot_categorical_vs_target",
      "plot_correlation_heatmap",
+     "encode_categorical_features",
      "finalize_feature_schema",
      "apply_feature_schema",
      "match_and_filter_columns_by_regex",
@@ -12,13 +12,13 @@ _GRUPOS = [
      "split_features_targets",
      "split_continuous_binary",
      "split_continuous_categorical_targets",
-     "encode_categorical_features",
      "clip_outliers_single",
      "clip_outliers_multi",
      "drop_outlier_samples",
      "plot_continuous_vs_target",
      "plot_categorical_vs_target",
      "plot_correlation_heatmap",
+     "encode_categorical_features",
      "finalize_feature_schema",
      "apply_feature_schema",
      "match_and_filter_columns_by_regex",
@@ -44,7 +44,7 @@ class FeatureSchema(NamedTuple):
          Handles conversion of Tuple->List and IntKeys->StrKeys automatically.
          """
          # validate path
-         dir_path = make_fullpath(directory, enforce="directory")
+         dir_path = make_fullpath(directory, make=True, enforce="directory")
          file_path = dir_path / SchemaKeys.SCHEMA_FILENAME

          try:
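The new make=True argument appears to ask make_fullpath to create the target directory when it does not exist yet, mirroring its other use earlier in this diff; that reading of the helper's semantics is an assumption. A rough pathlib-only sketch of that behavior, with a hypothetical filename:

```python
from pathlib import Path

def ensure_directory(directory: str | Path) -> Path:
    # Illustrative stand-in for make_fullpath(directory, make=True, enforce="directory"):
    # resolve the path and create the directory if it is missing.
    dir_path = Path(directory).expanduser().resolve()
    dir_path.mkdir(parents=True, exist_ok=True)  # raises if the path exists but is a file
    return dir_path

schema_dir = ensure_directory("outputs/schema")  # created on first use instead of erroring out
file_path = schema_dir / "feature_schema.json"   # hypothetical filename
```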