dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/_core/_logger.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  import sys
3
- from typing import Optional, Union, Dict, Any
3
+ from typing import Optional, Union, Any
4
4
 
5
5
  # Step 1: Conditionally import colorlog
6
6
  try:
@@ -27,7 +27,7 @@ class _UnifiedFormatter(logging.Formatter):
27
27
  A unified log formatter that adds emojis, uses level-specific formats,
28
28
  and applies colors if colorlog is available.
29
29
  """
30
- def __init__(self, datefmt: Optional[str] = None, log_colors: Optional[Dict[str, str]] = None):
30
+ def __init__(self, datefmt: Optional[str] = None, log_colors: Optional[dict[str, str]] = None):
31
31
  """Initializes the formatter, creating sub-formatters for each level."""
32
32
  # Initialize the base logging.Formatter correctly
33
33
  super().__init__(datefmt=datefmt)
@@ -60,7 +60,7 @@ class _ContextAdapter(logging.LoggerAdapter):
60
60
  """
61
61
  Wraps the logger to automatically prepend the context name to the message.
62
62
  """
63
- def process(self, msg: Any, kwargs: Dict[str, Any]) -> tuple[Any, Dict[str, Any]]:
63
+ def process(self, msg: Any, kwargs: dict[str, Any]) -> tuple[Any, dict[str, Any]]:
64
64
  # Retrieve the context name from the extra dict passed during init
65
65
  context = self.extra.get('context_name', 'Unknown') # type: ignore
66
66
  return f"[{context}] {msg}", kwargs
@@ -75,7 +75,7 @@ def _setup_main_logger(name: str = "ml_tools", level: int = logging.INFO) -> log
75
75
 
76
76
  # Prevents adding handlers multiple times if imported multiple times
77
77
  if not logger.handlers:
78
- formatter_kwargs: Dict[str, Any] = {
78
+ formatter_kwargs: dict[str, Any] = {
79
79
  'datefmt': '%Y-%m-%d %H:%M'
80
80
  }
81
81
 
@@ -121,26 +121,16 @@ def get_logger(name: Optional[str] = None) -> Union[logging.Logger, logging.Logg
121
121
  return _ROOT_LOGGER
122
122
 
123
123
 
124
- # Maintain backward compatibility for scripts importing _LOGGER directly
125
- _LOGGER = _ROOT_LOGGER
126
-
127
-
128
- def _log_and_exit(message: str, exit_code: int = 1):
129
- """Logs a critical message inside an exception block and terminates the program."""
130
- _LOGGER.exception(message)
131
- sys.exit(exit_code)
132
-
133
-
134
124
  if __name__ == "__main__":
135
- _LOGGER.info("Data loading process started.")
136
- _LOGGER.warning("A non-critical configuration value is missing.")
125
+ _ROOT_LOGGER.info("Data loading process started.")
126
+ _ROOT_LOGGER.warning("A non-critical configuration value is missing.")
137
127
 
138
128
  try:
139
129
  x = 1 / 0
140
130
  except ZeroDivisionError:
141
- _LOGGER.exception("Critical error during calculation.")
131
+ _ROOT_LOGGER.exception("Critical error during calculation.")
142
132
 
143
- _LOGGER.critical("Total failure.")
133
+ _ROOT_LOGGER.critical("Total failure.")
144
134
 
145
135
  test_logger = get_logger("SUPER CONTEXT")
146
136
 
@@ -0,0 +1,43 @@
from typing import Any, Optional


__all__ = ["prepare_schema_from_json"]


def prepare_schema_from_json(data: dict[str, Any]) -> dict[str, Any]:
    """
    Processes a raw dictionary (loaded from JSON) into the clean arguments
    required to instantiate a FeatureSchema.

    Performs the following restorations:
    1. Converts list fields back to tuples.
    2. Converts string keys in 'categorical_index_map' back to integers.

    Args:
        data (dict): The raw dictionary from a JSON file (e.g. from 'schema_dict').

    Returns:
        dict: A dictionary of kwargs ready to be unpacked into FeatureSchema(**kwargs).
    """
    # JSON has no tuple type, so sequence fields arrive as lists; restore tuples.
    def _as_tuple(key: str) -> tuple:
        return tuple(data.get(key, []))

    # JSON object keys are always strings; the index map needs int keys back.
    raw_index_map = data.get("categorical_index_map")
    restored_index_map: Optional[dict[int, int]] = (
        None
        if raw_index_map is None
        else {int(key): value for key, value in raw_index_map.items()}
    )

    # 'categorical_mappings' already uses string keys, so it passes through as-is.
    return {
        "feature_names": _as_tuple("feature_names"),
        "continuous_feature_names": _as_tuple("continuous_feature_names"),
        "categorical_feature_names": _as_tuple("categorical_feature_names"),
        "categorical_index_map": restored_index_map,
        "categorical_mappings": data.get("categorical_mappings", None),
    }
@@ -1,7 +1,7 @@
1
1
 
2
- def _script_info(all_data: list[str]):
2
+ def _imprimir_disponibles(all_data: list[str]):
3
3
  """
4
- List available names.
4
+ List available names in namespace.
5
5
  """
6
6
  print("Available functions and objects:")
7
7
  for i, name in enumerate(all_data, start=1):
@@ -1,54 +1,70 @@
1
- from ._core._data_exploration import (
1
+ from ._analysis import (
2
2
  summarize_dataframe,
3
+ show_null_columns,
4
+ match_and_filter_columns_by_regex,
5
+ )
6
+
7
+ from ._cleaning import (
3
8
  drop_constant_columns,
4
9
  drop_rows_with_missing_data,
5
- show_null_columns,
6
10
  drop_columns_with_missing_data,
7
11
  drop_macro,
8
12
  clean_column_names,
13
+ clip_outliers_single,
14
+ clip_outliers_multi,
15
+ drop_outlier_samples,
16
+ standardize_percentages,
17
+ )
18
+
19
+ from ._plotting import (
9
20
  plot_value_distributions,
10
21
  plot_continuous_vs_target,
11
22
  plot_categorical_vs_target,
12
- encode_categorical_features,
23
+ plot_correlation_heatmap,
24
+ )
25
+
26
+ from ._features import (
13
27
  split_features_targets,
14
28
  split_continuous_binary,
15
- clip_outliers_single,
16
- clip_outliers_multi,
17
- drop_outlier_samples,
18
- plot_correlation_heatmap,
19
- match_and_filter_columns_by_regex,
20
- standardize_percentages,
29
+ split_continuous_categorical_targets,
30
+ encode_categorical_features,
21
31
  reconstruct_one_hot,
22
32
  reconstruct_binary,
23
33
  reconstruct_multibinary,
34
+ )
35
+
36
+ from ._schema_ops import (
24
37
  finalize_feature_schema,
25
38
  apply_feature_schema,
26
- info
27
39
  )
28
40
 
41
+ from ._imprimir import info
42
+
43
+
29
44
  __all__ = [
30
45
  "summarize_dataframe",
46
+ "show_null_columns",
31
47
  "drop_constant_columns",
32
48
  "drop_rows_with_missing_data",
33
- "show_null_columns",
34
49
  "drop_columns_with_missing_data",
35
50
  "drop_macro",
36
51
  "clean_column_names",
37
- "plot_value_distributions",
38
- "plot_continuous_vs_target",
39
- "plot_categorical_vs_target",
52
+ "plot_value_distributions",
40
53
  "split_features_targets",
54
+ "split_continuous_binary",
55
+ "split_continuous_categorical_targets",
41
56
  "encode_categorical_features",
42
57
  "clip_outliers_single",
43
58
  "clip_outliers_multi",
44
59
  "drop_outlier_samples",
60
+ "plot_continuous_vs_target",
61
+ "plot_categorical_vs_target",
45
62
  "plot_correlation_heatmap",
46
63
  "finalize_feature_schema",
64
+ "apply_feature_schema",
47
65
  "match_and_filter_columns_by_regex",
48
66
  "standardize_percentages",
49
67
  "reconstruct_one_hot",
50
68
  "reconstruct_binary",
51
69
  "reconstruct_multibinary",
52
- "split_continuous_binary",
53
- "apply_feature_schema",
54
70
  ]
@@ -0,0 +1,214 @@
1
+ import pandas as pd
2
+ from typing import Optional, Union
3
+ from pathlib import Path
4
+ import numpy as np
5
+ import re
6
+ import matplotlib.pyplot as plt
7
+
8
+ from ..path_manager import make_fullpath, sanitize_filename
9
+ from .._core import get_logger
10
+
11
+
12
+ _LOGGER = get_logger("Data Exploration: Analysis")
13
+
14
+
15
+ __all__ = [
16
+ "summarize_dataframe",
17
+ "show_null_columns",
18
+ "match_and_filter_columns_by_regex",
19
+ ]
20
+
21
+
def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
    """
    Returns a summary DataFrame with the data type, completeness percentage,
    and number of unique values for each column, plus basic descriptive
    statistics for numeric columns.

    Also prints the shape of the input DataFrame as a side effect.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        round_digits (int): Decimal places to round numerical statistics.

    Returns:
        pd.DataFrame: Summary table indexed by column name.
    """
    summary = pd.DataFrame({
        'Data Type': df.dtypes,
        'Completeness %': (df.notnull().mean() * 100).round(2),
        'Unique Values': df.nunique(),
    })

    # For numeric columns, append descriptive statistics at selected percentiles.
    numeric_cols = df.select_dtypes(include='number').columns
    if not numeric_cols.empty:
        stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])

        summary_numeric = stats.T[
            ['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
        ].round(round_digits)
        # Left join keeps non-numeric columns (their statistic cells become NaN).
        summary = summary.join(summary_numeric, how='left')

    print(f"DataFrame Shape: {df.shape}")
    return summary
def show_null_columns(
    df: pd.DataFrame,
    round_digits: int = 2,
    plot_to_dir: Optional[Union[str, Path]] = None,
    plot_filename: Optional[str] = None,
    use_all_columns: bool = False
) -> pd.DataFrame:
    """
    Returns a table of columns with missing values, showing both the count and
    percentage of missing entries per column.

    Optionally generates a stacked-bar visualization of the missing data profile.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        round_digits (int): Number of decimal places for the percentage.
        plot_to_dir (str | Path | None): If provided, saves a visualization of the
            missing data to this directory.
        plot_filename (str | None): The filename for the saved plot (without extension).
            Used only if `plot_to_dir` is set.
        use_all_columns (bool): If True, includes all columns in the summary and plot,
            even those with no missing values.

    Returns:
        pd.DataFrame: A DataFrame summarizing missing values in each column,
        sorted by descending 'Missing %'.
    """
    null_counts = df.isnull().sum()
    null_percent = df.isnull().mean() * 100

    if not use_all_columns:
        # Keep only columns with at least one null
        mask = null_counts > 0
        null_counts = null_counts[mask]
        null_percent = null_percent[mask]

    null_summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': null_percent.round(round_digits)
    })

    # Sort by descending percentage of missing values
    null_summary = null_summary.sort_values(by='Missing %', ascending=False)

    if plot_to_dir:
        if null_summary.empty:
            _LOGGER.info("No missing data found. Skipping plot generation.")
        else:
            _plot_missing_data(null_summary, plot_to_dir, plot_filename)

    return null_summary


def _plot_missing_data(
    null_summary: pd.DataFrame,
    plot_to_dir: Union[str, Path],
    plot_filename: Optional[str]
) -> None:
    """
    Render and save the stacked 'Present vs Missing' bar chart as an SVG.

    Best-effort: any failure is logged and swallowed so the caller still
    receives its summary table.
    """
    try:
        # Validate and create save directory
        save_path = make_fullpath(plot_to_dir, make=True, enforce="directory")

        # Prepare data
        features = null_summary.index.tolist()
        missing_pct = np.array(null_summary['Missing %'].values)
        present_pct = 100 - missing_pct

        # Dynamic width so long feature lists stay legible
        width = max(10, len(features) * 0.4)
        plt.figure(figsize=(width, 8))

        # Grid behind bars
        plt.grid(axis='y', linestyle='--', alpha=0.5, zorder=0)

        # 1. Present Data: Solid Green
        plt.bar(
            features,
            present_pct,
            color='tab:green',
            label='Present',
            width=0.6,
            zorder=3
        )

        # 2. Missing Data: near-transparent fill + solid red hatch lines.
        # facecolor carries the alpha; edgecolor stays opaque for the hatch.
        plt.bar(
            features,
            missing_pct,
            bottom=present_pct,
            facecolor=(1.0, 1.0, 1.0, 0.2),  # RGBA
            edgecolor='tab:red',
            hatch='///',
            linewidth=0.4,
            label='Missing',
            width=0.6,
            zorder=3
        )

        # Styling
        plt.ylim(0, 100)
        plt.ylabel("Data Completeness (%)", fontsize=13)
        plt.yticks(np.arange(0, 101, 10))
        plot_title = f"Missing Data - {plot_filename.replace('_', ' ')}" if plot_filename else "Missing Data"
        plt.title(plot_title)
        plt.xticks(rotation=45, ha='right', fontsize=9)

        # Reference line at the 100% mark
        plt.axhline(y=100, color='black', linestyle='-', linewidth=0.5, alpha=0.3)

        plt.legend(loc='lower right', framealpha=0.95)
        plt.tight_layout()

        # Derive the output filename
        if plot_filename is None or plot_filename.strip() == "":
            plot_filename = "Missing_Data_Profile"
        else:
            plot_filename = "Missing_Data_" + sanitize_filename(plot_filename)

        full_filename = plot_filename + ".svg"
        plt.savefig(save_path / full_filename, format='svg', bbox_inches="tight")
        plt.close()

        _LOGGER.info(f"Saved missing data plot as '{full_filename}'")

    except Exception as e:
        _LOGGER.error(f"Failed to generate missing data plot. Error: {e}")
        plt.close()
def match_and_filter_columns_by_regex(
    df: pd.DataFrame,
    pattern: str,
    case_sensitive: bool = False,
    escape_pattern: bool = False
) -> tuple[pd.DataFrame, list[str]]:
    """
    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.

    Parameters:
        df (pd.DataFrame): The DataFrame to search.
        pattern (str): The regex pattern to match column names (use a raw string).
        case_sensitive (bool): Whether matching is case-sensitive.
        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.

    Returns:
        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
    """
    # Escape first so a literal pattern never acts as a regex.
    effective_pattern = re.escape(pattern) if escape_pattern else pattern

    column_mask = df.columns.str.contains(effective_pattern, case=case_sensitive, regex=True)
    matched_columns = df.columns[column_mask].to_list()
    filtered_df = df.loc[:, column_mask]

    _LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{effective_pattern}'.")

    # Boolean-mask selection should always yield a DataFrame; coerce defensively.
    if isinstance(filtered_df, pd.Series):
        filtered_df = filtered_df.to_frame()

    return filtered_df, matched_columns