dragon-ml-toolbox 19.13.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.13.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1901
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/data_exploration/_plotting.py
@@ -0,0 +1,487 @@
+ import pandas as pd
+ import numpy as np
+ from typing import Optional, Union, Literal
+ from pathlib import Path
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from pandas.api.types import is_numeric_dtype, is_object_dtype
+
+ from ..path_manager import make_fullpath, sanitize_filename
+ from .._core import get_logger
+
+
+ _LOGGER = get_logger("Data Exploration: Plotting")
+
+
+ __all__ = [
+     "plot_value_distributions",
+     "plot_continuous_vs_target",
+     "plot_categorical_vs_target",
+     "plot_correlation_heatmap",
+ ]
+
+
+ def plot_value_distributions(
+     df: pd.DataFrame,
+     save_dir: Union[str, Path],
+     categorical_columns: Optional[list[str]] = None,
+     max_categories: int = 100,
+     fill_na_with: str = "MISSING DATA"
+ ):
+     """
+     Plots and saves the value distributions for all columns in a DataFrame,
+     using the best plot type for each column (histogram or count plot).
+
+     Plots are saved as SVG files under two subdirectories in `save_dir`:
+     - "Distribution_Continuous" for continuous numeric features (histograms).
+     - "Distribution_Categorical" for categorical features (count plots).
+
+     Args:
+         df (pd.DataFrame): The input DataFrame to analyze.
+         save_dir (str | Path): Directory path to save the plots.
+         categorical_columns (list[str] | None): If provided, these columns are treated as categorical and all other columns as continuous.
+         max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit are skipped.
+         fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
+
+     Notes:
+         - `seaborn.histplot` with KDE is used for continuous features.
+         - `seaborn.countplot` is used for categorical features.
+     """
+     # 1. Setup save directories
+     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
+     numeric_dir = base_save_path / "Distribution_Continuous"
+     categorical_dir = base_save_path / "Distribution_Categorical"
+     numeric_dir.mkdir(parents=True, exist_ok=True)
+     categorical_dir.mkdir(parents=True, exist_ok=True)
+
+     # 2. Filter columns to plot
+     columns_to_plot = df.columns.to_list()
+
+     # Setup for forced categorical logic
+     categorical_set = set(categorical_columns) if categorical_columns is not None else None
+
+     numeric_plots_saved = 0
+     categorical_plots_saved = 0
+
+     for col_name in columns_to_plot:
+         try:
+             is_numeric = is_numeric_dtype(df[col_name])
+             n_unique = df[col_name].nunique()
+
+             # --- 3. Determine Plot Type ---
+             is_continuous = False
+             if categorical_set is not None:
+                 # Use the explicit list
+                 if col_name not in categorical_set:
+                     is_continuous = True
+             else:
+                 # Use auto-detection
+                 if is_numeric:
+                     is_continuous = True
+
+             # --- Case 1: Continuous Numeric (Histogram) ---
+             if is_continuous:
+                 plt.figure(figsize=(10, 6))
+                 # Drop NaNs for the histogram, as they can't be plotted on a numeric axis
+                 sns.histplot(x=df[col_name].dropna(), kde=True, bins=30)
+                 plt.title(f"Distribution of '{col_name}' (Continuous)")
+                 plt.xlabel(col_name)
+                 plt.ylabel("Count")
+
+                 save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
+                 numeric_plots_saved += 1
+
+             # --- Case 2: Categorical (Count Plot) ---
+             else:
+                 # Check max categories
+                 if n_unique > max_categories:
+                     _LOGGER.warning(f"Skipping plot for '{col_name}': {n_unique} unique values > {max_categories} max_categories.")
+                     continue
+
+                 # Adaptive figure size
+                 fig_width = max(10, n_unique * 0.5)
+                 plt.figure(figsize=(fig_width, 8))
+
+                 # Make a temporary copy for plotting to handle NaNs
+                 temp_series = df[col_name].copy()
+
+                 # Handle NaNs by replacing them with the specified string
+                 if temp_series.isnull().any():
+                     # Convert to object type first to allow string replacement
+                     temp_series = temp_series.astype(object).fillna(fill_na_with)
+
+                 # Convert all values to string to be safe (handles low-cardinality numerics)
+                 temp_series = temp_series.astype(str)
+
+                 # Get category order by frequency
+                 order = temp_series.value_counts().index
+                 sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
+
+                 plt.title(f"Distribution of '{col_name}' (Categorical)")
+                 plt.xlabel(col_name)
+                 plt.ylabel("Count")
+
+                 # Smart tick rotation
+                 max_label_len = 0
+                 if n_unique > 0:
+                     max_label_len = max(len(str(s)) for s in order)
+
+                 # Rotate if labels are long OR there are many categories
+                 if max_label_len > 10 or n_unique > 25:
+                     plt.xticks(rotation=45, ha='right')
+
+                 save_path = categorical_dir / f"{sanitize_filename(col_name)}.svg"
+                 categorical_plots_saved += 1
+
+             # --- 4. Save Plot ---
+             plt.grid(True, linestyle='--', alpha=0.6, axis='y')
+             plt.tight_layout()
+             # Save as .svg
+             plt.savefig(save_path, format='svg', bbox_inches="tight")
+             plt.close()
+
+         except Exception as e:
+             _LOGGER.error(f"Failed to plot distribution for '{col_name}'. Error: {e}")
+             plt.close()
+
+     _LOGGER.info(f"Saved {numeric_plots_saved} continuous distribution plots to '{numeric_dir.name}'.")
+     _LOGGER.info(f"Saved {categorical_plots_saved} categorical distribution plots to '{categorical_dir.name}'.")
+
+
+ def plot_continuous_vs_target(
+     df_continuous: pd.DataFrame,
+     df_targets: pd.DataFrame,
+     save_dir: Union[str, Path],
+     verbose: int = 1
+ ):
+     """
+     Plots each continuous feature from df_continuous against each target in df_targets.
+
+     This function creates a scatter plot for each feature-target pair, overlays a
+     simple linear regression line, and saves each plot as an individual .svg file.
+
+     Plots are saved in a structured way, with a subdirectory created for
+     each target variable.
+
+     Args:
+         df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
+         df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
+         save_dir (str | Path): The base directory where plots will be saved.
+         verbose (int): Verbosity level for logging warnings.
+
+     Notes:
+         - Only numeric features and numeric targets are processed.
+         - Rows with NaN in either the feature or the target are dropped pairwise.
+         - Assumes df_continuous and df_targets share the same index.
+     """
+     # 1. Validate the base save directory
+     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+     # 2. Validation helper
+     def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> list[str]:
+         valid_cols = []
+         for col in df.columns:
+             if not is_numeric_dtype(df[col]):
+                 if verbose > 0:
+                     _LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
+             else:
+                 valid_cols.append(col)
+         return valid_cols
+
+     # 3. Validate target columns
+     valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
+     if not valid_targets:
+         _LOGGER.error("No valid numeric target columns provided in df_targets.")
+         return
+
+     # 4. Validate feature columns
+     valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
+     if not valid_features:
+         _LOGGER.error("No valid numeric feature columns provided in df_continuous.")
+         return
+
+     # 5. Main plotting loop
+     total_plots_saved = 0
+
+     for target_name in valid_targets:
+         # Create a sanitized subdirectory for this target
+         safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Continuous")
+         target_save_dir = base_save_path / safe_target_dir_name
+         target_save_dir.mkdir(parents=True, exist_ok=True)
+
+         if verbose > 0:
+             _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+
+         for feature_name in valid_features:
+
+             # Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
+             temp_df = pd.concat([
+                 df_continuous[feature_name],
+                 df_targets[target_name]
+             ], axis=1).dropna()
+
+             if temp_df.empty:
+                 if verbose > 1:
+                     _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
+                 continue
+
+             x = temp_df[feature_name]
+             y = temp_df[target_name]
+
+             # 6. Perform linear fit
+             try:
+                 # Modern replacement for np.polyfit + np.poly1d
+                 p = np.polynomial.Polynomial.fit(x, y, deg=1)
+                 plot_regression_line = True
+             except (np.linalg.LinAlgError, ValueError):
+                 if verbose > 0:
+                     _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
+                 plot_regression_line = False
+
+             # 7. Create the plot
+             plt.figure(figsize=(10, 6))
+             ax = plt.gca()
+
+             # Plot the raw data points
+             ax.plot(x, y, 'o', alpha=0.5, label='Data points', markersize=5)
+
+             # Plot the regression line
+             if plot_regression_line:
+                 ax.plot(x, p(x), "r--", label='Linear Fit')  # type: ignore
+
+             ax.set_title(f'{feature_name} vs {target_name}')
+             ax.set_xlabel(feature_name)
+             ax.set_ylabel(target_name)
+             ax.legend()
+             plt.grid(True, linestyle='--', alpha=0.6)
+             plt.tight_layout()
+
+             # 8. Save the plot
+             safe_feature_name = sanitize_filename(feature_name)
+             plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
+             plot_path = target_save_dir / plot_filename
+
+             try:
+                 plt.savefig(plot_path, bbox_inches="tight", format='svg')
+                 total_plots_saved += 1
+             except Exception as e:
+                 _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
+
+             # Close the figure to free up memory
+             plt.close()
+
+     if verbose > 0:
+         _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
+
+
+ def plot_categorical_vs_target(
+     df_categorical: pd.DataFrame,
+     df_targets: pd.DataFrame,
+     save_dir: Union[str, Path],
+     max_categories: int = 50,
+     fill_na_with: str = "MISSING DATA",
+     drop_empty_targets: bool = True,
+     verbose: int = 1
+ ):
+     """
+     Plots each feature in df_categorical against each numeric target in df_targets using box plots.
+
+     Automatically aligns the two DataFrames by index. If a numeric
+     column is passed within df_categorical, it will be cast to object type to treat it as a category.
+
+     Args:
+         df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
+         df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
+         save_dir (str | Path): Base directory for saving plots.
+         max_categories (int): The maximum number of unique categories a feature can have to be plotted.
+         fill_na_with (str): String to replace NaN values in categorical columns.
+         drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
+         verbose (int): Verbosity level for logging warnings.
+
+     Notes:
+         - Assumes df_categorical and df_targets share the same index.
+     """
+     # 1. Validate the base save directory
+     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+     # 2. Validate target columns (must be numeric)
+     valid_targets = []
+     for col in df_targets.columns:
+         if not is_numeric_dtype(df_targets[col]):
+             if verbose > 0:
+                 _LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
+         else:
+             valid_targets.append(col)
+
+     if not valid_targets:
+         _LOGGER.error("No valid numeric target columns provided in df_targets.")
+         return
+
+     # 3. Validate feature columns (Flexible: Allow numeric but warn)
+     valid_features = []
+     for col in df_categorical.columns:
+         # If numeric, warn but accept it (will be cast to object later)
+         if is_numeric_dtype(df_categorical[col]):
+             if verbose > 0:
+                 _LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
+             valid_features.append(col)
+         else:
+             # Assume it is already object/category
+             valid_features.append(col)
+
+     if not valid_features:
+         _LOGGER.error("No valid feature columns provided in df_categorical.")
+         return
+
+     # 4. Main plotting loop
+     total_plots_saved = 0
+
+     for target_name in valid_targets:
+         # Create a sanitized subdirectory for this target
+         safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
+         target_save_dir = base_save_path / safe_target_dir_name
+         target_save_dir.mkdir(parents=True, exist_ok=True)
+
+         if verbose > 0:
+             _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+
+         for feature_name in valid_features:
+
+             # Align data using concat to respect indices
+             feature_series = df_categorical[feature_name]
+             target_series = df_targets[target_name]
+
+             # Create a temporary DataFrame for this pair
+             temp_df = pd.concat([feature_series, target_series], axis=1)
+
+             # Optional: Drop rows where the target is NaN
+             if drop_empty_targets:
+                 temp_df = temp_df.dropna(subset=[target_name])
+                 if temp_df.empty:
+                     if verbose > 1:
+                         _LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
+                     continue
+
+             # Force feature to object if it isn't already (handling the numeric flexibility)
+             if not is_object_dtype(temp_df[feature_name]):
+                 temp_df[feature_name] = temp_df[feature_name].astype(object)
+
+             # Handle NaNs in the feature column (treat as a category)
+             if temp_df[feature_name].isnull().any():
+                 temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
+
+             # Convert to string to ensure consistent plotting and cardinality check
+             temp_df[feature_name] = temp_df[feature_name].astype(str)
+
+             # Check cardinality
+             n_unique = temp_df[feature_name].nunique()
+             if n_unique > max_categories:
+                 if verbose > 1:
+                     _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
+                 continue
+
+             # 5. Create the plot
+             # Dynamic figure width based on number of categories
+             plt.figure(figsize=(max(10, n_unique * 0.8), 10))
+
+             sns.boxplot(x=feature_name, y=target_name, data=temp_df)
+
+             plt.title(f'{target_name} vs {feature_name}')
+             plt.xlabel(feature_name)
+             plt.ylabel(target_name)
+             plt.xticks(rotation=45, ha='right')
+             plt.grid(True, linestyle='--', alpha=0.6, axis='y')
+             plt.tight_layout()
+
+             # 6. Save the plot
+             safe_feature_name = sanitize_filename(feature_name)
+             plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
+             plot_path = target_save_dir / plot_filename
+
+             try:
+                 plt.savefig(plot_path, bbox_inches="tight", format='svg')
+                 total_plots_saved += 1
+             except Exception as e:
+                 _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
+
+             plt.close()
+
+     if verbose > 0:
+         _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
+
+
+
+ def plot_correlation_heatmap(df: pd.DataFrame,
+                              plot_title: str,
+                              save_dir: Union[str, Path, None] = None,
+                              method: Literal["pearson", "kendall", "spearman"] = "pearson"):
+     """
+     Plots a heatmap of pairwise correlations between numeric features in a DataFrame.
+
+     Args:
+         df (pd.DataFrame): The input dataset.
+         plot_title (str): Base title for the plot; the suffix "{method} Correlation Heatmap" is appended automatically.
+         save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as an SVG file.
+         method (str): Correlation method to use. Must be one of:
+             - 'pearson' (default): measures linear correlation (assumes normally distributed data),
+             - 'kendall': rank correlation (non-parametric),
+             - 'spearman': monotonic relationship (non-parametric).
+
+     Notes:
+         - Only numeric columns are included.
+         - Annotations are disabled if there are more than 20 features.
+         - Missing values are handled via pairwise complete observations.
+     """
+     numeric_df = df.select_dtypes(include='number')
+     if numeric_df.empty:
+         _LOGGER.warning("No numeric columns found. Heatmap not generated.")
+         return
+     if method not in ["pearson", "kendall", "spearman"]:
+         _LOGGER.error("'method' must be 'pearson', 'kendall', or 'spearman'.")
+         raise ValueError()
+
+     corr = numeric_df.corr(method=method)
+
+     # Create a mask for the upper triangle
+     mask = np.triu(np.ones_like(corr, dtype=bool))
+
+     # Plot setup
+     size = max(10, numeric_df.shape[1])
+     plt.figure(figsize=(size, size * 0.8))
+
+     annot_bool = numeric_df.shape[1] <= 20
+     sns.heatmap(
+         corr,
+         mask=mask,
+         annot=annot_bool,
+         cmap='coolwarm',
+         fmt=".2f",
+         cbar_kws={"shrink": 0.8},
+         vmin=-1,  # Anchors minimum color to -1
+         vmax=1,   # Anchors maximum color to 1
+         center=0  # Ensures 0 corresponds to the neutral color (white)
+     )
+
+     # Add the method suffix to the title
+     full_plot_title = f"{plot_title} - {method.title()} Correlation Heatmap"
+
+     plt.title(full_plot_title)
+     plt.xticks(rotation=45, ha='right')
+     plt.yticks(rotation=0)
+
+     plt.tight_layout()
+
+     if save_dir:
+         save_path = make_fullpath(save_dir, make=True)
+         # Sanitize the plot title to build the file name
+         sanitized_plot_title = sanitize_filename(plot_title)
+         plot_filename = sanitized_plot_title + ".svg"
+
+         full_path = save_path / plot_filename
+
+         plt.savefig(full_path, bbox_inches="tight", format='svg')
+         _LOGGER.info(f"Saved correlation heatmap: '{plot_filename}'")
+
+     plt.show()
+     plt.close()
+
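The new plotting module above is self-contained. A minimal usage sketch, assuming ml_tools.data_exploration re-exports these functions through the reworked __init__.py listed in the file table (the DataFrame is illustrative):

    import pandas as pd
    from ml_tools.data_exploration import plot_value_distributions, plot_correlation_heatmap

    df = pd.DataFrame({
        "age": [25, 32, 47, 51, 62, 38],
        "income": [40.0, 52.5, 71.0, 68.0, 80.5, 59.0],
        "city": ["NY", "LA", "NY", "SF", None, "LA"],
    })

    # One SVG per column: histograms under "Distribution_Continuous",
    # count plots (NaN shown as its own category) under "Distribution_Categorical".
    plot_value_distributions(df, save_dir="plots", categorical_columns=["city"])

    # Lower-triangle Spearman heatmap of the numeric columns, saved under plots/.
    plot_correlation_heatmap(df, plot_title="Demo", save_dir="plots", method="spearman")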
ml_tools/data_exploration/_schema_ops.py
@@ -0,0 +1,176 @@
+ import pandas as pd
+ from typing import Optional
+
+ from ..schema import FeatureSchema
+
+ from .._core import get_logger
+
+
+ _LOGGER = get_logger("Data Exploration: Schema Ops")
+
+
+ def finalize_feature_schema(
+     df_features: pd.DataFrame,
+     categorical_mappings: Optional[dict[str, dict[str, int]]]
+ ) -> FeatureSchema:
+     """
+     Analyzes the final features DataFrame to create a definitive schema.
+
+     This function is the "single source of truth" for column order
+     and type (categorical vs. continuous) for the entire ML pipeline.
+
+     It should be called at the end of the feature engineering process.
+
+     Args:
+         df_features (pd.DataFrame):
+             The final, processed DataFrame containing *only* feature columns
+             in the exact order they will be fed to the model.
+         categorical_mappings (dict[str, dict[str, int]] | None):
+             The mappings dictionary generated by
+             `encode_categorical_features`. Can be None if no
+             categorical features exist.
+
+     Returns:
+         FeatureSchema: A NamedTuple containing all necessary metadata for the pipeline.
+     """
+     feature_names: list[str] = df_features.columns.to_list()
+
+     # Intermediate lists for building
+     continuous_feature_names_list: list[str] = []
+     categorical_feature_names_list: list[str] = []
+     categorical_index_map_dict: dict[int, int] = {}
+
+     # _LOGGER.info("Finalizing feature schema...")
+
+     if categorical_mappings:
+         # --- Categorical features are present ---
+         categorical_names_set = set(categorical_mappings.keys())
+
+         for index, name in enumerate(feature_names):
+             if name in categorical_names_set:
+                 # This is a categorical feature
+                 cardinality = len(categorical_mappings[name])
+                 categorical_index_map_dict[index] = cardinality
+                 categorical_feature_names_list.append(name)
+             else:
+                 # This is a continuous feature
+                 continuous_feature_names_list.append(name)
+
+         # Use the populated dict, or None if it's empty
+         final_index_map = categorical_index_map_dict if categorical_index_map_dict else None
+
+     else:
+         # --- No categorical features ---
+         _LOGGER.info("No categorical mappings provided. Treating all features as continuous.")
+         continuous_feature_names_list = list(feature_names)
+         # categorical_feature_names_list remains empty
+         # categorical_index_map_dict remains empty
+         final_index_map = None  # Explicitly set to None to match the Optional type
+
+     _LOGGER.info(f"Schema created: {len(continuous_feature_names_list)} continuous, {len(categorical_feature_names_list)} categorical.")
+
+     # Create the final immutable instance
+     schema_instance = FeatureSchema(
+         feature_names=tuple(feature_names),
+         continuous_feature_names=tuple(continuous_feature_names_list),
+         categorical_feature_names=tuple(categorical_feature_names_list),
+         categorical_index_map=final_index_map,
+         categorical_mappings=categorical_mappings
+     )
+
+     return schema_instance
+
+
+ def apply_feature_schema(
+     df: pd.DataFrame,
+     schema: FeatureSchema,
+     targets: Optional[list[str]] = None,
+     unknown_value: int = 99999,
+     verbose: bool = True
+ ) -> pd.DataFrame:
+     """
+     Aligns the input DataFrame with the provided FeatureSchema.
+
+     This function aligns data for inference/fine-tuning by enforcing the schema's
+     structure and encoding.
+
+     Args:
+         df (pd.DataFrame): The input DataFrame.
+         schema (FeatureSchema): The schema defining feature names, types, and mappings.
+         targets (list[str] | None): Optional list of target column names.
+         unknown_value (int): Integer value to assign to unknown categorical levels.
+             Defaults to 99999 to avoid collision with existing categories.
+         verbose (bool): If True, logs info about dropped extra columns.
+
+     Returns:
+         pd.DataFrame: A new DataFrame with the exact column order and encoding defined by the schema.
+
+     Raises:
+         ValueError: If any required feature or target column is missing.
+     """
+     # 1. Setup
+     df_processed = df.copy()
+     targets = targets if targets is not None else []
+
+     # 2. Validation: Strict Column Presence
+     missing_features = [col for col in schema.feature_names if col not in df_processed.columns]
+     if missing_features:
+         _LOGGER.error(f"Schema Mismatch: Missing required features: {missing_features}")
+         raise ValueError()
+
+     # Target columns should not be part of feature columns
+     if targets:
+         overlapping_columns = set(schema.feature_names).intersection(set(targets))
+         if overlapping_columns:
+             _LOGGER.error(f"Schema Mismatch: Target columns overlap with feature columns: {overlapping_columns}")
+             raise ValueError()
+
+         # Targets were provided, so check their presence
+         missing_targets = [col for col in targets if col not in df_processed.columns]
+         if missing_targets:
+             _LOGGER.error(f"Target Mismatch: Missing target columns: {missing_targets}")
+             raise ValueError()
+
+     # 3. Apply Categorical Encoding
+     if schema.categorical_feature_names and schema.categorical_mappings:
+         for col_name in schema.categorical_feature_names:
+             # Should never happen due to schema construction, but double-check and raise
+             if col_name not in schema.categorical_mappings:
+                 _LOGGER.error(f"Schema Inconsistency: No mapping found for categorical feature '{col_name}'.")
+                 raise ValueError()
+
+             mapping = schema.categorical_mappings[col_name]
+
+             # Apply mapping (unknowns become NaN)
+             df_processed[col_name] = df_processed[col_name].astype(str).map(mapping)
+
+             # Handle unknown categories
+             if df_processed[col_name].isnull().any():
+                 n_missing = df_processed[col_name].isnull().sum()
+                 _LOGGER.warning(f"Feature '{col_name}': Found {n_missing} unknown categories. Mapping to {unknown_value}.")
+
+                 # Fill unknowns with the specified integer
+                 df_processed[col_name] = df_processed[col_name].fillna(unknown_value)
+
+             df_processed[col_name] = df_processed[col_name].astype(int)
+
+     # 4. Reorder and Filter
+     final_column_order = list(schema.feature_names) + targets
+
+     extra_cols = set(df_processed.columns) - set(final_column_order)
+     if extra_cols:
+         _LOGGER.info(f"Dropping {len(extra_cols)} extra columns not present in schema.")
+         if verbose:
+             for extra_column in extra_cols:
+                 print(f"  - Dropping column: '{extra_column}'")
+
+     df_final = df_processed[final_column_order]
+
+     _LOGGER.info(f"Schema applied successfully. Final shape: {df_final.shape}")
+
+     # df_final should always be a DataFrame, but guard against a Series
+     if isinstance(df_final, pd.Series):
+         df_final = df_final.to_frame()
+
+     return df_final
+
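A sketch of the schema round-trip defined above, again assuming these helpers are re-exported from ml_tools.data_exploration (names, values, and mappings are illustrative):

    import pandas as pd
    from ml_tools.data_exploration import finalize_feature_schema, apply_feature_schema

    # Training-time features, already integer-encoded, in final model order.
    train_features = pd.DataFrame({"temperature": [20.5, 21.0, 19.8], "color": [0, 1, 0]})
    mappings = {"color": {"red": 0, "blue": 1}}

    schema = finalize_feature_schema(train_features, mappings)
    # schema.continuous_feature_names == ("temperature",)
    # schema.categorical_index_map == {1: 2}  (column index 1, cardinality 2)

    # Inference-time data: raw strings, an unseen category, and an extra column.
    new_data = pd.DataFrame({"noise": [1, 2], "color": ["blue", "green"], "temperature": [22.1, 18.9]})
    aligned = apply_feature_schema(new_data, schema)
    # Columns are reordered to ("temperature", "color"), "noise" is dropped, and the
    # unseen "green" is logged and encoded as the default unknown_value, 99999.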
ml_tools/ensemble_evaluation/__init__.py
@@ -1,14 +1,16 @@
- from ._core._ensemble_evaluation import (
+ from ._ensemble_evaluation import (
      evaluate_model_classification,
      plot_roc_curve,
      plot_precision_recall_curve,
      plot_calibration_curve,
      evaluate_model_regression,
      get_shap_values,
-     plot_learning_curves,
-     info
+     plot_learning_curves
  )

+ from ._imprimir import info
+
+
  __all__ = [
      "evaluate_model_classification",
      "plot_roc_curve",
@@ -17,4 +19,4 @@ __all__ = [
      "evaluate_model_regression",
      "get_shap_values",
      "plot_learning_curves"
- ]
+ ]
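This __init__.py change illustrates the pattern applied across every subpackage in 20.0.0: implementation modules move out of the shared _core package into per-package private modules, and the info helper now lives in each package's _imprimir module ("imprimir" is Spanish for "to print"). Import paths for callers are unchanged, e.g. (assuming info still, as before, prints an overview of the subpackage):

    from ml_tools.ensemble_evaluation import info
    info()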