dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
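The headline change in 20.0.0 is structural: each flat `ml_tools/X.py` module becomes a `ml_tools/X/` package whose `__init__.py` is backed by private `_*.py` submodules (each package also gains an `_imprimir.py`), and most of the old `_core/_*.py` implementations are deleted or relocated. A minimal sketch of what this likely means for callers, assuming the new `__init__.py` files re-export the same public names as the old flat modules (the diff excerpt below does not confirm this for every module):

    # Hedged sketch: if ml_tools/optimization_tools/__init__.py re-exports the
    # public API of the old flat module, caller code is unchanged:
    from ml_tools.optimization_tools import create_optimization_bounds

    # Only code reaching into the removed private core would break:
    # from ml_tools._core._optimization_tools import ...  # gone in 20.0.0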
ml_tools/_core/_optimization_tools.py (deleted)
@@ -1,493 +0,0 @@
- import matplotlib.pyplot as plt
- import seaborn as sns
- from typing import Union, Any, Literal, Optional, Dict, List, Tuple
- from pathlib import Path
- import pandas as pd
-
- from ._path_manager import make_fullpath, list_csv_paths, sanitize_filename
- from ._utilities import yield_dataframes_from_dir
- from ._logger import get_logger
- from ._script_info import _script_info
- from ._SQL import DragonSQL
- from ._IO_tools import save_json, load_json
- from ._schema import FeatureSchema
- from ._keys import OptimizationToolsKeys
-
-
- _LOGGER = get_logger("Optimization Tools")
-
-
- __all__ = [
-     "make_continuous_bounds_template",
-     "load_continuous_bounds_template",
-     "create_optimization_bounds",
-     "parse_lower_upper_bounds",
-     "plot_optimal_feature_distributions",
-     "plot_optimal_feature_distributions_from_dataframe",
- ]
-
-
- def make_continuous_bounds_template(
-     directory: Union[str, Path],
-     feature_schema: FeatureSchema,
-     default_bounds: Tuple[float, float] = (0, 1)
- ) -> None:
-     """
-     Creates a JSON template for manual entry of continuous feature optimization bounds.
-
-     The resulting file maps each continuous feature name to a [min, max] list
-     populated with `default_bounds`. Edit the values in this file before using.
-
-     Args:
-         directory (str | Path): The directory where the template will be saved.
-         feature_schema (FeatureSchema): The loaded schema containing feature definitions.
-         default_bounds (Tuple[float, float]): Default (min, max) values to populate the template.
-     """
-     # validate directory path
-     dir_path = make_fullpath(directory, make=True, enforce="directory")
-
-     # 1. Check if continuous features exist
-     if not feature_schema.continuous_feature_names:
-         _LOGGER.warning("No continuous features found in FeatureSchema. Skipping bounds template generation.")
-         return
-
-     # 2. Construct the dictionary: {feature_name: [min, max]}
-     bounds_map = {
-         name: list(default_bounds)
-         for name in feature_schema.continuous_feature_names
-     }
-
-     # use a fixed key for the filename
-     filename = OptimizationToolsKeys.OPTIMIZATION_BOUNDS_FILENAME + ".json"
-
-     # 3. Save to JSON using the IO tool
-     save_json(
-         data=bounds_map,
-         directory=dir_path,
-         filename=filename,
-         verbose=False
-     )
-
-     _LOGGER.info(f"💾 Continuous bounds template saved to: '{dir_path.name}/{filename}'")
-
-
- def load_continuous_bounds_template(directory: Union[str, Path]) -> Dict[str, List[float]]:
-     """
-     Loads the continuous feature bounds template from JSON. Expected filename: `optimization_bounds.json`.
-
-     Args:
-         directory (str | Path): The directory where the template is located.
-
-     Returns:
-         Dictionary (Dict[str, List[float]]): A dictionary mapping feature names to [min, max] bounds.
-     """
-     dir_path = make_fullpath(directory, enforce="directory")
-     full_path = dir_path / (OptimizationToolsKeys.OPTIMIZATION_BOUNDS_FILENAME + ".json")
-
-     bounds_map = load_json(
-         file_path=full_path,
-         expected_type='dict',
-         verbose=False
-     )
-
-     # validate loaded data
-     if not all(
-         isinstance(v, list) and  # Check type
-         len(v) == 2 and  # Check length
-         all(isinstance(i, (int, float)) for i in v)  # Check contents are numbers
-         for v in bounds_map.values()
-     ):
-         _LOGGER.error(f"Invalid format in bounds template at '{full_path}'. Each value must be a list of [min, max].")
-         raise ValueError()
-
-     _LOGGER.info(f"Continuous bounds template loaded from: '{dir_path.name}'")
-
-     return bounds_map
-
-
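A minimal round-trip sketch of the two template helpers above; the `schema` object is assumed to be an already-built FeatureSchema with continuous features, and the directory name and edited values are illustrative:

    # Assumes: `schema` is a FeatureSchema whose continuous_feature_names is non-empty.
    make_continuous_bounds_template("artifacts", schema, default_bounds=(0.0, 1.0))

    # Hand-edit artifacts/optimization_bounds.json before loading, e.g.:
    # {"temperature": [20.0, 95.0], "pressure": [1.0, 3.5]}

    bounds_map = load_continuous_bounds_template("artifacts")  # validated [min, max] lists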
- def create_optimization_bounds(
-     schema: FeatureSchema,
-     continuous_bounds_map: Union[Dict[str, Tuple[float, float]], Dict[str, List[float]]],
-     start_at_zero: bool = True
- ) -> Tuple[List[float], List[float]]:
-     """
-     Generates the lower and upper bounds lists for the optimizer from a FeatureSchema.
-
-     This helper function automates the creation of unbiased bounds for
-     categorical features and combines them with user-defined bounds for
-     continuous features, using the schema as the single source of truth
-     for feature order and type.
-
-     Args:
-         schema (FeatureSchema):
-             The definitive schema object created by
-             `data_exploration.finalize_feature_schema()`.
-         continuous_bounds_map (Dict[str, Tuple[float, float]] | Dict[str, List[float]]):
-             A dictionary mapping the *name* of each **continuous** feature
-             to its (min_bound, max_bound).
-         start_at_zero (bool):
-             - If True, assumes categorical encoding is [0, 1, ..., k-1].
-               Bounds will be set as [-0.5, k - 0.5].
-             - If False, assumes encoding is [1, 2, ..., k].
-               Bounds will be set as [0.5, k + 0.5].
-
-     Returns:
-         Tuple[List[float], List[float]]:
-             A tuple containing two lists: (lower_bounds, upper_bounds).
-
-     Raises:
-         ValueError: If a feature is missing from `continuous_bounds_map`
-             or if a feature name in the map is not a
-             continuous feature according to the schema.
-     """
-     # validate length of the 'continuous_bounds_map' values
-     for name, bounds in continuous_bounds_map.items():
-         if not (isinstance(bounds, (list, tuple)) and len(bounds) == 2):
-             _LOGGER.error(f"Bounds for feature '{name}' must be a list or tuple of length 2 (min, max). Found: {bounds}")
-             raise ValueError()
-
-     # 1. Get feature names and map from schema
-     feature_names = schema.feature_names
-     categorical_index_map = schema.categorical_index_map
-     total_features = len(feature_names)
-
-     if total_features <= 0:
-         _LOGGER.error("Schema contains no features.")
-         raise ValueError()
-
-     _LOGGER.info(f"Generating bounds for {total_features} total features...")
-
-     # 2. Initialize bound lists
-     lower_bounds: List[Optional[float]] = [None] * total_features
-     upper_bounds: List[Optional[float]] = [None] * total_features
-
-     # 3. Populate categorical bounds (Index-based)
-     if categorical_index_map:
-         for index, cardinality in categorical_index_map.items():
-             if not (0 <= index < total_features):
-                 _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
-                 raise ValueError()
-
-             if start_at_zero:
-                 # Rule for [0, k-1]: bounds are [-0.5, k - 0.5]
-                 low = -0.5
-                 high = float(cardinality) - 0.5
-             else:
-                 # Rule for [1, k]: bounds are [0.5, k + 0.5]
-                 low = 0.5
-                 high = float(cardinality) + 0.5
-
-             lower_bounds[index] = low
-             upper_bounds[index] = high
-
-         _LOGGER.info(f"Automatically set bounds for {len(categorical_index_map)} categorical features.")
-     else:
-         _LOGGER.info("No categorical features found in schema.")
-
-     # 4. Populate continuous bounds (Name-based)
-     # Use schema.continuous_feature_names for robust checking
-     continuous_names_set = set(schema.continuous_feature_names)
-
-     if continuous_names_set != set(continuous_bounds_map.keys()):
-         missing_in_map = continuous_names_set - set(continuous_bounds_map.keys())
-         if missing_in_map:
-             _LOGGER.error(f"The following continuous features are missing from 'continuous_bounds_map': {list(missing_in_map)}")
-
-         extra_in_map = set(continuous_bounds_map.keys()) - continuous_names_set
-         if extra_in_map:
-             _LOGGER.error(f"The following features in 'continuous_bounds_map' are not defined as continuous in the schema: {list(extra_in_map)}")
-
-         raise ValueError("Mismatch between 'continuous_bounds_map' and schema's continuous features.")
-
-     count_continuous = 0
-     for name, (low, high) in continuous_bounds_map.items():
-         # Map name to its index in the *feature-only* list
-         # This is guaranteed to be correct by the schema
-         index = feature_names.index(name)
-
-         if lower_bounds[index] is not None:
-             # This should be impossible if the schema is correct, but good to check
-             _LOGGER.error(f"Schema conflict: Feature '{name}' (at index {index}) is defined as both continuous and categorical.")
-             raise ValueError()
-
-         lower_bounds[index] = float(low)
-         upper_bounds[index] = float(high)
-         count_continuous += 1
-
-     _LOGGER.info(f"Manually set bounds for {count_continuous} continuous features.")
-
-     # 5. Final Validation (all Nones should be filled)
-     if None in lower_bounds:
-         missing_indices = [i for i, b in enumerate(lower_bounds) if b is None]
-         missing_names = [feature_names[i] for i in missing_indices]
-         _LOGGER.error(f"Failed to create all bounds. This indicates an internal logic error. Missing: {missing_names}")
-         raise RuntimeError("Internal error: Not all bounds were populated.")
-
-     # Cast to float lists, as 'None' sentinels are gone
-     return (
-         [float(b) for b in lower_bounds],  # type: ignore
-         [float(b) for b in upper_bounds]  # type: ignore
-     )
-
-
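The half-unit padding in the categorical rule above gives every integer class an equal-width interval when a continuous optimizer output is rounded to the nearest class. A self-contained illustration of the rule (not the package's API):

    def categorical_bounds(cardinality: int, start_at_zero: bool = True) -> tuple[float, float]:
        # [0, ..., k-1] encoding -> [-0.5, k - 0.5]; [1, ..., k] encoding -> [0.5, k + 0.5]
        if start_at_zero:
            return (-0.5, float(cardinality) - 0.5)
        return (0.5, float(cardinality) + 0.5)

    assert categorical_bounds(3) == (-0.5, 2.5)         # classes {0, 1, 2}
    assert categorical_bounds(3, False) == (0.5, 3.5)   # classes {1, 2, 3}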
- def parse_lower_upper_bounds(source: dict[str, tuple[Any, Any]]):
-     """
-     Parse lower and upper boundaries, returning 2 lists:
-
-     `lower_bounds`, `upper_bounds`
-     """
-     lower = [low[0] for low in source.values()]
-     upper = [up[1] for up in source.values()]
-
-     return lower, upper
-
-
- def plot_optimal_feature_distributions(results_dir: Union[str, Path],
-                                        verbose: bool = False,
-                                        target_columns: Optional[List[str]] = None):
-     """
-     Analyzes optimization results and plots the distribution of optimal values.
-
-     This function is compatible with mixed-type CSVs (strings for
-     categorical features, numbers for continuous). It automatically
-     detects the data type for each feature and generates:
-
-     - A Bar Plot for categorical (string) features.
-     - A KDE Plot for continuous (numeric) features.
-
-     Plots are saved in a subdirectory inside the source directory.
-
-     Parameters
-     ----------
-     results_dir : str | Path
-         The path to the directory containing the optimization result CSV files.
-     verbose : bool, optional
-         If True, logs details about which plot type is chosen for each feature.
-     target_columns : list[str] | None
-         A list of target column names to explicitly exclude from plotting.
-         If None, it defaults to excluding only the last column (assumed to be the target).
-     """
-     # Check results_dir and create output path
-     results_path = make_fullpath(results_dir, enforce="directory")
-     output_path = make_fullpath(results_path / "DistributionPlots", make=True)
-
-     # Check that the directory contains csv files
-     list_csv_paths(results_path, verbose=False, raise_on_empty=True)
-
-     # --- Data Loading and Preparation ---
-     _LOGGER.debug(f"📁 Starting analysis from results in: '{results_dir}'")
-
-     data_to_plot = []
-     for df, df_name in yield_dataframes_from_dir(results_path, verbose=True):
-         if df.shape[1] < 2:
-             _LOGGER.warning(f"Skipping '{df_name}': must have at least 2 columns (feature + target).")
-             continue
-
-         # --- Column selection logic ---
-         if target_columns:
-             # 1. Explicitly drop known targets to isolate features
-             existing_targets = [c for c in target_columns if c in df.columns]
-             features_df = df.drop(columns=existing_targets)
-
-             if features_df.empty:
-                 _LOGGER.warning(f"Skipping '{df_name}': All columns were dropped based on the target_columns list.")
-                 continue
-         else:
-             # 2. Fallback: Assume the last column is the only target
-             features_df = df.iloc[:, :-1]
-
-         # 3. Melt the filtered dataframe
-         melted_df = features_df.melt(var_name='feature', value_name='value')
-
-         # Set target as the filename (or joined target names) to differentiate sources
-         melted_df['target'] = '\n'.join(target_columns) if target_columns else df_name
-         data_to_plot.append(melted_df)
-
-     if not data_to_plot:
-         _LOGGER.error("No valid data to plot after processing all CSVs.")
-         return
-
-     long_df = pd.concat(data_to_plot, ignore_index=True)
-
-     # --- Delegate to Helper ---
-     _generate_and_save_feature_plots(long_df, output_path, verbose)
-
-
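A one-line usage sketch for the directory-based plotter above; the directory and target column names are hypothetical:

    # Reads every CSV in results/, drops the named target columns, and writes
    # one SVG per feature into results/DistributionPlots/.
    plot_optimal_feature_distributions("results", verbose=True, target_columns=["yield_pct"])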
- def plot_optimal_feature_distributions_from_dataframe(dataframe: pd.DataFrame,
-                                                       save_dir: Union[str, Path],
-                                                       verbose: bool = False,
-                                                       target_columns: Optional[List[str]] = None):
-     """
-     Analyzes a single dataframe of optimization results and plots the distribution of optimal values.
-
-     This function is compatible with mixed-type data (strings for categorical features,
-     numbers for continuous). It automatically detects the data type for each feature
-     and generates:
-
-     - A Bar Plot for categorical (string) features.
-     - A KDE Plot for continuous (numeric) features.
-
-     Plots are saved in a 'DistributionPlots' subdirectory inside the save_dir.
-
-     Parameters
-     ----------
-     dataframe : pd.DataFrame
-         The dataframe containing the optimization results (features + target/s).
-     save_dir : str | Path
-         The directory where the 'DistributionPlots' folder will be created.
-     verbose : bool, optional
-         If True, logs details about which plot type is chosen for each feature.
-     target_columns : list[str] | None
-         A list of target column names to explicitly exclude from plotting.
-         If None, it defaults to excluding only the last column (assumed to be the target).
-     """
-     # Check save_dir and create output path
-     root_path = make_fullpath(save_dir, make=True, enforce="directory")
-     output_path = make_fullpath(root_path / "DistributionPlots", make=True, enforce="directory")
-
-     _LOGGER.debug(f"📁 Starting analysis from provided DataFrame. Output: '{output_path}'")
-
-     if dataframe.empty:
-         _LOGGER.error("Provided dataframe is empty.")
-         return
-
-     if dataframe.shape[1] < 2:
-         _LOGGER.warning("DataFrame has fewer than 2 columns. Expecting at least one feature and one target.")
-
-     # --- Data Preparation ---
-     if target_columns:
-         # Explicitly drop known targets to isolate features
-         existing_targets = [c for c in target_columns if c in dataframe.columns]
-         features_df = dataframe.drop(columns=existing_targets)
-         target_label = '\n'.join(target_columns)
-     else:
-         # Fallback: Assume the last column is the only target
-         features_df = dataframe.iloc[:, :-1]
-         target_label = "Optimization Result"
-
-     if features_df.empty:
-         _LOGGER.warning("Skipping plotting: All columns were dropped based on the target_columns list.")
-         return
-
-     # Melt and assign static target label
-     long_df = features_df.melt(var_name='feature', value_name='value')
-     long_df['target'] = target_label
-
-     # --- Delegate to Helper ---
-     _generate_and_save_feature_plots(long_df, output_path, verbose)
-
-
- def _generate_and_save_feature_plots(long_df: pd.DataFrame, output_path: Path, verbose: bool) -> None:
-     """
-     Private helper: iterates over a melted DataFrame (columns: feature, value, target)
-     and generates/saves the appropriate plot (Bar or KDE) for each feature.
-     """
-     features = long_df['feature'].unique()
-     unique_targets = long_df['target'].unique()
-
-     _LOGGER.info(f"📊 Found data for {len(features)} features. Generating plots...")
-
-     for feature_name in features:
-         plt.figure(figsize=(12, 7))
-
-         # .copy() to ensure we are working with a distinct object
-         feature_df = long_df[long_df['feature'] == feature_name].copy()
-
-         # --- Type-checking logic ---
-         feature_df['numeric_value'] = pd.to_numeric(feature_df['value'], errors='coerce')
-
-         # If *any* value failed conversion (is NaN), treat it as categorical.
-         if feature_df['numeric_value'].isna().any():
-
-             # --- PLOT 1: CATEGORICAL (String-based) ---
-             if verbose:
-                 print(f"  Plotting '{feature_name}' as categorical (bar plot).")
-
-             # Calculate percentages for a clean bar plot
-             norm_df = (feature_df.groupby('target')['value']
-                        .value_counts(normalize=True)
-                        .mul(100)
-                        .rename('percent')
-                        .reset_index())
-
-             ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
-             plt.ylabel("Frequency (%)", fontsize=12)
-             ax.set_ylim(0, 100)
-
-             # always rotate x-ticks for categorical clarity
-             plt.xticks(rotation=45, ha='right')
-
-         else:
-             # --- PLOT 2: CONTINUOUS (Numeric-based) ---
-             if verbose:
-                 print(f"  Plotting '{feature_name}' as continuous (KDE plot).")
-
-             ax = sns.kdeplot(data=feature_df, x='numeric_value', hue='target',
-                              fill=True, alpha=0.1, warn_singular=False)
-
-             plt.xlabel("Feature Value", fontsize=12)
-             plt.ylabel("Density", fontsize=12)
-
-         # --- Common settings for both plot types ---
-         plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-         plt.grid(axis='y', alpha=0.5, linestyle='--')
-
-         legend = ax.get_legend()
-         if legend:
-             legend.set_title('Target')
-
-         sanitized_feature_name = sanitize_filename(feature_name)
-         plot_filename = output_path / f"Distribution_{sanitized_feature_name}.svg"
-         plt.savefig(plot_filename, bbox_inches='tight')
-         plt.close()
-
-     _LOGGER.info(f"All plots saved successfully to: '{output_path}'")
-
-
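For reference, the long-format frame the helper above expects has exactly three columns (`feature`, `value`, `target`); the mixed dtypes in `value` are intentional, which is why it re-detects numerics with `pd.to_numeric(..., errors='coerce')`. An illustrative construction with made-up data:

    import pandas as pd

    wide = pd.DataFrame({"temp": [40.1, 39.8], "catalyst": ["A", "B"]})
    long_df = wide.melt(var_name="feature", value_name="value")
    long_df["target"] = "run_01"  # hypothetical source label
    #     feature value  target
    # 0      temp  40.1  run_01
    # 1      temp  39.8  run_01
    # 2  catalyst     A  run_01
    # 3  catalyst     B  run_01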
- def _save_result(
-     result_dict: dict,
-     save_format: Literal['csv', 'sqlite', 'both'],
-     csv_path: Path,
-     db_manager: Optional[DragonSQL] = None,
-     db_table_name: Optional[str] = None,
-     categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None
- ):
-     """
-     Private helper to handle saving a single result to CSV, SQLite, or both.
-
-     If `categorical_mappings` is provided, it will reverse-map integer values
-     to their string representations before saving.
-     """
-     # --- Reverse Mapping Logic ---
-     # Create a copy to hold the values to be saved
-     save_dict = result_dict.copy()
-
-     if categorical_mappings:
-         for feature_name, mapping in categorical_mappings.items():
-             if feature_name in save_dict:
-                 # Create a reverse map {0: 'Category_A', 1: 'Category_B'}
-                 reverse_map = {idx: name for name, idx in mapping.items()}
-
-                 # Get the integer value from the results (e.g., 0)
-                 int_value = save_dict[feature_name]
-
-                 # Find the corresponding string (e.g., 'Category_A')
-                 # Use .get() for safety, defaulting to the original value if not found
-                 string_value = reverse_map.get(int_value, int_value)
-
-                 # Update the dictionary that will be saved
-                 save_dict[feature_name] = string_value
-
-     # Save to CSV
-     if save_format in ['csv', 'both']:
-         df_row = pd.DataFrame([save_dict])
-         file_exists = csv_path.exists()
-         df_row.to_csv(csv_path, mode='a', index=False, header=not file_exists)
-
-     # Save to SQLite
-     if save_format in ['sqlite', 'both']:
-         if db_manager and db_table_name:
-             db_manager.insert_row(db_table_name, save_dict)
-         else:
-             _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
-
-
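The reverse-mapping step in `_save_result` is a plain dictionary inversion; a self-contained illustration with hypothetical category names:

    mapping = {"Category_A": 0, "Category_B": 1}            # name -> encoded int
    reverse_map = {idx: name for name, idx in mapping.items()}

    assert reverse_map.get(0, 0) == "Category_A"   # decoded before saving
    assert reverse_map.get(7, 7) == 7              # unknown codes pass through unchanged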
- def info():
-     _script_info(__all__)