dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1909
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,493 +0,0 @@
|
|
|
1
|
-
import matplotlib.pyplot as plt
|
|
2
|
-
import seaborn as sns
|
|
3
|
-
from typing import Union, Any, Literal, Optional, Dict, List, Tuple
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
import pandas as pd
|
|
6
|
-
|
|
7
|
-
from ._path_manager import make_fullpath, list_csv_paths, sanitize_filename
|
|
8
|
-
from ._utilities import yield_dataframes_from_dir
|
|
9
|
-
from ._logger import get_logger
|
|
10
|
-
from ._script_info import _script_info
|
|
11
|
-
from ._SQL import DragonSQL
|
|
12
|
-
from ._IO_tools import save_json, load_json
|
|
13
|
-
from ._schema import FeatureSchema
|
|
14
|
-
from ._keys import OptimizationToolsKeys
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
_LOGGER = get_logger("Optimization Tools")
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
__all__ = [
|
|
21
|
-
"make_continuous_bounds_template",
|
|
22
|
-
"load_continuous_bounds_template",
|
|
23
|
-
"create_optimization_bounds",
|
|
24
|
-
"parse_lower_upper_bounds",
|
|
25
|
-
"plot_optimal_feature_distributions",
|
|
26
|
-
"plot_optimal_feature_distributions_from_dataframe",
|
|
27
|
-
]
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def make_continuous_bounds_template(
    directory: Union[str, Path],
    feature_schema: FeatureSchema,
    default_bounds: Tuple[float, float] = (0, 1)
) -> None:
    """
    Creates a JSON template for manual entry of continuous feature optimization bounds.

    The resulting file maps each continuous feature name to a [min, max] list
    populated with `default_bounds`. Edit the values in this file before using.

    Args:
        directory (str | Path): The directory where the template will be saved.
        feature_schema (FeatureSchema): The loaded schema containing feature definitions.
        default_bounds (Tuple[float, float]): Default (min, max) values to populate the template.
    """
    # validate directory path (created if missing)
    dir_path = make_fullpath(directory, make=True, enforce="directory")

    # 1. Check if continuous features exist; nothing to template otherwise
    if not feature_schema.continuous_feature_names:
        _LOGGER.warning("No continuous features found in FeatureSchema. Skipping bounds template generation.")
        return

    # 2. Construct the dictionary: {feature_name: [min, max]}
    bounds_map = {
        name: list(default_bounds)
        for name in feature_schema.continuous_feature_names
    }

    # use a fixed key for the filename so the matching loader can find it again
    filename = OptimizationToolsKeys.OPTIMIZATION_BOUNDS_FILENAME + ".json"

    # 3. Save to JSON using the IO tool
    save_json(
        data=bounds_map,
        directory=dir_path,
        filename=filename,
        verbose=False
    )

    # Fix: report the actual saved filename instead of a garbled placeholder.
    _LOGGER.info(f"💾 Continuous bounds template saved to: '{dir_path.name}/{filename}'")
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def load_continuous_bounds_template(directory: Union[str, Path]) -> Dict[str, List[float]]:
    """
    Loads the continuous feature bounds template from JSON. Expected filename: `optimization_bounds.json`.

    Args:
        directory (str | Path): The directory where the template is located.

    Returns:
        Dictionary (Dict[str, List[float]]): A dictionary mapping feature names to [min, max] bounds.
    """
    dir_path = make_fullpath(directory, enforce="directory")
    full_path = dir_path / (OptimizationToolsKeys.OPTIMIZATION_BOUNDS_FILENAME + ".json")

    bounds_map = load_json(
        file_path=full_path,
        expected_type='dict',
        verbose=False
    )

    def _is_min_max_pair(candidate) -> bool:
        # A valid entry is a 2-element list whose items are both numbers.
        return (isinstance(candidate, list)
                and len(candidate) == 2
                and all(isinstance(item, (int, float)) for item in candidate))

    # Reject the file as a whole if any single entry is malformed.
    if any(not _is_min_max_pair(entry) for entry in bounds_map.values()):
        _LOGGER.error(f"Invalid format in bounds template at '{full_path}'. Each value must be a list of [min, max].")
        raise ValueError()

    _LOGGER.info(f"Continuous bounds template loaded from: '{dir_path.name}'")

    return bounds_map
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def create_optimization_bounds(
    schema: FeatureSchema,
    continuous_bounds_map: Union[Dict[str, Tuple[float, float]], Dict[str, List[float]]],
    start_at_zero: bool = True
) -> Tuple[List[float], List[float]]:
    """
    Generates the lower and upper bounds lists for the optimizer from a FeatureSchema.

    This helper function automates the creation of unbiased bounds for
    categorical features and combines them with user-defined bounds for
    continuous features, using the schema as the single source of truth
    for feature order and type.

    Args:
        schema (FeatureSchema):
            The definitive schema object created by
            `data_exploration.finalize_feature_schema()`.
        continuous_bounds_map (Dict[str, Tuple[float, float]], Dict[str, List[float]]):
            A dictionary mapping the *name* of each **continuous** feature
            to its (min_bound, max_bound).
        start_at_zero (bool):
            - If True, assumes categorical encoding is [0, 1, ..., k-1].
              Bounds will be set as [-0.5, k - 0.5].
            - If False, assumes encoding is [1, 2, ..., k].
              Bounds will be set as [0.5, k + 0.5].

    Returns:
        Tuple[List[float], List[float]]:
            A tuple containing two lists: (lower_bounds, upper_bounds).

    Raises:
        ValueError: If a feature is missing from `continuous_bounds_map`
            or if a feature name in the map is not a
            continuous feature according to the schema.
    """
    # validate length in the continuous_bounds_map values
    for name, bounds in continuous_bounds_map.items():
        if not (isinstance(bounds, (list, tuple)) and len(bounds) == 2):
            _LOGGER.error(f"Bounds for feature '{name}' must be a list or tuple of length 2 (min, max). Found: {bounds}")
            raise ValueError()

    # 1. Get feature names and map from schema
    feature_names = schema.feature_names
    categorical_index_map = schema.categorical_index_map
    total_features = len(feature_names)

    if total_features <= 0:
        _LOGGER.error("Schema contains no features.")
        raise ValueError()

    _LOGGER.info(f"Generating bounds for {total_features} total features...")

    # 2. Initialize bound lists with None sentinels so step 5 can detect gaps
    lower_bounds: List[Optional[float]] = [None] * total_features
    upper_bounds: List[Optional[float]] = [None] * total_features

    # 3. Populate categorical bounds (Index-based)
    if categorical_index_map:
        for index, cardinality in categorical_index_map.items():
            if not (0 <= index < total_features):
                _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
                raise ValueError()

            if start_at_zero:
                # Rule for [0, k-1]: bounds are [-0.5, k - 0.5]
                low = -0.5
                high = float(cardinality) - 0.5
            else:
                # Rule for [1, k]: bounds are [0.5, k + 0.5]
                low = 0.5
                high = float(cardinality) + 0.5

            lower_bounds[index] = low
            upper_bounds[index] = high

        _LOGGER.info(f"Automatically set bounds for {len(categorical_index_map)} categorical features.")
    else:
        _LOGGER.info("No categorical features found in schema.")

    # 4. Populate continuous bounds (Name-based)
    # Use schema.continuous_feature_names for robust checking
    continuous_names_set = set(schema.continuous_feature_names)

    if continuous_names_set != set(continuous_bounds_map.keys()):
        missing_in_map = continuous_names_set - set(continuous_bounds_map.keys())
        if missing_in_map:
            _LOGGER.error(f"The following continuous features are missing from 'continuous_bounds_map': {list(missing_in_map)}")

        extra_in_map = set(continuous_bounds_map.keys()) - continuous_names_set
        if extra_in_map:
            _LOGGER.error(f"The following features in 'continuous_bounds_map' are not defined as continuous in the schema: {list(extra_in_map)}")

        raise ValueError("Mismatch between 'continuous_bounds_map' and schema's continuous features.")

    # Perf fix: build the name -> index lookup once (O(n) total) instead of
    # calling feature_names.index(name) inside the loop (O(n^2)).
    name_to_index = {feat_name: i for i, feat_name in enumerate(feature_names)}

    count_continuous = 0
    for name, (low, high) in continuous_bounds_map.items():
        # Map name to its index in the *feature-only* list
        # This is guaranteed to be correct by the schema
        index = name_to_index[name]

        if lower_bounds[index] is not None:
            # This should be impossible if schema is correct, but good to check
            _LOGGER.error(f"Schema conflict: Feature '{name}' (at index {index}) is defined as both continuous and categorical.")
            raise ValueError()

        lower_bounds[index] = float(low)
        upper_bounds[index] = float(high)
        count_continuous += 1

    _LOGGER.info(f"Manually set bounds for {count_continuous} continuous features.")

    # 5. Final Validation (all Nones should be filled)
    if None in lower_bounds:
        missing_indices = [i for i, b in enumerate(lower_bounds) if b is None]
        missing_names = [feature_names[i] for i in missing_indices]
        _LOGGER.error(f"Failed to create all bounds. This indicates an internal logic error. Missing: {missing_names}")
        raise RuntimeError("Internal error: Not all bounds were populated.")

    # Cast to float lists, as 'None' sentinels are gone
    return (
        [float(b) for b in lower_bounds],  # type: ignore
        [float(b) for b in upper_bounds]  # type: ignore
    )
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
    """
    Parse lower and upper boundaries, returning 2 lists:

    `lower_bounds`, `upper_bounds`
    """
    lower_bounds = []
    upper_bounds = []
    # Walk the pairs once, splitting each (min, max) into its own list.
    for bounds_pair in source.values():
        lower_bounds.append(bounds_pair[0])
        upper_bounds.append(bounds_pair[1])

    return lower_bounds, upper_bounds
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
def plot_optimal_feature_distributions(results_dir: Union[str, Path],
                                       verbose: bool=False,
                                       target_columns: Optional[List[str]] = None):
    """
    Analyzes optimization results and plots the distribution of optimal values.

    Compatible with mixed-type CSVs (strings for categorical features,
    numbers for continuous). The plot type is chosen per feature:

    - Bar Plot for categorical (string) features.
    - KDE Plot for continuous (numeric) features.

    Plots are saved in a subdirectory inside the source directory.

    Parameters
    ----------
    results_dir : str | Path
        The path to the directory containing the optimization result CSV files.
    verbose : bool
        If True, logs the plot type chosen for each feature.
    target_columns (list[str] | None):
        Target column names to exclude from plotting. If None, the last
        column of each CSV is assumed to be the target and is excluded.
    """
    # Validate the source directory and prepare the plot output folder.
    results_path = make_fullpath(results_dir, enforce="directory")
    output_path = make_fullpath(results_path / "DistributionPlots", make=True)

    # Fail fast if there are no CSV files to analyze.
    list_csv_paths(results_path, verbose=False, raise_on_empty=True)

    _LOGGER.debug(f"📁 Starting analysis from results in: '{results_dir}'")

    collected_frames = []
    for frame, source_name in yield_dataframes_from_dir(results_path, verbose=True):
        if frame.shape[1] < 2:
            _LOGGER.warning(f"Skipping '{source_name}': must have at least 2 columns (feature + target).")
            continue

        # --- Column selection logic ---
        if not target_columns:
            # Fallback: treat the last column as the single target.
            features_only = frame.iloc[:, :-1]
        else:
            # Drop every known target column present in this file.
            present_targets = [col for col in target_columns if col in frame.columns]
            features_only = frame.drop(columns=present_targets)
            if features_only.empty:
                _LOGGER.warning(f"Skipping '{source_name}': All columns were dropped based on target_columns list.")
                continue

        # Melt to long form and tag rows with the target label
        # (joined target names, or the source filename as fallback).
        melted = features_only.melt(var_name='feature', value_name='value')
        melted['target'] = '\n'.join(target_columns) if target_columns else source_name
        collected_frames.append(melted)

    if not collected_frames:
        _LOGGER.error("No valid data to plot after processing all CSVs.")
        return

    combined = pd.concat(collected_frames, ignore_index=True)

    # --- Delegate to Helper ---
    _generate_and_save_feature_plots(combined, output_path, verbose)
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
def plot_optimal_feature_distributions_from_dataframe(dataframe: pd.DataFrame,
                                                      save_dir: Union[str, Path],
                                                      verbose: bool=False,
                                                      target_columns: Optional[List[str]] = None):
    """
    Analyzes a single dataframe of optimization results and plots the distribution of optimal values.

    Compatible with mixed-type data (strings for categorical features,
    numbers for continuous). The plot type is chosen per feature:

    - Bar Plot for categorical (string) features.
    - KDE Plot for continuous (numeric) features.

    Plots are saved in a 'DistributionPlots' subdirectory inside the save_dir.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The dataframe containing the optimization results (features + target/s).
    save_dir : str | Path
        The directory where the 'DistributionPlots' folder will be created.
    verbose : bool, optional
        If True, logs details about which plot type is chosen for each feature.
    target_columns : list[str] | None
        Target column names to exclude from plotting. If None, the last
        column is assumed to be the target and is excluded.
    """
    # Validate/create the destination and its plot subfolder.
    root_path = make_fullpath(save_dir, make=True, enforce="directory")
    output_path = make_fullpath(root_path / "DistributionPlots", make=True, enforce="directory")

    _LOGGER.debug(f"📁 Starting analysis from provided DataFrame. Output: '{output_path}'")

    if dataframe.empty:
        _LOGGER.error("Provided dataframe is empty.")
        return

    if dataframe.shape[1] < 2:
        _LOGGER.warning("DataFrame has fewer than 2 columns. Expecting at least one feature and one target.")

    # --- Data Preparation ---
    if not target_columns:
        # Fallback: treat the last column as the single target.
        features_only = dataframe.iloc[:, :-1]
        target_label = "Optimization Result"
    else:
        # Drop every known target column present in the dataframe.
        present_targets = [col for col in target_columns if col in dataframe.columns]
        features_only = dataframe.drop(columns=present_targets)
        target_label = '\n'.join(target_columns)

    if features_only.empty:
        _LOGGER.warning("Skipping plotting: All columns were dropped based on target_columns list.")
        return

    # Melt to long form and tag every row with the static target label.
    melted = features_only.melt(var_name='feature', value_name='value')
    melted['target'] = target_label

    # --- Delegate to Helper ---
    _generate_and_save_feature_plots(melted, output_path, verbose)
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
def _generate_and_save_feature_plots(long_df: pd.DataFrame, output_path: Path, verbose: bool) -> None:
    """
    Private helper: renders one plot per feature from a melted DataFrame
    (columns: feature, value, target) and saves each one as an SVG.

    Features with any non-numeric value get a percentage bar plot;
    fully numeric features get a KDE plot.
    """
    feature_list = long_df['feature'].unique()
    unique_targets = long_df['target'].unique()

    _LOGGER.info(f"📊 Found data for {len(feature_list)} features. Generating plots...")

    for current_feature in feature_list:
        plt.figure(figsize=(12, 7))

        # Independent subset for this feature (copy avoids chained-assignment issues).
        subset = long_df[long_df['feature'] == current_feature].copy()

        # --- Type-checking logic ---
        # Coerce to numeric; any conversion failure marks the feature categorical.
        subset['numeric_value'] = pd.to_numeric(subset['value'], errors='coerce')
        is_categorical = subset['numeric_value'].isna().any()

        if is_categorical:
            # --- PLOT 1: CATEGORICAL (String-based) ---
            if verbose:
                print(f" Plotting '{current_feature}' as categorical (bar plot).")

            # Per-target category frequencies, expressed as percentages.
            norm_df = (subset.groupby('target')['value']
                       .value_counts(normalize=True)
                       .mul(100)
                       .rename('percent')
                       .reset_index())

            ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
            plt.ylabel("Frequency (%)", fontsize=12)
            ax.set_ylim(0, 100)

            # Rotate x-ticks so long category names stay readable.
            plt.xticks(rotation=45, ha='right')
        else:
            # --- PLOT 2: CONTINUOUS (Numeric-based) ---
            if verbose:
                print(f" Plotting '{current_feature}' as continuous (KDE plot).")

            ax = sns.kdeplot(data=subset, x='numeric_value', hue='target',
                             fill=True, alpha=0.1, warn_singular=False)

            plt.xlabel("Feature Value", fontsize=12)
            plt.ylabel("Density", fontsize=12)

        # --- Common settings for both plot types ---
        plt.title(f"Optimal Value Distribution for '{current_feature}'", fontsize=16)
        plt.grid(axis='y', alpha=0.5, linestyle='--')

        legend = ax.get_legend()
        if legend:
            legend.set_title('Target')

        safe_name = sanitize_filename(current_feature)
        plt.savefig(output_path / f"Distribution_{safe_name}.svg", bbox_inches='tight')
        plt.close()

    _LOGGER.info(f"All plots saved successfully to: '{output_path}'")
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
def _save_result(
        result_dict: dict,
        save_format: Literal['csv', 'sqlite', 'both'],
        csv_path: Path,
        db_manager: Optional[DragonSQL] = None,
        db_table_name: Optional[str] = None,
        categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None
):
    """
    Private helper to persist a single result to CSV, SQLite, or both.

    When `categorical_mappings` is given, integer-encoded categorical values
    are translated back to their string labels before being written.
    """
    # --- Reverse Mapping Logic ---
    # Work on a copy so the caller's dict is never mutated.
    save_dict = dict(result_dict)

    if categorical_mappings:
        for feature_name, mapping in categorical_mappings.items():
            if feature_name not in save_dict:
                continue
            # Invert {'Category_A': 0, ...} into {0: 'Category_A', ...}.
            decoder = {index: label for label, index in mapping.items()}
            encoded_value = save_dict[feature_name]
            # .get() keeps the raw value if it has no known label.
            save_dict[feature_name] = decoder.get(encoded_value, encoded_value)

    # Save to CSV (append; write the header only on first creation)
    if save_format in ('csv', 'both'):
        row_frame = pd.DataFrame([save_dict])
        header_needed = not csv_path.exists()
        row_frame.to_csv(csv_path, mode='a', index=False, header=header_needed)

    # Save to SQLite
    if save_format in ('sqlite', 'both'):
        if db_manager and db_table_name:
            db_manager.insert_row(db_table_name, save_dict)
        else:
            _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
def info():
    """Report this module's public API via `_script_info`."""
    _script_info(__all__)
|