dragon-ml-toolbox 12.0.1__py3-none-any.whl → 12.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-12.0.1.dist-info → dragon_ml_toolbox-12.1.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-12.0.1.dist-info → dragon_ml_toolbox-12.1.0.dist-info}/RECORD +11 -10
- ml_tools/ML_optimization.py +149 -97
- ml_tools/ML_simple_optimization.py +413 -0
- ml_tools/data_exploration.py +96 -3
- ml_tools/math_utilities.py +30 -6
- ml_tools/optimization_tools.py +219 -24
- {dragon_ml_toolbox-12.0.1.dist-info → dragon_ml_toolbox-12.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-12.0.1.dist-info → dragon_ml_toolbox-12.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-12.0.1.dist-info → dragon_ml_toolbox-12.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-12.0.1.dist-info → dragon_ml_toolbox-12.1.0.dist-info}/top_level.txt +0 -0
ml_tools/optimization_tools.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import matplotlib.pyplot as plt
|
|
2
2
|
import seaborn as sns
|
|
3
|
-
from typing import Union, Any, Literal, Optional
|
|
3
|
+
from typing import Union, Any, Literal, Optional, Dict, List, Tuple
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
@@ -12,11 +12,153 @@ from .SQL import DatabaseManager
|
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
__all__ = [
|
|
15
|
+
"create_optimization_bounds",
|
|
15
16
|
"parse_lower_upper_bounds",
|
|
16
|
-
"plot_optimal_feature_distributions"
|
|
17
|
+
"plot_optimal_feature_distributions",
|
|
17
18
|
]
|
|
18
19
|
|
|
19
20
|
|
|
21
|
+
def create_optimization_bounds(
    csv_path: Union[str, Path],
    continuous_bounds_map: Dict[str, Tuple[float, float]],
    categorical_map: Dict[int, int],
    target_column: Optional[str] = None,
    start_at_zero: bool = True
) -> Tuple[List[float], List[float]]:
    """
    Generates the lower and upper bounds lists for the optimizer from a CSV header.

    This helper function automates the creation of unbiased bounds for
    categorical features and combines them with user-defined bounds for
    continuous features.

    It reads *only* the header of the provided CSV (``nrows=0``) to determine
    the full list of feature columns and their order, excluding the target.
    The data rows are never loaded.

    Args:
        csv_path (Union[str, Path]):
            Path to the final, preprocessed CSV file. The column order in
            this file must match the order expected by the model.
        continuous_bounds_map (Dict[str, Tuple[float, float]]):
            A dictionary mapping the *name* of each **continuous** feature
            to its (min_bound, max_bound) tuple.
        categorical_map (Dict[int, int]):
            The map from the *index* of each **categorical** feature to its cardinality.
            (e.g., {2: 4} for a feature at index 2 with 4 categories).
            Indices refer to positions in the feature-only column list
            (i.e. after the target column has been removed).
        target_column (Optional[str], optional):
            The name of the target column to exclude. If None (default), the *last column* in the CSV is assumed to be the target.
        start_at_zero (bool):
            - If True, assumes categorical encoding is [0, 1, ..., k-1].
              Bounds will be set as [-0.5, k - 0.5].
            - If False, assumes encoding is [1, 2, ..., k].
              Bounds will be set as [0.5, k + 0.5].

    Returns:
        Tuple[List[float], List[float]]:
            A tuple containing two lists: (lower_bounds, upper_bounds).

    Raises:
        ValueError: If a feature is defined in both maps, is missing from
            both maps, if a name in `continuous_bounds_map` or
            `target_column` is not found in the CSV columns, if a
            categorical index is out of range, if a cardinality is smaller
            than 1, or if a continuous lower bound exceeds its upper bound.
            The exception message mirrors the logged error.
    """
    # 1. Read header and determine feature names
    full_csv_path = make_fullpath(csv_path, enforce="file")
    try:
        df_header = pd.read_csv(full_csv_path, nrows=0)
    except Exception as e:
        _LOGGER.error(f"Failed to read header from CSV: {e}")
        raise

    all_column_names = df_header.columns.to_list()

    if target_column is None:
        # Guard the [-1] lookup so a column-less header fails with a clear
        # ValueError instead of an unrelated IndexError.
        if not all_column_names:
            message = "No feature columns remain after excluding the target."
            _LOGGER.error(message)
            raise ValueError(message)
        feature_names: List[str] = all_column_names[:-1]
        excluded_target = all_column_names[-1]
        _LOGGER.info(f"No target_column provided. Assuming last column '{excluded_target}' is the target.")
    else:
        if target_column not in all_column_names:
            message = f"Target column '{target_column}' not found in CSV header."
            _LOGGER.error(message)
            raise ValueError(message)
        feature_names = [name for name in all_column_names if name != target_column]
        _LOGGER.info(f"Excluding target column '{target_column}'.")

    # 2. Initialize bound lists
    total_features = len(feature_names)
    if total_features <= 0:
        message = "No feature columns remain after excluding the target."
        _LOGGER.error(message)
        raise ValueError(message)

    # None marks "not assigned yet"; it is used below to detect both
    # double definitions and features that were never given bounds.
    lower_bounds: List[Optional[float]] = [None] * total_features
    upper_bounds: List[Optional[float]] = [None] * total_features

    _LOGGER.info(f"Generating bounds for {total_features} total features...")

    # 3. Populate categorical bounds (Index-based)
    # The indices in categorical_map (e.g., {2: 4}) directly correspond
    # to the indices in the `feature_names` list.
    for index, cardinality in categorical_map.items():
        if not (0 <= index < total_features):
            message = f"Categorical index {index} is out of range for the {total_features} features."
            _LOGGER.error(message)
            raise ValueError(message)

        if cardinality < 1:
            # A non-positive cardinality would yield an empty or inverted interval.
            message = f"Categorical index {index} has invalid cardinality {cardinality}; it must be at least 1."
            _LOGGER.error(message)
            raise ValueError(message)

        # Half-unit padding on both sides of the encoded range:
        #   [0, k-1] -> [-0.5, k - 0.5]    and    [1, k] -> [0.5, k + 0.5]
        if start_at_zero:
            low = -0.5
            high = float(cardinality) - 0.5
        else:
            low = 0.5
            high = float(cardinality) + 0.5

        lower_bounds[index] = low
        upper_bounds[index] = high

    _LOGGER.info(f"Automatically set bounds for {len(categorical_map)} categorical features.")

    # 4. Populate continuous bounds (Name-based)
    count_continuous = 0
    for name, (low, high) in continuous_bounds_map.items():
        try:
            # Map name to its index in the *feature-only* list
            index = feature_names.index(name)
        except ValueError:
            message = f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns."
            _LOGGER.error(message)
            raise ValueError(message) from None

        if lower_bounds[index] is not None:
            # This index was already set by the categorical map
            message = f"Feature '{name}' (at index {index}) is defined in both 'categorical_map' and 'continuous_bounds_map'."
            _LOGGER.error(message)
            raise ValueError(message)

        low_value = float(low)
        high_value = float(high)
        if low_value > high_value:
            message = f"Feature '{name}' has a lower bound ({low_value}) greater than its upper bound ({high_value})."
            _LOGGER.error(message)
            raise ValueError(message)

        lower_bounds[index] = low_value
        upper_bounds[index] = high_value
        count_continuous += 1

    _LOGGER.info(f"Manually set bounds for {count_continuous} continuous features.")

    # 5. Validation: every feature must have received bounds from one of the maps
    missing_names = [
        name for name, bound in zip(feature_names, lower_bounds) if bound is None
    ]
    if missing_names:
        message = f"Bounds not defined for all features. Missing: {missing_names}"
        _LOGGER.error(message)
        raise ValueError(message)

    # Cast to float lists, as 'None' sentinels are gone
    return (
        [float(b) for b in lower_bounds],  # type: ignore
        [float(b) for b in upper_bounds]   # type: ignore
    )
|
|
160
|
+
|
|
161
|
+
|
|
20
162
|
def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
|
|
21
163
|
"""
|
|
22
164
|
Parse lower and upper boundaries, returning 2 lists:
|
|
@@ -29,13 +171,16 @@ def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
|
|
|
29
171
|
return lower, upper
|
|
30
172
|
|
|
31
173
|
|
|
32
|
-
def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
|
|
174
|
+
def plot_optimal_feature_distributions(results_dir: Union[str, Path], verbose: bool=False):
|
|
33
175
|
"""
|
|
34
|
-
Analyzes optimization results and plots the distribution of optimal values
|
|
176
|
+
Analyzes optimization results and plots the distribution of optimal values.
|
|
35
177
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
178
|
+
This function is compatible with mixed-type CSVs (strings for
|
|
179
|
+
categorical features, numbers for continuous). It automatically
|
|
180
|
+
detects the data type for each feature and generates:
|
|
181
|
+
|
|
182
|
+
- A Bar Plot for categorical (string) features.
|
|
183
|
+
- A KDE Plot for continuous (numeric) features.
|
|
39
184
|
|
|
40
185
|
Plots are saved in a subdirectory inside the source directory.
|
|
41
186
|
|
|
@@ -55,10 +200,17 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
|
|
|
55
200
|
_LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
|
|
56
201
|
data_to_plot = []
|
|
57
202
|
for df, df_name in yield_dataframes_from_dir(results_path):
|
|
203
|
+
if df.shape[1] < 2:
|
|
204
|
+
_LOGGER.warning(f"Skipping '{df_name}': must have at least 2 columns (feature + target).")
|
|
205
|
+
continue
|
|
58
206
|
melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
|
|
59
|
-
melted_df['target'] = df_name
|
|
207
|
+
melted_df['target'] = df_name
|
|
60
208
|
data_to_plot.append(melted_df)
|
|
61
209
|
|
|
210
|
+
if not data_to_plot:
|
|
211
|
+
_LOGGER.error("No valid data to plot after processing all CSVs.")
|
|
212
|
+
return
|
|
213
|
+
|
|
62
214
|
long_df = pd.concat(data_to_plot, ignore_index=True)
|
|
63
215
|
features = long_df['feature'].unique()
|
|
64
216
|
_LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
|
|
@@ -66,12 +218,23 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
|
|
|
66
218
|
# --- Plotting Loop ---
|
|
67
219
|
for feature_name in features:
|
|
68
220
|
plt.figure(figsize=(12, 7))
|
|
221
|
+
# Use .copy() to avoid SettingWithCopyWarning
|
|
222
|
+
# feature_df = long_df[long_df['feature'] == feature_name].copy()
|
|
69
223
|
feature_df = long_df[long_df['feature'] == feature_name]
|
|
70
224
|
|
|
71
|
-
#
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
225
|
+
# --- Type-checking logic ---
|
|
226
|
+
# Attempt to convert 'value' column to numeric.
|
|
227
|
+
# errors='coerce' turns non-numeric strings (e.g., 'Category_A') into NaN
|
|
228
|
+
feature_df['numeric_value'] = pd.to_numeric(feature_df['value'], errors='coerce')
|
|
229
|
+
|
|
230
|
+
# If *any* value failed conversion (is NaN), treat it as categorical.
|
|
231
|
+
if feature_df['numeric_value'].isna().any():
|
|
232
|
+
|
|
233
|
+
# --- PLOT 1: CATEGORICAL (String-based) ---
|
|
234
|
+
if verbose:
|
|
235
|
+
_LOGGER.info(f"Plotting '{feature_name}' as categorical (bar plot).")
|
|
236
|
+
|
|
237
|
+
# Calculate percentages for a clean bar plot
|
|
75
238
|
norm_df = (feature_df.groupby('target')['value']
|
|
76
239
|
.value_counts(normalize=True)
|
|
77
240
|
.mul(100)
|
|
@@ -79,21 +242,29 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
|
|
|
79
242
|
.reset_index())
|
|
80
243
|
|
|
81
244
|
ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
|
|
82
|
-
|
|
83
|
-
plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
|
|
84
245
|
plt.ylabel("Frequency (%)", fontsize=12)
|
|
85
246
|
ax.set_ylim(0, 100) # Set Y-axis from 0 to 100
|
|
247
|
+
|
|
248
|
+
# Rotate x-labels if there are many categories
|
|
249
|
+
if norm_df['value'].nunique() > 10:
|
|
250
|
+
plt.xticks(rotation=45, ha='right')
|
|
86
251
|
|
|
87
252
|
else:
|
|
88
|
-
# PLOT 2:
|
|
89
|
-
|
|
253
|
+
# --- PLOT 2: CONTINUOUS (Numeric-based) ---
|
|
254
|
+
# All values were successfully converted to numeric.
|
|
255
|
+
if verbose:
|
|
256
|
+
_LOGGER.info(f"Plotting '{feature_name}' as continuous (KDE plot).")
|
|
257
|
+
|
|
258
|
+
# Use the 'numeric_value' column (which is float type) for the KDE
|
|
259
|
+
ax = sns.kdeplot(data=feature_df, x='numeric_value', hue='target',
|
|
90
260
|
fill=True, alpha=0.1, warn_singular=False)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
plt.
|
|
261
|
+
|
|
262
|
+
# Set the x-axis label back to the original feature name
|
|
263
|
+
plt.xlabel("Feature Value", fontsize=12)
|
|
264
|
+
plt.ylabel("Density", fontsize=12)
|
|
94
265
|
|
|
95
266
|
# --- Common settings for both plot types ---
|
|
96
|
-
plt.
|
|
267
|
+
plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
|
|
97
268
|
plt.grid(axis='y', alpha=0.5, linestyle='--')
|
|
98
269
|
|
|
99
270
|
legend = ax.get_legend()
|
|
@@ -106,28 +277,52 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
|
|
|
106
277
|
plt.close()
|
|
107
278
|
|
|
108
279
|
_LOGGER.info(f"All plots saved successfully to: '{output_path}'")
|
|
109
|
-
|
|
280
|
+
|
|
110
281
|
|
|
111
282
|
def _save_result(
    result_dict: dict,
    save_format: Literal['csv', 'sqlite', 'both'],
    csv_path: Path,
    db_manager: Optional[DatabaseManager] = None,
    db_table_name: Optional[str] = None,
    categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None
):
    """
    Private helper that persists one result row to CSV, SQLite, or both.

    When `categorical_mappings` is given, every mapped feature present in the
    result has its encoded value translated back to the category label before
    saving. Values without a matching label are saved unchanged. The caller's
    `result_dict` is never modified; a shallow copy is saved instead.

    Args:
        result_dict: The result row, keyed by column name.
        save_format: Which destination(s) to write: 'csv', 'sqlite' or 'both'.
        csv_path: CSV file to append to. The header is written only when the
            file does not exist yet.
        db_manager: Database manager used for the SQLite insert.
        db_table_name: Destination table for the SQLite insert.
        categorical_mappings: Per-feature {label: encoded value} maps used for
            the reverse translation.
    """
    wants_csv = save_format in ('csv', 'both')
    wants_sqlite = save_format in ('sqlite', 'both')

    # Work on a copy so the decoded labels never leak back to the caller.
    row = result_dict.copy()

    # --- Decode categorical values back to their labels ---
    for feature_name, mapping in (categorical_mappings or {}).items():
        if feature_name not in row:
            continue
        # Invert {label: code} into {code: label}
        code_to_label = {code: label for label, code in mapping.items()}
        encoded_value = row[feature_name]
        # Fall back to the raw value when no label matches
        row[feature_name] = code_to_label.get(encoded_value, encoded_value)

    # --- CSV destination ---
    if wants_csv:
        single_row_frame = pd.DataFrame([row])
        write_header = not csv_path.exists()
        single_row_frame.to_csv(csv_path, mode='a', index=False, header=write_header)

    # --- SQLite destination ---
    if wants_sqlite:
        if not (db_manager and db_table_name):
            _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
        else:
            db_manager.insert_row(db_table_name, row)
|
|
133
328
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|