dragon-ml-toolbox 12.0.1__py3-none-any.whl → 12.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,6 +1,6 @@
  import matplotlib.pyplot as plt
  import seaborn as sns
- from typing import Union, Any, Literal, Optional
+ from typing import Union, Any, Literal, Optional, Dict, List, Tuple
  from pathlib import Path
  import pandas as pd

@@ -12,11 +12,153 @@ from .SQL import DatabaseManager
 
 
  __all__ = [
+     "create_optimization_bounds",
      "parse_lower_upper_bounds",
-     "plot_optimal_feature_distributions"
+     "plot_optimal_feature_distributions",
  ]
 
 
+ def create_optimization_bounds(
+     csv_path: Union[str, Path],
+     continuous_bounds_map: Dict[str, Tuple[float, float]],
+     categorical_map: Dict[int, int],
+     target_column: Optional[str] = None,
+     start_at_zero: bool = True
+ ) -> Tuple[List[float], List[float]]:
+     """
+     Generates the lower and upper bounds lists for the optimizer from a CSV header.
+ 
+     This helper automates the creation of unbiased bounds for categorical
+     features and combines them with user-defined bounds for continuous features.
+ 
+     It reads *only* the header of the provided CSV to determine the full list
+     of feature columns and their order, excluding the specified target. This
+     is memory-efficient because the full dataset is never loaded.
+ 
+     Args:
+         csv_path (Union[str, Path]):
+             Path to the final, preprocessed CSV file. The column order in
+             this file must match the order expected by the model.
+         continuous_bounds_map (Dict[str, Tuple[float, float]]):
+             A dictionary mapping the *name* of each **continuous** feature
+             to its (min_bound, max_bound) tuple.
+         categorical_map (Dict[int, int]):
+             A map from the *index* of each **categorical** feature to its
+             cardinality, e.g. {2: 4} for a feature at index 2 with 4 categories.
+         target_column (Optional[str], optional):
+             The name of the target column to exclude. If None (default),
+             the *last column* in the CSV is assumed to be the target.
+         start_at_zero (bool):
+             - If True, assumes categorical encoding is [0, 1, ..., k-1];
+               bounds are set to [-0.5, k - 0.5].
+             - If False, assumes encoding is [1, 2, ..., k];
+               bounds are set to [0.5, k + 0.5].
+ 
+     Returns:
+         Tuple[List[float], List[float]]:
+             A tuple containing two lists: (lower_bounds, upper_bounds).
+ 
+     Raises:
+         ValueError: If a feature is defined in both maps, is missing from
+             both maps, or if a name in `continuous_bounds_map` or
+             `target_column` is not found in the CSV columns.
+     """
+     # 1. Read the header and determine feature names
+     full_csv_path = make_fullpath(csv_path, enforce="file")
+     try:
+         df_header = pd.read_csv(full_csv_path, nrows=0)
+     except Exception as e:
+         _LOGGER.error(f"Failed to read header from CSV: {e}")
+         raise
+ 
+     all_column_names = df_header.columns.to_list()
+     feature_names: List[str] = []
+ 
+     if target_column is None:
+         feature_names = all_column_names[:-1]
+         excluded_target = all_column_names[-1]
+         _LOGGER.info(f"No target_column provided. Assuming last column '{excluded_target}' is the target.")
+     else:
+         if target_column not in all_column_names:
+             _LOGGER.error(f"Target column '{target_column}' not found in CSV header.")
+             raise ValueError()
+         feature_names = [name for name in all_column_names if name != target_column]
+         _LOGGER.info(f"Excluding target column '{target_column}'.")
+ 
+     # 2. Initialize the bound lists
+     total_features = len(feature_names)
+     if total_features <= 0:
+         _LOGGER.error("No feature columns remain after excluding the target.")
+         raise ValueError()
+ 
+     lower_bounds: List[Optional[float]] = [None] * total_features
+     upper_bounds: List[Optional[float]] = [None] * total_features
+ 
+     _LOGGER.info(f"Generating bounds for {total_features} total features...")
+ 
+     # 3. Populate categorical bounds (index-based)
+     # The indices in categorical_map (e.g., {2: 4}) directly correspond
+     # to the indices in the new `feature_names` list.
+     for index, cardinality in categorical_map.items():
+         if not (0 <= index < total_features):
+             _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
+             raise ValueError()
+ 
+         if start_at_zero:
+             # Rule for [0, k-1]: bounds are [-0.5, k - 0.5]
+             low = -0.5
+             high = float(cardinality) - 0.5
+         else:
+             # Rule for [1, k]: bounds are [0.5, k + 0.5]
+             low = 0.5
+             high = float(cardinality) + 0.5
+ 
+         lower_bounds[index] = low
+         upper_bounds[index] = high
+ 
+     _LOGGER.info(f"Automatically set bounds for {len(categorical_map)} categorical features.")
+ 
+     # 4. Populate continuous bounds (name-based)
+     count_continuous = 0
+     for name, (low, high) in continuous_bounds_map.items():
+         try:
+             # Map the name to its index in the *feature-only* list
+             index = feature_names.index(name)
+         except ValueError:
+             _LOGGER.error(f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns.")
+             raise ValueError()
+ 
+         if lower_bounds[index] is not None:
+             # This index was already set by the categorical map
+             _LOGGER.error(f"Feature '{name}' (at index {index}) is defined in both 'categorical_map' and 'continuous_bounds_map'.")
+             raise ValueError()
+ 
+         lower_bounds[index] = float(low)
+         upper_bounds[index] = float(high)
+         count_continuous += 1
+ 
+     _LOGGER.info(f"Manually set bounds for {count_continuous} continuous features.")
+ 
+     # 5. Validation: check for any remaining None values
+     missing_indices = []
+     for i in range(total_features):
+         if lower_bounds[i] is None:
+             missing_indices.append(i)
+ 
+     if missing_indices:
+         missing_names = [feature_names[i] for i in missing_indices]
+         _LOGGER.error(f"Bounds not defined for all features. Missing: {missing_names}")
+         raise ValueError()
+ 
+     # Cast to plain float lists, as the 'None' sentinels are gone
+     return (
+         [float(b) for b in lower_bounds],  # type: ignore
+         [float(b) for b in upper_bounds]   # type: ignore
+     )
+ 
+ 
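The bounds convention is easiest to see on a concrete call. The sketch below is illustrative only: the CSV path, column names, and cardinality are hypothetical, and it assumes a preprocessed file whose last column is the target.

    # Hypothetical CSV header: temperature, pressure, catalyst_type, yield
    # ('yield' is the target; 'catalyst_type' is label-encoded at feature
    # index 2 with 4 categories).
    lower, upper = create_optimization_bounds(
        csv_path="data/preprocessed.csv",  # hypothetical path
        continuous_bounds_map={
            "temperature": (20.0, 100.0),
            "pressure": (1.0, 5.0),
        },
        categorical_map={2: 4},  # feature index 2 -> cardinality k=4
        start_at_zero=True,      # encoding [0..3] -> bounds [-0.5, 3.5]
    )
    # lower == [20.0, 1.0, -0.5]
    # upper == [100.0, 5.0, 3.5]

With start_at_zero=True and k=4, the categorical slot gets the bounds [-0.5, 3.5], so each of the 4 integer codes occupies an interval of equal width, which is what makes the bounds unbiased when the optimizer's continuous suggestion is rounded.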
  def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
      """
      Parse lower and upper boundaries, returning 2 lists:
@@ -29,13 +171,16 @@ def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
      return lower, upper
 
 
- def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
+ def plot_optimal_feature_distributions(results_dir: Union[str, Path], verbose: bool = False):
      """
-     Analyzes optimization results and plots the distribution of optimal values for each feature.
+     Analyzes optimization results and plots the distribution of optimal values.
 
-     For features with more than two unique values, this function generates a color-coded
-     Kernel Density Estimate (KDE) plot. For binary or constant features, it generates a bar plot
-     showing relative frequency.
+     This function is compatible with mixed-type CSVs (strings for
+     categorical features, numbers for continuous). It automatically
+     detects the data type for each feature and generates:
+ 
+     - A bar plot for categorical (string) features.
+     - A KDE plot for continuous (numeric) features.
 
      Plots are saved in a subdirectory inside the source directory.
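A minimal call sketch for the updated signature; the directory below is hypothetical, and verbose=True simply logs which plot type was chosen for each feature:

    plot_optimal_feature_distributions("results/optimization", verbose=True)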
 
@@ -55,10 +200,17 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
      _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
      data_to_plot = []
      for df, df_name in yield_dataframes_from_dir(results_path):
+         if df.shape[1] < 2:
+             _LOGGER.warning(f"Skipping '{df_name}': must have at least 2 columns (feature + target).")
+             continue
          melted_df = df.iloc[:, :-1].melt(var_name='feature', value_name='value')
-         melted_df['target'] = df_name.replace("Optimization_", "")
+         melted_df['target'] = df_name
          data_to_plot.append(melted_df)
 
+     if not data_to_plot:
+         _LOGGER.error("No valid data to plot after processing all CSVs.")
+         return
+ 
      long_df = pd.concat(data_to_plot, ignore_index=True)
      features = long_df['feature'].unique()
      _LOGGER.info(f"Found data for {len(features)} features across {len(long_df['target'].unique())} targets. Generating plots...")
@@ -66,12 +218,23 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
      # --- Plotting Loop ---
      for feature_name in features:
          plt.figure(figsize=(12, 7))
-         feature_df = long_df[long_df['feature'] == feature_name]
+         # Select on a copy so that adding the 'numeric_value' column below
+         # does not trigger pandas' SettingWithCopyWarning on a view of long_df.
+         feature_df = long_df[long_df['feature'] == feature_name].copy()
 
-         # Check if the feature is binary or constant
-         if feature_df['value'].nunique() <= 2:
-             # PLOT 1: For discrete values, calculate percentages and use a true bar plot.
-             # This ensures the X-axis is clean (e.g., just 0 and 1).
+         # --- Type-checking logic ---
+         # Attempt to convert the 'value' column to numeric.
+         # errors='coerce' turns non-numeric strings (e.g., 'Category_A') into NaN.
+         feature_df['numeric_value'] = pd.to_numeric(feature_df['value'], errors='coerce')
+ 
+         # If *any* value failed conversion (is NaN), treat the feature as categorical.
+         if feature_df['numeric_value'].isna().any():
+ 
+             # --- PLOT 1: CATEGORICAL (string-based) ---
+             if verbose:
+                 _LOGGER.info(f"Plotting '{feature_name}' as categorical (bar plot).")
+ 
+             # Calculate percentages for a clean bar plot
              norm_df = (feature_df.groupby('target')['value']
                         .value_counts(normalize=True)
                         .mul(100)
@@ -79,21 +242,29 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
                        .reset_index())
 
              ax = sns.barplot(data=norm_df, x='value', y='percent', hue='target')
- 
-             plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
              plt.ylabel("Frequency (%)", fontsize=12)
              ax.set_ylim(0, 100)  # Set the Y-axis from 0 to 100
+ 
+             # Rotate the x-labels if there are many categories
+             if norm_df['value'].nunique() > 10:
+                 plt.xticks(rotation=45, ha='right')
 
          else:
-             # PLOT 2: KDE plot for continuous values.
-             ax = sns.kdeplot(data=feature_df, x='value', hue='target',
+             # --- PLOT 2: CONTINUOUS (numeric-based) ---
+             # All values were successfully converted to numeric.
+             if verbose:
+                 _LOGGER.info(f"Plotting '{feature_name}' as continuous (KDE plot).")
+ 
+             # Use the float-typed 'numeric_value' column for the KDE
+             ax = sns.kdeplot(data=feature_df, x='numeric_value', hue='target',
                               fill=True, alpha=0.1, warn_singular=False)
- 
-             plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
-             plt.ylabel("Density", fontsize=12)  # Y-axis is "Density" for KDE plots
+ 
+             # Give the x-axis a generic label (the title carries the feature name)
+             plt.xlabel("Feature Value", fontsize=12)
+             plt.ylabel("Density", fontsize=12)
 
          # --- Common settings for both plot types ---
-         plt.xlabel("Feature Value", fontsize=12)
+         plt.title(f"Optimal Value Distribution for '{feature_name}'", fontsize=16)
          plt.grid(axis='y', alpha=0.5, linestyle='--')
 
          legend = ax.get_legend()
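Stepping back from the hunk above: the categorical-vs-continuous dispatch in the plotting loop reduces to a single rule. A feature is treated as categorical exactly when pd.to_numeric(..., errors='coerce') yields at least one NaN. A standalone sketch of just that rule (the helper name is ours, not the module's):

    import pandas as pd

    def looks_categorical(values: pd.Series) -> bool:
        # Any value that cannot be parsed as a number is coerced to NaN,
        # which flags the whole feature as categorical (bar plot).
        # Note: genuine NaNs in a numeric feature would also trip this check.
        return pd.to_numeric(values, errors="coerce").isna().any()

    print(looks_categorical(pd.Series(["A", "B", "A"])))  # True  -> bar plot
    print(looks_categorical(pd.Series([1.5, "2.0", 3])))  # False -> KDE plot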
@@ -106,28 +277,52 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path]):
          plt.close()
 
      _LOGGER.info(f"All plots saved successfully to: '{output_path}'")
 
 
  def _save_result(
      result_dict: dict,
      save_format: Literal['csv', 'sqlite', 'both'],
      csv_path: Path,
      db_manager: Optional[DatabaseManager] = None,
-     db_table_name: Optional[str] = None
+     db_table_name: Optional[str] = None,
+     categorical_mappings: Optional[Dict[str, Dict[str, int]]] = None
  ):
      """
      Private helper to handle saving a single result to CSV, SQLite, or both.
+ 
+     If `categorical_mappings` is provided, it will reverse-map integer values
+     to their string representations before saving.
      """
+     # --- Reverse-mapping logic ---
+     # Create a copy to hold the values to be saved
+     save_dict = result_dict.copy()
+ 
+     if categorical_mappings:
+         for feature_name, mapping in categorical_mappings.items():
+             if feature_name in save_dict:
+                 # Build a reverse map, e.g. {0: 'Category_A', 1: 'Category_B'}
+                 reverse_map = {idx: name for name, idx in mapping.items()}
+ 
+                 # Get the integer value from the results (e.g., 0)
+                 int_value = save_dict[feature_name]
+ 
+                 # Find the corresponding string (e.g., 'Category_A').
+                 # Use .get() for safety, defaulting to the original value if not found.
+                 string_value = reverse_map.get(int_value, int_value)
+ 
+                 # Update the dictionary that will be saved
+                 save_dict[feature_name] = string_value
+ 
      # Save to CSV
      if save_format in ['csv', 'both']:
-         df_row = pd.DataFrame([result_dict])
+         df_row = pd.DataFrame([save_dict])
          file_exists = csv_path.exists()
          df_row.to_csv(csv_path, mode='a', index=False, header=not file_exists)
 
      # Save to SQLite
      if save_format in ['sqlite', 'both']:
          if db_manager and db_table_name:
-             db_manager.insert_row(db_table_name, result_dict)
+             db_manager.insert_row(db_table_name, save_dict)
          else:
              _LOGGER.warning("SQLite saving requested but db_manager or table_name not provided.")
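Finally, the reverse-mapping pass in _save_result is easy to verify in isolation. A small sketch with a hypothetical mapping and result:

    # Forward mapping as produced during preprocessing (hypothetical)
    categorical_mappings = {"catalyst_type": {"Category_A": 0, "Category_B": 1}}
    result_dict = {"temperature": 42.5, "catalyst_type": 0}

    save_dict = result_dict.copy()
    for feature_name, mapping in categorical_mappings.items():
        if feature_name in save_dict:
            reverse_map = {idx: name for name, idx in mapping.items()}
            # Unknown codes fall back to the raw value instead of raising
            save_dict[feature_name] = reverse_map.get(save_dict[feature_name],
                                                      save_dict[feature_name])

    print(save_dict)  # {'temperature': 42.5, 'catalyst_type': 'Category_A'}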