alchemist-nrel 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. alchemist_core/__init__.py +2 -2
  2. alchemist_core/acquisition/botorch_acquisition.py +83 -126
  3. alchemist_core/data/experiment_manager.py +181 -12
  4. alchemist_core/models/botorch_model.py +292 -63
  5. alchemist_core/models/sklearn_model.py +145 -13
  6. alchemist_core/session.py +3330 -31
  7. alchemist_core/utils/__init__.py +3 -1
  8. alchemist_core/utils/acquisition_utils.py +60 -0
  9. alchemist_core/visualization/__init__.py +45 -0
  10. alchemist_core/visualization/helpers.py +130 -0
  11. alchemist_core/visualization/plots.py +1449 -0
  12. {alchemist_nrel-0.3.1.dist-info → alchemist_nrel-0.3.2.dist-info}/METADATA +13 -13
  13. {alchemist_nrel-0.3.1.dist-info → alchemist_nrel-0.3.2.dist-info}/RECORD +31 -26
  14. {alchemist_nrel-0.3.1.dist-info → alchemist_nrel-0.3.2.dist-info}/WHEEL +1 -1
  15. api/main.py +1 -1
  16. api/models/requests.py +52 -0
  17. api/models/responses.py +79 -2
  18. api/routers/experiments.py +333 -8
  19. api/routers/sessions.py +84 -9
  20. api/routers/visualizations.py +6 -4
  21. api/routers/websocket.py +2 -2
  22. api/services/session_store.py +295 -71
  23. api/static/assets/index-B6Cf6s_b.css +1 -0
  24. api/static/assets/{index-DWfIKU9j.js → index-B7njvc9r.js} +201 -196
  25. api/static/index.html +2 -2
  26. ui/gpr_panel.py +11 -5
  27. ui/target_column_dialog.py +299 -0
  28. ui/ui.py +52 -5
  29. api/static/assets/index-sMIa_1hV.css +0 -1
  30. {alchemist_nrel-0.3.1.dist-info → alchemist_nrel-0.3.2.dist-info}/entry_points.txt +0 -0
  31. {alchemist_nrel-0.3.1.dist-info → alchemist_nrel-0.3.2.dist-info}/licenses/LICENSE +0 -0
  32. {alchemist_nrel-0.3.1.dist-info → alchemist_nrel-0.3.2.dist-info}/top_level.txt +0 -0
alchemist_core/__init__.py
@@ -27,10 +27,10 @@ Example:
  >>> # Get next experiment suggestion
  >>> next_point = session.suggest_next(acq_func="ei")
 
- Version: 0.3.0-beta.1
+ Version: 0.3.2
  """
 
- __version__ = "0.3.0b1"
+ __version__ = "0.3.2"
  __author__ = "Caleb Coatney"
  __email__ = "caleb.coatney@nrel.gov"
 
alchemist_core/acquisition/botorch_acquisition.py
@@ -312,6 +312,10 @@ class BoTorchAcquisition(BaseAcquisition):
              options=options,
          )
 
+         # Log the acquisition value found
+         acq_val = batch_acq_values.item() if batch_acq_values.numel() == 1 else batch_acq_values.max().item()
+         logger.info(f"Optimization found acquisition value: {acq_val:.4f}")
+
          # Get the best candidate(s)
          best_candidates = batch_candidates.detach().cpu()
 
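Note on the added logging: the acquisition values returned alongside the candidates may hold one value or several depending on how many candidates come back, so the new line branches on numel() before calling .item(). A standalone sketch of that pattern (plain torch, hypothetical helper name):

    import torch

    def summarize_acq_values(batch_acq_values: torch.Tensor) -> float:
        """Collapse acquisition values of either shape to a single scalar."""
        if batch_acq_values.numel() == 1:
            # Single candidate: the tensor holds exactly one value
            return batch_acq_values.item()
        # Several candidates: report the best (maximum) value
        return batch_acq_values.max().item()

    print(summarize_acq_values(torch.tensor(1.23)))        # 1.23
    print(summarize_acq_values(torch.tensor([0.5, 2.0])))  # 2.0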
@@ -533,7 +537,14 @@ class BoTorchAcquisition(BaseAcquisition):
          return self
 
      def find_optimum(self, model=None, maximize=None, random_state=None):
-         """Find the point where the model predicts the optimal value."""
+         """
+         Find the point where the model predicts the optimal value.
+
+         This uses the same approach as regret plot predictions: generate a grid
+         in the original variable space, predict using the model's standard pipeline,
+         and find the argmax/argmin. This ensures categorical variables are handled
+         correctly through proper encoding/decoding.
+         """
          if model is not None:
              self.model = model
 
@@ -543,135 +554,81 @@ class BoTorchAcquisition(BaseAcquisition):
          if random_state is not None:
              self.random_state = random_state
 
-         # Get bounds from the search space
-         bounds_tensor = self._get_bounds_from_search_space()
+         # Generate prediction grid in ORIGINAL variable space (not encoded)
+         # This handles categorical variables correctly
+         n_grid_points = 10000  # Target number of grid points
+         grid = self._generate_prediction_grid(n_grid_points)
+
+         # Use model's predict method which handles encoding internally
+         # This is the same pipeline used by regret plot (correct approach)
+         means, stds = self.model.predict(grid, return_std=True)
+
+         # Find argmax or argmin
+         if self.maximize:
+             best_idx = np.argmax(means)
+         else:
+             best_idx = np.argmin(means)
 
-         # Identify categorical and integer variables
-         categorical_variables = []
-         integer_variables = []
-         if hasattr(self.search_space_obj, 'get_categorical_variables'):
-             categorical_variables = self.search_space_obj.get_categorical_variables()
-         if hasattr(self.search_space_obj, 'get_integer_variables'):
-             integer_variables = self.search_space_obj.get_integer_variables()
+         # Extract the optimal point (already in original variable space)
+         opt_point_df = grid.iloc[[best_idx]].reset_index(drop=True)
+
+         return {
+             'x_opt': opt_point_df,
+             'value': float(means[best_idx]),
+             'std': float(stds[best_idx])
+         }
 
-         # Prepare for optimization
-         torch.manual_seed(self.random_state)
+     def _generate_prediction_grid(self, n_grid_points: int) -> pd.DataFrame:
+         """
+         Generate grid of test points across search space for predictions.
 
-         try:
-             # Use a simpler randomized search approach instead of optimize_acqf
-             # This avoids the dimension issues in the more complex optimization
-             n_samples = 20000  # Large number of random samples
-             best_value = float('-inf') if self.maximize else float('inf')
-             best_x = None
-
-             # Generate random samples within bounds
-             lower_bounds, upper_bounds = bounds_tensor[0], bounds_tensor[1]
-             X_samples = torch.rand(n_samples, len(lower_bounds), dtype=torch.double)
-             X_samples = X_samples * (upper_bounds - lower_bounds) + lower_bounds
-
-             # Round integer variables to nearest integer
-             if integer_variables:
-                 for i, feature_name in enumerate(self.model.feature_names):
-                     if feature_name in integer_variables:
-                         X_samples[:, i] = torch.round(X_samples[:, i])
-
-             # Evaluate model at all samples
-             self.model.model.eval()
-             with torch.no_grad():
-                 posterior = self.model.model.posterior(X_samples)
-                 values = posterior.mean.squeeze()
-
-             # If minimizing, negate values for finding maximum
-             if not self.maximize:
-                 values = -values
-
-             # Find the best value
-             best_idx = torch.argmax(values)
-             best_x = X_samples[best_idx]
-             best_value = values[best_idx].item()
-
-             # Convert to numpy
-             best_candidate = best_x.cpu().numpy().reshape(1, -1)
-         except Exception as e:
-             logger.error(f"Error in random search optimization: {e}")
-             # Fallback to grid search
-             logger.info("Falling back to grid search...")
-
-             # Create a simple grid search
-             n_points = 10  # Points per dimension
-             grid_points = []
-
-             # Create grid for each dimension
-             for i, feature_name in enumerate(self.model.feature_names):
-                 if feature_name in integer_variables:
-                     # For integer variables, create integer grid
-                     min_val = int(lower_bounds[i])
-                     max_val = int(upper_bounds[i])
-                     if max_val - min_val + 1 <= n_points:
-                         # If range is small, use all integer values
-                         grid_points.append(torch.arange(min_val, max_val + 1, dtype=torch.double))
-                     else:
-                         # If range is large, sample n_points integers
-                         step = max(1, (max_val - min_val) // (n_points - 1))
-                         values = torch.arange(min_val, max_val + 1, step, dtype=torch.double)
-                         grid_points.append(values[:n_points])
+         This creates a grid in the ORIGINAL variable space (with actual category
+         names, not encoded values), which is then properly encoded by the model's
+         predict() method.
+
+         Args:
+             n_grid_points: Target number of grid points (actual number depends on dimensionality)
+
+         Returns:
+             DataFrame with columns for each variable in original space
+         """
+         from itertools import product
+
+         grid_1d = []
+         var_names = []
+
+         variables = self.search_space_obj.variables
+         n_vars = len(variables)
+         n_per_dim = max(2, int(n_grid_points ** (1 / n_vars)))
+
+         for var in variables:
+             var_names.append(var['name'])
+
+             if var['type'] == 'real':
+                 # Continuous: linspace
+                 grid_1d.append(np.linspace(var['min'], var['max'], n_per_dim))
+             elif var['type'] == 'integer':
+                 # Integer: range of integers
+                 n_integers = var['max'] - var['min'] + 1
+                 if n_integers <= n_per_dim:
+                     # Use all integers if range is small
+                     grid_1d.append(np.arange(var['min'], var['max'] + 1))
                  else:
-                     # For continuous variables, use linspace
-                     grid_points.append(torch.linspace(
-                         lower_bounds[i], upper_bounds[i], n_points, dtype=torch.double
-                     ))
-
-             # Create meshgrid
-             meshgrid = torch.meshgrid(*grid_points, indexing='ij')
-             X_grid = torch.stack([x.reshape(-1) for x in meshgrid], dim=1)
-
-             # Evaluate model on grid
-             self.model.model.eval()
-             with torch.no_grad():
-                 posterior = self.model.model.posterior(X_grid)
-                 values = posterior.mean.squeeze()
-
-             # If minimizing, negate values
-             if not self.maximize:
-                 values = -values
-
-             # Find the best value
-             best_idx = torch.argmax(values)
-             best_x = X_grid[best_idx]
-             best_value = values[best_idx].item()
-
-             # Convert to numpy
-             best_candidate = best_x.cpu().numpy().reshape(1, -1)
-
-         # Convert to dictionary and then to DataFrame
-         feature_names = self.model.original_feature_names
-         result = {}
-         for i, name in enumerate(feature_names):
-             value = best_candidate[0, i]
-
-             # If this is a categorical variable, convert back to original value
-             if name in categorical_variables:
-                 # Find the original categorical value from the encoding
-                 encoding = self.model.categorical_encodings.get(name, {})
-                 inv_encoding = {v: k for k, v in encoding.items()}
-                 if value in inv_encoding:
-                     value = inv_encoding[value]
-                 elif int(value) in inv_encoding:
-                     value = inv_encoding[int(value)]
-             # If this is an integer variable, ensure it's an integer
-             elif name in integer_variables:
-                 value = int(round(value))
+                     # Sample n_per_dim integers
+                     grid_1d.append(np.linspace(var['min'], var['max'], n_per_dim).astype(int))
+             elif var['type'] == 'categorical':
+                 # Categorical: use ACTUAL category values (not encoded)
+                 grid_1d.append(var['values'])
 
-             result[name] = value
-
-         # Convert to DataFrame
-         opt_point_df = pd.DataFrame([result])
+         # Generate test points using Cartesian product
+         X_test_tuples = list(product(*grid_1d))
 
-         # Get predicted value and std at optimum
-         pred_mean, pred_std = self.model.predict_with_std(opt_point_df)
+         # Convert to DataFrame with proper variable names and types
+         grid = pd.DataFrame(X_test_tuples, columns=var_names)
 
-         return {
-             'x_opt': opt_point_df,
-             'value': float(pred_mean[0]),
-             'std': float(pred_std[0])
-         }
+         # Ensure correct dtypes for categorical variables
+         for var in variables:
+             if var['type'] == 'categorical':
+                 grid[var['name']] = grid[var['name']].astype(str)
+
+         return grid
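Note on the rewrite above: find_optimum no longer optimizes in the encoded tensor space; it enumerates a grid in the original variable space and lets the model's predict() handle encoding. A self-contained sketch of the same idea, with a hypothetical three-variable search space and a random stand-in for the model's predictions:

    from itertools import product

    import numpy as np
    import pandas as pd

    # Hypothetical search space, in the dict format the diff assumes
    variables = [
        {'name': 'temperature', 'type': 'real', 'min': 20.0, 'max': 80.0},
        {'name': 'cycles', 'type': 'integer', 'min': 1, 'max': 5},
        {'name': 'solvent', 'type': 'categorical', 'values': ['water', 'ethanol']},
    ]

    n_grid_points = 1000
    n_per_dim = max(2, int(n_grid_points ** (1 / len(variables))))

    grid_1d = []
    for var in variables:
        if var['type'] == 'real':
            grid_1d.append(np.linspace(var['min'], var['max'], n_per_dim))
        elif var['type'] == 'integer':
            grid_1d.append(np.arange(var['min'], var['max'] + 1))
        else:  # categorical: original labels, no encoding
            grid_1d.append(var['values'])

    grid = pd.DataFrame(list(product(*grid_1d)), columns=[v['name'] for v in variables])

    # Stand-in for model.predict(grid, return_std=True)
    means = np.random.default_rng(0).normal(size=len(grid))
    best_idx = int(np.argmax(means))  # np.argmin when minimizing
    print(grid.iloc[[best_idx]])      # optimum, already in original variable space

The trade-off is dimensionality: the full grid has n_per_dim ** n_vars points, so per-dimension resolution drops quickly as variables are added.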
alchemist_core/data/experiment_manager.py
@@ -8,12 +8,20 @@ class ExperimentManager:
      """
      Class for storing and managing experimental data in a consistent way across backends.
      Provides methods for data access, saving/loading, and conversion to formats needed by different backends.
+
+     Supports both single-objective and multi-objective optimization:
+     - Single-objective: Uses a single target column (default: 'Output', but configurable)
+     - Multi-objective: Uses multiple target columns specified in the target_columns attribute
+
+     The target_columns parameter allows flexible column naming to support various CSV formats.
      """
-     def __init__(self, search_space=None):
+     def __init__(self, search_space=None, target_columns: Optional[List[str]] = None):
          self.df = pd.DataFrame()  # Raw experimental data
          self.search_space = search_space  # Reference to the search space
          self.filepath = None  # Path to saved experiment file
          self._current_iteration = 0  # Track current iteration for audit log
+         # Support flexible target column naming for both single and multi-objective
+         self.target_columns = target_columns or ['Output']  # Default to 'Output' for backward compatibility
 
      def set_search_space(self, search_space):
          """Set or update the search space reference."""
@@ -35,9 +43,9 @@ class ExperimentManager:
          # Create a copy of the point_dict to avoid modifying the original
          new_point = point_dict.copy()
 
-         # Add output value if provided
+         # Add output value if provided (use first target column for single-objective)
          if output_value is not None:
-             new_point['Output'] = output_value
+             new_point[self.target_columns[0]] = output_value
 
          # Add noise value if provided
          if noise_value is not None:
@@ -107,12 +115,20 @@ class ExperimentManager:
          Returns:
              X: Features DataFrame
              y: Target Series
+
+         Raises:
+             ValueError: If configured target column is not found in data
          """
-         if 'Output' not in self.df.columns:
-             raise ValueError("DataFrame doesn't contain 'Output' column")
+         target_col = self.target_columns[0]  # Use first target column for single-objective
 
-         # Drop metadata columns (Output, Noise, Iteration, Reason)
-         metadata_cols = ['Output']
+         if target_col not in self.df.columns:
+             raise ValueError(
+                 f"DataFrame doesn't contain target column '{target_col}'. "
+                 f"Available columns: {list(self.df.columns)}"
+             )
+
+         # Drop metadata columns (target, Noise, Iteration, Reason)
+         metadata_cols = self.target_columns.copy()
          if 'Noise' in self.df.columns:
              metadata_cols.append('Noise')
          if 'Iteration' in self.df.columns:
@@ -121,7 +137,7 @@ class ExperimentManager:
              metadata_cols.append('Reason')
 
          X = self.df.drop(columns=metadata_cols)
-         y = self.df['Output']
+         y = self.df[target_col]
          return X, y
 
      def get_features_target_and_noise(self) -> Tuple[pd.DataFrame, pd.Series, Optional[pd.Series]]:
@@ -132,12 +148,20 @@ class ExperimentManager:
              X: Features DataFrame
              y: Target Series
              noise: Noise Series if available, otherwise None
+
+         Raises:
+             ValueError: If configured target column is not found in data
          """
-         if 'Output' not in self.df.columns:
-             raise ValueError("DataFrame doesn't contain 'Output' column")
+         target_col = self.target_columns[0]  # Use first target column for single-objective
+
+         if target_col not in self.df.columns:
+             raise ValueError(
+                 f"DataFrame doesn't contain target column '{target_col}'. "
+                 f"Available columns: {list(self.df.columns)}"
+             )
 
          # Drop metadata columns
-         metadata_cols = ['Output']
+         metadata_cols = self.target_columns.copy()
          if 'Noise' in self.df.columns:
              metadata_cols.append('Noise')
          if 'Iteration' in self.df.columns:
@@ -146,7 +170,7 @@ class ExperimentManager:
              metadata_cols.append('Reason')
 
          X = self.df.drop(columns=metadata_cols)
-         y = self.df['Output']
+         y = self.df[target_col]
          noise = self.df['Noise'] if 'Noise' in self.df.columns else None
          return X, y, noise
 
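Both accessors above now share the same contract: the first entry of target_columns becomes y, and every configured target plus any Noise/Iteration/Reason metadata column is dropped from X. A small illustration of that split with plain pandas (column names invented for the example):

    import pandas as pd

    df = pd.DataFrame({
        'temperature': [25.0, 40.0],
        'yield_pct':   [61.2, 74.9],  # configured target column
        'Noise':       [0.5, 0.4],    # optional metadata
        'Iteration':   [1, 2],
    })

    target_columns = ['yield_pct']
    target_col = target_columns[0]

    metadata_cols = target_columns.copy()
    for col in ('Noise', 'Iteration', 'Reason'):
        if col in df.columns:
            metadata_cols.append(col)

    X = df.drop(columns=metadata_cols)  # only true features remain
    y = df[target_col]
    print(list(X.columns))              # ['temperature']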
@@ -224,3 +248,148 @@
 
      def __len__(self):
          return len(self.df)
+
+     def get_pareto_frontier(self, directions: Optional[List[str]] = None) -> pd.DataFrame:
+         """
+         Compute Pareto-optimal solutions from experiments with multiple objectives.
+
+         Uses BoTorch's fast non-dominated sorting algorithm to identify Pareto-optimal
+         points. Works with both single-objective (returns all data) and multi-objective
+         experiments.
+
+         Args:
+             directions: List of 'maximize' or 'minimize' for each target column.
+                 If None, assumes all objectives are maximized.
+                 Length must match number of target columns.
+
+         Returns:
+             DataFrame containing only Pareto-optimal experiments with all columns.
+
+         Raises:
+             ValueError: If directions length doesn't match target columns.
+             ValueError: If target columns contain missing data.
+
+         Example:
+             >>> # For 2 objectives: maximize yield, minimize cost
+             >>> pareto_df = exp_mgr.get_pareto_frontier(['maximize', 'minimize'])
+         """
+         import torch
+         from botorch.utils.multi_objective.pareto import is_non_dominated
+
+         if len(self.df) == 0:
+             return pd.DataFrame()
+
+         # Validate target columns exist
+         missing_cols = [col for col in self.target_columns if col not in self.df.columns]
+         if missing_cols:
+             raise ValueError(f"Target columns {missing_cols} not found in experiment data")
+
+         # Extract objective values
+         Y = self.df[self.target_columns].values
+
+         # Check for missing values
+         if pd.isna(Y).any():
+             raise ValueError("Target columns contain missing values (NaN). Cannot compute Pareto frontier.")
+
+         # Single objective case: return all data
+         if len(self.target_columns) == 1:
+             return self.df.copy()
+
+         # Set default directions if not provided
+         if directions is None:
+             directions = ['maximize'] * len(self.target_columns)
+
+         # Validate directions
+         if len(directions) != len(self.target_columns):
+             raise ValueError(
+                 f"Number of directions ({len(directions)}) must match number of "
+                 f"target columns ({len(self.target_columns)})"
+             )
+
+         # Convert objectives to maximization form (BoTorch assumes maximization)
+         Y_torch = torch.tensor(Y, dtype=torch.double)
+         for i, direction in enumerate(directions):
+             if direction.lower() == 'minimize':
+                 Y_torch[:, i] = -Y_torch[:, i]
+
+         # Compute non-dominated mask
+         nd_mask = is_non_dominated(Y_torch, maximize=True, deduplicate=True)
+
+         # Return Pareto-optimal experiments
+         return self.df[nd_mask.numpy()].copy()
+
+     def compute_hypervolume(self, ref_point: Union[List[float], np.ndarray],
+                             directions: Optional[List[str]] = None) -> float:
+         """
+         Compute hypervolume indicator for multi-objective experiments.
+
+         The hypervolume measures the volume of objective space dominated by the
+         Pareto frontier relative to a reference point. Larger values indicate
+         better overall performance.
+
+         Args:
+             ref_point: Reference point (worst acceptable values) for each objective.
+                 Must have same length as target_columns.
+                 For maximization: should be below minimum observed values.
+                 For minimization: should be above maximum observed values.
+             directions: List of 'maximize' or 'minimize' for each target column.
+                 If None, assumes all objectives are maximized.
+
+         Returns:
+             Hypervolume value (float). Zero if no Pareto-optimal points exist.
+
+         Raises:
+             ValueError: If ref_point length doesn't match target columns.
+             ValueError: If target columns contain missing data.
+
+         Example:
+             >>> # For 2 objectives (maximize yield, minimize cost)
+             >>> # ref_point = [min_acceptable_yield, max_acceptable_cost]
+             >>> hv = exp_mgr.compute_hypervolume([50.0, 100.0], ['maximize', 'minimize'])
+         """
+         import torch
+         from botorch.utils.multi_objective.hypervolume import Hypervolume
+
+         if len(self.df) == 0:
+             return 0.0
+
+         # Single objective case: not meaningful
+         if len(self.target_columns) == 1:
+             raise ValueError(
+                 "Hypervolume is only defined for multi-objective problems. "
+                 "For single-objective, use best observed value instead."
+             )
+
+         # Validate ref_point
+         ref_point = np.array(ref_point)
+         if len(ref_point) != len(self.target_columns):
+             raise ValueError(
+                 f"Reference point length ({len(ref_point)}) must match number of "
+                 f"target columns ({len(self.target_columns)})"
+             )
+
+         # Get Pareto frontier
+         pareto_df = self.get_pareto_frontier(directions)
+         if len(pareto_df) == 0:
+             return 0.0
+
+         # Set default directions if not provided
+         if directions is None:
+             directions = ['maximize'] * len(self.target_columns)
+
+         # Extract Pareto objectives and convert to torch tensors
+         Y_pareto = pareto_df[self.target_columns].values
+         Y_torch = torch.tensor(Y_pareto, dtype=torch.double)
+         ref_torch = torch.tensor(ref_point, dtype=torch.double)
+
+         # Convert to maximization form (BoTorch assumes maximization)
+         for i, direction in enumerate(directions):
+             if direction.lower() == 'minimize':
+                 Y_torch[:, i] = -Y_torch[:, i]
+                 ref_torch[i] = -ref_torch[i]
+
+         # Compute hypervolume
+         hv_calculator = Hypervolume(ref_point=ref_torch)
+         hv = hv_calculator.compute(Y_torch)
+
+         return float(hv)
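The two new methods wrap standard BoTorch multi-objective utilities. A self-contained sketch of the underlying calls on synthetic data (both objectives maximized, so no sign flips are needed):

    import torch
    from botorch.utils.multi_objective.hypervolume import Hypervolume
    from botorch.utils.multi_objective.pareto import is_non_dominated

    # Four experiments, two objectives, both to maximize
    Y = torch.tensor([
        [1.0, 4.0],
        [2.0, 3.0],
        [3.0, 1.0],
        [1.5, 2.0],  # dominated by [2.0, 3.0]
    ], dtype=torch.double)

    mask = is_non_dominated(Y)  # BoTorch assumes maximization
    pareto = Y[mask]            # the first three rows survive

    # Reference point must be dominated by every Pareto point
    hv = Hypervolume(ref_point=torch.tensor([0.0, 0.0], dtype=torch.double))
    print(hv.compute(pareto))   # 8.0: area dominated above (0, 0)

For minimization objectives, the methods negate the relevant columns (and the reference point) before these calls, which is exactly the sign flip visible in the diff.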