gpclarity 0.0.2__py3-none-any.whl

@@ -0,0 +1,647 @@
+ """
+ Uncertainty profiling and analysis for Gaussian Process models.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from dataclasses import dataclass, field
+ from enum import Enum, auto
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Protocol, Tuple
+
+ import numpy as np
+
+ from gpclarity.exceptions import UncertaintyError
+ from gpclarity.utils import _validate_array
+
+ if TYPE_CHECKING:
+     import matplotlib.pyplot as plt
+
+ logger = logging.getLogger(__name__)
+
+
+ class UncertaintyRegion(Enum):
+     """Classification of uncertainty regions."""
+     EXTRAPOLATION = auto()  # Far from training data
+     INTERPOLATION = auto()  # Within the training data's convex hull
+     BOUNDARY = auto()       # Near the edge of the training data
+     HIGH_NOISE = auto()     # High aleatoric (noise) uncertainty
+     STRUCTURAL = auto()     # High epistemic (model) uncertainty
+
+
+ @dataclass(frozen=True)
+ class UncertaintyConfig:
+     """Configurable parameters for uncertainty analysis."""
+     min_variance: float = 1e-10
+     max_variance: float = 1e10
+     default_confidence_level: float = 2.0
+     high_uncertainty_percentile: float = 90.0
+     calibration_bins: int = 10
+     numerical_jitter: float = 1e-9
+
+     def __post_init__(self):
+         if self.min_variance <= 0:
+             raise ValueError("min_variance must be positive")
+         if self.max_variance <= self.min_variance:
+             raise ValueError("max_variance must be > min_variance")
+         if self.default_confidence_level <= 0:
+             raise ValueError("default_confidence_level must be positive")
+         if not 0 < self.high_uncertainty_percentile < 100:
+             raise ValueError("high_uncertainty_percentile must be in (0, 100)")
+
+
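+ # Example (editor's illustration): the frozen dataclass validates itself on
+ # construction, so invalid settings fail fast.
+ #
+ #     config = UncertaintyConfig(high_uncertainty_percentile=95.0)
+ #     UncertaintyConfig(min_variance=-1.0)  # raises ValueError
+
+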
+ @dataclass
+ class UncertaintyDiagnostics:
+     """Comprehensive uncertainty diagnostics."""
+     mean_uncertainty: float
+     median_uncertainty: float
+     max_uncertainty: float
+     min_uncertainty: float
+     std_uncertainty: float
+     total_uncertainty: float
+     high_uncertainty_ratio: float
+     uncertainty_skewness: float
+     uncertainty_kurtosis: float
+     coefficient_of_variation: float
+
+     # Spatial characteristics
+     n_extrapolation_points: int = 0
+     n_boundary_points: int = 0
+     uncertainty_gradient_mean: float = 0.0
+
+     @property
+     def is_well_calibrated(self) -> bool:
+         """Check whether the uncertainty distribution looks reasonable."""
+         # Heuristic: the coefficient of variation should be moderate, not extreme
+         return 0.1 < self.coefficient_of_variation < 10.0
+
+
+ @dataclass
+ class PredictionResult:
+     """Container for a prediction with uncertainty."""
+     mean: np.ndarray
+     variance: np.ndarray
+     std: np.ndarray
+     confidence_intervals: Dict[str, Tuple[np.ndarray, np.ndarray]] = field(default_factory=dict)
+
+     def __post_init__(self):
+         # Defensive: tolerate callers that pass None explicitly
+         if self.confidence_intervals is None:
+             self.confidence_intervals = {}
+
+     def get_interval(self, level: float) -> Tuple[np.ndarray, np.ndarray]:
+         """Get the confidence interval at the specified sigma level, computing it lazily."""
+         key = f"{level:.1f}sigma"
+         if key not in self.confidence_intervals:
+             lower = self.mean - level * self.std
+             upper = self.mean + level * self.std
+             self.confidence_intervals[key] = (lower, upper)
+         return self.confidence_intervals[key]
+
+
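+ # Example (editor's illustration): intervals are cached under a "<level>sigma"
+ # key, so repeated requests at the same level reuse the stored bounds. `mu` and
+ # `var` are placeholder arrays from any GP prediction.
+ #
+ #     result = PredictionResult(mean=mu, variance=var, std=np.sqrt(var))
+ #     lower, upper = result.get_interval(2.0)  # mu ± 2·std
+
+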
+ class UncertaintyQuantifier(Protocol):
+     """Protocol for custom uncertainty quantification methods."""
+     def __call__(
+         self,
+         model: Any,
+         X_test: np.ndarray,
+         X_train: Optional[np.ndarray] = None,
+     ) -> Tuple[np.ndarray, np.ndarray]:
+         ...
+
+
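+ # Example (editor's illustration): any callable with this signature satisfies
+ # the protocol. `distance_inflated` is a hypothetical quantifier, not part of
+ # the package.
+ #
+ #     def distance_inflated(model, X_test, X_train=None):
+ #         mean, var = model.predict(X_test)
+ #         return mean, var  # plug custom variance-inflation logic in here
+
+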
+ class UncertaintyProfiler:
+     """
+     Analyze and visualize uncertainty behavior across the input space.
+
+     Provides comprehensive uncertainty quantification, including:
+     - Point predictions with calibrated intervals
+     - Spatial uncertainty analysis
+     - Extrapolation detection
+     - Uncertainty decomposition (aleatoric vs. epistemic)
+     """
+
+     def __init__(
+         self,
+         model: Any,
+         config: Optional[UncertaintyConfig] = None,
+         X_train: Optional[np.ndarray] = None,
+     ):
+         """
+         Initialize the profiler with a GP model.
+
+         Args:
+             model: Trained GP model with a predict() method
+             config: Uncertainty configuration
+             X_train: Training data for extrapolation detection (optional)
+         """
+         if not hasattr(model, "predict"):
+             raise UncertaintyError("Model must have a predict() method")
+
+         self.model = model
+         self.config = config or UncertaintyConfig()
+         self.X_train = X_train
+
+         # Caches for expensive computations
+         self._prediction_cache: Dict[str, PredictionResult] = {}
+         self._diagnostics_cache: Optional[UncertaintyDiagnostics] = None
+
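+     # Example (editor's illustration; `gp` stands for any fitted model whose
+     # predict(X) returns a (mean, variance) pair, as this class assumes):
+     #
+     #     profiler = UncertaintyProfiler(gp, X_train=X_train)
+     #     pred = profiler.predict(X_grid)
+     #     diag = profiler.compute_diagnostics(X_grid)
+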
+     def predict(
+         self,
+         X_test: np.ndarray,
+         *,
+         return_covariance: bool = False,
+         cache_key: Optional[str] = None,
+     ) -> PredictionResult:
+         """
+         Safe prediction with comprehensive uncertainty quantification.
+
+         Args:
+             X_test: Test input locations (n_test, n_features)
+             return_covariance: If True, request the full covariance from the
+                 model (when supported); only its diagonal is kept as the
+                 per-point variance
+             cache_key: Optional key for caching results
+
+         Returns:
+             PredictionResult with mean, variance, std, and intervals
+
+         Raises:
+             UncertaintyError: If prediction fails
+         """
+         # Check the cache first
+         if cache_key and cache_key in self._prediction_cache:
+             return self._prediction_cache[cache_key]
+
+         X_test = _validate_array(X_test, "X_test")
+
+         try:
+             # Handle different model interfaces
+             if return_covariance and hasattr(self.model, "predict_full_cov"):
+                 mean, var = self.model.predict_full_cov(X_test)
+                 var = np.diag(var)  # Keep only the diagonal for consistency
+             else:
+                 mean, var = self.model.predict(X_test)
+
+             # Ensure correct shapes
+             mean = np.atleast_1d(mean).flatten()
+             var = np.atleast_1d(var).flatten()
+
+             # Numerical safety: clamp the variance to a sane range
+             var = np.clip(
+                 var,
+                 self.config.min_variance,
+                 self.config.max_variance,
+             )
+
+             # Check for issues
+             if not np.all(np.isfinite(mean)):
+                 n_invalid = np.sum(~np.isfinite(mean))
+                 logger.warning(f"{n_invalid} predictions are non-finite")
+
+             result = PredictionResult(
+                 mean=mean,
+                 variance=var,
+                 std=np.sqrt(var),
+             )
+
+             # Cache if requested
+             if cache_key:
+                 self._prediction_cache[cache_key] = result
+
+             return result
+
+         except Exception as e:
+             raise UncertaintyError(f"Prediction failed: {e}") from e
+
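+     # Example (editor's illustration): caching avoids re-running the model when
+     # the same grid is analyzed repeatedly. `X_grid` is a placeholder array.
+     #
+     #     pred = profiler.predict(X_grid, cache_key="grid")
+     #     lo, hi = pred.get_interval(2.0)
+     #     pred2 = profiler.predict(X_grid, cache_key="grid")  # served from cache
+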
+     def compute_diagnostics(
+         self,
+         X_test: np.ndarray,
+         force_recompute: bool = False,
+     ) -> UncertaintyDiagnostics:
+         """
+         Compute comprehensive spatial uncertainty metrics.
+
+         Note that the cache is not keyed on X_test: pass force_recompute=True
+         when evaluating a different set of test locations.
+
+         Args:
+             X_test: Test input locations
+             force_recompute: Ignore the cache if True
+
+         Returns:
+             UncertaintyDiagnostics with detailed statistics
+         """
+         if not force_recompute and self._diagnostics_cache is not None:
+             return self._diagnostics_cache
+
+         pred = self.predict(X_test)
+         var = pred.variance
+
+         # Basic statistics
+         mean_var = float(np.mean(var))
+         median_var = float(np.median(var))
+         max_var = float(np.max(var))
+         min_var = float(np.min(var))
+         std_var = float(np.std(var))
+
+         # Advanced statistics
+         skew = self._compute_skewness(var)
+         kurt = self._compute_kurtosis(var)
+         cv = std_var / mean_var if mean_var > 0 else 0.0
+
+         # High-uncertainty ratio. With a percentile-based threshold this is
+         # roughly (100 - percentile) / 100 by construction; it deviates only
+         # when many variances tie at the threshold.
+         threshold = np.percentile(var, self.config.high_uncertainty_percentile)
+         high_unc_ratio = float(np.mean(var > threshold))
+
+         # Spatial analysis
+         n_ext, n_bound = 0, 0
+         grad_mean = 0.0
+
+         if self.X_train is not None:
+             regions = self.classify_regions(X_test)
+             n_ext = int(np.sum(regions == UncertaintyRegion.EXTRAPOLATION))
+             n_bound = int(np.sum(regions == UncertaintyRegion.BOUNDARY))
+
+         # Uncertainty gradient (how fast uncertainty changes), 1D inputs only
+         if X_test.shape[0] > 1 and X_test.shape[1] == 1:
+             sorted_idx = np.argsort(X_test.flatten())
+             var_sorted = var[sorted_idx]
+             gradients = np.abs(np.diff(var_sorted))
+             grad_mean = float(np.mean(gradients)) if len(gradients) > 0 else 0.0
+
+         diagnostics = UncertaintyDiagnostics(
+             mean_uncertainty=mean_var,
+             median_uncertainty=median_var,
+             max_uncertainty=max_var,
+             min_uncertainty=min_var,
+             std_uncertainty=std_var,
+             total_uncertainty=float(np.sum(var)),
+             high_uncertainty_ratio=high_unc_ratio,
+             uncertainty_skewness=skew,
+             uncertainty_kurtosis=kurt,
+             coefficient_of_variation=cv,
+             n_extrapolation_points=n_ext,
+             n_boundary_points=n_bound,
+             uncertainty_gradient_mean=grad_mean,
+         )
+
+         self._diagnostics_cache = diagnostics
+         return diagnostics
+
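+     # Example (editor's illustration): the coefficient of variation is simply
+     # std(var) / mean(var), so a flat uncertainty surface gives CV near 0.
+     #
+     #     diag = profiler.compute_diagnostics(X_grid)
+     #     print(diag.coefficient_of_variation, diag.is_well_calibrated)
+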
+     def classify_regions(
+         self,
+         X_test: np.ndarray,
+         X_train: Optional[np.ndarray] = None,
+     ) -> np.ndarray:
+         """
+         Classify test points by uncertainty region type.
+
+         Args:
+             X_test: Test locations
+             X_train: Training data (uses self.X_train if None)
+
+         Returns:
+             Array of UncertaintyRegion enums
+         """
+         # `or` is unsafe on arrays (ambiguous truth value), so test for None
+         if X_train is None:
+             X_train = self.X_train
+         if X_train is None:
+             logger.warning("No training data provided; all points marked as extrapolation")
+             return np.full(X_test.shape[0], UncertaintyRegion.EXTRAPOLATION)
+
+         X_test = _validate_array(X_test, "X_test")
+         X_train = _validate_array(X_train, "X_train")
+
+         regions = np.full(X_test.shape[0], UncertaintyRegion.INTERPOLATION, dtype=object)
+
+         # Training data hull (range check for 1D, distance heuristic otherwise)
+         if X_train.shape[1] == 1:
+             # 1D: check whether each point lies within the training range
+             train_min, train_max = np.min(X_train), np.max(X_train)
+             tolerance = 0.05 * (train_max - train_min)
+
+             for i, x in enumerate(X_test.ravel()):
+                 if x < train_min - tolerance or x > train_max + tolerance:
+                     regions[i] = UncertaintyRegion.EXTRAPOLATION
+                 elif x < train_min + tolerance or x > train_max - tolerance:
+                     regions[i] = UncertaintyRegion.BOUNDARY
+         else:
+             # Multi-dimensional: heuristic based on the distance from each test
+             # point to its nearest training point
+             distances = self._compute_distances_to_train(X_test, X_train)
+             max_dist = np.max(distances)
+             ext_threshold = max_dist * 0.5
+
+             for i, d in enumerate(distances):
+                 if d > ext_threshold:
+                     regions[i] = UncertaintyRegion.EXTRAPOLATION
+                 elif d > max_dist * 0.1:
+                     # Moderately far from the data: treat as a boundary band
+                     regions[i] = UncertaintyRegion.BOUNDARY
+
+         # Refine with the uncertainty magnitude
+         pred = self.predict(X_test)
+         high_unc_mask = pred.variance > np.percentile(pred.variance, 75)
+
+         for i in np.where(high_unc_mask)[0]:
+             if regions[i] == UncertaintyRegion.INTERPOLATION:
+                 regions[i] = UncertaintyRegion.STRUCTURAL
+
+         return regions
+
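+     # Example (editor's illustration): tally the region labels for a grid.
+     #
+     #     regions = profiler.classify_regions(X_grid)
+     #     n_extrap = int(np.sum(regions == UncertaintyRegion.EXTRAPOLATION))
+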
+     def identify_uncertainty_regions(
+         self,
+         X_test: np.ndarray,
+         threshold_percentile: Optional[float] = None,
+         return_regions: bool = False,
+     ) -> Dict[str, Any]:
+         """
+         Identify and characterize high/low uncertainty regions.
+
+         Args:
+             X_test: Input locations
+             threshold_percentile: Percentile threshold for "high" uncertainty
+             return_regions: If True, include the full region classification
+
+         Returns:
+             Dictionary with region characteristics and statistics
+         """
+         if threshold_percentile is None:
+             threshold_percentile = self.config.high_uncertainty_percentile
+
+         pred = self.predict(X_test)
+         var = pred.variance  # already flattened by predict()
+         threshold = np.percentile(var, threshold_percentile)
+
+         high_unc_mask = var > threshold
+         low_unc_mask = ~high_unc_mask
+
+         result = {
+             "high_uncertainty": {
+                 "points": X_test[high_unc_mask],
+                 "values": var[high_unc_mask],
+                 "indices": np.where(high_unc_mask)[0],
+                 "mean_uncertainty": float(np.mean(var[high_unc_mask])) if np.any(high_unc_mask) else 0.0,
+             },
+             "low_uncertainty": {
+                 "points": X_test[low_unc_mask],
+                 "values": var[low_unc_mask],
+                 "indices": np.where(low_unc_mask)[0],
+                 "mean_uncertainty": float(np.mean(var[low_unc_mask])) if np.any(low_unc_mask) else 0.0,
+             },
+             "threshold": float(threshold),
+             "threshold_percentile": threshold_percentile,
+             "total_points": X_test.shape[0],
+             "high_uncertainty_ratio": float(np.mean(high_unc_mask)),
+         }
+
+         if return_regions:
+             regions = self.classify_regions(X_test)
+             result["region_breakdown"] = {
+                 region.name: int(np.sum(regions == region))
+                 for region in UncertaintyRegion
+             }
+             result["regions"] = regions
+
+         return result
+
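+     # Example (editor's illustration): pull out the most uncertain grid points
+     # as candidates for new measurements.
+     #
+     #     report = profiler.identify_uncertainty_regions(X_grid, return_regions=True)
+     #     candidates = report["high_uncertainty"]["points"]
+     #     print(report["region_breakdown"])
+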
+     def calibrate_uncertainty(
+         self,
+         X_val: np.ndarray,
+         y_val: np.ndarray,
+         method: str = "scaling",
+     ) -> Dict[str, Any]:
+         """
+         Calibrate uncertainty estimates against validation data.
+
+         Args:
+             X_val: Validation inputs
+             y_val: Validation targets
+             method: Calibration method ('scaling' or 'none'; any other value
+                 falls through to 'none')
+
+         Returns:
+             Calibration parameters and metrics
+         """
+         pred = self.predict(X_val)
+         residuals = np.abs(y_val.flatten() - pred.mean)
+         coverage = residuals < (self.config.default_confidence_level * pred.std)
+
+         empirical_coverage = float(np.mean(coverage))
+         target_coverage = 0.95  # Conventional target; a 2-sigma Gaussian interval covers ~95.45%
+
+         if method == "scaling":
+             # Find the sigma rescaling that best matches the target coverage
+             from scipy.optimize import minimize_scalar
+
+             def objective(scale):
+                 scaled_std = pred.std * scale
+                 cov = np.mean(residuals < (self.config.default_confidence_level * scaled_std))
+                 return (cov - target_coverage) ** 2
+
+             result = minimize_scalar(objective, bounds=(0.1, 10.0), method="bounded")
+             optimal_scale = result.x
+
+             return {
+                 "method": "scaling",
+                 "optimal_scale": float(optimal_scale),
+                 "original_coverage": empirical_coverage,
+                 "target_coverage": target_coverage,
+                 "miscalibration": float(abs(empirical_coverage - target_coverage)),
+             }
+
+         return {
+             "method": "none",
+             "empirical_coverage": empirical_coverage,
+             "target_coverage": target_coverage,
+         }
+
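+     # Example (editor's illustration): apply the fitted scale to future
+     # standard deviations to widen (or tighten) the intervals. `X_val`, `y_val`
+     # and `X_new` are placeholder arrays.
+     #
+     #     cal = profiler.calibrate_uncertainty(X_val, y_val, method="scaling")
+     #     pred = profiler.predict(X_new)
+     #     calibrated_std = pred.std * cal["optimal_scale"]
+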
+     def plot(
+         self,
+         X_test: np.ndarray,
+         *,
+         X_train: Optional[np.ndarray] = None,
+         y_train: Optional[np.ndarray] = None,
+         y_test: Optional[np.ndarray] = None,
+         ax: Optional["plt.Axes"] = None,
+         confidence_levels: Tuple[float, ...] = (1.0, 2.0),
+         plot_std: bool = False,
+         fill_alpha: float = 0.2,
+         color_mean: str = "#1f77b4",
+         color_fill: str = "#1f77b4",
+         color_train: str = "red",
+         show_regions: bool = False,
+         **kwargs,
+     ) -> "plt.Axes":
+         """
+         Comprehensive uncertainty visualization.
+
+         Delegates to the plotting module for rendering.
+         """
+         from gpclarity.plotting import plot_uncertainty_profile
+
+         # `or` is unsafe on arrays (ambiguous truth value), so test for None
+         if X_train is None:
+             X_train = self.X_train
+
+         return plot_uncertainty_profile(
+             self,
+             X_test,
+             X_train=X_train,
+             y_train=y_train,
+             y_test=y_test,
+             ax=ax,
+             confidence_levels=confidence_levels,
+             plot_std=plot_std,
+             fill_alpha=fill_alpha,
+             color_mean=color_mean,
+             color_fill=color_fill,
+             color_train=color_train,
+             show_regions=show_regions,
+             **kwargs,
+         )
+
+     def get_summary(self, X_test: np.ndarray) -> Dict[str, Any]:
+         """
+         Generate a comprehensive uncertainty summary report.
+
+         Args:
+             X_test: Test locations for analysis
+
+         Returns:
+             Dictionary with summary statistics and recommendations
+         """
+         diagnostics = self.compute_diagnostics(X_test)
+         regions = self.identify_uncertainty_regions(X_test, return_regions=True)
+
+         # Generate recommendations
+         recommendations = []
+
+         if diagnostics.coefficient_of_variation > 5.0:
+             recommendations.append(
+                 "Highly variable uncertainty: consider adaptive sampling"
+             )
+
+         # NOTE: with a percentile-based threshold this ratio is roughly
+         # (100 - percentile) / 100 by construction, so this check only fires
+         # for heavily tied variances.
+         if regions["high_uncertainty_ratio"] > 0.5:
+             recommendations.append(
+                 "More than 50% of points are high-uncertainty: the model needs more data"
+             )
+
+         if diagnostics.n_extrapolation_points > 0:
+             recommendations.append(
+                 f"{diagnostics.n_extrapolation_points} extrapolation points: "
+                 "predictions are unreliable in these regions"
+             )
+
+         if not diagnostics.is_well_calibrated:
+             recommendations.append(
+                 "Unusual uncertainty distribution: check the model specification"
+             )
+
+         return {
+             "diagnostics": {
+                 "mean_uncertainty": diagnostics.mean_uncertainty,
+                 "uncertainty_range": [
+                     diagnostics.min_uncertainty,
+                     diagnostics.max_uncertainty,
+                 ],
+                 "high_uncertainty_ratio": diagnostics.high_uncertainty_ratio,
+                 "extrapolation_points": diagnostics.n_extrapolation_points,
+             },
+             "regions": regions.get("region_breakdown", {}),
+             "recommendations": recommendations,
+             "well_specified": diagnostics.is_well_calibrated,
+         }
+
+     def clear_cache(self) -> None:
+         """Clear the internal prediction and diagnostics caches."""
+         self._prediction_cache.clear()
+         self._diagnostics_cache = None
+
+     @staticmethod
+     def _compute_skewness(arr: np.ndarray) -> float:
+         """Compute the Fisher-Pearson skewness coefficient."""
+         if len(arr) < 3:
+             return 0.0
+         mean, std = np.mean(arr), np.std(arr)
+         if std == 0:
+             return 0.0
+         return float(np.mean(((arr - mean) / std) ** 3))
+
+     @staticmethod
+     def _compute_kurtosis(arr: np.ndarray) -> float:
+         """Compute excess kurtosis (0 for a normal distribution)."""
+         if len(arr) < 4:
+             return 0.0
+         mean, std = np.mean(arr), np.std(arr)
+         if std == 0:
+             return 0.0
+         return float(np.mean(((arr - mean) / std) ** 4) - 3.0)
+
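+     # These are the biased (population) moment estimators: skewness is
+     # m3 / m2**1.5 and excess kurtosis is m4 / m2**2 - 3, where m_k is the
+     # k-th central moment. They should agree with scipy.stats.skew(arr) and
+     # scipy.stats.kurtosis(arr) under the defaults (fisher=True, bias=True),
+     # which can serve as a cross-check.
+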
+     def _compute_distances_to_train(
+         self,
+         X_test: np.ndarray,
+         X_train: np.ndarray,
+     ) -> np.ndarray:
+         """Compute the minimum distance from each test point to the training set."""
+         if X_train.shape[0] > 1000:
+             # Use scikit-learn's tree-based nearest-neighbor search for large
+             # training sets; it is exact but much faster than brute force
+             try:
+                 from sklearn.neighbors import NearestNeighbors
+                 nn = NearestNeighbors(n_neighbors=1, algorithm="auto")
+                 nn.fit(X_train)
+                 distances, _ = nn.kneighbors(X_test)
+                 return distances.flatten()
+             except ImportError:
+                 pass
+
+         # Brute-force fallback: one pass over the training set per test point
+         distances = np.zeros(X_test.shape[0])
+         for i, x in enumerate(X_test):
+             dists = np.linalg.norm(X_train - x, axis=1)
+             distances[i] = np.min(dists)
+         return distances
+
+
+ # High-level convenience functions
+ def quick_uncertainty_check(
+     model: Any,
+     X_test: np.ndarray,
+     X_train: Optional[np.ndarray] = None,
+ ) -> str:
+     """
+     One-line uncertainty assessment.
+
+     Returns:
+         A human-readable uncertainty summary
+     """
+     try:
+         profiler = UncertaintyProfiler(model, X_train=X_train)
+         diag = profiler.compute_diagnostics(X_test)
+
+         status = "Well-calibrated" if diag.is_well_calibrated else "Poorly calibrated"
+         return (
+             f"{status}: mean σ²={diag.mean_uncertainty:.3e}, "
+             f"CV={diag.coefficient_of_variation:.2f}, "
+             f"{diag.n_extrapolation_points} extrapolation points"
+         )
+     except Exception as e:
+         return f"Uncertainty check failed: {e}"
+
+
+ def compare_uncertainty_profiles(
+     models: Dict[str, Any],
+     X_test: np.ndarray,
+     X_train: Optional[np.ndarray] = None,
+ ) -> Dict[str, Optional[UncertaintyDiagnostics]]:
+     """
+     Compare uncertainty profiles across multiple models.
+
+     Args:
+         models: Dictionary mapping model names to model objects
+         X_test: Test locations
+         X_train: Optional training data
+
+     Returns:
+         Dictionary mapping model names to their diagnostics
+         (None for models that could not be profiled)
+     """
+     results = {}
+     for name, model in models.items():
+         try:
+             profiler = UncertaintyProfiler(model, X_train=X_train)
+             results[name] = profiler.compute_diagnostics(X_test)
+         except Exception as e:
+             logger.error(f"Failed to profile {name}: {e}")
+             results[name] = None
+     return results
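+
+
+ if __name__ == "__main__":
+     # Minimal smoke test (editor's sketch, not part of the released wheel).
+     # _MockGP is a hypothetical stand-in with the predict() interface that
+     # UncertaintyProfiler expects: predict(X) -> (mean, variance).
+     class _MockGP:
+         def predict(self, X):
+             X = np.asarray(X, dtype=float)
+             mean = np.sin(2.0 * np.pi * X).ravel()
+             var = 0.01 + 0.1 * (X.ravel() - 0.5) ** 2
+             return mean, var
+
+     rng = np.random.default_rng(0)
+     X_tr = rng.uniform(0.0, 1.0, size=(20, 1))
+     X_te = np.linspace(-0.5, 1.5, 50).reshape(-1, 1)
+     print(quick_uncertainty_check(_MockGP(), X_te, X_train=X_tr))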