machinegnostics-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. __init__.py +0 -0
  2. machinegnostics/__init__.py +24 -0
  3. machinegnostics/magcal/__init__.py +37 -0
  4. machinegnostics/magcal/characteristics.py +460 -0
  5. machinegnostics/magcal/criteria_eval.py +268 -0
  6. machinegnostics/magcal/criterion.py +140 -0
  7. machinegnostics/magcal/data_conversion.py +381 -0
  8. machinegnostics/magcal/gcor.py +64 -0
  9. machinegnostics/magcal/gdf/__init__.py +2 -0
  10. machinegnostics/magcal/gdf/base_df.py +39 -0
  11. machinegnostics/magcal/gdf/base_distfunc.py +1202 -0
  12. machinegnostics/magcal/gdf/base_egdf.py +823 -0
  13. machinegnostics/magcal/gdf/base_eldf.py +830 -0
  14. machinegnostics/magcal/gdf/base_qgdf.py +1234 -0
  15. machinegnostics/magcal/gdf/base_qldf.py +1019 -0
  16. machinegnostics/magcal/gdf/cluster_analysis.py +456 -0
  17. machinegnostics/magcal/gdf/data_cluster.py +975 -0
  18. machinegnostics/magcal/gdf/data_intervals.py +853 -0
  19. machinegnostics/magcal/gdf/data_membership.py +536 -0
  20. machinegnostics/magcal/gdf/der_egdf.py +243 -0
  21. machinegnostics/magcal/gdf/distfunc_engine.py +841 -0
  22. machinegnostics/magcal/gdf/egdf.py +324 -0
  23. machinegnostics/magcal/gdf/eldf.py +297 -0
  24. machinegnostics/magcal/gdf/eldf_intv.py +609 -0
  25. machinegnostics/magcal/gdf/eldf_ma.py +627 -0
  26. machinegnostics/magcal/gdf/homogeneity.py +1218 -0
  27. machinegnostics/magcal/gdf/intv_engine.py +1523 -0
  28. machinegnostics/magcal/gdf/marginal_intv_analysis.py +558 -0
  29. machinegnostics/magcal/gdf/qgdf.py +289 -0
  30. machinegnostics/magcal/gdf/qldf.py +296 -0
  31. machinegnostics/magcal/gdf/scedasticity.py +197 -0
  32. machinegnostics/magcal/gdf/wedf.py +181 -0
  33. machinegnostics/magcal/gdf/z0_estimator.py +1047 -0
  34. machinegnostics/magcal/layer_base.py +42 -0
  35. machinegnostics/magcal/layer_history_base.py +74 -0
  36. machinegnostics/magcal/layer_io_process_base.py +238 -0
  37. machinegnostics/magcal/layer_param_base.py +448 -0
  38. machinegnostics/magcal/mg_weights.py +36 -0
  39. machinegnostics/magcal/sample_characteristics.py +532 -0
  40. machinegnostics/magcal/scale_optimization.py +185 -0
  41. machinegnostics/magcal/scale_param.py +313 -0
  42. machinegnostics/magcal/util/__init__.py +0 -0
  43. machinegnostics/magcal/util/dis_docstring.py +18 -0
  44. machinegnostics/magcal/util/logging.py +24 -0
  45. machinegnostics/magcal/util/min_max_float.py +34 -0
  46. machinegnostics/magnet/__init__.py +0 -0
  47. machinegnostics/metrics/__init__.py +28 -0
  48. machinegnostics/metrics/accu.py +61 -0
  49. machinegnostics/metrics/accuracy.py +67 -0
  50. machinegnostics/metrics/auto_correlation.py +183 -0
  51. machinegnostics/metrics/auto_covariance.py +204 -0
  52. machinegnostics/metrics/cls_report.py +130 -0
  53. machinegnostics/metrics/conf_matrix.py +93 -0
  54. machinegnostics/metrics/correlation.py +178 -0
  55. machinegnostics/metrics/cross_variance.py +167 -0
  56. machinegnostics/metrics/divi.py +82 -0
  57. machinegnostics/metrics/evalmet.py +109 -0
  58. machinegnostics/metrics/f1_score.py +128 -0
  59. machinegnostics/metrics/gmmfe.py +108 -0
  60. machinegnostics/metrics/hc.py +141 -0
  61. machinegnostics/metrics/mae.py +72 -0
  62. machinegnostics/metrics/mean.py +117 -0
  63. machinegnostics/metrics/median.py +122 -0
  64. machinegnostics/metrics/mg_r2.py +167 -0
  65. machinegnostics/metrics/mse.py +78 -0
  66. machinegnostics/metrics/precision.py +119 -0
  67. machinegnostics/metrics/r2.py +122 -0
  68. machinegnostics/metrics/recall.py +108 -0
  69. machinegnostics/metrics/rmse.py +77 -0
  70. machinegnostics/metrics/robr2.py +119 -0
  71. machinegnostics/metrics/std.py +144 -0
  72. machinegnostics/metrics/variance.py +101 -0
  73. machinegnostics/models/__init__.py +2 -0
  74. machinegnostics/models/classification/__init__.py +1 -0
  75. machinegnostics/models/classification/layer_history_log_reg.py +121 -0
  76. machinegnostics/models/classification/layer_io_process_log_reg.py +98 -0
  77. machinegnostics/models/classification/layer_mlflow_log_reg.py +107 -0
  78. machinegnostics/models/classification/layer_param_log_reg.py +275 -0
  79. machinegnostics/models/classification/mg_log_reg.py +273 -0
  80. machinegnostics/models/cross_validation.py +118 -0
  81. machinegnostics/models/data_split.py +106 -0
  82. machinegnostics/models/regression/__init__.py +2 -0
  83. machinegnostics/models/regression/layer_histroy_rob_reg.py +139 -0
  84. machinegnostics/models/regression/layer_io_process_rob_rig.py +88 -0
  85. machinegnostics/models/regression/layer_mlflow_rob_reg.py +134 -0
  86. machinegnostics/models/regression/layer_param_rob_reg.py +212 -0
  87. machinegnostics/models/regression/mg_lin_reg.py +253 -0
  88. machinegnostics/models/regression/mg_poly_reg.py +258 -0
  89. machinegnostics-0.0.1.dist-info/METADATA +246 -0
  90. machinegnostics-0.0.1.dist-info/RECORD +93 -0
  91. machinegnostics-0.0.1.dist-info/WHEEL +5 -0
  92. machinegnostics-0.0.1.dist-info/licenses/LICENSE +674 -0
  93. machinegnostics-0.0.1.dist-info/top_level.txt +2 -0
machinegnostics/magcal/gdf/homogeneity.py
@@ -0,0 +1,1218 @@
1
+ import numpy as np
2
+ import warnings
3
+ import matplotlib.pyplot as plt
4
+ from scipy.signal import find_peaks
5
+ from scipy.ndimage import gaussian_filter1d
6
+ from typing import Union, Dict, Any, Optional, Tuple, List
7
+ from machinegnostics.magcal import EGDF
8
+ import logging
9
+ from machinegnostics.magcal.util.logging import get_logger
10
+
11
+ class DataHomogeneity:
12
+ """
13
+ Analyze data homogeneity for EGDF objects using probability density function analysis.
14
+
15
+ This class provides comprehensive homogeneity analysis for Estimating Global Distribution Functions (EGDF)
16
+ by examining the shape and characteristics of their probability density functions (PDF). The
17
+ homogeneity criterion is based on the mathematical properties and expected PDF behavior of EGDF
18
+ according to gnostic theory principles.
19
+
20
+ **Gnostic Theory Foundation:**
21
+
22
+ The EGDF is uniquely determined by the data sample and finds the optimal scale parameter automatically.
23
+ Unlike local distribution functions, EGDF has limited flexibility and provides a unique representation
24
+ for each homogeneous data sample. The key principle is that homogeneous data should produce a
25
+ distribution with a single density maximum, while non-homogeneous data will exhibit multiple maxima
26
+ or negative density values.
27
+
28
+ **Homogeneity Criteria:**
29
+
30
+ - **EGDF (Estimating Global Distribution Function)**: Data is considered homogeneous if:
31
+ 1. PDF has exactly one global maximum (single peak)
32
+ 2. PDF contains no negative values
33
+
34
+ **EGDF Characteristics:**
35
+
36
+ - **Uniqueness**: EGDF finds the best scale parameter automatically, providing a unique model
37
+ - **Robustness**: EGDF is robust with respect to outliers
38
+ - **Homogeneity Testing**: Particularly suitable for reliable data homogeneity testing
39
+ - **Global Nature**: Uses normalized weights resulting in limited flexibility controlled by optimal scale
40
+ - **Data-Driven**: Primary parameters are the data themselves, following gnostic "let data speak" principle
41
+
42
+ **Non-Homogeneity Detection:**
43
+
44
+ EGDF can sensitively detect two main causes of non-homogeneity:
45
+ 1. **Outliers**: Individual data points significantly different from others, creating local maxima
46
+ 2. **Clusters**: Separate groups in the data, resulting in multiple density peaks
47
+
48
+ **Key Features:**
49
+
50
+ - Automatic EGDF validation
51
+ - Robust peak detection with configurable smoothing
52
+ - Comprehensive error and warning tracking
53
+ - Memory management with optional data flushing
54
+ - Detailed visualization of analysis results
55
+ - Integration with existing GDF parameter systems
56
+
57
+ **Analysis Pipeline:**
58
+
59
+ 1. **Validation**: Ensures input is EGDF only (rejects QGDF/ELDF/QLDF)
60
+ 2. **PDF Extraction**: Retrieves PDF points from fitted EGDF object
61
+ 3. **Smoothing**: Applies Gaussian filtering for noise reduction
62
+ 4. **Maxima Detection**: Identifies peaks in the smoothed PDF
63
+ 5. **Homogeneity Assessment**: Evaluates based on peak count and PDF negativity
64
+ 6. **Result Storage**: Comprehensive parameter collection and storage
65
+
66
+ Parameters
67
+ ----------
68
+ gdf : EGDF
69
+ A fitted Estimating Global Distribution Function object. Must be EGDF
70
+ (QGDF, ELDF and QLDF are not supported). The object must:
71
+ - Be fitted (gdf._fitted == True)
72
+ - Have catch=True to generate required pdf_points and di_points_n
73
+ - Contain valid data and PDF information
74
+ - Have optimized scale parameter S_opt from EGDF fitting process
75
+
76
+ verbose : bool, default=True
77
+ Controls output verbosity during analysis.
78
+ - True: Prints detailed progress, warnings, and results
79
+ - False: Silent operation (errors still raise exceptions)
80
+
81
+ catch : bool, default=True
82
+ Enables comprehensive result storage in params dictionary.
83
+ - True: Stores all analysis results, parameters, and metadata
84
+ - False: Minimal storage (not recommended for most use cases)
85
+
86
+ flush : bool, default=False
87
+ Controls memory management of large arrays after analysis.
88
+ - True: Removes cached pdf_points and di_points_n from the stored params dictionaries to save memory
89
+ - False: Preserves all data arrays (recommended for further analysis)
90
+
91
+ smoothing_sigma : float, default=1.0
92
+ Gaussian smoothing parameter for PDF preprocessing before peak detection.
93
+ - Larger values: More aggressive smoothing, may merge distinct features
94
+ - Smaller values: Less smoothing, may detect noise as features
95
+ - Range: 0.1 to 5.0 (typical), must be positive
96
+ - Important for numerical sensitivity beyond visual inspection
97
+
98
+ min_height_ratio : float, default=0.01
99
+ Minimum relative height threshold for peak detection.
100
+ - Expressed as fraction of global maximum height
101
+ - Range: 0.001 to 0.1 (typical)
102
+ - Higher values: More selective, fewer detected peaks
103
+ - Lower values: More sensitive, may include noise
104
+
105
+ min_distance : Optional[int], default=None
106
+ Minimum separation between detected peaks in array indices.
107
+ - None: Automatically calculated as len(pdf_data) // 20
108
+ - Integer: Explicit minimum distance constraint
109
+ - Prevents detection of closely spaced spurious peaks
110
+
111
+ Attributes
112
+ ----------
113
+ is_homogeneous : bool or None
114
+ Primary analysis result. None before fit(), True/False after analysis
115
+
116
+ picks : List[Dict]
117
+ Detected maxima with detailed information:
118
+ - index: Array index of maximum
119
+ - position: Data value at maximum
120
+ - pdf_value: Original PDF value at maximum
121
+ - smoothed_pdf_value: Smoothed PDF value at maximum
122
+ - is_global: Boolean indicating global maximum
123
+
124
+ z0 : float or None
125
+ Global optimum value from EGDF object or detected from PDF
126
+
127
+ global_extremum_idx : int or None
128
+ Array index of the global maximum
129
+
130
+ fitted : bool
131
+ Read-only property indicating if analysis has been completed
132
+
133
+ Raises
134
+ ------
135
+ ValueError
136
+ - If input is not EGDF object
137
+ - If GDF object is not fitted
138
+ - If required attributes are missing
139
+
140
+ AttributeError
141
+ - If EGDF object lacks pdf_points (catch=False during EGDF fitting)
142
+ - If required EGDF attributes are not accessible
143
+
144
+ RuntimeError
145
+ - If fit() method fails due to numerical issues
146
+ - If plot() or results() called before fit()
147
+
148
+ Examples
149
+ --------
150
+ **Basic Homogeneity Analysis with EGDF:**
151
+
152
+ >>> import numpy as np
153
+ >>> from machinegnostics.magcal import EGDF
154
+ >>> from machinegnostics.magcal import DataHomogeneity
155
+ >>>
156
+ >>> # Prepare homogeneous data (single cluster)
157
+ >>> data = np.array([1.0, 1.1, 1.2, 0.9, 1.0, 1.1])
158
+ >>>
159
+ >>> # Fit EGDF with catch=True (required for homogeneity analysis)
160
+ >>> egdf = EGDF(data=data, catch=True, verbose=False)
161
+ >>> egdf.fit() # Automatically finds optimal scale parameter
162
+ >>>
163
+ >>> # Analyze homogeneity
164
+ >>> homogeneity = DataHomogeneity(egdf, verbose=True)
165
+ >>> is_homogeneous = homogeneity.fit()
166
+ >>> print(f"Data is homogeneous: {is_homogeneous}")
167
+ >>>
168
+ >>> # Visualize results
169
+ >>> homogeneity.plot()
170
+ >>>
171
+ >>> # Get detailed results
172
+ >>> results = homogeneity.results()
173
+ >>> print(f"Number of maxima detected: {len(results['picks'])}")
174
+
175
+ **EGDF Analysis with Multiple Clusters:**
176
+
177
+ >>> # Heterogeneous data (multiple clusters)
178
+ >>> data = np.array([1, 2, 3, 10, 11, 12, 20, 21, 22])
179
+ >>>
180
+ >>> # Fit EGDF (will find optimal S automatically)
181
+ >>> egdf = EGDF(data=data, catch=True)
182
+ >>> egdf.fit()
183
+ >>>
184
+ >>> # Analyze with custom smoothing for numerical sensitivity
185
+ >>> homogeneity = DataHomogeneity(
186
+ ... egdf,
187
+ ... verbose=True,
188
+ ... smoothing_sigma=2.0, # More aggressive smoothing
189
+ ... min_height_ratio=0.05, # Higher threshold
190
+ ... flush=True # Save memory
191
+ ... )
192
+ >>>
193
+ >>> is_homogeneous = homogeneity.fit()
194
+ >>> # Expected: False due to multiple clusters creating multiple maxima
195
+
196
+ **Outlier Detection Example:**
197
+
198
+ >>> # Data with outlier
199
+ >>> data = np.array([5, 5.1, 5.2, 4.9, 5.0, 15.0]) # 15.0 is outlier
200
+ >>>
201
+ >>> # Fit EGDF
202
+ >>> egdf = EGDF(data=data, catch=True)
203
+ >>> egdf.fit()
204
+ >>>
205
+ >>> # Analyze homogeneity
206
+ >>> homogeneity = DataHomogeneity(egdf, verbose=True)
207
+ >>> is_homogeneous = homogeneity.fit()
208
+ >>> # Expected: False due to outlier creating additional local maximum
209
+
210
+ **Error Handling and Parameter Access:**
211
+
212
+ >>> # Access comprehensive results
213
+ >>> results = homogeneity.results()
214
+ >>>
215
+ >>> # Check for analysis errors
216
+ >>> if 'errors' in results:
217
+ ... print("Analysis errors:", results['errors'])
218
+ >>>
219
+ >>> # Check for warnings
220
+ >>> if 'warnings' in results:
221
+ ... print("Analysis warnings:", results['warnings'])
222
+ >>>
223
+ >>> # Access EGDF parameters
224
+ >>> gdf_params = results['gdf_parameters']
225
+ >>> print(f"Optimal scale parameter: {gdf_params.get('S_opt', 'Not found')}")
226
+ >>> print(f"Global optimum Z0: {gdf_params.get('z0', 'Not found')}")
227
+
228
+ **Memory Management:**
229
+
230
+ >>> # For large datasets, use flush=True to save memory
231
+ >>> large_data = np.random.normal(0, 1, 10000)
232
+ >>> egdf_large = EGDF(data=large_data, catch=True)
233
+ >>> egdf_large.fit()
234
+ >>>
235
+ >>> # Analysis with memory cleanup
236
+ >>> homogeneity = DataHomogeneity(egdf_large, flush=True)
237
+ >>> homogeneity.fit() # pdf_points and di_points_n cleared after analysis
238
+
239
+ Notes
240
+ -----
241
+ **Mathematical Background:**
242
+
243
+ The gnostic homogeneity analysis is based on the principle that homogeneous data should
244
+ produce a unimodal PDF with specific characteristics for EGDF:
245
+
246
+ - **EGDF Uniqueness**: Each data sample has exactly one optimal EGDF representation
247
+ - **Scale Optimization**: EGDF automatically finds the best scale parameter S_opt
248
+ - **Density Properties**: Homogeneous data produces single maximum, non-negative density
249
+ - **Numerical Sensitivity**: Analysis must be numerical, not based on visual inspection
250
+
251
+ **Why Only EGDF:**
252
+
253
+ Homogeneity testing is only applicable to EGDF because:
254
+ - EGDF provides unique representation for each data sample
255
+ - Automatic scale parameter optimization enables reliable homogeneity testing
256
+ - Global nature with normalized weights makes it suitable for detecting data structure
257
+ - Robustness against outliers while maintaining sensitivity to detect them
258
+ - QGDF, ELDF, and QLDF have different mathematical properties unsuitable for this analysis
259
+
260
+ **Gnostic Principles Applied:**
261
+
262
+ - **Data Primacy**: Data are the primary parameters determining the distribution
263
+ - **Let Data Speak**: Analysis relies on data-driven optimal parameters
264
+ - **Unique Representation**: EGDF provides the one and only best representation
265
+ - **Numerical Decision Making**: Homogeneity decisions must be numerical, not visual
266
+
267
+ **Parameter Tuning Guidelines:**
268
+
269
+ - **smoothing_sigma**: Start with 1.0, increase for noisy data to improve numerical stability
270
+ - **min_height_ratio**: Start with 0.01, increase to reduce false positives from noise
271
+ - **min_distance**: Usually auto-calculated, manually set for specific data characteristics
272
+ - Remember: Visual inspection can be misleading, rely on numerical analysis
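+
+ A hedged tuning sketch (assumes egdf is an already fitted EGDF object; sweeping the
+ smoothing parameter shows how sensitive the verdict is to this choice):
+
+ >>> for sigma in (0.5, 1.0, 2.0):
+ ...     dh = DataHomogeneity(egdf, verbose=False, smoothing_sigma=sigma)
+ ...     print(sigma, dh.fit())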
273
+
274
+ **Performance Considerations:**
275
+
276
+ - Memory usage scales with data size due to PDF point storage
277
+ - Use flush=True for large datasets if PDF data not needed afterward
278
+ - Smoothing adds computational cost but improves numerical robustness
279
+ - EGDF fitting provides optimal parameters, reducing computational overhead
280
+
281
+ **Integration with Existing Workflows:**
282
+
283
+ This class integrates seamlessly with existing EGDF workflows:
284
+ - Reads parameters from fitted EGDF objects including S_opt
285
+ - Appends errors/warnings to existing EGDF parameter collections
286
+ - Updates EGDF objects with homogeneity results
287
+ - Preserves all original EGDF functionality and gnostic principles
288
+ - Works with EGDF's automatic parameter optimization
289
+
290
+ **Theoretical Foundation:**
291
+
292
+ Based on gnostic theory where:
293
+ - Global distribution functions assume data sample homogeneity
294
+ - Non-homogeneous samples exhibit multiple density maxima or negative densities
295
+ - EGDF's unique scale parameter enables reliable homogeneity hypothesis testing
296
+ - Robustness properties make EGDF particularly suitable for small, widely spread samples
297
+
298
+ See Also
299
+ --------
300
+ EGDF : Estimating Global Distribution Function
301
+ """
302
+
303
+ def __init__(self, gdf: EGDF, verbose=True, catch=True, flush=False,
304
+ smoothing_sigma=1.0, min_height_ratio=0.01, min_distance=None):
305
+ self.gdf = gdf
306
+ self.verbose = verbose
307
+ self.catch = catch
308
+ self.flush = flush
309
+ self.params = {}
310
+ self._fitted = False
311
+
312
+ # Analysis parameters
313
+ self.smoothing_sigma = smoothing_sigma
314
+ self.min_height_ratio = min_height_ratio
315
+ self.min_distance = min_distance
316
+
317
+ # Results
318
+ self.z0 = None
319
+ self.picks = []
320
+ self.is_homogeneous = None
321
+ self.global_extremum_idx = None
322
+
323
+ # Logger setup
324
+ self.logger = get_logger(self.__class__.__name__, level=logging.DEBUG if verbose else logging.ERROR)
325
+ self.logger.debug(f"{self.__class__.__name__} initialized: ")
326
+
327
+ self._gdf_obj_validation()
328
+ self._validate_egdf_only()
329
+
330
+ def _validate_egdf_only(self):
331
+ """Validate that the GDF object is EGDF only."""
332
+ self.logger.info("Validating GDF object for DataHomogeneity analysis")
333
+ class_name = self.gdf.__class__.__name__
334
+
335
+ if 'QGDF' in class_name:
336
+ self.logger.error(f"DataHomogeneity only supports EGDF objects. "
337
+ f"Received {class_name}. QGDF is not supported for homogeneity analysis.")
338
+ raise ValueError(
339
+ f"DataHomogeneity only supports EGDF objects. "
340
+ f"Received {class_name}. QGDF is not supported for homogeneity analysis."
341
+ )
342
+
343
+ if 'ELDF' in class_name or 'QLDF' in class_name:
344
+ self.logger.error(f"DataHomogeneity only supports EGDF objects. "
345
+ f"Received {class_name}. Local distribution functions (ELDF, QLDF) are not supported "
346
+ f"for homogeneity analysis.")
347
+ raise ValueError(
348
+ f"DataHomogeneity only supports EGDF objects. "
349
+ f"Received {class_name}. Local distribution functions (ELDF, QLDF) are not supported "
350
+ f"for homogeneity analysis."
351
+ )
352
+
353
+ if 'EGDF' not in class_name:
354
+ # Fallback detection based on methods
355
+ if not hasattr(self.gdf, '_fit_egdf'):
356
+ self.logger.error(f"DataHomogeneity only supports EGDF objects. "
357
+ f"Cannot determine if {class_name} is EGDF. "
358
+ f"Object must be EGDF for homogeneity analysis.")
359
+ raise ValueError(
360
+ f"DataHomogeneity only supports EGDF objects. "
361
+ f"Cannot determine if {class_name} is EGDF. "
362
+ f"Object must be EGDF for homogeneity analysis."
363
+ )
364
+
365
+ def _gdf_obj_validation(self):
366
+ """Validate that the EGDF object meets requirements for homogeneity analysis."""
367
+ self.logger.debug("Validating EGDF object attributes for homogeneity analysis")
368
+ if not hasattr(self.gdf, '_fitted'):
369
+ self.logger.error("EGDF object must have _fitted attribute")
370
+ raise ValueError("EGDF object must have _fitted attribute")
371
+
372
+ if not self.gdf._fitted:
373
+ self.logger.error("EGDF object must be fitted before homogeneity analysis")
374
+ raise ValueError("EGDF object must be fitted before homogeneity analysis")
375
+
376
+ required_attrs = ['data']
377
+ for attr in required_attrs:
378
+ if not hasattr(self.gdf, attr):
379
+ self.logger.error(f"EGDF object missing required attribute: {attr}")
380
+ raise ValueError(f"EGDF object missing required attribute: {attr}")
381
+
382
+ if not (hasattr(self.gdf, 'pdf_points') and self.gdf.pdf_points is not None):
383
+ if hasattr(self.gdf, 'catch') and not self.gdf.catch:
384
+ self.logger.error("EGDF object must have catch=True to generate "
385
+ "pdf_points required for homogeneity analysis.")
386
+ raise AttributeError(
387
+ f"EGDF object must have catch=True to generate "
388
+ f"pdf_points required for homogeneity analysis."
389
+ )
390
+ else:
391
+ self.logger.error("EGDF object is missing 'pdf_points'. "
392
+ "Please ensure catch=True when fitting EGDF.")
393
+ raise AttributeError(
394
+ f"EGDF object is missing 'pdf_points'. "
395
+ f"Please ensure catch=True when fitting EGDF."
396
+ )
397
+
398
+ def _prepare_params_from_gdf(self):
399
+ """Extract and prepare parameters from the EGDF object."""
400
+ self.logger.debug("Extracting parameters from EGDF object")
401
+ gdf_params = {}
402
+
403
+ # Extract basic parameters
404
+ if hasattr(self.gdf, 'params') and self.gdf.params:
405
+ gdf_params.update(self.gdf.params)
406
+
407
+ # Extract direct attributes
408
+ direct_attrs = ['S', 'S_opt', 'z0', 'data', 'pdf_points', 'di_points_n']
409
+ for attr in direct_attrs:
410
+ if hasattr(self.gdf, attr):
411
+ value = getattr(self.gdf, attr)
412
+ if value is not None:
413
+ gdf_params[attr] = value
414
+
415
+ return gdf_params
416
+
417
+ def _append_error(self, error_message, exception_type=None):
418
+ """Append error to existing errors in EGDF params or create new ones."""
419
+ self.logger.error(error_message)
420
+ error_entry = {
421
+ 'method': 'DataHomogeneity',
422
+ 'error': error_message,
423
+ 'exception_type': exception_type or 'DataHomogeneityError'
424
+ }
425
+
426
+ # Add to EGDF object params if possible
427
+ if hasattr(self.gdf, 'params'):
428
+ if 'errors' not in self.gdf.params:
429
+ self.gdf.params['errors'] = []
430
+ self.gdf.params['errors'].append(error_entry)
431
+
432
+ # Also add to local params
433
+ if 'errors' not in self.params:
434
+ self.params['errors'] = []
435
+ self.params['errors'].append(error_entry)
436
+
437
+ def _append_warning(self, warning_message):
438
+ """Append warning to existing warnings in EGDF params or create new ones."""
439
+ self.logger.warning(warning_message)
440
+ warning_entry = {
441
+ 'method': 'DataHomogeneity',
442
+ 'warning': warning_message
443
+ }
444
+
445
+ # Add to EGDF object params if possible
446
+ if hasattr(self.gdf, 'params'):
447
+ if 'warnings' not in self.gdf.params:
448
+ self.gdf.params['warnings'] = []
449
+ self.gdf.params['warnings'].append(warning_entry)
450
+
451
+ # Also add to local params
452
+ if 'warnings' not in self.params:
453
+ self.params['warnings'] = []
454
+ self.params['warnings'].append(warning_entry)
455
+
456
+ def _flush_memory(self):
457
+ """Flush di_points and pdf_points from memory if flush=True."""
458
+ self.logger.info("Flushing memory if flush=True")
459
+ if self.flush:
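+ # Only the cached copies stored in the params dictionaries are removed below;
+ # the EGDF object's own pdf_points / di_points_n attributes are left intact.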
460
+ # # Flush from EGDF object attributes
461
+ # if hasattr(self.gdf, 'di_points_n'):
462
+ # self.gdf.di_points_n = None
463
+ # if self.verbose:
464
+ # print("Flushed di_points_n from EGDF object to save memory.")
465
+
466
+ # if hasattr(self.gdf, 'pdf_points'):
467
+ # self.gdf.pdf_points = None
468
+ # if self.verbose:
469
+ # print("Flushed pdf_points from EGDF object to save memory.")
470
+
471
+ # Flush from EGDF object params dictionary
472
+ if hasattr(self.gdf, 'params') and self.gdf.params:
473
+ if 'di_points_n' in self.gdf.params:
474
+ del self.gdf.params['di_points_n']
475
+ self.logger.info("Removed di_points_n from EGDF params dictionary to save memory.")
476
+
477
+ if 'pdf_points' in self.gdf.params:
478
+ del self.gdf.params['pdf_points']
479
+ self.logger.info("Removed pdf_points from EGDF params dictionary to save memory.")
480
+
481
+ # Also flush from local params if they exist
482
+ if 'gdf_parameters' in self.params and self.params['gdf_parameters']:
483
+ if 'di_points_n' in self.params['gdf_parameters']:
484
+ del self.params['gdf_parameters']['di_points_n']
485
+ self.logger.info("Removed di_points_n from local gdf_parameters to save memory.")
486
+
487
+ if 'pdf_points' in self.params['gdf_parameters']:
488
+ del self.params['gdf_parameters']['pdf_points']
489
+ self.logger.info("Removed pdf_points from local gdf_parameters to save memory.")
490
+
491
+ def fit(self, plot: bool = False) -> bool:
492
+ """
493
+ Perform comprehensive homogeneity analysis on the EGDF object.
494
+
495
+ This is the primary analysis method that executes the complete homogeneity assessment
496
+ pipeline. It analyzes the probability density function (PDF) of the fitted EGDF object
497
+ to determine if the underlying data exhibits homogeneous characteristics based on
498
+ peak detection and PDF properties.
499
+
500
+ **Analysis Pipeline:**
501
+
502
+ 1. **Parameter Extraction**: Retrieves comprehensive parameters from the input EGDF object
503
+ 2. **PDF Processing**: Applies Gaussian smoothing to reduce noise and improve detection
504
+ 3. **Peak Detection**: Identifies maxima in the smoothed PDF
505
+ 4. **Homogeneity Assessment**: Evaluates based on peak count and PDF negativity
506
+ 5. **Result Storage**: Stores comprehensive analysis results and metadata
507
+ 6. **Memory Management**: Optionally flushes large arrays to conserve memory
508
+
509
+ **Homogeneity Criteria:**
510
+
511
+ - **EGDF**: Data is homogeneous if PDF has exactly one global maximum and no negative values
512
+
513
+ The method automatically handles parameter tuning, error tracking, and integration
514
+ with the existing EGDF parameter system.
515
+
516
+ Parameters
517
+ ----------
518
+ plot : bool, optional
519
+ If True, generates plots for visual inspection of the analysis results.
520
+ - True: Displays plots of original and smoothed PDF with detected maxima
521
+
522
+ Returns
523
+ -------
524
+ bool
525
+ The primary homogeneity result:
526
+ - True: Data exhibits homogeneous characteristics
527
+ - False: Data is heterogeneous (multiple maxima or negative PDF values)
528
+
529
+ Raises
530
+ ------
531
+ RuntimeError
532
+ If the analysis fails due to:
533
+ - Numerical instabilities in PDF processing
534
+ - Insufficient or corrupted PDF data
535
+ - Memory allocation issues during processing
536
+
537
+ AttributeError
538
+ If the EGDF object lacks required attributes:
539
+ - Missing pdf_points (ensure catch=True during EGDF fitting)
540
+ - Missing di_points_n for position mapping
541
+ - Invalid or incomplete EGDF state
542
+
543
+ ValueError
544
+ If analysis parameters are invalid:
545
+ - Negative smoothing_sigma
546
+ - Invalid min_height_ratio (not between 0 and 1)
547
+ - Corrupted PDF data (NaN, infinite values)
548
+
549
+ Side Effects
550
+ ------------
551
+ - Updates self.is_homogeneous with the analysis result
552
+ - Populates self.picks with detected maxima information
553
+ - Sets self.z0 with the global optimum value
554
+ - Updates self.global_extremum_idx with the maximum location
555
+ - Modifies EGDF object params with homogeneity results (if catch=True)
556
+ - May remove cached pdf_points and di_points_n from the stored params dictionaries (if flush=True)
557
+ - Appends any errors or warnings to existing EGDF error/warning collections
558
+
559
+ Examples
560
+ --------
561
+ **Basic Usage:**
562
+
563
+ >>> # After creating DataHomogeneity instance
564
+ >>> homogeneity = DataHomogeneity(egdf_object, verbose=True)
565
+ >>> is_homogeneous = homogeneity.fit()
566
+ >>> print(f"Analysis complete. Homogeneous: {is_homogeneous}")
567
+
568
+ **Memory Management:**
569
+
570
+ >>> # For large datasets
571
+ >>> homogeneity = DataHomogeneity(large_egdf, flush=True)
572
+ >>> result = homogeneity.fit() # Automatically frees memory after analysis
573
+
574
+ **Integration with Workflows:**
575
+
576
+ >>> # Analysis integrates seamlessly with existing EGDF workflows
577
+ >>> egdf.fit() # Standard EGDF fitting
578
+ >>> homogeneity = DataHomogeneity(egdf)
579
+ >>> homogeneity.fit() # Homogeneity analysis
580
+ >>>
581
+ >>> # Results now available in both objects
582
+ >>> print("EGDF homogeneity flag:", egdf.params['is_homogeneous'])
583
+ >>> print("Detailed analysis:", homogeneity.results())
584
+
585
+ Notes
586
+ -----
587
+ **Performance Considerations:**
588
+
589
+ - Processing time scales roughly linearly with the number of PDF evaluation points
590
+ - Memory usage depends on PDF resolution and catch parameter
591
+ - Smoothing adds computational overhead but improves robustness
592
+
593
+ **Parameter Sensitivity:**
594
+
595
+ The analysis robustness depends on proper parameter tuning:
596
+ - Increase smoothing_sigma for noisy data
597
+ - Adjust min_height_ratio to control sensitivity
598
+ - Set appropriate min_distance to avoid spurious detections
599
+
600
+ **Mathematical Foundation:**
601
+
602
+ The method implements gnostic homogeneity theory where:
603
+ - Homogeneous data should produce unimodal PDFs
604
+ - The EGDF is fitted with its optimal scale parameter, so a homogeneous sample is expected to show a single peak
605
+
606
+ **Quality Assurance:**
607
+
608
+ The method includes comprehensive validation:
609
+ - PDF integrity checks (no NaN, infinite values)
610
+ - Parameter bounds validation
611
+ - Numerical stability monitoring
612
+ - Automatic fallback strategies for edge cases
613
+
614
+ See Also
615
+ --------
616
+ plot : Visualize the analysis results
617
+ results : Access comprehensive analysis data
618
+ """
619
+ self.logger.info("Starting homogeneity analysis fit() method")
620
+ try:
621
+ # Prepare parameters from EGDF
622
+ self.logger.debug("Preparing parameters from EGDF object")
623
+ gdf_params = self._prepare_params_from_gdf()
624
+
625
+ # Set minimum distance if not provided
626
+ if self.min_distance is None:
627
+ self.logger.debug("Minimum distance not provided, calculating...")
628
+ pdf_data = self._get_pdf_data()
629
+ self.min_distance = max(1, len(pdf_data) // 20)
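+ # Default spacing: peaks closer together than ~5% of the PDF grid (len // 20) are not counted separately.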
630
+
631
+ # Perform homogeneity test
632
+ self.logger.info("Testing homogeneity")
633
+ self.is_homogeneous = self._test_homogeneity()
634
+
635
+ # Extract Z0
636
+ self.logger.info("Extracting global optimum Z0")
637
+ self.z0 = self._get_z0()
638
+
639
+ # Store comprehensive results
640
+ if self.catch:
641
+ self.params.update({
642
+ 'gdf_type': 'egdf',
643
+ 'is_homogeneous': self.is_homogeneous,
644
+ 'picks': self.picks,
645
+ 'z0': self.z0,
646
+ 'global_extremum_idx': self.global_extremum_idx,
647
+ 'analysis_parameters': {
648
+ 'smoothing_sigma': self.smoothing_sigma,
649
+ 'min_height_ratio': self.min_height_ratio,
650
+ 'min_distance': self.min_distance,
651
+ 'flush': self.flush
652
+ },
653
+ 'homogeneity_fitted': True
654
+ })
655
+
656
+ # Include EGDF parameters
657
+ self.params['gdf_parameters'] = gdf_params
658
+
659
+ # Update EGDF object params if possible
660
+ if hasattr(self.gdf, 'catch') and self.gdf.catch and hasattr(self.gdf, 'params'):
661
+ self.gdf.params.update({
662
+ 'is_homogeneous': self.is_homogeneous,
663
+ 'homogeneity_checked': True,
664
+ 'homogeneity_fitted': True
665
+ })
666
+
667
+ self.logger.info("Homogeneity results written to EGDF params dictionary.")
668
+
669
+ self._fitted = True
670
+
671
+ # plot
672
+ if plot:
673
+ self.logger.info("Plotting results as requested")
674
+ self.plot()
675
+
676
+ # Flush memory if requested
677
+ self.logger.info("Handling memory flush if requested")
678
+ self._flush_memory()
679
+
680
+ self.logger.info("Homogeneity analysis completed for EGDF.")
681
+ self.logger.info(f"Data is {'homogeneous' if self.is_homogeneous else 'not homogeneous'}")
682
+ self.logger.info(f"Number of maxima detected: {len(self.picks)}")
683
+
684
+ return self.is_homogeneous
685
+
686
+ except Exception as e:
687
+ error_msg = f"Error during homogeneity analysis: {str(e)}"
688
+ self._append_error(error_msg, type(e).__name__)
689
+ raise
690
+
691
+ def _test_homogeneity(self):
692
+ """
693
+ Test data homogeneity for EGDF.
694
+
695
+ Returns
696
+ -------
697
+ bool
698
+ True if homogeneous, False otherwise.
699
+ """
700
+ self.logger.info("Starting homogeneity test for EGDF")
701
+ try:
702
+ pdf_data = self._get_pdf_data()
703
+ has_negative_pdf = np.any(pdf_data < 0)
704
+
705
+ # EGDF: Look for single global maximum
706
+ self.picks = self._detect_maxima()
707
+ extrema_type = "maxima"
708
+ num_extrema = len(self.picks)
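+ # Gnostic homogeneity criterion: the PDF must be non-negative and show exactly one maximum.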
709
+ is_homogeneous = not has_negative_pdf and num_extrema == 1
710
+
711
+ if self.verbose:
712
+ if not is_homogeneous:
713
+ reasons = []
714
+ if has_negative_pdf:
715
+ reasons.append("PDF has negative values")
716
+ self._append_warning("PDF contains negative values - may indicate numerical issues")
717
+ if num_extrema > 1:
718
+ reasons.append(f"multiple {extrema_type} [{num_extrema}] detected")
719
+ self._append_warning(f"Multiple {extrema_type} detected - data may not be homogeneous")
720
+ elif num_extrema == 0:
721
+ reasons.append(f"no significant {extrema_type} detected")
722
+ self._append_warning(f"No significant {extrema_type} detected - check smoothing parameters")
723
+ self.logger.info(f"EGDF data is not homogeneous: {', '.join(reasons)}.")
724
+ else:
725
+ self.logger.info(f"EGDF data is homogeneous: PDF has no negative values "
726
+ f"and exactly one {extrema_type[:-1]} detected.")
727
+
728
+ # Store additional info in params
729
+ if self.catch:
730
+ self.params.update({
731
+ 'has_negative_pdf': has_negative_pdf,
732
+ f'num_{extrema_type}': num_extrema,
733
+ 'extrema_type': extrema_type
734
+ })
735
+
736
+ return is_homogeneous
737
+
738
+ except Exception as e:
739
+ error_msg = f"Error in homogeneity test: {str(e)}"
740
+ self._append_error(error_msg, type(e).__name__)
741
+ raise
742
+
743
+ def _detect_maxima(self):
744
+ """Detect maxima for EGDF analysis."""
745
+ self.logger.info("Detecting maxima in the PDF")
746
+ try:
747
+ pdf_data = self._get_pdf_data()
748
+ data_points = self._get_data_points()
749
+ smoothed_pdf = self._smooth_pdf()
750
+
751
+ min_height = np.max(smoothed_pdf) * self.min_height_ratio
752
+ maxima_idx, _ = find_peaks(smoothed_pdf,
753
+ height=min_height,
754
+ distance=self.min_distance)
755
+
756
+ picks = []
757
+ global_max_value = -np.inf
758
+
759
+ for idx in maxima_idx:
760
+ pick_info = {
761
+ 'index': int(idx),
762
+ 'position': float(data_points[idx]),
763
+ 'pdf_value': float(pdf_data[idx]),
764
+ 'smoothed_pdf_value': float(smoothed_pdf[idx]),
765
+ 'is_global': False
766
+ }
767
+ picks.append(pick_info)
768
+
769
+ if smoothed_pdf[idx] > global_max_value:
770
+ global_max_value = smoothed_pdf[idx]
771
+ self.global_extremum_idx = idx
772
+
773
+ # Mark global maximum
774
+ for pick in picks:
775
+ if pick['index'] == self.global_extremum_idx:
776
+ pick['is_global'] = True
777
+ break
778
+
779
+ # Sort by importance (global first, then by height)
780
+ picks.sort(key=lambda x: (not x['is_global'], -x['smoothed_pdf_value']))
781
+
782
+ return picks
783
+
784
+ except Exception as e:
785
+ error_msg = f"Error detecting maxima: {str(e)}"
786
+ self._append_error(error_msg, type(e).__name__)
787
+ return []
788
+
789
+ def _smooth_pdf(self):
790
+ """Apply Gaussian smoothing to PDF."""
791
+ self.logger.info("Smoothing PDF with Gaussian filter")
792
+ try:
793
+ pdf_data = self._get_pdf_data()
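+ # smoothing_sigma is expressed in PDF grid samples, not in data units.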
794
+ return gaussian_filter1d(pdf_data, sigma=self.smoothing_sigma)
795
+ except Exception as e:
796
+ error_msg = f"Error smoothing PDF: {str(e)}"
797
+ self._append_error(error_msg, type(e).__name__)
798
+ return self.gdf.pdf_points # Return the unsmoothed PDF as a fallback
799
+
800
+ def _get_pdf_data(self):
801
+ """Get PDF values from the EGDF object."""
802
+ self.logger.info("Retrieving PDF data from EGDF object")
803
+ return self.gdf.pdf_points
804
+
805
+ def _get_data_points(self):
806
+ """Get data point positions from the EGDF object."""
807
+ self.logger.info("Retrieving data point positions from EGDF object")
808
+ return self.gdf.di_points_n
809
+
810
+ def _get_z0(self):
811
+ """Get Z0 (global optimum) value from the EGDF object."""
812
+ self.logger.info("Retrieving Z0 (global optimum) from EGDF object")
813
+ if hasattr(self.gdf, 'z0') and self.gdf.z0 is not None:
814
+ return self.gdf.z0
815
+ elif hasattr(self.gdf, 'params') and 'z0' in self.gdf.params:
816
+ return self.gdf.params['z0']
817
+ else:
818
+ # Fallback: use global extremum from PDF
819
+ if self.global_extremum_idx is not None:
820
+ data_points = self._get_data_points()
821
+ if self.verbose:
822
+ self._append_warning("Z0 not found in EGDF object. Using PDF global extremum as Z0.")
823
+ return data_points[self.global_extremum_idx]
824
+ return None
825
+
826
+ def plot(self, figsize=(12, 8), title=None):
827
+ """
828
+ Create a comprehensive visualization of the homogeneity analysis results.
829
+
830
+ This method generates an informative plot that displays the probability density
831
+ function (PDF), detected maxima, homogeneity status, and key analysis metrics.
832
+ The visualization provides both quantitative and qualitative insights into the
833
+ data's homogeneous characteristics.
834
+
835
+ **Plot Components:**
836
+
837
+ 1. **Original PDF Curve**: Blue solid line showing the raw probability density
838
+ 2. **Smoothed PDF Curve**: Orange dashed line showing Gaussian-filtered PDF
839
+ 3. **Global Maximum**: Red circle with vertical line marking the primary maximum
840
+ 4. **Secondary Maxima**: Grey circles with vertical lines for additional maxima
841
+ 5. **Z0 Reference**: Cyan dotted line if Z0 differs from detected maximum
842
+ 6. **Status Indicator**: Color-coded text box showing homogeneity result
843
+ 7. **Analysis Summary**: Information box with key metrics and statistics
844
+
845
+ The plot layout is optimized for both screen display and publication quality,
846
+ with clear legends, appropriate scaling, and professional formatting.
847
+
848
+ Parameters
849
+ ----------
850
+ figsize : tuple of float, default=(12, 8)
851
+ Figure dimensions in inches as (width, height).
852
+ - Larger sizes provide better detail visibility
853
+ - Smaller sizes suitable for embedded displays
854
+ - Recommended range: (8, 6) to (16, 12)
855
+
856
+ title : str, optional
857
+ Custom plot title. If None, generates descriptive title automatically.
858
+ - None: Auto-generated title with EGDF type and homogeneity status
859
+ - str: Custom title text (supports LaTeX formatting)
860
+ - Empty string: No title displayed
861
+
862
+ Returns
863
+ -------
864
+ None
865
+ The method displays the plot using matplotlib.pyplot.show() and does not
866
+ return any value. The plot appears in the current matplotlib backend.
867
+
868
+ Raises
869
+ ------
870
+ RuntimeError
871
+ If called before the fit() method has been executed:
872
+ - No analysis results available for visualization
873
+ - Internal state inconsistent or incomplete
874
+
875
+ AttributeError
876
+ If required plot data is missing or corrupted:
877
+ - PDF data unavailable or deleted (check flush parameter)
878
+ - Data points array missing or malformed
879
+ - Maxima detection results incomplete
880
+
881
+ ImportError
882
+ If matplotlib is not available or not properly installed
883
+
884
+ MemoryError
885
+ If insufficient memory for plot generation (rare, for very large datasets)
886
+
887
+ Side Effects
888
+ ------------
889
+ - Displays interactive plot window (backend-dependent)
890
+ - May create temporary matplotlib figure and axis objects
891
+ - Does not modify any analysis results or object state
892
+ - Plot appearance depends on current matplotlib style settings
893
+
894
+ Examples
895
+ --------
896
+ **Basic Plotting:**
897
+
898
+ >>> # After running analysis
899
+ >>> homogeneity = DataHomogeneity(egdf_object)
900
+ >>> homogeneity.fit()
901
+ >>> homogeneity.plot() # Display with default settings
902
+
903
+ **Custom Formatting:**
904
+
905
+ >>> # Custom size and title
906
+ >>> homogeneity.plot(
907
+ ... figsize=(14, 10),
908
+ ... title="EGDF Homogeneity Analysis: Production Data"
909
+ ... )
910
+
911
+ Notes
912
+ -----
913
+ **Visual Interpretation Guide:**
914
+
915
+ - **Green Status Box**: Data is homogeneous (single maximum, no negative PDF)
916
+ - **Red Status Box**: Data is heterogeneous (multiple maxima or negative values)
917
+ - **Red Markers**: Global maximum
918
+ - **Grey Markers**: Secondary maxima indicating potential heterogeneity
919
+ - **Smooth vs Raw PDF**: Comparison shows impact of noise filtering
920
+
921
+ **Plot Customization:**
922
+
923
+ The plot uses matplotlib's standard customization system:
924
+ - Colors follow standard scientific visualization conventions
925
+ - Font sizes and line weights optimized for readability
926
+ - Grid and legend placement maximize information density
927
+ - Axis labels and scales automatically adjusted for data range
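+
+ A small styling sketch (standard matplotlib settings applied before calling plot()):
+
+ >>> import matplotlib.pyplot as plt
+ >>> plt.rcParams.update({'figure.dpi': 120, 'font.size': 11})
+ >>> homogeneity.plot(figsize=(10, 6))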
928
+
929
+ **Performance Notes:**
930
+
931
+ - Plot generation is typically fast (< 1 second for most datasets)
932
+ - Large datasets may require longer rendering times
933
+ - Interactive backends may be slower than static ones
934
+ - Memory usage scales with plot resolution and data size
935
+
936
+ **Troubleshooting:**
937
+
938
+ Common issues and solutions:
939
+ - **Empty plot**: Check if fit() was called successfully
940
+ - **Missing data**: Verify flush=False if data needed for plotting
941
+ - **Poor visibility**: Adjust figsize or matplotlib DPI settings
942
+ - **Layout issues**: Use plt.tight_layout() or bbox_inches='tight'
943
+
944
+ **Mathematical Context:**
945
+
946
+ The visualization directly represents the mathematical foundation:
947
+ - PDF height indicates probability density magnitude
948
+ - Maximum positions show optimal data characteristics
949
+ - Smoothing reveals underlying distributional structure
950
+ - Multiple maxima indicate potential data clustering or heterogeneity
951
+
952
+ See Also
953
+ --------
954
+ fit : Perform the homogeneity analysis (required before plotting)
955
+ results : Access numerical analysis results
956
+ """
957
+ self.logger.info("Generating homogeneity analysis plot")
958
+ if not self._fitted:
959
+ self.logger.error("Must call fit() before plotting. Run fit() method first.")
960
+ raise RuntimeError("Must call fit() before plotting. Run fit() method first.")
961
+
962
+ try:
963
+ fig, ax = plt.subplots(figsize=figsize)
964
+
965
+ pdf_data = self._get_pdf_data()
966
+ data_points = self._get_data_points()
967
+ smoothed_pdf = self._smooth_pdf()
968
+
969
+ # Plot PDF and smoothed PDF
970
+ ax.plot(data_points, pdf_data, 'b-', linewidth=2, label='PDF', alpha=0.7)
971
+ ax.plot(data_points, smoothed_pdf, 'orange', linestyle='--', linewidth=1.5,
972
+ label='Smoothed PDF', alpha=0.8)
973
+
974
+ # Plot detected maxima
975
+ for pick in self.picks:
976
+ pos = pick['position']
977
+ pdf_val = pick['pdf_value']
978
+ is_global = pick['is_global']
979
+
980
+ if is_global:
981
+ ax.axvline(pos, color='red', linestyle='-', linewidth=2, alpha=0.8)
982
+ ax.plot(pos, pdf_val, 'o', color='red', markersize=10,
983
+ label=f'Global maximum (Z0={pos:.3f})')
984
+ else:
985
+ ax.axvline(pos, color='grey', linestyle='-', linewidth=1, alpha=0.6)
986
+ ax.plot(pos, pdf_val, 'o', color='grey', markersize=6, alpha=0.7)
987
+
988
+ # Add Z0 line if different from global maximum
989
+ if self.z0 is not None and self.global_extremum_idx is not None:
990
+ global_maximum_pos = data_points[self.global_extremum_idx]
991
+ if abs(self.z0 - global_maximum_pos) > 0.001:
992
+ ax.axvline(self.z0, color='cyan', linestyle=':', linewidth=2, alpha=0.8,
993
+ label=f'Original Z0={self.z0:.3f}')
994
+
995
+ # Add homogeneity status text
996
+ status_text = "Homogeneous" if self.is_homogeneous else "Not Homogeneous"
997
+ status_color = 'green' if self.is_homogeneous else 'red'
998
+
999
+ ax.text(0.02, 0.98, status_text, transform=ax.transAxes,
1000
+ fontsize=12, fontweight='bold', color=status_color,
1001
+ verticalalignment='top',
1002
+ bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, edgecolor=status_color))
1003
+
1004
+ # Add analysis info
1005
+ info_text = f"Type: EGDF\n"
1006
+ info_text += f"Maxima: {len(self.picks)}\n"
1007
+
1008
+ if hasattr(self, 'params') and 'has_negative_pdf' in self.params:
1009
+ info_text += f"Negative PDF: {'Yes' if self.params['has_negative_pdf'] else 'No'}"
1010
+
1011
+ ax.text(0.02, 0.02, info_text, transform=ax.transAxes,
1012
+ fontsize=10, verticalalignment='bottom',
1013
+ bbox=dict(boxstyle='round', facecolor='lightgrey', alpha=0.7))
1014
+
1015
+ ax.set_xlabel('Data Points')
1016
+ ax.set_ylabel('PDF Values')
1017
+
1018
+ if title is None:
1019
+ homogeneous_str = "Homogeneous" if self.is_homogeneous else "Non-Homogeneous"
1020
+ title = f"EGDF {homogeneous_str} Data Analysis"
1021
+ ax.set_title(title)
1022
+
1023
+ ax.legend()
1024
+ ax.grid(True, alpha=0.3)
1025
+
1026
+ plt.tight_layout()
1027
+ plt.show()
1028
+
1029
+ except Exception as e:
1030
+ error_msg = f"Error creating plot: {str(e)}"
1031
+ self._append_error(error_msg, type(e).__name__)
1032
+ raise
1033
+
1034
+ def results(self) -> Dict[str, Any]:
1035
+ """
1036
+ Retrieve comprehensive homogeneity analysis results and metadata.
1037
+
1038
+ This method provides access to all analysis results, parameters, and diagnostic
1039
+ information generated during the homogeneity assessment. It returns a complete
1040
+ dictionary containing quantitative results, detected maxima details, analysis
1041
+ parameters, original EGDF object information, and any errors or warnings
1042
+ encountered during processing.
1043
+
1044
+ **Result Categories:**
1045
+
1046
+ 1. **Primary Results**: Core homogeneity findings (is_homogeneous, maxima count)
1047
+ 2. **Maxima Details**: Complete information about detected peaks
1048
+ 3. **Analysis Parameters**: Configuration settings used during analysis
1049
+ 4. **EGDF Parameters**: Original parameters from the input EGDF object
1050
+ 5. **Diagnostic Data**: Errors, warnings, and processing metadata
1051
+ 6. **Quality Metrics**: PDF characteristics and numerical indicators
1052
+
1053
+ The returned dictionary maintains referential integrity and provides
1054
+ comprehensive traceability for analysis reproducibility and debugging.
1055
+
1056
+ Returns
1057
+ -------
1058
+ dict
1059
+ Comprehensive results dictionary with the following structure:
1060
+
1061
+ **Core Analysis Results:**
1062
+ - 'gdf_type' (str): Always 'egdf' for this class
1063
+ - 'is_homogeneous' (bool): Primary homogeneity determination
1064
+ - 'z0' (float): Global optimum value (Z0) from EGDF or detected maximum
1065
+ - 'global_extremum_idx' (int): Array index of global maximum
1066
+ - 'homogeneity_fitted' (bool): Confirmation flag for completed analysis
1067
+
1068
+ **Maxima Information:**
1069
+ - 'picks' (List[Dict]): Detected maxima with detailed properties:
1070
+ - 'index' (int): Array position of maximum
1071
+ - 'position' (float): Data value at maximum location
1072
+ - 'pdf_value' (float): Original PDF value at maximum
1073
+ - 'smoothed_pdf_value' (float): Smoothed PDF value at maximum
1074
+ - 'is_global' (bool): Flag indicating global maximum
1075
+
1076
+ **PDF Characteristics:**
1077
+ - 'has_negative_pdf' (bool): Whether PDF contains negative values
1078
+ - 'num_maxima' (int): Count of detected maxima
1079
+ - 'extrema_type' (str): Always 'maxima' for EGDF
1080
+
1081
+ **Analysis Configuration:**
1082
+ - 'analysis_parameters' (Dict): Settings used during analysis:
1083
+ - 'smoothing_sigma' (float): Gaussian smoothing parameter
1084
+ - 'min_height_ratio' (float): Minimum height threshold for detection
1085
+ - 'min_distance' (int): Minimum separation between maxima
1086
+ - 'flush' (bool): Memory management setting
1087
+
1088
+ **Original EGDF Data:**
1089
+ - 'gdf_parameters' (Dict): Complete parameter set from input EGDF object
1090
+ including S, S_opt, z0, data arrays, and fitted results
1091
+
1092
+ **Diagnostics (if present):**
1093
+ - 'errors' (List[Dict]): Analysis errors with method and type information
1094
+ - 'warnings' (List[Dict]): Analysis warnings and advisory messages
1095
+
1096
+ Raises
1097
+ ------
1098
+ RuntimeError
1099
+ If called before fit() method execution:
1100
+ - "No analysis results available. Call fit() method first."
1101
+ - Analysis state is incomplete or inconsistent
1102
+
1103
+ RuntimeError
1104
+ If results storage is disabled:
1105
+ - "No results stored. Ensure catch=True during initialization."
1106
+ - catch=False prevents result storage for memory conservation
1107
+
1108
+ Examples
1109
+ --------
1110
+ **Basic Result Access:**
1111
+
1112
+ >>> # After running analysis
1113
+ >>> homogeneity = DataHomogeneity(egdf_object)
1114
+ >>> homogeneity.fit()
1115
+ >>> results = homogeneity.results()
1116
+ >>> print(f"Homogeneous: {results['is_homogeneous']}")
1117
+ >>> print(f"Maxima detected: {len(results['picks'])}")
1118
+
1119
+ **Detailed Maxima Analysis:**
1120
+
1121
+ >>> results = homogeneity.results()
1122
+ >>> for i, maximum in enumerate(results['picks']):
1123
+ ... status = "Global" if maximum['is_global'] else "Local"
1124
+ ... print(f"{status} maximum {i+1}:")
1125
+ ... print(f" Position: {maximum['position']:.4f}")
1126
+ ... print(f" PDF value: {maximum['pdf_value']:.4f}")
1127
+ ... print(f" Smoothed PDF: {maximum['smoothed_pdf_value']:.4f}")
1128
+
1129
+ **Error and Warning Inspection:**
1130
+
1131
+ >>> results = homogeneity.results()
1132
+ >>> if 'errors' in results:
1133
+ ... print("Analysis encountered errors:")
1134
+ ... for error in results['errors']:
1135
+ ... print(f" {error['method']}: {error['error']}")
1136
+ >>>
1137
+ >>> if 'warnings' in results:
1138
+ ... print("Analysis warnings:")
1139
+ ... for warning in results['warnings']:
1140
+ ... print(f" {warning['method']}: {warning['warning']}")
1141
+
1142
+ **Parameter Traceability:**
1143
+
1144
+ >>> results = homogeneity.results()
1145
+ >>> analysis_config = results['analysis_parameters']
1146
+ >>> print("Analysis was performed with:")
1147
+ >>> print(f" Smoothing: {analysis_config['smoothing_sigma']}")
1148
+ >>> print(f" Min height ratio: {analysis_config['min_height_ratio']}")
1149
+ >>> print(f" Min distance: {analysis_config['min_distance']}")
1150
+
1151
+ Notes
1152
+ -----
1153
+ **Data Integrity:**
1154
+
1155
+ The returned dictionary is a shallow copy of the internal results, so:
1156
+ - Reassigning or removing top-level keys in the returned dict does not affect internal state
1157
+ - Nested arrays and lists are shared with the analysis object and should not be mutated in place
1158
+ - Call results() again after refitting to obtain an up-to-date snapshot
1159
+
1160
+ **Memory Considerations:**
1161
+
1162
+ - Results dictionary may contain large arrays (PDF points, data points)
1163
+ - Use flush=True during initialization to reduce memory footprint
1164
+ - Consider extracting only needed fields for memory-constrained environments
1165
+
1166
+ **Version Compatibility:**
1167
+
1168
+ The results structure is designed for forward/backward compatibility:
1169
+ - New fields added with default values for missing data
1170
+ - Deprecated fields maintained for transition periods
1171
+ - Type consistency maintained across versions
1172
+
1173
+ **Performance Notes:**
1174
+
1175
+ - Dictionary creation involves copying large data structures
1176
+ - Access time is O(1) for individual fields
1177
+ - Memory usage scales with original data size and PDF resolution
1178
+
1179
+ **Integration Patterns:**
1180
+
1181
+ Common usage patterns for results integration:
1182
+ - Store results in databases using JSON serialization
1183
+ - Pass results to downstream analysis pipelines
1184
+ - Generate reports using template systems
1185
+ - Create batch analysis summaries and comparisons
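+
+ A minimal serialization sketch (assumes the standard json module; array-valued
+ entries such as data and PDF points are filtered out first because they are not
+ JSON-serializable as-is):
+
+ >>> import json
+ >>> res = homogeneity.results()
+ >>> summary = {k: v for k, v in res.items()
+ ...            if isinstance(v, (bool, int, float, str))}
+ >>> payload = json.dumps(summary)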
1186
+
1187
+ **Validation and Quality Control:**
1188
+
1189
+ The results include comprehensive quality indicators:
1190
+ - Error counts and descriptions for debugging
1191
+ - Warning flags for borderline cases
1192
+ - Parameter consistency checks
1193
+ - Numerical stability indicators
1194
+
1195
+ See Also
1196
+ --------
1197
+ fit : Perform the analysis to generate results
1198
+ plot : Visualize the analysis results
1199
+ DataHomogeneity.__init__ : Configure result storage with catch parameter
1200
+ """
1201
+ self.logger.info("Retrieving homogeneity analysis results")
1202
+ if not self._fitted:
1203
+ self.logger.error("No analysis results available. Call fit() method first.")
1204
+ raise RuntimeError("No analysis results available. Call fit() method first.")
1205
+
1206
+ if not self.params:
1207
+ self.logger.error("No results stored. Ensure catch=True during initialization.")
1208
+ raise RuntimeError("No results stored. Ensure catch=True during initialization.")
1209
+
1210
+ return self.params.copy()
1211
+
1212
+ @property
1213
+ def fitted(self):
1214
+ """bool: True if the analysis has been completed, False otherwise."""
1215
+ return self._fitted
1216
+
1217
+ def __repr__(self):
1218
+ return f"DataHomogeneity(gdf_type='egdf', fitted={self._fitted}, is_homogeneous={self.is_homogeneous})"