machinegnostics-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- machinegnostics/__init__.py +24 -0
- machinegnostics/magcal/__init__.py +37 -0
- machinegnostics/magcal/characteristics.py +460 -0
- machinegnostics/magcal/criteria_eval.py +268 -0
- machinegnostics/magcal/criterion.py +140 -0
- machinegnostics/magcal/data_conversion.py +381 -0
- machinegnostics/magcal/gcor.py +64 -0
- machinegnostics/magcal/gdf/__init__.py +2 -0
- machinegnostics/magcal/gdf/base_df.py +39 -0
- machinegnostics/magcal/gdf/base_distfunc.py +1202 -0
- machinegnostics/magcal/gdf/base_egdf.py +823 -0
- machinegnostics/magcal/gdf/base_eldf.py +830 -0
- machinegnostics/magcal/gdf/base_qgdf.py +1234 -0
- machinegnostics/magcal/gdf/base_qldf.py +1019 -0
- machinegnostics/magcal/gdf/cluster_analysis.py +456 -0
- machinegnostics/magcal/gdf/data_cluster.py +975 -0
- machinegnostics/magcal/gdf/data_intervals.py +853 -0
- machinegnostics/magcal/gdf/data_membership.py +536 -0
- machinegnostics/magcal/gdf/der_egdf.py +243 -0
- machinegnostics/magcal/gdf/distfunc_engine.py +841 -0
- machinegnostics/magcal/gdf/egdf.py +324 -0
- machinegnostics/magcal/gdf/eldf.py +297 -0
- machinegnostics/magcal/gdf/eldf_intv.py +609 -0
- machinegnostics/magcal/gdf/eldf_ma.py +627 -0
- machinegnostics/magcal/gdf/homogeneity.py +1218 -0
- machinegnostics/magcal/gdf/intv_engine.py +1523 -0
- machinegnostics/magcal/gdf/marginal_intv_analysis.py +558 -0
- machinegnostics/magcal/gdf/qgdf.py +289 -0
- machinegnostics/magcal/gdf/qldf.py +296 -0
- machinegnostics/magcal/gdf/scedasticity.py +197 -0
- machinegnostics/magcal/gdf/wedf.py +181 -0
- machinegnostics/magcal/gdf/z0_estimator.py +1047 -0
- machinegnostics/magcal/layer_base.py +42 -0
- machinegnostics/magcal/layer_history_base.py +74 -0
- machinegnostics/magcal/layer_io_process_base.py +238 -0
- machinegnostics/magcal/layer_param_base.py +448 -0
- machinegnostics/magcal/mg_weights.py +36 -0
- machinegnostics/magcal/sample_characteristics.py +532 -0
- machinegnostics/magcal/scale_optimization.py +185 -0
- machinegnostics/magcal/scale_param.py +313 -0
- machinegnostics/magcal/util/__init__.py +0 -0
- machinegnostics/magcal/util/dis_docstring.py +18 -0
- machinegnostics/magcal/util/logging.py +24 -0
- machinegnostics/magcal/util/min_max_float.py +34 -0
- machinegnostics/magnet/__init__.py +0 -0
- machinegnostics/metrics/__init__.py +28 -0
- machinegnostics/metrics/accu.py +61 -0
- machinegnostics/metrics/accuracy.py +67 -0
- machinegnostics/metrics/auto_correlation.py +183 -0
- machinegnostics/metrics/auto_covariance.py +204 -0
- machinegnostics/metrics/cls_report.py +130 -0
- machinegnostics/metrics/conf_matrix.py +93 -0
- machinegnostics/metrics/correlation.py +178 -0
- machinegnostics/metrics/cross_variance.py +167 -0
- machinegnostics/metrics/divi.py +82 -0
- machinegnostics/metrics/evalmet.py +109 -0
- machinegnostics/metrics/f1_score.py +128 -0
- machinegnostics/metrics/gmmfe.py +108 -0
- machinegnostics/metrics/hc.py +141 -0
- machinegnostics/metrics/mae.py +72 -0
- machinegnostics/metrics/mean.py +117 -0
- machinegnostics/metrics/median.py +122 -0
- machinegnostics/metrics/mg_r2.py +167 -0
- machinegnostics/metrics/mse.py +78 -0
- machinegnostics/metrics/precision.py +119 -0
- machinegnostics/metrics/r2.py +122 -0
- machinegnostics/metrics/recall.py +108 -0
- machinegnostics/metrics/rmse.py +77 -0
- machinegnostics/metrics/robr2.py +119 -0
- machinegnostics/metrics/std.py +144 -0
- machinegnostics/metrics/variance.py +101 -0
- machinegnostics/models/__init__.py +2 -0
- machinegnostics/models/classification/__init__.py +1 -0
- machinegnostics/models/classification/layer_history_log_reg.py +121 -0
- machinegnostics/models/classification/layer_io_process_log_reg.py +98 -0
- machinegnostics/models/classification/layer_mlflow_log_reg.py +107 -0
- machinegnostics/models/classification/layer_param_log_reg.py +275 -0
- machinegnostics/models/classification/mg_log_reg.py +273 -0
- machinegnostics/models/cross_validation.py +118 -0
- machinegnostics/models/data_split.py +106 -0
- machinegnostics/models/regression/__init__.py +2 -0
- machinegnostics/models/regression/layer_histroy_rob_reg.py +139 -0
- machinegnostics/models/regression/layer_io_process_rob_rig.py +88 -0
- machinegnostics/models/regression/layer_mlflow_rob_reg.py +134 -0
- machinegnostics/models/regression/layer_param_rob_reg.py +212 -0
- machinegnostics/models/regression/mg_lin_reg.py +253 -0
- machinegnostics/models/regression/mg_poly_reg.py +258 -0
- machinegnostics-0.0.1.dist-info/METADATA +246 -0
- machinegnostics-0.0.1.dist-info/RECORD +93 -0
- machinegnostics-0.0.1.dist-info/WHEEL +5 -0
- machinegnostics-0.0.1.dist-info/licenses/LICENSE +674 -0
- machinegnostics-0.0.1.dist-info/top_level.txt +2 -0
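For orientation, a minimal install-and-import sketch (an assumption about the intended workflow, not something shipped in the wheel): the distribution name and version come from the wheel filename above, and the subpackage paths mirror the file listing.

```python
# Hedged sketch: install the published version and confirm the top-level layout
# seen in the file listing above.
#   pip install machinegnostics==0.0.1
import machinegnostics
import machinegnostics.magcal as magcal      # GDF machinery: distribution functions, DataCluster, intervals
import machinegnostics.metrics as metrics    # gnostic and classical metrics (mae, rmse, f1_score, ...)
import machinegnostics.models as models      # regression and classification model layers

# __version__ is not confirmed by the listing, so fall back to the wheel version.
print(getattr(machinegnostics, "__version__", "0.0.1"))
```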
machinegnostics/magcal/gdf/data_cluster.py (new file)
@@ -0,0 +1,975 @@
"""
DataCluster: Advanced Cluster Boundary Detection for Gnostic Distribution Functions (GDFs)

The DataCluster class identifies main cluster boundaries (CLB and CUB) from probability density functions of four types of Gnostic Distribution Functions: ELDF, EGDF, QLDF, and QGDF.

Author: Nirmal Parmar
Machine Gnostics
"""

import numpy as np
import warnings
import logging
from machinegnostics.magcal.util.logging import get_logger
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks, argrelextrema
from typing import Union, Dict, Any, Optional, Tuple, List

class DataCluster:
    """
    Advanced cluster boundary detection for Gnostic Distribution Functions (GDFs).

    The DataCluster class identifies main cluster boundaries (CLB and CUB) from probability
    density functions of four types of Gnostic Distribution Functions: ELDF, EGDF, QLDF, and QGDF.
    It uses normalized PDF analysis with derivative-based methods and shape detection algorithms
    to precisely locate cluster boundaries.

    Clustering Performance by GDF Type:
    - **Local Functions (ELDF, QLDF)**: Excellent clustering performance due to unlimited
      flexibility controlled by scale parameter
    - **Global Functions (EGDF, QGDF)**: Limited clustering effectiveness due to constrained
      flexibility and uniqueness assumptions

    Key Features:
    - PDF normalization for consistent analysis across all GDF types
    - QLDF W-shape vs U-shape detection for accurate valley boundary identification
    - Derivative-based boundary detection with adaptive thresholds
    - Multiple fallback methods for robust cluster identification
    - Comprehensive error handling and validation

    Parameters
    ----------
    gdf : ELDF, EGDF, QLDF, or QGDF
        A fitted Gnostic Distribution Function object with pdf_points available.
        Must have been fitted with catch=True to ensure pdf_points are stored.
    verbose : bool, default=False
        Enable detailed progress reporting and diagnostic output.
    catch : bool, default=True
        Enable error catching and graceful degradation (inherited from GDF conventions).
    derivative_threshold : float, default=0.01
        Threshold for ELDF/EGDF boundary detection. Points where (PDF + 1st_derivative)
        falls below this threshold are considered boundary candidates.
    slope_percentile : int, default=70
        Percentile threshold for QLDF/QGDF slope-based boundary detection. Higher values
        create more conservative (narrower) cluster boundaries.

    Attributes
    ----------
    LCB : float or None
        Cluster Lower Boundary (CLB) - left boundary of the main cluster
    UCB : float or None
        Cluster Upper Boundary (CUB) - right boundary of the main cluster
    z0 : float or None
        Characteristic point of the distribution (from GDF object)
    S_opt : float or None
        Optimal scale parameter (from GDF object)
    pdf_normalized : ndarray or None
        Min-max normalized PDF values [0,1] used for analysis
    pdf_original : ndarray or None
        Original PDF values before normalization
    params : dict
        Complete analysis results including boundaries, methods used, and diagnostics

    Methods
    -------
    fit()
        Perform cluster boundary detection analysis
    plot(figsize=(12, 8))
        Visualize PDF, boundaries, and derivative analysis
    results()
        Return comprehensive analysis results dictionary

    Algorithm Details
    -----------------
    **ELDF/EGDF (Estimating Distribution Functions):**
    - PDF has global maximum at z0 (characteristic point)
    - Boundaries found where (PDF + 1st_derivative) ≤ derivative_threshold
    - Main cluster region is BETWEEN CLB and CUB (shaded green)
    - Works best with local ELDF due to flexible scale parameter control

    **QLDF (Quantifying Local Distribution Function):**
    - **W-shape detection**: Identifies peaks between boundary extremes
    - 1 internal peak → W-shape → Find valley minima as boundaries
    - 0 internal peaks → U-shape → Use slope transition method
    - 2+ internal peaks → Heterogeneous data warning
    - **Valley detection**: Uses scipy.signal.argrelextrema for precise minima
    - Main cluster region is OUTSIDE CLB and CUB boundaries (shaded green)

    **QGDF (Quantifying Global Distribution Function):**
    - Uses slope transition detection with percentile-based thresholds
    - Limited effectiveness due to global function constraints
    - Fallback to curvature analysis when slope detection fails
    - Main cluster region is OUTSIDE CLB and CUB boundaries (shaded green)

    Normalization Strategy
    ----------------------
    All PDFs are normalized to [0,1] range using min-max normalization:
    - Ensures consistent threshold application across different GDF types
    - Enables robust derivative analysis regardless of original PDF scale
    - Maintains relative shape characteristics while standardizing magnitude

    Error Handling
    --------------
    - Validates GDF object fitness and required attributes
    - Warns when using global functions (EGDF/QGDF) for clustering
    - Provides fallback to data bounds when boundary detection fails
    - Comprehensive error logging with method traceability

    Examples
    --------
    >>> # Basic usage with QLDF
    >>> from machinegnostics.magcal import QLDF
    >>> from machinegnostics.magcal import DataCluster
    >>>
    >>> # Fit QLDF first
    >>> qldf = QLDF(data=your_data, catch=True)
    >>> qldf.fit()
    >>>
    >>> # Perform cluster analysis
    >>> cluster = DataCluster(gdf=qldf, verbose=True)
    >>> cluster.fit()
    >>> cluster.plot()
    >>>
    >>> # Get results
    >>> results = cluster.results()
    >>> print(f"CLB: {results['LCB']}, CUB: {results['UCB']}")
    >>> print(f"Cluster width: {results['cluster_width']}")
    >>> print(f"PDF shape: {results['pdf_shape']}")  # For QLDF

    >>> # Advanced usage with custom thresholds
    >>> cluster = DataCluster(
    ...     gdf=eldf,
    ...     derivative_threshold=0.005,  # More sensitive
    ...     slope_percentile=80,         # More conservative
    ...     verbose=True
    ... )
    >>> cluster.fit()

    Notes
    -----
    - Clustering works best with local distribution functions (ELDF, QLDF)
    - Global functions (EGDF, QGDF) have limited clustering effectiveness due to
      their uniqueness constraints and automatic parameter optimization
    - QLDF W-shape detection is particularly effective for data with central clusters
      between outlying regions
    - For heterogeneous data with multiple clusters, consider data splitting before analysis

    References
    ----------
    Based on Gnostic Distribution Function theory and cluster analysis methods
    as described in mathematical gnostics literature.
    """
    def __init__(self, gdf, verbose=False, catch=True, derivative_threshold=0.01, slope_percentile=70):
        """
        Initialize DataCluster for boundary detection analysis.

        Parameters
        ----------
        gdf : ELDF, EGDF, QLDF, or QGDF
            A fitted Gnostic Distribution Function object. Must have pdf_points
            available (fitted with catch=True).
        verbose : bool, default=False
            Enable detailed progress reporting and diagnostic messages.
        catch : bool, default=True
            Enable error catching and graceful degradation.
        derivative_threshold : float, default=0.01
            Threshold for ELDF/EGDF boundary detection. Lower values create
            wider cluster boundaries, higher values create narrower boundaries.
        slope_percentile : int, default=70
            Percentile threshold (0-100) for QLDF/QGDF slope detection.
            Higher values create more conservative cluster boundaries.

        Raises
        ------
        ValueError
            If GDF object is not fitted or missing required attributes.
        AttributeError
            If GDF object is missing pdf_points (ensure catch=True during fitting).
        """
        self.gdf = gdf
        self.gdf_type = gdf.__class__.__name__.lower()
        self.verbose = verbose
        self.catch = catch
        self.derivative_threshold = derivative_threshold
        self.slope_percentile = slope_percentile

        self.params = {
            'gdf_type': self.gdf_type,
            'derivative_threshold': self.derivative_threshold,
            'slope_percentile': self.slope_percentile,
            'LCB': None,
            'UCB': None,
            'Z0': None,
            'S_opt': None,
            'cluster_width': None,
            'clustering_successful': False,
            'method_used': None,
            'normalization_method': None,
            'pdf_shape': None,
            'errors': [],
            'warnings': []
        }

        self.LCB = None
        self.UCB = None
        self.z0 = None
        self.S_opt = None
        self._fitted = False

        self.pdf_normalized = None
        self.pdf_original = None

        # logger setup
        self.logger = get_logger(self.__class__.__name__, logging.DEBUG if verbose else logging.WARNING)
        self.logger.debug(f"{self.__class__.__name__} initialized:")

        # validation
        try:
            self._validate_gdf()
            self._validate_gdf_type_for_clustering()
        except Exception as e:
            self._append_error(f"GDF validation failed: {str(e)}", type(e).__name__)

    def _validate_gdf(self):
        self.logger.info("Validating GDF object for clustering analysis.")
        if not hasattr(self.gdf, '_fitted') or not self.gdf._fitted:
            self.logger.error("GDF object must be fitted before cluster analysis.")
            raise ValueError("GDF object must be fitted before cluster analysis")

        if not hasattr(self.gdf, 'pdf_points') or self.gdf.pdf_points is None:
            self.logger.error("GDF object missing pdf_points. Ensure catch=True during fitting.")
            raise AttributeError("GDF object missing pdf_points. Ensure catch=True during fitting.")

        if not hasattr(self.gdf, 'data'):
            self.logger.error("GDF object missing data attribute.")
            raise ValueError("GDF object missing data attribute.")

    def _validate_gdf_type_for_clustering(self):
        self.logger.info("Validating GDF type for clustering suitability.")

        if self.gdf_type in ['egdf', 'qgdf']:
            gdf_full_name = 'EGDF' if self.gdf_type == 'egdf' else 'QGDF'
            local_alternative = 'ELDF' if self.gdf_type == 'egdf' else 'QLDF'

            warning_msg = (
                f"Using {gdf_full_name} (Global Distribution Function) for clustering analysis. "
                f"Clustering may not be as effective with global functions. "
                f"Consider using {local_alternative} (Local Distribution Function) for better clustering results."
            )

            self._append_warning(warning_msg)

    def _append_error(self, error_message, exception_type=None):
        error_entry = {
            'method': 'DataCluster',
            'error': error_message,
            'exception_type': exception_type or 'DataClusterError'
        }

        self.params['errors'].append(error_entry)

        if hasattr(self.gdf, 'params') and 'errors' in self.gdf.params:
            self.gdf.params['errors'].append(error_entry)
        elif hasattr(self.gdf, 'params'):
            self.gdf.params['errors'] = [error_entry]

        self.logger.error(f" Error: {error_message} ({exception_type})")
    def _append_warning(self, warning_message):
        warning_entry = {
            'method': 'DataCluster',
            'warning': warning_message
        }

        self.params['warnings'].append(warning_entry)

        if hasattr(self.gdf, 'params') and 'warnings' in self.gdf.params:
            self.gdf.params['warnings'].append(warning_entry)
        elif hasattr(self.gdf, 'params'):
            self.gdf.params['warnings'] = [warning_entry]

        self.logger.warning(f" Warning: {warning_message}")

    def _get_pdf_data(self):
        self.logger.info("Retrieving PDF data from GDF object.")
        return self.gdf.pdf_points

    def _get_data_points(self):
        self.logger.info("Retrieving data points from GDF object.")
        if hasattr(self.gdf, 'di_points_n') and self.gdf.di_points_n is not None:
            return self.gdf.di_points_n
        elif hasattr(self.gdf, 'di_points') and self.gdf.di_points is not None:
            return self.gdf.di_points
        elif hasattr(self.gdf, 'params') and 'di_points' in self.gdf.params:
            return self.gdf.params['di_points']
        else:
            self.logger.error("Cannot find data points in GDF object")
            raise AttributeError("Cannot find data points in GDF object")

    def _normalize_pdf(self, pdf_data):
        self.logger.info("Normalizing PDF data.")
        self.pdf_original = pdf_data.copy()

        pdf_min = np.min(pdf_data)
        pdf_max = np.max(pdf_data)

        if pdf_max == pdf_min:
            normalized_pdf = np.ones_like(pdf_data) * 0.5
            self.params['normalization_method'] = 'constant_pdf'
        else:
            normalized_pdf = (pdf_data - pdf_min) / (pdf_max - pdf_min)
            self.params['normalization_method'] = 'min_max_normalization'

        self.logger.info(f"PDF normalization: {self.params['normalization_method']}")
        self.logger.info(f"Normalized PDF range: [{np.min(normalized_pdf):.3f}, {np.max(normalized_pdf):.3f}]")

        return normalized_pdf

    def _get_z0(self):
        self.logger.info("Retrieving Z0 from GDF object.")

        if hasattr(self.gdf, 'z0') and self.gdf.z0 is not None:
            return self.gdf.z0
        elif hasattr(self.gdf, 'params') and 'z0' in self.gdf.params:
            return self.gdf.params['z0']
        else:
            self._append_warning("Z0 not found in GDF object. Using PDF global extremum as Z0.")
            return self._find_pdf_z0()

    def _get_s_opt(self):
        self.logger.info("Retrieving S_opt from GDF object.")

        if hasattr(self.gdf, 'S_opt') and self.gdf.S_opt is not None:
            return self.gdf.S_opt
        elif hasattr(self.gdf, 'params') and 'S_opt' in self.gdf.params:
            return self.gdf.params['S_opt']
        else:
            self._append_warning("S_opt not found in GDF object. Using default value 1.0.")
            return 1.0

    def _get_data_bounds(self):
        self.logger.info("Retrieving data bounds from GDF object.")

        if hasattr(self.gdf, 'DLB') and hasattr(self.gdf, 'DUB'):
            return self.gdf.DLB, self.gdf.DUB
        else:
            return np.min(self.gdf.data), np.max(self.gdf.data)

    def _find_pdf_z0(self):
        self.logger.info("Finding Z0 as PDF global extremum.")

        data_points = self._get_data_points()

        if self.gdf_type in ['eldf', 'egdf']:
            max_idx = np.argmax(self.pdf_normalized)
            return data_points[max_idx]
        else:
            min_idx = np.argmin(self.pdf_normalized)
            return data_points[min_idx]

    def _find_z0_index(self, data_points):
        self.logger.info("Finding index of Z0 in data points.")

        z0_idx = np.argmin(np.abs(data_points - self.z0))
        return z0_idx
    def _detect_qldf_shape_and_boundaries(self, pdf_normalized, data_points):
        self.logger.info("Detecting QLDF shape and determining boundaries.")

        z0_idx = self._find_z0_index(data_points)

        # Find all peaks with lower sensitivity to catch all significant peaks
        peaks, peak_properties = find_peaks(pdf_normalized,
                                            height=0.05,
                                            distance=5,
                                            prominence=0.05)

        # Exclude boundary peaks (first and last 10% of data)
        boundary_margin = len(data_points) // 10
        internal_peaks = peaks[(peaks > boundary_margin) & (peaks < len(data_points) - boundary_margin)]

        if self.verbose:
            self.logger.info(f"Found {len(internal_peaks)} internal peaks at indices: {internal_peaks}")
            if len(internal_peaks) > 0:
                peak_values = [f'{data_points[p]:.1f}' for p in internal_peaks]
                self.logger.info(f"Internal peak values: {peak_values}")

        # Determine shape based on number of internal peaks
        if len(internal_peaks) == 1:
            # W-shape: One peak between extremes
            self.params['pdf_shape'] = 'W-shape'
            return self._find_w_shape_valley_boundaries(pdf_normalized, data_points, internal_peaks[0])

        elif len(internal_peaks) == 0:
            # U-shape: No peaks between extremes
            self.params['pdf_shape'] = 'U-shape'
            return self._find_u_shape_slope_boundaries(pdf_normalized, data_points)

        else:
            # Heterogeneous: Multiple peaks (2+)
            self.params['pdf_shape'] = 'Heterogeneous'
            warning_msg = f"QLDF detected {len(internal_peaks)} internal peaks. Data may be heterogeneous. Consider splitting the dataset."
            self._append_warning(warning_msg)
            # Fallback to slope method
            return self._find_u_shape_slope_boundaries(pdf_normalized, data_points)

    def _find_w_shape_valley_boundaries(self, pdf_normalized, data_points, central_peak_idx):
        self.logger.info("W-shape detected, using valley detection method.")
        z0_idx = self._find_z0_index(data_points)
        central_peak_value = data_points[central_peak_idx]

        self.logger.info(f"W-shape detected with central peak at {central_peak_value:.3f}")
        self.logger.info(f"Z0 at {data_points[z0_idx]:.3f}")

        left_candidates = []
        right_candidates = []

        # Method 1: Find actual minima using scipy
        self.logger.info("Finding valley minima using scipy.signal.argrelextrema")
        minima_indices = argrelextrema(pdf_normalized, np.less, order=3)[0]

        # Filter minima and find those on left and right of central peak
        left_minima = [m for m in minima_indices if m < central_peak_idx and m > len(data_points)//10]
        right_minima = [m for m in minima_indices if m > central_peak_idx and m < len(data_points)*9//10]

        self.logger.info(f"Found {len(left_minima)} left minima, {len(right_minima)} right minima")

        # Take the closest minima to the central peak
        if left_minima:
            closest_left_min = max(left_minima)  # Closest to central peak from left
            left_candidates.append(closest_left_min)
            self.logger.info(f"Left valley minimum at {data_points[closest_left_min]:.3f}")

        if right_minima:
            closest_right_min = min(right_minima)  # Closest to central peak from right
            right_candidates.append(closest_right_min)
            self.logger.info(f"Right valley minimum at {data_points[closest_right_min]:.3f}")

        # Method 2: If no clear minima found, use regional minimum search
        self.logger.info("Checking for regional minima if no clear minima found")
        if not left_candidates or not right_candidates:
            self.logger.info("No clear minima found, using regional minimum search")

            # Define search regions around the central peak
            search_radius = (len(data_points) // 4)

            # Left region: from start to central peak
            left_start = max(0, central_peak_idx - search_radius)
            left_end = central_peak_idx
            if not left_candidates and left_end > left_start:
                left_region = pdf_normalized[left_start:left_end]
                local_min_idx = np.argmin(left_region) + left_start
                left_candidates.append(local_min_idx)
                self.logger.info(f"Left regional minimum at {data_points[local_min_idx]:.3f}")

            # Right region: from central peak to end
            right_start = central_peak_idx
            right_end = min(len(pdf_normalized), central_peak_idx + search_radius)
            if not right_candidates and right_end > right_start:
                right_region = pdf_normalized[right_start:right_end]
                local_min_idx = np.argmin(right_region) + right_start
                right_candidates.append(local_min_idx)
                self.logger.info(f"Right regional minimum at {data_points[local_min_idx]:.3f}")

        # Method 3: Enhanced valley detection using percentile approach
        self.logger.info("Checking for percentile-based valleys")
        if not left_candidates or not right_candidates:
            self.logger.info("Using percentile-based valley detection")

            # Find points in bottom 20% of PDF values
            valley_threshold = np.percentile(pdf_normalized, 20)
            valley_indices = np.where(pdf_normalized <= valley_threshold)[0]

            # Split valleys by central peak
            left_valleys = [v for v in valley_indices if v < central_peak_idx]
            right_valleys = [v for v in valley_indices if v > central_peak_idx]

            if left_valleys and not left_candidates:
                # Take valley closest to central peak
                left_candidates.append(max(left_valleys))
                self.logger.info(f"Left percentile valley at {data_points[max(left_valleys)]:.3f}")

            if right_valleys and not right_candidates:
                # Take valley closest to central peak
                right_candidates.append(min(right_valleys))
                self.logger.info(f"Right percentile valley at {data_points[min(right_valleys)]:.3f}")

        return left_candidates, right_candidates

    def _find_u_shape_slope_boundaries(self, pdf_normalized, data_points):
        self.logger.info("U-shape detected, using slope transition method.")

        z0_idx = self._find_z0_index(data_points)

        self.logger.info("U-shape detected, using slope transition method")

        # Use existing slope detection logic
        first_derivative = np.gradient(pdf_normalized)
        deriv_abs = np.abs(first_derivative)
        slope_threshold = np.percentile(deriv_abs, self.slope_percentile)

        left_candidates = []
        right_candidates = []

        search_radius = min(20, len(data_points) // 4)

        # Search for slope transitions
        for i in range(z0_idx - search_radius, -1, -1):
            if i >= 0 and deriv_abs[i] > slope_threshold:
                left_candidates.append(i)
                break

        for i in range(z0_idx + search_radius, len(deriv_abs)):
            if deriv_abs[i] > slope_threshold:
                right_candidates.append(i)
                break

        return left_candidates, right_candidates
    def _find_boundaries_normalized_method(self, pdf_normalized, data_points):
        self.logger.info("Finding cluster boundaries using normalized PDF and derivative methods.")

        z0_idx = self._find_z0_index(data_points)

        # Calculate derivatives on normalized PDF
        first_derivative = np.gradient(pdf_normalized)
        second_derivative = np.gradient(first_derivative)

        if self.gdf_type in ['eldf', 'egdf']:
            self.logger.info("Using ELDF/EGDF boundary detection method.")
            # ELDF/EGDF: Find where pdf + derivative falls below threshold
            combined_signal = pdf_normalized + first_derivative

            left_candidates = []
            right_candidates = []

            # Search outward from Z0
            for i in range(z0_idx - 1, -1, -1):
                if combined_signal[i] <= self.derivative_threshold:
                    left_candidates.append(i)
                    break

            for i in range(z0_idx + 1, len(combined_signal)):
                if combined_signal[i] <= self.derivative_threshold:
                    right_candidates.append(i)
                    break

            if left_candidates:
                self.LCB = data_points[left_candidates[0]]
            if right_candidates:
                self.UCB = data_points[right_candidates[0]]

            self.params['method_used'] = 'normalized_derivative_eldf_egdf'

        elif self.gdf_type == 'qldf':
            self.logger.info("Using QLDF boundary detection method.")
            # QLDF: Use shape-based detection strategy
            left_candidates, right_candidates = self._detect_qldf_shape_and_boundaries(pdf_normalized, data_points)

            if left_candidates:
                self.LCB = data_points[left_candidates[0]]
            if right_candidates:
                self.UCB = data_points[right_candidates[0]]

            shape = self.params.get('pdf_shape', 'unknown')
            self.params['method_used'] = f'qldf_{shape.lower()}_valley_detection'

        else:
            self.logger.info("Using QGDF boundary detection method.")
            # QGDF: Use slope transition method
            deriv_abs = np.abs(first_derivative)
            slope_threshold = np.percentile(deriv_abs, self.slope_percentile)

            left_candidates = []
            right_candidates = []

            search_radius = min(20, len(data_points) // 4)

            for i in range(z0_idx - search_radius, -1, -1):
                if i >= 0 and deriv_abs[i] > slope_threshold:
                    left_candidates.append(i)
                    break

            for i in range(z0_idx + search_radius, len(deriv_abs)):
                if deriv_abs[i] > slope_threshold:
                    right_candidates.append(i)
                    break

            if not left_candidates or not right_candidates:
                self.logger.info("Using normalized curvature-based detection")

                curvature_threshold = np.std(second_derivative) * 0.7

                for i in range(z0_idx - 1, -1, -1):
                    if abs(second_derivative[i]) > curvature_threshold:
                        if not left_candidates:
                            left_candidates.append(i)
                        break

                for i in range(z0_idx + 1, len(second_derivative)):
                    if abs(second_derivative[i]) > curvature_threshold:
                        if not right_candidates:
                            right_candidates.append(i)
                        break

                self.params['method_used'] = 'normalized_curvature_qgdf'
            else:
                self.params['method_used'] = 'normalized_slope_transition_qgdf'

            if left_candidates:
                self.LCB = data_points[left_candidates[0]]
            if right_candidates:
                self.UCB = data_points[right_candidates[0]]

        if self.verbose:
            method = self.params['method_used']
            self.logger.info(f"Using method: {method}")
            if hasattr(self, 'params') and 'pdf_shape' in self.params:
                self.logger.info(f"PDF shape: {self.params['pdf_shape']}")
            if self.LCB is not None:
                self.logger.info(f"Found CLB at {self.LCB:.3f}")
            if self.UCB is not None:
                self.logger.info(f"Found CUB at {self.UCB:.3f}")

    def _fallback_to_data_bounds(self):
        self.logger.info("Falling back to data bounds for cluster boundaries.")

        dlb, dub = self._get_data_bounds()

        if self.LCB is None:
            self.LCB = dlb
            if self.verbose:
                self.logger.info(f"CLB set to data lower bound: {self.LCB:.3f}")

        if self.UCB is None:
            self.UCB = dub
            if self.verbose:
                self.logger.info(f"CUB set to data upper bound: {self.UCB:.3f}")

    def _update_params(self):
        self.logger.info("Updating parameters with clustering results.")
        self.params.update({
            'LCB': float(self.LCB) if self.LCB is not None else None,
            'UCB': float(self.UCB) if self.UCB is not None else None,
            'Z0': float(self.z0) if self.z0 is not None else None,
            'S_opt': float(self.S_opt) if self.S_opt is not None else None,
            'cluster_width': float(self.UCB - self.LCB) if (self.LCB is not None and self.UCB is not None) else None,
            'clustering_successful': self.LCB is not None and self.UCB is not None
        })

        if hasattr(self.gdf, 'params'):
            cluster_params = {
                'data_cluster': {
                    'LCB': self.params['LCB'],
                    'UCB': self.params['UCB'],
                    'cluster_width': self.params['cluster_width'],
                    'clustering_successful': self.params['clustering_successful'],
                    'method_used': self.params['method_used'],
                    'derivative_threshold': self.params['derivative_threshold'],
                    'slope_percentile': self.params['slope_percentile'],
                    'normalization_method': self.params['normalization_method'],
                    'pdf_shape': self.params.get('pdf_shape', None)
                }
            }
            self.gdf.params.update(cluster_params)
    def fit(self, plot: bool = False) -> Optional[Tuple[float, float]]:
        """
        Perform cluster boundary detection analysis on the GDF.

        Executes the complete clustering pipeline:
        1. Validates GDF object and extracts PDF data
        2. Normalizes PDF for consistent analysis
        3. Applies GDF-specific boundary detection algorithms
        4. Implements fallback strategies if needed
        5. Updates all parameters and results

        The method automatically selects the appropriate algorithm based on GDF type:
        - **ELDF/EGDF**: Derivative threshold method
        - **QLDF**: Shape detection (W-shape vs U-shape) with valley finding
        - **QGDF**: Slope transition detection with curvature fallback

        Parameters
        ----------
        plot : bool, default=False
            If True, generates a plot of the PDF, detected boundaries, and derivative analysis.

        Returns
        -------
        tuple of (float, float) or (None, None)
            The detected (LCB, UCB) boundaries if the analysis completed successfully,
            or (None, None) if errors occurred.
            Check self.params['errors'] for detailed error information.

        Side Effects
        ------------
        - Sets self.LCB and self.UCB with detected boundaries
        - Updates self.params with complete analysis results
        - Stores normalized and original PDF data
        - Adds cluster parameters to original GDF object

        Examples
        --------
        >>> cluster = DataCluster(gdf=qldf, verbose=True)
        >>> CLB, CUB = cluster.fit()
        >>> if CLB is not None and CUB is not None:
        ...     print(f"Boundaries: CLB={CLB:.3f}, CUB={CUB:.3f}")
        ... else:
        ...     print("Clustering failed:", cluster.params['errors'])
        """
        self.logger.info("Starting cluster boundary detection analysis.")
        try:
            if self.verbose:
                self.logger.info(f"Starting normalized cluster analysis for {self.gdf_type.upper()}")
                self.logger.info(f"Derivative threshold: {self.derivative_threshold}")
                self.logger.info(f"Slope percentile: {self.slope_percentile}")

            # Get basic data
            self.logger.info("Extracting PDF and data points from GDF.")
            pdf_data = self._get_pdf_data()
            data_points = self._get_data_points()

            # Normalize PDF for consistent processing
            self.logger.info("Normalizing PDF data for analysis.")
            self.pdf_normalized = self._normalize_pdf(pdf_data)

            # Get Z0 and S_opt
            self.logger.info("Retrieving Z0 and S_opt from GDF.")
            self.z0 = self._get_z0()
            self.S_opt = self._get_s_opt()

            self.logger.info(f"Z0: {self.z0:.3f}, S_opt: {self.S_opt:.3f}")

            # Apply normalized clustering method
            self.logger.info("Applying boundary detection method based on GDF type.")
            self._find_boundaries_normalized_method(self.pdf_normalized, data_points)

            # Fallback to data bounds if needed
            if self.LCB is None or self.UCB is None:
                self.logger.info("Normalized method incomplete, using data bounds as fallback")
                self._fallback_to_data_bounds()

            # Update params
            self.logger.info("Updating parameters with final results.")
            self._update_params()

            self._fitted = True

            # Optional plotting
            if plot:
                self.logger.info("Generating plot for PDF and detected boundaries.")
                self.plot()

            self.logger.info(f"Final boundaries: CLB={self.LCB:.3f}, CUB={self.UCB:.3f}")
            self.logger.info("Clustering analysis completed successfully.")

            return self.LCB, self.UCB

        except Exception as e:
            error_msg = f"Error during cluster analysis: {str(e)}"
            self._append_error(error_msg, type(e).__name__)

            return None, None
    def results(self):
        """
        Return comprehensive cluster analysis results dictionary.

        Provides complete analysis results including boundaries, cluster characteristics,
        method diagnostics, and error information.

        Returns
        -------
        dict
            Complete analysis results with the following keys:

            **Boundary Results:**
            - 'LCB' : float or None - Cluster Lower Boundary
            - 'UCB' : float or None - Cluster Upper Boundary
            - 'cluster_width' : float or None - Distance between boundaries
            - 'clustering_successful' : bool - Overall success status

            **GDF Information:**
            - 'gdf_type' : str - Type of GDF ('eldf', 'egdf', 'qldf', 'qgdf')
            - 'Z0' : float or None - Characteristic point from GDF
            - 'S_opt' : float or None - Optimal scale parameter from GDF

            **Method Details:**
            - 'method_used' : str - Specific algorithm used for boundary detection
            - 'normalization_method' : str - PDF normalization approach
            - 'pdf_shape' : str or None - Detected shape for QLDF ('W-shape', 'U-shape', 'Heterogeneous')

            **Parameters:**
            - 'derivative_threshold' : float - Threshold used for ELDF/EGDF
            - 'slope_percentile' : int - Percentile used for QLDF/QGDF

            **Diagnostics:**
            - 'errors' : list - Any errors encountered during analysis
            - 'warnings' : list - Warning messages (e.g., global function usage)

        Raises
        ------
        RuntimeError
            If fit() method has not been called successfully.

        Examples
        --------
        >>> cluster = DataCluster(gdf=qldf)
        >>> cluster.fit()
        >>> results = cluster.results()
        >>>
        >>> # Access boundary information
        >>> print(f"Lower boundary: {results['LCB']}")
        >>> print(f"Upper boundary: {results['UCB']}")
        >>> print(f"Cluster width: {results['cluster_width']}")
        >>>
        >>> # Check method and shape information
        >>> print(f"Method used: {results['method_used']}")
        >>> if results['pdf_shape']:
        ...     print(f"PDF shape: {results['pdf_shape']}")
        >>>
        >>> # Verify success and check for issues
        >>> if results['clustering_successful']:
        ...     print("Clustering completed successfully")
        ... else:
        ...     print("Issues found:", results['errors'])
        """
        self.logger.info("Retrieving cluster analysis results.")

        if not self._fitted:
            self.logger.error("No analysis results available. Call fit() method first.")
            raise RuntimeError("No analysis results available. Call fit() method first.")

        return self.params.copy()
    def plot(self, figsize=(12, 8)):
        """
        Create comprehensive visualization of cluster boundary detection results.

        Generates a two-panel plot showing:
        1. **Top panel**: Original PDF with detected boundaries, Z0, and cluster regions
        2. **Bottom panel**: Derivative analysis with thresholds and boundary markers

        Visualization Features:
        - Original PDF curve with detected CLB/CUB boundaries (green dotted lines)
        - Z0 characteristic point (red solid line)
        - Cluster region shading (light green):
          - ELDF/EGDF: Between CLB and CUB
          - QLDF/QGDF: Outside CLB and CUB boundaries
        - First and second derivatives for boundary detection analysis
        - Threshold lines and slope indicators
        - QLDF shape information (W-shape, U-shape, Heterogeneous) in title

        Parameters
        ----------
        figsize : tuple, default=(12, 8)
            Figure size as (width, height) in inches.

        Raises
        ------
        RuntimeError
            If fit() has not been called successfully before plotting.

        Notes
        -----
        - Requires successful completion of fit() method
        - Automatically adjusts visualization based on GDF type
        - For QLDF, includes PDF shape detection results in title
        - Derivative plots help understand boundary detection mechanism
        - Green shaded regions indicate the main cluster areas

        Examples
        --------
        >>> cluster = DataCluster(gdf=qldf)
        >>> cluster.fit()
        >>> cluster.plot()  # Standard plot
        >>> cluster.plot(figsize=(15, 10))  # Larger plot
        """
        self.logger.info("Creating plot for cluster boundary detection results.")
        try:
            data_points = self._get_data_points()

            # Calculate derivatives for plotting
            first_derivative = np.gradient(self.pdf_normalized)
            second_derivative = np.gradient(first_derivative)
            combined_signal = self.pdf_normalized + first_derivative

            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize, height_ratios=[3, 2])

            # Top plot: Original PDF and boundaries
            ax1.plot(data_points, self.pdf_original, 'b-', label='Original PDF', linewidth=2)

            # Plot Z0
            if self.z0 is not None:
                ax1.axvline(x=self.z0, color='red', linestyle='-', linewidth=2, alpha=0.7, label=f'Z0={self.z0:.3f}')

            # Plot boundaries
            if self.LCB is not None:
                ax1.axvline(x=self.LCB, color='green', linestyle=':', linewidth=2, label=f'CLB={self.LCB:.3f}')
            if self.UCB is not None:
                ax1.axvline(x=self.UCB, color='green', linestyle=':', linewidth=2, label=f'CUB={self.UCB:.3f}')

            # Shade regions based on GDF type
            dlb, dub = self._get_data_bounds()
            if self.LCB is not None and self.UCB is not None:
                if self.gdf_type in ['eldf', 'egdf']:
                    ax1.axvspan(self.LCB, self.UCB, alpha=0.2, color='lightgreen', label='Main Cluster')
                else:
                    ax1.axvspan(dlb, self.LCB, alpha=0.2, color='lightgreen', label='Main Cluster')
                    ax1.axvspan(self.UCB, dub, alpha=0.2, color='lightgreen')

            # Add shape info to title for QLDF
            title = f'{self.gdf_type.upper()} Normalized Cluster Detection'
            if self.gdf_type == 'qldf' and 'pdf_shape' in self.params:
                title += f' ({self.params["pdf_shape"]})'

            ax1.set_ylabel('PDF Values')
            ax1.set_title(title)
            ax1.legend()
            ax1.grid(True, alpha=0.3)

            # Bottom plot: Derivatives and thresholds
            ax2.plot(data_points, first_derivative, 'orange', label='1st Derivative', alpha=0.7)
            ax2.plot(data_points, combined_signal, 'purple', label='PDF + 1st Derivative', linewidth=2)

            # Plot threshold lines
            if self.gdf_type in ['eldf', 'egdf']:
                ax2.axhline(y=self.derivative_threshold, color='red', linestyle='--', alpha=0.7,
                            label=f'Threshold={self.derivative_threshold}')
            else:
                # For QLDF/QGDF, show slope threshold
                deriv_abs = np.abs(first_derivative)
                slope_threshold = np.percentile(deriv_abs, self.slope_percentile)
                ax2.plot(data_points, deriv_abs, 'brown', label='|1st Derivative|', alpha=0.7)
                ax2.axhline(y=slope_threshold, color='red', linestyle='--', alpha=0.7,
                            label=f'Slope Threshold ({self.slope_percentile}%)')
            ax2.plot(data_points, second_derivative, 'gray', label='2nd Derivative', alpha=0.5)
            ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3, label='Zero Line')

            # Plot boundaries on derivative plot
            if self.LCB is not None:
                ax2.axvline(x=self.LCB, color='green', linestyle=':', linewidth=2, alpha=0.7)
            if self.UCB is not None:
                ax2.axvline(x=self.UCB, color='green', linestyle=':', linewidth=2, alpha=0.7)

            # Plot Z0 on derivative plot
            if self.z0 is not None:
                ax2.axvline(x=self.z0, color='red', linestyle='-', linewidth=2, alpha=0.7)

            ax2.set_xlabel('Data Points')
            ax2.set_ylabel('Derivative Values')
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

        except Exception as e:
            error_msg = f"Error creating plot: {str(e)}"
            self._append_error(error_msg, type(e).__name__)

    def __repr__(self):
        return (f"<DataCluster(gdf_type={self.gdf_type}, "
                f"LCB={self.LCB}, UCB={self.UCB}, "
                f"Z0={self.z0}, S_opt={self.S_opt}, "
                f"fitted={self._fitted})>")
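As a concrete illustration of the workflow documented in the class docstring above, here is a small, hedged end-to-end sketch. The QLDF and DataCluster import paths and call signatures are taken from the docstring examples in data_cluster.py; the synthetic data, the random seed, and the printed fields are illustrative assumptions only, and the sketch assumes the fitted QLDF stores pdf_points when created with catch=True, as the validation code requires.

```python
# Hedged usage sketch based on the docstring examples in data_cluster.py above.
# Assumes machinegnostics 0.0.1 is installed; data values are synthetic placeholders.
import numpy as np
from machinegnostics.magcal import QLDF, DataCluster

# A central cluster with a few outlying points on each side (synthetic example data)
rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(50.0, 2.0, 60), [20.0, 22.0, 85.0, 88.0]])

qldf = QLDF(data=data, catch=True)   # catch=True so pdf_points are stored, per the docstring
qldf.fit()

cluster = DataCluster(gdf=qldf, verbose=False)
lcb, ucb = cluster.fit()             # returns (LCB, UCB), or (None, None) on failure

res = cluster.results()
print(f"LCB={res['LCB']}, UCB={res['UCB']}, width={res['cluster_width']}")
print(f"method={res['method_used']}, shape={res['pdf_shape']}")
# cluster.plot()                     # optional two-panel diagnostic plot
```

Note that for QLDF/QGDF the "main cluster" region lies outside the detected LCB/UCB pair, while for ELDF/EGDF it lies between them, so the boundaries should be interpreted according to the GDF type as described in the Algorithm Details section of the docstring.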