mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +32 -6
- mlquantify/base.py +559 -257
- mlquantify/classification/__init__.py +1 -1
- mlquantify/classification/methods.py +160 -0
- mlquantify/evaluation/__init__.py +14 -2
- mlquantify/evaluation/measures.py +215 -0
- mlquantify/evaluation/protocol.py +647 -0
- mlquantify/methods/__init__.py +37 -40
- mlquantify/methods/aggregative.py +1030 -0
- mlquantify/methods/meta.py +472 -0
- mlquantify/methods/mixture_models.py +1003 -0
- mlquantify/methods/non_aggregative.py +136 -0
- mlquantify/methods/threshold_optimization.py +957 -0
- mlquantify/model_selection.py +377 -232
- mlquantify/plots.py +367 -0
- mlquantify/utils/__init__.py +2 -2
- mlquantify/utils/general.py +334 -0
- mlquantify/utils/method.py +449 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
- mlquantify-0.1.1.dist-info/RECORD +22 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
- mlquantify/classification/pwkclf.py +0 -73
- mlquantify/evaluation/measures/__init__.py +0 -26
- mlquantify/evaluation/measures/ae.py +0 -11
- mlquantify/evaluation/measures/bias.py +0 -16
- mlquantify/evaluation/measures/kld.py +0 -8
- mlquantify/evaluation/measures/mse.py +0 -12
- mlquantify/evaluation/measures/nae.py +0 -16
- mlquantify/evaluation/measures/nkld.py +0 -13
- mlquantify/evaluation/measures/nrae.py +0 -16
- mlquantify/evaluation/measures/rae.py +0 -12
- mlquantify/evaluation/measures/se.py +0 -12
- mlquantify/evaluation/protocol/_Protocol.py +0 -202
- mlquantify/evaluation/protocol/__init__.py +0 -2
- mlquantify/evaluation/protocol/app.py +0 -146
- mlquantify/evaluation/protocol/npp.py +0 -34
- mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
- mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
- mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
- mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
- mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
- mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
- mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
- mlquantify/methods/aggregative/__init__.py +0 -9
- mlquantify/methods/aggregative/cc.py +0 -32
- mlquantify/methods/aggregative/emq.py +0 -86
- mlquantify/methods/aggregative/fm.py +0 -72
- mlquantify/methods/aggregative/gac.py +0 -96
- mlquantify/methods/aggregative/gpac.py +0 -87
- mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
- mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
- mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
- mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
- mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
- mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
- mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
- mlquantify/methods/aggregative/pcc.py +0 -33
- mlquantify/methods/aggregative/pwk.py +0 -38
- mlquantify/methods/meta/__init__.py +0 -1
- mlquantify/methods/meta/ensemble.py +0 -236
- mlquantify/methods/non_aggregative/__init__.py +0 -1
- mlquantify/methods/non_aggregative/hdx.py +0 -71
- mlquantify/plots/__init__.py +0 -2
- mlquantify/plots/distribution_plot.py +0 -109
- mlquantify/plots/protocol_plot.py +0 -193
- mlquantify/utils/general_purposes/__init__.py +0 -8
- mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
- mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
- mlquantify/utils/general_purposes/get_real_prev.py +0 -9
- mlquantify/utils/general_purposes/load_quantifier.py +0 -4
- mlquantify/utils/general_purposes/make_prevs.py +0 -23
- mlquantify/utils/general_purposes/normalize.py +0 -20
- mlquantify/utils/general_purposes/parallel.py +0 -10
- mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
- mlquantify/utils/method_purposes/__init__.py +0 -6
- mlquantify/utils/method_purposes/distances.py +0 -21
- mlquantify/utils/method_purposes/getHist.py +0 -13
- mlquantify/utils/method_purposes/get_scores.py +0 -33
- mlquantify/utils/method_purposes/moss.py +0 -16
- mlquantify/utils/method_purposes/ternary_search.py +0 -14
- mlquantify/utils/method_purposes/tprfpr.py +0 -42
- mlquantify-0.0.11.2.dist-info/RECORD +0 -73
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
mlquantify/methods/threshold_optimization.py
@@ -0,0 +1,957 @@
from abc import abstractmethod
import numpy as np
import warnings
from sklearn.base import BaseEstimator

from ..base import AggregativeQuantifier
from ..utils.method import adjust_threshold, get_scores
import mlquantify as mq


class ThresholdOptimization(AggregativeQuantifier):
    """
    Generic class for methods that adjust the decision boundary of the underlying classifier
    to make the ACC (base method for threshold methods) estimation more numerically stable.
    Most strategies involve altering the denominator of the ACC equation.

    This class serves as a base for implementing threshold optimization techniques in classification
    tasks. It is designed to adjust thresholds based on true positive and false positive rates,
    ensuring better quantification performance.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for threshold optimization.
    threshold : float, optional
        The threshold value to be used for classification decisions. Default is 0.5.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float, optional
        The optimized threshold used for classification decisions.
    cc_output : float, optional
        The classification count output, representing the proportion of instances classified
        as positive based on the threshold.
    tpr : float, optional
        The true positive rate corresponding to the best threshold.
    fpr : float, optional
        The false positive rate corresponding to the best threshold.

    Notes
    -----
    All methods that inherit from this class are binary quantifiers. For multiclass problems,
    quantification is performed One-vs-All.

    Examples
    --------
    >>> from mlquantify.methods.threshold_optimization import ThresholdOptimization
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> class MyThrMethod(ThresholdOptimization):
    ...     def __init__(self, learner, threshold=0.5):
    ...         super().__init__(learner)
    ...         self.threshold = threshold
    ...     def best_tprfpr(self, thresholds, tpr, fpr):
    ...         return thresholds[20], tpr[20], fpr[20]
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> mtm = MyThrMethod(learner=SVC(probability=True), threshold=0.5)
    >>> mtm.fit(X_train, y_train)
    >>> y_pred = mtm.predict(X_test)
    """

    def __init__(self, learner: BaseEstimator = None):
        self.learner = learner
        self.threshold = None
        self.cc_output = None
        self.tpr = None
        self.fpr = None

    @property
    def is_probabilistic(self) -> bool:
        """
        Returns whether the method is probabilistic.

        This method is used to determine whether the quantification method is probabilistic,
        meaning it uses class-conditional probabilities to estimate class prevalences.

        Returns
        -------
        bool
            True, indicating that this method is probabilistic.
        """
        return True

    @property
    def is_multiclass(self) -> bool:
        """
        Returns whether the method is applicable to multiclass quantification.

        Threshold-based methods are typically binary classifiers, so this method
        returns False.

        Returns
        -------
        bool
            False, indicating that this method does not support multiclass quantification.
        """
        return False

    def _fit_method(self, X, y):
        """
        Fits the classifier and adjusts thresholds based on true positive rate (TPR) and false positive rate (FPR).

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input features for training.
        y : pd.Series or np.ndarray
            The target labels for training.

        Returns
        -------
        self : ThresholdOptimization
            The fitted quantifier object with the best threshold, TPR, and FPR.
        """
        # Get predicted labels and probabilities
        if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
            y_labels = mq.arguments["y_labels"]
            probabilities = mq.arguments["posteriors_train"]
        else:
            y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)

        # Adjust thresholds and compute true and false positive rates
        thresholds, tprs, fprs = adjust_threshold(y_labels, probabilities[:, 1], self.classes)

        # Find the best threshold based on TPR and FPR
        self.threshold, self.tpr, self.fpr = self.best_tprfpr(thresholds, tprs, fprs)

        return self

    def _predict_method(self, X) -> np.ndarray:
        """
        Predicts class prevalences using the adjusted threshold.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input features for prediction.

        Returns
        -------
        np.ndarray
            An array of predicted prevalences for the classes.
        """
        # Get predicted probabilities for the positive class
        probabilities = self.predict_learner(X)[:, 1]

        # Compute the classification count output based on the threshold
        self.cc_output = len(probabilities[probabilities >= self.threshold]) / len(probabilities)

        # Calculate prevalence, ensuring it is within [0, 1]
        if self.tpr - self.fpr == 0:
            prevalence = self.cc_output
        else:
            # Equation of threshold methods to compute prevalence
            prevalence = np.clip((self.cc_output - self.fpr) / (self.tpr - self.fpr), 0, 1)

        prevalences = [1 - prevalence, prevalence]

        return np.asarray(prevalences)

    @abstractmethod
    def best_tprfpr(self, thresholds: np.ndarray, tpr: np.ndarray, fpr: np.ndarray) -> tuple:
        """
        Abstract method for determining the best TPR (True Positive Rate) and FPR (False Positive Rate)
        to use in the equation for threshold optimization.

        This method needs to be implemented by subclasses to define how the best threshold
        is chosen based on TPR and FPR.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tpr : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fpr : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            The chosen threshold together with its true positive and false positive rates.
        """
        ...
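
# A worked sketch of the adjustment shared by every subclass below: given the fraction of
# instances scored above the chosen threshold (cc_output) and the classifier's tpr/fpr at
# that threshold, the positive prevalence is recovered as (cc_output - fpr) / (tpr - fpr),
# clipped to [0, 1]. The helper name and the numbers in its docstring are hypothetical and
# only illustrate the formula used in _predict_method above.
def _sketch_threshold_adjustment(cc_output: float, tpr: float, fpr: float) -> np.ndarray:
    """E.g. cc_output=0.55, tpr=0.80, fpr=0.10 gives (0.55 - 0.10) / (0.80 - 0.10), i.e. about 0.643."""
    if tpr - fpr == 0:
        # Degenerate classifier: fall back to the raw classify-and-count output
        prevalence = cc_output
    else:
        prevalence = np.clip((cc_output - fpr) / (tpr - fpr), 0, 1)
    return np.asarray([1 - prevalence, prevalence])
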

class ACC(ThresholdOptimization):
    """
    Adjusted Classify and Count (ACC). This method is a base approach for threshold-based
    quantification methods.

    As described in the ThresholdOptimization base class, this method estimates the true
    positive rate (TPR) and false positive rate (FPR) from the training data. It then uses
    these values to adjust the output of the Classify and Count (CC) method, making the
    quantification process more accurate and stable.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.
    threshold : float, optional
        The decision threshold for classifying instances. Default is 0.5.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        The decision threshold used to classify instances as positive or negative. Default is 0.5.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import ACC
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> acc = ACC(learner=SVC(probability=True), threshold=0.5)
    >>> acc.fit(X_train, y_train)
    >>> y_pred = acc.predict(X_test)
    >>> y_pred
    {0: 0.3968506555196656, 1: 0.6031493444803344}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None, threshold: float = 0.5):
        super().__init__(learner)
        self.threshold = threshold

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Determines the true positive rate (TPR) and false positive rate (FPR) for the specified threshold.

        This method identifies the TPR and FPR corresponding to the threshold provided
        during initialization. It assumes that the `thresholds`, `tprs`, and `fprs` arrays
        are aligned, meaning the `i-th` element of each array corresponds to the same threshold.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing the threshold, the true positive rate (TPR), and the false
            positive rate (FPR) for the specified threshold.

        Raises
        ------
        IndexError
            If the specified threshold is not found in the `thresholds` array.
        """
        # Get the TPR and FPR where the threshold matches the specified value
        tpr = tprs[thresholds == self.threshold][0]
        fpr = fprs[thresholds == self.threshold][0]
        return (self.threshold, tpr, fpr)

class MAX(ThresholdOptimization):
    """
    Threshold MAX. This quantification method selects the threshold that maximizes
    the absolute difference between the true positive rate (TPR) and false positive
    rate (FPR). This threshold is then used in the denominator of the equation for
    adjusted prevalence estimation.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Counting positives accurately despite inaccurate classification. In: European conference on machine learning. Berlin, Heidelberg: Springer Berlin Heidelberg, 2005. p. 564-575. Available at: https://link.springer.com/chapter/10.1007/11564096_56

    Examples
    --------
    >>> from mlquantify.methods.aggregative import MAX
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> maxq = MAX(learner=SVC(probability=True))
    >>> maxq.fit(X_train, y_train)
    >>> y_pred = maxq.predict(X_test)
    >>> y_pred
    {0: 0.3920664352842359, 1: 0.6079335647157641}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Determines the optimal threshold by maximizing the absolute difference between
        the true positive rate (TPR) and the false positive rate (FPR).

        This method identifies the index where `|TPR - FPR|` is maximized and retrieves
        the corresponding threshold, TPR, and FPR.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing:
            - The threshold that maximizes `|TPR - FPR|`.
            - The true positive rate (TPR) at the selected threshold.
            - The false positive rate (FPR) at the selected threshold.

        Raises
        ------
        ValueError
            If `thresholds`, `tprs`, or `fprs` are empty or have mismatched lengths.
        """
        # Find the index where |TPR - FPR| is maximized
        max_index = np.argmax(np.abs(tprs - fprs))

        # Retrieve the corresponding threshold, TPR, and FPR
        threshold = thresholds[max_index]
        tpr = tprs[max_index]
        fpr = fprs[max_index]
        return (threshold, tpr, fpr)
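
# Standalone sketch of the MAX selection rule, using scikit-learn's roc_curve to build the
# (thresholds, tprs, fprs) grid; inside mlquantify this grid comes from the adjust_threshold
# utility instead. The helper is illustrative only.
def _sketch_max_rule(y_true: np.ndarray, scores: np.ndarray) -> tuple:
    from sklearn.metrics import roc_curve

    fprs, tprs, thresholds = roc_curve(y_true, scores)
    # MAX keeps the threshold with the largest |TPR - FPR| gap
    idx = np.argmax(np.abs(tprs - fprs))
    return thresholds[idx], tprs[idx], fprs[idx]
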

class MS(ThresholdOptimization):
    """
    Median Sweep (MS). This quantification method uses an ensemble
    of threshold-based methods, taking the median values of the
    true positive rate (TPR) and false positive rate (FPR) across
    all thresholds to compute adjusted prevalences.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.
    threshold : float, optional
        The default threshold value to use for the quantification method. Default is 0.5.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        The default threshold to use for the quantification method, typically 0.5.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    MAX : Threshold MAX quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import MS
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> ms = MS(learner=SVC(probability=True))
    >>> ms.fit(X_train, y_train)
    >>> y_pred = ms.predict(X_test)
    >>> y_pred
    {0: 0.41287676595138967, 1: 0.5871232340486103}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None, threshold: float = 0.5):
        super().__init__(learner)
        self.threshold = threshold

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Determines the optimal TPR and FPR by taking the median of
        all TPR and FPR values across the given thresholds.

        This method computes the median values of TPR and FPR to
        mitigate the influence of outliers and variability in the
        performance metrics.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing:
            - The default threshold value (float).
            - The median true positive rate (float).
            - The median false positive rate (float).

        Raises
        ------
        ValueError
            If `thresholds`, `tprs`, or `fprs` are empty or have mismatched lengths.
        """
        # Compute median TPR and FPR
        tpr = np.median(tprs)
        fpr = np.median(fprs)

        return (self.threshold, tpr, fpr)

class MS2(ThresholdOptimization):
    """
    Median Sweep 2 (MS2). This method is an extension of the
    Median Sweep strategy, but it focuses only on cases where
    the difference between the true positive rate (TPR) and the
    false positive rate (FPR) exceeds a threshold (0.25). The
    method computes the median values of TPR, FPR, and thresholds
    for these selected cases.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    MS : Median Sweep quantification method.
    CC : Classify and Count quantification method.

    Examples
    --------
    >>> from mlquantify.methods.aggregative import MS2
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> ms2 = MS2(learner=SVC(probability=True))
    >>> ms2.fit(X_train, y_train)
    >>> y_pred = ms2.predict(X_test)
    >>> y_pred
    {0: 0.41287676595138967, 1: 0.5871232340486103}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Determines the optimal threshold, TPR, and FPR by focusing only on
        cases where the absolute difference between TPR and FPR is greater
        than 0.25. For these cases, the method computes the median values.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing:
            - The median threshold value for cases meeting the condition (float).
            - The median true positive rate for cases meeting the condition (float).
            - The median false positive rate for cases meeting the condition (float).

        Raises
        ------
        ValueError
            If no cases satisfy the condition `|TPR - FPR| > 0.25`.
        Warning
            If all TPR or FPR values are zero.
        """
        # Check if all TPR or FPR values are zero
        if np.all(tprs == 0) or np.all(fprs == 0):
            warnings.warn("All TPR or FPR values are zero.")

        # Identify indices where the condition is satisfied
        indices = np.where(np.abs(tprs - fprs) > 0.25)[0]
        if len(indices) == 0:
            raise ValueError("No cases meet the condition |TPR - FPR| > 0.25.")

        # Compute medians for the selected cases
        threshold = np.median(thresholds[indices])
        tpr = np.median(tprs[indices])
        fpr = np.median(fprs[indices])

        return (threshold, tpr, fpr)
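
# Compact sketch contrasting the MS and MS2 selection rules on the same (tprs, fprs) grid;
# the grid is assumed to come from adjust_threshold as in _fit_method. Illustrative only.
def _sketch_median_sweep(tprs: np.ndarray, fprs: np.ndarray) -> dict:
    # MS: median TPR/FPR over the whole threshold grid
    ms = (np.median(tprs), np.median(fprs))
    # MS2: medians restricted to thresholds where |TPR - FPR| > 0.25
    keep = np.abs(tprs - fprs) > 0.25
    ms2 = (np.median(tprs[keep]), np.median(fprs[keep])) if keep.any() else None
    return {"MS": ms, "MS2": ms2}
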

class PACC(ThresholdOptimization):
    """
    Probabilistic Adjusted Classify and Count (PACC).
    This method extends the Adjusted Classify and Count (ACC) approach
    by leveraging the average class-conditional confidences obtained
    from a probabilistic classifier instead of relying solely on true
    positive and false positive rates.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.
    threshold : float, optional
        The decision threshold for classification. Default is 0.5.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        Decision threshold for classification. Default is 0.5.
    tpr : float
        True positive rate computed during the fitting process.
    fpr : float
        False positive rate computed during the fitting process.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031

    Examples
    --------
    >>> from mlquantify.methods.aggregative import PACC
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> pacc = PACC(learner=SVC(probability=True))
    >>> pacc.fit(X_train, y_train)
    >>> y_pred = pacc.predict(X_test)
    >>> y_pred
    {0: 0.4664886119311328, 1: 0.5335113880688672}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None, threshold: float = 0.5):
        super().__init__(learner)
        self.threshold = threshold

    def _predict_method(self, X):
        """
        Predicts the class prevalence using the mean class-conditional
        probabilities from a probabilistic classifier.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            The input data for prediction.

        Returns
        -------
        dict
            A dictionary with class labels as keys and their respective
            prevalence estimates as values.

        Notes
        -----
        The prevalence is adjusted using the formula:
        prevalence = |mean_score - FPR| / (TPR - FPR),
        where mean_score is the average probability for the positive class.

        Raises
        ------
        ZeroDivisionError
            If `TPR - FPR` equals zero, indicating that the classifier's
            performance does not vary across the threshold range.
        """
        prevalences = {}

        # Calculate probabilities for the positive class
        probabilities = self.predict_learner(X)[:, 1]

        # Compute the mean score for the positive class
        mean_scores = np.mean(probabilities)

        # Adjust prevalence based on TPR and FPR
        if self.tpr - self.fpr == 0:
            prevalence = mean_scores
        else:
            prevalence = np.clip(abs(mean_scores - self.fpr) / (self.tpr - self.fpr), 0, 1)

        # Map the computed prevalence to the class labels
        prevalences[self.classes[0]] = 1 - prevalence
        prevalences[self.classes[1]] = prevalence

        return prevalences

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Finds the true positive rate (TPR) and false positive rate (FPR)
        corresponding to the specified decision threshold.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing the specified threshold, TPR, and FPR.

        Raises
        ------
        IndexError
            If the specified threshold is not found in the `thresholds` array.
        """
        # Locate TPR and FPR for the specified threshold
        tpr = tprs[thresholds == self.threshold][0]
        fpr = fprs[thresholds == self.threshold][0]
        return (self.threshold, tpr, fpr)

    # Duplicate of best_tprfpr above, retained from the source; being defined later in the
    # class body, this undocumented version is the one that takes effect.
    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        tpr = tprs[thresholds == self.threshold][0]
        fpr = fprs[thresholds == self.threshold][0]
        return (self.threshold, tpr, fpr)
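
# Sketch of the difference between ACC and PACC at prediction time: ACC plugs the thresholded
# count into the adjustment, PACC plugs in the mean posterior of the positive class. The
# tpr/fpr values are assumed to come from best_tprfpr; the helper is illustrative only.
def _sketch_acc_vs_pacc(probabilities: np.ndarray, threshold: float, tpr: float, fpr: float) -> tuple:
    cc_output = np.mean(probabilities >= threshold)   # ACC numerator: count above threshold
    mean_score = np.mean(probabilities)               # PACC numerator: mean posterior
    denom = tpr - fpr
    if denom == 0:
        return cc_output, mean_score
    acc_prev = np.clip((cc_output - fpr) / denom, 0, 1)
    pacc_prev = np.clip(abs(mean_score - fpr) / denom, 0, 1)
    return acc_prev, pacc_prev
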

class T50(ThresholdOptimization):
    """
    Threshold 50 (T50). This method adjusts the decision threshold
    to the point where the true positive rate (TPR) is approximately
    equal to 0.5. This approach is particularly useful for balancing
    sensitivity and specificity in binary classification tasks.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        Decision threshold determined during training.
    tpr : float
        True positive rate corresponding to the selected threshold.
    fpr : float
        False positive rate corresponding to the selected threshold.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import T50
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> t50 = T50(learner=SVC(probability=True))
    >>> t50.fit(X_train, y_train)
    >>> y_pred = t50.predict(X_test)
    >>> y_pred
    {0: 0.49563196626070505, 1: 0.504368033739295}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Determines the threshold, true positive rate (TPR), and false positive
        rate (FPR) where TPR is closest to 0.5.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing the selected threshold, TPR, and FPR.

        Notes
        -----
        - The method identifies the index where the absolute difference
          between TPR and 0.5 is minimized.
        - This ensures that the selected threshold represents a balance
          point in the ROC space.

        Raises
        ------
        ValueError
            If the arrays `thresholds`, `tprs`, or `fprs` are empty or
            misaligned in length.
        """
        # Find the index where TPR is closest to 0.5
        min_index = np.argmin(np.abs(tprs - 0.5))

        # Retrieve the corresponding threshold, TPR, and FPR
        threshold = thresholds[min_index]
        tpr = tprs[min_index]
        fpr = fprs[min_index]

        return (threshold, tpr, fpr)

class X_method(ThresholdOptimization):
    """
    Threshold X. This method identifies the decision threshold where the
    false positive rate (FPR) is approximately equal to 1 - true positive rate (TPR).
    This criterion is useful for identifying thresholds that align with a balance
    point on the ROC curve.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        Decision threshold determined during training.
    tpr : float
        True positive rate corresponding to the selected threshold.
    fpr : float
        False positive rate corresponding to the selected threshold.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import X_method
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> x_method = X_method(learner=SVC(probability=True))
    >>> x_method.fit(X_train, y_train)
    >>> y_pred = x_method.predict(X_test)
    >>> y_pred
    {0: 0.40523495782808205, 1: 0.594765042171918}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Determines the threshold, true positive rate (TPR), and false positive
        rate (FPR) where FPR is closest to 1 - TPR.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing the selected threshold, TPR, and FPR.

        Notes
        -----
        - The method identifies the index where the absolute difference
          between FPR and 1 - TPR is minimized.
        - This ensures that the selected threshold corresponds to a balance
          point based on the given criterion.

        Raises
        ------
        ValueError
            If the arrays `thresholds`, `tprs`, or `fprs` are empty or
            misaligned in length.
        """
        # Find the index where FPR is closest to 1 - TPR
        min_index = np.argmin(np.abs(1 - (tprs + fprs)))

        # Retrieve the corresponding threshold, TPR, and FPR
        threshold = thresholds[min_index]
        tpr = tprs[min_index]
        fpr = fprs[min_index]

        return (threshold, tpr, fpr)
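
All of the quantifiers added in this module share the fit/predict API shown in their docstrings, so they can be compared side by side on the same split. A minimal sketch of such a comparison (imports follow the docstring examples above; LogisticRegression is just one choice of probabilistic scikit-learn learner):

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from mlquantify.methods.aggregative import ACC, MAX, MS, MS2, PACC, T50, X_method
from mlquantify.utils.general import get_real_prev

features, target = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

print("real prevalence:", get_real_prev(y_test))
for method in (ACC, MAX, MS, MS2, PACC, T50, X_method):
    quantifier = method(learner=LogisticRegression(max_iter=5000))
    quantifier.fit(X_train, y_train)
    print(method.__name__, quantifier.predict(X_test))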