mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +10 -29
- mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify/adjust_counting/_base.py +245 -0
- mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +329 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +147 -0
- mlquantify/likelihood/_classes.py +430 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +785 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +147 -0
- mlquantify/mixture/_classes.py +458 -0
- mlquantify/mixture/_utils.py +163 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +168 -0
- mlquantify/neighbors/_classes.py +150 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
- mlquantify/neighbors/_kde.py +268 -0
- mlquantify/neighbors/_utils.py +131 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,430 @@
+import numpy as np
+from mlquantify.base_aggregative import SoftLearnerQMixin
+from mlquantify.likelihood._base import BaseIterativeLikelihood
+from mlquantify.metrics._slq import MAE
+from mlquantify.multiclass import define_binary
+from mlquantify.utils._constraints import (
+    Interval,
+    CallableConstraint,
+    Options
+)
+
+
+class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
+    r"""Expectation-Maximization Quantifier (EMQ).
+
+    Estimates class prevalences under prior probability shift by alternating
+    between expectation **(E)** and maximization **(M)** steps on posterior probabilities.
+
+    E-step:
+    .. math::
+        p_i^{(s+1)}(x) = \frac{q_i^{(s)} p_i(x)}{\sum_j q_j^{(s)} p_j(x)}
+
+    M-step:
+    .. math::
+        q_i^{(s+1)} = \frac{1}{N} \sum_{n=1}^N p_i^{(s+1)}(x_n)
+
+    where
+    - :math:`p_i(x)` are posterior probabilities predicted by the classifier
+    - :math:`q_i^{(s)}` are class prevalence estimates at iteration :math:`s`
+    - :math:`N` is the number of test instances.
+
+    Calibrations supported on posterior probabilities before the **EM** iteration:
+
+    Temperature Scaling (TS):
+    .. math::
+        \hat{p} = \text{softmax}\left(\frac{\log(p)}{T}\right)
+
+    Bias-Corrected Temperature Scaling (BCTS):
+    .. math::
+        \hat{p} = \text{softmax}\left(\frac{\log(p)}{T} + b\right)
+
+    Vector Scaling (VS):
+    .. math::
+        \hat{p}_i = \text{softmax}(W_i \cdot \log(p_i) + b_i)
+
+    No-Bias Vector Scaling (NBVS):
+    .. math::
+        \hat{p}_i = \text{softmax}(W_i \cdot \log(p_i))
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Probabilistic classifier supporting predict_proba.
+    tol : float, default=1e-4
+        Convergence threshold.
+    max_iter : int, default=100
+        Maximum number of EM iterations.
+    calib_function : str or callable, optional
+        Calibration method:
+        - 'ts': Temperature Scaling
+        - 'bcts': Bias-Corrected Temperature Scaling
+        - 'vs': Vector Scaling
+        - 'nbvs': No-Bias Vector Scaling
+        - callable: custom calibration function
+    criteria : callable, default=MAE
+        Convergence metric.
+
+    References
+    ----------
+    .. [1] Saerens, M., Latinne, P., & Decaestecker, C. (2002).
+           Adjusting the Outputs of a Classifier to New a Priori Probabilities.
+           Neural Computation, 14(1), 21-41.
+    .. [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
+    """
+
+    _parameter_constraints = {
+        "tol": [Interval(0, None, inclusive_left=False)],
+        "max_iter": [Interval(1, None, inclusive_left=True)],
+        "calib_function": [
+            Options(["bcts", "ts", "vs", "nbvs", None]),
+        ],
+        "criteria": [CallableConstraint()],
+    }
+
+    def __init__(self,
+                 learner=None,
+                 tol=1e-4,
+                 max_iter=100,
+                 calib_function=None,
+                 criteria=MAE):
+        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
+        self.calib_function = calib_function
+        self.criteria = criteria
+
+    def _iterate(self, predictions, priors):
+        r"""Perform the EM quantification iteration.
+
+        Steps:
+        - Calibrate posterior predictions if a calibration function is specified.
+        - Apply the EM procedure to re-estimate prevalences from the training priors and calibrated posteriors.
+
+        Parameters
+        ----------
+        predictions : ndarray of shape (n_samples, n_classes)
+            Posterior probabilities for each class on test data.
+        priors : ndarray of shape (n_classes,)
+            Training set class prevalences, serving as initial priors.
+
+        Returns
+        -------
+        prevalences : ndarray of shape (n_classes,)
+            Estimated class prevalences after EM iteration.
+        """
+        calibrated_predictions = self._apply_calibration(predictions)
+        prevalences, _ = self.EM(
+            posteriors=calibrated_predictions,
+            priors=priors,
+            tolerance=self.tol,
+            max_iter=self.max_iter,
+            criteria=self.criteria
+        )
+        return prevalences
+
+    @classmethod
+    def EM(cls, posteriors, priors, tolerance=1e-6, max_iter=100, criteria=MAE):
+        r"""Class method implementing the EM algorithm for quantification.
+
+        Parameters
+        ----------
+        posteriors : ndarray of shape (n_samples, n_classes)
+            Posterior probability predictions.
+        priors : ndarray of shape (n_classes,)
+            Training class prior probabilities.
+        tolerance : float
+            Convergence threshold based on the difference between iterations.
+        max_iter : int
+            Maximum number of EM iterations.
+        criteria : callable
+            Metric to assess convergence, e.g., MAE.
+
+        Returns
+        -------
+        qs : ndarray of shape (n_classes,)
+            Estimated test set class prevalences.
+        ps : ndarray of shape (n_samples, n_classes)
+            Updated soft membership probabilities per instance.
+        """
+        Px = np.array(posteriors, dtype=np.float64)
+        Ptr = np.array(priors, dtype=np.float64)
+
+        if np.prod(Ptr) == 0:
+            Ptr += tolerance
+            Ptr /= Ptr.sum()
+
+        qs = np.copy(Ptr)
+        s, converged = 0, False
+        qs_prev_ = None
+
+        while not converged and s < max_iter:
+            # E-step:
+            ps_unnormalized = (qs / Ptr) * Px
+            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)
+
+            # M-step:
+            qs = ps.mean(axis=0)
+
+            if qs_prev_ is not None and criteria(qs_prev_, qs) < tolerance and s > 10:
+                converged = True
+
+            qs_prev_ = qs
+            s += 1
+
+        if not converged:
+            print('[warning] the method has reached the maximum number of iterations; it might not have converged')
+
+        return qs, ps
+
+    def _apply_calibration(self, predictions):
+        r"""Calibrate posterior predictions with the specified calibration method.
+
+        Parameters
+        ----------
+        predictions : ndarray
+            Posterior predictions to calibrate.
+
+        Returns
+        -------
+        calibrated_predictions : ndarray
+            Calibrated posterior predictions.
+
+        Raises
+        ------
+        ValueError
+            If calib_function is unrecognized.
+        """
+        if self.calib_function is None:
+            return predictions
+
+        if isinstance(self.calib_function, str):
+            method = self.calib_function.lower()
+            if method == "ts":
+                return self._temperature_scaling(predictions)
+            elif method == "bcts":
+                return self._bias_corrected_temperature_scaling(predictions)
+            elif method == "vs":
+                return self._vector_scaling(predictions)
+            elif method == "nbvs":
+                return self._no_bias_vector_scaling(predictions)
+
+        elif callable(self.calib_function):
+            return self.calib_function(predictions)
+
+        raise ValueError(
+            f"Invalid calib_function '{self.calib_function}'. Expected one of {{'bcts', 'ts', 'vs', 'nbvs', None, callable}}."
+        )
+
+    def _temperature_scaling(self, preds):
+        """Temperature Scaling calibration applied to logits."""
+        T = 1.0
+        preds = np.clip(preds, 1e-12, 1.0)
+        logits = np.log(preds)
+        scaled = logits / T
+        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
+        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
+
+    def _bias_corrected_temperature_scaling(self, preds):
+        """Bias-Corrected Temperature Scaling calibration."""
+        T = 1.0
+        bias = np.zeros(preds.shape[1])
+        preds = np.clip(preds, 1e-12, 1.0)
+        logits = np.log(preds)
+        logits = logits / T + bias
+        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
+        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
+
+    def _vector_scaling(self, preds):
+        """Vector Scaling calibration."""
+        W = np.ones(preds.shape[1])
+        b = np.zeros(preds.shape[1])
+        preds = np.clip(preds, 1e-12, 1.0)
+        logits = np.log(preds)
+        scaled = logits * W + b
+        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
+        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
+
+    def _no_bias_vector_scaling(self, preds):
+        """No-Bias Vector Scaling calibration."""
+        W = np.ones(preds.shape[1])
+        preds = np.clip(preds, 1e-12, 1.0)
+        logits = np.log(preds)
+        scaled = logits * W
+        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
+        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
+
+
+class MLPE(SoftLearnerQMixin, BaseIterativeLikelihood):
+    r"""Maximum Likelihood Prevalence Estimation (MLPE).
+
+    Returns the training priors as prevalence estimates without adaptation.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Base classifier.
+
+    References
+    ----------
+    .. [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
+    """
+
+    def __init__(self, learner=None):
+        super().__init__(learner=learner, max_iter=1)
+
+    def _iterate(self, predictions, priors):
+        """Return the training priors without adjustment.
+
+        Parameters
+        ----------
+        predictions : array-like
+            Ignored in this implementation.
+        priors : array-like
+            Training priors, returned as is.
+
+        Returns
+        -------
+        prevalences : array-like
+            Equal to the training priors.
+        """
+        return priors
+
+
+@define_binary
+class CDE(SoftLearnerQMixin, BaseIterativeLikelihood):
+    r"""CDE-Iterate for binary classification prevalence estimation.
+
+    Threshold :math:`\tau` from false positive and false negative costs:
+    .. math::
+        \tau = \frac{c_{FP}}{c_{FP} + c_{FN}}
+
+    Hard classification by thresholding the posterior probability :math:`p(+|x)` at :math:`\tau`:
+    .. math::
+        \hat{y}(x) = \mathbf{1}_{p(+|x) > \tau}
+
+    Prevalence estimation via classify-and-count:
+    .. math::
+        \hat{p}_U(+) = \frac{1}{N} \sum_{n=1}^N \hat{y}(x_n)
+
+    False positive cost update:
+    .. math::
+        c_{FP}^{new} = \frac{p_L(+)}{p_L(-)} \times \frac{\hat{p}_U(-)}{\hat{p}_U(+)} \times c_{FN}
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Wrapped classifier (unused).
+    tol : float, default=1e-4
+        Convergence tolerance.
+    max_iter : int, default=100
+        Maximum number of iterations.
+    init_cfp : float, default=1.0
+        Initial false positive cost.
+
+    References
+    ----------
+    .. [1] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
+    """
+
+    _parameter_constraints = {
+        "tol": [Interval(0, None, inclusive_left=False)],
+        "max_iter": [Interval(1, None, inclusive_left=True)],
+        "init_cfp": [Interval(0, None, inclusive_left=False)]
+    }
+
+    def __init__(self, learner=None, tol=1e-4, max_iter=100, init_cfp=1.0):
+        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
+        self.init_cfp = float(init_cfp)
+
+    def _iterate(self, predictions, priors):
+        r"""Iteratively estimate prevalences via cost-sensitive thresholding.
+
+        Parameters
+        ----------
+        predictions : ndarray, shape (n_samples, 2)
+            Posterior probabilities for binary classes [neg, pos].
+        priors : ndarray, shape (2,)
+            Training priors [p(neg), p(pos)].
+
+        Returns
+        -------
+        prevalences : ndarray, shape (2,)
+            Estimated prevalences for classes [neg, pos].
+        """
+        P = np.asarray(predictions, dtype=np.float64)
+        Ptr = np.asarray(priors, dtype=np.float64)
+
+        # basic checks
+        if P.ndim != 2 or P.shape[1] != 2:
+            raise ValueError("CDE implementation here supports binary case only: predictions shape (n,2).")
+
+        # ensure no zeros
+        eps = 1e-12
+        P = np.clip(P, eps, 1.0)
+
+        # training priors pL(+), pL(-)
+        # assume the order of Ptr matches the columns of P; if the order is unknown, the caller must align them
+        pL_pos = Ptr[1]
+        pL_neg = Ptr[0]
+        if pL_pos <= 0 or pL_neg <= 0:
+            # keep them positive to avoid divisions by zero
+            pL_pos = max(pL_pos, eps)
+            pL_neg = max(pL_neg, eps)
+
+        # initialize costs
+        cFN = 1.0
+        cFP = float(self.init_cfp)
+
+        prev_prev_pos = None
+        s = 0
+
+        # iterate: compute threshold from costs, classify, estimate prevalences via CC,
+        # update cFP via eq. (4.27), repeat
+        while s < self.max_iter:
+            # decision threshold tau for the positive class:
+            # Derivation:
+            #   predict positive if cost_FP * p(-|x) < cost_FN * p(+|x)
+            #   => predict positive if p(+|x) / p(-|x) > cost_FP / cost_FN
+            #   since p(+|x) / p(-|x) = p(+|x) / (1 - p(+|x)):
+            #   p(+|x) > cost_FP / (cost_FP + cost_FN)
+            tau = cFP / (cFP + cFN)

+            # hard predictions for the positive class using the threshold on the positive posterior (col 1)
+            pos_probs = P[:, 1]
+            hard_pos = (pos_probs > tau).astype(float)

+            # classify-and-count prevalence estimate on U
+            prev_pos = hard_pos.mean()
+            prev_neg = 1.0 - prev_pos

+            # update cFP according to Eq. 4.27:
+            #   cFP_new = (pL_pos / pL_neg) * (pU_hat(neg) / pU_hat(pos)) * cFN
+            # guard against zero prev_pos / prev_neg
+            prev_pos_safe = max(prev_pos, eps)
+            prev_neg_safe = max(prev_neg, eps)

+            cFP_new = (pL_pos / pL_neg) * (prev_neg_safe / prev_pos_safe) * cFN

+            # check convergence on prevalences (absolute change)
+            if prev_prev_pos is not None and abs(prev_pos - prev_prev_pos) < self.tol:
+                break

+            # prepare next iteration
+            cFP = cFP_new
+            prev_prev_pos = prev_pos
+            s += 1

+        # if it did not converge within max_iter, keep the last estimate
+        # (the book warns about the lack of Fisher consistency)
+        if s >= self.max_iter:
+            # optional: warning
+            # print('[warning] CDE-Iterate reached max_iter without converging')
+            pass

+        prevalences = np.array([prev_neg, prev_pos], dtype=np.float64)
+        # ensure the result sums to 1 (numerical safety)
+        prevalences = prevalences / prevalences.sum()

+        return prevalences
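To see the E- and M-steps from the EMQ docstring in isolation, here is a minimal sketch that calls the EMQ.EM class method directly on synthetic posteriors. It relies only on what the hunk above shows (the EM(posteriors, priors, ...) signature and the private module path mlquantify/likelihood/_classes.py); the synthetic data, the seed, and the expected outcome are illustrative assumptions, not part of the package.

import numpy as np
from mlquantify.likelihood._classes import EMQ  # private module added in 0.1.10 (hunk above)

rng = np.random.default_rng(0)

# Synthetic binary posteriors for 1000 test instances whose true positive
# prevalence (0.8) differs from the training prior (0.5).
y = rng.random(1000) < 0.8
p_pos = np.clip(rng.normal(loc=np.where(y, 0.7, 0.3), scale=0.15), 1e-6, 1 - 1e-6)
posteriors = np.column_stack([1.0 - p_pos, p_pos])

priors = np.array([0.5, 0.5])        # training prevalences used as the initial estimate
qs, ps = EMQ.EM(posteriors, priors)  # alternates the E-step and M-step shown above

print(qs)  # re-estimated [p(neg), p(pos)]; with reasonably calibrated posteriors
           # this moves from the 0.5/0.5 prior toward the true test prevalence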
@@ -0,0 +1 @@
+from ._classes import EnsembleQ, QuaDapt, AggregativeBootstrap
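For broader context, here is a hypothetical end-to-end sketch of how the new EMQ class might be used with a scikit-learn base learner. The constructor arguments follow the signature shown in the first hunk; the fit(X, y) / predict(X) calls follow the quantifier convention of earlier mlquantify releases and are assumptions about the 0.1.10 public API, not something this diff confirms.

import numpy as np
from sklearn.linear_model import LogisticRegression
from mlquantify.likelihood._classes import EMQ  # private path shown in this diff


def sample(n_pos, n_neg, rng):
    # two Gaussian class-conditional distributions; only the class balance changes
    X_pos = rng.normal(loc=1.0, size=(n_pos, 5))
    X_neg = rng.normal(loc=-1.0, size=(n_neg, 5))
    X = np.vstack([X_pos, X_neg])
    y = np.array([1] * n_pos + [0] * n_neg)
    return X, y


rng = np.random.default_rng(42)
X_train, y_train = sample(250, 250, rng)   # balanced training set
X_test, _ = sample(240, 60, rng)           # test sample with ~80% positives (prior shift)

emq = EMQ(learner=LogisticRegression(max_iter=1000), tol=1e-4)
emq.fit(X_train, y_train)          # assumed: trains the wrapped learner on labelled data
prevalences = emq.predict(X_test)  # assumed: returns the estimated class prevalence vector
print(prevalences)                 # ideally close to [0.2, 0.8]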