mlquantify 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
- mlquantify/__init__.py +11 -1
- mlquantify/adjust_counting/__init__.py +11 -1
- mlquantify/adjust_counting/_adjustment.py +370 -87
- mlquantify/adjust_counting/_base.py +1 -3
- mlquantify/adjust_counting/_counting.py +27 -19
- mlquantify/adjust_counting/_utils.py +23 -28
- mlquantify/confidence.py +16 -22
- mlquantify/likelihood/_base.py +38 -52
- mlquantify/likelihood/_classes.py +88 -72
- mlquantify/meta/_classes.py +86 -62
- mlquantify/metrics/_oq.py +2 -2
- mlquantify/metrics/_rq.py +2 -2
- mlquantify/metrics/_slq.py +9 -9
- mlquantify/mixture/_base.py +13 -19
- mlquantify/mixture/_classes.py +68 -10
- mlquantify/mixture/_utils.py +62 -11
- mlquantify/model_selection/_protocol.py +6 -6
- mlquantify/model_selection/_search.py +1 -1
- mlquantify/neighbors/_base.py +35 -65
- mlquantify/neighbors/_classes.py +1 -10
- mlquantify/neighbors/_classification.py +5 -12
- mlquantify/neighbors/_kde.py +7 -9
- mlquantify/neighbors/_utils.py +17 -21
- mlquantify/utils/_validation.py +3 -3
- mlquantify/utils/prevalence.py +4 -1
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/METADATA +10 -18
- mlquantify-0.1.11.dist-info/RECORD +53 -0
- mlquantify-0.1.9.dist-info/RECORD +0 -53
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/top_level.txt +0 -0
mlquantify/__init__.py
CHANGED
@@ -1,3 +1,13 @@
 "mlquantify, a Python package for quantification"
 
-
+from . import neighbors
+from . import likelihood
+from . import mixture
+from . import meta
+from . import adjust_counting
+from . import model_selection
+from . import base_aggregative
+from . import base
+from . import calibration
+from . import confidence
+from . import multiclass
mlquantify/adjust_counting/__init__.py
CHANGED

@@ -1,4 +1,7 @@
-from ._counting import CC, PCC
+from ._counting import (
+    CC,
+    PCC
+)
 from ._adjustment import (
     ThresholdAdjustment,
     MatrixAdjustment,

@@ -11,4 +14,11 @@ from ._adjustment import (
     T50,
     MS,
     MS2,
+)
+
+from ._utils import (
+    compute_table,
+    compute_fpr,
+    compute_tpr,
+    evaluate_thresholds,
 )
mlquantify/adjust_counting/_adjustment.py
CHANGED

@@ -17,38 +17,26 @@ from mlquantify.utils._constraints import Interval, Options
 
 @define_binary
 class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
-    r"""
-    Applies threshold-based adjustment methods for quantification.
+    r"""Base class for threshold-based adjustment methods for quantification.
 
     This is the base class for methods such as ACC, X, MAX, T50, MS, and MS2,
-    which adjust prevalence estimates based on the classifier
+    which adjust prevalence estimates based on the classifier's ROC curve,
+    as proposed by [1]_.
 
-    These methods correct the bias in *Classify & Count (CC)* estimates caused
-    in class distributions between the training and test datasets.
-
-    Given:
-    - \( p' \): observed positive proportion from CC,
-    - \( \text{TPR} = P(\hat{y}=1|y=1) \),
-    - \( \text{FPR} = P(\hat{y}=1|y=0) \),
-
-    the adjusted prevalence is given by:
-
-    \[
-    \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
-    \]
-
-    *Quantifying Counts and Costs via Classification*, DMKD 2008).
+    These methods correct the bias in *Classify & Count (CC)* estimates caused
+    by differences in class distributions between the training and test datasets.
+
+    The adjusted prevalence is calculated using the following formula:
+
+    .. math::
+
+        \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
+
+    where:
+    - :math:`p'` is the observed positive proportion from CC,
+    - :math:`\text{TPR} = P(\hat{y}=1|y=1)` is the True Positive Rate,
+    - :math:`\text{FPR} = P(\hat{y}=1|y=0)` is the False Positive Rate.
 
     Parameters
     ----------
@@ -59,7 +47,6 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     strategy : {'ovr'}, default='ovr'
         Strategy used for multiclass adaptation.
 
-
     Attributes
     ----------
     learner : estimator
@@ -67,6 +54,12 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     classes : ndarray of shape (n_classes,)
         Unique class labels observed during training.
 
+    Notes
+    -----
+    - Defined only for binary quantification tasks.
+    - When applied to multiclass problems, the one-vs-rest strategy (`ovr`)
+      is used automatically.
+
 
     Examples
     --------
@@ -74,7 +67,7 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     >>> from mlquantify.adjust_counting import ThresholdAdjustment
     >>> import numpy as np
     >>> class CustomThreshold(ThresholdAdjustment):
-    ...     def
+    ...     def get_best_threshold(self, thresholds, tprs, fprs):
    ...         idx = np.argmax(tprs - fprs)
    ...         return thresholds[idx], tprs[idx], fprs[idx]
    >>> X = np.random.randn(100, 4)
@@ -83,6 +76,13 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
     >>> q.fit(X, y)
     >>> q.predict(X)
     {0: 0.49, 1: 0.51}
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
+       Classification", *Proceedings of ECML*, pp. 564-575.
+    .. [2] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
+       *Data Mining and Knowledge Discovery*, 17(2), 164-206.
     """
 
     _parameter_constraints = {
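The correction these classes share is easy to sanity-check in isolation. A minimal, self-contained sketch of the formula (the function name is illustrative, not part of mlquantify):

import numpy as np

def adjusted_count(p_observed, tpr, fpr):
    # Correct a Classify & Count estimate p' given TPR/FPR, clipping to [0, 1].
    if np.isclose(tpr, fpr):
        return p_observed  # degenerate denominator: no usable signal at this threshold
    return float(np.clip((p_observed - fpr) / (tpr - fpr), 0.0, 1.0))

# CC reports 60% positives; with TPR=0.8 and FPR=0.2 the corrected prevalence is ~0.667.
print(adjusted_count(0.60, tpr=0.8, fpr=0.2))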
@@ -101,8 +101,8 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
         """Internal adjustment computation based on selected ROC threshold."""
         positive_scores = train_y_scores[:, 1]
 
-        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores
-        threshold, tpr, fpr = self.
+        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
+        threshold, tpr, fpr = self.get_best_threshold(thresholds, tprs, fprs)
 
         cc_predictions = CC(threshold).aggregate(predictions)[1]
 
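The `_adjust` body above leans on `evaluate_thresholds`, newly re-exported from `._utils`. Its implementation is not part of this diff; a plausible sketch of what such a helper computes (TPR and FPR over a grid of thresholds), for readers following along:

import numpy as np

def evaluate_thresholds_sketch(y_true, positive_scores, n_thresholds=101):
    # Hypothetical stand-in: the package's evaluate_thresholds may differ in
    # grid choice and tie handling. Assumes both classes are present in y_true.
    thresholds = np.linspace(0.0, 1.0, n_thresholds)
    y_true = np.asarray(y_true)
    pos, neg = y_true == 1, y_true == 0
    tprs = np.array([(positive_scores[pos] >= t).mean() for t in thresholds])
    fprs = np.array([(positive_scores[neg] >= t).mean() for t in thresholds])
    return thresholds, tprs, fprs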
@@ -114,42 +114,40 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
         return np.asarray([1 - prevalence, prevalence])
 
     @abstractmethod
-    def
+    def get_best_threshold(self, thresholds, tprs, fprs):
         """Select the best threshold according to the specific method."""
         ...
 
 
 class MatrixAdjustment(BaseAdjustCount):
-    r"""
-    Base class for matrix-based quantification adjustments (FM, GAC, GPAC).
+    r"""Base class for matrix-based quantification adjustments.
 
     This class implements the matrix correction model for quantification
-    as formulated in Firat (2016), which expresses the observed prevalences
-    a linear combination of true prevalences through the confusion matrix.
+    as formulated in Firat (2016) [1]_, which expresses the observed prevalences
+    as a linear combination of true prevalences through the confusion matrix.
 
-        \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
-    \]
+    The system is modeled as:
+
+    .. math::
+
+        \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon
+
+    subject to the constraints:
+
+    .. math::
+
+        \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
 
     where:
-
-    -
-    - Constrained optimization (quadratic or least-squares).
+    - :math:`\mathbf{y}` is the vector of predicted prevalences in the test set,
+    - :math:`\mathbf{C}` is the confusion matrix,
+    - :math:`\hat{\pi}_F` is the true class prevalence vector (unknown),
+    - :math:`\varepsilon` is the residual error.
+
+    The model can be solved via:
+
+    - **Linear algebraic solution**: uses matrix inversion
+    - **Constrained optimization**: quadratic or least-squares approach
 
 
     Parameters
@@ -158,10 +156,10 @@ class MatrixAdjustment(BaseAdjustCount):
         Classifier with `fit` and `predict` methods.
     solver : {'optim', 'linear'}, optional
         Solver for the adjustment system:
+
         - `'linear'`: uses matrix inversion (e.g., GAC, GPAC)
         - `'optim'`: uses optimization (e.g., FM)
 
-
     Attributes
     ----------
     CM : ndarray of shape (n_classes, n_classes)
@@ -170,15 +168,11 @@ class MatrixAdjustment(BaseAdjustCount):
         Class labels observed in training.
 
 
-    References
-    ----------
-    - Firat, A. (2016). *Unified Framework for Quantification.* AAAI, pp. 1-8.
-
-
     Examples
     --------
     >>> from sklearn.linear_model import LogisticRegression
     >>> from mlquantify.adjust_counting import MatrixAdjustment
+    >>> import numpy as np
     >>> class MyMatrix(MatrixAdjustment):
     ...     def _compute_confusion_matrix(self, preds, y):
     ...         cm = np.ones((2, 2))
@@ -189,8 +183,15 @@ class MatrixAdjustment(BaseAdjustCount):
     >>> q.fit(X, y)
     >>> q.predict(X)
     {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
+       *Proceedings of AAAI Conference on Artificial Intelligence*,
+       pp. 1-8.
     """
 
+
     _parameter_constraints = {"solver": Options(["optim", "linear"])}
 
     def __init__(self, learner=None, solver=None):
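The doctest's numbers are easy to reproduce by hand. A toy instance of the linear correction above, assuming the column convention C[i, k] = P(predicted i | true k), which is not stated in this hunk:

import numpy as np

C = np.array([[0.8, 0.3],
              [0.2, 0.7]])            # toy confusion matrix
p_observed = np.array([0.55, 0.45])   # prevalences the classifier reports

pi = np.linalg.solve(C, p_observed)   # linear solution: pi = C^{-1} p
pi = np.clip(pi, 0.0, None)
pi = pi / pi.sum()                    # project back onto the simplex
print(pi)                             # [0.5 0.5]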
@@ -215,11 +216,7 @@ class MatrixAdjustment(BaseAdjustCount):
 
     def _solve_linear(self, prevs_estim):
         r"""
-        Solve the system
-
-        \[
-        \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}
-        \]
+        Solve the system using matrix inversion.
         """
         try:
             adjusted = np.linalg.solve(self.CM, prevs_estim)
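Only the `try:` side of `_solve_linear` is visible in this hunk; the recovery path is not shown. A sketch of one reasonable guard (the pseudo-inverse fallback is an assumption, not the package's confirmed behavior):

import numpy as np

def solve_linear_sketch(CM, prevs_estim):
    try:
        adjusted = np.linalg.solve(CM, prevs_estim)
    except np.linalg.LinAlgError:
        # Assumed fallback for a singular confusion matrix: least-squares via pinv.
        adjusted = np.linalg.pinv(CM) @ prevs_estim
    adjusted = np.clip(adjusted, 0.0, None)
    return adjusted / adjusted.sum()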
@@ -230,13 +227,26 @@ class MatrixAdjustment(BaseAdjustCount):
         return adjusted
 
     def _solve_optimization(self, prevs_estim, priors):
-        r"""
-
+        r"""Solve the system linearly.
+
+        The solution is obtained by matrix inversion:
 
-
-
-
-
+        .. math::
+
+            \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}
+
+        where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}`
+        is the observed prevalence vector.
+
+        Parameters
+        ----------
+        p : ndarray of shape (n_classes,)
+            Observed prevalence vector from test set.
+
+        Returns
+        -------
+        ndarray of shape (n_classes,)
+            Adjusted prevalence estimates :math:`\hat{\pi}_F`.
         """
         def objective(prevs_pred):
             return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)
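The `objective` above gives the residual norm; the constraint handling falls outside this hunk. A self-contained version of the documented minimization, with the simplex constraints wired up via scipy (the constraint setup is assumed from the docstring, not from visible code):

import numpy as np
from scipy.optimize import minimize

def solve_optimization_sketch(CM, prevs_estim):
    n = CM.shape[1]
    result = minimize(
        lambda pi: np.linalg.norm(CM @ pi - prevs_estim),  # || C pi - p ||
        x0=np.full(n, 1.0 / n),                            # start at uniform prevalences
        bounds=[(0.0, 1.0)] * n,                           # pi >= 0
        constraints=[{"type": "eq", "fun": lambda pi: pi.sum() - 1.0}],  # sum to 1
        method="SLSQP",
    )
    return result.x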
@@ -262,7 +272,63 @@ class MatrixAdjustment(BaseAdjustCount):
 
 
 class FM(SoftLearnerQMixin, MatrixAdjustment):
-    """
+    r"""Friedman Method for quantification adjustment.
+
+    This class implements the Friedman (2015) matrix-based quantification
+    adjustment, which formulates the quantification problem as a constrained
+    optimization problem. It adjusts the estimated class prevalences by
+    minimizing the difference between predicted and expected prevalences,
+    subject to valid prevalence constraints.
+
+    The confusion matrix is computed by applying estimated posterior probabilities
+    over true labels, enabling accurate correction of prevalence estimates under
+    concept drift.
+
+    The confusion matrix is estimated for each class :math:`k` by applying
+    thresholding on posterior probabilities against prior prevalence,
+    as described in the FM algorithm. This enables the correction using
+    a quadratic optimization approach.
+
+    The method solves:
+
+    .. math::
+
+        \min_{\hat{\pi}_F} \| \mathbf{C} \hat{\pi}_F - \mathbf{p} \|^2
+
+    subject to constraints:
+
+    .. math::
+
+        \hat{\pi}_F \geq 0, \quad \sum_k \hat{\pi}_{F,k} = 1
+
+    where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}` is the
+    vector of predicted prevalences.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Base classifier with `fit` and `predict_proba` methods.
+        If None, a default estimator will be used.
+
+    Attributes
+    ----------
+    CM : ndarray of shape (n_classes, n_classes)
+        Confusion matrix used for correction.
+
+    Examples
+    --------
+    >>> from mlquantify.adjust_counting import FM
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> import numpy as np
+    >>> X = np.random.randn(50, 4)
+    >>> y = np.random.randint(0, 2, 50)
+    >>> fm = FM(learner=LogisticRegression())
+    >>> fm.fit(X, y)
+    >>> fm.predict(X)
+    {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Friedman, J. H., et al. (2015). "Detecting and Dealing with Concept Drift",
+       *Proceedings of the IEEE*, 103(11), 1522-1541.
+    """
     def __init__(self, learner=None):
         super().__init__(learner=learner, solver='optim')
 
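The docstring's "thresholding posteriors against prior prevalence" is compact; one plausible reading of that confusion-matrix construction, written out (an illustrative reconstruction: the actual `_compute_confusion_matrix` is not in this diff, and the row/column orientation is assumed):

import numpy as np

def fm_confusion_matrix_sketch(posteriors, y_true, priors):
    # C[i, k]: fraction of class-i training examples whose posterior for
    # class k exceeds class k's training prior.
    n = len(priors)
    C = np.zeros((n, n))
    for i in range(n):
        mask = y_true == i
        for k in range(n):
            C[i, k] = np.mean(posteriors[mask, k] > priors[k])
    return C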
@@ -274,7 +340,52 @@ class FM(SoftLearnerQMixin, MatrixAdjustment):
 
 
 class GAC(CrispLearnerQMixin, MatrixAdjustment):
-    """
+    r"""Generalized Adjusted Count method.
+
+    This class implements the Generalized Adjusted Count (GAC) algorithm for
+    quantification adjustment as described in Firat (2016) [1]_. The method
+    adjusts the estimated class prevalences by normalizing the confusion matrix
+    based on prevalence estimates, providing a correction for bias caused by
+    distribution differences between training and test data.
+
+    The confusion matrix is normalized by dividing each column by the prevalence
+    estimate of the corresponding class. For classes with zero estimated prevalence,
+    the diagonal element is set to 1 to avoid division by zero.
+
+    This normalization ensures that the matrix best reflects the classifier's
+    behavior relative to the estimated class distributions, improving quantification
+    accuracy.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Base classifier with `fit` and `predict` methods.
+
+    Attributes
+    ----------
+    CM : ndarray of shape (n_classes, n_classes)
+        Normalized confusion matrix used for adjusting predicted prevalences.
+    classes_ : ndarray
+        Array of class labels observed during training.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from mlquantify.adjust_counting import GAC
+    >>> import numpy as np
+    >>> gac = GAC(learner=LogisticRegression())
+    >>> X = np.random.randn(50, 4)
+    >>> y = np.random.randint(0, 2, 50)
+    >>> gac.fit(X, y)
+    >>> gac.predict(X)
+    {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
+       *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
+    """
     def __init__(self, learner=None):
         super().__init__(learner=learner, solver='linear')
 
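The zero-prevalence guard described above is the interesting detail; as a standalone sketch (names illustrative, not the package's code):

import numpy as np

def normalize_confusion_matrix(counts, prevalences):
    # Divide each column by its class's estimated prevalence; for a class with
    # zero estimated prevalence, pin the diagonal entry to 1 instead.
    CM = counts.astype(float).copy()
    for k, p in enumerate(prevalences):
        if p == 0:
            CM[:, k] = 0.0
            CM[k, k] = 1.0
        else:
            CM[:, k] /= p
    return CM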
@@ -289,7 +400,51 @@ class GAC(CrispLearnerQMixin, MatrixAdjustment):
 
 
 class GPAC(SoftLearnerQMixin, MatrixAdjustment):
-    """Probabilistic
+    r"""Probabilistic Generalized Adjusted Count (GPAC) method.
+
+    This class implements the probabilistic extension of the Generalized Adjusted
+    Count method as presented in Firat (2016) [1]_. The GPAC method normalizes the
+    confusion matrix by the estimated prevalences from posterior probabilities,
+    enabling a probabilistic correction of class prevalences.
+
+    The normalization divides each column of the confusion matrix by the estimated
+    prevalence of the corresponding class. If a class has zero estimated prevalence,
+    the diagonal element for that class is set to 1 to maintain matrix validity.
+
+    GPAC extends the GAC approach by using soft probabilistic predictions (posterior
+    probabilities) rather than crisp class labels, potentially improving
+    quantification accuracy when posterior probabilities are well calibrated.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Base classifier with `fit` and `predict_proba` methods.
+
+    Attributes
+    ----------
+    CM : ndarray of shape (n_classes, n_classes)
+        Normalized confusion matrix used for adjustment.
+    classes_ : ndarray
+        Array of class labels observed during training.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> from mlquantify.adjust_counting import GPAC
+    >>> import numpy as np
+    >>> gpac = GPAC(learner=LogisticRegression())
+    >>> X = np.random.randn(50, 4)
+    >>> y = np.random.randint(0, 2, 50)
+    >>> gpac.fit(X, y)
+    >>> gpac.predict(X)
+    {0: 0.5, 1: 0.5}
+
+    References
+    ----------
+    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
+       *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
+    """
     def __init__(self, learner=None):
         super().__init__(learner=learner, solver='linear')
 
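GPAC's only departure from GAC is the use of soft counts. A sketch of the probabilistic confusion matrix this implies, with column k as the mean posterior vector over class-k training examples, before the same column normalization (assumed construction; the code is not in this hunk):

import numpy as np

def soft_confusion_matrix(posteriors, y_true, classes):
    CM = np.zeros((len(classes), len(classes)))
    for k, c in enumerate(classes):
        # Average the full posterior vectors of examples whose true class is c.
        CM[:, k] = posteriors[y_true == c].mean(axis=0)
    return CM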
@@ -304,41 +459,145 @@ class GPAC(SoftLearnerQMixin, MatrixAdjustment):
 
 
 class ACC(ThresholdAdjustment):
-    """Adjusted Count (ACC) — baseline threshold correction.
-
+    r"""Adjusted Count (ACC) — baseline threshold correction.
+
+    This method corrects the bias in class prevalence estimates caused by imperfect
+    classification accuracy, by adjusting the observed positive count using estimates
+    of the classifier's true positive rate (TPR) and false positive rate (FPR).
+
+    It uses a fixed classification threshold and applies the formula:
+
+    .. math::
+
+        p = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
+
+    where :math:`p'` is the observed positive proportion from :class:`CC`.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+
+    def get_best_threshold(self, thresholds, tprs, fprs):
         tpr = tprs[thresholds == self.threshold][0]
         fpr = fprs[thresholds == self.threshold][0]
         return (self.threshold, tpr, fpr)
 
 
 class X_method(ThresholdAdjustment):
-    """X method — threshold where
-
+    r"""X method — threshold where :math:`\text{TPR} + \text{FPR} = 1`.
+
+    This method selects the classification threshold at which the sum of the true positive
+    rate (TPR) and false positive rate (FPR) equals one; at that point the false negative
+    rate :math:`1 - \text{TPR}` equals the FPR, balancing the two error rates and
+    improving quantification.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         idx = np.argmin(np.abs(1 - (tprs + fprs)))
         return thresholds[idx], tprs[idx], fprs[idx]
 
 
 class MAX(ThresholdAdjustment):
-    r"""MAX method — threshold maximizing
-
+    r"""MAX method — threshold maximizing :math:`\text{TPR} - \text{FPR}`.
+
+    This method selects the threshold that maximizes the difference between the true positive
+    rate (TPR) and the false positive rate (FPR), effectively optimizing classification
+    performance for quantification.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         idx = np.argmax(np.abs(tprs - fprs))
         return thresholds[idx], tprs[idx], fprs[idx]
 
 
 class T50(ThresholdAdjustment):
-    r"""T50 — selects threshold where
-
+    r"""T50 — selects threshold where :math:`\text{TPR} = 0.5`.
+
+    This method chooses the classification threshold such that the true positive rate (TPR)
+    equals 0.5, avoiding regions with unreliable estimates at extreme thresholds.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
+       *ECML*, pp. 564-575.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         idx = np.argmin(np.abs(tprs - 0.5))
         return thresholds[idx], tprs[idx], fprs[idx]
 
 
 class MS(ThresholdAdjustment):
-    r"""Median Sweep (MS) — median prevalence across all thresholds.
+    r"""Median Sweep (MS) — median prevalence estimate across all thresholds.
+
+    This method computes class prevalence estimates at multiple classification thresholds,
+    using the adjusted count formula for each, then returns the median of these estimates,
+    reducing variance caused by any single threshold selection.
+
+    It thus leverages the strengths of bootstrap-like variance reduction without heavy
+    computation.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    References
+    ----------
+    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
+       *Data Mining and Knowledge Discovery*, 17(2), 164-206.
+    """
     def _adjust(self, predictions, train_y_scores, train_y_values):
         positive_scores = train_y_scores[:, 1]
 
-        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores
-        thresholds, tprs, fprs = self.
+        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
+        thresholds, tprs, fprs = self.get_best_threshold(thresholds, tprs, fprs)
 
         prevs = []
         for thr, tpr, fpr in zip(thresholds, tprs, fprs):

@@ -349,13 +608,37 @@ class MS(ThresholdAdjustment):
         prevalence = np.median(prevs)
         return np.asarray([1 - prevalence, prevalence])
 
-    def
+    def get_best_threshold(self, thresholds, tprs, fprs):
         return thresholds, tprs, fprs
 
 
 class MS2(MS):
-    r"""MS2 — Median Sweep variant
-
+    r"""MS2 — Median Sweep variant constraining :math:`|\text{TPR} - \text{FPR}| > 0.25`.
+
+    This variant of Median Sweep excludes thresholds where the absolute difference
+    between true positive rate (TPR) and false positive rate (FPR) is below 0.25,
+    improving stability by avoiding ambiguous threshold regions.
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        A supervised learning model with `fit` and `predict_proba` methods.
+    threshold : float, default=0.5
+        Classification threshold in [0, 1] applied to the :class:`CC` output.
+
+    Warnings
+    --------
+    - Warns if all TPR or FPR values are zero.
+    - Warns if no thresholds satisfy the constraint.
+
+    References
+    ----------
+    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
+       *Data Mining and Knowledge Discovery*, 17(2), 164-206.
+    """
+    def get_best_threshold(self, thresholds, tprs, fprs):
         if np.all(tprs == 0) or np.all(fprs == 0):
             warnings.warn("All TPR or FPR values are zero.")
         indices = np.where(np.abs(tprs - fprs) > 0.25)[0]
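Taken together, MS and MS2 differ only in which thresholds survive the sweep. A compact standalone sketch of the shared idea (p_observed_at and min_gap are illustrative names; min_gap=0.25 mimics MS2's filter):

import numpy as np

def median_sweep_sketch(p_observed_at, thresholds, tprs, fprs, min_gap=0.0):
    keep = np.abs(tprs - fprs) > min_gap   # MS2 drops ambiguous thresholds
    prevs = [
        np.clip((p_observed_at(t) - fpr) / (tpr - fpr), 0.0, 1.0)
        for t, tpr, fpr in zip(thresholds[keep], tprs[keep], fprs[keep])
        if not np.isclose(tpr, fpr)        # guard degenerate denominators
    ]
    prevalence = float(np.median(prevs))   # median across surviving thresholds
    return np.asarray([1.0 - prevalence, prevalence])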
@@ -174,7 +174,7 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
|
|
|
174
174
|
--------
|
|
175
175
|
>>> from mlquantify.base_count import BaseAdjustCount
|
|
176
176
|
>>> import numpy as np
|
|
177
|
-
|
|
177
|
+
>>> from sklearn.linear_model import LogisticRegression
|
|
178
178
|
>>> class ACC(CrispLearnerQMixin, BaseAdjustCount):
|
|
179
179
|
... def _adjust(self, preds, train_preds, y_train):
|
|
180
180
|
... tpr = np.mean(train_preds[y_train == 1])
|
|
@@ -182,8 +182,6 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
|
|
|
182
182
|
... p_obs = np.mean(preds)
|
|
183
183
|
... p_adj = (p_obs - fpr) / (tpr - fpr)
|
|
184
184
|
... return np.clip([1 - p_adj, p_adj], 0, 1)
|
|
185
|
-
|
|
186
|
-
>>> from sklearn.linear_model import LogisticRegression
|
|
187
185
|
>>> X = np.random.randn(100, 5)
|
|
188
186
|
>>> y = np.random.randint(0, 2, 100)
|
|
189
187
|
>>> q = ACC(learner=LogisticRegression())
|