mlquantify 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +0 -29
- mlquantify/adjust_counting/__init__.py +14 -0
- mlquantify/adjust_counting/_adjustment.py +365 -0
- mlquantify/adjust_counting/_base.py +247 -0
- mlquantify/adjust_counting/_counting.py +145 -0
- mlquantify/adjust_counting/_utils.py +114 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +335 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +161 -0
- mlquantify/likelihood/_classes.py +414 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +761 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +153 -0
- mlquantify/mixture/_classes.py +400 -0
- mlquantify/mixture/_utils.py +112 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +198 -0
- mlquantify/neighbors/_classes.py +159 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
- mlquantify/neighbors/_kde.py +270 -0
- mlquantify/neighbors/_utils.py +135 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +61 -0
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
- mlquantify-0.1.9.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -291
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.7.dist-info/RECORD +0 -22
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
mlquantify/likelihood/_base.py

@@ -0,0 +1,161 @@
import numpy as np
from abc import abstractmethod

from mlquantify.base import BaseQuantifier

from mlquantify.base_aggregative import (
    AggregationMixin,
    _get_learner_function
)
from mlquantify.adjust_counting import CC
from mlquantify.utils._decorators import _fit_context
from mlquantify.utils._validation import check_classes_attribute, validate_predictions, validate_y, validate_data, validate_prevalences


class BaseIterativeLikelihood(AggregationMixin, BaseQuantifier):
    """
    Iterative, likelihood-based quantification via EM adjustment.

    This is the base class for quantification methods that estimate class prevalences
    by solving the maximum-likelihood problem under prior probability shift, using
    iterative procedures such as the EM (Expectation-Maximization) algorithm
    [1], [2].

    These methods repeatedly adjust the estimated class prevalences for a test set
    by maximizing the likelihood of the observed classifier outputs (posterior
    probabilities), under the assumption that the within-class conditional
    distributions remain fixed between the training and test domains.

    Mathematical formulation
    ------------------------
    Let:

    - \( p_k^t \) denote the prior probability of class \( k \) in the training set (\( \sum_k p_k^t = 1 \)),
    - \( s_k(x) \) be the classifier's posterior probability estimate for class \( k \) given instance \( x \), fitted on the training set,
    - \( p_k \) be the (unknown) prior of class \( k \) in the test set,
    - \( x_1, \dots, x_N \) the unlabeled test set instances.

    The procedure iteratively estimates \( p_k \) by maximizing the observed-data likelihood

    \[
    L = \prod_{i=1}^N \sum_{k=1}^K s_k(x_i) \frac{p_k}{p_k^t}.
    \]

    The E-step updates the soft memberships

    \[
    w_{ik}^{(t)} = \frac{s_k(x_i) \, (p_k^{(t-1)} / p_k^t)}{\sum_{j=1}^K s_j(x_i) \, (p_j^{(t-1)} / p_j^t)}
    \]

    and the M-step re-estimates the prevalences

    \[
    p_k^{(t)} = \frac{1}{N} \sum_{i=1}^N w_{ik}^{(t)}.
    \]

    See also [1].

    Notes
    -----
    - Defined for binary and multiclass (single-label) quantification, as long as the classifier provides well-calibrated posterior probabilities.
    - Assumes prior probability shift only.
    - Converges to a (local) maximum of the data likelihood.
    - The algorithm is Fisher-consistent under prior probability shift [2].
    - Closely related to the Expectation-Maximization (EM) algorithm for mixture models.

    Parameters
    ----------
    learner : estimator, optional
        Probabilistic classifier instance with `fit(X, y)` and `predict_proba(X)`.
    tol : float, default=1e-4
        Convergence tolerance for the prevalence update.
    max_iter : int, default=100
        Maximum number of EM update iterations.

    Attributes
    ----------
    learner : estimator
        Underlying classifier instance.
    tol : float
        Stopping tolerance for EM prevalence estimation.
    max_iter : int
        Maximum number of updates performed.
    classes : ndarray of shape (n_classes,)
        Unique class labels seen in training.
    priors : ndarray of shape (n_classes,)
        Class distribution of the training set.
    y_train : array-like
        Training labels (used for estimating priors and, if needed, a confusion matrix).

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> class MyEM(BaseIterativeLikelihood):
    ...     def _iterate(self, predictions, priors):
    ...         # EM iteration logic here
    ...         pass
    >>> X = np.random.randn(200, 8)
    >>> y = np.random.randint(0, 3, size=(200,))
    >>> q = MyEM(learner=LogisticRegression(max_iter=200))
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.32, 1: 0.40, 2: 0.28}

    References
    ----------
    [1] Saerens, M., Latinne, P., & Decaestecker, C. (2002). *Adjusting the Outputs of a Classifier to New a Priori Probabilities: A Simple Procedure.* Neural Computation, 14(1), 21-41.

    [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). *Learning to Quantify.* The Information Retrieval Series 47, Springer. https://doi.org/10.1007/978-3-031-20467-8
    """

    @abstractmethod
    def __init__(self,
                 learner=None,
                 tol=1e-4,
                 max_iter=100):
        self.learner = learner
        self.tol = tol
        self.max_iter = max_iter

    def __mlquantify_tags__(self):
        tags = super().__mlquantify_tags__()
        tags.prediction_requirements.requires_train_proba = False
        return tags

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the quantifier using the provided data and learner."""
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        self.learner.fit(X, y)
        counts = np.array([np.count_nonzero(y == _class) for _class in self.classes_])
        self.priors = counts / len(y)
        self.y_train = y

        return self

    def predict(self, X):
        """Predict class prevalences for the given data."""
        estimator_function = _get_learner_function(self)
        predictions = getattr(self.learner, estimator_function)(X)
        prevalences = self.aggregate(predictions, self.y_train)
        return prevalences

    def aggregate(self, predictions, y_train):
        predictions = validate_predictions(self, predictions)
        self.classes_ = check_classes_attribute(self, np.unique(y_train))

        if not hasattr(self, 'priors') or len(self.priors) != len(self.classes_):
            counts = np.array([np.count_nonzero(y_train == _class) for _class in self.classes_])
            self.priors = counts / len(y_train)

        prevalences = self._iterate(predictions, self.priors)
        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences

    @abstractmethod
    def _iterate(self, predictions, priors):
        ...
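The E-step/M-step documented above maps directly onto a few lines of NumPy. The following standalone sketch reproduces one EM update consistent with the docstring formulas; it is illustrative only and uses none of the package's internals.

```python
import numpy as np

def em_update(posteriors, train_priors, current_priors):
    """One EM update: reweight posteriors by the prior ratio (E-step),
    then average the memberships to get new prevalences (M-step)."""
    # E-step: w_ik proportional to s_k(x_i) * p_k^(t-1) / p_k^t
    w = posteriors * (current_priors / train_priors)
    w /= w.sum(axis=1, keepdims=True)
    # M-step: p_k^(t) = mean_i w_ik
    return w.mean(axis=0)

# toy example: 3 classes, posteriors from a classifier trained with uniform priors
posteriors = np.array([[0.7, 0.2, 0.1],
                       [0.1, 0.8, 0.1],
                       [0.6, 0.3, 0.1],
                       [0.2, 0.2, 0.6]])
train_priors = np.array([1/3, 1/3, 1/3])
p = train_priors.copy()
for _ in range(50):
    p = em_update(posteriors, train_priors, p)
print(p)  # prevalence estimate for the 3 classes, summing to 1
```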
mlquantify/likelihood/_classes.py

@@ -0,0 +1,414 @@
import numpy as np
from mlquantify.base_aggregative import SoftLearnerQMixin
from mlquantify.likelihood._base import BaseIterativeLikelihood
from mlquantify.metrics._slq import MAE
from mlquantify.multiclass import define_binary
from mlquantify.utils._constraints import (
    Interval,
    CallableConstraint,
    Options
)


class EMQ(SoftLearnerQMixin, BaseIterativeLikelihood):
    """Expectation-Maximization Quantifier.

    Implements iterative quantification using an EM algorithm to adjust class
    prevalences under prior probability shift, based on the posterior probabilities
    (soft predictions) of a probabilistic classifier.

    The EM procedure alternates between estimating posterior memberships of the test
    instances (E-step) and re-estimating the class prevalences (M-step), iterating
    until convergence (tolerance or maximum iterations), with the change in prevalences
    measured by a user-defined criterion (default: Mean Absolute Error, MAE).

    Supports optional calibration of the predicted posteriors before iteration.

    Parameters
    ----------
    learner : estimator, optional
        Probabilistic classifier fit on training data with `predict_proba`.
    tol : float, default=1e-4
        Convergence threshold for the EM iterative updates.
    max_iter : int, default=100
        Maximum number of EM iterations.
    calib_function : str or callable, optional
        Calibration method applied to the posterior probabilities.
        Supported strings: 'bcts', 'ts', 'vs', 'nbvs'.
    criteria : callable, default=MAE
        Function used to measure convergence between prevalence estimates.

    Methods
    -------
    _iterate(predictions, priors)
        Executes EM iterations to estimate prevalences from posterior probabilities.
    EM(posteriors, priors, tolerance, max_iter, criteria)
        Class method implementing the EM loop with E-step and M-step.
    _apply_calibration(predictions)
        Applies the optional calibration method to posterior predictions.

    References
    ----------
    [1] Saerens et al. (2002). Adjusting the Outputs of a Classifier to New a Priori Probabilities. Neural Computation, 14(1), 21-41.
    [2] Esuli et al. (2023). Learning to Quantify. Springer.
    """

    _parameter_constraints = {
        "tol": [Interval(0, None, inclusive_left=False)],
        "max_iter": [Interval(1, None, inclusive_left=True)],
        "calib_function": [
            Options(["bcts", "ts", "vs", "nbvs", None]),
        ],
        "criteria": [CallableConstraint()],
    }

    def __init__(self,
                 learner=None,
                 tol=1e-4,
                 max_iter=100,
                 calib_function=None,
                 criteria=MAE):
        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
        self.calib_function = calib_function
        self.criteria = criteria

    def _iterate(self, predictions, priors):
        """
        Perform the EM quantification iteration.

        Steps:
        - Calibrate the posterior predictions if a calibration function is specified.
        - Apply the EM procedure to re-estimate prevalences, based on the training priors and the (calibrated) posteriors.

        Parameters
        ----------
        predictions : ndarray of shape (n_samples, n_classes)
            Posterior probabilities for each class on the test data.
        priors : ndarray of shape (n_classes,)
            Training set class prevalences, serving as initial priors.

        Returns
        -------
        prevalences : ndarray of shape (n_classes,)
            Estimated class prevalences after EM iteration.
        """
        calibrated_predictions = self._apply_calibration(predictions)
        prevalences, _ = self.EM(
            posteriors=calibrated_predictions,
            priors=priors,
            tolerance=self.tol,
            max_iter=self.max_iter,
            criteria=self.criteria
        )
        return prevalences

    @classmethod
    def EM(cls, posteriors, priors, tolerance=1e-6, max_iter=100, criteria=MAE):
        """
        Class method implementing the EM algorithm for quantification.

        Parameters
        ----------
        posteriors : ndarray of shape (n_samples, n_classes)
            Posterior probability predictions.
        priors : ndarray of shape (n_classes,)
            Training class prior probabilities.
        tolerance : float
            Convergence threshold based on the difference between iterations.
        max_iter : int
            Maximum number of EM iterations.
        criteria : callable
            Metric used to assess convergence, e.g., MAE.

        Returns
        -------
        qs : ndarray of shape (n_classes,)
            Estimated test set class prevalences.
        ps : ndarray of shape (n_samples, n_classes)
            Updated soft membership probabilities per instance.
        """
        Px = np.array(posteriors, dtype=np.float64)
        Ptr = np.array(priors, dtype=np.float64)

        if np.prod(Ptr) == 0:
            Ptr += tolerance
            Ptr /= Ptr.sum()

        qs = np.copy(Ptr)
        s, converged = 0, False
        qs_prev_ = None

        while not converged and s < max_iter:
            # E-step:
            ps_unnormalized = (qs / Ptr) * Px
            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)

            # M-step:
            qs = ps.mean(axis=0)

            if qs_prev_ is not None and criteria(qs_prev_, qs) < tolerance and s > 10:
                converged = True

            qs_prev_ = qs
            s += 1

        if not converged:
            print('[warning] the method has reached the maximum number of iterations; it might have not converged')

        return qs, ps

    def _apply_calibration(self, predictions):
        """
        Calibrate the posterior predictions with the specified calibration method.

        Parameters
        ----------
        predictions : ndarray
            Posterior predictions to calibrate.

        Returns
        -------
        calibrated_predictions : ndarray
            Calibrated posterior predictions.

        Raises
        ------
        ValueError
            If calib_function is unrecognized.
        """
        if self.calib_function is None:
            return predictions

        if isinstance(self.calib_function, str):
            method = self.calib_function.lower()
            if method == "ts":
                return self._temperature_scaling(predictions)
            elif method == "bcts":
                return self._bias_corrected_temperature_scaling(predictions)
            elif method == "vs":
                return self._vector_scaling(predictions)
            elif method == "nbvs":
                return self._no_bias_vector_scaling(predictions)

        elif callable(self.calib_function):
            return self.calib_function(predictions)

        raise ValueError(
            f"Invalid calib_function '{self.calib_function}'. Expected one of {{'bcts', 'ts', 'vs', 'nbvs', None, callable}}."
        )

    def _temperature_scaling(self, preds):
        """Temperature Scaling calibration applied to the logits."""
        T = 1.0
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits / T
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)

    def _bias_corrected_temperature_scaling(self, preds):
        """Bias-Corrected Temperature Scaling calibration."""
        T = 1.0
        bias = np.zeros(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        logits = logits / T + bias
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    def _vector_scaling(self, preds):
        """Vector Scaling calibration."""
        W = np.ones(preds.shape[1])
        b = np.zeros(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits * W + b
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)

    def _no_bias_vector_scaling(self, preds):
        """No-Bias Vector Scaling calibration."""
        W = np.ones(preds.shape[1])
        preds = np.clip(preds, 1e-12, 1.0)
        logits = np.log(preds)
        scaled = logits * W
        exp_scaled = np.exp(scaled - np.max(scaled, axis=1, keepdims=True))
        return exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)

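Because `EM` is exposed as a classmethod operating on plain arrays, the loop can be exercised without fitting a learner. A minimal sketch, assuming the module path introduced by this diff (`mlquantify.likelihood._classes`) and synthetic, roughly calibrated posteriors:

```python
import numpy as np
from mlquantify.likelihood._classes import EMQ  # module path as added by this diff

rng = np.random.default_rng(0)
# synthetic positive-class scores: high for the ~80% positives, low for the ~20% negatives
p_pos = np.concatenate([rng.beta(8, 2, size=800), rng.beta(2, 8, size=200)])
posteriors = np.column_stack([1.0 - p_pos, p_pos])   # shape (1000, 2), rows sum to 1

train_priors = np.array([0.5, 0.5])                  # classifier trained on balanced data
qs, ps = EMQ.EM(posteriors, train_priors, tolerance=1e-6, max_iter=100)
print(qs)  # estimated [p(neg), p(pos)]; pulled above the 0.5 prior toward the ~0.8 positive rate
```

A custom callable can also be supplied as `calib_function`: per `_apply_calibration` above, it receives the posterior matrix and its return value is used in place of the original posteriors.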
class MLPE(SoftLearnerQMixin, BaseIterativeLikelihood):
    """
    Maximum Likelihood Prevalence Estimation (MLPE) quantifier.

    A simple iterative likelihood quantification method that returns the
    training priors as the estimated prevalences, effectively skipping iteration.

    This method assumes no prior probability shift between training and test.

    Parameters
    ----------
    learner : estimator, optional
        Base classifier for possible extension/fitting.
    """

    def __init__(self, learner=None):
        super().__init__(learner=learner, max_iter=1)

    def _iterate(self, predictions, priors):
        """Returns training priors without adjustment.

        Parameters
        ----------
        predictions : array-like
            Ignored in this implementation.
        priors : array-like
            Training priors, returned as is.

        Returns
        -------
        prevalences : array-like
            Equal to the training priors.
        """
        return priors

@define_binary
class CDE(SoftLearnerQMixin, BaseIterativeLikelihood):
    """
    CDE-Iterate (Class Distribution Estimation Iterate) quantifier for binary problems.

    This method iteratively estimates class prevalences under prior probability shift
    by updating the false positive cost in a cost-sensitive classification framework,
    using a thresholding strategy based on posterior probabilities.

    The procedure:
    - Computes a decision threshold from the false-positive (cFP) and false-negative (cFN) costs.
    - Assigns hard positive predictions where the posterior probability exceeds the threshold.
    - Estimates the prevalence via classify-and-count on the thresholded predictions.
    - Updates the false positive cost according to the prevalence estimates and training priors.
    - Iterates until the prevalence estimates converge or the maximum number of iterations is reached.

    This implementation adopts the transductive thresholding variant described in
    Esuli et al. (2023), rather than retraining a cost-sensitive classifier as in
    Xue & Weiss (2009).

    Parameters
    ----------
    learner : estimator, optional
        Wrapped classifier (unused here but part of the base interface).
    tol : float, default=1e-4
        Absolute tolerance for convergence of the estimated prevalences.
    max_iter : int, default=100
        Maximum number of iterations allowed.
    init_cfp : float, default=1.0
        Initial false positive cost coefficient.

    References
    ----------
    [2] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
    """

    _parameter_constraints = {
        "tol": [Interval(0, None, inclusive_left=False)],
        "max_iter": [Interval(1, None, inclusive_left=True)],
        "init_cfp": [Interval(0, None, inclusive_left=False)]
    }

    def __init__(self, learner=None, tol=1e-4, max_iter=100, init_cfp=1.0):
        super().__init__(learner=learner, tol=tol, max_iter=max_iter)
        self.init_cfp = float(init_cfp)

    def _iterate(self, predictions, priors):
        """
        Iteratively estimate prevalences via cost-sensitive thresholding.

        Parameters
        ----------
        predictions : ndarray, shape (n_samples, 2)
            Posterior probabilities for the binary classes [neg, pos].
        priors : ndarray, shape (2,)
            Training priors [p(neg), p(pos)].

        Returns
        -------
        prevalences : ndarray, shape (2,)
            Estimated prevalences for the classes [neg, pos].
        """
        P = np.asarray(predictions, dtype=np.float64)
        Ptr = np.asarray(priors, dtype=np.float64)

        # basic checks
        if P.ndim != 2 or P.shape[1] != 2:
            raise ValueError("This CDE implementation supports the binary case only: predictions must have shape (n, 2).")

        # ensure no zeros
        eps = 1e-12
        P = np.clip(P, eps, 1.0)

        # training priors pL(+), pL(-);
        # Ptr is assumed to be in the same class order as the columns of P.
        pL_pos = Ptr[1]
        pL_neg = Ptr[0]
        if pL_pos <= 0 or pL_neg <= 0:
            # keep them positive to avoid divisions by zero
            pL_pos = max(pL_pos, eps)
            pL_neg = max(pL_neg, eps)

        # initialize costs
        cFN = 1.0
        cFP = float(self.init_cfp)

        prev_prev_pos = None
        s = 0

        # iterate: compute threshold from costs, classify, estimate prevalences via CC,
        # update cFP via Eq. (4.27) of [2], repeat
        while s < self.max_iter:
            # decision threshold tau for the positive class:
            # predict positive if cost_FP * p(-|x) < cost_FN * p(+|x)
            # => predict positive if p(+|x) / p(-|x) > cost_FP / cost_FN
            # since p(-|x) = 1 - p(+|x), this amounts to:
            # p(+|x) > cost_FP / (cost_FP + cost_FN)
            tau = cFP / (cFP + cFN)

            # hard predictions for the positive class using the threshold on the positive posterior (column 1)
            pos_probs = P[:, 1]
            hard_pos = (pos_probs > tau).astype(float)

            # classify-and-count prevalence estimate on the unlabeled set
            prev_pos = hard_pos.mean()
            prev_neg = 1.0 - prev_pos

            # update cFP according to Eq. (4.27):
            # cFP_new = (pL_pos / pL_neg) * (pU_hat(neg) / pU_hat(pos)) * cFN
            # guard against zero prev_pos / prev_neg
            prev_pos_safe = max(prev_pos, eps)
            prev_neg_safe = max(prev_neg, eps)

            cFP_new = (pL_pos / pL_neg) * (prev_neg_safe / prev_pos_safe) * cFN

            # check convergence on the prevalences (absolute change)
            if prev_prev_pos is not None and abs(prev_pos - prev_prev_pos) < self.tol:
                break

            # prepare next iteration
            cFP = cFP_new
            prev_prev_pos = prev_pos
            s += 1

        # if the loop did not converge within max_iter, keep the last estimate
        # (the book warns about the lack of Fisher consistency)
        if s >= self.max_iter:
            # optional: warning
            # print('[warning] CDE-Iterate reached max_iter without converging')
            pass

        prevalences = np.array([prev_neg, prev_pos], dtype=np.float64)
        # ensure the estimate sums to 1 (numerical safety)
        prevalences = prevalences / prevalences.sum()

        return prevalences

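To make the cost-update step concrete, here is an illustrative two-step trace of the thresholding mechanics on toy posteriors (the numbers are made up and this is not a convergence demonstration; it only mirrors the loop above):

```python
import numpy as np

# toy positive-class posteriors and balanced training priors [p(neg), p(pos)] = [0.5, 0.5]
pos_probs = np.array([0.9, 0.8, 0.7, 0.6, 0.4, 0.2])
pL_neg, pL_pos = 0.5, 0.5
cFN, cFP = 1.0, 1.0

for step in range(2):
    tau = cFP / (cFP + cFN)                  # step 0: 0.50, step 1: ~0.33
    prev_pos = (pos_probs > tau).mean()      # step 0: 4/6 ~ 0.67, step 1: 5/6 ~ 0.83
    prev_neg = 1.0 - prev_pos
    # cost update (Eq. 4.27): a larger positive prevalence estimate lowers cFP,
    # which lowers the threshold used on the next pass
    cFP = (pL_pos / pL_neg) * (prev_neg / prev_pos) * cFN
    print(step, round(tau, 2), round(prev_pos, 2))
```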
mlquantify/meta/__init__.py

@@ -0,0 +1 @@
from ._classes import EnsembleQ, QuaDapt, AggregativeBootstrap