mlquantify 0.1.8__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mlquantify-0.1.8/mlquantify.egg-info → mlquantify-0.1.10}/PKG-INFO +2 -1
- mlquantify-0.1.10/VERSION.txt +1 -0
- mlquantify-0.1.10/mlquantify/__init__.py +13 -0
- mlquantify-0.1.10/mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify-0.1.10/mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify-0.1.10/mlquantify/adjust_counting/_base.py +245 -0
- mlquantify-0.1.10/mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify-0.1.10/mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify-0.1.10/mlquantify/base.py +157 -0
- mlquantify-0.1.10/mlquantify/base_aggregative.py +209 -0
- mlquantify-0.1.10/mlquantify/calibration.py +1 -0
- mlquantify-0.1.10/mlquantify/confidence.py +329 -0
- mlquantify-0.1.10/mlquantify/likelihood/__init__.py +5 -0
- mlquantify-0.1.10/mlquantify/likelihood/_base.py +147 -0
- mlquantify-0.1.10/mlquantify/likelihood/_classes.py +430 -0
- mlquantify-0.1.10/mlquantify/meta/__init__.py +1 -0
- mlquantify-0.1.10/mlquantify/meta/_classes.py +785 -0
- mlquantify-0.1.10/mlquantify/metrics/__init__.py +21 -0
- mlquantify-0.1.10/mlquantify/metrics/_oq.py +109 -0
- mlquantify-0.1.10/mlquantify/metrics/_rq.py +98 -0
- mlquantify-0.1.8/mlquantify/evaluation/measures.py → mlquantify-0.1.10/mlquantify/metrics/_slq.py +51 -36
- mlquantify-0.1.10/mlquantify/mixture/__init__.py +7 -0
- mlquantify-0.1.10/mlquantify/mixture/_base.py +147 -0
- mlquantify-0.1.10/mlquantify/mixture/_classes.py +458 -0
- mlquantify-0.1.10/mlquantify/mixture/_utils.py +163 -0
- mlquantify-0.1.10/mlquantify/model_selection/__init__.py +9 -0
- mlquantify-0.1.10/mlquantify/model_selection/_protocol.py +358 -0
- mlquantify-0.1.10/mlquantify/model_selection/_search.py +315 -0
- mlquantify-0.1.10/mlquantify/model_selection/_split.py +1 -0
- mlquantify-0.1.10/mlquantify/multiclass.py +350 -0
- mlquantify-0.1.10/mlquantify/neighbors/__init__.py +9 -0
- mlquantify-0.1.10/mlquantify/neighbors/_base.py +168 -0
- mlquantify-0.1.10/mlquantify/neighbors/_classes.py +150 -0
- mlquantify-0.1.8/mlquantify/classification/methods.py → mlquantify-0.1.10/mlquantify/neighbors/_classification.py +37 -62
- mlquantify-0.1.10/mlquantify/neighbors/_kde.py +268 -0
- mlquantify-0.1.10/mlquantify/neighbors/_utils.py +131 -0
- mlquantify-0.1.10/mlquantify/neural/__init__.py +1 -0
- mlquantify-0.1.10/mlquantify/utils/__init__.py +47 -0
- mlquantify-0.1.10/mlquantify/utils/_artificial.py +27 -0
- mlquantify-0.1.10/mlquantify/utils/_constraints.py +219 -0
- mlquantify-0.1.10/mlquantify/utils/_context.py +21 -0
- mlquantify-0.1.10/mlquantify/utils/_decorators.py +36 -0
- mlquantify-0.1.10/mlquantify/utils/_exceptions.py +12 -0
- mlquantify-0.1.10/mlquantify/utils/_get_scores.py +159 -0
- mlquantify-0.1.10/mlquantify/utils/_load.py +18 -0
- mlquantify-0.1.10/mlquantify/utils/_parallel.py +6 -0
- mlquantify-0.1.10/mlquantify/utils/_random.py +36 -0
- mlquantify-0.1.10/mlquantify/utils/_sampling.py +273 -0
- mlquantify-0.1.10/mlquantify/utils/_tags.py +44 -0
- mlquantify-0.1.10/mlquantify/utils/_validation.py +447 -0
- mlquantify-0.1.10/mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8 → mlquantify-0.1.10/mlquantify.egg-info}/PKG-INFO +2 -1
- mlquantify-0.1.10/mlquantify.egg-info/SOURCES.txt +58 -0
- {mlquantify-0.1.8 → mlquantify-0.1.10}/mlquantify.egg-info/requires.txt +1 -0
- {mlquantify-0.1.8 → mlquantify-0.1.10}/setup.py +1 -1
- mlquantify-0.1.8/VERSION.txt +0 -1
- mlquantify-0.1.8/mlquantify/__init__.py +0 -32
- mlquantify-0.1.8/mlquantify/base.py +0 -559
- mlquantify-0.1.8/mlquantify/classification/__init__.py +0 -1
- mlquantify-0.1.8/mlquantify/evaluation/__init__.py +0 -14
- mlquantify-0.1.8/mlquantify/evaluation/protocol.py +0 -289
- mlquantify-0.1.8/mlquantify/methods/__init__.py +0 -37
- mlquantify-0.1.8/mlquantify/methods/aggregative.py +0 -1159
- mlquantify-0.1.8/mlquantify/methods/meta.py +0 -472
- mlquantify-0.1.8/mlquantify/methods/mixture_models.py +0 -1003
- mlquantify-0.1.8/mlquantify/methods/non_aggregative.py +0 -136
- mlquantify-0.1.8/mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify-0.1.8/mlquantify/model_selection.py +0 -377
- mlquantify-0.1.8/mlquantify/plots.py +0 -367
- mlquantify-0.1.8/mlquantify/utils/__init__.py +0 -2
- mlquantify-0.1.8/mlquantify/utils/general.py +0 -371
- mlquantify-0.1.8/mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8/mlquantify.egg-info/SOURCES.txt +0 -27
- {mlquantify-0.1.8 → mlquantify-0.1.10}/MANIFEST.in +0 -0
- {mlquantify-0.1.8 → mlquantify-0.1.10}/README.md +0 -0
- {mlquantify-0.1.8 → mlquantify-0.1.10}/mlquantify.egg-info/dependency_links.txt +0 -0
- {mlquantify-0.1.8 → mlquantify-0.1.10}/mlquantify.egg-info/top_level.txt +0 -0
- {mlquantify-0.1.8 → mlquantify-0.1.10}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mlquantify
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.10
|
|
4
4
|
Summary: Quantification Library
|
|
5
5
|
Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
|
|
6
6
|
Maintainer: Luiz Fernando Luth Junior
|
|
@@ -20,6 +20,7 @@ Requires-Dist: tqdm
|
|
|
20
20
|
Requires-Dist: pandas
|
|
21
21
|
Requires-Dist: xlrd
|
|
22
22
|
Requires-Dist: matplotlib
|
|
23
|
+
Requires-Dist: abstention
|
|
23
24
|
Dynamic: classifier
|
|
24
25
|
Dynamic: description
|
|
25
26
|
Dynamic: description-content-type
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.1.10
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"mlquantify, a Python package for quantification"
|
|
2
|
+
|
|
3
|
+
from . import neighbors
|
|
4
|
+
from . import likelihood
|
|
5
|
+
from . import mixture
|
|
6
|
+
from . import meta
|
|
7
|
+
from . import adjust_counting
|
|
8
|
+
from . import model_selection
|
|
9
|
+
from . import base_aggregative
|
|
10
|
+
from . import base
|
|
11
|
+
from . import calibration
|
|
12
|
+
from . import confidence
|
|
13
|
+
from . import multiclass
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from ._counting import (
|
|
2
|
+
CC,
|
|
3
|
+
PCC
|
|
4
|
+
)
|
|
5
|
+
from ._adjustment import (
|
|
6
|
+
ThresholdAdjustment,
|
|
7
|
+
MatrixAdjustment,
|
|
8
|
+
FM,
|
|
9
|
+
GAC,
|
|
10
|
+
GPAC,
|
|
11
|
+
ACC,
|
|
12
|
+
X_method,
|
|
13
|
+
MAX,
|
|
14
|
+
T50,
|
|
15
|
+
MS,
|
|
16
|
+
MS2,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from ._utils import (
|
|
20
|
+
compute_table,
|
|
21
|
+
compute_fpr,
|
|
22
|
+
compute_tpr,
|
|
23
|
+
evaluate_thresholds,
|
|
24
|
+
)
|
|
@@ -0,0 +1,648 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
from scipy.optimize import minimize
|
|
4
|
+
import warnings
|
|
5
|
+
|
|
6
|
+
from mlquantify.adjust_counting._base import BaseAdjustCount
|
|
7
|
+
from mlquantify.adjust_counting._counting import CC, PCC
|
|
8
|
+
from mlquantify.base_aggregative import (
|
|
9
|
+
CrispLearnerQMixin,
|
|
10
|
+
SoftLearnerQMixin,
|
|
11
|
+
uses_soft_predictions,
|
|
12
|
+
)
|
|
13
|
+
from mlquantify.multiclass import define_binary
|
|
14
|
+
from mlquantify.adjust_counting._utils import evaluate_thresholds
|
|
15
|
+
from mlquantify.utils._constraints import Interval, Options
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@define_binary
class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
    r"""Base class for ROC-threshold adjustment methods (ACC, X, MAX, T50, MS, MS2).

    Corrects the bias of the *Classify & Count (CC)* estimate caused by
    train/test distribution shift, following Forman [1]_ [2]_. Given the
    observed positive proportion :math:`p'` from CC, the adjusted prevalence is

    .. math::

        \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}

    where TPR and FPR are the classifier's rates at the threshold selected by
    the concrete subclass via :meth:`get_best_threshold`.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1].
    strategy : {'ovr'}, default='ovr'
        Strategy used for multiclass adaptation.

    Attributes
    ----------
    learner : estimator
        The underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.

    Notes
    -----
    Defined only for binary quantification; multiclass problems are handled
    one-vs-rest through the ``@define_binary`` decorator.

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
        Classification", *Proceedings of ECML*, pp. 564-575.
    .. [2] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
    """

    _parameter_constraints = {
        "threshold": [
            Interval(0.0, 1.0),
            Interval(0, 1, discrete=True),
        ],
    }

    def __init__(self, learner=None, threshold=0.5, strategy="ovr"):
        super().__init__(learner=learner)
        self.threshold = threshold
        self.strategy = strategy

    def _adjust(self, predictions, train_y_scores, train_y_values):
        """Adjust the CC estimate using the subclass-selected ROC threshold."""
        scores_pos = train_y_scores[:, 1]

        candidates, tpr_curve, fpr_curve = evaluate_thresholds(
            train_y_values, scores_pos
        )
        chosen, tpr, fpr = self.get_best_threshold(candidates, tpr_curve, fpr_curve)

        observed = CC(chosen).aggregate(predictions)[1]

        denom = tpr - fpr
        if denom == 0:
            # Degenerate ROC point: the correction is undefined; keep the
            # uncorrected CC estimate.
            positive = observed
        else:
            positive = np.clip((observed - fpr) / denom, 0, 1)

        return np.asarray([1 - positive, positive])

    @abstractmethod
    def get_best_threshold(self, thresholds, tprs, fprs):
        """Return the ``(threshold, tpr, fpr)`` triple chosen by the method."""
        ...
|
122
|
+
class MatrixAdjustment(BaseAdjustCount):
    r"""Base class for matrix-based quantification adjustments.

    This class implements the matrix correction model for quantification
    as formulated in Firat (2016) [1]_, which expresses the observed prevalences
    as a linear combination of true prevalences through the confusion matrix.

    The system is modeled as:

    .. math::

        \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon

    subject to the constraints:

    .. math::

        \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1

    where:
    - :math:`\mathbf{y}` is the vector of predicted prevalences in test set,
    - :math:`\mathbf{C}` is the confusion matrix,
    - :math:`\hat{\pi}_F` is the true class prevalence vector (unknown),
    - :math:`\varepsilon` is the residual error.

    The model can be solved via:

    - **Linear algebraic solution**: uses matrix inversion
    - **Constrained optimization**: quadratic or least-squares approach

    Parameters
    ----------
    learner : estimator, optional
        Classifier with `fit` and `predict` methods.
    solver : {'optim', 'linear'}, optional
        Solver for the adjustment system:

        - `'linear'`: uses matrix inversion (e.g., GAC, GPAC)
        - `'optim'`: uses optimization (e.g., FM)

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Confusion matrix used for correction.
    classes : ndarray
        Class labels observed in training.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import MatrixAdjustment
    >>> import numpy as np
    >>> class MyMatrix(MatrixAdjustment):
    ...     def _compute_confusion_matrix(self, preds, y):
    ...         cm = np.ones((2, 2))
    ...         return cm / cm.sum(axis=1, keepdims=True)
    >>> q = MyMatrix(learner=LogisticRegression(), solver='linear')
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
        *Proceedings of AAAI Conference on Artificial Intelligence*,
        pp. 1-8.
    """

    _parameter_constraints = {"solver": Options(["optim", "linear"])}

    def __init__(self, learner=None, solver=None):
        super().__init__(learner=learner)
        self.solver = solver

    def _adjust(self, predictions, train_y_pred, train_y_values):
        """Build the confusion matrix and dispatch to the configured solver."""
        n_class = len(np.unique(train_y_values))
        self.CM = np.zeros((n_class, n_class))

        if self.solver == 'optim':
            # Training prevalences double as thresholds (FM) and as the
            # optimizer's fallback answer.
            priors = np.array(list(CC().aggregate(train_y_pred).values()))
            self.CM = self._compute_confusion_matrix(train_y_pred, train_y_values, priors)
            prevs_estim = self._get_estimations(predictions > priors)
            prevalence = self._solve_optimization(prevs_estim, priors)
        else:
            self.CM = self._compute_confusion_matrix(train_y_pred)
            prevs_estim = self._get_estimations(predictions)
            prevalence = self._solve_linear(prevs_estim)

        return prevalence

    def _solve_linear(self, prevs_estim):
        r"""Solve the system by matrix inversion.

        The solution is:

        .. math::

            \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}

        where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}`
        is the observed prevalence vector. The result is clipped to [0, 1] and
        renormalized to sum to one; if :math:`\mathbf{C}` is singular, the
        observed prevalences are returned unchanged.

        Parameters
        ----------
        prevs_estim : ndarray of shape (n_classes,)
            Observed prevalence vector from test set.

        Returns
        -------
        ndarray of shape (n_classes,)
            Adjusted prevalence estimates :math:`\hat{\pi}_F`.
        """
        try:
            adjusted = np.linalg.solve(self.CM, prevs_estim)
            adjusted = np.clip(adjusted, 0, 1)
            adjusted /= adjusted.sum()
        except np.linalg.LinAlgError:
            adjusted = prevs_estim
        return adjusted

    def _solve_optimization(self, prevs_estim, priors):
        r"""Solve the system by constrained optimization.

        Minimizes the residual norm

        .. math::

            \min_{\hat{\pi}_F} \| \mathbf{C}\hat{\pi}_F - \mathbf{p} \|

        subject to :math:`\hat{\pi}_F \ge 0` and
        :math:`\sum_k \hat{\pi}_{F,k} = 1`.

        Parameters
        ----------
        prevs_estim : ndarray of shape (n_classes,)
            Observed prevalence vector from test set.
        priors : ndarray of shape (n_classes,)
            Training prevalences, returned as fallback when the optimizer
            does not converge.

        Returns
        -------
        ndarray of shape (n_classes,)
            Adjusted prevalence estimates :math:`\hat{\pi}_F`.
        """
        def objective(prevs_pred):
            return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)

        constraints = [
            {'type': 'eq', 'fun': lambda x: np.sum(x) - 1},
            {'type': 'ineq', 'fun': lambda x: x}
        ]
        bounds = [(0, 1)] * self.CM.shape[1]
        init = np.full(self.CM.shape[1], 1 / self.CM.shape[1])
        result = minimize(objective, init, constraints=constraints, bounds=bounds)
        return result.x if result.success else priors

    def _get_estimations(self, predictions):
        """Return prevalence estimates using CC (crisp) or PCC (probabilistic)."""
        if uses_soft_predictions(self):
            return np.array(list(PCC().aggregate(predictions).values()))
        return np.array(list(CC().aggregate(predictions).values()))

    @abstractmethod
    def _compute_confusion_matrix(self, predictions, *args):
        """Build (and normalize) the confusion matrix for the concrete method."""
        ...
|
274
|
+
class FM(SoftLearnerQMixin, MatrixAdjustment):
    r"""Friedman Method for quantification adjustment.

    Matrix-based adjustment after Friedman (2015) [1]_, solved as a
    constrained optimization problem. The confusion matrix is estimated by
    thresholding posterior probabilities against the training priors, and the
    prevalences are recovered by solving

    .. math::

        \min_{\hat{\pi}_F} \| \mathbf{C} \hat{\pi}_F - \mathbf{p} \|^2
        \quad \text{s.t.} \quad
        \hat{\pi}_F \geq 0, \; \sum_k \hat{\pi}_{F,k} = 1

    where :math:`\mathbf{C}` is the confusion matrix and :math:`\mathbf{p}`
    the vector of predicted prevalences.

    Parameters
    ----------
    learner : estimator, optional
        Base classifier with `fit` and `predict_proba` methods.
        If None, a default estimator will be used.

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Confusion matrix used for correction.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import FM
    >>> import numpy as np
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> fm = FM(learner=LogisticRegression())
    >>> fm.fit(X, y)
    >>> fm.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Friedman, J. H., et al. (2015). "Detecting and Dealing with Concept Drift",
        *Proceedings of the IEEE*, 103(11), 1522-1541.
    """

    def __init__(self, learner=None):
        # FM always solves via constrained optimization.
        super().__init__(learner=learner, solver='optim')

    def _compute_confusion_matrix(self, posteriors, y_true, priors):
        # Column k holds the estimated prediction distribution over the
        # samples whose true label is class k, after thresholding the
        # posteriors at the training priors.
        for col, label in enumerate(self.classes_):
            mask = (y_true == label)
            self.CM[:, col] = self._get_estimations(posteriors[mask] > priors)
        return self.CM
342
|
+
class GAC(CrispLearnerQMixin, MatrixAdjustment):
    r"""Generalized Adjusted Count method.

    Generalized Adjusted Count (GAC) algorithm from Firat (2016) [1]_, solved
    linearly. Each column of the confusion matrix is divided by the class
    prevalence estimated from crisp predictions, correcting for the bias
    introduced by differing train/test class distributions. Columns whose
    estimated prevalence is zero receive a 1 on the diagonal so the matrix
    stays usable (no division by zero).

    Parameters
    ----------
    learner : estimator, optional
        Base classifier with `fit` and `predict` methods.

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Normalized confusion matrix used for adjusting predicted prevalences.
    classes_ : ndarray
        Array of class labels observed during training.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import GAC
    >>> import numpy as np
    >>> gac = GAC(learner=LogisticRegression())
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> gac.fit(X, y)
    >>> gac.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
        *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
    """

    def __init__(self, learner=None):
        # GAC always solves via matrix inversion.
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, predictions):
        # NOTE(review): self.CM is zero-initialized in MatrixAdjustment._adjust
        # before this method runs, so the column scaling below operates on
        # zeros — confirm whether the matrix is meant to be populated with
        # per-class counts first.
        estimated = self._get_estimations(predictions)
        for col, _ in enumerate(self.classes_):
            prev = estimated[col]
            if prev == 0:
                # Keep the system non-degenerate for absent classes.
                self.CM[col, col] = 1
            else:
                self.CM[:, col] /= prev
        return self.CM
|
402
|
+
class GPAC(SoftLearnerQMixin, MatrixAdjustment):
    r"""Probabilistic Generalized Adjusted Count (GPAC) method.

    Probabilistic extension of GAC from Firat (2016) [1]_, solved linearly.
    Instead of crisp labels, the class prevalences used to normalize the
    confusion matrix are estimated from posterior probabilities, which can
    improve accuracy when the posteriors are well calibrated. Each column is
    divided by its estimated prevalence; columns with zero prevalence receive
    a 1 on the diagonal so the matrix stays valid.

    Parameters
    ----------
    learner : estimator, optional
        Base classifier with `fit` and `predict_proba` methods.

    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Normalized confusion matrix used for adjustment.
    classes_ : ndarray
        Array of class labels observed during training.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import GPAC
    >>> import numpy as np
    >>> gpac = GPAC(learner=LogisticRegression())
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> gpac.fit(X, y)
    >>> gpac.predict(X)
    {0: 0.5, 1: 0.5}

    References
    ----------
    .. [1] Firat, A. (2016). "Unified Framework for Quantification",
        *Proceedings of AAAI Conference on Artificial Intelligence*, pp. 1-8.
    """

    def __init__(self, learner=None):
        # GPAC always solves via matrix inversion.
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, posteriors):
        # NOTE(review): self.CM is zero-initialized in MatrixAdjustment._adjust
        # before this method runs, so the column scaling below operates on
        # zeros — confirm whether the matrix is meant to be populated with
        # per-class counts first.
        estimated = self._get_estimations(posteriors)
        for col, _ in enumerate(self.classes_):
            prev = estimated[col]
            if prev == 0:
                # Keep the system non-degenerate for absent classes.
                self.CM[col, col] = 1
            else:
                self.CM[:, col] /= prev
        return self.CM
|
461
|
+
class ACC(ThresholdAdjustment):
    r"""Adjusted Count (ACC) — baseline threshold correction.

    This method corrects the bias in class prevalence estimates caused by imperfect
    classification accuracy, by adjusting the observed positive count using estimates
    of the classifier's true positive rate (TPR) and false positive rate (FPR).

    It uses a fixed classification threshold and applies the formula:

    .. math::

        p = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}

    where :math:`p'` is the observed positive proportion from :class:`CC`.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
        *ECML*, pp. 564-575.
    """

    def get_best_threshold(self, thresholds, tprs, fprs):
        """Return the candidate threshold nearest the configured one.

        An exact-match lookup (``thresholds == self.threshold``) raises
        ``IndexError`` whenever the configured threshold is not among the
        evaluated candidates; selecting the nearest candidate is identical
        when an exact match exists and degrades gracefully otherwise.
        """
        idx = int(np.argmin(np.abs(np.asarray(thresholds) - self.threshold)))
        return thresholds[idx], tprs[idx], fprs[idx]
|
496
|
+
class X_method(ThresholdAdjustment):
    r"""X method — threshold where :math:`\text{TPR} + \text{FPR} = 1`.

    Selects the ROC point whose true positive rate and false positive rate
    sum to one (i.e. FPR = 1 - TPR), balancing the two error rates for the
    prevalence correction.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
        Classification", *ECML*, pp. 564-575.
    """

    def get_best_threshold(self, thresholds, tprs, fprs):
        # Distance of every candidate from the TPR + FPR = 1 line.
        distance = np.abs(1 - (tprs + fprs))
        best = int(np.argmin(distance))
        return thresholds[best], tprs[best], fprs[best]
|
521
|
+
class MAX(ThresholdAdjustment):
    r"""MAX method — threshold maximizing :math:`\text{TPR} - \text{FPR}`.

    Selects the ROC point with the largest gap between the true positive
    rate and the false positive rate, maximizing the denominator of the
    adjusted-count correction.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
        Classification", *ECML*, pp. 564-575.
    """

    def get_best_threshold(self, thresholds, tprs, fprs):
        # NOTE(review): maximizes |TPR - FPR| rather than the signed
        # difference — confirm the absolute value is intended (it also admits
        # thresholds where FPR exceeds TPR).
        best = int(np.argmax(np.abs(tprs - fprs)))
        return thresholds[best], tprs[best], fprs[best]
|
547
|
+
class T50(ThresholdAdjustment):
    r"""T50 — selects threshold where :math:`\text{TPR} = 0.5`.

    Chooses the classification threshold whose true positive rate is closest
    to 0.5, steering clear of extreme thresholds where the rate estimates
    become unreliable.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate
        Classification", *ECML*, pp. 564-575.
    """

    def get_best_threshold(self, thresholds, tprs, fprs):
        # Candidate whose TPR lies nearest to one half.
        best = int(np.argmin(np.abs(tprs - 0.5)))
        return thresholds[best], tprs[best], fprs[best]
|
572
|
+
class MS(ThresholdAdjustment):
    r"""Median Sweep (MS) — median prevalence estimate across all thresholds.

    This method computes class prevalence estimates at multiple classification thresholds,
    using the adjusted count formula for each, then returns the median of these estimates,
    reducing variance caused by any single threshold selection.

    It thus leverages the strengths of bootstrap-like variance reduction without heavy
    computation.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    References
    ----------
    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
    """

    def _adjust(self, predictions, train_y_scores, train_y_values):
        """Median of per-threshold adjusted-count estimates."""
        positive_scores = train_y_scores[:, 1]

        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores)
        thresholds, tprs, fprs = self.get_best_threshold(thresholds, tprs, fprs)

        estimates = []
        for thr, tpr, fpr in zip(thresholds, tprs, fprs):
            observed = CC(thr).aggregate(predictions)[1]
            if tpr - fpr == 0:
                # Degenerate ROC point: correction undefined, keep CC estimate.
                estimates.append(observed)
            else:
                estimates.append((observed - fpr) / (tpr - fpr))
        # Individual adjusted estimates may fall outside [0, 1]; clip the
        # median for consistency with ThresholdAdjustment._adjust, which clips
        # its single-threshold estimate.
        prevalence = float(np.clip(np.median(estimates), 0, 1))
        return np.asarray([1 - prevalence, prevalence])

    def get_best_threshold(self, thresholds, tprs, fprs):
        """MS sweeps every threshold; return the full arrays unchanged."""
        return thresholds, tprs, fprs
|
615
|
+
class MS2(MS):
    r"""MS2 — Median Sweep variant constraining :math:`|\text{TPR} - \text{FPR}| > 0.25`.

    Variant of Median Sweep that drops thresholds where the TPR/FPR gap is
    at most 0.25, avoiding ambiguous ROC regions and stabilizing the median
    estimate. When no threshold satisfies the constraint, all thresholds are
    used.

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1] for applying in the :class:`CC` output.

    Warnings
    --------
    - Warns if all TPR or FPR values are zero.
    - Warns if no thresholds satisfy the constraint.

    References
    ----------
    .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
    """

    def get_best_threshold(self, thresholds, tprs, fprs):
        if np.all(tprs == 0) or np.all(fprs == 0):
            warnings.warn("All TPR or FPR values are zero.")
        gap = np.abs(tprs - fprs)
        keep = np.where(gap > 0.25)[0]
        if len(keep) == 0:
            warnings.warn("No cases satisfy |TPR - FPR| > 0.25.")
            # Fall back to every (non-NaN) threshold.
            keep = np.where(gap >= 0)[0]
        return thresholds[keep], tprs[keep], fprs[keep]