mlquantify 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mlquantify-0.1.7/mlquantify.egg-info → mlquantify-0.1.9}/PKG-INFO +2 -1
- mlquantify-0.1.9/VERSION.txt +1 -0
- mlquantify-0.1.9/mlquantify/__init__.py +3 -0
- mlquantify-0.1.9/mlquantify/adjust_counting/__init__.py +14 -0
- mlquantify-0.1.9/mlquantify/adjust_counting/_adjustment.py +365 -0
- mlquantify-0.1.9/mlquantify/adjust_counting/_base.py +247 -0
- mlquantify-0.1.9/mlquantify/adjust_counting/_counting.py +145 -0
- mlquantify-0.1.9/mlquantify/adjust_counting/_utils.py +114 -0
- mlquantify-0.1.9/mlquantify/base.py +157 -0
- mlquantify-0.1.9/mlquantify/base_aggregative.py +209 -0
- mlquantify-0.1.9/mlquantify/calibration.py +1 -0
- mlquantify-0.1.9/mlquantify/confidence.py +335 -0
- mlquantify-0.1.9/mlquantify/likelihood/__init__.py +5 -0
- mlquantify-0.1.9/mlquantify/likelihood/_base.py +161 -0
- mlquantify-0.1.9/mlquantify/likelihood/_classes.py +414 -0
- mlquantify-0.1.9/mlquantify/meta/__init__.py +1 -0
- mlquantify-0.1.9/mlquantify/meta/_classes.py +761 -0
- mlquantify-0.1.9/mlquantify/metrics/__init__.py +21 -0
- mlquantify-0.1.9/mlquantify/metrics/_oq.py +109 -0
- mlquantify-0.1.9/mlquantify/metrics/_rq.py +98 -0
- mlquantify-0.1.7/mlquantify/evaluation/measures.py → mlquantify-0.1.9/mlquantify/metrics/_slq.py +43 -28
- mlquantify-0.1.9/mlquantify/mixture/__init__.py +7 -0
- mlquantify-0.1.9/mlquantify/mixture/_base.py +153 -0
- mlquantify-0.1.9/mlquantify/mixture/_classes.py +400 -0
- mlquantify-0.1.9/mlquantify/mixture/_utils.py +112 -0
- mlquantify-0.1.9/mlquantify/model_selection/__init__.py +9 -0
- mlquantify-0.1.9/mlquantify/model_selection/_protocol.py +358 -0
- mlquantify-0.1.9/mlquantify/model_selection/_search.py +315 -0
- mlquantify-0.1.9/mlquantify/model_selection/_split.py +1 -0
- mlquantify-0.1.9/mlquantify/multiclass.py +350 -0
- mlquantify-0.1.9/mlquantify/neighbors/__init__.py +9 -0
- mlquantify-0.1.9/mlquantify/neighbors/_base.py +198 -0
- mlquantify-0.1.9/mlquantify/neighbors/_classes.py +159 -0
- mlquantify-0.1.7/mlquantify/classification/methods.py → mlquantify-0.1.9/mlquantify/neighbors/_classification.py +48 -66
- mlquantify-0.1.9/mlquantify/neighbors/_kde.py +270 -0
- mlquantify-0.1.9/mlquantify/neighbors/_utils.py +135 -0
- mlquantify-0.1.9/mlquantify/neural/__init__.py +1 -0
- mlquantify-0.1.9/mlquantify/utils/__init__.py +47 -0
- mlquantify-0.1.9/mlquantify/utils/_artificial.py +27 -0
- mlquantify-0.1.9/mlquantify/utils/_constraints.py +219 -0
- mlquantify-0.1.9/mlquantify/utils/_context.py +21 -0
- mlquantify-0.1.9/mlquantify/utils/_decorators.py +36 -0
- mlquantify-0.1.9/mlquantify/utils/_exceptions.py +12 -0
- mlquantify-0.1.9/mlquantify/utils/_get_scores.py +159 -0
- mlquantify-0.1.9/mlquantify/utils/_load.py +18 -0
- mlquantify-0.1.9/mlquantify/utils/_parallel.py +6 -0
- mlquantify-0.1.9/mlquantify/utils/_random.py +36 -0
- mlquantify-0.1.9/mlquantify/utils/_sampling.py +273 -0
- mlquantify-0.1.9/mlquantify/utils/_tags.py +44 -0
- mlquantify-0.1.9/mlquantify/utils/_validation.py +447 -0
- mlquantify-0.1.9/mlquantify/utils/prevalence.py +61 -0
- {mlquantify-0.1.7 → mlquantify-0.1.9/mlquantify.egg-info}/PKG-INFO +2 -1
- mlquantify-0.1.9/mlquantify.egg-info/SOURCES.txt +58 -0
- {mlquantify-0.1.7 → mlquantify-0.1.9}/mlquantify.egg-info/requires.txt +1 -0
- {mlquantify-0.1.7 → mlquantify-0.1.9}/setup.py +1 -1
- mlquantify-0.1.7/VERSION.txt +0 -1
- mlquantify-0.1.7/mlquantify/__init__.py +0 -32
- mlquantify-0.1.7/mlquantify/base.py +0 -559
- mlquantify-0.1.7/mlquantify/classification/__init__.py +0 -1
- mlquantify-0.1.7/mlquantify/evaluation/__init__.py +0 -14
- mlquantify-0.1.7/mlquantify/evaluation/protocol.py +0 -291
- mlquantify-0.1.7/mlquantify/methods/__init__.py +0 -37
- mlquantify-0.1.7/mlquantify/methods/aggregative.py +0 -1159
- mlquantify-0.1.7/mlquantify/methods/meta.py +0 -472
- mlquantify-0.1.7/mlquantify/methods/mixture_models.py +0 -1003
- mlquantify-0.1.7/mlquantify/methods/non_aggregative.py +0 -136
- mlquantify-0.1.7/mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify-0.1.7/mlquantify/model_selection.py +0 -377
- mlquantify-0.1.7/mlquantify/plots.py +0 -367
- mlquantify-0.1.7/mlquantify/utils/__init__.py +0 -2
- mlquantify-0.1.7/mlquantify/utils/general.py +0 -371
- mlquantify-0.1.7/mlquantify/utils/method.py +0 -449
- mlquantify-0.1.7/mlquantify.egg-info/SOURCES.txt +0 -27
- {mlquantify-0.1.7 → mlquantify-0.1.9}/MANIFEST.in +0 -0
- {mlquantify-0.1.7 → mlquantify-0.1.9}/README.md +0 -0
- {mlquantify-0.1.7 → mlquantify-0.1.9}/mlquantify.egg-info/dependency_links.txt +0 -0
- {mlquantify-0.1.7 → mlquantify-0.1.9}/mlquantify.egg-info/top_level.txt +0 -0
- {mlquantify-0.1.7 → mlquantify-0.1.9}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mlquantify
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.9
|
|
4
4
|
Summary: Quantification Library
|
|
5
5
|
Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
|
|
6
6
|
Maintainer: Luiz Fernando Luth Junior
|
|
@@ -20,6 +20,7 @@ Requires-Dist: tqdm
|
|
|
20
20
|
Requires-Dist: pandas
|
|
21
21
|
Requires-Dist: xlrd
|
|
22
22
|
Requires-Dist: matplotlib
|
|
23
|
+
Requires-Dist: abstention
|
|
23
24
|
Dynamic: classifier
|
|
24
25
|
Dynamic: description
|
|
25
26
|
Dynamic: description-content-type
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.1.9
|
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
from scipy.optimize import minimize
|
|
4
|
+
import warnings
|
|
5
|
+
|
|
6
|
+
from mlquantify.adjust_counting._base import BaseAdjustCount
|
|
7
|
+
from mlquantify.adjust_counting._counting import CC, PCC
|
|
8
|
+
from mlquantify.base_aggregative import (
|
|
9
|
+
CrispLearnerQMixin,
|
|
10
|
+
SoftLearnerQMixin,
|
|
11
|
+
uses_soft_predictions,
|
|
12
|
+
)
|
|
13
|
+
from mlquantify.multiclass import define_binary
|
|
14
|
+
from mlquantify.adjust_counting._utils import evaluate_thresholds
|
|
15
|
+
from mlquantify.utils._constraints import Interval, Options
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@define_binary
class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
    r"""
    Applies threshold-based adjustment methods for quantification.

    This is the base class for methods such as ACC, X, MAX, T50, MS, and MS2,
    which adjust prevalence estimates based on the classifier’s ROC curve, as proposed by
    Forman (2005, 2008).

    These methods correct the bias in *Classify & Count (CC)* estimates caused by differences
    in class distributions between the training and test datasets.

    Mathematical formulation

    Given:
    - \( p' \): observed positive proportion from CC,
    - \( \text{TPR} = P(\hat{y}=1|y=1) \),
    - \( \text{FPR} = P(\hat{y}=1|y=0) \),

    the adjusted prevalence is given by:

    \[
    \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
    \]

    (Forman, *Counting Positives Accurately Despite Inaccurate Classification*, ECML 2005;
    *Quantifying Counts and Costs via Classification*, DMKD 2008).


    Notes
    -----
    - Defined only for binary quantification tasks.
    - When applied to multiclass problems, the one-vs-rest strategy (`ovr`) is used automatically.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1].
    strategy : {'ovr'}, default='ovr'
        Strategy used for multiclass adaptation.


    Attributes
    ----------
    learner : estimator
        The underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import ThresholdAdjustment
    >>> import numpy as np
    >>> class CustomThreshold(ThresholdAdjustment):
    ...     def _get_best_threshold(self, thresholds, tprs, fprs):
    ...         idx = np.argmax(tprs - fprs)
    ...         return thresholds[idx], tprs[idx], fprs[idx]
    >>> X = np.random.randn(100, 4)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CustomThreshold(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.49, 1: 0.51}
    """

    # Accept both a float threshold in [0, 1] and the discrete endpoints 0/1.
    _parameter_constraints = {
        "threshold": [
            Interval(0.0, 1.0),
            Interval(0, 1, discrete=True),
        ],
    }

    def __init__(self, learner=None, threshold=0.5, strategy="ovr"):
        super().__init__(learner=learner)
        self.threshold = threshold
        self.strategy = strategy

    def _adjust(self, predictions, train_y_scores, train_y_values):
        """Internal adjustment computation based on selected ROC threshold."""
        # Column 1 holds the positive-class scores (binary task enforced by @define_binary).
        positive_scores = train_y_scores[:, 1]

        # Sweep candidate thresholds on the training scores; the subclass picks one.
        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores, self.classes_)
        threshold, tpr, fpr = self._get_best_threshold(thresholds, tprs, fprs)

        # Observed positive proportion from plain Classify & Count at the chosen threshold.
        cc_predictions = CC(threshold).aggregate(predictions)[1]

        if tpr - fpr == 0:
            # Degenerate ROC point: Forman's correction is undefined, fall back to CC.
            prevalence = cc_predictions
        else:
            # Forman's adjustment p = (p' - FPR) / (TPR - FPR), clipped to [0, 1].
            prevalence = np.clip((cc_predictions - fpr) / (tpr - fpr), 0, 1)

        return np.asarray([1 - prevalence, prevalence])

    @abstractmethod
    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Select the best threshold according to the specific method."""
        ...
class MatrixAdjustment(BaseAdjustCount):
    r"""
    Base class for matrix-based quantification adjustments (FM, GAC, GPAC).

    This class implements the matrix correction model for quantification
    as formulated in Firat (2016), which expresses the observed prevalences as
    a linear combination of true prevalences through the confusion matrix.

    Mathematical model

    The system is given by:

    \[
    \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon
    \]

    subject to:

    \[
    \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
    \]

    where:
    - \( \mathbf{y} \): vector of predicted prevalences in test set,
    - \( \mathbf{C} \): confusion matrix,
    - \( \hat{\pi}_F \): true class prevalence vector (unknown),
    - \( \varepsilon \): residual error.

    The model can be solved either via:
    - Linear algebraic solution, or
    - Constrained optimization (quadratic or least-squares).


    Parameters
    ----------
    learner : estimator, optional
        Classifier with `fit` and `predict` methods.
    solver : {'optim', 'linear'}, optional
        Solver for the adjustment system:
        - `'linear'`: uses matrix inversion (e.g., GAC, GPAC)
        - `'optim'`: uses optimization (e.g., FM)


    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Confusion matrix used for correction.
    classes : ndarray
        Class labels observed in training.


    References
    ----------
    - Firat, A. (2016). *Unified Framework for Quantification.* AAAI, pp. 1-8.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import MatrixAdjustment
    >>> class MyMatrix(MatrixAdjustment):
    ...     def _compute_confusion_matrix(self, preds, y):
    ...         cm = np.ones((2, 2))
    ...         return cm / cm.sum(axis=1, keepdims=True)
    >>> q = MyMatrix(learner=LogisticRegression(), solver='linear')
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.5, 1: 0.5}
    """

    _parameter_constraints = {"solver": Options(["optim", "linear"])}

    def __init__(self, learner=None, solver=None):
        super().__init__(learner=learner)
        self.solver = solver

    def _adjust(self, predictions, train_y_pred, train_y_values):
        """Build the confusion matrix and solve the correction system."""
        # Fresh square matrix, one row/column per class observed in training labels.
        n_class = len(np.unique(train_y_values))
        self.CM = np.zeros((n_class, n_class))

        if self.solver == 'optim':
            # Class priors estimated by Classify & Count on the training predictions.
            priors = np.array(list(CC().aggregate(train_y_pred).values()))
            self.CM = self._compute_confusion_matrix(train_y_pred, train_y_values, priors)
            # NOTE(review): `predictions > priors` binarizes test posteriors against the
            # training priors before aggregation (FM-style); confirm against Firat (2016).
            prevs_estim = self._get_estimations(predictions > priors)
            prevalence = self._solve_optimization(prevs_estim, priors)
        else:
            # NOTE(review): the linear path passes only predictions — no labels — so
            # subclasses must derive CM without y_true; verify this is intentional.
            self.CM = self._compute_confusion_matrix(train_y_pred)
            prevs_estim = self._get_estimations(predictions)
            prevalence = self._solve_linear(prevs_estim)

        return prevalence

    def _solve_linear(self, prevs_estim):
        r"""
        Solve the system linearly:

        \[
        \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}
        \]
        """
        try:
            adjusted = np.linalg.solve(self.CM, prevs_estim)
            # Project back onto the simplex: clip negatives, renormalize to sum 1.
            adjusted = np.clip(adjusted, 0, 1)
            adjusted /= adjusted.sum()
        except np.linalg.LinAlgError:
            # Singular confusion matrix: fall back to the uncorrected estimates.
            adjusted = prevs_estim
        return adjusted

    def _solve_optimization(self, prevs_estim, priors):
        r"""
        Solve via constrained least squares:

        \[
        \min_{\hat{\pi}_F} \| \mathbf{C}\hat{\pi}_F - \mathbf{p} \|_2^2
        \quad \text{s.t. } \hat{\pi}_F \ge 0, \ \sum_k \hat{\pi}_{F,k} = 1
        \]
        """
        def objective(prevs_pred):
            # L2 residual of the linear system C @ pi - p.
            return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)

        constraints = [
            {'type': 'eq', 'fun': lambda x: np.sum(x) - 1},
            {'type': 'ineq', 'fun': lambda x: x}
        ]
        bounds = [(0, 1)] * self.CM.shape[1]
        # Start from the uniform distribution over classes.
        init = np.full(self.CM.shape[1], 1 / self.CM.shape[1])
        result = minimize(objective, init, constraints=constraints, bounds=bounds)
        # On solver failure, return the training priors rather than an invalid point.
        return result.x if result.success else priors

    def _get_estimations(self, predictions):
        """Return prevalence estimates using CC (crisp) or PCC (probabilistic)."""
        if uses_soft_predictions(self):
            return np.array(list(PCC().aggregate(predictions).values()))
        return np.array(list(CC().aggregate(predictions).values()))

    @abstractmethod
    def _compute_confusion_matrix(self, predictions, *args):
        # Subclasses define how CM is derived; arity differs between solvers
        # ('optim' receives (predictions, y_true, priors); 'linear' only predictions).
        ...
class FM(SoftLearnerQMixin, MatrixAdjustment):
    """Forman's Matrix Adjustment (FM).

    Builds the confusion matrix one column per class: the posteriors of the
    training samples belonging to that class are binarized against the
    estimated priors and aggregated into prevalence estimates. The resulting
    system is then solved by constrained optimization (``solver='optim'``).
    """

    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='optim')

    def _compute_confusion_matrix(self, posteriors, y_true, priors):
        """Fill each CM column from prior-thresholded class posteriors."""
        for col, label in enumerate(self.classes_):
            mask = y_true == label
            crisp = posteriors[mask] > priors
            self.CM[:, col] = self._get_estimations(crisp)
        return self.CM
class GAC(CrispLearnerQMixin, MatrixAdjustment):
    """Gonzalez-Castro’s Generalized Adjusted Count (GAC) method."""
    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, predictions):
        # Prevalence estimates of the crisp training predictions (via CC).
        prev_estim = self._get_estimations(predictions)
        # NOTE(review): self.CM arrives zero-initialized from MatrixAdjustment._adjust
        # and is never filled with prediction/label co-occurrence counts before this
        # normalization, so the column division below scales zeros. Confirm whether
        # CM should first be populated from a true confusion count.
        for i, _ in enumerate(self.classes_):
            if prev_estim[i] == 0:
                # Guard against division by zero: use an identity entry instead.
                self.CM[i, i] = 1
            else:
                # Column-normalize by the estimated prevalence of class i.
                self.CM[:, i] /= prev_estim[i]
        return self.CM
class GPAC(SoftLearnerQMixin, MatrixAdjustment):
    r"""Probabilistic GAC (GPAC) — soft version using posterior probabilities."""
    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, posteriors):
        # Prevalence estimates of the training posteriors (via PCC, soft mixin).
        prev_estim = self._get_estimations(posteriors)
        # NOTE(review): as in GAC, self.CM is zero-initialized by
        # MatrixAdjustment._adjust and never populated with counts before this
        # normalization — the division scales zeros. Verify intended behavior.
        for i, _ in enumerate(self.classes_):
            if prev_estim[i] == 0:
                # Guard against division by zero: use an identity entry instead.
                self.CM[i, i] = 1
            else:
                # Column-normalize by the estimated prevalence of class i.
                self.CM[:, i] /= prev_estim[i]
        return self.CM
class ACC(ThresholdAdjustment):
    """Adjusted Count (ACC) — baseline threshold correction.

    Applies Forman's TPR/FPR correction at the fixed, user-configured
    ``threshold`` rather than searching the ROC sweep for an optimum.
    """

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Return ``(threshold, tpr, fpr)`` at the configured threshold.

        The sweep produced by ``evaluate_thresholds`` is built from observed
        scores and may not contain ``self.threshold`` exactly; the original
        exact-equality lookup (``thresholds == self.threshold``) raised
        ``IndexError`` in that case. We instead pick the sweep point closest
        to the requested threshold, which is identical to the exact lookup
        whenever a matching threshold exists.
        """
        idx = int(np.argmin(np.abs(np.asarray(thresholds) - self.threshold)))
        return (thresholds[idx], tprs[idx], fprs[idx])
class X_method(ThresholdAdjustment):
    r"""X method — picks the sweep point where \( \text{TPR} + \text{FPR} \) is closest to 1."""

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Return the ROC point minimizing |1 - (TPR + FPR)|."""
        gap = np.abs(1 - (tprs + fprs))
        best = np.argmin(gap)
        return thresholds[best], tprs[best], fprs[best]
class MAX(ThresholdAdjustment):
    r"""MAX method — picks the sweep point maximizing \( |\text{TPR} - \text{FPR}| \).

    NOTE: the absolute value is taken, matching the original implementation;
    Forman's description maximizes the signed gap — confirm if ever revised.
    """

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Return the ROC point with the largest absolute TPR-FPR gap."""
        gap = np.abs(tprs - fprs)
        best = np.argmax(gap)
        return thresholds[best], tprs[best], fprs[best]
class T50(ThresholdAdjustment):
    r"""T50 — picks the sweep point whose \( \text{TPR} \) is closest to 0.5."""

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Return the ROC point minimizing |TPR - 0.5|."""
        distance_to_half = np.abs(tprs - 0.5)
        best = np.argmin(distance_to_half)
        return thresholds[best], tprs[best], fprs[best]
class MS(ThresholdAdjustment):
    r"""Median Sweep (MS) — median of the adjusted prevalence over all thresholds."""

    def _adjust(self, predictions, train_y_scores, train_y_values):
        """Apply Forman's correction at every retained threshold and take the median."""
        pos_scores = train_y_scores[:, 1]

        thr_all, tpr_all, fpr_all = evaluate_thresholds(train_y_values, pos_scores, self.classes_)
        # Subclasses (e.g. MS2) may filter the sweep; MS keeps every point.
        thr_all, tpr_all, fpr_all = self._get_best_threshold(thr_all, tpr_all, fpr_all)

        estimates = []
        for thr, tpr, fpr in zip(thr_all, tpr_all, fpr_all):
            observed = CC(thr).aggregate(predictions)[1]
            if tpr - fpr == 0:
                # Degenerate ROC point: keep the uncorrected CC estimate.
                estimates.append(observed)
            else:
                # Note: individual estimates are deliberately left unclipped;
                # the median over the sweep provides the robustness.
                estimates.append((observed - fpr) / (tpr - fpr))
        median_prev = np.median(estimates)
        return np.asarray([1 - median_prev, median_prev])

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Keep the full sweep unchanged; subclasses may restrict it."""
        return thresholds, tprs, fprs
class MS2(MS):
    r"""MS2 — Median Sweep restricted to points with \( |\text{TPR} - \text{FPR}| > 0.25 \)."""

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Filter the sweep to sufficiently informative ROC points."""
        if np.all(tprs == 0) or np.all(fprs == 0):
            warnings.warn("All TPR or FPR values are zero.")
        keep = np.where(np.abs(tprs - fprs) > 0.25)[0]
        if len(keep) == 0:
            # Fall back to the full sweep when the constraint filters everything.
            warnings.warn("No cases satisfy |TPR - FPR| > 0.25.")
            keep = np.where(np.abs(tprs - fprs) >= 0)[0]
        return thresholds[keep], tprs[keep], fprs[keep]
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
|
|
4
|
+
from mlquantify.base import BaseQuantifier
|
|
5
|
+
|
|
6
|
+
from mlquantify.base_aggregative import (
|
|
7
|
+
AggregationMixin,
|
|
8
|
+
_get_learner_function
|
|
9
|
+
)
|
|
10
|
+
from mlquantify.utils._decorators import _fit_context
|
|
11
|
+
from mlquantify.utils._validation import check_classes_attribute, validate_predictions, validate_y, validate_data, validate_prevalences
|
|
12
|
+
from mlquantify.utils._get_scores import apply_cross_validation
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BaseCount(AggregationMixin, BaseQuantifier):
    r"""Base class for count-based quantifiers.

    Implements the foundation for *count-based quantification* methods,
    where class prevalences are estimated directly from classifier outputs
    without any correction.

    The method assumes a classifier :math:`f(x)` producing either hard or
    probabilistic predictions. The prevalence of each class :math:`c` in
    the unlabeled test set is estimated as:

    .. math::
        \hat{\pi}_c = \frac{1}{N} \sum_{i=1}^{N} I(f(x_i) = c)

    for *hard* classifiers, or equivalently as:

    .. math::
        \hat{\pi}_c = \frac{1}{N} \sum_{i=1}^{N} f_c(x_i)

    for *soft* classifiers where :math:`f_c(x)` denotes the posterior
    probability of class :math:`c`.

    This is the classical *Classify and Count (CC)* and *Probabilistic
    Classify and Count (PCC)* approach, introduced by Forman (2005, 2008)
    and unified in the constrained regression framework of Firat et al. (2016).

    Parameters
    ----------
    learner : object, optional
        A supervised learning model implementing `fit` and `predict`
        or `predict_proba`.

    Attributes
    ----------
    learner : object
        Underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.

    Examples
    --------
    >>> from mlquantify.base_count import BaseCount
    >>> from mlquantify.utils.validation import validate_prevalences
    >>> import numpy as np

    >>> class CC(CrispLearnerQMixin, BaseCount):
    ...     def __init__(self, learner=None, threshold=0.5):
    ...         super().__init__(learner)
    ...         self.threshold = threshold
    ...     def aggregate(self, predictions):
    ...         predictions = validate_predictions(self, predictions)
    ...         self.classes = self.classes if hasattr(self, 'classes') else np.unique(predictions)
    ...         counts = np.array([np.count_nonzero(predictions == c) for c in self.classes])
    ...         prevalences = counts / len(predictions)
    ...         return validate_prevalences(self, prevalences, self.classes)

    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X).round(3)
    array([0.47, 0.53])

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.*
        ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.
    """

    @abstractmethod
    def __init__(self, learner=None):
        # Abstract constructor: concrete subclasses choose extra hyperparameters.
        self.learner = learner

    def __mlquantify_tags__(self):
        # Count-based methods need neither training posteriors nor training
        # labels at aggregation time — only raw test predictions.
        tags = super().__mlquantify_tags__()
        tags.prediction_requirements.requires_train_proba = False
        tags.prediction_requirements.requires_train_labels = False
        return tags

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, learner_fitted=False, *args, **kwargs):
        """Fit the quantifier using the provided data and learner.

        Parameters
        ----------
        X : array-like
            Training features.
        y : array-like
            Training labels.
        learner_fitted : bool, default=False
            When True, ``self.learner`` is assumed already trained and is not refit.
        *args, **kwargs
            Forwarded to ``self.learner.fit``.
        """
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        if not learner_fitted:
            # NOTE(review): assumes self.learner is not None here — no guard. Confirm
            # upstream validation rejects a missing learner before fit is reached.
            self.learner.fit(X, y, *args, **kwargs)
        return self

    def predict(self, X):
        """Predict class prevalences for the given data."""
        # Dispatch to predict/predict_proba depending on the crisp/soft mixin.
        estimator_function = _get_learner_function(self)
        predictions = getattr(self.learner, estimator_function)(X)
        prevalences = self.aggregate(predictions)
        return prevalences

    @abstractmethod
    def aggregate(self, predictions):
        """Aggregate predictions into class prevalence estimates."""
        ...
class BaseAdjustCount(AggregationMixin, BaseQuantifier):
    r"""Base class for adjustment-based quantifiers.

    This class generalizes *adjusted count* quantification methods,
    providing a framework for correcting bias in raw classifier outputs
    based on estimated confusion matrices or rate statistics.

    Following Forman (2005, 2008), in the binary case the correction
    uses true positive (TPR) and false positive (FPR) rates to adjust
    the observed positive proportion :math:`\hat{p}'_{+}`:

    .. math::
        \hat{p}_{+} = \frac{\hat{p}'_{+} - \text{FPR}}{\text{TPR} - \text{FPR}}

    In the multiclass extension (Firat et al., 2016), the same principle
    can be expressed using matrix algebra. Let :math:`C` denote the
    normalized confusion matrix where :math:`C_{ij} = P(\hat{y}=i|y=j)`
    estimated via cross-validation. Then, given the observed distribution
    of predictions :math:`\hat{\pi}'`, the corrected prevalence vector
    :math:`\hat{\pi}` is obtained as:

    .. math::
        \hat{\pi}' = C \hat{\pi}
        \quad \Rightarrow \quad
        \hat{\pi} = C^{-1} \hat{\pi}'

    subject to non-negativity and unit-sum constraints:

    .. math::
        \hat{\pi}_c \ge 0, \quad \sum_c \hat{\pi}_c = 1

    This formulation can be solved via constrained least squares
    (L2), least absolute deviation (L1), or Hellinger divergence
    minimization, as discussed by Firat et al. (2016).

    Parameters
    ----------
    learner : object, optional
        Supervised learner implementing `fit`, `predict`, or `predict_proba`.

    Attributes
    ----------
    learner : object
        Underlying classification model.
    train_predictions : ndarray of shape (n_samples_train, n_classes)
        Predictions on training data from cross-validation.
    train_y_values : ndarray of shape (n_samples_train,)
        True labels corresponding to training predictions.
    classes : ndarray of shape (n_classes,)
        Unique class labels.

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.*
        ECML 2005, LNAI 3720, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.
    [3] Firat, A. (2016). *Unified Framework for Quantification.*
        Proceedings of the AAAI Conference on Artificial Intelligence, Sections 3.2-3.3.
    """

    @abstractmethod
    def __init__(self, learner=None):
        # Abstract constructor: concrete subclasses choose extra hyperparameters.
        self.learner = learner

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, learner_fitted=False):
        """Fit the quantifier using the provided data and learner.

        Parameters
        ----------
        X : array-like
            Training features.
        y : array-like
            Training labels.
        learner_fitted : bool, default=False
            When True, ``self.learner`` is assumed already trained and is
            scored directly on ``(X, y)``; otherwise unbiased training
            predictions are obtained via 5-fold stratified cross-validation.
        """
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        # predict vs predict_proba, depending on the crisp/soft mixin.
        learner_function = _get_learner_function(self)

        if learner_fitted:
            train_predictions = getattr(self.learner, learner_function)(X)
            y_train_labels = y
        else:
            train_predictions, y_train_labels = apply_cross_validation(
                self.learner,
                X,
                y,
                function=learner_function,
                cv=5,
                stratified=True,
                random_state=None,
                shuffle=True
            )

        # Kept for reuse at predict time by aggregate()/ _adjust().
        self.train_predictions = train_predictions
        self.train_y_values = y_train_labels
        return self

    def predict(self, X):
        """Predict class prevalences for the given data."""
        predictions = getattr(self.learner, _get_learner_function(self))(X)
        prevalences = self.aggregate(predictions, self.train_predictions, self.train_y_values)
        return prevalences

    def aggregate(self, predictions, train_predictions, y_train_values):
        """Aggregate predictions and apply matrix- or rate-based bias correction.

        Parameters
        ----------
        predictions : array-like
            Classifier outputs on the test set.
        train_predictions : array-like
            Classifier outputs on the training set (typically cross-validated).
        y_train_values : array-like
            True training labels matching ``train_predictions``.
        """
        self.classes_ = check_classes_attribute(self, np.unique(y_train_values))
        # BUG FIX: validate the test-set predictions, not the training ones.
        # The original called validate_predictions(self, train_predictions) and
        # rebound the result to `predictions`, silently discarding the actual
        # test predictions before the adjustment step.
        predictions = validate_predictions(self, predictions)
        prevalences = self._adjust(predictions, train_predictions, y_train_values)
        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences