mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +10 -29
- mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify/adjust_counting/_base.py +245 -0
- mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +329 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +147 -0
- mlquantify/likelihood/_classes.py +430 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +785 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +147 -0
- mlquantify/mixture/_classes.py +458 -0
- mlquantify/mixture/_utils.py +163 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +168 -0
- mlquantify/neighbors/_classes.py +150 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
- mlquantify/neighbors/_kde.py +268 -0
- mlquantify/neighbors/_utils.py +131 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
mlquantify/adjust_counting/_base.py
@@ -0,0 +1,245 @@
import numpy as np
from abc import abstractmethod

from mlquantify.base import BaseQuantifier

from mlquantify.base_aggregative import (
    AggregationMixin,
    _get_learner_function
)
from mlquantify.utils._decorators import _fit_context
from mlquantify.utils._validation import check_classes_attribute, validate_predictions, validate_y, validate_data, validate_prevalences
from mlquantify.utils._get_scores import apply_cross_validation




class BaseCount(AggregationMixin, BaseQuantifier):
    r"""Base class for count-based quantifiers.

    Implements the foundation for *count-based quantification* methods,
    where class prevalences are estimated directly from classifier outputs
    without any correction.

    The method assumes a classifier :math:`f(x)` producing either hard or
    probabilistic predictions. The prevalence of each class :math:`c` in
    the unlabeled test set is estimated as:

    .. math::
        \hat{\pi}_c = \frac{1}{N} \sum_{i=1}^{N} I(f(x_i) = c)

    for *hard* classifiers, or equivalently as:

    .. math::
        \hat{\pi}_c = \frac{1}{N} \sum_{i=1}^{N} f_c(x_i)

    for *soft* classifiers where :math:`f_c(x)` denotes the posterior
    probability of class :math:`c`.

    This is the classical *Classify and Count (CC)* and *Probabilistic
    Classify and Count (PCC)* approach, introduced by Forman (2005, 2008)
    and unified in the constrained regression framework of Firat et al. (2016).

    Parameters
    ----------
    learner : object, optional
        A supervised learning model implementing `fit` and `predict`
        or `predict_proba`.

    Attributes
    ----------
    learner : object
        Underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.

    Examples
    --------
    >>> from mlquantify.base_count import BaseCount
    >>> from mlquantify.utils.validation import validate_prevalences
    >>> import numpy as np

    >>> class CC(CrispLearnerQMixin, BaseCount):
    ...     def __init__(self, learner=None, threshold=0.5):
    ...         super().__init__(learner)
    ...         self.threshold = threshold
    ...     def aggregate(self, predictions):
    ...         predictions = validate_predictions(self, predictions)
    ...         self.classes = self.classes if hasattr(self, 'classes') else np.unique(predictions)
    ...         counts = np.array([np.count_nonzero(predictions == c) for c in self.classes])
    ...         prevalences = counts / len(predictions)
    ...         return validate_prevalences(self, prevalences, self.classes)

    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X).round(3)
    array([0.47, 0.53])

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.*
        ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.
    """

    @abstractmethod
    def __init__(self, learner=None):
        self.learner = learner

    def __mlquantify_tags__(self):
        tags = super().__mlquantify_tags__()
        tags.prediction_requirements.requires_train_proba = False
        tags.prediction_requirements.requires_train_labels = False
        return tags

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, learner_fitted=False, *args, **kwargs):
        """Fit the quantifier using the provided data and learner."""
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        if not learner_fitted:
            self.learner.fit(X, y, *args, **kwargs)
        return self

    def predict(self, X):
        """Predict class prevalences for the given data."""
        estimator_function = _get_learner_function(self)
        predictions = getattr(self.learner, estimator_function)(X)
        prevalences = self.aggregate(predictions)
        return prevalences

    @abstractmethod
    def aggregate(self, predictions):
        """Aggregate predictions into class prevalence estimates."""
        ...


class BaseAdjustCount(AggregationMixin, BaseQuantifier):
    r"""Base class for adjustment-based quantifiers.

    This class generalizes *adjusted count* quantification methods,
    providing a framework for correcting bias in raw classifier outputs
    based on estimated confusion matrices or rate statistics.

    Following Forman (2005, 2008), in the binary case the correction
    uses true positive (TPR) and false positive (FPR) rates to adjust
    the observed positive proportion :math:`\hat{p}'_{+}`:

    .. math::
        \hat{p}_{+} = \frac{\hat{p}'_{+} - \text{FPR}}{\text{TPR} - \text{FPR}}

    In the multiclass extension (Firat et al., 2016), the same principle
    can be expressed using matrix algebra. Let :math:`C` denote the
    normalized confusion matrix where :math:`C_{ij} = P(\hat{y}=i|y=j)`
    estimated via cross-validation. Then, given the observed distribution
    of predictions :math:`\hat{\pi}'`, the corrected prevalence vector
    :math:`\hat{\pi}` is obtained as:

    .. math::
        \hat{\pi}' = C \hat{\pi}
        \quad \Rightarrow \quad
        \hat{\pi} = C^{-1} \hat{\pi}'

    subject to non-negativity and unit-sum constraints:

    .. math::
        \hat{\pi}_c \ge 0, \quad \sum_c \hat{\pi}_c = 1

    This formulation can be solved via constrained least squares
    (L2), least absolute deviation (L1), or Hellinger divergence
    minimization, as discussed by Firat et al. (2016).

    Parameters
    ----------
    learner : object, optional
        Supervised learner implementing `fit`, `predict`, or `predict_proba`.

    Attributes
    ----------
    learner : object
        Underlying classification model.
    train_predictions : ndarray of shape (n_samples_train, n_classes)
        Predictions on training data from cross-validation.
    train_y_values : ndarray of shape (n_samples_train,)
        True labels corresponding to training predictions.
    classes : ndarray of shape (n_classes,)
        Unique class labels.

    Examples
    --------
    >>> from mlquantify.base_count import BaseAdjustCount
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> class ACC(CrispLearnerQMixin, BaseAdjustCount):
    ...     def _adjust(self, preds, train_preds, y_train):
    ...         tpr = np.mean(train_preds[y_train == 1])
    ...         fpr = np.mean(train_preds[y_train == 0])
    ...         p_obs = np.mean(preds)
    ...         p_adj = (p_obs - fpr) / (tpr - fpr)
    ...         return np.clip([1 - p_adj, p_adj], 0, 1)
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = ACC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X).round(3)
    array([0.52, 0.48])

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.*
        ECML 2005, LNAI 3720, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.
    [3] Firat, A. (2016). *Unified Framework for Quantification.*
        Proceedings of the AAAI Conference on Artificial Intelligence, Sections 3.2-3.3.
    """

    @abstractmethod
    def __init__(self, learner=None):
        self.learner = learner

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, learner_fitted=False):
        """Fit the quantifier using the provided data and learner."""
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        learner_function = _get_learner_function(self)

        if learner_fitted:
            train_predictions = getattr(self.learner, learner_function)(X)
            y_train_labels = y
        else:
            train_predictions, y_train_labels = apply_cross_validation(
                self.learner,
                X,
                y,
                function=learner_function,
                cv=5,
                stratified=True,
                random_state=None,
                shuffle=True
            )

        self.train_predictions = train_predictions
        self.train_y_values = y_train_labels
        return self

    def predict(self, X):
        """Predict class prevalences for the given data."""
        predictions = getattr(self.learner, _get_learner_function(self))(X)
        prevalences = self.aggregate(predictions, self.train_predictions, self.train_y_values)
        return prevalences

    def aggregate(self, predictions, train_predictions, y_train_values):
        """Aggregate predictions and apply matrix- or rate-based bias correction."""
        self.classes_ = check_classes_attribute(self, np.unique(y_train_values))
        predictions = validate_predictions(self, train_predictions)
        prevalences = self._adjust(predictions, train_predictions, y_train_values)
        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences
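For readers following the adjusted-count math in the BaseAdjustCount docstring above, here is a minimal standalone sketch of the multiclass correction (invert the cross-validated confusion matrix, then clip and renormalize to satisfy the simplex constraints). This helper is illustrative only, is not part of the package diff, and its name and signature are made up.

import numpy as np

def adjusted_count_sketch(test_preds, train_preds, y_train, classes):
    """Illustrative multiclass adjusted count: pi = C^{-1} pi', clipped and renormalized."""
    n = len(classes)
    # C[i, j] ~= P(pred = classes[i] | true = classes[j]), estimated from
    # (ideally cross-validated) training predictions.
    C = np.zeros((n, n))
    for j, true_c in enumerate(classes):
        mask = (y_train == true_c)
        for i, pred_c in enumerate(classes):
            C[i, j] = np.mean(train_preds[mask] == pred_c) if mask.any() else float(i == j)
    # Observed distribution of hard predictions on the unlabeled sample.
    pi_prime = np.array([np.mean(test_preds == c) for c in classes])
    # Solve C pi = pi'; fall back to least squares if C is singular.
    try:
        pi = np.linalg.solve(C, pi_prime)
    except np.linalg.LinAlgError:
        pi = np.linalg.lstsq(C, pi_prime, rcond=None)[0]
    # Project back onto the constraints pi_c >= 0, sum_c pi_c = 1.
    pi = np.clip(pi, 0, None)
    return pi / pi.sum() if pi.sum() > 0 else np.full(n, 1.0 / n)

In the binary case this reduces to Forman's (p' - FPR) / (TPR - FPR) formula, the 2x2 special case of the same system.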
mlquantify/adjust_counting/_counting.py
@@ -0,0 +1,153 @@
import numpy as np

from mlquantify.base_aggregative import (
    SoftLearnerQMixin,
    CrispLearnerQMixin
)

from mlquantify.adjust_counting._base import BaseCount
from mlquantify.utils._validation import validate_predictions, validate_prevalences, check_classes_attribute
from mlquantify.utils._constraints import Interval



class CC(CrispLearnerQMixin, BaseCount):
    r"""Classify and Count (CC) quantifier.

    Implements the Classify and Count method for quantification, described as a
    baseline approach in the literature [1][2].

    Parameters
    ----------
    learner : estimator, optional
        A supervised learning estimator with `fit` and `predict` methods.
        If None, it is expected that the aggregate method is used directly.
    threshold : float, default=0.5
        Decision threshold for converting predicted probabilities into class labels.
        Must be in the interval [0.0, 1.0].

    Attributes
    ----------
    learner : estimator
        Underlying classification model.

    Notes
    -----
    The Classify and Count approach performs quantification by classifying each instance
    using the classifier's predicted labels at a given threshold, then counting the
    prevalence of each class.

    This method can be biased when class distributions differ between training and test sets,
    motivating further adjustment methods.

    Examples
    --------
    >>> from mlquantify.adjust_counting import CC
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.47, 1: 0.53}
    >>> q2 = CC()
    >>> predictions = np.random.rand(200)
    >>> q2.aggregate(predictions)
    {0: 0.51, 1: 0.49}

    References
    ----------
    .. [1] Forman, G. (2005). "Counting Positives Accurately Despite Inaccurate Classification",
        *ECML*, pp. 564-575.
    .. [2] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
    """

    _parameters_constraints = {
        "threshold": [
            Interval(0.0, 1.0),
            Interval(0, 1, discrete=True),
        ],
    }

    def __init__(self, learner=None, threshold=0.5):
        super().__init__(learner=learner)
        self.threshold = threshold

    def aggregate(self, predictions):
        predictions = validate_predictions(self, predictions)

        self.classes_ = check_classes_attribute(self, np.unique(predictions))
        class_counts = np.array([np.count_nonzero(predictions == _class) for _class in self.classes_])
        prevalences = class_counts / len(predictions)

        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences


class PCC(SoftLearnerQMixin, BaseCount):
    r"""Probabilistic Classify and Count (PCC) quantifier.

    Implements the Probabilistic Classify and Count method for quantification as described in:
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.*
        ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning estimator with fit and predict_proba methods.
        If None, it is expected that the aggregate method is used directly.


    Attributes
    ----------
    learner : estimator
        Underlying classification model.
    classes : ndarray of shape (n_classes,)
        Unique class labels observed during training.


    Examples
    --------
    >>> from mlquantify.adjust_counting import PCC
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = PCC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.48, 1: 0.52}
    >>> q2 = PCC()
    >>> predictions = np.random.rand(200, 2)
    >>> q2.aggregate(predictions)
    {0: 0.50, 1: 0.50}
    """

    def __init__(self, learner=None):
        super().__init__(learner=learner)

    def aggregate(self, predictions):
        predictions = validate_predictions(self, predictions)

        # Handle categorical predictions (1D array with class labels)
        if predictions.ndim == 1 and not np.issubdtype(predictions.dtype, (np.floating, np.integer)):
            self.classes_ = check_classes_attribute(self, np.unique(predictions))
            class_counts = np.array([np.count_nonzero(predictions == _class) for _class in self.classes_])
            prevalences = class_counts / len(predictions)
        else:
            # Handle probability predictions (2D array or 1D probabilities)
            if predictions.ndim == 2:
                self.classes_ = check_classes_attribute(self, np.arange(predictions.shape[1]))
            else:
                self.classes_ = check_classes_attribute(self, np.arange(2))
            prevalences = np.mean(predictions, axis=0) if predictions.ndim == 2 else predictions.mean()
            if predictions.ndim == 1:
                prevalences = np.array([1 - prevalences, prevalences])

        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences
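The Notes section of CC above points out that raw counting is biased when the class distribution shifts between training and test. A small self-contained illustration (not from the package; the TPR/FPR values and prevalence are made up): the observed positive rate is TPR·p + FPR·(1−p), so classify-and-count undershoots a high test prevalence until Forman's adjustment is applied.

import numpy as np

rng = np.random.default_rng(0)
tpr, fpr = 0.8, 0.2          # imperfect classifier rates
true_prev = 0.9              # test prevalence differs from training
y = rng.random(10_000) < true_prev
# Simulate hard predictions with the given TPR on positives and FPR on negatives.
preds = np.where(y, rng.random(y.size) < tpr, rng.random(y.size) < fpr)
cc_estimate = preds.mean()                        # classify-and-count
acc_estimate = (cc_estimate - fpr) / (tpr - fpr)  # Forman's adjusted count
print(round(cc_estimate, 3), round(acc_estimate, 3))  # ~0.74 vs ~0.90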
mlquantify/adjust_counting/_utils.py
@@ -0,0 +1,109 @@
import numpy as np


def compute_table(y, y_pred, classes):
    r"""Compute the confusion matrix table for a binary classification task.

    Parameters
    ----------
    y : np.ndarray
        The true labels.
    y_pred : np.ndarray
        The predicted labels.
    classes : np.ndarray
        The unique classes in the dataset.

    Returns
    -------
    tuple
        A tuple containing the counts of True Positives, False Positives,
        False Negatives, and True Negatives respectively.
    """
    TP = np.logical_and(y == y_pred, y == classes[1]).sum()
    FP = np.logical_and(y != y_pred, y == classes[0]).sum()
    FN = np.logical_and(y != y_pred, y == classes[1]).sum()
    TN = np.logical_and(y == y_pred, y == classes[0]).sum()
    return TP, FP, FN, TN


def compute_tpr(TP, FN):
    r"""Compute the True Positive Rate (Recall) for a binary classification task.

    Parameters
    ----------
    TP : int
        The number of True Positives.
    FN : int
        The number of False Negatives.

    Returns
    -------
    float
        The True Positive Rate (Recall).
    """
    if TP + FN == 0:
        return 0
    return TP / (TP + FN)


def compute_fpr(FP, TN):
    r"""Compute the False Positive Rate for a binary classification task.

    Parameters
    ----------
    FP : int
        The number of False Positives.
    TN : int
        The number of True Negatives.

    Returns
    -------
    float
        The False Positive Rate.
    """
    if FP + TN == 0:
        return 0
    return FP / (FP + TN)


def evaluate_thresholds(y, probabilities: np.ndarray) -> tuple:
    r"""Evaluate a range of classification thresholds to compute the corresponding
    True Positive Rate (TPR) and False Positive Rate (FPR) for a binary quantification task.

    Parameters
    ----------
    y : np.ndarray
        The true labels.
    probabilities : np.ndarray
        The predicted probabilities (scores) for the positive class.

    Returns
    -------
    tuple
        A tuple of (thresholds, tprs, fprs), where:
        - thresholds is a numpy array of evaluated thresholds,
        - tprs is a numpy array of corresponding True Positive Rates,
        - fprs is a numpy array of corresponding False Positive Rates.
    """
    unique_scores = np.linspace(0, 1, 101)

    tprs = []
    fprs = []

    classes = np.unique(y)

    for threshold in unique_scores:
        y_pred = np.where(probabilities >= threshold, classes[1], classes[0])

        TP, FP, FN, TN = compute_table(y, y_pred, classes)

        tpr = compute_tpr(TP, FN)
        fpr = compute_fpr(FP, TN)

        tprs.append(tpr)
        fprs.append(fpr)

    # best_tpr, best_fpr = self.adjust_threshold(np.asarray(tprs), np.asarray(fprs))
    return (unique_scores, np.asarray(tprs), np.asarray(fprs))