mlquantify 0.1.20__tar.gz → 0.1.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify-0.1.21/LICENSE +28 -0
- {mlquantify-0.1.20/mlquantify.egg-info → mlquantify-0.1.21}/PKG-INFO +13 -18
- {mlquantify-0.1.20 → mlquantify-0.1.21}/README.md +11 -18
- mlquantify-0.1.21/VERSION.txt +1 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/__init__.py +2 -1
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/__init__.py +6 -5
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/_adjustment.py +208 -37
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/_base.py +5 -6
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/_counting.py +10 -7
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/likelihood/__init__.py +0 -2
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/likelihood/_classes.py +45 -199
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/meta/_classes.py +12 -12
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/mixture/__init__.py +2 -1
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/mixture/_classes.py +310 -15
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/model_selection/_search.py +1 -1
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/neighbors/_base.py +15 -15
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/neighbors/_classes.py +2 -2
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/neighbors/_kde.py +6 -6
- mlquantify-0.1.21/mlquantify/neural/__init__.py +1 -0
- mlquantify-0.1.21/mlquantify/neural/_base.py +0 -0
- mlquantify-0.1.21/mlquantify/neural/_classes.py +609 -0
- mlquantify-0.1.21/mlquantify/neural/_perm_invariant.py +0 -0
- mlquantify-0.1.21/mlquantify/neural/_utils.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/__init__.py +2 -1
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_constraints.py +2 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_validation.py +9 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21/mlquantify.egg-info}/PKG-INFO +13 -18
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify.egg-info/SOURCES.txt +5 -1
- mlquantify-0.1.20/VERSION.txt +0 -1
- mlquantify-0.1.20/mlquantify/likelihood/_base.py +0 -147
- mlquantify-0.1.20/mlquantify/neural/__init__.py +0 -1
- {mlquantify-0.1.20 → mlquantify-0.1.21}/MANIFEST.in +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/_utils.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/base.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/base_aggregative.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/calibration.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/confidence.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/meta/__init__.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/metrics/__init__.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/metrics/_oq.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/metrics/_rq.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/metrics/_slq.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/mixture/_base.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/mixture/_utils.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/model_selection/__init__.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/model_selection/_protocol.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/model_selection/_split.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/multiclass.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/neighbors/__init__.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/neighbors/_classification.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/neighbors/_utils.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_artificial.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_context.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_decorators.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_exceptions.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_get_scores.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_load.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_parallel.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_random.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_sampling.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/_tags.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/utils/prevalence.py +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify.egg-info/dependency_links.txt +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify.egg-info/requires.txt +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify.egg-info/top_level.txt +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/setup.cfg +0 -0
- {mlquantify-0.1.20 → mlquantify-0.1.21}/setup.py +0 -0
mlquantify-0.1.21/LICENSE (new file):

````diff
@@ -0,0 +1,28 @@
+BSD 3-Clause License
+
+Copyright (c) 2025, Luiz Fernando Luth Junior and Andre Gustavo Maletzke
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
````
{mlquantify-0.1.20/mlquantify.egg-info → mlquantify-0.1.21}/PKG-INFO:

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlquantify
-Version: 0.1.20
+Version: 0.1.21
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -12,6 +12,7 @@ Classifier: Operating System :: Unix
 Classifier: Operating System :: MacOS :: MacOS X
 Classifier: Operating System :: Microsoft :: Windows
 Description-Content-Type: text/markdown
+License-File: LICENSE
 Requires-Dist: scikit-learn
 Requires-Dist: numpy
 Requires-Dist: scipy
@@ -26,25 +27,23 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: keywords
+Dynamic: license-file
 Dynamic: maintainer
 Dynamic: requires-dist
 Dynamic: summary
 
-
+[PyPI version badge]
+[](https://github.com/luizfernandolj/mlquantify/)
+
+
+<a href="https://luizfernandolj.github.io/mlquantify/"><img src="assets/logo_mlquantify-white.svg" alt="mlquantify logo"></a>
 <h4 align="center">A Python Package for Quantification</h4>
 
 ___
 
 **mlquantify** is a Python library for quantification, also known as supervised prevalence estimation, designed to estimate the distribution of classes within datasets. It offers a range of tools for various quantification methods, model selection tailored for quantification tasks, evaluation metrics, and protocols to assess quantification performance. Additionally, mlquantify includes popular datasets and visualization tools to help analyze and interpret results.
 
-
-
-## Latest Release
-
-- **Version 0.1.11**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
-- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
-- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
-- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
+Website: https://luizfernandolj.github.io/mlquantify/
 
 ___
 
@@ -112,6 +111,10 @@ print(f"Mean Absolute Error -> {mae}")
 print(f"Normalized Relative Absolute Error -> {nrae}")
 ```
 
+- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
+- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
+- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
+
 ___
 
 ## Requirements
@@ -123,11 +126,3 @@ ___
 - tqdm
 - matplotlib
 - xlrd
-
-___
-
-## Documentation
-
-##### API is avaliable [here](https://luizfernandolj.github.io/mlquantify/api/)
-
-___
````
{mlquantify-0.1.20 → mlquantify-0.1.21}/README.md:

````diff
@@ -1,18 +1,15 @@
-
+[PyPI version badge]
+[](https://github.com/luizfernandolj/mlquantify/)
+
+
+<a href="https://luizfernandolj.github.io/mlquantify/"><img src="assets/logo_mlquantify-white.svg" alt="mlquantify logo"></a>
 <h4 align="center">A Python Package for Quantification</h4>
 
 ___
 
 **mlquantify** is a Python library for quantification, also known as supervised prevalence estimation, designed to estimate the distribution of classes within datasets. It offers a range of tools for various quantification methods, model selection tailored for quantification tasks, evaluation metrics, and protocols to assess quantification performance. Additionally, mlquantify includes popular datasets and visualization tools to help analyze and interpret results.
 
-
-
-## Latest Release
-
-- **Version 0.1.11**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
-- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
-- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
-- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
+Website: https://luizfernandolj.github.io/mlquantify/
 
 ___
 
@@ -80,6 +77,10 @@ print(f"Mean Absolute Error -> {mae}")
 print(f"Normalized Relative Absolute Error -> {nrae}")
 ```
 
+- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
+- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
+- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
+
 ___
 
 ## Requirements
@@ -90,12 +91,4 @@ ___
 - joblib
 - tqdm
 - matplotlib
-- xlrd
-
-___
-
-## Documentation
-
-##### API is avaliable [here](https://luizfernandolj.github.io/mlquantify/api/)
-
-___
+- xlrd
````
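The README quick-start whose tail appears in the hunks above fits in a few lines. A minimal sketch of the classify-and-count workflow built only from classes visible in this diff (synthetic data; the top-level `mlquantify.adjust_counting` import path is assumed from the docstring examples further down):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from mlquantify.adjust_counting import CC  # classify-and-count, defined in _counting.py below

# synthetic binary data, stand-in for a real dataset
X = np.random.randn(500, 8)
y = np.random.randint(0, 2, 500)

cc = CC(learner=RandomForestClassifier())
cc.fit(X, y)                  # fits the wrapped classifier
prevalences = cc.predict(X)   # per the docstrings, a dict like {0: 0.48, 1: 0.52}
print(prevalences)
```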
mlquantify-0.1.21/VERSION.txt (new file):

````diff
@@ -0,0 +1 @@
+0.1.21
````
{mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/_adjustment.py:

````diff
@@ -1,15 +1,30 @@
+from mlquantify.utils._validation import validate_prevalences
+from mlquantify.base import BaseQuantifier
 import numpy as np
 from abc import abstractmethod
 from scipy.optimize import minimize
 import warnings
 from sklearn.metrics import confusion_matrix
 
+from mlquantify.utils._tags import (
+    PredictionRequirements,
+    Tags,
+)
 from mlquantify.adjust_counting._base import BaseAdjustCount
 from mlquantify.adjust_counting._counting import CC, PCC
+from mlquantify.utils import (
+    _fit_context,
+    validate_data,
+    validate_prevalences,
+    validate_predictions,
+    check_classes_attribute
+)
 from mlquantify.base_aggregative import (
     CrispLearnerQMixin,
     SoftLearnerQMixin,
-
+    AggregationMixin,
+    uses_soft_predictions,
+    _get_learner_function
 )
 from mlquantify.multiclass import define_binary
 from mlquantify.adjust_counting._utils import evaluate_thresholds
@@ -98,14 +113,14 @@ class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
         self.threshold = threshold
         self.strategy = strategy
 
-    def _adjust(self, predictions, train_y_scores,
+    def _adjust(self, predictions, train_y_scores, y_train):
         """Internal adjustment computation based on selected ROC threshold."""
         positive_scores = train_y_scores[:, 1]
 
-        thresholds, tprs, fprs = evaluate_thresholds(
+        thresholds, tprs, fprs = evaluate_thresholds(y_train, positive_scores)
         threshold, tpr, fpr = self.get_best_threshold(thresholds, tprs, fprs)
 
-        cc_predictions = CC(threshold=threshold).aggregate(predictions,
+        cc_predictions = CC(threshold=threshold).aggregate(predictions, y_train)
         cc_predictions = list(cc_predictions.values())[1]
 
         if tpr - fpr == 0:
````
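The `if tpr - fpr == 0:` guard above protects the standard threshold correction: the classify-and-count estimate taken at the chosen ROC operating point is rescaled by the TPR/FPR measured there. A standalone sketch of that arithmetic (plain NumPy, not the library's internal API):

```python
import numpy as np

def threshold_adjusted_count(cc, tpr, fpr):
    """Correct a classify-and-count estimate with the rates measured
    at the chosen threshold: p = (cc - fpr) / (tpr - fpr), clipped to [0, 1]."""
    if tpr - fpr == 0:
        return cc  # degenerate operating point: fall back to the raw count
    return float(np.clip((cc - fpr) / (tpr - fpr), 0.0, 1.0))

# 40% predicted positive with tpr=0.8, fpr=0.2 -> corrected prevalence 1/3
print(threshold_adjusted_count(0.4, 0.8, 0.2))
```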
_adjustment.py (continued):

````diff
@@ -200,18 +215,18 @@ class MatrixAdjustment(BaseAdjustCount):
         super().__init__(learner=learner)
         self.solver = solver
 
-    def _adjust(self, predictions, train_y_pred,
-        n_class = len(np.unique(
+    def _adjust(self, predictions, train_y_pred, y_train):
+        n_class = len(np.unique(y_train))
         self.CM = np.zeros((n_class, n_class))
 
         if self.solver == 'optim':
-            priors = np.array(list(CC().aggregate(train_y_pred,
-            self.CM = self._compute_confusion_matrix(train_y_pred,
-            prevs_estim = self._get_estimations(predictions > priors,
+            priors = np.array(list(CC().aggregate(train_y_pred, y_train).values()))
+            self.CM = self._compute_confusion_matrix(train_y_pred, y_train, priors)
+            prevs_estim = self._get_estimations(predictions > priors, y_train)
             prevalence = self._solve_optimization(prevs_estim, priors)
         else:
-            self.CM = self._compute_confusion_matrix(train_y_pred,
-            prevs_estim = self._get_estimations(predictions,
+            self.CM = self._compute_confusion_matrix(train_y_pred, y_train)
+            prevs_estim = self._get_estimations(predictions, y_train)
             prevalence = self._solve_linear(prevs_estim)
 
         return prevalence
````
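`_solve_linear` and `_solve_optimization` both recover the prevalence vector from the observed estimates and the confusion matrix estimated on training predictions. A minimal standalone sketch of the linear-system view (a hypothetical helper, not the library's method; mlquantify's `optim` path uses `scipy.optimize.minimize` with simplex constraints instead):

```python
import numpy as np

def solve_matrix_adjustment(CM, observed):
    """Solve CM @ p = observed for the true prevalences p, then project
    back onto the simplex. CM[i, j] ~ P(predicted i | true class j)."""
    p, *_ = np.linalg.lstsq(CM, observed, rcond=None)  # least squares tolerates singular CM
    p = np.clip(p, 0.0, None)
    return p / p.sum() if p.sum() > 0 else np.full_like(p, 1.0 / len(p))

CM = np.array([[0.9, 0.3],       # column j: how true class j gets predicted
               [0.1, 0.7]])
observed = np.array([0.6, 0.4])  # classify-and-count estimate on the test set
print(solve_matrix_adjustment(CM, observed))  # -> [0.5 0.5]
```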
_adjustment.py (continued):

````diff
@@ -262,17 +277,173 @@ class MatrixAdjustment(BaseAdjustCount):
         result = minimize(objective, init, constraints=constraints, bounds=bounds)
         return result.x if result.success else priors
 
-    def _get_estimations(self, predictions,
+    def _get_estimations(self, predictions, y_train):
         """Return prevalence estimates using CC (crisp) or PCC (probabilistic)."""
         if uses_soft_predictions(self):
             return np.array(list(PCC().aggregate(predictions).values()))
-        return np.array(list(CC().aggregate(predictions,
+        return np.array(list(CC().aggregate(predictions, y_train).values()))
 
     @abstractmethod
     def _compute_confusion_matrix(self, predictions, *args):
         ...
 
 
+
+@define_binary
+class CDE(SoftLearnerQMixin, AggregationMixin, BaseQuantifier):
+    r"""CDE-Iterate for binary classification prevalence estimation.
+
+    Threshold :math:`\tau` from false positive and false negative costs:
+    .. math::
+        \tau = \frac{c_{FP}}{c_{FP} + c_{FN}}
+
+    Hard classification by thresholding posterior probability :math:`p(+|x)` at :math:`\tau`:
+    .. math::
+        \hat{y}(x) = \mathbf{1}_{p(+|x) > \tau}
+
+    Prevalence estimation via classify-and-count:
+    .. math::
+        \hat{p}_U(+) = \frac{1}{N} \sum_{n=1}^N \hat{y}(x_n)
+
+    False positive cost update:
+    .. math::
+        c_{FP}^{new} = \frac{p_L(+)}{p_L(-)} \times \frac{\hat{p}_U(-)}{\hat{p}_U(+)} \times c_{FN}
+
+    Parameters
+    ----------
+    learner : estimator, optional
+        Wrapped classifier (unused).
+    tol : float, default=1e-4
+        Convergence tolerance.
+    max_iter : int, default=100
+        Max iterations.
+    init_cfp : float, default=1.0
+        Initial false positive cost.
+
+    References
+    ----------
+    .. [1] Esuli, A., Moreo, A., & Sebastiani, F. (2023). Learning to Quantify. Springer.
+    """
+
+    _parameter_constraints = {
+        "tol": [Interval(0, None, inclusive_left=False)],
+        "max_iter": [Interval(1, None, inclusive_left=True)],
+        "init_cfp": [Interval(0, None, inclusive_left=False)]
+    }
+
+    def __mlquantify_tags__(self):
+        tags = super().__mlquantify_tags__()
+        tags.prediction_requirements.requires_train_proba = False
+        return tags
+
+
+    def __init__(self, learner=None, tol=1e-4, max_iter=100, init_cfp=1.0, strategy="ovr"):
+        self.learner = learner
+        self.tol = float(tol)
+        self.max_iter = int(max_iter)
+        self.init_cfp = float(init_cfp)
+        self.strategy = strategy
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y):
+        """Fit the quantifier using the provided data and learner."""
+        X, y = validate_data(self, X, y)
+        self.classes_ = np.unique(y)
+        self.learner.fit(X, y)
+        counts = np.array([np.count_nonzero(y == _class) for _class in self.classes_])
+        self.priors = counts / len(y)
+        self.y_train = y
+
+        return self
+
+
+    def predict(self, X):
+        """Predict class prevalences for the given data."""
+        predictions = getattr(self.learner, _get_learner_function(self))(X)
+        prevalences = self.aggregate(predictions, self.y_train)
+        return prevalences
+
+
+    def aggregate(self, predictions, y_train):
+
+        self.classes_ = check_classes_attribute(self, np.unique(y_train))
+        predictions = validate_predictions(self, predictions)
+
+        if hasattr(self, 'priors'):
+            Ptr = np.asarray(self.priors, dtype=np.float64)
+        else:
+            counts = np.array([np.count_nonzero(y_train == _class) for _class in self.classes_])
+            Ptr = counts / len(y_train)
+
+        P = np.asarray(predictions, dtype=np.float64)
+
+        # ensure no zeros
+        eps = 1e-12
+        P = np.clip(P, eps, 1.0)
+
+        # training priors pL(+), pL(-)
+        # assume Ptr order matches columns of P; if Ptr sums to 1 but order unknown, user must match.
+        pL_pos = Ptr[1]
+        pL_neg = Ptr[0]
+        if pL_pos <= 0 or pL_neg <= 0:
+            # keep them positive to avoid divisions by zero
+            pL_pos = max(pL_pos, eps)
+            pL_neg = max(pL_neg, eps)
+
+        # initialize costs
+        cFN = 1.0
+        cFP = float(self.init_cfp)
+
+        prev_prev_pos = None
+        s = 0
+
+        # iterate: compute threshold from costs, classify, estimate prevalences via CC,
+        # update cFP, repeat
+        while s < self.max_iter:
+            # decision threshold tau for positive class:
+            # Derivation:
+            #   predict positive if cost_FP * p(-|x) < cost_FN * p(+|x)
+            #   => predict positive if p(+|x) / p(-|x) > cost_FP / cost_FN
+            #   since p(+|x) / p(-|x) = p(+|x) / (1 - p(+|x)):
+            #   p(+|x) > cost_FP / (cost_FP + cost_FN)
+            tau = cFP / (cFP + cFN)
+
+            # hard predictions for positive class using threshold on posterior for positive (col 1)
+            pos_probs = P[:, 1]
+            hard_pos = (pos_probs > tau).astype(float)
+
+            # classify-and-count prevalence estimate on U
+            prev_pos = hard_pos.mean()
+            prev_neg = 1.0 - prev_pos
+
+            # update cFP according to:
+            #   cFP_new = (pL_pos / pL_neg) * (pU_hat(neg) / pU_hat(pos)) * cFN
+            # guard against zero prev_pos / prev_neg
+            prev_pos_safe = max(prev_pos, eps)
+            prev_neg_safe = max(prev_neg, eps)
+
+            cFP_new = (pL_pos / pL_neg) * (prev_neg_safe / prev_pos_safe) * cFN
+
+            # check convergence on prevalences (absolute change)
+            if prev_prev_pos is not None and abs(prev_pos - prev_prev_pos) < self.tol:
+                break
+
+            # prepare next iter
+            cFP = cFP_new
+            prev_prev_pos = prev_pos
+            s += 1
+
+        # if didn't converge within max_iter we keep last estimate (lack of fisher consistency)
+        if s >= self.max_iter:
+            # optional: warning
+            # print('[warning] CDE-Iterate reached max_iter without converging')
+            pass
+
+        prevalences = np.array([prev_neg, prev_pos], dtype=np.float64)
+        prevalences = validate_prevalences(self, prevalences, self.classes_)
+        return prevalences
+
+
 class FM(SoftLearnerQMixin, MatrixAdjustment):
     r"""Friedman Method for quantification adjustment.
 
````
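Given the `fit`/`predict` API of the new `CDE` class above, usage would plausibly look like this (synthetic data; the sketch assumes `CDE` is re-exported from `mlquantify.adjust_counting`, which the `__init__.py` hunk is not shown here to confirm):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from mlquantify.adjust_counting import CDE  # export path assumed, see note above

X = np.random.randn(200, 4)
y = np.random.randint(0, 2, 200)  # CDE is binary (@define_binary)

cde = CDE(learner=LogisticRegression(), tol=1e-4, max_iter=100)
cde.fit(X, y)                 # fits the learner, stores training priors and labels
prevalences = cde.predict(X)  # iterates cost -> threshold -> CC updates to convergence
print(prevalences)
```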
_adjustment.py (continued):

````diff
@@ -337,14 +508,14 @@ class FM(SoftLearnerQMixin, MatrixAdjustment):
     def _compute_confusion_matrix(self, posteriors, y_true, priors):
         for i, _class in enumerate(self.classes_):
             indices = (y_true == _class)
-            self.CM[:, i] = self._get_estimations(posteriors[indices] > priors)
+            self.CM[:, i] = self._get_estimations(posteriors[indices] > priors, y_true[indices])
         return self.CM
 
 
-class GAC(CrispLearnerQMixin, MatrixAdjustment):
-    r"""
+class AC(CrispLearnerQMixin, MatrixAdjustment):
+    r"""Adjusted Count method.
 
-    This class implements the
+    This class implements the Adjusted Count (AC) algorithm for
     quantification adjustment as described in Firat (2016) [1]_. The method
     adjusts the estimated class prevalences by normalizing the confusion matrix
     based on prevalence estimates, providing a correction for bias caused by
@@ -374,12 +545,12 @@ class GAC(CrispLearnerQMixin, MatrixAdjustment):
     Examples
     --------
     >>> from sklearn.linear_model import LogisticRegression
-    >>> from mlquantify.adjust_counting import
+    >>> from mlquantify.adjust_counting import AC
     >>> import numpy as np
-    >>>
+    >>> ac = AC(learner=LogisticRegression())
     >>> X = np.random.randn(50, 4)
     >>> y = np.random.randint(0, 2, 50)
-    >>>
+    >>> ac.fit(X, y)
     >>> gac.predict(X)
     {0: 0.5, 1: 0.5}
 
@@ -404,11 +575,11 @@ class GAC(CrispLearnerQMixin, MatrixAdjustment):
         return self.CM
 
 
-class GPAC(SoftLearnerQMixin, MatrixAdjustment):
-    r"""Probabilistic
+class PAC(SoftLearnerQMixin, MatrixAdjustment):
+    r"""Probabilistic Adjusted Count (PAC) method.
 
-    This class implements the probabilistic extension of the
-    as presented in Firat (2016) [1]_. The
+    This class implements the probabilistic extension of the Adjusted Count method
+    as presented in Firat (2016) [1]_. The PAC method normalizes the confusion matrix by
     the estimated prevalences from posterior probabilities, enabling a probabilistic correction
     of class prevalences.
 
@@ -436,13 +607,13 @@ class GPAC(SoftLearnerQMixin, MatrixAdjustment):
     Examples
     --------
     >>> from sklearn.linear_model import LogisticRegression
-    >>> from mlquantify.adjust_counting import
+    >>> from mlquantify.adjust_counting import PAC
     >>> import numpy as np
-    >>>
+    >>> pac = PAC(learner=LogisticRegression())
     >>> X = np.random.randn(50, 4)
     >>> y = np.random.randint(0, 2, 50)
-    >>>
-    >>>
+    >>> pac.fit(X, y)
+    >>> pac.predict(X)
     {0: 0.5, 1: 0.5}
 
     References
@@ -466,8 +637,8 @@ class GPAC(SoftLearnerQMixin, MatrixAdjustment):
         return self.CM
 
 
-class ACC(ThresholdAdjustment):
-    r"""Adjusted Count (
+class TAC(ThresholdAdjustment):
+    r"""Threshold Adjusted Count (TAC) — baseline threshold correction.
 
     This method corrects the bias in class prevalence estimates caused by imperfect
     classification accuracy, by adjusting the observed positive count using estimates
@@ -501,8 +672,8 @@ class ACC(ThresholdAdjustment):
         return (self.threshold, tpr, fpr)
 
 
-class X_method(ThresholdAdjustment):
-    r"""X method — threshold where :math:`\text{TPR} + \text{FPR} = 1`.
+class TX(ThresholdAdjustment):
+    r"""Threshold X method — threshold where :math:`\text{TPR} + \text{FPR} = 1`.
 
     This method selects the classification threshold at which the sum of the true positive
     rate (TPR) and false positive rate (FPR) equals one. This threshold choice balances
@@ -526,8 +697,8 @@ class X_method(ThresholdAdjustment):
         return thresholds[idx], tprs[idx], fprs[idx]
 
 
-class
-    r"""MAX method — threshold maximizing :math:`\text{TPR} - \text{FPR}`.
+class TMAX(ThresholdAdjustment):
+    r"""Threshold MAX method — threshold maximizing :math:`\text{TPR} - \text{FPR}`.
 
     This method selects the threshold that maximizes the difference between the true positive
     rate (TPR) and the false positive rate (FPR), effectively optimizing classification
@@ -601,15 +772,15 @@ class MS(ThresholdAdjustment):
     .. [1] Forman, G. (2008). "Quantifying Counts and Costs via Classification",
        *Data Mining and Knowledge Discovery*, 17(2), 164-206.
     """
-    def _adjust(self, predictions, train_y_scores,
+    def _adjust(self, predictions, train_y_scores, y_train):
         positive_scores = train_y_scores[:, 1]
 
-        thresholds, tprs, fprs = evaluate_thresholds(
+        thresholds, tprs, fprs = evaluate_thresholds(y_train, positive_scores)
         thresholds, tprs, fprs = self.get_best_threshold(thresholds, tprs, fprs)
 
         prevs = []
         for thr, tpr, fpr in zip(thresholds, tprs, fprs):
-            cc_predictions = CC(threshold=thr).aggregate(predictions,
+            cc_predictions = CC(threshold=thr).aggregate(predictions, y_train)
             cc_predictions = list(cc_predictions.values())[1]
 
             if tpr - fpr == 0:
````
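The `MS` loop above computes the corrected estimate at every candidate threshold; per the Forman (2008) reference, Median Sweep then reports the median over thresholds. A standalone sketch of that aggregation (illustrative values, not the library's code):

```python
import numpy as np

def median_sweep(ccs, tprs, fprs):
    """Apply the threshold correction at each candidate threshold and
    return the median of the valid corrected estimates."""
    prevs = []
    for cc, tpr, fpr in zip(ccs, tprs, fprs):
        if tpr - fpr == 0:
            continue  # skip degenerate thresholds
        prevs.append(np.clip((cc - fpr) / (tpr - fpr), 0.0, 1.0))
    return float(np.median(prevs))

print(median_sweep([0.40, 0.35, 0.30], [0.9, 0.8, 0.7], [0.20, 0.10, 0.05]))
```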
{mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/_base.py:

````diff
@@ -93,14 +93,13 @@ class BaseCount(AggregationMixin, BaseQuantifier):
     def __mlquantify_tags__(self):
         tags = super().__mlquantify_tags__()
         tags.prediction_requirements.requires_train_proba = False
-        tags.prediction_requirements.requires_train_labels =
+        tags.prediction_requirements.requires_train_labels = True
         return tags
 
     @_fit_context(prefer_skip_nested_validation=True)
     def fit(self, X, y, learner_fitted=False, *args, **kwargs):
         """Fit the quantifier using the provided data and learner."""
         X, y = validate_data(self, X, y)
-        validate_y(self, y)
         self.classes_ = np.unique(y)
         if not learner_fitted:
             self.learner.fit(X, y, *args, **kwargs)
@@ -207,7 +206,6 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
     def fit(self, X, y, learner_fitted=False, cv=10, stratified=True, random_state=None, shuffle=True):
         """Fit the quantifier using the provided data and learner."""
         X, y = validate_data(self, X, y)
-        validate_y(self, y)
         self.classes_ = np.unique(y)
         learner_function = _get_learner_function(self)
 
@@ -236,12 +234,13 @@ class BaseAdjustCount(AggregationMixin, BaseQuantifier):
         prevalences = self.aggregate(predictions, self.train_predictions, self.train_y_values)
         return prevalences
 
-    def aggregate(self, predictions, train_predictions,
+    def aggregate(self, predictions, train_predictions, y_train):
         """Aggregate predictions and apply matrix- or rate-based bias correction."""
-        self.classes_ = check_classes_attribute(self, np.unique(
+        self.classes_ = check_classes_attribute(self, np.unique(y_train))
 
         predictions = validate_predictions(self, predictions)
+        train_predictions = validate_predictions(self, train_predictions)
 
-        prevalences = self._adjust(predictions, train_predictions,
+        prevalences = self._adjust(predictions, train_predictions, y_train)
         prevalences = validate_prevalences(self, prevalences, self.classes_)
         return prevalences
````
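With `y_train` now threaded through `aggregate`, an adjustment quantifier can be driven from precomputed predictions. A sketch of the new call pattern (hypothetical data; it assumes `AC`'s default constructor arguments suffice and that out-of-fold training predictions are acceptable inputs, which matches what `fit` itself produces via its `cv=10` cross-validation):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from mlquantify.adjust_counting import AC  # renamed from GAC in this release

X = np.random.randn(200, 4)
y = np.random.randint(0, 2, 200)
X_test = np.random.randn(100, 4)

clf = LogisticRegression().fit(X, y)
train_preds = cross_val_predict(clf, X, y, cv=10)  # out-of-fold crisp predictions

# new in 0.1.21: the training labels are passed explicitly as y_train
prevalences = AC().aggregate(clf.predict(X_test), train_preds, y)
print(prevalences)
```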
{mlquantify-0.1.20 → mlquantify-0.1.21}/mlquantify/adjust_counting/_counting.py:

````diff
@@ -75,13 +75,13 @@ class CC(CrispLearnerQMixin, BaseCount):
         super().__init__(learner=learner)
         self.threshold = threshold
 
-    def aggregate(self, predictions,
-        predictions = validate_predictions(self, predictions, self.threshold,
+    def aggregate(self, predictions, y_train=None):
+        predictions = validate_predictions(self, predictions, self.threshold, y_train)
 
-        if
-
+        if y_train is None:
+            y_train = np.unique(predictions)
 
-        self.classes_ = check_classes_attribute(self, np.unique(
+        self.classes_ = check_classes_attribute(self, np.unique(y_train))
         class_counts = np.array([np.count_nonzero(predictions == _class) for _class in self.classes_])
         prevalences = class_counts / len(predictions)
 
````
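For reference, the two counting rules these classes implement reduce to a few lines of NumPy. A standalone sketch (not the library's methods; PCC's mean-of-posteriors rule is the standard definition):

```python
import numpy as np

def classify_and_count(pred_labels, classes):
    """CC: prevalence of each class = fraction of hard predictions for it."""
    counts = np.array([np.count_nonzero(pred_labels == c) for c in classes])
    return counts / len(pred_labels)

def probabilistic_classify_and_count(posteriors):
    """PCC: prevalence vector = column-wise mean of the posterior matrix."""
    return np.asarray(posteriors, dtype=float).mean(axis=0)

print(classify_and_count(np.array([0, 1, 1, 0, 1]), classes=[0, 1]))  # [0.4 0.6]
print(probabilistic_classify_and_count([[0.7, 0.3], [0.2, 0.8]]))     # [0.45 0.55]
```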
_counting.py (continued):

````diff
@@ -134,12 +134,15 @@ class PCC(SoftLearnerQMixin, BaseCount):
     def __init__(self, learner=None):
         super().__init__(learner=learner)
 
-    def aggregate(self, predictions):
+    def aggregate(self, predictions, y_train=None):
         predictions = validate_predictions(self, predictions)
 
         # Handle categorical predictions (1D array with class labels)
         if predictions.ndim == 1 and not np.issubdtype(predictions.dtype, (np.floating, np.integer)):
-
+            if y_train is None:
+                y_values = np.unique(predictions)
+
+            self.classes_ = check_classes_attribute(self, np.unique(y_values))
             class_counts = np.array([np.count_nonzero(predictions == _class) for _class in self.classes_])
             prevalences = class_counts / len(predictions)
         else:
````