mlquantify 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify-0.1.4/MANIFEST.in +1 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/PKG-INFO +16 -13
- {mlquantify-0.1.2 → mlquantify-0.1.4}/README.md +15 -12
- mlquantify-0.1.4/VERSION.txt +1 -0
- mlquantify-0.1.4/mlquantify/evaluation/protocol.py +297 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/methods/aggregative.py +130 -1
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/methods/threshold_optimization.py +0 -151
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/utils/general.py +43 -6
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify.egg-info/PKG-INFO +16 -13
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify.egg-info/SOURCES.txt +2 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/setup.py +6 -6
- mlquantify-0.1.2/mlquantify/evaluation/protocol.py +0 -647
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/__init__.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/base.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/classification/__init__.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/classification/methods.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/evaluation/__init__.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/evaluation/measures.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/methods/__init__.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/methods/meta.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/methods/mixture_models.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/methods/non_aggregative.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/model_selection.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/plots.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/utils/__init__.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/utils/method.py +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify.egg-info/dependency_links.txt +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify.egg-info/requires.txt +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify.egg-info/top_level.txt +0 -0
- {mlquantify-0.1.2 → mlquantify-0.1.4}/setup.cfg +0 -0
mlquantify-0.1.4/MANIFEST.in
@@ -0,0 +1 @@
+include VERSION.txt
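The new MANIFEST.in entry ships VERSION.txt with the source distribution, and setup.py changed by six lines in each direction, which is consistent with single-sourcing the package version from that file. A minimal sketch of that pattern (the actual setup.py contents are not shown in this diff, so everything below is illustrative):

```python
# Illustrative sketch only: the real setup.py changes are not shown in this diff.
from pathlib import Path
from setuptools import setup, find_packages

# Read the version from VERSION.txt, the same file MANIFEST.in now includes.
version = Path(__file__).parent.joinpath("VERSION.txt").read_text().strip()

setup(
    name="mlquantify",
    version=version,  # single source of truth shared by the sdist and the build
    packages=find_packages(),
)
```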
{mlquantify-0.1.2 → mlquantify-0.1.4}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlquantify
-Version: 0.1.2
+Version: 0.1.4
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -40,9 +40,9 @@ ___
 
 ## Latest Release
 
-- **Version 0.
-- In case you need any help, refer to the [
-- Explore the [API documentation](
+- **Version 0.1.3**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
+- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
 
 ___
@@ -70,7 +70,7 @@ ___
 | **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
 | **Dynamic class management** | All methods are dynamic, and handles multiclass and binary problems, in case of binary it makes One-Vs-All (OVA) automatically. |
 | **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
-| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE,
+| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
 | **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.).. |
 | **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
 | **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
@@ -82,7 +82,10 @@ ___
 This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and bias between the real and predicted prevalences.
 
 ```python
-
+from mlquantify.methods import EMQ
+from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
+from mlquantify.utils import get_real_prev
+
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.datasets import load_breast_cancer
 from sklearn.model_selection import train_test_split
@@ -94,19 +97,19 @@ features, target = load_breast_cancer(return_X_y=True)
 X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
 
 #Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
-model =
+model = EMQ(RandomForestClassifier())
 model.fit(X_train, y_train)
 
 #Predict the class prevalence for X_test
 pred_prevalence = model.predict(X_test)
-real_prevalence =
+real_prevalence = get_real_prev(y_test)
 
 #Get the error for the prediction
-ae =
-
+ae = absolute_error(real_prevalence, pred_prevalence)
+mae = mean_absolute_error(real_prevalence, pred_prevalence)
 
-print(f"
-print(f"
+print(f"Absolute Error -> {ae}")
+print(f"Mean Absolute Error -> {mae}")
 ```
 
 ___
@@ -125,7 +128,7 @@ ___
 
 ## Documentation
 
-##### API is avaliable [here](
+##### API is avaliable [here](https://luizfernandolj.github.io/mlquantify/api/index.html)
 
 - [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
 - [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
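For readability, here is the full 0.1.4 quick-start that results from the hunks above (the same change is applied to README.md below), assembled from the diff's context and + lines into one runnable block:

```python
from mlquantify.methods import EMQ
from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
from mlquantify.utils import get_real_prev

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the data and split into training and testing sets
features, target = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

# Create the model: the Expectation Maximisation Quantifier (EMQ) with a classifier
model = EMQ(RandomForestClassifier())
model.fit(X_train, y_train)

# Predict the class prevalence for X_test and compare against the true prevalence
pred_prevalence = model.predict(X_test)
real_prevalence = get_real_prev(y_test)

# Get the error for the prediction
ae = absolute_error(real_prevalence, pred_prevalence)
mae = mean_absolute_error(real_prevalence, pred_prevalence)

print(f"Absolute Error -> {ae}")
print(f"Mean Absolute Error -> {mae}")
```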
{mlquantify-0.1.2 → mlquantify-0.1.4}/README.md
@@ -9,9 +9,9 @@ ___
 
 ## Latest Release
 
-- **Version 0.
-- In case you need any help, refer to the [
-- Explore the [API documentation](
+- **Version 0.1.3**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
+- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
 
 ___
@@ -39,7 +39,7 @@ ___
 | **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
 | **Dynamic class management** | All methods are dynamic, and handles multiclass and binary problems, in case of binary it makes One-Vs-All (OVA) automatically. |
 | **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
-| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE,
+| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
 | **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.).. |
 | **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
 | **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
@@ -51,7 +51,10 @@ ___
 This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and bias between the real and predicted prevalences.
 
 ```python
-
+from mlquantify.methods import EMQ
+from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
+from mlquantify.utils import get_real_prev
+
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.datasets import load_breast_cancer
 from sklearn.model_selection import train_test_split
@@ -63,19 +66,19 @@ features, target = load_breast_cancer(return_X_y=True)
 X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
 
 #Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
-model =
+model = EMQ(RandomForestClassifier())
 model.fit(X_train, y_train)
 
 #Predict the class prevalence for X_test
 pred_prevalence = model.predict(X_test)
-real_prevalence =
+real_prevalence = get_real_prev(y_test)
 
 #Get the error for the prediction
-ae =
-
+ae = absolute_error(real_prevalence, pred_prevalence)
+mae = mean_absolute_error(real_prevalence, pred_prevalence)
 
-print(f"
-print(f"
+print(f"Absolute Error -> {ae}")
+print(f"Mean Absolute Error -> {mae}")
 ```
 
 ___
@@ -94,7 +97,7 @@ ___
 
 ## Documentation
 
-##### API is avaliable [here](
+##### API is avaliable [here](https://luizfernandolj.github.io/mlquantify/api/index.html)
 
 - [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
 - [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
mlquantify-0.1.4/VERSION.txt
@@ -0,0 +1 @@
+0.1.4
mlquantify-0.1.4/mlquantify/evaluation/protocol.py
@@ -0,0 +1,297 @@
+from abc import ABC, abstractmethod
+import numpy as np
+from typing import Generator, Tuple
+from tqdm import tqdm
+
+from ..utils.general import *
+
+class Protocol(ABC):
+    """Base class for evaluation protocols.
+
+    Parameters
+    ----------
+    batch_size : int or list of int
+        The size of the batches to be used in the evaluation.
+    random_state : int, optional
+        The random seed for reproducibility.
+
+    Attributes
+    ----------
+    n_combinations : int
+
+    Raises
+    ------
+    ValueError
+        If the batch size is not a positive integer or list of positive integers.
+
+    Notes
+    -----
+    This class serves as a base class for different evaluation protocols, each with its own strategy for splitting the data into batches.
+
+    Examples
+    --------
+    >>> class MyCustomProtocol(Protocol):
+    ...     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+    ...         for batch_size in self.batch_size:
+    ...             yield np.random.choice(X.shape[0], batch_size, replace=True)
+    ...
+    >>> protocol = MyCustomProtocol(batch_size=100, random_state=42)
+    >>> for train_idx, test_idx in protocol.split(X, y):
+    ...     # Train and evaluate model
+    ...     pass
+
+    """
+
+    def __init__(self, batch_size, random_state=None, **kwargs):
+        if isinstance(batch_size, int):
+            self.n_combinations = 1
+        else:
+            self.n_combinations = len(batch_size)
+
+        self.batch_size = [batch_size] if isinstance(batch_size, int) else batch_size
+        self.random_state = random_state
+
+        for name, value in kwargs.items():
+            setattr(self, name, value)
+            if isinstance(value, list):
+                self.n_combinations *= len(value)
+            elif isinstance(value, (int, float)):
+                self.n_combinations *= value
+            else:
+                raise ValueError(f"Invalid argument {name}={value}: must be int/float or list of int/float.")
+
+
+    def split(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, np.ndarray]:
+        """
+        Split the data into samples for evaluation.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input features.
+        y : np.ndarray
+            The target labels.
+
+        Yields
+        ------
+        Generator[np.ndarray, np.ndarray]
+            A generator that yields the indices for each split.
+        """
+        indices = np.arange(X.shape[0])
+        for idx in self._split_indices_masks(X, y):
+            indexes = indices[idx]
+            yield indexes
+
+    def _split_indices_masks(self, X: np.ndarray, y: np.ndarray) -> Generator[Tuple[np.ndarray, np.ndarray]]:
+        for idx in self._iter_indices(X, y):
+
+            mask = np.zeros(X.shape[0], dtype=bool)
+            mask[idx] = True
+
+            yield mask
+
+    @abstractmethod
+    def _iter_indices(self, X, y):
+        """Abstract method to be implemented by subclasses to yield indices for each batch."""
+        pass
+
+    def get_n_combinations(self) -> int:
+        """
+        Get the number of combinations for the current protocol.
+        """
+        return self.n_combinations
+
+
+class APP(Protocol):
+    """Artificial Prevalence Protocol (APP) for evaluation.
+    This protocol generates artificial prevalence distributions for the evaluation in an exhaustive manner, testing all possible combinations of prevalences.
+
+    Parameters
+    ----------
+    batch_size : int or list of int
+        The size of the batches to be used in the evaluation.
+    n_prevalences : int
+        The number of artificial prevalences to generate.
+    repeats : int, optional
+        The number of times to repeat the evaluation with different random seeds.
+    random_state : int, optional
+        The random seed for reproducibility.
+
+    Attributes
+    ----------
+    n_prevalences : int
+        The number of artificial prevalences to generate.
+    repeats : int
+        The number of times to repeat the evaluation with different random seeds.
+    random_state : int
+        The random seed for reproducibility.
+
+    Notes
+    -----
+    It is important to note that in case of multiclass problems, the time complexity of this protocol can be significantly higher due to the increased number of combinations to evaluate.
+
+    Examples
+    --------
+    >>> protocol = APP(batch_size=[100, 200], n_prevalences=5, repeats=3, random_state=42)
+    >>> for train_idx, test_idx in protocol.split(X, y):
+    ...     # Train and evaluate model
+    ...     pass
+
+    """
+
+    def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
+        super().__init__(batch_size=batch_size,
+                         random_state=random_state,
+                         n_prevalences=n_prevalences,
+                         repeats=repeats)
+
+    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+
+        n_dim = len(np.unique(y))
+
+        for batch_size in self.batch_size:
+            prevalences = generate_artificial_prevalences(n_dim=n_dim,
+                                                          n_prev=self.n_prevalences,
+                                                          n_iter=self.repeats)
+            for prev in prevalences:
+                indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                yield indexes
+
+
+
+
+class NPP(Protocol):
+    """No Prevalence Protocol (NPP) for evaluation.
+    This protocol just samples the data without any consideration for prevalence, with all instances having equal probability of being selected.
+
+    Parameters
+    ----------
+    batch_size : int or list of int
+        The size of the batches to be used in the evaluation.
+    random_state : int, optional
+        The random seed for reproducibility.
+
+    Attributes
+    ----------
+    n_prevalences : int
+        The number of artificial prevalences to generate.
+    repeats : int
+        The number of times to repeat the evaluation with different random seeds.
+    random_state : int
+        The random seed for reproducibility.
+
+    Examples
+    --------
+    >>> protocol = NPP(batch_size=100, random_state=42)
+    >>> for train_idx, test_idx in protocol.split(X, y):
+    ...     # Train and evaluate model
+    ...     pass
+    """
+
+    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+
+        for batch_size in self.batch_size:
+            yield np.random.choice(X.shape[0], batch_size, replace=True)
+
+
+class UPP(Protocol):
+    """Uniform Prevalence Protocol (UPP) for evaluation.
+    An extension of the APP that generates artificial prevalence distributions uniformly across all classes utilizing the kraemer sampling method.
+
+    Parameters
+    ----------
+    batch_size : int or list of int
+        The size of the batches to be used in the evaluation.
+    n_prevalences : int
+        The number of artificial prevalences to generate.
+    repeats : int
+        The number of times to repeat the evaluation with different random seeds.
+    random_state : int, optional
+        The random seed for reproducibility.
+
+    Attributes
+    ----------
+    n_prevalences : int
+        The number of artificial prevalences to generate.
+    repeats : int
+        The number of times to repeat the evaluation with different random seeds.
+    random_state : int
+        The random seed for reproducibility.
+
+    Examples
+    --------
+    >>> protocol = UPP(batch_size=100, n_prevalences=5, repeats=3, random_state=42)
+    >>> for train_idx, test_idx in protocol.split(X, y):
+    ...     # Train and evaluate model
+    ...     pass
+    """
+
+    def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
+        super().__init__(batch_size=batch_size,
+                         random_state=random_state,
+                         n_prevalences=n_prevalences,
+                         repeats=repeats)
+
+    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+
+        n_dim = len(np.unique(y))
+
+        for batch_size in self.batch_size:
+
+            prevalences = kraemer_sampling(n_dim=n_dim,
+                                           n_prev=self.n_prevalences,
+                                           n_iter=self.repeats)
+
+            for prev in prevalences:
+                indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                yield indexes
+
+
+class PPP(Protocol):
+    """ Personalized Prevalence Protocol (PPP) for evaluation.
+    This protocol generates artificial prevalence distributions personalized for each class.
+
+    Parameters
+    ----------
+    batch_size : int or list of int
+        The size of the batches to be used in the evaluation.
+    prevalences : list of float
+        The list of artificial prevalences to generate for each class.
+    repeats : int
+        The number of times to repeat the evaluation with different random seeds.
+    random_state : int, optional
+        The random seed for reproducibility.
+
+    Attributes
+    ----------
+    prevalences : list of float
+        The list of artificial prevalences to generate for each class.
+    repeats : int
+        The number of times to repeat the evaluation with different random seeds.
+    random_state : int
+        The random seed for reproducibility.
+
+    Examples
+    --------
+    >>> protocol = PPP(batch_size=100, prevalences=[0.1, 0.9], repeats=3, random_state=42)
+    >>> for train_idx, test_idx in protocol.split(X, y):
+    ...     # Train and evaluate model
+    ...     pass
+    """
+
+    def __init__(self, batch_size, prevalences, repeats=1, random_state=None):
+        super().__init__(batch_size=batch_size,
+                         random_state=random_state,
+                         prevalences=prevalences,
+                         repeats=repeats)
+
+    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+
+        for batch_size in self.batch_size:
+            for prev in self.prevalences:
+                if isinstance(prev, float):
+                    prev = [1-prev, prev]
+
+                indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                yield indexes
+
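A note on the new module: as implemented, `Protocol.split` yields one index array per generated sample, so the docstring examples that unpack `train_idx, test_idx` would not run as written. A minimal usage sketch consistent with the implementation above (import paths follow the file layout in this diff; EMQ and get_real_prev are shown elsewhere in it):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from mlquantify.methods import EMQ
from mlquantify.evaluation.protocol import APP
from mlquantify.utils import get_real_prev

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = EMQ(RandomForestClassifier())
model.fit(X_train, y_train)

# APP over two batch sizes and five prevalence points
protocol = APP(batch_size=[100, 200], n_prevalences=5, repeats=1, random_state=42)
print(protocol.get_n_combinations())  # 2 * 5 * 1 = 10 parameter combinations

for idx in protocol.split(X_test, y_test):  # one index array per generated sample
    pred_prev = model.predict(X_test[idx])  # estimated class prevalences (dict)
    real_prev = get_real_prev(y_test[idx])  # true class prevalences (dict)
```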
{mlquantify-0.1.2 → mlquantify-0.1.4}/mlquantify/methods/aggregative.py
@@ -907,10 +907,140 @@ class PCC(AggregativeQuantifier):
 
 
 
+class PACC(AggregativeQuantifier):
+    """
+    Probabilistic Adjusted Classify and Count (PACC).
+    This method extends the Adjusted Classify and Count (AC) approach
+    by leveraging the average class-conditional confidences obtained
+    from a probabilistic classifier instead of relying solely on true
+    positive and false positive rates.
+
+    Parameters
+    ----------
+    learner : BaseEstimator
+        A scikit-learn compatible classifier to be used for quantification.
+    threshold : float, optional
+        The decision threshold for classification. Default is 0.5.
+
+    Attributes
+    ----------
+    learner : BaseEstimator
+        A scikit-learn compatible classifier.
+    threshold : float
+        Decision threshold for classification. Default is 0.5.
+    tpr : float
+        True positive rate computed during the fitting process.
+    fpr : float
+        False positive rate computed during the fitting process.
+
+    See Also
+    --------
+    ThresholdOptimization : Base class for threshold-based quantification methods.
+    ACC : Adjusted Classify and Count quantification method.
+    CC : Classify and Count quantification method.
+
+    References
+    ----------
+    A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031
 
+    Examples
+    --------
+    >>> from mlquantify.methods.aggregative import PACC
+    >>> from mlquantify.utils.general import get_real_prev
+    >>> from sklearn.datasets import load_breast_cancer
+    >>> from sklearn.svm import SVC
+    >>> from sklearn.model_selection import train_test_split
+    >>>
+    >>> features, target = load_breast_cancer(return_X_y=True)
+    >>>
+    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
+    >>>
+    >>> pacc = PACC(learner=SVC(probability=True))
+    >>> pacc.fit(X_train, y_train)
+    >>> y_pred = pacc.predict(X_test)
+    >>> y_pred
+    {0: 0.4664886119311328, 1: 0.5335113880688672}
+    >>> get_real_prev(y_test)
+    {0: 0.3991228070175439, 1: 0.6008771929824561}
+    """
 
+    def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
+        self.learner = learner
+        self.threshold = threshold
+        self.mean_pos = None
+        self.mean_neg = None
+
+    @property
+    def is_probabilistic(self) -> bool:
+        return True
+
+    @property
+    def is_multiclass(self) -> bool:
+        return False
 
+    def _fit_method(self, X, y):
+        # Get predicted labels and probabilities
+        if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
+            y_labels = mq.arguments["y_labels"]
+            probabilities = mq.arguments["posteriors_train"]
+        else:
+            y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
+
+        # Adjust thresholds and compute true and false positive rates
+
+        self.mean_pos = np.mean(probabilities[y_labels == self.classes[1], 1])
+        self.mean_neg = np.mean(probabilities[y_labels != self.classes[1], 1])
+
+        return self
+
+
+    def _predict_method(self, X):
+        """
+        Predicts the class prevalence using the mean class-conditional
+        probabilities from a probabilistic classifier.
 
+        Parameters
+        ----------
+        X : array-like or sparse matrix of shape (n_samples, n_features)
+            The input data for prediction.
+
+        Returns
+        -------
+        dict
+            A dictionary with class labels as keys and their respective
+            prevalence estimates as values.
+
+        Notes
+        -----
+        The prevalence is adjusted using the formula:
+        prevalence = |mean_score - FPR| / (TPR - FPR),
+        where mean_score is the average probability for the positive class.
+
+        Raises
+        ------
+        ZeroDivisionError
+            If `TPR - FPR` equals zero, indicating that the classifier's
+            performance does not vary across the threshold range.
+        """
+        prevalences = {}
+
+        # Calculate probabilities for the positive class
+        probabilities = self.predict_learner(X)[:, 1]
+
+        # Compute the mean score for the positive class
+        mean_scores = np.mean(probabilities)
+
+        # Adjust prevalence based on TPR and FPR
+        if self.mean_pos - self.mean_neg == 0:
+            prevalence = mean_scores
+        else:
+            prevalence = np.clip(abs(mean_scores - self.mean_neg) / (self.mean_pos - self.mean_neg), 0, 1)
+
+        # Map the computed prevalence to the class labels
+        prevalences[self.classes[0]] = 1 - prevalence
+        prevalences[self.classes[1]] = prevalence
+
+        return prevalences
 
 
 class PWK(AggregativeQuantifier):

@@ -1012,7 +1142,6 @@ class PWK(AggregativeQuantifier):
 from . import threshold_optimization
 
 ACC = threshold_optimization.ACC
-PACC = threshold_optimization.PACC
 T50 = threshold_optimization.T50
 MAX = threshold_optimization.MAX
 X_method = threshold_optimization.X_method