mlquantify 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mlquantify-0.1.3/mlquantify.egg-info → mlquantify-0.1.4}/PKG-INFO +16 -13
- {mlquantify-0.1.3 → mlquantify-0.1.4}/README.md +15 -12
- mlquantify-0.1.4/VERSION.txt +1 -0
- mlquantify-0.1.4/mlquantify/evaluation/protocol.py +297 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/utils/general.py +43 -6
- {mlquantify-0.1.3 → mlquantify-0.1.4/mlquantify.egg-info}/PKG-INFO +16 -13
- mlquantify-0.1.3/VERSION.txt +0 -1
- mlquantify-0.1.3/mlquantify/evaluation/protocol.py +0 -647
- {mlquantify-0.1.3 → mlquantify-0.1.4}/MANIFEST.in +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/__init__.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/base.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/classification/__init__.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/classification/methods.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/evaluation/__init__.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/evaluation/measures.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/methods/__init__.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/methods/aggregative.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/methods/meta.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/methods/mixture_models.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/methods/non_aggregative.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/methods/threshold_optimization.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/model_selection.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/plots.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/utils/__init__.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify/utils/method.py +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify.egg-info/SOURCES.txt +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify.egg-info/dependency_links.txt +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify.egg-info/requires.txt +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/mlquantify.egg-info/top_level.txt +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/setup.cfg +0 -0
- {mlquantify-0.1.3 → mlquantify-0.1.4}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mlquantify
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Quantification Library
|
|
5
5
|
Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
|
|
6
6
|
Maintainer: Luiz Fernando Luth Junior
|
|
@@ -40,9 +40,9 @@ ___
|
|
|
40
40
|
|
|
41
41
|
## Latest Release
|
|
42
42
|
|
|
43
|
-
- **Version 0.
|
|
44
|
-
- In case you need any help, refer to the [
|
|
45
|
-
- Explore the [API documentation](
|
|
43
|
+
- **Version 0.1.3**: Initial beta version. For a detailed list of changes, check the [changelog](#).
|
|
44
|
+
- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
|
|
45
|
+
- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
|
|
46
46
|
- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
|
|
47
47
|
|
|
48
48
|
___
|
|
@@ -70,7 +70,7 @@ ___
|
|
|
70
70
|
| **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
|
|
71
71
|
| **Dynamic class management** | All methods are dynamic and handle multiclass and binary problems; in case of binary it makes One-Vs-All (OVA) automatically. |
|
|
72
72
|
| **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
|
|
73
|
-
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE,
|
|
73
|
+
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
|
|
74
74
|
| **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.). |
|
|
75
75
|
| **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
|
|
76
76
|
| **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
|
|
@@ -82,7 +82,10 @@ ___
|
|
|
82
82
|
This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and the mean absolute error between the real and predicted prevalences.
|
|
83
83
|
|
|
84
84
|
```python
|
|
85
|
-
|
|
85
|
+
from mlquantify.methods import EMQ
|
|
86
|
+
from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
|
|
87
|
+
from mlquantify.utils import get_real_prev
|
|
88
|
+
|
|
86
89
|
from sklearn.ensemble import RandomForestClassifier
|
|
87
90
|
from sklearn.datasets import load_breast_cancer
|
|
88
91
|
from sklearn.model_selection import train_test_split
|
|
@@ -94,19 +97,19 @@ features, target = load_breast_cancer(return_X_y=True)
|
|
|
94
97
|
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
95
98
|
|
|
96
99
|
#Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
|
|
97
|
-
model =
|
|
100
|
+
model = EMQ(RandomForestClassifier())
|
|
98
101
|
model.fit(X_train, y_train)
|
|
99
102
|
|
|
100
103
|
#Predict the class prevalence for X_test
|
|
101
104
|
pred_prevalence = model.predict(X_test)
|
|
102
|
-
real_prevalence =
|
|
105
|
+
real_prevalence = get_real_prev(y_test)
|
|
103
106
|
|
|
104
107
|
#Get the error for the prediction
|
|
105
|
-
ae =
|
|
106
|
-
|
|
108
|
+
ae = absolute_error(real_prevalence, pred_prevalence)
|
|
109
|
+
mae = mean_absolute_error(real_prevalence, pred_prevalence)
|
|
107
110
|
|
|
108
|
-
print(f"
|
|
109
|
-
print(f"
|
|
111
|
+
print(f"Absolute Error -> {ae}")
|
|
112
|
+
print(f"Mean Absolute Error -> {mae}")
|
|
110
113
|
```
|
|
111
114
|
|
|
112
115
|
___
|
|
@@ -125,7 +128,7 @@ ___
|
|
|
125
128
|
|
|
126
129
|
## Documentation
|
|
127
130
|
|
|
128
|
-
##### API is avaliable [here](
|
|
131
|
+
##### API is available [here](https://luizfernandolj.github.io/mlquantify/api/index.html)
|
|
129
132
|
|
|
130
133
|
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
131
134
|
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
|
@@ -9,9 +9,9 @@ ___
|
|
|
9
9
|
|
|
10
10
|
## Latest Release
|
|
11
11
|
|
|
12
|
-
- **Version 0.
|
|
13
|
-
- In case you need any help, refer to the [
|
|
14
|
-
- Explore the [API documentation](
|
|
12
|
+
- **Version 0.1.3**: Initial beta version. For a detailed list of changes, check the [changelog](#).
|
|
13
|
+
- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
|
|
14
|
+
- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
|
|
15
15
|
- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
|
|
16
16
|
|
|
17
17
|
___
|
|
@@ -39,7 +39,7 @@ ___
|
|
|
39
39
|
| **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
|
|
40
40
|
| **Dynamic class management** | All methods are dynamic and handle multiclass and binary problems; in case of binary it makes One-Vs-All (OVA) automatically. |
|
|
41
41
|
| **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
|
|
42
|
-
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE,
|
|
42
|
+
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
|
|
43
43
|
| **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.). |
|
|
44
44
|
| **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
|
|
45
45
|
| **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
|
|
@@ -51,7 +51,10 @@ ___
|
|
|
51
51
|
This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and the mean absolute error between the real and predicted prevalences.
|
|
52
52
|
|
|
53
53
|
```python
|
|
54
|
-
|
|
54
|
+
from mlquantify.methods import EMQ
|
|
55
|
+
from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
|
|
56
|
+
from mlquantify.utils import get_real_prev
|
|
57
|
+
|
|
55
58
|
from sklearn.ensemble import RandomForestClassifier
|
|
56
59
|
from sklearn.datasets import load_breast_cancer
|
|
57
60
|
from sklearn.model_selection import train_test_split
|
|
@@ -63,19 +66,19 @@ features, target = load_breast_cancer(return_X_y=True)
|
|
|
63
66
|
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
64
67
|
|
|
65
68
|
#Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
|
|
66
|
-
model =
|
|
69
|
+
model = EMQ(RandomForestClassifier())
|
|
67
70
|
model.fit(X_train, y_train)
|
|
68
71
|
|
|
69
72
|
#Predict the class prevalence for X_test
|
|
70
73
|
pred_prevalence = model.predict(X_test)
|
|
71
|
-
real_prevalence =
|
|
74
|
+
real_prevalence = get_real_prev(y_test)
|
|
72
75
|
|
|
73
76
|
#Get the error for the prediction
|
|
74
|
-
ae =
|
|
75
|
-
|
|
77
|
+
ae = absolute_error(real_prevalence, pred_prevalence)
|
|
78
|
+
mae = mean_absolute_error(real_prevalence, pred_prevalence)
|
|
76
79
|
|
|
77
|
-
print(f"
|
|
78
|
-
print(f"
|
|
80
|
+
print(f"Absolute Error -> {ae}")
|
|
81
|
+
print(f"Mean Absolute Error -> {mae}")
|
|
79
82
|
```
|
|
80
83
|
|
|
81
84
|
___
|
|
@@ -94,7 +97,7 @@ ___
|
|
|
94
97
|
|
|
95
98
|
## Documentation
|
|
96
99
|
|
|
97
|
-
##### API is avaliable [here](
|
|
100
|
+
##### API is available [here](https://luizfernandolj.github.io/mlquantify/api/index.html)
|
|
98
101
|
|
|
99
102
|
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
100
103
|
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.1.4
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import Generator, Tuple
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
|
|
6
|
+
from ..utils.general import *
|
|
7
|
+
|
|
8
|
+
class Protocol(ABC):
    """Base class for evaluation protocols.

    A protocol turns a labelled dataset into a sequence of index batches.
    Subclasses implement :meth:`_iter_indices`; callers iterate over
    :meth:`split`.

    Parameters
    ----------
    batch_size : int or sequence of int
        The size of the batches to be used in the evaluation. A single
        integer is wrapped in a one-element list.
    random_state : int, optional
        The random seed for reproducibility. The base class only stores it;
        it draws no random numbers itself.
    **kwargs
        Extra protocol parameters. Each one is stored as an attribute of the
        same name and must be an int/float or a list.

    Attributes
    ----------
    batch_size : list of int
        The batch sizes to iterate over.
    random_state : int or None
        The seed given at construction time.
    n_combinations : int
        The number of batch sizes multiplied, for every extra keyword
        argument, by its length (lists) or by its value (numbers).

    Raises
    ------
    ValueError
        If the batch size is not a positive integer or a sequence of positive
        integers, or if an extra keyword argument is neither a number nor a
        list.

    Notes
    -----
    This class serves as a base class for different evaluation protocols, each
    with its own strategy for splitting the data into batches.

    Examples
    --------
    >>> class MyCustomProtocol(Protocol):
    ...     def _iter_indices(self, X, y):
    ...         for batch_size in self.batch_size:
    ...             yield np.random.choice(X.shape[0], batch_size, replace=True)
    ...
    >>> protocol = MyCustomProtocol(batch_size=100, random_state=42)
    >>> for idx in protocol.split(X, y):
    ...     X_batch, y_batch = X[idx], y[idx]
    """

    def __init__(self, batch_size, random_state=None, **kwargs):
        if isinstance(batch_size, (int, np.integer)):
            batch_sizes = [batch_size]
        else:
            try:
                # Copy so later mutation of the caller's sequence cannot
                # change the protocol behind its back.
                batch_sizes = list(batch_size)
            except TypeError:
                raise ValueError(
                    f"Invalid batch_size={batch_size!r}: must be a positive "
                    "integer or a sequence of positive integers."
                ) from None

        for size in batch_sizes:
            if not isinstance(size, (int, np.integer)) or size <= 0:
                raise ValueError(
                    f"Invalid batch_size={batch_size!r}: must be a positive "
                    "integer or a sequence of positive integers."
                )

        self.batch_size = batch_sizes
        self.random_state = random_state
        self.n_combinations = len(batch_sizes)

        for name, value in kwargs.items():
            setattr(self, name, value)
            if isinstance(value, list):
                self.n_combinations *= len(value)
            elif isinstance(value, (int, float)):
                self.n_combinations *= value
            else:
                raise ValueError(f"Invalid argument {name}={value}: must be int/float or list of int/float.")

    def split(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, None, None]:
        """
        Split the data into samples for evaluation.

        Parameters
        ----------
        X : np.ndarray
            The input features.
        y : np.ndarray
            The target labels.

        Yields
        ------
        np.ndarray
            The row indices of one batch, exactly as produced by
            :meth:`_iter_indices`. Duplicates from sampling with replacement
            are kept, so each batch has the size the protocol requested.
        """
        indices = np.arange(X.shape[0])
        for idx in self._iter_indices(X, y):
            # Index with the sampled positions directly. Going through a
            # boolean mask would silently drop repeated indices.
            yield indices[idx]

    def _split_indices_masks(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, None, None]:
        """Yield one boolean membership mask of length ``X.shape[0]`` per batch.

        A mask only records which rows were selected, so repeated indices
        within a batch collapse to a single ``True`` entry.
        """
        for idx in self._iter_indices(X, y):
            mask = np.zeros(X.shape[0], dtype=bool)
            mask[idx] = True
            yield mask

    @abstractmethod
    def _iter_indices(self, X, y):
        """Abstract method to be implemented by subclasses to yield indices for each batch."""
        pass

    def get_n_combinations(self) -> int:
        """
        Get the number of combinations for the current protocol.
        """
        return self.n_combinations
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class APP(Protocol):
    """Artificial Prevalence Protocol (APP) for evaluation.

    This protocol generates artificial prevalence distributions for the
    evaluation in an exhaustive manner, testing all possible combinations of
    prevalences.

    Parameters
    ----------
    batch_size : int or list of int
        The size of the batches to be used in the evaluation.
    n_prevalences : int
        The number of artificial prevalences to generate (forwarded as
        ``n_prev`` to ``generate_artificial_prevalences``).
    repeats : int, optional
        The number of repetitions (forwarded as ``n_iter`` to
        ``generate_artificial_prevalences``). Default is 1.
    random_state : int, optional
        The random seed for reproducibility.

    Attributes
    ----------
    n_prevalences : int
        The number of artificial prevalences to generate.
    repeats : int
        The number of repetitions.
    random_state : int
        The random seed for reproducibility.

    Notes
    -----
    It is important to note that in case of multiclass problems, the time
    complexity of this protocol can be significantly higher due to the
    increased number of combinations to evaluate.

    Examples
    --------
    >>> protocol = APP(batch_size=[100, 200], n_prevalences=5, repeats=3, random_state=42)
    >>> for idx in protocol.split(X, y):
    ...     X_batch, y_batch = X[idx], y[idx]
    """

    def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
        super().__init__(batch_size=batch_size,
                         random_state=random_state,
                         n_prevalences=n_prevalences,
                         repeats=repeats)

    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, None, None]:
        """Yield, for every batch size, one index batch per generated prevalence."""
        # One prevalence entry per distinct label found in y.
        n_dim = len(np.unique(y))

        for batch_size in self.batch_size:
            prevalences = generate_artificial_prevalences(n_dim=n_dim,
                                                          n_prev=self.n_prevalences,
                                                          n_iter=self.repeats)
            for prev in prevalences:
                yield get_indexes_with_prevalence(y, prev, batch_size)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class NPP(Protocol):
    """No Prevalence Protocol (NPP) for evaluation.

    This protocol just samples the data without any consideration for
    prevalence, with all instances having equal probability of being selected.
    Sampling is done with replacement.

    Parameters
    ----------
    batch_size : int or list of int
        The size of the batches to be used in the evaluation.
    random_state : int, optional
        The random seed for reproducibility. When given, every call to
        ``split`` draws the same batches; when ``None``, NumPy's global
        random state is used.

    Attributes
    ----------
    batch_size : list of int
        The batch sizes to iterate over.
    random_state : int or None
        The random seed for reproducibility.

    Examples
    --------
    >>> protocol = NPP(batch_size=100, random_state=42)
    >>> for idx in protocol.split(X, y):
    ...     X_batch, y_batch = X[idx], y[idx]
    """

    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, None, None]:
        """Yield one uniformly drawn index batch per configured batch size."""
        seed = self.random_state
        if seed is None:
            # No seed requested: keep drawing from NumPy's global state so
            # callers relying on np.random.seed() see unchanged behaviour.
            rng = np.random
        elif isinstance(seed, np.random.RandomState):
            rng = seed
        else:
            rng = np.random.RandomState(seed)

        for batch_size in self.batch_size:
            yield rng.choice(X.shape[0], batch_size, replace=True)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class UPP(Protocol):
    """Uniform Prevalence Protocol (UPP) for evaluation.

    An extension of the APP that generates artificial prevalence distributions
    uniformly across all classes utilizing the kraemer sampling method.

    Parameters
    ----------
    batch_size : int or list of int
        The size of the batches to be used in the evaluation.
    n_prevalences : int
        The number of artificial prevalences to generate (forwarded as
        ``n_prev`` to ``kraemer_sampling``).
    repeats : int, optional
        The number of repetitions (forwarded as ``n_iter`` to
        ``kraemer_sampling``). Default is 1.
    random_state : int, optional
        The random seed for reproducibility.

    Attributes
    ----------
    n_prevalences : int
        The number of artificial prevalences to generate.
    repeats : int
        The number of repetitions.
    random_state : int
        The random seed for reproducibility.

    Examples
    --------
    >>> protocol = UPP(batch_size=100, n_prevalences=5, repeats=3, random_state=42)
    >>> for idx in protocol.split(X, y):
    ...     X_batch, y_batch = X[idx], y[idx]
    """

    def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
        super().__init__(batch_size=batch_size,
                         random_state=random_state,
                         n_prevalences=n_prevalences,
                         repeats=repeats)

    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, None, None]:
        """Yield, for every batch size, one index batch per sampled prevalence."""
        # One prevalence entry per distinct label found in y.
        n_dim = len(np.unique(y))

        for batch_size in self.batch_size:
            # Prevalences are sampled again for each batch size.
            prevalences = kraemer_sampling(n_dim=n_dim,
                                           n_prev=self.n_prevalences,
                                           n_iter=self.repeats)

            for prev in prevalences:
                yield get_indexes_with_prevalence(y, prev, batch_size)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
class PPP(Protocol):
    """Personalized Prevalence Protocol (PPP) for evaluation.

    This protocol generates artificial prevalence distributions personalized
    for each class.

    Parameters
    ----------
    batch_size : int or list of int
        The size of the batches to be used in the evaluation.
    prevalences : list of float or list of list of float
        The prevalences to evaluate. A scalar entry ``p`` is expanded to the
        binary distribution ``[1 - p, p]``; any other entry is passed through
        unchanged as a full per-class prevalence vector.
    repeats : int, optional
        The number of batches drawn for every (batch size, prevalence) pair.
        Default is 1.
    random_state : int, optional
        The random seed for reproducibility.

    Attributes
    ----------
    prevalences : list
        The prevalences given at construction time.
    repeats : int
        The number of batches drawn per (batch size, prevalence) pair.
    random_state : int
        The random seed for reproducibility.

    Examples
    --------
    >>> protocol = PPP(batch_size=100, prevalences=[0.1, 0.9], repeats=3, random_state=42)
    >>> for idx in protocol.split(X, y):
    ...     X_batch, y_batch = X[idx], y[idx]
    """

    def __init__(self, batch_size, prevalences, repeats=1, random_state=None):
        super().__init__(batch_size=batch_size,
                         random_state=random_state,
                         prevalences=prevalences,
                         repeats=repeats)

    def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, None, None]:
        """Yield ``repeats`` index batches for every batch size and prevalence."""
        for batch_size in self.batch_size:
            for prev in self.prevalences:
                # A bare number is the prevalence of the second class in a
                # binary problem; this includes the integer extremes 0 and 1.
                if isinstance(prev, (int, float)):
                    prev = [1 - prev, prev]

                # Draw the requested number of batches so the output count
                # matches n_combinations, which already includes repeats.
                for _ in range(self.repeats):
                    yield get_indexes_with_prevalence(y, prev, batch_size)
|
|
297
|
+
|
|
@@ -26,12 +26,9 @@ def convert_columns_to_arrays(df, columns:list = ['PRED_PREVS', 'REAL_PREVS']):
|
|
|
26
26
|
return df
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:list):
|
|
29
|
+
def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
|
|
33
30
|
"""
|
|
34
|
-
|
|
31
|
+
Get indexes for a stratified sample based on the prevalence of each class.
|
|
35
32
|
|
|
36
33
|
Parameters
|
|
37
34
|
----------
|
|
@@ -48,10 +45,13 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
|
|
|
48
45
|
-------
|
|
49
46
|
list
|
|
50
47
|
List of indexes for the stratified sample.
|
|
51
|
-
"""
|
|
48
|
+
"""
|
|
49
|
+
classes = np.unique(y)
|
|
50
|
+
|
|
52
51
|
# Ensure the sum of prevalences is 1
|
|
53
52
|
assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
|
|
54
53
|
# Ensure the number of prevalences matches the number of classes
|
|
54
|
+
assert len(prevalence) == len(classes), "The number of prevalences must match the number of classes"
|
|
55
55
|
|
|
56
56
|
sampled_indexes = []
|
|
57
57
|
total_sampled = 0
|
|
@@ -78,6 +78,43 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
|
|
|
78
78
|
|
|
79
79
|
|
|
80
80
|
|
|
81
|
+
def kraemer_sampling(n_dim: int, n_prev: int, n_iter: int = 1) -> np.ndarray:
    """
    Uniform sampling from the unit simplex using Kraemer's algorithm.

    Parameters
    ----------
    n_dim : int
        Number of dimensions (entries per prevalence vector). Must be >= 1.
    n_prev : int
        Number of distinct prevalence vectors to sample. Must be >= 0.
    n_iter : int
        Number of times each sampled vector appears in the output. Values
        below 2 leave the sample unrepeated.

    Returns
    -------
    np.ndarray
        Array of sampled prevalences with ``n_dim`` columns whose rows sum
        to 1. When ``n_iter > 1`` each sampled row is duplicated ``n_iter``
        times consecutively (the rows are not sampled again), giving
        ``n_prev * n_iter`` rows; otherwise the array has ``n_prev`` rows.

    Raises
    ------
    ValueError
        If ``n_dim`` is smaller than 1 or ``n_prev`` is negative.
    """
    if n_dim < 1:
        raise ValueError(f"n_dim must be at least 1, got {n_dim}.")
    if n_prev < 0:
        raise ValueError(f"n_prev must be non-negative, got {n_prev}.")

    if n_dim == 2:
        # Binary case: one uniform draw fixes both entries.
        u = np.random.rand(n_prev)
        prevs = np.vstack([1 - u, u]).T
    else:
        # Sort n_dim - 1 uniform cut points in [0, 1]; the gaps between
        # consecutive cuts, including the end points 0 and 1, form one
        # point of the simplex per row.
        u = np.random.rand(n_prev, n_dim - 1)
        u.sort(axis=-1)
        lower = np.hstack([np.zeros((n_prev, 1)), u])
        upper = np.hstack([u, np.ones((n_prev, 1))])
        prevs = upper - lower

    # Duplicate each sampled row n_iter times; no new sampling happens here.
    return np.repeat(prevs, n_iter, axis=0) if n_iter > 1 else prevs
|
|
117
|
+
|
|
81
118
|
|
|
82
119
|
def generate_artificial_prevalences(n_dim: int, n_prev: int, n_iter: int) -> np.ndarray:
|
|
83
120
|
"""Generates n artificial prevalences with n dimensions.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mlquantify
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Quantification Library
|
|
5
5
|
Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
|
|
6
6
|
Maintainer: Luiz Fernando Luth Junior
|
|
@@ -40,9 +40,9 @@ ___
|
|
|
40
40
|
|
|
41
41
|
## Latest Release
|
|
42
42
|
|
|
43
|
-
- **Version 0.
|
|
44
|
-
- In case you need any help, refer to the [
|
|
45
|
-
- Explore the [API documentation](
|
|
43
|
+
- **Version 0.1.3**: Initial beta version. For a detailed list of changes, check the [changelog](#).
|
|
44
|
+
- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
|
|
45
|
+
- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
|
|
46
46
|
- See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
|
|
47
47
|
|
|
48
48
|
___
|
|
@@ -70,7 +70,7 @@ ___
|
|
|
70
70
|
| **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
|
|
71
71
|
| **Dynamic class management** | All methods are dynamic and handle multiclass and binary problems; in case of binary it makes One-Vs-All (OVA) automatically. |
|
|
72
72
|
| **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
|
|
73
|
-
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE,
|
|
73
|
+
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
|
|
74
74
|
| **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.). |
|
|
75
75
|
| **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
|
|
76
76
|
| **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
|
|
@@ -82,7 +82,10 @@ ___
|
|
|
82
82
|
This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and the mean absolute error between the real and predicted prevalences.
|
|
83
83
|
|
|
84
84
|
```python
|
|
85
|
-
|
|
85
|
+
from mlquantify.methods import EMQ
|
|
86
|
+
from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
|
|
87
|
+
from mlquantify.utils import get_real_prev
|
|
88
|
+
|
|
86
89
|
from sklearn.ensemble import RandomForestClassifier
|
|
87
90
|
from sklearn.datasets import load_breast_cancer
|
|
88
91
|
from sklearn.model_selection import train_test_split
|
|
@@ -94,19 +97,19 @@ features, target = load_breast_cancer(return_X_y=True)
|
|
|
94
97
|
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
95
98
|
|
|
96
99
|
#Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
|
|
97
|
-
model =
|
|
100
|
+
model = EMQ(RandomForestClassifier())
|
|
98
101
|
model.fit(X_train, y_train)
|
|
99
102
|
|
|
100
103
|
#Predict the class prevalence for X_test
|
|
101
104
|
pred_prevalence = model.predict(X_test)
|
|
102
|
-
real_prevalence =
|
|
105
|
+
real_prevalence = get_real_prev(y_test)
|
|
103
106
|
|
|
104
107
|
#Get the error for the prediction
|
|
105
|
-
ae =
|
|
106
|
-
|
|
108
|
+
ae = absolute_error(real_prevalence, pred_prevalence)
|
|
109
|
+
mae = mean_absolute_error(real_prevalence, pred_prevalence)
|
|
107
110
|
|
|
108
|
-
print(f"
|
|
109
|
-
print(f"
|
|
111
|
+
print(f"Absolute Error -> {ae}")
|
|
112
|
+
print(f"Mean Absolute Error -> {mae}")
|
|
110
113
|
```
|
|
111
114
|
|
|
112
115
|
___
|
|
@@ -125,7 +128,7 @@ ___
|
|
|
125
128
|
|
|
126
129
|
## Documentation
|
|
127
130
|
|
|
128
|
-
##### API is avaliable [here](
|
|
131
|
+
##### API is available [here](https://luizfernandolj.github.io/mlquantify/api/index.html)
|
|
129
132
|
|
|
130
133
|
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
131
134
|
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
mlquantify-0.1.3/VERSION.txt
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.1.3
|
|
@@ -1,647 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
import numpy as np
|
|
3
|
-
import pandas as pd
|
|
4
|
-
from typing import Union, List, Tuple, Any
|
|
5
|
-
from sklearn.base import BaseEstimator
|
|
6
|
-
from time import time
|
|
7
|
-
from tqdm import tqdm
|
|
8
|
-
|
|
9
|
-
from ..methods import METHODS, AGGREGATIVE, NON_AGGREGATIVE
|
|
10
|
-
from ..utils.general import *
|
|
11
|
-
from ..utils.method import *
|
|
12
|
-
from . import MEASURES
|
|
13
|
-
from ..base import Quantifier
|
|
14
|
-
|
|
15
|
-
import mlquantify as mq
|
|
16
|
-
|
|
17
|
-
class Protocol(ABC):
|
|
18
|
-
"""Base class for evaluation protocols.
|
|
19
|
-
|
|
20
|
-
Parameters
|
|
21
|
-
----------
|
|
22
|
-
models : Union[List[Union[str, Quantifier]], str, Quantifier]
|
|
23
|
-
List of quantification models, a single model name, or 'all' for all models.
|
|
24
|
-
learner : BaseEstimator, optional
|
|
25
|
-
Machine learning model to be used with the quantifiers. Required for model methods.
|
|
26
|
-
n_jobs : int, optional
|
|
27
|
-
Number of jobs to run in parallel. Default is 1.
|
|
28
|
-
random_state : int, optional
|
|
29
|
-
Seed for random number generation. Default is 32.
|
|
30
|
-
verbose : bool, optional
|
|
31
|
-
Whether to print progress messages. Default is False.
|
|
32
|
-
return_type : str, optional
|
|
33
|
-
Type of return value ('predictions' or 'table'). Default is 'predictions'.
|
|
34
|
-
measures : List[str], optional
|
|
35
|
-
List of error measures to calculate. Must be in MEASURES or None. Default is None.
|
|
36
|
-
columns : List[str], optional
|
|
37
|
-
Columns to be included in the table. Default is ['ITERATION', 'QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'BATCH_SIZE'].
|
|
38
|
-
|
|
39
|
-
Attributes
|
|
40
|
-
----------
|
|
41
|
-
models : List[Quantifier]
|
|
42
|
-
List of quantification models.
|
|
43
|
-
learner : BaseEstimator
|
|
44
|
-
Machine learning model to be used with the quantifiers.
|
|
45
|
-
n_jobs : int
|
|
46
|
-
Number of jobs to run in parallel.
|
|
47
|
-
random_state : int
|
|
48
|
-
Seed for random number generation.
|
|
49
|
-
verbose : bool
|
|
50
|
-
Whether to print progress messages.
|
|
51
|
-
return_type : str
|
|
52
|
-
Type of return value ('predictions' or 'table').
|
|
53
|
-
measures : List[str]
|
|
54
|
-
List of error measures to calculate.
|
|
55
|
-
columns : List[str]
|
|
56
|
-
Columns to be included in the table.
|
|
57
|
-
|
|
58
|
-
Raises
|
|
59
|
-
------
|
|
60
|
-
AssertionError
|
|
61
|
-
If measures contain invalid error measures.
|
|
62
|
-
If return_type is invalid.
|
|
63
|
-
If columns does not contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
|
|
64
|
-
|
|
65
|
-
Notes
|
|
66
|
-
-----
|
|
67
|
-
- The 'models' parameter can be a list of Quantifiers, a single Quantifier, a list of model names, a single model name, or 'all'.
|
|
68
|
-
- If 'models' is a list of model names or 'all', 'learner' must be provided.
|
|
69
|
-
- The 'all' option for 'models' will use all quantification models available in the library.
|
|
70
|
-
- If 'models' is a Quantifier or a list of Quantifiers, 'learner' is not required, but the models must be initialized.
|
|
71
|
-
- You can pass your own model by passing a Quantifier object.
|
|
72
|
-
- Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
|
|
73
|
-
- If 'return_type' is 'table', the table will contain the columns specified in 'columns' and the error measures in 'measures'.
|
|
74
|
-
- For creating your own protocol, you must have the attributes 'models', 'learner', 'n_jobs', 'random_state', 'verbose', 'return_type', 'measures', and 'columns'; the columns can be changed, as long as they contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
|
|
75
|
-
|
|
76
|
-
See Also
|
|
77
|
-
--------
|
|
78
|
-
APP : Artificial Prevalence Protocol.
|
|
79
|
-
NPP : Natural Prevalence Protocol.
|
|
80
|
-
Quantifier : Base class for quantification methods.
|
|
81
|
-
|
|
82
|
-
Examples
|
|
83
|
-
--------
|
|
84
|
-
>>> import numpy as np
|
|
85
|
-
>>> from mlquantify.evaluation.protocol import Protocol
|
|
86
|
-
>>> from mlquantify.utils import get_real_prev
|
|
87
|
-
>>> from sklearn.ensemble import RandomForestClassifier
|
|
88
|
-
>>> from sklearn.datasets import load_breast_cancer
|
|
89
|
-
>>> from sklearn.model_selection import train_test_split
|
|
90
|
-
>>> import time as t
|
|
91
|
-
>>>
|
|
92
|
-
>>> class MyProtocol(Protocol):
|
|
93
|
-
... def __init__(self,
|
|
94
|
-
... models,
|
|
95
|
-
... learner,
|
|
96
|
-
... n_jobs,
|
|
97
|
-
... random_state,
|
|
98
|
-
... verbose,
|
|
99
|
-
... return_type,
|
|
100
|
-
... measures,
|
|
101
|
-
... sample_size,
|
|
102
|
-
... iterations=10):
|
|
103
|
-
... super().__init__(models,
|
|
104
|
-
... learner,
|
|
105
|
-
... n_jobs,
|
|
106
|
-
... random_state,
|
|
107
|
-
... verbose,
|
|
108
|
-
... return_type,
|
|
109
|
-
... measures,
|
|
110
|
-
... columns=['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'TIME'])
|
|
111
|
-
... self.sample_size = sample_size
|
|
112
|
-
... self.iterations = iterations
|
|
113
|
-
...
|
|
114
|
-
... def predict_protocol(self, X_test, y_test):
|
|
115
|
-
... predictions = []
|
|
116
|
-
...
|
|
117
|
-
... X_sample, y_sample = self._new_sample(X_test, y_test)
|
|
118
|
-
...
|
|
119
|
-
... for _ in range(self.iterations):
|
|
120
|
-
... for model in self.models:
|
|
121
|
-
... quantifier = model.__class__.__name__
|
|
122
|
-
...
|
|
123
|
-
... real_prev = get_real_prev(y_sample)
|
|
124
|
-
...
|
|
125
|
-
... start_time = t.time()
|
|
126
|
-
... pred_prev = model.predict(X_sample)
|
|
127
|
-
... end_time = t.time()
|
|
128
|
-
... time = end_time - start_time
|
|
129
|
-
...
|
|
130
|
-
... predictions.append([quantifier, real_prev, pred_prev, time])
|
|
131
|
-
...
|
|
132
|
-
... return predictions
|
|
133
|
-
...
|
|
134
|
-
... def _new_sample(self, X_test, y_test):
|
|
135
|
-
... indexes = np.random.choice(len(X_test), size=self.sample_size, replace=False)
|
|
136
|
-
... return X_test[indexes], y_test[indexes]
|
|
137
|
-
>>>
|
|
138
|
-
>>>
|
|
139
|
-
>>> features, target = load_breast_cancer(return_X_y=True)
|
|
140
|
-
>>>
|
|
141
|
-
>>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.5, random_state=42)
|
|
142
|
-
>>>
|
|
143
|
-
>>> protocol = MyProtocol(models=["CC", "EMQ", "DyS"], # or [CC(learner), EMQ(learner), DyS(learner)]
|
|
144
|
-
... learner=RandomForestClassifier(),
|
|
145
|
-
... n_jobs=1,
|
|
146
|
-
... random_state=42,
|
|
147
|
-
... verbose=True,
|
|
148
|
-
... return_type="table",
|
|
149
|
-
... measures=None,
|
|
150
|
-
... sample_size=100)
|
|
151
|
-
>>>
|
|
152
|
-
>>> protocol.fit(X_train, y_train)
|
|
153
|
-
>>> table = protocol.predict(X_test, y_test)
|
|
154
|
-
>>> print(table)
|
|
155
|
-
|
|
156
|
-
"""
|
|
157
|
-
|
|
158
|
-
def __init__(self,
|
|
159
|
-
models: Union[List[Union[str, Quantifier]], str, Quantifier],
|
|
160
|
-
learner: BaseEstimator = None,
|
|
161
|
-
n_jobs: int = 1,
|
|
162
|
-
random_state: int = 32,
|
|
163
|
-
verbose: bool = False,
|
|
164
|
-
return_type: str = "predictions",
|
|
165
|
-
measures: List[str] = None,
|
|
166
|
-
columns: List[str] = ["ITERATION", "QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]):
|
|
167
|
-
|
|
168
|
-
assert not measures or all(m in MEASURES for m in measures), \
|
|
169
|
-
f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
|
|
170
|
-
assert return_type in ["predictions", "table"], \
|
|
171
|
-
"Invalid return_type. Valid options: ['predictions', 'table']"
|
|
172
|
-
assert all(col in columns for col in ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS"]), \
|
|
173
|
-
"Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS']"
|
|
174
|
-
|
|
175
|
-
# Fixed parameters
|
|
176
|
-
self.models = self._initialize_models(models, learner)
|
|
177
|
-
self.learner = learner
|
|
178
|
-
self.n_jobs = n_jobs
|
|
179
|
-
self.random_state = random_state
|
|
180
|
-
self.verbose = verbose
|
|
181
|
-
self.return_type = return_type
|
|
182
|
-
self.measures = measures
|
|
183
|
-
self.columns = columns
|
|
184
|
-
|
|
185
|
-
def _initialize_models(self, models, learner):
|
|
186
|
-
"""Initializes the quantification models.
|
|
187
|
-
|
|
188
|
-
Parameters
|
|
189
|
-
----------
|
|
190
|
-
models : Union[List[Union[str, Quantifier]], str, Quantifier]
|
|
191
|
-
List of quantification models, a single model name, or 'all' for all models.
|
|
192
|
-
learner : BaseEstimator
|
|
193
|
-
Machine learning model to be used with the quantifiers.
|
|
194
|
-
|
|
195
|
-
Returns
|
|
196
|
-
-------
|
|
197
|
-
List[Quantifier]
|
|
198
|
-
List of quantification models.
|
|
199
|
-
"""
|
|
200
|
-
if isinstance(models, list):
|
|
201
|
-
if all(isinstance(model, Quantifier) for model in models):
|
|
202
|
-
return models
|
|
203
|
-
return [get_method(model)(learner) for model in models]
|
|
204
|
-
|
|
205
|
-
if isinstance(models, Quantifier):
|
|
206
|
-
return [models]
|
|
207
|
-
|
|
208
|
-
assert learner is not None, "Learner is required for model methods."
|
|
209
|
-
|
|
210
|
-
model_dict = {
|
|
211
|
-
"all": METHODS.values,
|
|
212
|
-
"aggregative": AGGREGATIVE.values,
|
|
213
|
-
"non_aggregative": NON_AGGREGATIVE.values
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
if models in model_dict:
|
|
217
|
-
return [model(learner) if hasattr(model, "learner") else model() for model in model_dict[models]()]
|
|
218
|
-
return [get_method(models)(learner)]
|
|
219
|
-
|
|
220
|
-
def sout(self, msg):
|
|
221
|
-
"""Prints a message if verbose is True."""
|
|
222
|
-
if self.verbose:
|
|
223
|
-
print('[APP]' + msg)
|
|
224
|
-
|
|
225
|
-
def fit(self, X_train, y_train):
|
|
226
|
-
"""Fits the models with the training data.
|
|
227
|
-
|
|
228
|
-
Parameters
|
|
229
|
-
----------
|
|
230
|
-
X_train : np.ndarray
|
|
231
|
-
Features of the training set.
|
|
232
|
-
y_train : np.ndarray
|
|
233
|
-
Labels of the training set.
|
|
234
|
-
|
|
235
|
-
Returns
|
|
236
|
-
-------
|
|
237
|
-
Protocol
|
|
238
|
-
Fitted protocol.
|
|
239
|
-
"""
|
|
240
|
-
self.sout("Fitting models")
|
|
241
|
-
|
|
242
|
-
args = ((model, X_train, y_train) for model in self.models)
|
|
243
|
-
|
|
244
|
-
wrapper = tqdm if self.verbose else lambda x, **kwargs: x
|
|
245
|
-
|
|
246
|
-
self.models = Parallel(n_jobs=self.n_jobs)( # Parallel processing of models
|
|
247
|
-
delayed(self._delayed_fit)(*arg) for arg in wrapper(args, desc="Fitting models", total=len(self.models))
|
|
248
|
-
)
|
|
249
|
-
self.sout("Fit [Done]")
|
|
250
|
-
return self
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
def predict(self, X_test: np.ndarray, y_test: np.ndarray) -> Any:
|
|
254
|
-
"""Predicts the prevalence for the test set.
|
|
255
|
-
|
|
256
|
-
Parameters
|
|
257
|
-
----------
|
|
258
|
-
X_test : np.ndarray
|
|
259
|
-
Features of the test set.
|
|
260
|
-
y_test : np.ndarray
|
|
261
|
-
Labels of the test set.
|
|
262
|
-
|
|
263
|
-
Returns
|
|
264
|
-
-------
|
|
265
|
-
Any
|
|
266
|
-
Predictions for the test set. Can be a table or a tuple with the quantifier names, real prevalence, and predicted prevalence.
|
|
267
|
-
"""
|
|
268
|
-
predictions = self.predict_protocol(X_test, y_test)
|
|
269
|
-
predictions_df = pd.DataFrame(predictions, columns=self.columns)
|
|
270
|
-
|
|
271
|
-
if self.return_type == "table":
|
|
272
|
-
if self.measures:
|
|
273
|
-
smoothed_factor = 1 / (2 * len(X_test))
|
|
274
|
-
|
|
275
|
-
def smooth(values: np.ndarray) -> np.ndarray:
|
|
276
|
-
return (values + smoothed_factor) / (smoothed_factor * len(values) + 1)
|
|
277
|
-
|
|
278
|
-
for metric in self.measures:
|
|
279
|
-
predictions_df[metric] = predictions_df.apply(
|
|
280
|
-
lambda row: get_measure(metric)(
|
|
281
|
-
smooth(np.array(row["REAL_PREVS"])),
|
|
282
|
-
smooth(np.array(row["PRED_PREVS"]))
|
|
283
|
-
),
|
|
284
|
-
axis=1
|
|
285
|
-
)
|
|
286
|
-
return predictions_df
|
|
287
|
-
|
|
288
|
-
return (
|
|
289
|
-
predictions_df["QUANTIFIER"].to_numpy(), # Quantifier names
|
|
290
|
-
np.stack(predictions_df["REAL_PREVS"].to_numpy()), # REAL_PREVS
|
|
291
|
-
np.stack(predictions_df["PRED_PREVS"].to_numpy()) # PRED_PREVS
|
|
292
|
-
)
|
|
293
|
-
|
|
294
|
-
@abstractmethod
|
|
295
|
-
def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> np.ndarray:
|
|
296
|
-
"""Abstract method that every protocol must implement
|
|
297
|
-
|
|
298
|
-
Parameters
|
|
299
|
-
----------
|
|
300
|
-
X_test : np.ndarray
|
|
301
|
-
Features of the test set.
|
|
302
|
-
y_test : np.ndarray
|
|
303
|
-
Labels of the test set.
|
|
304
|
-
|
|
305
|
-
Returns
|
|
306
|
-
-------
|
|
307
|
-
np.ndarray
|
|
308
|
-
Predictions for the test set. With the same format as the column names attribute.
|
|
309
|
-
"""
|
|
310
|
-
...
|
|
311
|
-
|
|
312
|
-
@abstractmethod
|
|
313
|
-
def _new_sample(self) -> Tuple[np.ndarray, np.ndarray]:
|
|
314
|
-
"""Abstract method of sample extraction for each protocol.
|
|
315
|
-
|
|
316
|
-
Returns:
|
|
317
|
-
Tuple[np.ndarray, np.ndarray]: Tuple containing X_sample and y_sample.
|
|
318
|
-
"""
|
|
319
|
-
...
|
|
320
|
-
|
|
321
|
-
@staticmethod
|
|
322
|
-
def _delayed_fit(model, X_train, y_train):
|
|
323
|
-
"""Method to fit the model in parallel.
|
|
324
|
-
|
|
325
|
-
Parameters
|
|
326
|
-
----------
|
|
327
|
-
model : Quantifier
|
|
328
|
-
Quantification model.
|
|
329
|
-
X_train : np.ndarray
|
|
330
|
-
Features of the training set.
|
|
331
|
-
y_train : np.ndarray
|
|
332
|
-
Labels of the training set.
|
|
333
|
-
|
|
334
|
-
Returns
|
|
335
|
-
-------
|
|
336
|
-
Quantifier
|
|
337
|
-
Fitted quantification model
|
|
338
|
-
"""
|
|
339
|
-
model_name = model.__class__.__name__
|
|
340
|
-
if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
|
|
341
|
-
model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
|
|
342
|
-
|
|
343
|
-
start = time()
|
|
344
|
-
model = model.fit(X=X_train, y=y_train)
|
|
345
|
-
duration = time() - start
|
|
346
|
-
print(f"\tFitted {model_name} in {duration:.3f} seconds")
|
|
347
|
-
return model
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
class APP(Protocol):
|
|
356
|
-
"""Artificial Prevalence Protocol.
|
|
357
|
-
|
|
358
|
-
This approach splits a test set into several samples of varying prevalence and sample size,
|
|
359
|
-
with n iterations. For a list of Quantifiers, it computes training and testing for
|
|
360
|
-
each one and returns either a table of results with error measures or just the predictions.
|
|
361
|
-
|
|
362
|
-
Parameters
|
|
363
|
-
----------
|
|
364
|
-
models : Union[List[Union[str, Quantifier]], str, Quantifier]
|
|
365
|
-
List of quantification models, a single model name, or 'all' for all models.
|
|
366
|
-
batch_size : Union[List[int], int]
|
|
367
|
-
Size of the batches to be processed, or a list of sizes.
|
|
368
|
-
learner : BaseEstimator, optional
|
|
369
|
-
Machine learning model to be used with the quantifiers. Required for model methods.
|
|
370
|
-
n_prevs : int, optional
|
|
371
|
-
Number of prevalence points to generate. Default is 100.
|
|
372
|
-
n_iterations : int, optional
|
|
373
|
-
Number of iterations for the protocol. Default is 1.
|
|
374
|
-
n_jobs : int, optional
|
|
375
|
-
Number of jobs to run in parallel. Default is 1.
|
|
376
|
-
random_state : int, optional
|
|
377
|
-
Seed for random number generation. Default is 32.
|
|
378
|
-
verbose : bool, optional
|
|
379
|
-
Whether to print progress messages. Default is False.
|
|
380
|
-
return_type : str, optional
|
|
381
|
-
Type of return value ('predictions' or 'table'). Default is 'predictions'.
|
|
382
|
-
measures : List[str], optional
|
|
383
|
-
List of error measures to calculate. Must be in MEASURES or None. Default is None.
|
|
384
|
-
|
|
385
|
-
Attributes
|
|
386
|
-
----------
|
|
387
|
-
models : List[Quantifier]
|
|
388
|
-
List of quantification models.
|
|
389
|
-
batch_size : Union[List[int], int]
|
|
390
|
-
Size of the batches to be processed.
|
|
391
|
-
learner : BaseEstimator
|
|
392
|
-
Machine learning model to be used with the quantifiers.
|
|
393
|
-
n_prevs : int
|
|
394
|
-
Number of prevalence points to generate.
|
|
395
|
-
n_iterations : int
|
|
396
|
-
Number of iterations for the protocol.
|
|
397
|
-
n_jobs : int
|
|
398
|
-
Number of jobs to run in parallel.
|
|
399
|
-
random_state : int
|
|
400
|
-
Seed for random number generation.
|
|
401
|
-
verbose : bool
|
|
402
|
-
Whether to print progress messages.
|
|
403
|
-
return_type : str
|
|
404
|
-
Type of return value ('predictions' or 'table').
|
|
405
|
-
measures : List[str]
|
|
406
|
-
List of error measures to calculate.
|
|
407
|
-
|
|
408
|
-
Raises
|
|
409
|
-
------
|
|
410
|
-
AssertionError
|
|
411
|
-
If return_type is invalid.
|
|
412
|
-
|
|
413
|
-
See Also
|
|
414
|
-
--------
|
|
415
|
-
Protocol : Base class for evaluation protocols.
|
|
416
|
-
NPP : Natural Prevalence Protocol.
|
|
417
|
-
Quantifier : Base class for quantification methods.
|
|
418
|
-
|
|
419
|
-
Examples
|
|
420
|
-
--------
|
|
421
|
-
>>> from mlquantify.evaluation.protocol import APP
|
|
422
|
-
>>> from sklearn.ensemble import RandomForestClassifier
|
|
423
|
-
>>> from sklearn.datasets import load_breast_cancer
|
|
424
|
-
>>> from sklearn.model_selection import train_test_split
|
|
425
|
-
>>>
|
|
426
|
-
>>> # Loading dataset from sklearn
|
|
427
|
-
>>> features, target = load_breast_cancer(return_X_y=True)
|
|
428
|
-
>>>
|
|
429
|
-
>>> #Splitting into train and test
|
|
430
|
-
>>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
431
|
-
>>>
|
|
432
|
-
>>> app = APP(models=["CC", "EMQ", "DyS"],
|
|
433
|
-
... batch_size=[10, 50, 100],
|
|
434
|
-
... learner=RandomForestClassifier(),
|
|
435
|
-
... n_prevs=100, # Default
|
|
436
|
-
... n_jobs=-1,
|
|
437
|
-
... return_type="table",
|
|
438
|
-
... measures=["ae", "se"],
|
|
439
|
-
... verbose=True)
|
|
440
|
-
>>>
|
|
441
|
-
>>> app.fit(X_train, y_train)
|
|
442
|
-
>>>
|
|
443
|
-
>>> table = app.predict(X_test, y_test)
|
|
444
|
-
>>>
|
|
445
|
-
>>> print(table)
|
|
446
|
-
"""
|
|
447
|
-
|
|
448
|
-
def __init__(self,
|
|
449
|
-
models: Union[List[Union[str, Quantifier]], str, Quantifier],
|
|
450
|
-
batch_size: Union[List[int], int],
|
|
451
|
-
learner: BaseEstimator = None,
|
|
452
|
-
n_prevs: int = 100,
|
|
453
|
-
n_iterations: int = 1,
|
|
454
|
-
n_jobs: int = 1,
|
|
455
|
-
random_state: int = 32,
|
|
456
|
-
verbose: bool = False,
|
|
457
|
-
return_type: str = "predictions",
|
|
458
|
-
measures: List[str] = None):
|
|
459
|
-
|
|
460
|
-
super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
|
|
461
|
-
self.n_prevs = n_prevs
|
|
462
|
-
self.batch_size = batch_size if isinstance(batch_size, list) else [batch_size]
|
|
463
|
-
self.n_prevs = n_prevs
|
|
464
|
-
self.n_iterations = n_iterations
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> Tuple:
|
|
468
|
-
"""Generates several samples with artificial prevalences and sizes.
|
|
469
|
-
For each model, predicts with this sample, aggregating all together
|
|
470
|
-
with a pandas dataframe if requested, or else just the predictions.
|
|
471
|
-
|
|
472
|
-
Parameters
|
|
473
|
-
----------
|
|
474
|
-
X_test : np.ndarray
|
|
475
|
-
Features of the test set.
|
|
476
|
-
y_test : np.ndarray
|
|
477
|
-
Labels of the test set.
|
|
478
|
-
|
|
479
|
-
Returns
|
|
480
|
-
-------
|
|
481
|
-
Tuple
|
|
482
|
-
Tuple containing the (iteration, model name, prev, prev_pred, and batch size).
|
|
483
|
-
"""
|
|
484
|
-
|
|
485
|
-
n_dim = len(np.unique(y_test))
|
|
486
|
-
prevs = generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)
|
|
487
|
-
|
|
488
|
-
args = [
|
|
489
|
-
(iteration, X_test, y_test, model, prev, bs, self.verbose)
|
|
490
|
-
for prev in prevs for bs in self.batch_size for model in self.models for iteration in range(self.n_iterations)
|
|
491
|
-
]
|
|
492
|
-
|
|
493
|
-
size = len(prevs) * len(self.models) * len(self.batch_size) * self.n_iterations
|
|
494
|
-
|
|
495
|
-
predictions = []
|
|
496
|
-
for arg in tqdm(args, desc="Running APP", total=size):
|
|
497
|
-
predictions.append(self._predict(*arg))
|
|
498
|
-
|
|
499
|
-
return predictions
|
|
500
|
-
|
|
501
|
-
def _predict(self, iteration:int, X: np.ndarray, y: np.ndarray, model: Any, prev: List[float], batch_size: int, verbose: bool) -> Tuple:
|
|
502
|
-
"""Method predicts into the new sample for each model and prevalence.
|
|
503
|
-
|
|
504
|
-
Parameters
|
|
505
|
-
----------
|
|
506
|
-
iteration : int
|
|
507
|
-
Current iteration.
|
|
508
|
-
X : np.ndarray
|
|
509
|
-
Features of the test set.
|
|
510
|
-
y : np.ndarray
|
|
511
|
-
Labels of the test set.
|
|
512
|
-
model : Any
|
|
513
|
-
Quantification model.
|
|
514
|
-
prev : List[float]
|
|
515
|
-
Prevalence values for the sample.
|
|
516
|
-
batch_size : int
|
|
517
|
-
Batch size for the sample.
|
|
518
|
-
verbose : bool
|
|
519
|
-
Whether to print progress messages.
|
|
520
|
-
|
|
521
|
-
Returns
|
|
522
|
-
-------
|
|
523
|
-
Tuple
|
|
524
|
-
Tuple containing the iteration, model name, prev, prev_pred, and batch size.
|
|
525
|
-
"""
|
|
526
|
-
model_name = model.__class__.__name__
|
|
527
|
-
if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
|
|
528
|
-
model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
|
|
529
|
-
|
|
530
|
-
if verbose:
|
|
531
|
-
print(f'\t {model_name} with {batch_size} instances and prev {prev}')
|
|
532
|
-
|
|
533
|
-
X_sample, _ = self._new_sample(X, y, prev, batch_size)
|
|
534
|
-
prev_pred = np.asarray(list(model.predict(X_sample).values()))
|
|
535
|
-
|
|
536
|
-
if verbose:
|
|
537
|
-
print(f'\t \\--Ending {model_name} with {batch_size} instances and prev {prev}\n')
|
|
538
|
-
|
|
539
|
-
return (iteration+1, model_name, prev, prev_pred, batch_size)
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
def _new_sample(self, X: np.ndarray, y: np.ndarray, prev: List[float], batch_size: int) -> Tuple[np.ndarray, np.ndarray]:
|
|
543
|
-
"""Generates a new sample with a specified prevalence and size.
|
|
544
|
-
|
|
545
|
-
Parameters
|
|
546
|
-
----------
|
|
547
|
-
X : np.ndarray
|
|
548
|
-
Features of the test set.
|
|
549
|
-
y : np.ndarray
|
|
550
|
-
Labels of the test set.
|
|
551
|
-
prev : List[float]
|
|
552
|
-
Prevalence values for the sample.
|
|
553
|
-
batch_size : int
|
|
554
|
-
Batch size for the sample.
|
|
555
|
-
|
|
556
|
-
Returns
|
|
557
|
-
-------
|
|
558
|
-
Tuple[np.ndarray, np.ndarray]
|
|
559
|
-
Tuple containing the new sample features and labels.
|
|
560
|
-
"""
|
|
561
|
-
sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
|
|
562
|
-
return (np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0))
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
class NPP(Protocol):
|
|
576
|
-
"""Natural Prevalence Protocol.
|
|
577
|
-
|
|
578
|
-
This approach splits a test set into several samples of varying sample size,
|
|
579
|
-
with n iterations. For a list of Quantifiers, it computes training and testing for
|
|
580
|
-
each one and returns either a table of results with error measures or just the predictions.
|
|
581
|
-
|
|
582
|
-
Parameters
|
|
583
|
-
----------
|
|
584
|
-
models : Union[List[Union[str, Quantifier]], str, Quantifier]
|
|
585
|
-
List of quantification models, a single model name, or 'all' for all models.
|
|
586
|
-
batch_size : Union[List[int], int]
|
|
587
|
-
Size of the batches to be processed, or a list of sizes.
|
|
588
|
-
learner : BaseEstimator, optional
|
|
589
|
-
Machine learning model to be used with the quantifiers. Required for model methods.
|
|
590
|
-
n_iterations : int, optional
|
|
591
|
-
Number of iterations for the protocol. Default is 1.
|
|
592
|
-
n_jobs : int, optional
|
|
593
|
-
Number of jobs to run in parallel. Default is 1.
|
|
594
|
-
random_state : int, optional
|
|
595
|
-
Seed for random number generation. Default is 32.
|
|
596
|
-
verbose : bool, optional
|
|
597
|
-
Whether to print progress messages. Default is False.
|
|
598
|
-
return_type : str, optional
|
|
599
|
-
Type of return value ('predictions' or 'table'). Default is 'predictions'.
|
|
600
|
-
measures : List[str], optional
|
|
601
|
-
List of error measures to calculate. Must be in MEASURES or None. Default is None.
|
|
602
|
-
|
|
603
|
-
Attributes
|
|
604
|
-
----------
|
|
605
|
-
models : List[Quantifier]
|
|
606
|
-
List of quantification models.
|
|
607
|
-
batch_size : Union[List[int], int]
|
|
608
|
-
Size of the batches to be processed.
|
|
609
|
-
learner : BaseEstimator
|
|
610
|
-
Machine learning model to be used with the quantifiers.
|
|
611
|
-
n_iterations : int
|
|
612
|
-
Number of iterations for the protocol.
|
|
613
|
-
n_jobs : int
|
|
614
|
-
Number of jobs to run in parallel.
|
|
615
|
-
random_state : int
|
|
616
|
-
Seed for random number generation.
|
|
617
|
-
verbose : bool
|
|
618
|
-
Whether to print progress messages.
|
|
619
|
-
return_type : str
|
|
620
|
-
Type of return value ('predictions' or 'table').
|
|
621
|
-
measures : List[str]
|
|
622
|
-
List of error measures to calculate.
|
|
623
|
-
"""
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
def __init__(self,
|
|
627
|
-
models: Union[List[Union[str, Quantifier]], str, Quantifier],
|
|
628
|
-
learner: BaseEstimator = None,
|
|
629
|
-
n_jobs: int = 1,
|
|
630
|
-
random_state: int = 32,
|
|
631
|
-
verbose: bool = False,
|
|
632
|
-
return_type: str = "predictions",
|
|
633
|
-
measures: List[str] = None):
|
|
634
|
-
|
|
635
|
-
super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
def predict_protocol(self, X_test, y_test) -> tuple:
|
|
639
|
-
raise NotImplementedError
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
|
|
643
|
-
raise NotImplementedError
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
def _delayed_predict(self, args) -> tuple:
|
|
647
|
-
raise NotImplementedError
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|