mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +32 -6
- mlquantify/base.py +559 -257
- mlquantify/classification/__init__.py +1 -1
- mlquantify/classification/methods.py +160 -0
- mlquantify/evaluation/__init__.py +14 -2
- mlquantify/evaluation/measures.py +215 -0
- mlquantify/evaluation/protocol.py +647 -0
- mlquantify/methods/__init__.py +37 -40
- mlquantify/methods/aggregative.py +1030 -0
- mlquantify/methods/meta.py +472 -0
- mlquantify/methods/mixture_models.py +1003 -0
- mlquantify/methods/non_aggregative.py +136 -0
- mlquantify/methods/threshold_optimization.py +957 -0
- mlquantify/model_selection.py +377 -232
- mlquantify/plots.py +367 -0
- mlquantify/utils/__init__.py +2 -2
- mlquantify/utils/general.py +334 -0
- mlquantify/utils/method.py +449 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
- mlquantify-0.1.1.dist-info/RECORD +22 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
- mlquantify/classification/pwkclf.py +0 -73
- mlquantify/evaluation/measures/__init__.py +0 -26
- mlquantify/evaluation/measures/ae.py +0 -11
- mlquantify/evaluation/measures/bias.py +0 -16
- mlquantify/evaluation/measures/kld.py +0 -8
- mlquantify/evaluation/measures/mse.py +0 -12
- mlquantify/evaluation/measures/nae.py +0 -16
- mlquantify/evaluation/measures/nkld.py +0 -13
- mlquantify/evaluation/measures/nrae.py +0 -16
- mlquantify/evaluation/measures/rae.py +0 -12
- mlquantify/evaluation/measures/se.py +0 -12
- mlquantify/evaluation/protocol/_Protocol.py +0 -202
- mlquantify/evaluation/protocol/__init__.py +0 -2
- mlquantify/evaluation/protocol/app.py +0 -146
- mlquantify/evaluation/protocol/npp.py +0 -34
- mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
- mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
- mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
- mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
- mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
- mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
- mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
- mlquantify/methods/aggregative/__init__.py +0 -9
- mlquantify/methods/aggregative/cc.py +0 -32
- mlquantify/methods/aggregative/emq.py +0 -86
- mlquantify/methods/aggregative/fm.py +0 -72
- mlquantify/methods/aggregative/gac.py +0 -96
- mlquantify/methods/aggregative/gpac.py +0 -87
- mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
- mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
- mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
- mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
- mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
- mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
- mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
- mlquantify/methods/aggregative/pcc.py +0 -33
- mlquantify/methods/aggregative/pwk.py +0 -38
- mlquantify/methods/meta/__init__.py +0 -1
- mlquantify/methods/meta/ensemble.py +0 -236
- mlquantify/methods/non_aggregative/__init__.py +0 -1
- mlquantify/methods/non_aggregative/hdx.py +0 -71
- mlquantify/plots/__init__.py +0 -2
- mlquantify/plots/distribution_plot.py +0 -109
- mlquantify/plots/protocol_plot.py +0 -193
- mlquantify/utils/general_purposes/__init__.py +0 -8
- mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
- mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
- mlquantify/utils/general_purposes/get_real_prev.py +0 -9
- mlquantify/utils/general_purposes/load_quantifier.py +0 -4
- mlquantify/utils/general_purposes/make_prevs.py +0 -23
- mlquantify/utils/general_purposes/normalize.py +0 -20
- mlquantify/utils/general_purposes/parallel.py +0 -10
- mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
- mlquantify/utils/method_purposes/__init__.py +0 -6
- mlquantify/utils/method_purposes/distances.py +0 -21
- mlquantify/utils/method_purposes/getHist.py +0 -13
- mlquantify/utils/method_purposes/get_scores.py +0 -33
- mlquantify/utils/method_purposes/moss.py +0 -16
- mlquantify/utils/method_purposes/ternary_search.py +0 -14
- mlquantify/utils/method_purposes/tprfpr.py +0 -42
- mlquantify-0.0.11.2.dist-info/RECORD +0 -73
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
mlquantify/evaluation/protocol.py
@@ -0,0 +1,647 @@
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from typing import Union, List, Tuple, Any
from sklearn.base import BaseEstimator
from time import time
from tqdm import tqdm

from ..methods import METHODS, AGGREGATIVE, NON_AGGREGATIVE
from ..utils.general import *
from ..utils.method import *
from . import MEASURES
from ..base import Quantifier

import mlquantify as mq

class Protocol(ABC):
    """Base class for evaluation protocols.

    Parameters
    ----------
    models : Union[List[Union[str, Quantifier]], str, Quantifier]
        List of quantification models, a single model name, or 'all' for all models.
    learner : BaseEstimator, optional
        Machine learning model to be used with the quantifiers. Required for model methods.
    n_jobs : int, optional
        Number of jobs to run in parallel. Default is 1.
    random_state : int, optional
        Seed for random number generation. Default is 32.
    verbose : bool, optional
        Whether to print progress messages. Default is False.
    return_type : str, optional
        Type of return value ('predictions' or 'table'). Default is 'predictions'.
    measures : List[str], optional
        List of error measures to calculate. Must be in MEASURES or None. Default is None.
    columns : List[str], optional
        Columns to be included in the table. Default is ['ITERATION', 'QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'BATCH_SIZE'].

    Attributes
    ----------
    models : List[Quantifier]
        List of quantification models.
    learner : BaseEstimator
        Machine learning model to be used with the quantifiers.
    n_jobs : int
        Number of jobs to run in parallel.
    random_state : int
        Seed for random number generation.
    verbose : bool
        Whether to print progress messages.
    return_type : str
        Type of return value ('predictions' or 'table').
    measures : List[str]
        List of error measures to calculate.
    columns : List[str]
        Columns to be included in the table.

    Raises
    ------
    AssertionError
        If measures contains invalid error measures.
        If return_type is invalid.
        If columns does not contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].

    Notes
    -----
    - The 'models' parameter can be a list of Quantifiers, a single Quantifier, a list of model names, a single model name, or 'all'.
    - If 'models' is a list of model names or 'all', 'learner' must be provided.
    - The 'all' option for 'models' will use all quantification models available in the library.
    - If 'models' is a Quantifier or a list of Quantifiers, 'learner' is not required, but the models must already be instantiated.
    - You can pass your own model by passing a Quantifier object.
    - Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
    - If 'return_type' is 'table', the table will contain the columns specified in 'columns' and the error measures in 'measures'.
    - To create your own protocol, you must define the attributes 'models', 'learner', 'n_jobs', 'random_state', 'verbose', 'return_type', 'measures', and 'columns'; 'columns' can be changed as long as it contains ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].

    See Also
    --------
    APP : Artificial Prevalence Protocol.
    NPP : Natural Prevalence Protocol.
    Quantifier : Base class for quantification methods.

    Examples
    --------
    >>> import numpy as np
    >>> from mlquantify.evaluation.protocol import Protocol
    >>> from mlquantify.utils import get_real_prev
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> import time as t
    >>>
    >>> class MyProtocol(Protocol):
    ...     def __init__(self,
    ...                  models,
    ...                  learner,
    ...                  n_jobs,
    ...                  random_state,
    ...                  verbose,
    ...                  return_type,
    ...                  measures,
    ...                  sample_size,
    ...                  iterations=10):
    ...         super().__init__(models,
    ...                          learner,
    ...                          n_jobs,
    ...                          random_state,
    ...                          verbose,
    ...                          return_type,
    ...                          measures,
    ...                          columns=['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'TIME'])
    ...         self.sample_size = sample_size
    ...         self.iterations = iterations
    ...
    ...     def predict_protocol(self, X_test, y_test):
    ...         predictions = []
    ...
    ...         X_sample, y_sample = self._new_sample(X_test, y_test)
    ...
    ...         for _ in range(self.iterations):
    ...             for model in self.models:
    ...                 quantifier = model.__class__.__name__
    ...
    ...                 real_prev = get_real_prev(y_sample)
    ...
    ...                 start_time = t.time()
    ...                 pred_prev = model.predict(X_sample)
    ...                 end_time = t.time()
    ...                 time = end_time - start_time
    ...
    ...                 predictions.append([quantifier, real_prev, pred_prev, time])
    ...
    ...         return predictions
    ...
    ...     def _new_sample(self, X_test, y_test):
    ...         indexes = np.random.choice(len(X_test), size=self.sample_size, replace=False)
    ...         return X_test[indexes], y_test[indexes]
    >>>
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.5, random_state=42)
    >>>
    >>> protocol = MyProtocol(models=["CC", "EMQ", "DyS"],  # or [CC(learner), EMQ(learner), DyS(learner)]
    ...                       learner=RandomForestClassifier(),
    ...                       n_jobs=1,
    ...                       random_state=42,
    ...                       verbose=True,
    ...                       return_type="table",
    ...                       measures=None,
    ...                       sample_size=100)
    >>>
    >>> protocol.fit(X_train, y_train)
    >>> table = protocol.predict(X_test, y_test)
    >>> print(table)

    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 learner: BaseEstimator = None,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None,
                 columns: List[str] = ["ITERATION", "QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]):

        assert not measures or all(m in MEASURES for m in measures), \
            f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
        assert return_type in ["predictions", "table"], \
            "Invalid return_type. Valid options: ['predictions', 'table']"
        assert all(col in columns for col in ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS"]), \
            "Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS']"

        # Fixed parameters
        self.models = self._initialize_models(models, learner)
        self.learner = learner
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.return_type = return_type
        self.measures = measures
        self.columns = columns

    def _initialize_models(self, models, learner):
        """Initializes the quantification models.

        Parameters
        ----------
        models : Union[List[Union[str, Quantifier]], str, Quantifier]
            List of quantification models, a single model name, or 'all' for all models.
        learner : BaseEstimator
            Machine learning model to be used with the quantifiers.

        Returns
        -------
        List[Quantifier]
            List of quantification models.
        """
        if isinstance(models, list):
            if all(isinstance(model, Quantifier) for model in models):
                return models
            return [get_method(model)(learner) for model in models]

        if isinstance(models, Quantifier):
            return [models]

        assert learner is not None, "Learner is required for model methods."

        model_dict = {
            "all": METHODS.values,
            "aggregative": AGGREGATIVE.values,
            "non_aggregative": NON_AGGREGATIVE.values
        }

        if models in model_dict:
            return [model(learner) if hasattr(model, "learner") else model() for model in model_dict[models]()]
        return [get_method(models)(learner)]

    def sout(self, msg):
        """Prints a message if verbose is True."""
        if self.verbose:
            print('[APP]' + msg)

    def fit(self, X_train, y_train):
        """Fits the models with the training data.

        Parameters
        ----------
        X_train : np.ndarray
            Features of the training set.
        y_train : np.ndarray
            Labels of the training set.

        Returns
        -------
        Protocol
            Fitted protocol.
        """
        self.sout("Fitting models")

        args = ((model, X_train, y_train) for model in self.models)

        wrapper = tqdm if self.verbose else lambda x, **kwargs: x

        self.models = Parallel(n_jobs=self.n_jobs)(  # Parallel processing of models
            delayed(self._delayed_fit)(*arg) for arg in wrapper(args, desc="Fitting models", total=len(self.models))
        )
        self.sout("Fit [Done]")
        return self

    def predict(self, X_test: np.ndarray, y_test: np.ndarray) -> Any:
        """Predicts the prevalence for the test set.

        Parameters
        ----------
        X_test : np.ndarray
            Features of the test set.
        y_test : np.ndarray
            Labels of the test set.

        Returns
        -------
        Any
            Predictions for the test set. Can be a table or a tuple with the quantifier names, real prevalence, and predicted prevalence.
        """
        predictions = self.predict_protocol(X_test, y_test)
        predictions_df = pd.DataFrame(predictions, columns=self.columns)

        if self.return_type == "table":
            if self.measures:
                smoothed_factor = 1 / (2 * len(X_test))

                def smooth(values: np.ndarray) -> np.ndarray:
                    return (values + smoothed_factor) / (smoothed_factor * len(values) + 1)

                for metric in self.measures:
                    predictions_df[metric] = predictions_df.apply(
                        lambda row: get_measure(metric)(
                            smooth(np.array(row["REAL_PREVS"])),
                            smooth(np.array(row["PRED_PREVS"]))
                        ),
                        axis=1
                    )
            return predictions_df

        return (
            predictions_df["QUANTIFIER"].to_numpy(),            # Quantifier names
            np.stack(predictions_df["REAL_PREVS"].to_numpy()),  # REAL_PREVS
            np.stack(predictions_df["PRED_PREVS"].to_numpy())   # PRED_PREVS
        )

    @abstractmethod
    def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> np.ndarray:
        """Abstract method that every protocol must implement.

        Parameters
        ----------
        X_test : np.ndarray
            Features of the test set.
        y_test : np.ndarray
            Labels of the test set.

        Returns
        -------
        np.ndarray
            Predictions for the test set, in the same format as the 'columns' attribute.
        """
        ...

    @abstractmethod
    def _new_sample(self) -> Tuple[np.ndarray, np.ndarray]:
        """Abstract method of sample extraction for each protocol.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Tuple containing X_sample and y_sample.
        """
        ...

    @staticmethod
    def _delayed_fit(model, X_train, y_train):
        """Fits a single model; used for fitting models in parallel.

        Parameters
        ----------
        model : Quantifier
            Quantification model.
        X_train : np.ndarray
            Features of the training set.
        y_train : np.ndarray
            Labels of the training set.

        Returns
        -------
        Quantifier
            Fitted quantification model.
        """
        model_name = model.__class__.__name__
        if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
            model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"

        start = time()
        model = model.fit(X=X_train, y=y_train)
        duration = time() - start
        print(f"\tFitted {model_name} in {duration:.3f} seconds")
        return model


class APP(Protocol):
    """Artificial Prevalence Protocol.

    This approach splits the test set into several samples of varying prevalence and size,
    over n iterations. For a list of Quantifiers, it computes training and testing for
    each one and returns either a table of results with error measures or just the predictions.

    Parameters
    ----------
    models : Union[List[Union[str, Quantifier]], str, Quantifier]
        List of quantification models, a single model name, or 'all' for all models.
    batch_size : Union[List[int], int]
        Size of the batches to be processed, or a list of sizes.
    learner : BaseEstimator, optional
        Machine learning model to be used with the quantifiers. Required for model methods.
    n_prevs : int, optional
        Number of prevalence points to generate. Default is 100.
    n_iterations : int, optional
        Number of iterations for the protocol. Default is 1.
    n_jobs : int, optional
        Number of jobs to run in parallel. Default is 1.
    random_state : int, optional
        Seed for random number generation. Default is 32.
    verbose : bool, optional
        Whether to print progress messages. Default is False.
    return_type : str, optional
        Type of return value ('predictions' or 'table'). Default is 'predictions'.
    measures : List[str], optional
        List of error measures to calculate. Must be in MEASURES or None. Default is None.

    Attributes
    ----------
    models : List[Quantifier]
        List of quantification models.
    batch_size : Union[List[int], int]
        Size of the batches to be processed.
    learner : BaseEstimator
        Machine learning model to be used with the quantifiers.
    n_prevs : int
        Number of prevalence points to generate.
    n_iterations : int
        Number of iterations for the protocol.
    n_jobs : int
        Number of jobs to run in parallel.
    random_state : int
        Seed for random number generation.
    verbose : bool
        Whether to print progress messages.
    return_type : str
        Type of return value ('predictions' or 'table').
    measures : List[str]
        List of error measures to calculate.

    Raises
    ------
    AssertionError
        If return_type is invalid.

    See Also
    --------
    Protocol : Base class for evaluation protocols.
    NPP : Natural Prevalence Protocol.
    Quantifier : Base class for quantification methods.

    Examples
    --------
    >>> from mlquantify.evaluation.protocol import APP
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> # Loading dataset from sklearn
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> # Splitting into train and test
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
    >>>
    >>> app = APP(models=["CC", "EMQ", "DyS"],
    ...           batch_size=[10, 50, 100],
    ...           learner=RandomForestClassifier(),
    ...           n_prevs=100,  # Default
    ...           n_jobs=-1,
    ...           return_type="table",
    ...           measures=["ae", "se"],
    ...           verbose=True)
    >>>
    >>> app.fit(X_train, y_train)
    >>>
    >>> table = app.predict(X_test, y_test)
    >>>
    >>> print(table)
    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 batch_size: Union[List[int], int],
                 learner: BaseEstimator = None,
                 n_prevs: int = 100,
                 n_iterations: int = 1,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None):

        super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
        self.n_prevs = n_prevs
        self.batch_size = batch_size if isinstance(batch_size, list) else [batch_size]
        self.n_iterations = n_iterations

    def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> Tuple:
        """Generates several samples with artificial prevalences and sizes.

        For each model, predicts on each sample; the results are aggregated
        into a pandas DataFrame if requested, or else returned as raw predictions.

        Parameters
        ----------
        X_test : np.ndarray
            Features of the test set.
        y_test : np.ndarray
            Labels of the test set.

        Returns
        -------
        Tuple
            Tuples containing (iteration, model name, prev, prev_pred, batch size).
        """

        n_dim = len(np.unique(y_test))
        prevs = generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)

        args = [
            (iteration, X_test, y_test, model, prev, bs, self.verbose)
            for prev in prevs for bs in self.batch_size for model in self.models for iteration in range(self.n_iterations)
        ]

        size = len(prevs) * len(self.models) * len(self.batch_size) * self.n_iterations

        predictions = []
        for arg in tqdm(args, desc="Running APP", total=size):
            predictions.append(self._predict(*arg))

        return predictions

    def _predict(self, iteration: int, X: np.ndarray, y: np.ndarray, model: Any, prev: List[float], batch_size: int, verbose: bool) -> Tuple:
        """Predicts on a new sample for a given model, prevalence, and batch size.

        Parameters
        ----------
        iteration : int
            Current iteration.
        X : np.ndarray
            Features of the test set.
        y : np.ndarray
            Labels of the test set.
        model : Any
            Quantification model.
        prev : List[float]
            Prevalence values for the sample.
        batch_size : int
            Batch size for the sample.
        verbose : bool
            Whether to print progress messages.

        Returns
        -------
        Tuple
            Tuple containing the iteration, model name, prev, prev_pred, and batch size.
        """
        model_name = model.__class__.__name__
        if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
            model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"

        if verbose:
            print(f'\t {model_name} with {batch_size} instances and prev {prev}')

        X_sample, _ = self._new_sample(X, y, prev, batch_size)
        prev_pred = np.asarray(list(model.predict(X_sample).values()))

        if verbose:
            print(f'\t \\--Ending {model_name} with {batch_size} instances and prev {prev}\n')

        return (iteration + 1, model_name, prev, prev_pred, batch_size)

    def _new_sample(self, X: np.ndarray, y: np.ndarray, prev: List[float], batch_size: int) -> Tuple[np.ndarray, np.ndarray]:
        """Generates a new sample with a specified prevalence and size.

        Parameters
        ----------
        X : np.ndarray
            Features of the test set.
        y : np.ndarray
            Labels of the test set.
        prev : List[float]
            Prevalence values for the sample.
        batch_size : int
            Batch size for the sample.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            Tuple containing the new sample features and labels.
        """
        sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
        return (np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0))


class NPP(Protocol):
    """Natural Prevalence Protocol.

    This approach splits the test set into several samples of varying size,
    over n iterations. For a list of Quantifiers, it computes training and testing for
    each one and returns either a table of results with error measures or just the predictions.

    Parameters
    ----------
    models : Union[List[Union[str, Quantifier]], str, Quantifier]
        List of quantification models, a single model name, or 'all' for all models.
    batch_size : Union[List[int], int]
        Size of the batches to be processed, or a list of sizes.
    learner : BaseEstimator, optional
        Machine learning model to be used with the quantifiers. Required for model methods.
    n_iterations : int, optional
        Number of iterations for the protocol. Default is 1.
    n_jobs : int, optional
        Number of jobs to run in parallel. Default is 1.
    random_state : int, optional
        Seed for random number generation. Default is 32.
    verbose : bool, optional
        Whether to print progress messages. Default is False.
    return_type : str, optional
        Type of return value ('predictions' or 'table'). Default is 'predictions'.
    measures : List[str], optional
        List of error measures to calculate. Must be in MEASURES or None. Default is None.

    Attributes
    ----------
    models : List[Quantifier]
        List of quantification models.
    batch_size : Union[List[int], int]
        Size of the batches to be processed.
    learner : BaseEstimator
        Machine learning model to be used with the quantifiers.
    n_iterations : int
        Number of iterations for the protocol.
    n_jobs : int
        Number of jobs to run in parallel.
    random_state : int
        Seed for random number generation.
    verbose : bool
        Whether to print progress messages.
    return_type : str
        Type of return value ('predictions' or 'table').
    measures : List[str]
        List of error measures to calculate.
    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 learner: BaseEstimator = None,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None):

        super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)

    def predict_protocol(self, X_test, y_test) -> tuple:
        raise NotImplementedError

    def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
        raise NotImplementedError

    def _delayed_predict(self, args) -> tuple:
        raise NotImplementedError
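
For reference, a minimal usage sketch of the APP class added in this file, using the default return_type="predictions", which returns a tuple of (quantifier names, real prevalences, predicted prevalences) instead of a table. The dataset, learner, and method names ("CC", "EMQ") are illustrative choices borrowed from the docstring examples above, not a prescribed setup.

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from mlquantify.evaluation.protocol import APP

    # Illustrative data split; any classification dataset works the same way.
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # Two quantifiers resolved by name, one batch size, 20 artificial prevalence points.
    app = APP(models=["CC", "EMQ"],
              batch_size=100,
              learner=RandomForestClassifier(),
              n_prevs=20)
    app.fit(X_train, y_train)

    # With return_type="predictions" (the default), predict() returns three arrays:
    # quantifier names, real prevalences, and predicted prevalences.
    names, real_prevs, pred_prevs = app.predict(X_test, y_test)
    print(names.shape, real_prevs.shape, pred_prevs.shape)

Passing return_type="table" together with a list of measures (as in the docstring examples) would instead return a DataFrame with one error column per measure.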