mlquantify 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/evaluation/protocol.py +216 -566
- mlquantify/methods/meta.py +2 -2
- mlquantify/utils/general.py +43 -6
- {mlquantify-0.1.3.dist-info → mlquantify-0.1.5.dist-info}/METADATA +16 -13
- {mlquantify-0.1.3.dist-info → mlquantify-0.1.5.dist-info}/RECORD +7 -7
- {mlquantify-0.1.3.dist-info → mlquantify-0.1.5.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.3.dist-info → mlquantify-0.1.5.dist-info}/top_level.txt +0 -0
|
@@ -1,647 +1,297 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
import numpy as np
|
|
3
|
-
import
|
|
4
|
-
from typing import Union, List, Tuple, Any
|
|
5
|
-
from sklearn.base import BaseEstimator
|
|
6
|
-
from time import time
|
|
3
|
+
from typing import Generator, Tuple
|
|
7
4
|
from tqdm import tqdm
|
|
8
5
|
|
|
9
|
-
from ..methods import METHODS, AGGREGATIVE, NON_AGGREGATIVE
|
|
10
6
|
from ..utils.general import *
|
|
11
|
-
from ..utils.method import *
|
|
12
|
-
from . import MEASURES
|
|
13
|
-
from ..base import Quantifier
|
|
14
|
-
|
|
15
|
-
import mlquantify as mq
|
|
16
7
|
|
|
17
8
|
class Protocol(ABC):
|
|
18
9
|
"""Base class for evaluation protocols.
|
|
19
10
|
|
|
20
11
|
Parameters
|
|
21
12
|
----------
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
learner : BaseEstimator, optional
|
|
25
|
-
Machine learning model to be used with the quantifiers. Required for model methods.
|
|
26
|
-
n_jobs : int, optional
|
|
27
|
-
Number of jobs to run in parallel. Default is 1.
|
|
13
|
+
batch_size : int or list of int
|
|
14
|
+
The size of the batches to be used in the evaluation.
|
|
28
15
|
random_state : int, optional
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
Whether to print progress messages. Default is False.
|
|
32
|
-
return_type : str, optional
|
|
33
|
-
Type of return value ('predictions' or 'table'). Default is 'predictions'.
|
|
34
|
-
measures : List[str], optional
|
|
35
|
-
List of error measures to calculate. Must be in MEASURES or None. Default is None.
|
|
36
|
-
columns : List[str], optional
|
|
37
|
-
Columns to be included in the table. Default is ['ITERATION', 'QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'BATCH_SIZE'].
|
|
38
|
-
|
|
16
|
+
The random seed for reproducibility.
|
|
17
|
+
|
|
39
18
|
Attributes
|
|
40
19
|
----------
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
learner : BaseEstimator
|
|
44
|
-
Machine learning model to be used with the quantifiers.
|
|
45
|
-
n_jobs : int
|
|
46
|
-
Number of jobs to run in parallel.
|
|
47
|
-
random_state : int
|
|
48
|
-
Seed for random number generation.
|
|
49
|
-
verbose : bool
|
|
50
|
-
Whether to print progress messages.
|
|
51
|
-
return_type : str
|
|
52
|
-
Type of return value ('predictions' or 'table').
|
|
53
|
-
measures : List[str]
|
|
54
|
-
List of error measures to calculate.
|
|
55
|
-
columns : List[str]
|
|
56
|
-
Columns to be included in the table.
|
|
57
|
-
|
|
20
|
+
n_combinations : int
|
|
21
|
+
|
|
58
22
|
Raises
|
|
59
23
|
------
|
|
60
|
-
|
|
61
|
-
If
|
|
62
|
-
|
|
63
|
-
If columns does not contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
|
|
64
|
-
|
|
24
|
+
ValueError
|
|
25
|
+
If the batch size is not a positive integer or list of positive integers.
|
|
26
|
+
|
|
65
27
|
Notes
|
|
66
28
|
-----
|
|
67
|
-
|
|
68
|
-
- If 'models' is a list of model names or 'all', 'learner' must be provided.
|
|
69
|
-
- The 'all' option for 'models' will use all quantification models available in the library.
|
|
70
|
-
- If 'models' is a Quantifier or list of Quantifier, 'learner' is not required. But the models must be initializated
|
|
71
|
-
- You can pass your own model by passing a Quantifier object.
|
|
72
|
-
- Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
|
|
73
|
-
- If 'return_type' is 'table', the table will contain the columns specified in 'columns' and the error measures in 'measures'.
|
|
74
|
-
- For creating your own protocol, you must have the attributes 'models', 'learner', 'n_jobs', 'random_state', 'verbose', 'return_type', 'measures', and 'columns'., but columns can be changed, as long as it contains ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
|
|
75
|
-
|
|
76
|
-
See Also
|
|
77
|
-
--------
|
|
78
|
-
APP : Artificial Prevalence Protocol.
|
|
79
|
-
NPP : Natural Prevalence Protocol.
|
|
80
|
-
Quantifier : Base class for quantification methods.
|
|
29
|
+
This class serves as a base class for different evaluation protocols, each with its own strategy for splitting the data into batches.
|
|
81
30
|
|
|
82
31
|
Examples
|
|
83
32
|
--------
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
>>> from sklearn.datasets import load_breast_cancer
|
|
89
|
-
>>> from sklearn.model_selection import train_test_split
|
|
90
|
-
>>> import time as t
|
|
91
|
-
>>>
|
|
92
|
-
>>> class MyProtocol(Protocol):
|
|
93
|
-
... def __init__(self,
|
|
94
|
-
... models,
|
|
95
|
-
... learner,
|
|
96
|
-
... n_jobs,
|
|
97
|
-
... random_state,
|
|
98
|
-
... verbose,
|
|
99
|
-
... return_type,
|
|
100
|
-
... measures,
|
|
101
|
-
... sample_size,
|
|
102
|
-
... iterations=10):
|
|
103
|
-
... super().__init__(models,
|
|
104
|
-
... learner,
|
|
105
|
-
... n_jobs,
|
|
106
|
-
... random_state,
|
|
107
|
-
... verbose,
|
|
108
|
-
... return_type,
|
|
109
|
-
... measures,
|
|
110
|
-
... columns=['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'TIME'])
|
|
111
|
-
... self.sample_size = sample_size
|
|
112
|
-
... self.iterations = iterations
|
|
113
|
-
...
|
|
114
|
-
... def predict_protocol(self, X_test, y_test):
|
|
115
|
-
... predictions = []
|
|
116
|
-
...
|
|
117
|
-
... X_sample, y_sample = self._new_sample(X_test, y_test)
|
|
118
|
-
...
|
|
119
|
-
... for _ in range(self.iterations):
|
|
120
|
-
... for model in self.models:
|
|
121
|
-
... quantifier = model.__class__.__name__
|
|
122
|
-
...
|
|
123
|
-
... real_prev = get_real_prev(y_sample)
|
|
33
|
+
>>> class MyCustomProtocol(Protocol):
|
|
34
|
+
... def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
|
|
35
|
+
... for batch_size in self.batch_size:
|
|
36
|
+
... yield np.random.choice(X.shape[0], batch_size, replace=True)
|
|
124
37
|
...
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
...
|
|
128
|
-
...
|
|
129
|
-
|
|
130
|
-
... predictions.append([quantifier, real_prev, pred_prev, time])
|
|
131
|
-
...
|
|
132
|
-
... return predictions
|
|
133
|
-
...
|
|
134
|
-
... def _new_sample(self, X_test, y_test):
|
|
135
|
-
... indexes = np.random.choice(len(X_test), size=self.sample_size, replace=False)
|
|
136
|
-
... return X_test[indexes], y_test[indexes]
|
|
137
|
-
>>>
|
|
138
|
-
>>>
|
|
139
|
-
>>> features, target = load_breast_cancer(return_X_y=True)
|
|
140
|
-
>>>
|
|
141
|
-
>>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.5, random_state=42)
|
|
142
|
-
>>>
|
|
143
|
-
>>> protocol = MyProtocol(models=["CC", "EMQ", "DyS"], # or [CC(learner), EMQ(learner), DyS(learner)]
|
|
144
|
-
... learner=RandomForestClassifier(),
|
|
145
|
-
... n_jobs=1,
|
|
146
|
-
... random_state=42,
|
|
147
|
-
... verbose=True,
|
|
148
|
-
... return_type="table",
|
|
149
|
-
... measures=None,
|
|
150
|
-
... sample_size=100)
|
|
151
|
-
>>>
|
|
152
|
-
>>> protocol.fit(X_train, y_train)
|
|
153
|
-
>>> table = protocol.predict(X_test, y_test)
|
|
154
|
-
>>> print(table)
|
|
155
|
-
|
|
38
|
+
>>> protocol = MyCustomProtocol(batch_size=100, random_state=42)
|
|
39
|
+
>>> for idx in protocol.split(X, y):
|
|
40
|
+
... # Train and evaluate model
|
|
41
|
+
... pass
|
|
42
|
+
|
|
156
43
|
"""
|
|
157
|
-
|
|
158
|
-
def __init__(self,
|
|
159
|
-
models: Union[List[Union[str, Quantifier]], str, Quantifier],
|
|
160
|
-
learner: BaseEstimator = None,
|
|
161
|
-
n_jobs: int = 1,
|
|
162
|
-
random_state: int = 32,
|
|
163
|
-
verbose: bool = False,
|
|
164
|
-
return_type: str = "predictions",
|
|
165
|
-
measures: List[str] = None,
|
|
166
|
-
columns: List[str] = ["ITERATION", "QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]):
|
|
167
|
-
|
|
168
|
-
assert not measures or all(m in MEASURES for m in measures), \
|
|
169
|
-
f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
|
|
170
|
-
assert return_type in ["predictions", "table"], \
|
|
171
|
-
"Invalid return_type. Valid options: ['predictions', 'table']"
|
|
172
|
-
assert all(col in columns for col in ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS"]), \
|
|
173
|
-
"Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS']"
|
|
174
|
-
|
|
175
|
-
# Fixed parameters
|
|
176
|
-
self.models = self._initialize_models(models, learner)
|
|
177
|
-
self.learner = learner
|
|
178
|
-
self.n_jobs = n_jobs
|
|
179
|
-
self.random_state = random_state
|
|
180
|
-
self.verbose = verbose
|
|
181
|
-
self.return_type = return_type
|
|
182
|
-
self.measures = measures
|
|
183
|
-
self.columns = columns
|
|
184
|
-
|
|
185
|
-
def _initialize_models(self, models, learner):
|
|
186
|
-
"""Initializes the quantification models.
|
|
187
|
-
|
|
188
|
-
Parameters
|
|
189
|
-
----------
|
|
190
|
-
models : Union[List[Union[str, Quantifier]], str, Quantifier]
|
|
191
|
-
List of quantification models, a single model name, or 'all' for all models.
|
|
192
|
-
learner : BaseEstimator
|
|
193
|
-
Machine learning model to be used with the quantifiers.
|
|
194
|
-
|
|
195
|
-
Returns
|
|
196
|
-
-------
|
|
197
|
-
List[Quantifier]
|
|
198
|
-
List of quantification models.
|
|
199
|
-
"""
|
|
200
|
-
if isinstance(models, list):
|
|
201
|
-
if all(isinstance(model, Quantifier) for model in models):
|
|
202
|
-
return models
|
|
203
|
-
return [get_method(model)(learner) for model in models]
|
|
204
|
-
|
|
205
|
-
if isinstance(models, Quantifier):
|
|
206
|
-
return [models]
|
|
207
44
|
|
|
208
|
-
|
|
45
|
+
def __init__(self, batch_size, random_state=None, **kwargs):
|
|
46
|
+
if isinstance(batch_size, int):
|
|
47
|
+
self.n_combinations = 1
|
|
48
|
+
else:
|
|
49
|
+
self.n_combinations = len(batch_size)
|
|
209
50
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
"aggregative": AGGREGATIVE.values,
|
|
213
|
-
"non_aggregative": NON_AGGREGATIVE.values
|
|
214
|
-
}
|
|
51
|
+
self.batch_size = [batch_size] if isinstance(batch_size, int) else batch_size
|
|
52
|
+
self.random_state = random_state
|
|
215
53
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
Parameters
|
|
229
|
-
----------
|
|
230
|
-
X_train : np.ndarray
|
|
231
|
-
Features of the training set.
|
|
232
|
-
y_train : np.ndarray
|
|
233
|
-
Labels of the training set.
|
|
234
|
-
|
|
235
|
-
Returns
|
|
236
|
-
-------
|
|
237
|
-
Protocol
|
|
238
|
-
Fitted protocol.
|
|
54
|
+
for name, value in kwargs.items():
|
|
55
|
+
setattr(self, name, value)
|
|
56
|
+
if isinstance(value, list):
|
|
57
|
+
self.n_combinations *= len(value)
|
|
58
|
+
elif isinstance(value, (int, float)):
|
|
59
|
+
self.n_combinations *= value
|
|
60
|
+
else:
|
|
61
|
+
raise ValueError(f"Invalid argument {name}={value}: must be int/float or list of int/float.")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def split(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, np.ndarray]:
|
|
239
65
|
"""
|
|
240
|
-
|
|
66
|
+
Split the data into samples for evaluation.
|
|
241
67
|
|
|
242
|
-
args = ((model, X_train, y_train) for model in self.models)
|
|
243
|
-
|
|
244
|
-
wrapper = tqdm if self.verbose else lambda x, **kwargs: x
|
|
245
|
-
|
|
246
|
-
self.models = Parallel(n_jobs=self.n_jobs)( # Parallel processing of models
|
|
247
|
-
delayed(self._delayed_fit)(*arg) for arg in wrapper(args, desc="Fitting models", total=len(self.models))
|
|
248
|
-
)
|
|
249
|
-
self.sout("Fit [Done]")
|
|
250
|
-
return self
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
def predict(self, X_test: np.ndarray, y_test: np.ndarray) -> Any:
|
|
254
|
-
"""Predicts the prevalence for the test set.
|
|
255
|
-
|
|
256
68
|
Parameters
|
|
257
69
|
----------
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
Returns
|
|
264
|
-
-------
|
|
265
|
-
Any
|
|
266
|
-
Predictions for the test set. Can be a table or a tuple with the quantifier names, real prevalence, and predicted prevalence.
|
|
267
|
-
"""
|
|
268
|
-
predictions = self.predict_protocol(X_test, y_test)
|
|
269
|
-
predictions_df = pd.DataFrame(predictions, columns=self.columns)
|
|
270
|
-
|
|
271
|
-
if self.return_type == "table":
|
|
272
|
-
if self.measures:
|
|
273
|
-
smoothed_factor = 1 / (2 * len(X_test))
|
|
274
|
-
|
|
275
|
-
def smooth(values: np.ndarray) -> np.ndarray:
|
|
276
|
-
return (values + smoothed_factor) / (smoothed_factor * len(values) + 1)
|
|
277
|
-
|
|
278
|
-
for metric in self.measures:
|
|
279
|
-
predictions_df[metric] = predictions_df.apply(
|
|
280
|
-
lambda row: get_measure(metric)(
|
|
281
|
-
smooth(np.array(row["REAL_PREVS"])),
|
|
282
|
-
smooth(np.array(row["PRED_PREVS"]))
|
|
283
|
-
),
|
|
284
|
-
axis=1
|
|
285
|
-
)
|
|
286
|
-
return predictions_df
|
|
287
|
-
|
|
288
|
-
return (
|
|
289
|
-
predictions_df["QUANTIFIER"].to_numpy(), # Quantifier names
|
|
290
|
-
np.stack(predictions_df["REAL_PREVS"].to_numpy()), # REAL_PREVS
|
|
291
|
-
np.stack(predictions_df["PRED_PREVS"].to_numpy()) # PRED_PREVS
|
|
292
|
-
)
|
|
70
|
+
X : np.ndarray
|
|
71
|
+
The input features.
|
|
72
|
+
y : np.ndarray
|
|
73
|
+
The target labels.
|
|
293
74
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
Parameters
|
|
299
|
-
----------
|
|
300
|
-
X_test : np.ndarray
|
|
301
|
-
Features of the test set.
|
|
302
|
-
y_test : np.ndarray
|
|
303
|
-
Labels of the test set.
|
|
304
|
-
|
|
305
|
-
Returns
|
|
306
|
-
-------
|
|
307
|
-
np.ndarray
|
|
308
|
-
Predictions for the test set. With the same format as the column names attribute.
|
|
75
|
+
Yields
|
|
76
|
+
------
|
|
77
|
+
np.ndarray
|
|
78
|
+
The indices of the samples selected for one split.
|
|
309
79
|
"""
|
|
310
|
-
|
|
80
|
+
indices = np.arange(X.shape[0])
|
|
81
|
+
for idx in self._split_indices_masks(X, y):
|
|
82
|
+
indexes = indices[idx]
|
|
83
|
+
yield indexes
|
|
311
84
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
"""Abstract method of sample extraction for each protocol.
|
|
85
|
+
def _split_indices_masks(self, X: np.ndarray, y: np.ndarray) -> Generator[Tuple[np.ndarray, np.ndarray]]:
|
|
86
|
+
for idx in self._iter_indices(X, y):
|
|
315
87
|
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
"""
|
|
319
|
-
...
|
|
88
|
+
mask = np.zeros(X.shape[0], dtype=bool)
|
|
89
|
+
mask[idx] = True
|
|
320
90
|
|
|
321
|
-
|
|
322
|
-
def _delayed_fit(model, X_train, y_train):
|
|
323
|
-
"""Method to fit the model in parallel.
|
|
324
|
-
|
|
325
|
-
Parameters
|
|
326
|
-
----------
|
|
327
|
-
model : Quantifier
|
|
328
|
-
Quantification model.
|
|
329
|
-
X_train : np.ndarray
|
|
330
|
-
Features of the training set.
|
|
331
|
-
y_train : np.ndarray
|
|
332
|
-
Labels of the training set.
|
|
333
|
-
|
|
334
|
-
Returns
|
|
335
|
-
-------
|
|
336
|
-
Quantifier
|
|
337
|
-
Fitted quantification model
|
|
338
|
-
"""
|
|
339
|
-
model_name = model.__class__.__name__
|
|
340
|
-
if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
|
|
341
|
-
model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
|
|
342
|
-
|
|
343
|
-
start = time()
|
|
344
|
-
model = model.fit(X=X_train, y=y_train)
|
|
345
|
-
duration = time() - start
|
|
346
|
-
print(f"\tFitted {model_name} in {duration:.3f} seconds")
|
|
347
|
-
return model
|
|
91
|
+
yield mask
|
|
348
92
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
93
|
+
@abstractmethod
|
|
94
|
+
def _iter_indices(self, X, y):
|
|
95
|
+
"""Abstract method to be implemented by subclasses to yield indices for each batch."""
|
|
96
|
+
pass
|
|
352
97
|
|
|
98
|
+
def get_n_combinations(self) -> int:
|
|
99
|
+
"""
|
|
100
|
+
Get the number of combinations for the current protocol.
|
|
101
|
+
"""
|
|
102
|
+
return self.n_combinations
|
|
353
103
|
|
|
354
104
|
|
|
355
105
|
class APP(Protocol):
|
|
356
|
-
"""Artificial Prevalence Protocol.
|
|
357
|
-
|
|
358
|
-
This approach splits a test into several samples varying prevalence and sample size,
|
|
359
|
-
with n iterations. For a list of Quantifiers, it computes training and testing for
|
|
360
|
-
each one and returns either a table of results with error measures or just the predictions.
|
|
106
|
+
"""Artificial Prevalence Protocol (APP) for evaluation.
|
|
107
|
+
This protocol generates artificial prevalence distributions for the evaluation in an exhaustive manner, testing all possible combinations of prevalences.
|
|
361
108
|
|
|
362
109
|
Parameters
|
|
363
110
|
----------
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
n_prevs : int, optional
|
|
371
|
-
Number of prevalence points to generate. Default is 100.
|
|
372
|
-
n_iterations : int, optional
|
|
373
|
-
Number of iterations for the protocol. Default is 1.
|
|
374
|
-
n_jobs : int, optional
|
|
375
|
-
Number of jobs to run in parallel. Default is 1.
|
|
111
|
+
batch_size : int or list of int
|
|
112
|
+
The size of the batches to be used in the evaluation.
|
|
113
|
+
n_prevalences : int
|
|
114
|
+
The number of artificial prevalences to generate.
|
|
115
|
+
repeats : int, optional
|
|
116
|
+
The number of times to repeat the evaluation with different random seeds.
|
|
376
117
|
random_state : int, optional
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
Whether to print progress messages. Default is False.
|
|
380
|
-
return_type : str, optional
|
|
381
|
-
Type of return value ('predictions' or 'table'). Default is 'predictions'.
|
|
382
|
-
measures : List[str], optional
|
|
383
|
-
List of error measures to calculate. Must be in MEASURES or None. Default is None.
|
|
384
|
-
|
|
118
|
+
The random seed for reproducibility.
|
|
119
|
+
|
|
385
120
|
Attributes
|
|
386
121
|
----------
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
learner : BaseEstimator
|
|
392
|
-
Machine learning model to be used with the quantifiers.
|
|
393
|
-
n_prevs : int
|
|
394
|
-
Number of prevalence points to generate.
|
|
395
|
-
n_iterations : int
|
|
396
|
-
Number of iterations for the protocol.
|
|
397
|
-
n_jobs : int
|
|
398
|
-
Number of jobs to run in parallel.
|
|
122
|
+
n_prevalences : int
|
|
123
|
+
The number of artificial prevalences to generate.
|
|
124
|
+
repeats : int
|
|
125
|
+
The number of times to repeat the evaluation with different random seeds.
|
|
399
126
|
random_state : int
|
|
400
|
-
|
|
401
|
-
verbose : bool
|
|
402
|
-
Whether to print progress messages.
|
|
403
|
-
return_type : str
|
|
404
|
-
Type of return value ('predictions' or 'table').
|
|
405
|
-
measures : List[str]
|
|
406
|
-
List of error measures to calculate.
|
|
127
|
+
The random seed for reproducibility.
|
|
407
128
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
See Also
|
|
414
|
-
--------
|
|
415
|
-
Protocol : Base class for evaluation protocols.
|
|
416
|
-
NPP : Natural Prevalence Protocol.
|
|
417
|
-
Quantifier : Base class for quantification methods.
|
|
418
|
-
|
|
129
|
+
Notes
|
|
130
|
+
-----
|
|
131
|
+
It is important to note that in case of multiclass problems, the time complexity of this protocol can be significantly higher due to the increased number of combinations to evaluate.
|
|
132
|
+
|
|
419
133
|
Examples
|
|
420
134
|
--------
|
|
421
|
-
>>>
|
|
422
|
-
>>>
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
>>>
|
|
426
|
-
>>> # Loading dataset from sklearn
|
|
427
|
-
>>> features, target = load_breast_cancer(return_X_y=True)
|
|
428
|
-
>>>
|
|
429
|
-
>>> #Splitting into train and test
|
|
430
|
-
>>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
431
|
-
>>>
|
|
432
|
-
>>> app = APP(models=["CC", "EMQ", "DyS"],
|
|
433
|
-
... batch_size=[10, 50, 100],
|
|
434
|
-
... learner=RandomForestClassifier(),
|
|
435
|
-
... n_prevs=100, # Default
|
|
436
|
-
... n_jobs=-1,
|
|
437
|
-
... return_type="table",
|
|
438
|
-
... measures=["ae", "se"],
|
|
439
|
-
... verbose=True)
|
|
440
|
-
>>>
|
|
441
|
-
>>> app.fit(X_train, y_train)
|
|
442
|
-
>>>
|
|
443
|
-
>>> table = app.predict(X_test, y_test)
|
|
444
|
-
>>>
|
|
445
|
-
>>> print(table)
|
|
446
|
-
"""
|
|
447
|
-
|
|
448
|
-
def __init__(self,
|
|
449
|
-
models: Union[List[Union[str, Quantifier]], str, Quantifier],
|
|
450
|
-
batch_size: Union[List[int], int],
|
|
451
|
-
learner: BaseEstimator = None,
|
|
452
|
-
n_prevs: int = 100,
|
|
453
|
-
n_iterations: int = 1,
|
|
454
|
-
n_jobs: int = 1,
|
|
455
|
-
random_state: int = 32,
|
|
456
|
-
verbose: bool = False,
|
|
457
|
-
return_type: str = "predictions",
|
|
458
|
-
measures: List[str] = None):
|
|
459
|
-
|
|
460
|
-
super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
|
|
461
|
-
self.n_prevs = n_prevs
|
|
462
|
-
self.batch_size = batch_size if isinstance(batch_size, list) else [batch_size]
|
|
463
|
-
self.n_prevs = n_prevs
|
|
464
|
-
self.n_iterations = n_iterations
|
|
135
|
+
>>> protocol = APP(batch_size=[100, 200], n_prevalences=5, repeats=3, random_state=42)
|
|
136
|
+
>>> for idx in protocol.split(X, y):
|
|
137
|
+
... # Train and evaluate model
|
|
138
|
+
... pass
|
|
465
139
|
|
|
140
|
+
"""
|
|
466
141
|
|
|
467
|
-
def
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
142
|
+
def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
|
|
143
|
+
super().__init__(batch_size=batch_size,
|
|
144
|
+
random_state=random_state,
|
|
145
|
+
n_prevalences=n_prevalences,
|
|
146
|
+
repeats=repeats)
|
|
471
147
|
|
|
472
|
-
|
|
473
|
-
----------
|
|
474
|
-
X_test : np.ndarray
|
|
475
|
-
Features of the test set.
|
|
476
|
-
y_test : np.ndarray
|
|
477
|
-
Labels of the test set.
|
|
148
|
+
def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
|
|
478
149
|
|
|
479
|
-
|
|
480
|
-
-------
|
|
481
|
-
Tuple
|
|
482
|
-
Tuple containing the (iteration, model name, prev, prev_pred, and batch size).
|
|
483
|
-
"""
|
|
484
|
-
|
|
485
|
-
n_dim = len(np.unique(y_test))
|
|
486
|
-
prevs = generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)
|
|
487
|
-
|
|
488
|
-
args = [
|
|
489
|
-
(iteration, X_test, y_test, model, prev, bs, self.verbose)
|
|
490
|
-
for prev in prevs for bs in self.batch_size for model in self.models for iteration in range(self.n_iterations)
|
|
491
|
-
]
|
|
150
|
+
n_dim = len(np.unique(y))
|
|
492
151
|
|
|
493
|
-
|
|
152
|
+
for batch_size in self.batch_size:
|
|
153
|
+
prevalences = generate_artificial_prevalences(n_dim=n_dim,
|
|
154
|
+
n_prev=self.n_prevalences,
|
|
155
|
+
n_iter=self.repeats)
|
|
156
|
+
for prev in prevalences:
|
|
157
|
+
indexes = get_indexes_with_prevalence(y, prev, batch_size)
|
|
158
|
+
yield indexes
|
|
159
|
+
|
|
494
160
|
|
|
495
|
-
|
|
496
|
-
for arg in tqdm(args, desc="Running APP", total=size):
|
|
497
|
-
predictions.append(self._predict(*arg))
|
|
498
|
-
|
|
499
|
-
return predictions
|
|
161
|
+
|
|
500
162
|
|
|
501
|
-
|
|
502
|
-
|
|
163
|
+
class NPP(Protocol):
|
|
164
|
+
"""Natural Prevalence Protocol (NPP) for evaluation.
|
|
165
|
+
This protocol just samples the data without any consideration for prevalence, with all instances having equal probability of being selected.
|
|
503
166
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
y : np.ndarray
|
|
511
|
-
Labels of the test set.
|
|
512
|
-
model : Any
|
|
513
|
-
Quantification model.
|
|
514
|
-
prev : List[float]
|
|
515
|
-
Prevalence values for the sample.
|
|
516
|
-
batch_size : int
|
|
517
|
-
Batch size for the sample.
|
|
518
|
-
verbose : bool
|
|
519
|
-
Whether to print progress messages.
|
|
520
|
-
|
|
521
|
-
Returns
|
|
522
|
-
-------
|
|
523
|
-
Tuple
|
|
524
|
-
Tuple containing the iteration, model name, prev, prev_pred, and batch size.
|
|
525
|
-
"""
|
|
526
|
-
model_name = model.__class__.__name__
|
|
527
|
-
if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
|
|
528
|
-
model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
|
|
529
|
-
|
|
530
|
-
if verbose:
|
|
531
|
-
print(f'\t {model_name} with {batch_size} instances and prev {prev}')
|
|
532
|
-
|
|
533
|
-
X_sample, _ = self._new_sample(X, y, prev, batch_size)
|
|
534
|
-
prev_pred = np.asarray(list(model.predict(X_sample).values()))
|
|
535
|
-
|
|
536
|
-
if verbose:
|
|
537
|
-
print(f'\t \\--Ending {model_name} with {batch_size} instances and prev {prev}\n')
|
|
538
|
-
|
|
539
|
-
return (iteration+1, model_name, prev, prev_pred, batch_size)
|
|
167
|
+
Parameters
|
|
168
|
+
----------
|
|
169
|
+
batch_size : int or list of int
|
|
170
|
+
The size of the batches to be used in the evaluation.
|
|
171
|
+
random_state : int, optional
|
|
172
|
+
The random seed for reproducibility.
|
|
540
173
|
|
|
174
|
+
Attributes
|
|
175
|
+
----------
|
|
176
|
+
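batch_size : list of int
|
|
177
|
+
The batch sizes to sample; a single int passed at construction is stored as a one-element list.
|
|
178
|
+
n_combinations : int
|
|
179
|
+
The number of combinations computed at construction; with only `batch_size` given, this is the number of batch sizes.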
|
|
180
|
+
random_state : int
|
|
181
|
+
The random seed for reproducibility.
|
|
541
182
|
|
|
542
|
-
|
|
543
|
-
|
|
183
|
+
Examples
|
|
184
|
+
--------
|
|
185
|
+
>>> protocol = NPP(batch_size=100, random_state=42)
|
|
186
|
+
>>> for idx in protocol.split(X, y):
|
|
187
|
+
... # Train and evaluate model
|
|
188
|
+
... pass
|
|
189
|
+
"""
|
|
544
190
|
|
|
545
|
-
|
|
546
|
-
----------
|
|
547
|
-
X : np.ndarray
|
|
548
|
-
Features of the test set.
|
|
549
|
-
y : np.ndarray
|
|
550
|
-
Labels of the test set.
|
|
551
|
-
prev : List[float]
|
|
552
|
-
Prevalence values for the sample.
|
|
553
|
-
batch_size : int
|
|
554
|
-
Batch size for the sample.
|
|
191
|
+
def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
|
|
555
192
|
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
Tuple containing the new sample features and labels.
|
|
560
|
-
"""
|
|
561
|
-
sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
|
|
562
|
-
return (np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0))
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
193
|
+
for batch_size in self.batch_size:
|
|
194
|
+
yield np.random.choice(X.shape[0], batch_size, replace=True)
|
|
195
|
+
|
|
566
196
|
|
|
197
|
+
class UPP(Protocol):
|
|
198
|
+
"""Uniform Prevalence Protocol (UPP) for evaluation.
|
|
199
|
+
An extension of the APP that generates artificial prevalence distributions uniformly across all classes using the Kraemer sampling method.
|
|
567
200
|
|
|
201
|
+
Parameters
|
|
202
|
+
----------
|
|
203
|
+
batch_size : int or list of int
|
|
204
|
+
The size of the batches to be used in the evaluation.
|
|
205
|
+
n_prevalences : int
|
|
206
|
+
The number of artificial prevalences to generate.
|
|
207
|
+
repeats : int
|
|
208
|
+
The number of times to repeat the evaluation with different random seeds.
|
|
209
|
+
random_state : int, optional
|
|
210
|
+
The random seed for reproducibility.
|
|
568
211
|
|
|
212
|
+
Attributes
|
|
213
|
+
----------
|
|
214
|
+
n_prevalences : int
|
|
215
|
+
The number of artificial prevalences to generate.
|
|
216
|
+
repeats : int
|
|
217
|
+
The number of times to repeat the evaluation with different random seeds.
|
|
218
|
+
random_state : int
|
|
219
|
+
The random seed for reproducibility.
|
|
569
220
|
|
|
221
|
+
Examples
|
|
222
|
+
--------
|
|
223
|
+
>>> protocol = UPP(batch_size=100, n_prevalences=5, repeats=3, random_state=42)
|
|
224
|
+
>>> for idx in protocol.split(X, y):
|
|
225
|
+
... # Train and evaluate model
|
|
226
|
+
... pass
|
|
227
|
+
"""
|
|
570
228
|
|
|
229
|
+
def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
|
|
230
|
+
super().__init__(batch_size=batch_size,
|
|
231
|
+
random_state=random_state,
|
|
232
|
+
n_prevalences=n_prevalences,
|
|
233
|
+
repeats=repeats)
|
|
571
234
|
|
|
235
|
+
def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
|
|
236
|
+
|
|
237
|
+
n_dim = len(np.unique(y))
|
|
238
|
+
|
|
239
|
+
for batch_size in self.batch_size:
|
|
240
|
+
|
|
241
|
+
prevalences = kraemer_sampling(n_dim=n_dim,
|
|
242
|
+
n_prev=self.n_prevalences,
|
|
243
|
+
n_iter=self.repeats)
|
|
244
|
+
|
|
245
|
+
for prev in prevalences:
|
|
246
|
+
indexes = get_indexes_with_prevalence(y, prev, batch_size)
|
|
247
|
+
yield indexes
|
|
572
248
|
|
|
573
249
|
|
|
250
|
+
class PPP(Protocol):
|
|
251
|
+
""" Personalized Prevalence Protocol (PPP) for evaluation.
|
|
252
|
+
This protocol draws samples at the prevalence distributions explicitly supplied by the user, rather than generating them.
|
|
574
253
|
|
|
575
|
-
class NPP(Protocol):
|
|
576
|
-
"""Natural Prevalence Protocol.
|
|
577
|
-
|
|
578
|
-
This approach splits a test into several samples varying sample size,
|
|
579
|
-
with n iterations. For a list of Quantifiers, it computes training and testing for
|
|
580
|
-
each one and returns either a table of results with error measures or just the predictions.
|
|
581
|
-
|
|
582
254
|
Parameters
|
|
583
255
|
----------
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
n_iterations : int, optional
|
|
591
|
-
Number of iterations for the protocol. Default is 1.
|
|
592
|
-
n_jobs : int, optional
|
|
593
|
-
Number of jobs to run in parallel. Default is 1.
|
|
256
|
+
batch_size : int or list of int
|
|
257
|
+
The size of the batches to be used in the evaluation.
|
|
258
|
+
prevalences : list of float
|
|
259
|
+
The prevalences to sample at. A float p is treated as the binary distribution [1 - p, p]; any other entry is used as given.
|
|
260
|
+
repeats : int
|
|
261
|
+
The number of times to repeat the evaluation with different random seeds.
|
|
594
262
|
random_state : int, optional
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
Whether to print progress messages. Default is False.
|
|
598
|
-
return_type : str, optional
|
|
599
|
-
Type of return value ('predictions' or 'table'). Default is 'predictions'.
|
|
600
|
-
measures : List[str], optional
|
|
601
|
-
List of error measures to calculate. Must be in MEASURES or None. Default is None.
|
|
602
|
-
|
|
263
|
+
The random seed for reproducibility.
|
|
264
|
+
|
|
603
265
|
Attributes
|
|
604
266
|
----------
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
learner : BaseEstimator
|
|
610
|
-
Machine learning model to be used with the quantifiers.
|
|
611
|
-
n_iterations : int
|
|
612
|
-
Number of iterations for the protocol.
|
|
613
|
-
n_jobs : int
|
|
614
|
-
Number of jobs to run in parallel.
|
|
267
|
+
prevalences : list of float
|
|
268
|
+
The list of artificial prevalences to generate for each class.
|
|
269
|
+
repeats : int
|
|
270
|
+
The number of times to repeat the evaluation with different random seeds.
|
|
615
271
|
random_state : int
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
272
|
+
The random seed for reproducibility.
|
|
273
|
+
|
|
274
|
+
Examples
|
|
275
|
+
--------
|
|
276
|
+
>>> protocol = PPP(batch_size=100, prevalences=[0.1, 0.9], repeats=3, random_state=42)
|
|
277
|
+
>>> for idx in protocol.split(X, y):
|
|
278
|
+
... # Train and evaluate model
|
|
279
|
+
... pass
|
|
623
280
|
"""
|
|
624
281
|
|
|
282
|
+
def __init__(self, batch_size, prevalences, repeats=1, random_state=None):
|
|
283
|
+
super().__init__(batch_size=batch_size,
|
|
284
|
+
random_state=random_state,
|
|
285
|
+
prevalences=prevalences,
|
|
286
|
+
repeats=repeats)
|
|
625
287
|
|
|
626
|
-
def
|
|
627
|
-
models: Union[List[Union[str, Quantifier]], str, Quantifier],
|
|
628
|
-
learner: BaseEstimator = None,
|
|
629
|
-
n_jobs: int = 1,
|
|
630
|
-
random_state: int = 32,
|
|
631
|
-
verbose: bool = False,
|
|
632
|
-
return_type: str = "predictions",
|
|
633
|
-
measures: List[str] = None):
|
|
634
|
-
|
|
635
|
-
super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
def predict_protocol(self, X_test, y_test) -> tuple:
|
|
639
|
-
raise NotImplementedError
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
|
|
643
|
-
raise NotImplementedError
|
|
288
|
+
def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
|
|
644
289
|
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
290
|
+
for batch_size in self.batch_size:
|
|
291
|
+
for prev in self.prevalences:
|
|
292
|
+
if isinstance(prev, float):
|
|
293
|
+
prev = [1-prev, prev]
|
|
294
|
+
|
|
295
|
+
indexes = get_indexes_with_prevalence(y, prev, batch_size)
|
|
296
|
+
yield indexes
|
|
297
|
+
|
mlquantify/methods/meta.py
CHANGED
|
@@ -7,7 +7,7 @@ from sklearn.model_selection import GridSearchCV, cross_val_predict
|
|
|
7
7
|
from ..evaluation import measures
|
|
8
8
|
from ..base import Quantifier
|
|
9
9
|
from ..utils.method import getHist, hellinger
|
|
10
|
-
from ..utils.general import make_prevs, normalize_prevalence, parallel,
|
|
10
|
+
from ..utils.general import make_prevs, normalize_prevalence, parallel, get_indexes_with_prevalence
|
|
11
11
|
|
|
12
12
|
class Ensemble(Quantifier):
|
|
13
13
|
"""Ensemble of Quantification Models.
|
|
@@ -401,7 +401,7 @@ def _delayed_new_sample(args):
|
|
|
401
401
|
print(f'\tfit-start for prev {str(np.round(prev, 3))}, sample_size={sample_size}')
|
|
402
402
|
model = deepcopy(base_quantifier)
|
|
403
403
|
|
|
404
|
-
sample_index =
|
|
404
|
+
sample_index = get_indexes_with_prevalence(y, prev, sample_size)
|
|
405
405
|
X_sample = np.take(X, sample_index, axis=0)
|
|
406
406
|
y_sample = np.take(y, sample_index, axis=0)
|
|
407
407
|
#print(X_sample)
|
mlquantify/utils/general.py
CHANGED
|
@@ -26,12 +26,9 @@ def convert_columns_to_arrays(df, columns:list = ['PRED_PREVS', 'REAL_PREVS']):
|
|
|
26
26
|
return df
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:list):
|
|
29
|
+
def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
|
|
33
30
|
"""
|
|
34
|
-
|
|
31
|
+
Get indexes for a stratified sample based on the prevalence of each class.
|
|
35
32
|
|
|
36
33
|
Parameters
|
|
37
34
|
----------
|
|
@@ -48,10 +45,13 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
|
|
|
48
45
|
-------
|
|
49
46
|
list
|
|
50
47
|
List of indexes for the stratified sample.
|
|
51
|
-
"""
|
|
48
|
+
"""
|
|
49
|
+
classes = np.unique(y)
|
|
50
|
+
|
|
52
51
|
# Ensure the sum of prevalences is 1
|
|
53
52
|
assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
|
|
54
53
|
# Ensure the number of prevalences matches the number of classes
|
|
54
|
+
assert len(prevalence) == len(classes), "The number of prevalences must match the number of classes"
|
|
55
55
|
|
|
56
56
|
sampled_indexes = []
|
|
57
57
|
total_sampled = 0
|
|
@@ -78,6 +78,43 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
|
|
|
78
78
|
|
|
79
79
|
|
|
80
80
|
|
|
81
|
+
def kraemer_sampling(n_dim: int, n_prev: int, n_iter: int = 1) -> np.ndarray:
|
|
82
|
+
"""
|
|
83
|
+
Uniform sampling from the unit simplex using Kraemer's algorithm.
|
|
84
|
+
|
|
85
|
+
Parameters
|
|
86
|
+
----------
|
|
87
|
+
n_dim : int
|
|
88
|
+
Number of dimensions.
|
|
89
|
+
n_prev : int
|
|
90
|
+
Size of the sample.
|
|
91
|
+
n_iter : int
|
|
92
|
+
Number of iterations.
|
|
93
|
+
|
|
94
|
+
Returns
|
|
95
|
+
-------
|
|
96
|
+
np.ndarray
|
|
97
|
+
Array of sampled prevalences.
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
def _sampling(n_dim: int, n_prev: int) -> np.ndarray:
|
|
101
|
+
if n_dim == 2:
|
|
102
|
+
u = np.random.rand(n_prev)
|
|
103
|
+
return np.vstack([1 - u, u]).T
|
|
104
|
+
else:
|
|
105
|
+
u = np.random.rand(n_prev, n_dim - 1)
|
|
106
|
+
u.sort(axis=-1) # sort each row
|
|
107
|
+
_0s = np.zeros((n_prev, 1))
|
|
108
|
+
_1s = np.ones((n_prev, 1))
|
|
109
|
+
a = np.hstack([_0s, u])
|
|
110
|
+
b = np.hstack([u, _1s])
|
|
111
|
+
return b - a
|
|
112
|
+
|
|
113
|
+
# repeat n_iter times
|
|
114
|
+
prevs = _sampling(n_dim, n_prev)
|
|
115
|
+
|
|
116
|
+
return np.repeat(prevs, n_iter, axis=0) if n_iter > 1 else prevs
|
|
117
|
+
|
|
81
118
|
|
|
82
119
|
def generate_artificial_prevalences(n_dim: int, n_prev: int, n_iter: int) -> np.ndarray:
|
|
83
120
|
"""Generates n artificial prevalences with n dimensions.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mlquantify
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Quantification Library
|
|
5
5
|
Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
|
|
6
6
|
Maintainer: Luiz Fernando Luth Junior
|
|
@@ -40,9 +40,9 @@ ___
|
|
|
40
40
|
|
|
41
41
|
## Latest Release
|
|
42
42
|
|
|
43
|
-
- **Version 0.
|
|
44
|
-
- In case you need any help, refer to the [
|
|
45
|
-
- Explore the [API documentation](
|
|
43
|
+
- **Version 0.1.3**: Initial beta version. For a detailed list of changes, check the [changelog](#).
|
|
44
|
+
- In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
|
|
45
|
+
- Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
|
|
46
46
|
- See also the library's page on PyPI: [pypi mlquantify](https://pypi.org/project/mlquantify/)
|
|
47
47
|
|
|
48
48
|
___
|
|
@@ -70,7 +70,7 @@ ___
|
|
|
70
70
|
| **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
|
|
71
71
|
| **Dynamic class management** | All methods are dynamic and handle both multiclass and binary problems; in the case of binary methods, One-Vs-All (OVA) is applied automatically. |
|
|
72
72
|
| **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
|
|
73
|
-
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE,
|
|
73
|
+
| **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
|
|
74
74
|
| **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.). |
|
|
75
75
|
| **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
|
|
76
76
|
| **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
|
|
@@ -82,7 +82,10 @@ ___
|
|
|
82
82
|
This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and the mean absolute error between the real and predicted prevalences.
|
|
83
83
|
|
|
84
84
|
```python
|
|
85
|
-
|
|
85
|
+
from mlquantify.methods import EMQ
|
|
86
|
+
from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
|
|
87
|
+
from mlquantify.utils import get_real_prev
|
|
88
|
+
|
|
86
89
|
from sklearn.ensemble import RandomForestClassifier
|
|
87
90
|
from sklearn.datasets import load_breast_cancer
|
|
88
91
|
from sklearn.model_selection import train_test_split
|
|
@@ -94,19 +97,19 @@ features, target = load_breast_cancer(return_X_y=True)
|
|
|
94
97
|
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
|
|
95
98
|
|
|
96
99
|
#Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
|
|
97
|
-
model =
|
|
100
|
+
model = EMQ(RandomForestClassifier())
|
|
98
101
|
model.fit(X_train, y_train)
|
|
99
102
|
|
|
100
103
|
#Predict the class prevalence for X_test
|
|
101
104
|
pred_prevalence = model.predict(X_test)
|
|
102
|
-
real_prevalence =
|
|
105
|
+
real_prevalence = get_real_prev(y_test)
|
|
103
106
|
|
|
104
107
|
#Get the error for the prediction
|
|
105
|
-
ae =
|
|
106
|
-
|
|
108
|
+
ae = absolute_error(real_prevalence, pred_prevalence)
|
|
109
|
+
mae = mean_absolute_error(real_prevalence, pred_prevalence)
|
|
107
110
|
|
|
108
|
-
print(f"
|
|
109
|
-
print(f"
|
|
111
|
+
print(f"Absolute Error -> {ae}")
|
|
112
|
+
print(f"Mean Absolute Error -> {mae}")
|
|
110
113
|
```
|
|
111
114
|
|
|
112
115
|
___
|
|
@@ -125,7 +128,7 @@ ___
|
|
|
125
128
|
|
|
126
129
|
## Documentation
|
|
127
130
|
|
|
128
|
-
##### API is avaliable [here](
|
|
131
|
+
##### API is available [here](https://luizfernandolj.github.io/mlquantify/api/index.html)
|
|
129
132
|
|
|
130
133
|
- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
|
|
131
134
|
- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
|
|
@@ -6,17 +6,17 @@ mlquantify/classification/__init__.py,sha256=3FGf-F4SOM3gByUPsWdnBzjyC_31B3Mtzuo
|
|
|
6
6
|
mlquantify/classification/methods.py,sha256=yDSbpoqM3hfF0a9ATzKqfG9S-44x-0Rq0lkAVJKTIEs,5006
|
|
7
7
|
mlquantify/evaluation/__init__.py,sha256=x1grng0n_QeZpVBU8-pwagYdBMkbMRILtrp1qk_bLvk,447
|
|
8
8
|
mlquantify/evaluation/measures.py,sha256=fIKyxxlD8em3oaj4u_BeXmNyUQG_A0vXWY8APPgNoJ0,6579
|
|
9
|
-
mlquantify/evaluation/protocol.py,sha256=
|
|
9
|
+
mlquantify/evaluation/protocol.py,sha256=__tzRyqW4cJz4Fl87TInf7dXxIJ6bSaYaSaw-SdkNmM,10365
|
|
10
10
|
mlquantify/methods/__init__.py,sha256=ya3Mn7bcz2r3oaIT7yVR4iJkAfgEAwF4xDK54C0rZ7U,536
|
|
11
11
|
mlquantify/methods/aggregative.py,sha256=F5Z-tGA9OcZgMBLKOeaos6wIgvvnDeriZ4y0TyMpDrc,39051
|
|
12
|
-
mlquantify/methods/meta.py,sha256=
|
|
12
|
+
mlquantify/methods/meta.py,sha256=mBunCc_PFLdmrs5sf5MDc8TbO3VFpLAmxV2y2VDNjY8,19052
|
|
13
13
|
mlquantify/methods/mixture_models.py,sha256=si2Pzaka5Kbva4QKBzLolvb_8V0ZEjp68UBAiOwl49s,35166
|
|
14
14
|
mlquantify/methods/non_aggregative.py,sha256=xaBu21TUtiYkOEUKO16NaNMwdNa6-SNjfBsc5PpIMyI,4815
|
|
15
15
|
mlquantify/methods/threshold_optimization.py,sha256=NYGKbYvtfmiBeU8wpTiFCdURkijcPRZtybPOt6vtXbY,30489
|
|
16
16
|
mlquantify/utils/__init__.py,sha256=logWrL6B6mukP8tvYm_UPEdO9eNA-J-ySILr7-syDoc,44
|
|
17
|
-
mlquantify/utils/general.py,sha256=
|
|
17
|
+
mlquantify/utils/general.py,sha256=wKJSmwF1KfSlSrDm0KTf92FMvB62BBOxf2Se9HyeWYE,8668
|
|
18
18
|
mlquantify/utils/method.py,sha256=RL4vBJGl5_6DZ59Bs62hdNXI_hnoDIWilMMyMPiOjBg,12631
|
|
19
|
-
mlquantify-0.1.
|
|
20
|
-
mlquantify-0.1.
|
|
21
|
-
mlquantify-0.1.
|
|
22
|
-
mlquantify-0.1.
|
|
19
|
+
mlquantify-0.1.5.dist-info/METADATA,sha256=bBEPfQhD4FYz9K4XsFCikSefsqo6JsNjqEPxTOW-Fv0,5166
|
|
20
|
+
mlquantify-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
+
mlquantify-0.1.5.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
|
|
22
|
+
mlquantify-0.1.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|