mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. mlquantify/__init__.py +32 -6
  2. mlquantify/base.py +559 -257
  3. mlquantify/classification/__init__.py +1 -1
  4. mlquantify/classification/methods.py +160 -0
  5. mlquantify/evaluation/__init__.py +14 -2
  6. mlquantify/evaluation/measures.py +215 -0
  7. mlquantify/evaluation/protocol.py +647 -0
  8. mlquantify/methods/__init__.py +37 -40
  9. mlquantify/methods/aggregative.py +1030 -0
  10. mlquantify/methods/meta.py +472 -0
  11. mlquantify/methods/mixture_models.py +1003 -0
  12. mlquantify/methods/non_aggregative.py +136 -0
  13. mlquantify/methods/threshold_optimization.py +957 -0
  14. mlquantify/model_selection.py +377 -232
  15. mlquantify/plots.py +367 -0
  16. mlquantify/utils/__init__.py +2 -2
  17. mlquantify/utils/general.py +334 -0
  18. mlquantify/utils/method.py +449 -0
  19. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
  20. mlquantify-0.1.1.dist-info/RECORD +22 -0
  21. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
  22. mlquantify/classification/pwkclf.py +0 -73
  23. mlquantify/evaluation/measures/__init__.py +0 -26
  24. mlquantify/evaluation/measures/ae.py +0 -11
  25. mlquantify/evaluation/measures/bias.py +0 -16
  26. mlquantify/evaluation/measures/kld.py +0 -8
  27. mlquantify/evaluation/measures/mse.py +0 -12
  28. mlquantify/evaluation/measures/nae.py +0 -16
  29. mlquantify/evaluation/measures/nkld.py +0 -13
  30. mlquantify/evaluation/measures/nrae.py +0 -16
  31. mlquantify/evaluation/measures/rae.py +0 -12
  32. mlquantify/evaluation/measures/se.py +0 -12
  33. mlquantify/evaluation/protocol/_Protocol.py +0 -202
  34. mlquantify/evaluation/protocol/__init__.py +0 -2
  35. mlquantify/evaluation/protocol/app.py +0 -146
  36. mlquantify/evaluation/protocol/npp.py +0 -34
  37. mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
  38. mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
  39. mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
  40. mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
  41. mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
  42. mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
  43. mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
  44. mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
  45. mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
  46. mlquantify/methods/aggregative/__init__.py +0 -9
  47. mlquantify/methods/aggregative/cc.py +0 -32
  48. mlquantify/methods/aggregative/emq.py +0 -86
  49. mlquantify/methods/aggregative/fm.py +0 -72
  50. mlquantify/methods/aggregative/gac.py +0 -96
  51. mlquantify/methods/aggregative/gpac.py +0 -87
  52. mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
  53. mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
  54. mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
  55. mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
  56. mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
  57. mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
  58. mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
  59. mlquantify/methods/aggregative/pcc.py +0 -33
  60. mlquantify/methods/aggregative/pwk.py +0 -38
  61. mlquantify/methods/meta/__init__.py +0 -1
  62. mlquantify/methods/meta/ensemble.py +0 -236
  63. mlquantify/methods/non_aggregative/__init__.py +0 -1
  64. mlquantify/methods/non_aggregative/hdx.py +0 -71
  65. mlquantify/plots/__init__.py +0 -2
  66. mlquantify/plots/distribution_plot.py +0 -109
  67. mlquantify/plots/protocol_plot.py +0 -193
  68. mlquantify/utils/general_purposes/__init__.py +0 -8
  69. mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
  70. mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
  71. mlquantify/utils/general_purposes/get_real_prev.py +0 -9
  72. mlquantify/utils/general_purposes/load_quantifier.py +0 -4
  73. mlquantify/utils/general_purposes/make_prevs.py +0 -23
  74. mlquantify/utils/general_purposes/normalize.py +0 -20
  75. mlquantify/utils/general_purposes/parallel.py +0 -10
  76. mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
  77. mlquantify/utils/method_purposes/__init__.py +0 -6
  78. mlquantify/utils/method_purposes/distances.py +0 -21
  79. mlquantify/utils/method_purposes/getHist.py +0 -13
  80. mlquantify/utils/method_purposes/get_scores.py +0 -33
  81. mlquantify/utils/method_purposes/moss.py +0 -16
  82. mlquantify/utils/method_purposes/ternary_search.py +0 -14
  83. mlquantify/utils/method_purposes/tprfpr.py +0 -42
  84. mlquantify-0.0.11.2.dist-info/RECORD +0 -73
  85. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,647 @@
1
+ from abc import ABC, abstractmethod
2
+ import numpy as np
3
+ import pandas as pd
4
+ from typing import Union, List, Tuple, Any
5
+ from sklearn.base import BaseEstimator
6
+ from time import time
7
+ from tqdm import tqdm
8
+
9
+ from ..methods import METHODS, AGGREGATIVE, NON_AGGREGATIVE
10
+ from ..utils.general import *
11
+ from ..utils.method import *
12
+ from . import MEASURES
13
+ from ..base import Quantifier
14
+
15
+ import mlquantify as mq
16
+
17
class Protocol(ABC):
    """Base class for evaluation protocols.

    Parameters
    ----------
    models : Union[List[Union[str, Quantifier]], str, Quantifier]
        List of quantification models, a single model name, or 'all' for all models.
    learner : BaseEstimator, optional
        Machine learning model to be used with the quantifiers. Required for model methods.
    n_jobs : int, optional
        Number of jobs to run in parallel. Default is 1.
    random_state : int, optional
        Seed for random number generation. Default is 32.
    verbose : bool, optional
        Whether to print progress messages. Default is False.
    return_type : str, optional
        Type of return value ('predictions' or 'table'). Default is 'predictions'.
    measures : List[str], optional
        List of error measures to calculate. Must be in MEASURES or None. Default is None.
    columns : List[str], optional
        Columns to be included in the table. Default is
        ['ITERATION', 'QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'BATCH_SIZE'].

    Attributes
    ----------
    models : List[Quantifier]
        List of quantification models.
    learner : BaseEstimator
        Machine learning model to be used with the quantifiers.
    n_jobs : int
        Number of jobs to run in parallel.
    random_state : int
        Seed for random number generation.
    verbose : bool
        Whether to print progress messages.
    return_type : str
        Type of return value ('predictions' or 'table').
    measures : List[str]
        List of error measures to calculate.
    columns : List[str]
        Columns to be included in the table.

    Raises
    ------
    AssertionError
        If measures contain invalid error measures.
        If return_type is invalid.
        If columns does not contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].

    Notes
    -----
    - The 'models' parameter can be a list of Quantifiers, a single Quantifier, a list
      of model names, a single model name, or 'all'.
    - If 'models' is a list of model names or 'all', 'learner' must be provided.
    - The 'all' option for 'models' will use all quantification models available in the library.
    - If 'models' is a Quantifier or list of Quantifier, 'learner' is not required,
      but the models must already be initialized.
    - You can pass your own model by passing a Quantifier object.
    - Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
    - If 'return_type' is 'table', the table will contain the columns specified in
      'columns' plus the error measures in 'measures'.
    - For creating your own protocol, you must keep the attributes 'models', 'learner',
      'n_jobs', 'random_state', 'verbose', 'return_type', 'measures' and 'columns';
      'columns' can be changed as long as it contains
      ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].

    See Also
    --------
    APP : Artificial Prevalence Protocol.
    NPP : Natural Prevalence Protocol.
    Quantifier : Base class for quantification methods.

    Examples
    --------
    >>> import numpy as np
    >>> from mlquantify.evaluation.protocol import Protocol
    >>> from mlquantify.utils import get_real_prev
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> import time as t
    >>>
    >>> class MyProtocol(Protocol):
    ...     def __init__(self,
    ...                  models,
    ...                  learner,
    ...                  n_jobs,
    ...                  random_state,
    ...                  verbose,
    ...                  return_type,
    ...                  measures,
    ...                  sample_size,
    ...                  iterations=10):
    ...         super().__init__(models,
    ...                          learner,
    ...                          n_jobs,
    ...                          random_state,
    ...                          verbose,
    ...                          return_type,
    ...                          measures,
    ...                          columns=['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'TIME'])
    ...         self.sample_size = sample_size
    ...         self.iterations = iterations
    ...
    ...     def predict_protocol(self, X_test, y_test):
    ...         predictions = []
    ...         X_sample, y_sample = self._new_sample(X_test, y_test)
    ...         for _ in range(self.iterations):
    ...             for model in self.models:
    ...                 quantifier = model.__class__.__name__
    ...                 real_prev = get_real_prev(y_sample)
    ...                 start_time = t.time()
    ...                 pred_prev = model.predict(X_sample)
    ...                 end_time = t.time()
    ...                 time = end_time - start_time
    ...                 predictions.append([quantifier, real_prev, pred_prev, time])
    ...         return predictions
    ...
    ...     def _new_sample(self, X_test, y_test):
    ...         indexes = np.random.choice(len(X_test), size=self.sample_size, replace=False)
    ...         return X_test[indexes], y_test[indexes]
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.5, random_state=42)
    >>>
    >>> protocol = MyProtocol(models=["CC", "EMQ", "DyS"],  # or [CC(learner), EMQ(learner), DyS(learner)]
    ...                       learner=RandomForestClassifier(),
    ...                       n_jobs=1,
    ...                       random_state=42,
    ...                       verbose=True,
    ...                       return_type="table",
    ...                       measures=None,
    ...                       sample_size=100)
    >>>
    >>> protocol.fit(X_train, y_train)
    >>> table = protocol.predict(X_test, y_test)
    >>> print(table)
    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 learner: BaseEstimator = None,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None,
                 columns: List[str] = None):

        # BUGFIX: the previous signature used a mutable list as the default for
        # 'columns', which is shared across all instances; use a None sentinel
        # and build a fresh list per call instead.
        if columns is None:
            columns = ["ITERATION", "QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]

        assert not measures or all(m in MEASURES for m in measures), \
            f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
        assert return_type in ["predictions", "table"], \
            "Invalid return_type. Valid options: ['predictions', 'table']"
        assert all(col in columns for col in ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS"]), \
            "Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS']"

        # Fixed parameters
        self.models = self._initialize_models(models, learner)
        self.learner = learner
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.return_type = return_type
        self.measures = measures
        self.columns = columns

    def _initialize_models(self, models, learner):
        """Initializes the quantification models.

        Parameters
        ----------
        models : Union[List[Union[str, Quantifier]], str, Quantifier]
            List of quantification models, a single model name, or 'all' for all models.
        learner : BaseEstimator
            Machine learning model to be used with the quantifiers.

        Returns
        -------
        List[Quantifier]
            List of quantification models.
        """
        if isinstance(models, list):
            # Already-constructed quantifiers are used as-is; names are resolved
            # through the library registry and bound to the given learner.
            if all(isinstance(model, Quantifier) for model in models):
                return models
            return [get_method(model)(learner) for model in models]

        if isinstance(models, Quantifier):
            return [models]

        assert learner is not None, "Learner is required for model methods."

        model_dict = {
            "all": METHODS.values,
            "aggregative": AGGREGATIVE.values,
            "non_aggregative": NON_AGGREGATIVE.values
        }

        if models in model_dict:
            # Non-aggregative methods may not take a learner at all.
            return [model(learner) if hasattr(model, "learner") else model() for model in model_dict[models]()]
        return [get_method(models)(learner)]

    def sout(self, msg):
        """Prints a message if verbose is True."""
        if self.verbose:
            # BUGFIX: the tag was hardcoded to '[APP]' in the base class and had
            # no separator; label the message with the concrete protocol name.
            print(f'[{self.__class__.__name__}] {msg}')

    def fit(self, X_train, y_train):
        """Fits the models with the training data.

        Parameters
        ----------
        X_train : np.ndarray
            Features of the training set.
        y_train : np.ndarray
            Labels of the training set.

        Returns
        -------
        Protocol
            Fitted protocol.
        """
        self.sout("Fitting models")

        args = ((model, X_train, y_train) for model in self.models)

        # Only show a progress bar in verbose mode.
        wrapper = tqdm if self.verbose else lambda x, **kwargs: x

        self.models = Parallel(n_jobs=self.n_jobs)(  # Parallel processing of models
            delayed(self._delayed_fit)(*arg) for arg in wrapper(args, desc="Fitting models", total=len(self.models))
        )
        self.sout("Fit [Done]")
        return self

    def predict(self, X_test: np.ndarray, y_test: np.ndarray) -> Any:
        """Predicts the prevalence for the test set.

        Parameters
        ----------
        X_test : np.ndarray
            Features of the test set.
        y_test : np.ndarray
            Labels of the test set.

        Returns
        -------
        Any
            Predictions for the test set. Can be a table or a tuple with the
            quantifier names, real prevalence, and predicted prevalence.
        """
        predictions = self.predict_protocol(X_test, y_test)
        predictions_df = pd.DataFrame(predictions, columns=self.columns)

        if self.return_type == "table":
            if self.measures:
                # Laplace-style smoothing so measures such as KLD never see a
                # zero prevalence.
                smoothed_factor = 1 / (2 * len(X_test))

                def smooth(values: np.ndarray) -> np.ndarray:
                    return (values + smoothed_factor) / (smoothed_factor * len(values) + 1)

                for metric in self.measures:
                    predictions_df[metric] = predictions_df.apply(
                        lambda row: get_measure(metric)(
                            smooth(np.array(row["REAL_PREVS"])),
                            smooth(np.array(row["PRED_PREVS"]))
                        ),
                        axis=1
                    )
            return predictions_df

        return (
            predictions_df["QUANTIFIER"].to_numpy(),           # Quantifier names
            np.stack(predictions_df["REAL_PREVS"].to_numpy()), # REAL_PREVS
            np.stack(predictions_df["PRED_PREVS"].to_numpy())  # PRED_PREVS
        )

    @abstractmethod
    def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> np.ndarray:
        """Abstract method that every protocol must implement.

        Parameters
        ----------
        X_test : np.ndarray
            Features of the test set.
        y_test : np.ndarray
            Labels of the test set.

        Returns
        -------
        np.ndarray
            Predictions for the test set. With the same format as the column
            names attribute.
        """
        ...

    @abstractmethod
    def _new_sample(self) -> Tuple[np.ndarray, np.ndarray]:
        """Abstract method of sample extraction for each protocol.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Tuple containing X_sample and y_sample.
        """
        ...

    @staticmethod
    def _delayed_fit(model, X_train, y_train):
        """Method to fit the model in parallel.

        Parameters
        ----------
        model : Quantifier
            Quantification model.
        X_train : np.ndarray
            Features of the training set.
        y_train : np.ndarray
            Labels of the training set.

        Returns
        -------
        Quantifier
            Fitted quantification model.
        """
        model_name = model.__class__.__name__
        # Ensembles are disambiguated by their base quantifier and size.
        if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
            model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"

        start = time()
        model = model.fit(X=X_train, y=y_train)
        duration = time() - start
        # NOTE(review): printed unconditionally; a staticmethod has no access to
        # self.verbose — consider threading verbosity through if this is noisy.
        print(f"\tFitted {model_name} in {duration:.3f} seconds")
        return model
class APP(Protocol):
    """Artificial Prevalence Protocol.

    This approach splits a test into several samples varying prevalence and sample size,
    with n iterations. For a list of Quantifiers, it computes training and testing for
    each one and returns either a table of results with error measures or just the predictions.

    Parameters
    ----------
    models : Union[List[Union[str, Quantifier]], str, Quantifier]
        List of quantification models, a single model name, or 'all' for all models.
    batch_size : Union[List[int], int]
        Size of the batches to be processed, or a list of sizes.
    learner : BaseEstimator, optional
        Machine learning model to be used with the quantifiers. Required for model methods.
    n_prevs : int, optional
        Number of prevalence points to generate. Default is 100.
    n_iterations : int, optional
        Number of iterations for the protocol. Default is 1.
    n_jobs : int, optional
        Number of jobs to run in parallel. Default is 1.
    random_state : int, optional
        Seed for random number generation. Default is 32.
    verbose : bool, optional
        Whether to print progress messages. Default is False.
    return_type : str, optional
        Type of return value ('predictions' or 'table'). Default is 'predictions'.
    measures : List[str], optional
        List of error measures to calculate. Must be in MEASURES or None. Default is None.

    Attributes
    ----------
    models : List[Quantifier]
        List of quantification models.
    batch_size : List[int]
        Sizes of the batches to be processed (a scalar input is wrapped in a list).
    learner : BaseEstimator
        Machine learning model to be used with the quantifiers.
    n_prevs : int
        Number of prevalence points to generate.
    n_iterations : int
        Number of iterations for the protocol.
    n_jobs : int
        Number of jobs to run in parallel.
    random_state : int
        Seed for random number generation.
    verbose : bool
        Whether to print progress messages.
    return_type : str
        Type of return value ('predictions' or 'table').
    measures : List[str]
        List of error measures to calculate.

    Raises
    ------
    AssertionError
        If return_type is invalid.

    See Also
    --------
    Protocol : Base class for evaluation protocols.
    NPP : Natural Prevalence Protocol.
    Quantifier : Base class for quantification methods.

    Examples
    --------
    >>> from mlquantify.evaluation.protocol import APP
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> # Loading dataset from sklearn
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> # Splitting into train and test
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
    >>>
    >>> app = APP(models=["CC", "EMQ", "DyS"],
    ...           batch_size=[10, 50, 100],
    ...           learner=RandomForestClassifier(),
    ...           n_prevs=100,  # Default
    ...           n_jobs=-1,
    ...           return_type="table",
    ...           measures=["ae", "se"],
    ...           verbose=True)
    >>>
    >>> app.fit(X_train, y_train)
    >>>
    >>> table = app.predict(X_test, y_test)
    >>>
    >>> print(table)
    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 batch_size: Union[List[int], int],
                 learner: BaseEstimator = None,
                 n_prevs: int = 100,
                 n_iterations: int = 1,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None):

        super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
        # BUGFIX: n_prevs was assigned twice in the original; assign once.
        self.n_prevs = n_prevs
        # Normalize a scalar batch size to a one-element list.
        self.batch_size = batch_size if isinstance(batch_size, list) else [batch_size]
        self.n_iterations = n_iterations

    def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> Tuple:
        """Generates several samples with artificial prevalences and sizes.
        For each model, predicts with this sample, aggregating all together
        with a pandas dataframe if requested, or else just the predictions.

        Parameters
        ----------
        X_test : np.ndarray
            Features of the test set.
        y_test : np.ndarray
            Labels of the test set.

        Returns
        -------
        Tuple
            Tuple containing the (iteration, model name, prev, prev_pred, and batch size).
        """
        n_dim = len(np.unique(y_test))
        prevs = generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)

        # Cartesian product of prevalences x batch sizes x models x iterations.
        args = [
            (iteration, X_test, y_test, model, prev, bs, self.verbose)
            for prev in prevs for bs in self.batch_size for model in self.models for iteration in range(self.n_iterations)
        ]

        size = len(prevs) * len(self.models) * len(self.batch_size) * self.n_iterations

        # CONSISTENCY: gate the progress bar on verbosity, matching Protocol.fit.
        wrapper = tqdm if self.verbose else lambda x, **kwargs: x

        predictions = []
        for arg in wrapper(args, desc="Running APP", total=size):
            predictions.append(self._predict(*arg))

        return predictions

    def _predict(self, iteration: int, X: np.ndarray, y: np.ndarray, model: Any, prev: List[float], batch_size: int, verbose: bool) -> Tuple:
        """Method predicts into the new sample for each model and prevalence.

        Parameters
        ----------
        iteration : int
            Current iteration.
        X : np.ndarray
            Features of the test set.
        y : np.ndarray
            Labels of the test set.
        model : Any
            Quantification model.
        prev : List[float]
            Prevalence values for the sample.
        batch_size : int
            Batch size for the sample.
        verbose : bool
            Whether to print progress messages.

        Returns
        -------
        Tuple
            Tuple containing the iteration, model name, prev, prev_pred, and batch size.
        """
        model_name = model.__class__.__name__
        # Ensembles are disambiguated by their base quantifier and size.
        if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
            model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"

        if verbose:
            print(f'\t {model_name} with {batch_size} instances and prev {prev}')

        X_sample, _ = self._new_sample(X, y, prev, batch_size)
        prev_pred = np.asarray(list(model.predict(X_sample).values()))

        if verbose:
            print(f'\t \\--Ending {model_name} with {batch_size} instances and prev {prev}\n')

        # Iterations are reported 1-based in the output table.
        return (iteration + 1, model_name, prev, prev_pred, batch_size)

    def _new_sample(self, X: np.ndarray, y: np.ndarray, prev: List[float], batch_size: int) -> Tuple[np.ndarray, np.ndarray]:
        """Generates a new sample with a specified prevalence and size.

        Parameters
        ----------
        X : np.ndarray
            Features of the test set.
        y : np.ndarray
            Labels of the test set.
        prev : List[float]
            Prevalence values for the sample.
        batch_size : int
            Batch size for the sample.

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            Tuple containing the new sample features and labels.
        """
        sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
        return (np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0))
575
class NPP(Protocol):
    """Natural Prevalence Protocol.

    This approach splits a test into several samples varying sample size,
    with n iterations. For a list of Quantifiers, it computes training and testing for
    each one and returns either a table of results with error measures or just the predictions.

    .. warning::
        This protocol is not implemented yet: ``predict_protocol`` and
        ``_new_sample`` raise :class:`NotImplementedError`.

    Parameters
    ----------
    models : Union[List[Union[str, Quantifier]], str, Quantifier]
        List of quantification models, a single model name, or 'all' for all models.
    learner : BaseEstimator, optional
        Machine learning model to be used with the quantifiers. Required for model methods.
    n_jobs : int, optional
        Number of jobs to run in parallel. Default is 1.
    random_state : int, optional
        Seed for random number generation. Default is 32.
    verbose : bool, optional
        Whether to print progress messages. Default is False.
    return_type : str, optional
        Type of return value ('predictions' or 'table'). Default is 'predictions'.
    measures : List[str], optional
        List of error measures to calculate. Must be in MEASURES or None. Default is None.

    Attributes
    ----------
    models : List[Quantifier]
        List of quantification models.
    learner : BaseEstimator
        Machine learning model to be used with the quantifiers.
    n_jobs : int
        Number of jobs to run in parallel.
    random_state : int
        Seed for random number generation.
    verbose : bool
        Whether to print progress messages.
    return_type : str
        Type of return value ('predictions' or 'table').
    measures : List[str]
        List of error measures to calculate.
    """

    def __init__(self,
                 models: Union[List[Union[str, Quantifier]], str, Quantifier],
                 learner: BaseEstimator = None,
                 n_jobs: int = 1,
                 random_state: int = 32,
                 verbose: bool = False,
                 return_type: str = "predictions",
                 measures: List[str] = None):

        super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)

    def predict_protocol(self, X_test, y_test) -> tuple:
        """Not implemented yet for the natural prevalence protocol."""
        raise NotImplementedError

    def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
        """Not implemented yet for the natural prevalence protocol."""
        raise NotImplementedError

    def _delayed_predict(self, args) -> tuple:
        """Not implemented yet for the natural prevalence protocol."""
        raise NotImplementedError