mlquantify 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,647 +1,297 @@
  from abc import ABC, abstractmethod
  import numpy as np
- import pandas as pd
- from typing import Union, List, Tuple, Any
- from sklearn.base import BaseEstimator
- from time import time
+ from typing import Generator, Tuple
  from tqdm import tqdm

- from ..methods import METHODS, AGGREGATIVE, NON_AGGREGATIVE
  from ..utils.general import *
- from ..utils.method import *
- from . import MEASURES
- from ..base import Quantifier
-
- import mlquantify as mq

  class Protocol(ABC):
      """Base class for evaluation protocols.

      Parameters
      ----------
-     models : Union[List[Union[str, Quantifier]], str, Quantifier]
-         List of quantification models, a single model name, or 'all' for all models.
-     learner : BaseEstimator, optional
-         Machine learning model to be used with the quantifiers. Required for model methods.
-     n_jobs : int, optional
-         Number of jobs to run in parallel. Default is 1.
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
      random_state : int, optional
-         Seed for random number generation. Default is 32.
-     verbose : bool, optional
-         Whether to print progress messages. Default is False.
-     return_type : str, optional
-         Type of return value ('predictions' or 'table'). Default is 'predictions'.
-     measures : List[str], optional
-         List of error measures to calculate. Must be in MEASURES or None. Default is None.
-     columns : List[str], optional
-         Columns to be included in the table. Default is ['ITERATION', 'QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'BATCH_SIZE'].
-
+         The random seed for reproducibility.
+
      Attributes
      ----------
-     models : List[Quantifier]
-         List of quantification models.
-     learner : BaseEstimator
-         Machine learning model to be used with the quantifiers.
-     n_jobs : int
-         Number of jobs to run in parallel.
-     random_state : int
-         Seed for random number generation.
-     verbose : bool
-         Whether to print progress messages.
-     return_type : str
-         Type of return value ('predictions' or 'table').
-     measures : List[str]
-         List of error measures to calculate.
-     columns : List[str]
-         Columns to be included in the table.
-
+     n_combinations : int
+         The total number of batch/parameter combinations the protocol will generate.
+
      Raises
      ------
-     AssertionError
-         If measures contain invalid error measures.
-         If return_type is invalid.
-         If columns does not contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
-
+     ValueError
+         If the batch size is not a positive integer or list of positive integers.
+
      Notes
      -----
-     - The 'models' parameter can be a list of Quantifiers, a single Quantifier, a list of model names, a single model name, or 'all'.
-     - If 'models' is a list of model names or 'all', 'learner' must be provided.
-     - The 'all' option for 'models' will use all quantification models available in the library.
-     - If 'models' is a Quantifier or list of Quantifier, 'learner' is not required. But the models must be initializated
-     - You can pass your own model by passing a Quantifier object.
-     - Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
-     - If 'return_type' is 'table', the table will contain the columns specified in 'columns' and the error measures in 'measures'.
-     - For creating your own protocol, you must have the attributes 'models', 'learner', 'n_jobs', 'random_state', 'verbose', 'return_type', 'measures', and 'columns'., but columns can be changed, as long as it contains ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS'].
-
-     See Also
-     --------
-     APP : Artificial Prevalence Protocol.
-     NPP : Natural Prevalence Protocol.
-     Quantifier : Base class for quantification methods.
+     This class serves as a base class for different evaluation protocols, each with its own strategy for splitting the data into batches.

      Examples
      --------
-     import numpy as np
-     >>> from mlquantify.evaluation.protocol import Protocol
-     >>> from mlquantify.utils import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>> import time as t
-     >>>
-     >>> class MyProtocol(Protocol):
-     ...     def __init__(self,
-     ...                  models,
-     ...                  learner,
-     ...                  n_jobs,
-     ...                  random_state,
-     ...                  verbose,
-     ...                  return_type,
-     ...                  measures,
-     ...                  sample_size,
-     ...                  iterations=10):
-     ...         super().__init__(models,
-     ...                          learner,
-     ...                          n_jobs,
-     ...                          random_state,
-     ...                          verbose,
-     ...                          return_type,
-     ...                          measures,
-     ...                          columns=['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS', 'TIME'])
-     ...         self.sample_size = sample_size
-     ...         self.iterations = iterations
-     ...
-     ...     def predict_protocol(self, X_test, y_test):
-     ...         predictions = []
-     ...
-     ...         X_sample, y_sample = self._new_sample(X_test, y_test)
-     ...
-     ...         for _ in range(self.iterations):
-     ...             for model in self.models:
-     ...                 quantifier = model.__class__.__name__
-     ...
-     ...                 real_prev = get_real_prev(y_sample)
+     >>> class MyCustomProtocol(Protocol):
+     ...     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+     ...         for batch_size in self.batch_size:
+     ...             yield np.random.choice(X.shape[0], batch_size, replace=True)
      ...
-     ...                 start_time = t.time()
-     ...                 pred_prev = model.predict(X_sample)
-     ...                 end_time = t.time()
-     ...                 time = end_time - start_time
-     ...
-     ...                 predictions.append([quantifier, real_prev, pred_prev, time])
-     ...
-     ...         return predictions
-     ...
-     ...     def _new_sample(self, X_test, y_test):
-     ...         indexes = np.random.choice(len(X_test), size=self.sample_size, replace=False)
-     ...         return X_test[indexes], y_test[indexes]
-     >>>
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>>
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.5, random_state=42)
-     >>>
-     >>> protocol = MyProtocol(models=["CC", "EMQ", "DyS"], # or [CC(learner), EMQ(learner), DyS(learner)]
-     ...                       learner=RandomForestClassifier(),
-     ...                       n_jobs=1,
-     ...                       random_state=42,
-     ...                       verbose=True,
-     ...                       return_type="table",
-     ...                       measures=None,
-     ...                       sample_size=100)
-     >>>
-     >>> protocol.fit(X_train, y_train)
-     >>> table = protocol.predict(X_test, y_test)
-     >>> print(table)
-
+     >>> protocol = MyCustomProtocol(batch_size=100, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
+
      """
-
-     def __init__(self,
-                  models: Union[List[Union[str, Quantifier]], str, Quantifier],
-                  learner: BaseEstimator = None,
-                  n_jobs: int = 1,
-                  random_state: int = 32,
-                  verbose: bool = False,
-                  return_type: str = "predictions",
-                  measures: List[str] = None,
-                  columns: List[str] = ["ITERATION", "QUANTIFIER", "REAL_PREVS", "PRED_PREVS", "BATCH_SIZE"]):
-
-         assert not measures or all(m in MEASURES for m in measures), \
-             f"Invalid measure(s) provided. Valid options: {list(MEASURES.keys())} or None"
-         assert return_type in ["predictions", "table"], \
-             "Invalid return_type. Valid options: ['predictions', 'table']"
-         assert all(col in columns for col in ["QUANTIFIER", "REAL_PREVS", "PRED_PREVS"]), \
-             "Columns must contain ['QUANTIFIER', 'REAL_PREVS', 'PRED_PREVS']"
-
-         # Fixed parameters
-         self.models = self._initialize_models(models, learner)
-         self.learner = learner
-         self.n_jobs = n_jobs
-         self.random_state = random_state
-         self.verbose = verbose
-         self.return_type = return_type
-         self.measures = measures
-         self.columns = columns
-
-     def _initialize_models(self, models, learner):
-         """Initializes the quantification models.
-
-         Parameters
-         ----------
-         models : Union[List[Union[str, Quantifier]], str, Quantifier]
-             List of quantification models, a single model name, or 'all' for all models.
-         learner : BaseEstimator
-             Machine learning model to be used with the quantifiers.
-
-         Returns
-         -------
-         List[Quantifier]
-             List of quantification models.
-         """
-         if isinstance(models, list):
-             if all(isinstance(model, Quantifier) for model in models):
-                 return models
-             return [get_method(model)(learner) for model in models]
-
-         if isinstance(models, Quantifier):
-             return [models]

-         assert learner is not None, "Learner is required for model methods."
+     def __init__(self, batch_size, random_state=None, **kwargs):
+         if isinstance(batch_size, int):
+             self.n_combinations = 1
+         else:
+             self.n_combinations = len(batch_size)

-         model_dict = {
-             "all": METHODS.values,
-             "aggregative": AGGREGATIVE.values,
-             "non_aggregative": NON_AGGREGATIVE.values
-         }
+         self.batch_size = [batch_size] if isinstance(batch_size, int) else batch_size
+         self.random_state = random_state

-         if models in model_dict:
-             return [model(learner) if hasattr(model, "learner") else model() for model in model_dict[models]()]
-         return [get_method(models)(learner)]
-
-     def sout(self, msg):
-         """Prints a message if verbose is True."""
-         if self.verbose:
-             print('[APP]' + msg)
-
-     def fit(self, X_train, y_train):
-         """Fits the models with the training data.
-
-         Parameters
-         ----------
-         X_train : np.ndarray
-             Features of the training set.
-         y_train : np.ndarray
-             Labels of the training set.
-
-         Returns
-         -------
-         Protocol
-             Fitted protocol.
+         for name, value in kwargs.items():
+             setattr(self, name, value)
+             if isinstance(value, list):
+                 self.n_combinations *= len(value)
+             elif isinstance(value, (int, float)):
+                 self.n_combinations *= value
+             else:
+                 raise ValueError(f"Invalid argument {name}={value}: must be int/float or list of int/float.")
+
+
+     def split(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray, np.ndarray]:
          """
-         self.sout("Fitting models")
+         Split the data into samples for evaluation.

-         args = ((model, X_train, y_train) for model in self.models)
-
-         wrapper = tqdm if self.verbose else lambda x, **kwargs: x
-
-         self.models = Parallel(n_jobs=self.n_jobs)( # Parallel processing of models
-             delayed(self._delayed_fit)(*arg) for arg in wrapper(args, desc="Fitting models", total=len(self.models))
-         )
-         self.sout("Fit [Done]")
-         return self
-
-
-     def predict(self, X_test: np.ndarray, y_test: np.ndarray) -> Any:
-         """Predicts the prevalence for the test set.
-
          Parameters
          ----------
-         X_test : np.ndarray
-             Features of the test set.
-         y_test : np.ndarray
-             Labels of the test set.
-
-         Returns
-         -------
-         Any
-             Predictions for the test set. Can be a table or a tuple with the quantifier names, real prevalence, and predicted prevalence.
-         """
-         predictions = self.predict_protocol(X_test, y_test)
-         predictions_df = pd.DataFrame(predictions, columns=self.columns)
-
-         if self.return_type == "table":
-             if self.measures:
-                 smoothed_factor = 1 / (2 * len(X_test))
-
-                 def smooth(values: np.ndarray) -> np.ndarray:
-                     return (values + smoothed_factor) / (smoothed_factor * len(values) + 1)
-
-                 for metric in self.measures:
-                     predictions_df[metric] = predictions_df.apply(
-                         lambda row: get_measure(metric)(
-                             smooth(np.array(row["REAL_PREVS"])),
-                             smooth(np.array(row["PRED_PREVS"]))
-                         ),
-                         axis=1
-                     )
-                 return predictions_df
-
-         return (
-             predictions_df["QUANTIFIER"].to_numpy(), # Quantifier names
-             np.stack(predictions_df["REAL_PREVS"].to_numpy()), # REAL_PREVS
-             np.stack(predictions_df["PRED_PREVS"].to_numpy()) # PRED_PREVS
-         )
+         X : np.ndarray
+             The input features.
+         y : np.ndarray
+             The target labels.

-     @abstractmethod
-     def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> np.ndarray:
-         """Abstract method that every protocol must implement
-
-         Parameters
-         ----------
-         X_test : np.ndarray
-             Features of the test set.
-         y_test : np.ndarray
-             Labels of the test set.
-
-         Returns
-         -------
-         np.ndarray
-             Predictions for the test set. With the same format as the column names attribute.
+         Yields
+         ------
+         Generator[np.ndarray, np.ndarray]
+             A generator that yields the indices for each split.
          """
-         ...
+         indices = np.arange(X.shape[0])
+         for idx in self._split_indices_masks(X, y):
+             indexes = indices[idx]
+             yield indexes

-     @abstractmethod
-     def _new_sample(self) -> Tuple[np.ndarray, np.ndarray]:
-         """Abstract method of sample extraction for each protocol.
+     def _split_indices_masks(self, X: np.ndarray, y: np.ndarray) -> Generator[Tuple[np.ndarray, np.ndarray]]:
+         for idx in self._iter_indices(X, y):

-         Returns:
-             Tuple[np.ndarray, np.ndarray]: Tuple containing X_sample and y_sample.
-         """
-         ...
+             mask = np.zeros(X.shape[0], dtype=bool)
+             mask[idx] = True

-     @staticmethod
-     def _delayed_fit(model, X_train, y_train):
-         """Method to fit the model in parallel.
-
-         Parameters
-         ----------
-         model : Quantifier
-             Quantification model.
-         X_train : np.ndarray
-             Features of the training set.
-         y_train : np.ndarray
-             Labels of the training set.
-
-         Returns
-         -------
-         Quantifier
-             Fitted quantification model
-         """
-         model_name = model.__class__.__name__
-         if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
-             model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
-
-         start = time()
-         model = model.fit(X=X_train, y=y_train)
-         duration = time() - start
-         print(f"\tFitted {model_name} in {duration:.3f} seconds")
-         return model
+             yield mask

-
-
-
+     @abstractmethod
+     def _iter_indices(self, X, y):
+         """Abstract method to be implemented by subclasses to yield indices for each batch."""
+         pass

+     def get_n_combinations(self) -> int:
+         """
+         Get the number of combinations for the current protocol.
+         """
+         return self.n_combinations


105
  class APP(Protocol):
356
- """Artificial Prevalence Protocol.
357
-
358
- This approach splits a test into several samples varying prevalence and sample size,
359
- with n iterations. For a list of Quantifiers, it computes training and testing for
360
- each one and returns either a table of results with error measures or just the predictions.
106
+ """Artificial Prevalence Protocol (APP) for evaluation.
107
+ This protocol generates artificial prevalence distributions for the evaluation in an exhaustive manner, testing all possible combinations of prevalences.
361
108
 
362
109
  Parameters
363
110
  ----------
364
- models : Union[List[Union[str, Quantifier]], str, Quantifier]
365
- List of quantification models, a single model name, or 'all' for all models.
366
- batch_size : Union[List[int], int]
367
- Size of the batches to be processed, or a list of sizes.
368
- learner : BaseEstimator, optional
369
- Machine learning model to be used with the quantifiers. Required for model methods.
370
- n_prevs : int, optional
371
- Number of prevalence points to generate. Default is 100.
372
- n_iterations : int, optional
373
- Number of iterations for the protocol. Default is 1.
374
- n_jobs : int, optional
375
- Number of jobs to run in parallel. Default is 1.
111
+ batch_size : int or list of int
112
+ The size of the batches to be used in the evaluation.
113
+ n_prevalences : int
114
+ The number of artificial prevalences to generate.
115
+ repeats : int, optional
116
+ The number of times to repeat the evaluation with different random seeds.
376
117
  random_state : int, optional
377
- Seed for random number generation. Default is 32.
378
- verbose : bool, optional
379
- Whether to print progress messages. Default is False.
380
- return_type : str, optional
381
- Type of return value ('predictions' or 'table'). Default is 'predictions'.
382
- measures : List[str], optional
383
- List of error measures to calculate. Must be in MEASURES or None. Default is None.
384
-
118
+ The random seed for reproducibility.
119
+
385
120
  Attributes
386
121
  ----------
387
- models : List[Quantifier]
388
- List of quantification models.
389
- batch_size : Union[List[int], int]
390
- Size of the batches to be processed.
391
- learner : BaseEstimator
392
- Machine learning model to be used with the quantifiers.
393
- n_prevs : int
394
- Number of prevalence points to generate.
395
- n_iterations : int
396
- Number of iterations for the protocol.
397
- n_jobs : int
398
- Number of jobs to run in parallel.
122
+ n_prevalences : int
123
+ The number of artificial prevalences to generate.
124
+ repeats : int
125
+ The number of times to repeat the evaluation with different random seeds.
399
126
  random_state : int
400
- Seed for random number generation.
401
- verbose : bool
402
- Whether to print progress messages.
403
- return_type : str
404
- Type of return value ('predictions' or 'table').
405
- measures : List[str]
406
- List of error measures to calculate.
127
+ The random seed for reproducibility.
407
128
 
408
- Raises
409
- ------
410
- AssertionError
411
- If return_type is invalid.
412
-
413
- See Also
414
- --------
415
- Protocol : Base class for evaluation protocols.
416
- NPP : Natural Prevalence Protocol.
417
- Quantifier : Base class for quantification methods.
418
-
129
+ Notes
130
+ -----
131
+ It is important to note that in case of multiclass problems, the time complexity of this protocol can be significantly higher due to the increased number of combinations to evaluate.
132
+
419
133
  Examples
420
134
  --------
421
- >>> from mlquantify.evaluation.protocol import APP
422
- >>> from sklearn.ensemble import RandomForestClassifier
423
- >>> from sklearn.datasets import load_breast_cancer
424
- >>> from sklearn.model_selection import train_test_split
425
- >>>
426
- >>> # Loading dataset from sklearn
427
- >>> features, target = load_breast_cancer(return_X_y=True)
428
- >>>
429
- >>> #Splitting into train and test
430
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
431
- >>>
432
- >>> app = APP(models=["CC", "EMQ", "DyS"],
433
- ... batch_size=[10, 50, 100],
434
- ... learner=RandomForestClassifier(),
435
- ... n_prevs=100, # Default
436
- ... n_jobs=-1,
437
- ... return_type="table",
438
- ... measures=["ae", "se"],
439
- ... verbose=True)
440
- >>>
441
- >>> app.fit(X_train, y_train)
442
- >>>
443
- >>> table = app.predict(X_test, y_test)
444
- >>>
445
- >>> print(table)
446
- """
447
-
448
- def __init__(self,
449
- models: Union[List[Union[str, Quantifier]], str, Quantifier],
450
- batch_size: Union[List[int], int],
451
- learner: BaseEstimator = None,
452
- n_prevs: int = 100,
453
- n_iterations: int = 1,
454
- n_jobs: int = 1,
455
- random_state: int = 32,
456
- verbose: bool = False,
457
- return_type: str = "predictions",
458
- measures: List[str] = None):
459
-
460
- super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
461
- self.n_prevs = n_prevs
462
- self.batch_size = batch_size if isinstance(batch_size, list) else [batch_size]
463
- self.n_prevs = n_prevs
464
- self.n_iterations = n_iterations
135
+ >>> protocol = APP(batch_size=[100, 200], n_prevalences=5, repeats=3, random_state=42)
136
+ >>> for train_idx, test_idx in protocol.split(X, y):
137
+ ... # Train and evaluate model
138
+ ... pass
465
139
 
140
+ """
466
141
 
-     def predict_protocol(self, X_test: np.ndarray, y_test: np.ndarray) -> Tuple:
-         """Generates several samples with artificial prevalences and sizes.
-         For each model, predicts with this sample, aggregating all together
-         with a pandas dataframe if requested, or else just the predictions.
+     def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
+         super().__init__(batch_size=batch_size,
+                          random_state=random_state,
+                          n_prevalences=n_prevalences,
+                          repeats=repeats)

-         Parameters
-         ----------
-         X_test : np.ndarray
-             Features of the test set.
-         y_test : np.ndarray
-             Labels of the test set.
+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:

-         Returns
-         -------
-         Tuple
-             Tuple containing the (iteration, model name, prev, prev_pred, and batch size).
-         """
-
-         n_dim = len(np.unique(y_test))
-         prevs = generate_artificial_prevalences(n_dim, self.n_prevs, self.n_iterations)
-
-         args = [
-             (iteration, X_test, y_test, model, prev, bs, self.verbose)
-             for prev in prevs for bs in self.batch_size for model in self.models for iteration in range(self.n_iterations)
-         ]
+         n_dim = len(np.unique(y))

-         size = len(prevs) * len(self.models) * len(self.batch_size) * self.n_iterations
+         for batch_size in self.batch_size:
+             prevalences = generate_artificial_prevalences(n_dim=n_dim,
+                                                           n_prev=self.n_prevalences,
+                                                           n_iter=self.repeats)
+             for prev in prevalences:
+                 indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                 yield indexes
+

-         predictions = []
-         for arg in tqdm(args, desc="Running APP", total=size):
-             predictions.append(self._predict(*arg))
-
-         return predictions
+

-     def _predict(self, iteration:int, X: np.ndarray, y: np.ndarray, model: Any, prev: List[float], batch_size: int, verbose: bool) -> Tuple:
-         """Method predicts into the new sample for each model and prevalence.
+ class NPP(Protocol):
+     """No Prevalence Protocol (NPP) for evaluation.
+     This protocol just samples the data without any consideration for prevalence, with all instances having equal probability of being selected.

-         Parameters
-         ----------
-         iteration : int
-             Current iteration.
-         X : np.ndarray
-             Features of the test set.
-         y : np.ndarray
-             Labels of the test set.
-         model : Any
-             Quantification model.
-         prev : List[float]
-             Prevalence values for the sample.
-         batch_size : int
-             Batch size for the sample.
-         verbose : bool
-             Whether to print progress messages.
-
-         Returns
-         -------
-         Tuple
-             Tuple containing the iteration, model name, prev, prev_pred, and batch size.
-         """
-         model_name = model.__class__.__name__
-         if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
-             model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
-
-         if verbose:
-             print(f'\t {model_name} with {batch_size} instances and prev {prev}')
-
-         X_sample, _ = self._new_sample(X, y, prev, batch_size)
-         prev_pred = np.asarray(list(model.predict(X_sample).values()))
-
-         if verbose:
-             print(f'\t \\--Ending {model_name} with {batch_size} instances and prev {prev}\n')
-
-         return (iteration+1, model_name, prev, prev_pred, batch_size)
+     Parameters
+     ----------
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
+     random_state : int, optional
+         The random seed for reproducibility.

+     Attributes
+     ----------
+     random_state : int
+         The random seed for reproducibility.

-     def _new_sample(self, X: np.ndarray, y: np.ndarray, prev: List[float], batch_size: int) -> Tuple[np.ndarray, np.ndarray]:
-         """Generates a new sample with a specified prevalence and size.
+     Examples
+     --------
+     >>> protocol = NPP(batch_size=100, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
+     """

-         Parameters
-         ----------
-         X : np.ndarray
-             Features of the test set.
-         y : np.ndarray
-             Labels of the test set.
-         prev : List[float]
-             Prevalence values for the sample.
-         batch_size : int
-             Batch size for the sample.
+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:

-         Returns
-         -------
-         Tuple[np.ndarray, np.ndarray]
-             Tuple containing the new sample features and labels.
-         """
-         sample_index = generate_artificial_indexes(y, prev, batch_size, np.unique(y))
-         return (np.take(X, sample_index, axis=0), np.take(y, sample_index, axis=0))
-
-
-
+         for batch_size in self.batch_size:
+             yield np.random.choice(X.shape[0], batch_size, replace=True)
+

+ class UPP(Protocol):
+     """Uniform Prevalence Protocol (UPP) for evaluation.
+     An extension of the APP that generates artificial prevalence distributions uniformly across all classes utilizing the Kraemer sampling method.

+     Parameters
+     ----------
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
+     n_prevalences : int
+         The number of artificial prevalences to generate.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
+     random_state : int, optional
+         The random seed for reproducibility.

+     Attributes
+     ----------
+     n_prevalences : int
+         The number of artificial prevalences to generate.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
+     random_state : int
+         The random seed for reproducibility.

+     Examples
+     --------
+     >>> protocol = UPP(batch_size=100, n_prevalences=5, repeats=3, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
+     """

+     def __init__(self, batch_size, n_prevalences, repeats=1, random_state=None):
+         super().__init__(batch_size=batch_size,
+                          random_state=random_state,
+                          n_prevalences=n_prevalences,
+                          repeats=repeats)

+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:
+
+         n_dim = len(np.unique(y))
+
+         for batch_size in self.batch_size:
+
+             prevalences = kraemer_sampling(n_dim=n_dim,
+                                            n_prev=self.n_prevalences,
+                                            n_iter=self.repeats)
+
+             for prev in prevalences:
+                 indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                 yield indexes


+ class PPP(Protocol):
+     """Personalized Prevalence Protocol (PPP) for evaluation.
+     This protocol generates artificial prevalence distributions personalized for each class.

- class NPP(Protocol):
-     """Natural Prevalence Protocol.
-
-     This approach splits a test into several samples varying sample size,
-     with n iterations. For a list of Quantifiers, it computes training and testing for
-     each one and returns either a table of results with error measures or just the predictions.
-
      Parameters
      ----------
-     models : Union[List[Union[str, Quantifier]], str, Quantifier]
-         List of quantification models, a single model name, or 'all' for all models.
-     batch_size : Union[List[int], int]
-         Size of the batches to be processed, or a list of sizes.
-     learner : BaseEstimator, optional
-         Machine learning model to be used with the quantifiers. Required for model methods.
-     n_iterations : int, optional
-         Number of iterations for the protocol. Default is 1.
-     n_jobs : int, optional
-         Number of jobs to run in parallel. Default is 1.
+     batch_size : int or list of int
+         The size of the batches to be used in the evaluation.
+     prevalences : list of float
+         The list of artificial prevalences to generate for each class.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
      random_state : int, optional
-         Seed for random number generation. Default is 32.
-     verbose : bool, optional
-         Whether to print progress messages. Default is False.
-     return_type : str, optional
-         Type of return value ('predictions' or 'table'). Default is 'predictions'.
-     measures : List[str], optional
-         List of error measures to calculate. Must be in MEASURES or None. Default is None.
-
+         The random seed for reproducibility.
+
      Attributes
      ----------
-     models : List[Quantifier]
-         List of quantification models.
-     batch_size : Union[List[int], int]
-         Size of the batches to be processed.
-     learner : BaseEstimator
-         Machine learning model to be used with the quantifiers.
-     n_iterations : int
-         Number of iterations for the protocol.
-     n_jobs : int
-         Number of jobs to run in parallel.
+     prevalences : list of float
+         The list of artificial prevalences to generate for each class.
+     repeats : int
+         The number of times to repeat the evaluation with different random seeds.
      random_state : int
-         Seed for random number generation.
-     verbose : bool
-         Whether to print progress messages.
-     return_type : str
-         Type of return value ('predictions' or 'table').
-     measures : List[str]
-         List of error measures to calculate.
+         The random seed for reproducibility.
+
+     Examples
+     --------
+     >>> protocol = PPP(batch_size=100, prevalences=[0.1, 0.9], repeats=3, random_state=42)
+     >>> for batch_idx in protocol.split(X, y):
+     ...     # Evaluate a quantifier on each sampled batch
+     ...     pass
      """

+     def __init__(self, batch_size, prevalences, repeats=1, random_state=None):
+         super().__init__(batch_size=batch_size,
+                          random_state=random_state,
+                          prevalences=prevalences,
+                          repeats=repeats)

-     def __init__(self,
-                  models: Union[List[Union[str, Quantifier]], str, Quantifier],
-                  learner: BaseEstimator = None,
-                  n_jobs: int = 1,
-                  random_state: int = 32,
-                  verbose: bool = False,
-                  return_type: str = "predictions",
-                  measures: List[str] = None):
-
-         super().__init__(models, learner, n_jobs, random_state, verbose, return_type, measures)
-
-
-     def predict_protocol(self, X_test, y_test) -> tuple:
-         raise NotImplementedError
-
-
-     def _new_sample(self, X, y, prev: List[float], batch_size: int) -> tuple:
-         raise NotImplementedError
+     def _iter_indices(self, X: np.ndarray, y: np.ndarray) -> Generator[np.ndarray]:

-
-     def _delayed_predict(self, args) -> tuple:
-         raise NotImplementedError
+         for batch_size in self.batch_size:
+             for prev in self.prevalences:
+                 if isinstance(prev, float):
+                     prev = [1-prev, prev]
+
+                 indexes = get_indexes_with_prevalence(y, prev, batch_size)
+                 yield indexes
+
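To make the refactored protocol API above concrete, here is a minimal, self-contained sketch (an editor's illustration, not code shipped in the package) that subclasses `Protocol` and consumes the batches produced by `split`. The import path and semantics are assumptions taken from the hunk above.

```python
# Hedged sketch of the 0.1.4 protocol API shown in the hunk above; the import
# path and behavior are assumptions based on this diff, not verified docs.
import numpy as np
from mlquantify.evaluation.protocol import Protocol, APP

class UniformBatchProtocol(Protocol):
    """Hypothetical custom protocol: one uniform random batch per batch size."""
    def _iter_indices(self, X, y):
        rng = np.random.default_rng(self.random_state)
        for batch_size in self.batch_size:
            # Uniform sampling with replacement, as NPP does above. Note that
            # split() converts the indices to a boolean mask, so duplicate
            # draws collapse and a yielded batch can come out slightly smaller.
            yield rng.choice(X.shape[0], batch_size, replace=True)

X = np.random.rand(500, 4)
y = np.random.randint(0, 2, size=500)

custom = UniformBatchProtocol(batch_size=[50, 100], random_state=0)
for batch_idx in custom.split(X, y):
    print(len(batch_idx))  # at most 50, then at most 100

app = APP(batch_size=100, n_prevalences=5, repeats=2, random_state=0)
print(app.get_n_combinations())  # 1 batch size * 5 prevalences * 2 repeats = 10
for batch_idx in app.split(X, y):
    X_batch, y_batch = X[batch_idx], y[batch_idx]  # evaluate a quantifier here
```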
@@ -26,12 +26,9 @@ def convert_columns_to_arrays(df, columns:list = ['PRED_PREVS', 'REAL_PREVS']):
      return df


-
-
-
- def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:list):
+ def get_indexes_with_prevalence(y, prevalence: list, sample_size:int):
      """
-     Generate indexes for a stratified sample based on the prevalence of each class.
+     Get indexes for a stratified sample based on the prevalence of each class.

      Parameters
      ----------
@@ -48,10 +45,13 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li
      -------
      list
          List of indexes for the stratified sample.
-     """
+     """
+     classes = np.unique(y)
+
      # Ensure the sum of prevalences is 1
      assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
      # Ensure the number of prevalences matches the number of classes
+     assert len(prevalence) == len(classes), "The number of prevalences must match the number of classes"

      sampled_indexes = []
      total_sampled = 0
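As a quick illustration of the renamed helper above (an editor's sketch; the import path `mlquantify.utils.general` is inferred from this diff), `get_indexes_with_prevalence` draws a stratified sample whose class proportions approximate the requested prevalence vector:

```python
# Editor's illustration of get_indexes_with_prevalence (assumed importable
# from mlquantify.utils.general per this diff); not code from the package.
import numpy as np
from mlquantify.utils.general import get_indexes_with_prevalence

y = np.array([0] * 700 + [1] * 300)
idx = get_indexes_with_prevalence(y, prevalence=[0.2, 0.8], sample_size=100)

sample = y[idx]
print(len(sample), (sample == 1).mean())  # ~100 instances, positive rate near 0.8
```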
@@ -78,6 +78,43 @@ def generate_artificial_indexes(y, prevalence: list, sample_size:int, classes:li



+ def kraemer_sampling(n_dim: int, n_prev: int, n_iter: int = 1) -> np.ndarray:
+     """
+     Uniform sampling from the unit simplex using Kraemer's algorithm.
+
+     Parameters
+     ----------
+     n_dim : int
+         Number of dimensions.
+     n_prev : int
+         Size of the sample.
+     n_iter : int
+         Number of iterations.
+
+     Returns
+     -------
+     np.ndarray
+         Array of sampled prevalences.
+     """
+
+     def _sampling(n_dim: int, n_prev: int) -> np.ndarray:
+         if n_dim == 2:
+             u = np.random.rand(n_prev)
+             return np.vstack([1 - u, u]).T
+         else:
+             u = np.random.rand(n_prev, n_dim - 1)
+             u.sort(axis=-1) # sort each row
+             _0s = np.zeros((n_prev, 1))
+             _1s = np.ones((n_prev, 1))
+             a = np.hstack([_0s, u])
+             b = np.hstack([u, _1s])
+             return b - a
+
+     # repeat n_iter times
+     prevs = _sampling(n_dim, n_prev)
+
+     return np.repeat(prevs, n_iter, axis=0) if n_iter > 1 else prevs
+

  def generate_artificial_prevalences(n_dim: int, n_prev: int, n_iter: int) -> np.ndarray:
      """Generates n artificial prevalences with n dimensions.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mlquantify
- Version: 0.1.3
+ Version: 0.1.4
  Summary: Quantification Library
  Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
  Maintainer: Luiz Fernando Luth Junior
@@ -40,9 +40,9 @@ ___

  ## Latest Release

- - **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
- - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
- - Explore the [API documentation](#) for detailed developer information.
+ - **Version 0.1.3**: Initial beta version. For a detailed list of changes, check the [changelog](#).
+ - In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
+ - Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
  - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)

  ___
@@ -70,7 +70,7 @@ ___
  | **21 Quantification Methods** | Methods for quantification, such as classify & Count Correct methods, Threshold Optimization, Mixture Models and more.|
  | **Dynamic class management** | All methods are dynamic, and handles multiclass and binary problems, in case of binary it makes One-Vs-All (OVA) automatically. |
  | **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
- | **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, BIAS, NAE, SE, KLD, etc.). |
+ | **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
  | **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.).. |
  | **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
  | **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |
@@ -82,7 +82,10 @@ ___
  This code first loads the breast cancer dataset from _sklearn_, which is then split into training and testing sets. It uses the _Expectation Maximisation Quantifier (EMQ)_ with a RandomForest classifier to predict class prevalence. After training the model, it evaluates performance by calculating and printing the absolute error and bias between the real and predicted prevalences.

  ```python
- import mlquantify as mq
+ from mlquantify.methods import EMQ
+ from mlquantify.evaluation.measures import absolute_error, mean_absolute_error
+ from mlquantify.utils import get_real_prev
+
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.datasets import load_breast_cancer
  from sklearn.model_selection import train_test_split
@@ -94,19 +97,19 @@ features, target = load_breast_cancer(return_X_y=True)
  X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)

  #Create the model, here it is the Expectation Maximisation Quantifier (EMQ) with a classifier
- model = mq.methods.EMQ(RandomForestClassifier())
+ model = EMQ(RandomForestClassifier())
  model.fit(X_train, y_train)

  #Predict the class prevalence for X_test
  pred_prevalence = model.predict(X_test)
- real_prevalence = mq.utils.get_real_prev(y_test)
+ real_prevalence = get_real_prev(y_test)

  #Get the error for the prediction
- ae = mq.evaluation.absolute_error(real_prevalence, pred_prevalence)
- bias = mq.evaluation.bias(real_prevalence, pred_prevalence)
+ ae = absolute_error(real_prevalence, pred_prevalence)
+ mae = mean_absolute_error(real_prevalence, pred_prevalence)

- print(f"Mean Squared Error (MSE) -> {ae:.4f}")
- print(f"Bias -> {bias}")
+ print(f"Absolute Error -> {ae}")
+ print(f"Mean Absolute Error -> {mae}")
  ```

  ___
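The refactored protocols pair naturally with the quick example above. The following editor's sketch (not part of the README; it assumes the 0.1.4 API shown earlier in this diff, plus the `model`, `X_test`, and `y_test` variables from the snippet) scores the fitted EMQ model over APP-generated batches:

```python
# Editor's continuation of the README example; assumes the 0.1.4 protocol API
# from this diff and reuses model, X_test, y_test, absolute_error, get_real_prev.
import numpy as np
from mlquantify.evaluation.protocol import APP

protocol = APP(batch_size=100, n_prevalences=11, random_state=42)

errors = []
for batch_idx in protocol.split(X_test, y_test):
    X_batch, y_batch = X_test[batch_idx], y_test[batch_idx]
    pred_prev = model.predict(X_batch)
    errors.append(absolute_error(get_real_prev(y_batch), pred_prev))

print(f"Mean AE over {len(errors)} artificial-prevalence batches -> {np.mean(errors):.4f}")
```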
@@ -125,7 +128,7 @@ ___

  ## Documentation

- ##### API is avaliable [here](#)
+ ##### API is available [here](https://luizfernandolj.github.io/mlquantify/api/index.html)

  - [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
  - [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
@@ -6,7 +6,7 @@ mlquantify/classification/__init__.py,sha256=3FGf-F4SOM3gByUPsWdnBzjyC_31B3Mtzuo
  mlquantify/classification/methods.py,sha256=yDSbpoqM3hfF0a9ATzKqfG9S-44x-0Rq0lkAVJKTIEs,5006
  mlquantify/evaluation/__init__.py,sha256=x1grng0n_QeZpVBU8-pwagYdBMkbMRILtrp1qk_bLvk,447
  mlquantify/evaluation/measures.py,sha256=fIKyxxlD8em3oaj4u_BeXmNyUQG_A0vXWY8APPgNoJ0,6579
- mlquantify/evaluation/protocol.py,sha256=OsOXm_vf7sYlw9pQv08WxAvvgzo10bAqiDM-1cpz7nQ,24020
+ mlquantify/evaluation/protocol.py,sha256=__tzRyqW4cJz4Fl87TInf7dXxIJ6bSaYaSaw-SdkNmM,10365
  mlquantify/methods/__init__.py,sha256=ya3Mn7bcz2r3oaIT7yVR4iJkAfgEAwF4xDK54C0rZ7U,536
  mlquantify/methods/aggregative.py,sha256=F5Z-tGA9OcZgMBLKOeaos6wIgvvnDeriZ4y0TyMpDrc,39051
  mlquantify/methods/meta.py,sha256=sZWQHUGkm6iiqujmIpHDL_8tDdKQ161bzD5mcpXLWEY,19066
@@ -14,9 +14,9 @@ mlquantify/methods/mixture_models.py,sha256=si2Pzaka5Kbva4QKBzLolvb_8V0ZEjp68UBA
  mlquantify/methods/non_aggregative.py,sha256=xaBu21TUtiYkOEUKO16NaNMwdNa6-SNjfBsc5PpIMyI,4815
  mlquantify/methods/threshold_optimization.py,sha256=NYGKbYvtfmiBeU8wpTiFCdURkijcPRZtybPOt6vtXbY,30489
  mlquantify/utils/__init__.py,sha256=logWrL6B6mukP8tvYm_UPEdO9eNA-J-ySILr7-syDoc,44
- mlquantify/utils/general.py,sha256=Li5ix_dy19dUhYNgiUsNHdqqnSVYvznUBUuyr-zYSPI,7554
+ mlquantify/utils/general.py,sha256=wKJSmwF1KfSlSrDm0KTf92FMvB62BBOxf2Se9HyeWYE,8668
  mlquantify/utils/method.py,sha256=RL4vBJGl5_6DZ59Bs62hdNXI_hnoDIWilMMyMPiOjBg,12631
- mlquantify-0.1.3.dist-info/METADATA,sha256=FkF8Qt_lHsa0Lf0sXAQ36Ri5bs5aMkAoNVzubTPty1A,4940
- mlquantify-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- mlquantify-0.1.3.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
- mlquantify-0.1.3.dist-info/RECORD,,
+ mlquantify-0.1.4.dist-info/METADATA,sha256=UtNxYnZnSt6HS0B8JsW5A5tvxlxFUH_GODjF1AXXsSY,5166
+ mlquantify-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mlquantify-0.1.4.dist-info/top_level.txt,sha256=tGEkYkbbFElwULvqENjam3u1uXtyC1J9dRmibsq8_n0,11
+ mlquantify-0.1.4.dist-info/RECORD,,