mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. mlquantify/__init__.py +10 -29
  2. mlquantify/adjust_counting/__init__.py +24 -0
  3. mlquantify/adjust_counting/_adjustment.py +648 -0
  4. mlquantify/adjust_counting/_base.py +245 -0
  5. mlquantify/adjust_counting/_counting.py +153 -0
  6. mlquantify/adjust_counting/_utils.py +109 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +329 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +147 -0
  13. mlquantify/likelihood/_classes.py +430 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +785 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +147 -0
  22. mlquantify/mixture/_classes.py +458 -0
  23. mlquantify/mixture/_utils.py +163 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +168 -0
  31. mlquantify/neighbors/_classes.py +150 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
  33. mlquantify/neighbors/_kde.py +268 -0
  34. mlquantify/neighbors/_utils.py +131 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +64 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.10.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,785 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from copy import deepcopy
4
+ from tqdm import tqdm
5
+ from sklearn.linear_model import LogisticRegression
6
+ from sklearn.model_selection import GridSearchCV, cross_val_predict, train_test_split
7
+
8
+ from mlquantify.base import BaseQuantifier, MetaquantifierMixin
9
+ from mlquantify.metrics._slq import MSE
10
+ from mlquantify.mixture._classes import SORD, DyS
11
+ from mlquantify.mixture._utils import getHist, hellinger
12
+ from mlquantify.utils import Options, Interval
13
+ from mlquantify.utils import _fit_context
14
+ from mlquantify.confidence import (
15
+ construct_confidence_region
16
+ )
17
+ from mlquantify.base_aggregative import (
18
+ _get_learner_function,
19
+ is_aggregative_quantifier,
20
+ get_aggregation_requirements,
21
+ uses_soft_predictions
22
+ )
23
+ from mlquantify.utils._sampling import (
24
+ bootstrap_sample_indices
25
+ )
26
+ from mlquantify.model_selection import APP, NPP, UPP
27
+ from mlquantify.utils._validation import validate_data, validate_prevalences
28
+ from mlquantify.utils.prevalence import get_prev_from_labels
29
+
30
+
31
+
32
def get_protocol_sampler(protocol_name, batch_size, n_prevalences, min_prev, max_prev, n_classes):
    r"""Return a prevalence sampling protocol based on the specified protocol name.

    Parameters
    ----------
    protocol_name : {'artificial', 'natural', 'uniform', 'kraemer'}
        The name of the protocol. 'artificial' maps to APP, 'natural' to NPP,
        and 'uniform'/'kraemer' to UPP with the corresponding sampling algorithm.
    batch_size : int
        The size of each batch.
    n_prevalences : int
        The number of prevalences to sample.
    min_prev : float
        The minimum prevalence value (ignored by the 'natural' protocol).
    max_prev : float
        The maximum prevalence value (ignored by the 'natural' protocol).
    n_classes : int
        The number of classes. Currently unused; kept for interface compatibility.

    Returns
    -------
    protocol : APP, NPP or UPP instance
        A protocol object that generates prevalence-controlled samples
        (via its ``split`` method).

    Raises
    ------
    ValueError
        If ``protocol_name`` is not one of the supported protocols.
    """
    if protocol_name == 'artificial':
        protocol = APP(batch_size=batch_size,
                       n_prevalences=n_prevalences,
                       min_prev=min_prev,
                       max_prev=max_prev)
    elif protocol_name == 'natural':
        # NPP samples at the natural class distribution; prevalence bounds do not apply.
        protocol = NPP(batch_size=batch_size,
                       n_samples=n_prevalences)
    elif protocol_name in ('uniform', 'kraemer'):
        # Both map to UPP; they differ only in the sampling algorithm,
        # whose name coincides with the protocol name.
        protocol = UPP(batch_size=batch_size,
                       n_prevalences=n_prevalences,
                       algorithm=protocol_name,
                       min_prev=min_prev,
                       max_prev=max_prev)
    else:
        raise ValueError(f"Unknown protocol: {protocol_name}")
    return protocol
81
+
82
class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
    r"""Ensemble-based Quantifier combining multiple models trained on varied data samples
    with controlled prevalence distributions to improve robustness and accuracy.

    This quantifier constructs an ensemble of quantification models using batches of training
    data sampled according to an evaluation protocol (e.g. 'artificial', 'natural', 'uniform', 'kraemer')
    with specified prevalence constraints. Diverse models are trained on these subsamples,
    and their prevalence estimates aggregated using various selection metrics and aggregation methods.

    Parameters
    ----------
    quantifier : BaseQuantifier
        The quantifier model class to be used for ensemble members.
    size : int, default=50
        Number of ensemble members (sub-models) to train.
    min_prop, max_prop : float, default=(0.1, 1.0)
        Minimum and maximum class prevalence proportions for generating training batches.
    selection_metric : {'all', 'ptr', 'ds'}, default='all'
        Metric used to select or weight ensemble members during aggregation:
        - 'all': uses all models equally,
        - 'ptr': selects models with prevalences closest to initial test prevalence estimates,
        - 'ds': selects models with score distributions similar to test data.
    p_metric : float, default=0.25
        Proportion of ensemble members to select according to the selection metric.
    protocol : {'artificial', 'natural', 'uniform', 'kraemer'}, default='uniform'
        Sampling protocol used to generate training data for ensemble models.
    return_type : {'mean', 'median'}, default='mean'
        Aggregation method for ensemble predictions.
    max_sample_size : int or None, optional
        Maximum number of samples per training batch; defaults to dataset size if None.
    max_trials : int, default=100
        Maximum number of trials for sampling.
        NOTE(review): not referenced anywhere in this class body — confirm whether
        it is consumed by the validation machinery or is dead configuration.
    n_jobs : int, default=1
        Number of parallel jobs for posterior estimation (used by the 'ds' metric).
    verbose : bool, default=False
        Enable verbose output.

    Attributes
    ----------
    models : list
        List of fitted quantifier ensemble members.
    train_prevalences : list
        List of training prevalences corresponding to ensemble members.
    train_distributions : list
        List of historical training posterior histograms (used when selection_metric='ds').
    posteriors_generator : callable or list
        Function to generate posterior probabilities for new samples. Initialized
        to an empty list in ``fit`` and replaced by a callable only when
        ``selection_metric='ds'``.

    Notes
    -----
    - Ensemble diversity is controlled by sampling prevalences from the specified protocol.
    - The 'ds' selection metric requires probabilistic quantifiers and computes distribution similarity.
    - Uses sklearn's LogisticRegression and GridSearchCV internally for posterior computation within 'ds'.

    Examples
    --------
    >>> from mlquantify.meta import EnsembleQ
    >>> from mlquantify.mixture import DyS
    >>> from sklearn.ensemble import RandomForestClassifier
    >>>
    >>> ensemble = EnsembleQ(
    ...     quantifier=DyS(RandomForestClassifier()),
    ...     size=30,
    ...     protocol='artificial',  # APP protocol
    ...     selection_metric='ptr'
    ... )
    >>> ensemble.fit(X_train, y_train)
    >>> prevalence_estimates = ensemble.predict(X_test)

    References
    ----------
    .. [1] Pérez-Gállego, P., Castaño, A., Ramón Quevedo, J., & José del Coz, J. (2019). Dynamic ensemble selection for quantification tasks. Information Fusion, 45, 1-15. https://doi.org/10.1016/j.inffus.2018.01.001

    .. [2] Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. Information Fusion, 34, 87-100. https://doi.org/10.1016/j.inffus.2016.07.001

    """

    # Declarative parameter validation consumed by the project's _fit_context machinery.
    _parameter_constraints = {
        "quantifier": [BaseQuantifier],
        "size": [Interval(left=1, right=None, discrete=True)],
        "min_prop": [Interval(left=0.0, right=1.0, inclusive_left=True, inclusive_right=True)],
        "max_prop": [Interval(left=0.0, right=1.0, inclusive_left=True, inclusive_right=True)],
        "selection_metric": [Options(['all', 'ptr', 'ds'])],
        "p_metric": [Interval(left=0.0, right=1.0, inclusive_left=True, inclusive_right=True)],
        "protocol": [Options(['artificial', 'natural', 'uniform', 'kraemer'])],
        "return_type": [Options(['mean', 'median'])],
        "max_sample_size": [Options([Interval(left=1, right=None, discrete=True), None])],
        "max_trials": [Interval(left=1, right=None, discrete=True)],
        "n_jobs": [Interval(left=1, right=None, discrete=True)],
        "verbose": [bool],
    }

    def __init__(self,
                 quantifier,
                 size=50,
                 min_prop=0.1,
                 max_prop=1,
                 selection_metric='all',
                 protocol="uniform",
                 p_metric=0.25,
                 return_type="mean",
                 max_sample_size=None,
                 max_trials=100,
                 n_jobs=1,
                 verbose=False):
        # Parameters are stored verbatim (sklearn convention); validation happens in fit.
        self.quantifier = quantifier
        self.size = size
        self.min_prop = min_prop
        self.max_prop = max_prop
        self.p_metric = p_metric
        self.protocol = protocol
        self.selection_metric = selection_metric
        self.return_type = return_type
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.max_sample_size = max_sample_size
        self.max_trials = max_trials

    def sout(self, msg):
        """Prints a message if verbose is True."""
        if self.verbose:
            print('[Ensemble]' + msg)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """ Fits the ensemble model to the given training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        self : Ensemble
            The fitted ensemble model.

        Raises
        ------
        ValueError
            If ``selection_metric='ds'`` and the dataset is not binary.
        """
        self.sout('Fit')

        self.models = []
        self.train_prevalences = []
        self.train_distributions = []
        self.posteriors_generator = []

        self.classes = np.unique(y)
        X, y = validate_data(self, X, y)

        # The 'ds' metric compares score histograms, which is only defined for binary problems.
        if self.selection_metric == 'ds' and not len(self.classes) == 2:
            raise ValueError(f'ds selection_metric is only defined for binary quantification, but this dataset is not binary')
        # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
        # min_pos positive examples)
        sample_size = len(y) if self.max_sample_size is None else min(self.max_sample_size, len(y))

        protocol = get_protocol_sampler(
            protocol_name=self.protocol,
            batch_size=sample_size,
            n_prevalences=self.size,
            min_prev=self.min_prop,
            max_prev=self.max_prop,
            n_classes=len(self.classes)
        )

        posteriors = None
        if self.selection_metric == 'ds':
            # precompute the training posterior probabilities
            posteriors, self.posteriors_generator = self.ds_get_posteriors(X, y)

        # One ensemble member per protocol-generated index batch.
        for idx in protocol.split(X, y):
            X_batch, y_batch = X[idx], y[idx]
            model = deepcopy(self.quantifier)

            model.fit(X_batch, y_batch)
            tr_prev = get_prev_from_labels(y_batch)

            if self.selection_metric == 'ds':
                # 8-bin histogram of the batch's posteriors, used later for similarity selection.
                self.train_distributions.append(getHist(posteriors[idx], 8))

            self.train_prevalences.append(tr_prev)
            self.models.append(model)

        self.sout('Fit [Done]')
        return self

    def predict(self, X):
        """ Predicts the class prevalences for the given test data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        prevalences : array-like
            The predicted class prevalences, aggregated over the (possibly
            selected) ensemble members and validated by ``validate_prevalences``.
        """
        self.sout('Predict')

        test_prevalences = []

        # Collect one prevalence estimate per ensemble member.
        for model in tqdm(self.models, disable=not self.verbose):
            pred = np.asarray(list(model.predict(X).values()))
            test_prevalences.append(pred)

        test_prevalences = np.asarray(test_prevalences)

        # Optionally keep only a subset of member estimates before aggregation.
        if self.selection_metric == 'ptr':
            test_prevalences = self.ptr_selection_metric(test_prevalences, self.train_prevalences)
        elif self.selection_metric == 'ds':
            test_prevalences = self.ds_selection_metric(X,
                                                        test_prevalences,
                                                        self.train_distributions,
                                                        self.posteriors_generator)

        if self.return_type == "median":
            prevalences = np.median(test_prevalences, axis=0)
        else:
            prevalences = np.mean(test_prevalences, axis=0)


        self.sout('Predict [Done]')
        prevalences = validate_prevalences(self, prevalences, self.classes)
        return prevalences


    def ptr_selection_metric(self, prevalences, train_prevalences):
        r"""
        Selects the prevalence estimates from models trained on samples whose prevalence is most similar
        to an initial approximation of the test prevalence as estimated by all models in the ensemble.

        Parameters
        ----------
        prevalences : numpy.ndarray
            An array of prevalence estimates provided by each model in the ensemble.
        train_prevalences : list of array-like
            Training prevalences of each ensemble member, compared (via MSE)
            against the mean of all test estimates.

        Returns
        -------
        numpy.ndarray
            The selected prevalence estimates after applying the PTR selection metric.
        """
        test_prev_estim = prevalences.mean(axis=0)
        ptr_differences = [MSE(test_prev_estim, ptr_i) for ptr_i in train_prevalences]
        order = np.argsort(ptr_differences)
        # NOTE(review): p_metric is a fractional proportion (default 0.25); verify that
        # _select_k converts it into an element count before slicing `order`.
        return _select_k(prevalences, order, k=self.p_metric)

    def ds_get_posteriors(self, X, y):
        r"""
        Generate posterior probabilities using cross-validated logistic regression.
        This method computes posterior probabilities for the training data via cross-validation,
        using a logistic regression classifier with hyperparameters optimized through grid search.
        It also returns a function to generate posterior probabilities for new data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The feature matrix representing the training data.
        y : array-like of shape (n_samples,)
            The target vector representing class labels for the training data.

        Returns
        -------
        posteriors : ndarray of shape (n_samples, n_classes)
            Posterior probabilities for the training data obtained through cross-validation.
        posteriors_generator : callable
            A function that computes posterior probabilities for new input data.

        Notes
        -----
        - In scenarios where the quantifier is not based on a probabilistic classifier, it's necessary
          to train a separate probabilistic model to obtain posterior probabilities.
        - Using cross-validation ensures that the posterior probabilities for the training data are unbiased,
          as each data point is evaluated by a model not trained on that point.
        - Hyperparameters for the logistic regression classifier are optimized using a grid search with
          cross-validation to improve the model's performance.
        """
        lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)

        # Tune C over a log grid, refitting the best model on the full data.
        optim = GridSearchCV(
            lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
        ).fit(X, y)

        # Out-of-fold posteriors: each point is scored by a model not trained on it.
        posteriors = cross_val_predict(
            optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
        )
        posteriors_generator = optim.best_estimator_.predict_proba

        return posteriors, posteriors_generator


    def ds_selection_metric(self, X, prevalences, train_distributions, posteriors_generator):
        r"""
        Selects the prevalence estimates from models trained on samples whose distribution of posterior
        probabilities is most similar to the distribution of posterior probabilities for the test data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The feature matrix representing the test data.
        prevalences : numpy.ndarray
            An array of prevalence estimates provided by each model in the ensemble.
        train_distributions : list
            Per-member 8-bin histograms of training posteriors (built in ``fit``).
        posteriors_generator : callable
            Function producing posterior probabilities for new data.

        Returns
        -------
        numpy.ndarray
            The selected prevalence estimates after applying the DS selection metric.
        """
        test_posteriors = posteriors_generator(X)
        test_distribution = getHist(test_posteriors, 8)
        # Rank members by Hellinger distance between training and test histograms.
        dist = [hellinger(tr_dist_i, test_distribution) for tr_dist_i in train_distributions]
        order = np.argsort(dist)
        # NOTE(review): p_metric is a fractional proportion; see ptr_selection_metric.
        return _select_k(prevalences, order, k=self.p_metric)
397
+
398
+ def _select_k(elements, order, k):
399
+ r"""
400
+ Selects the k elements from the list of elements based on the order.
401
+ If the list is empty, it returns the original list.
402
+
403
+ Parameters
404
+ ----------
405
+ elements : array-like
406
+ The array of elements to be selected from.
407
+ order : array-like
408
+ The order of the elements.
409
+ k : int
410
+ The number of elements to be selected.
411
+
412
+ Returns
413
+ -------
414
+ array-like
415
+ The selected elements.
416
+ """
417
+ elements_k = [elements[idx] for idx in order[:k]]
418
+ if elements_k:
419
+ return elements_k
420
+ print(f"Unable to take {k} for elements with size {len(elements)}")
421
+ return elements
422
+
423
+
424
+
425
+
426
+
427
class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
    r"""
    Aggregative Bootstrap Quantifier to compute prevalence confidence regions.

    This metaquantifier applies bootstrapping to both training and test data predictions
    to generate multiple bootstrap prevalence estimates. These bootstrapped estimates
    are used to construct confidence intervals or elliptical confidence regions for
    prevalence predictions, improving uncertainty quantification.

    Parameters
    ----------
    quantifier : BaseQuantifier
        The base quantifier model, which must be aggregative.
    n_train_bootstraps : int, default=1
        Number of bootstrap samples to generate from training predictions.
    n_test_bootstraps : int, default=1
        Number of bootstrap samples to generate from test predictions.
    random_state : int or None, optional
        Random seed for reproducibility.
    region_type : {'intervals', 'ellipse', 'ellipse-clr'}, default='intervals'
        Type of confidence region to construct.
    confidence_level : float between 0 and 1, default=0.95
        Confidence level for intervals or regions.


    Examples
    --------
    >>> from mlquantify.meta import AggregativeBootstrap
    >>> from mlquantify.neighbors import EMQ
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> agg_boot = AggregativeBootstrap(
    ...     quantifier=EMQ(RandomForestClassifier()),
    ...     n_train_bootstraps=100,
    ...     n_test_bootstraps=100
    ... )
    >>> agg_boot.fit(X_train, y_train)
    >>> prevalence = agg_boot.predict(X_test)
    """

    # Declarative parameter validation consumed by the project's validation machinery.
    _parameter_constraints = {
        "quantifier": [BaseQuantifier],
        "n_train_bootstraps": [Interval(left=1, right=None, discrete=True)],
        "n_test_bootstraps": [Interval(left=1, right=None, discrete=True)],
        "random_state": [Options([None, int])],
        "region_type": [Options(['intervals', 'ellipse', 'ellipse-clr'])],
        "confidence_level": [Interval(left=0.0, right=1.0)],
    }

    def __init__(self,
                 quantifier,
                 n_train_bootstraps=1,
                 n_test_bootstraps=1,
                 random_state=None,
                 region_type='intervals',
                 confidence_level=0.95):
        self.quantifier = quantifier
        self.n_train_bootstraps = n_train_bootstraps
        self.n_test_bootstraps = n_test_bootstraps
        self.random_state = random_state
        self.region_type = region_type
        self.confidence_level = confidence_level

    def fit(self, X, y, val_split=None):
        r""" Fits the aggregative bootstrap model to the given training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.
        y : array-like of shape (n_samples,)
            The target values.
        val_split : float or None, default=None
            If None, the learner is fit and evaluated on the full training
            set. Otherwise, the fraction of the data held out so training
            predictions come from data unseen by the learner.

        Returns
        -------
        self : AggregativeBootstrap
            The fitted aggregative bootstrap model.

        Raises
        ------
        ValueError
            If the provided quantifier is not an aggregative quantifier.
        """
        X, y = validate_data(self, X, y)
        self.classes = np.unique(y)

        if not is_aggregative_quantifier(self.quantifier):
            raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} is not an aggregative quantifier.")
        self.quantifier_learner = deepcopy(self.quantifier)

        # Name of the learner method producing predictions ('predict_proba', 'predict', ...).
        learner_function = _get_learner_function(self.quantifier_learner)
        model = self.quantifier_learner.learner

        if val_split is None:
            model.fit(X, y)
            train_y_values = y
            train_predictions = getattr(model, learner_function)(X)
        else:
            # BUG FIX: sklearn's train_test_split returns (X_train, X_test, y_train, y_test);
            # the previous unpacking order swapped labels and features.
            X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=val_split, random_state=self.random_state)
            model.fit(X_fit, y_fit)
            train_y_values = y_val
            train_predictions = getattr(model, learner_function)(X_val)
        self.train_predictions = train_predictions
        self.train_y_values = train_y_values

        return self

    def predict(self, X):
        r""" Predicts the class prevalences for the given test data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        prevalences : array-like
            The point estimate of the class prevalences, computed from the
            bootstrap confidence region (see ``aggregate``).
        """
        X = validate_data(self, X, None)
        learner_function = _get_learner_function(self.quantifier_learner)
        model = self.quantifier_learner.learner

        predictions = getattr(model, learner_function)(X)

        return self.aggregate(predictions, self.train_predictions, self.train_y_values)


    def aggregate(self, predictions, train_predictions, train_y_values):
        r""" Aggregates the predictions using bootstrap resampling.

        Parameters
        ----------
        predictions : array-like of shape (n_samples, n_classes)
            Learner outputs for the test data.
        train_predictions : array-like of shape (n_samples, n_classes)
            The training predictions.
        train_y_values : array-like of shape (n_samples,)
            The training target values.

        Returns
        -------
        prevalence : array-like
            The point estimate extracted from the confidence region built
            over all train x test bootstrap prevalence estimates.
        """
        prevalences = []

        self.classes = np.unique(train_y_values)

        # Loop-invariant: the base quantifier's aggregation signature does not change
        # across bootstrap iterations, so compute it once.
        requirements = get_aggregation_requirements(self.quantifier)

        for train_idx in bootstrap_sample_indices(
            n_samples=len(train_predictions),
            n_bootstraps=self.n_train_bootstraps,
            batch_size=len(train_predictions),
            random_state=self.random_state
        ):
            train_pred_boot = train_predictions[train_idx]
            train_y_boot = train_y_values[train_idx]

            # NOTE(review): the same random_state is reused here, so the test
            # bootstrap indices are identical across train bootstraps — confirm
            # this is intended.
            for test_idx in bootstrap_sample_indices(
                n_samples=len(predictions),
                n_bootstraps=self.n_test_bootstraps,
                batch_size=len(predictions),
                random_state=self.random_state
            ):
                test_pred_boot = predictions[test_idx]

                # Dispatch on what the base quantifier's aggregate() needs.
                if requirements.requires_train_proba and requirements.requires_train_labels:
                    prevalences_boot = self.quantifier.aggregate(test_pred_boot, train_pred_boot, train_y_boot)
                elif requirements.requires_train_labels:
                    prevalences_boot = self.quantifier.aggregate(test_pred_boot, train_y_boot)
                else:
                    prevalences_boot = self.quantifier.aggregate(test_pred_boot)

                prevalences_boot = np.asarray(list(prevalences_boot.values()))
                prevalences.append(prevalences_boot)

        prevalences = np.asarray(prevalences)
        confidence_region = construct_confidence_region(
            prev_estims=prevalences,
            method=self.region_type,
            confidence_level=self.confidence_level,
        )

        prevalence = confidence_region.get_point_estimate()

        prevalence = validate_prevalences(self, prevalence, self.classes)

        return prevalence
617
+
618
+
619
+
620
+
621
class QuaDapt(MetaquantifierMixin, BaseQuantifier):
    r"""QuaDapt Metaquantifier: Adaptive quantification using synthetic scores.

    This metaquantifier improves prevalence estimation by merging training samples
    with different score distributions using a merging factor :math:`m`. It evaluates
    candidate merging factors, chooses the best by minimizing a distribution distance
    metric (Hellinger, Topsoe, ProbSymm, or SORD), and aggregates quantification accordingly.

    Parameters
    ----------
    quantifier : BaseQuantifier
        The base quantifier model to adapt. Must be a soft (probabilistic)
        quantifier whose aggregation consumes training probabilities.
    measure : {'hellinger', 'topsoe', 'probsymm', 'sord'}, default='topsoe'
        The distribution distance metric used to select the best merging factor.
    merging_factors : array-like
        Candidate merging factor values to evaluate.
        NOTE(review): the default ``(0.1, 1.0, 0.2)`` reads like an arange spec
        (start, stop, step) but is consumed as a literal candidate list — confirm.

    Notes
    -----
    QuaDapt is defined for binary quantification only: it reads the positive-class
    column of the posteriors and MoSS generates binary scores.

    Examples
    --------
    >>> from mlquantify.meta import QuaDapt
    >>> from mlquantify.adjust_counting import ACC
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> quadapt_acc = QuaDapt(
    ...     quantifier=ACC(RandomForestClassifier()),
    ...     merging_factors=[0.1, 0.5, 1.0],
    ...     measure='sord'
    ... )
    >>> quadapt_acc.fit(X_train, y_train)
    >>> prevalence = quadapt_acc.predict(X_test)


    """

    # Declarative parameter validation; the previous version also declared a
    # 'random_state' constraint for a parameter __init__ never accepted.
    _parameter_constraints = {
        "quantifier": [BaseQuantifier],
        "merging_factors": "array-like",
        "measure": [Options(["hellinger", "topsoe", "probsymm", "sord"])],
    }

    def __init__(self,
                 quantifier,
                 measure="topsoe",
                 merging_factors=(0.1, 1.0, 0.2)):
        self.quantifier = quantifier
        self.measure = measure
        self.merging_factors = merging_factors


    def fit(self, X, y):
        """Fit the underlying learner and validate that the base quantifier is usable.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.
        y : array-like of shape (n_samples,)
            The target values (must contain exactly two classes).

        Returns
        -------
        self : QuaDapt

        Raises
        ------
        ValueError
            If the base quantifier is not probabilistic, does not consume
            training probabilities, or the data is not binary.
        """
        X, y = validate_data(self, X, y)
        self.classes = np.unique(y)

        # QuaDapt manipulates score distributions, so only binary problems are supported
        # (aggregate reads column 1 and MoSS simulates binary scores).
        if len(self.classes) != 2:
            raise ValueError("QuaDapt is only defined for binary quantification, but this dataset is not binary")

        if not uses_soft_predictions(self.quantifier):
            raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} is not a soft (probabilistic) quantifier.")

        requirements = get_aggregation_requirements(self.quantifier)
        if not requirements.requires_train_proba:
            raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} does not use training probabilities, which are required for QuaDapt.")

        self.quantifier.learner.fit(X, y)
        self.train_y_values = y

        return self

    def predict(self, X):
        """Predict class prevalences for the test data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        prevalences : dict
            Mapping from class label to estimated prevalence.
        """
        X = validate_data(self, X, None)

        model = self.quantifier.learner

        predictions = model.predict_proba(X)

        return self.aggregate(predictions, self.train_y_values)


    def aggregate(self, predictions, train_y_values):
        """Aggregate test posteriors using synthetic MoSS scores at the best merging factor.

        Parameters
        ----------
        predictions : ndarray of shape (n_samples, 2)
            Posterior probabilities for the test data.
        train_y_values : array-like of shape (n_samples,)
            Training labels; used only to recover the class labels when
            ``fit`` did not set them.

        Returns
        -------
        prevalences : dict
            Mapping from class label to estimated prevalence.
        """
        # Positive-class scores drive the merging-factor search (binary only).
        pos_predictions = predictions[:, 1]
        m = self._get_best_merging_factor(pos_predictions)

        self.classes = self.classes if hasattr(self, 'classes') else np.unique(train_y_values)

        # Synthetic balanced training set (alpha=0.5) at the selected merging factor.
        moss = QuaDapt.MoSS(1000, 0.5, m)

        moss_scores = moss[:, :2]
        moss_labels = moss[:, 2]

        prevalences = self.quantifier.aggregate(predictions,
                                                moss_scores,
                                                moss_labels)

        # Re-key the result with the real class labels (MoSS labels are 0/1).
        prevalences = {self.classes[i]: v for i, v in enumerate(prevalences.values())}
        return prevalences


    def _get_best_merging_factor(self, predictions):
        """Return the candidate merging factor whose MoSS scores best match the test scores."""
        # Candidates are rounded to 2 decimals and treated as a flat array.
        MF = np.atleast_1d(np.round(self.merging_factors, 2)).astype(float)

        distances = []

        for mf in MF:
            scores = QuaDapt.MoSS(1000, 0.5, mf)
            pos_scores = scores[scores[:, 2] == 1][:, :2]
            neg_scores = scores[scores[:, 2] == 0][:, :2]

            best_distance = self._get_best_distance(predictions, pos_scores, neg_scores)

            distances.append(best_distance)

        best_m = MF[np.argmin(distances)]
        return best_m

    def _get_best_distance(self, predictions, pos_scores, neg_scores):
        """Compute the best mixture distance between test scores and synthetic class scores."""
        if self.measure in ["hellinger", "topsoe", "probsymm"]:
            method = DyS(measure=self.measure)
        elif self.measure == "sord":
            method = SORD()
        else:
            # Parameter constraints should prevent this; fail loudly rather than NameError.
            raise ValueError(f"Unknown measure: {self.measure}")

        best_distance = method.get_best_distance(predictions, pos_scores, neg_scores)
        return best_distance


    @classmethod
    def MoSS(cls, n, alpha, m):
        r"""Model for Score Simulation

        MoSS has three key parameters:
        (I) the number of observations `n`;
        (II) the class proportion :math:`\alpha`, which defines the prevalence of the positive class;
        (III) the merging factor :math:`m`, which controls the overlap between positive and negative score distributions
        (where :math:`m=0` represents easily separable classes and :math:`m=1` represents highly overlapping ones).

        .. math::

            \mathrm{moss}(n, \alpha, \mathfrak{m}) = \mathrm{syn}(\oplus, \lfloor \alpha n \rfloor, \mathfrak{m}) \cup \mathrm{syn}(\ominus , \lfloor (1 - \alpha) n \rfloor, \mathfrak{m})

        Returns
        -------
        scores : ndarray of shape (n, 3)
            Columns 0 and 1 hold the simulated scores, column 2 the binary label.

        Notes
        -----
        The MoSS generates only binary scores, simulating positive and negative class scores.

        NOTE(review): columns 0 and 1 are identical copies of the same score
        vector; for predict_proba-like input one would expect column 0 to be
        ``1 - score`` — confirm against the original MoSS definition.

        This uses the global NumPy random state; results are not seeded.

        Examples
        --------
        >>> scores = QuaDapt.MoSS(n=1000, alpha=0.3, m=0.5)
        >>> print(scores.shape)
        (1000, 3)

        References
        ----------
        .. [1] Maletzke, A., Reis, D. dos, Hassan, W., & Batista, G. (2021).
            Accurately Quantifying under Score Variability. 2021 IEEE International Conference on Data Mining (ICDM), 1228-1233. https://doi.org/10.1109/ICDM51629.2021.00149
        """
        # Positive scores are skewed toward 1 as m -> 0; negatives mirror them toward 0.
        p_score = np.random.uniform(size=int(n * alpha)) ** m
        n_score = 1 - (np.random.uniform(size=int(round(n * (1 - alpha), 0))) ** m)
        scores = np.column_stack(
            (np.concatenate((p_score, n_score)),
             np.concatenate((p_score, n_score)),
             np.concatenate((
                 np.ones(len(p_score)),
                 np.full(len(n_score), 0))))
        )
        return scores
785
+