mlquantify 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -291
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.7.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
mlquantify/meta/_classes.py
@@ -0,0 +1,761 @@
+ import numpy as np
+ import pandas as pd
+ from copy import deepcopy
+ from tqdm import tqdm
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import GridSearchCV, cross_val_predict, train_test_split
+
+ from mlquantify.base import BaseQuantifier, MetaquantifierMixin
+ from mlquantify.metrics._slq import MSE
+ from mlquantify.mixture._classes import SORD, DyS
+ from mlquantify.mixture._utils import getHist, hellinger
+ from mlquantify.utils import Options, Interval
+ from mlquantify.utils import _fit_context
+ from mlquantify.confidence import (
+     construct_confidence_region
+ )
+ from mlquantify.base_aggregative import (
+     _get_learner_function,
+     is_aggregative_quantifier,
+     get_aggregation_requirements,
+     uses_soft_predictions
+ )
+ from mlquantify.utils._sampling import (
+     bootstrap_sample_indices
+ )
+ from mlquantify.model_selection import APP, NPP, UPP
+ from mlquantify.utils._validation import validate_data, validate_prevalences
+ from mlquantify.utils.prevalence import get_prev_from_labels
+
+
+
+ def get_protocol_sampler(protocol_name, batch_size, n_prevalences, min_prev, max_prev, n_classes):
+     """ Returns a prevalence sampler function based on the specified protocol name.
+
+     Parameters
+     ----------
+     protocol_name : str
+         The name of the protocol ('app', 'npp', 'upp', 'upp-k').
+     batch_size : int
+         The size of each batch.
+     n_prevalences : int
+         The number of prevalences to sample.
+     min_prev : float
+         The minimum prevalence value.
+     max_prev : float
+         The maximum prevalence value.
+     n_classes : int
+         The number of classes.
+
+     Returns
+     -------
+     callable
+         A function that generates prevalence samples according to the specified protocol.
+     """
+
+     if protocol_name == 'artificial':
+         protocol = APP(batch_size=batch_size,
+                        n_prevalences=n_prevalences,
+                        min_prev=min_prev,
+                        max_prev=max_prev)
+
+     elif protocol_name == 'natural':
+         protocol = NPP(batch_size=batch_size,
+                        n_samples=n_prevalences)
+
+     elif protocol_name == 'uniform':
+         protocol = UPP(batch_size=batch_size,
+                        n_prevalences=n_prevalences,
+                        algorithm='uniform',
+                        min_prev=min_prev,
+                        max_prev=max_prev)
+     elif protocol_name == 'kraemer':
+         protocol = UPP(batch_size=batch_size,
+                        n_prevalences=n_prevalences,
+                        algorithm='kraemer',
+                        min_prev=min_prev,
+                        max_prev=max_prev)
+     else:
+         raise ValueError(f"Unknown protocol: {protocol_name}")
+     return protocol
+
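For context, a minimal sketch of how a sampler built by get_protocol_sampler is consumed (mirroring the protocol.split(X, y) loop used by EnsembleQ.fit further down; the import path and the synthetic data are assumptions for illustration only):

    import numpy as np
    from mlquantify.meta._classes import get_protocol_sampler  # assumed module path

    # Illustrative data: any (X, y) pair with two classes works here.
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = rng.integers(0, 2, size=200)

    # Build a Kraemer-style uniform sampler producing 10 batches of 100 samples each.
    protocol = get_protocol_sampler('kraemer', batch_size=100, n_prevalences=10,
                                    min_prev=0.1, max_prev=0.9, n_classes=2)

    # As in EnsembleQ.fit, the protocol yields index arrays, one per sampled batch.
    for idx in protocol.split(X, y):
        X_batch, y_batch = X[idx], y[idx]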
+ class EnsembleQ(MetaquantifierMixin, BaseQuantifier):
+     """
+     Ensemble-based Quantifier combining multiple models trained on varied data samples
+     with controlled prevalence distributions to improve robustness and accuracy.
+
+     This quantifier constructs an ensemble of quantification models using batches of training
+     data sampled according to an evaluation protocol (e.g. 'artificial', 'natural', 'uniform', 'kraemer')
+     with specified prevalence constraints. Diverse models are trained on these subsamples,
+     and their prevalence estimates aggregated using various selection metrics and aggregation methods.
+
+     Parameters
+     ----------
+     quantifier : BaseQuantifier
+         The quantifier model class to be used for ensemble members.
+     size : int, default=50
+         Number of ensemble members (sub-models) to train.
+     min_prop, max_prop : float, default=(0.1, 1.0)
+         Minimum and maximum class prevalence proportions for generating training batches.
+     selection_metric : {'all', 'ptr', 'ds'}, default='all'
+         Metric used to select or weight ensemble members during aggregation:
+         - 'all': uses all models equally,
+         - 'ptr': selects models with prevalences closest to initial test prevalence estimates,
+         - 'ds': selects models with score distributions similar to test data.
+     p_metric : float, default=0.25
+         Proportion of ensemble members to select according to the selection metric.
+     protocol : {'artificial', 'natural', 'uniform', 'kraemer'}, default='uniform'
+         Sampling protocol used to generate training data for ensemble models.
+     return_type : {'mean', 'median'}, default='mean'
+         Aggregation method for ensemble predictions.
+     max_sample_size : int or None, optional
+         Maximum number of samples per training batch; defaults to dataset size if None.
+     max_trials : int, default=100
+         Maximum number of trials for sampling.
+     n_jobs : int, default=1
+         Number of parallel jobs for training ensemble members.
+     verbose : bool, default=False
+         Enable verbose output.
+
+     Attributes
+     ----------
+     models : list
+         List of fitted quantifier ensemble members.
+     train_prevalences : list
+         List of training prevalences corresponding to ensemble members.
+     train_distributions : list
+         List of historical training posterior histograms (used when selection_metric='ds').
+     posteriors_generator : callable or None
+         Function to generate posterior probabilities for new samples.
+
+     Methods
+     -------
+     fit(X, y)
+         Fits all ensemble member quantifiers on sampled training batches.
+     predict(X)
+         Aggregates ensemble member predictions into final prevalence estimates.
+     ptr_selection_metric(prevalences, train_prevalences)
+         Implements PTR-based selection metric on prevalence estimates.
+     ds_get_posteriors(X, y)
+         Computes posterior probabilities for training data with cross-validated logistic regression.
+     ds_selection_metric(X, prevalences, train_distributions, posteriors_generator)
+         Implements DS-based selection metric comparing posterior distributions.
+
+     Notes
+     -----
+     - Ensemble diversity is controlled by sampling prevalences from the specified protocol.
+     - The 'ds' selection metric requires probabilistic quantifiers and computes distribution similarity.
+     - Uses sklearn's LogisticRegression and GridSearchCV internally for posterior computation within 'ds'.
+
+     Examples
+     --------
+     >>> ensemble = EnsembleQ(quantifier=SomeQuantifier(), size=30, protocol='kraemer', selection_metric='ptr')
+     >>> ensemble.fit(X_train, y_train)
+     >>> prevalence_estimates = ensemble.predict(X_test)
+     """
+
+     _parameter_constraints = {
+         "quantifier": [BaseQuantifier],
+         "size": [Interval(left=1, right=None, discrete=True)],
+         "min_prop": [Interval(left=0.0, right=1.0, inclusive_left=True, inclusive_right=True)],
+         "max_prop": [Interval(left=0.0, right=1.0, inclusive_left=True, inclusive_right=True)],
+         "selection_metric": [Options(['all', 'ptr', 'ds'])],
+         "p_metric": [Interval(left=0.0, right=1.0, inclusive_left=True, inclusive_right=True)],
+         "protocol": [Options(['artificial', 'natural', 'uniform', 'kraemer'])],
+         "return_type": [Options(['mean', 'median'])],
+         "max_sample_size": [Options([Interval(left=1, right=None, discrete=True), None])],
+         "max_trials": [Interval(left=1, right=None, discrete=True)],
+         "n_jobs": [Interval(left=1, right=None, discrete=True)],
+         "verbose": [bool],
+     }
+
+     def __init__(self,
+                  quantifier,
+                  size=50,
+                  min_prop=0.1,
+                  max_prop=1,
+                  selection_metric='all',
+                  protocol="uniform",
+                  p_metric=0.25,
+                  return_type="mean",
+                  max_sample_size=None,
+                  max_trials=100,
+                  n_jobs=1,
+                  verbose=False):
+
+         self.quantifier = quantifier
+         self.size = size
+         self.min_prop = min_prop
+         self.max_prop = max_prop
+         self.p_metric = p_metric
+         self.protocol = protocol
+         self.selection_metric = selection_metric
+         self.return_type = return_type
+         self.n_jobs = n_jobs
+         self.verbose = verbose
+         self.max_sample_size = max_sample_size
+         self.max_trials = max_trials
+
+     def sout(self, msg):
+         """Prints a message if verbose is True."""
+         if self.verbose:
+             print('[Ensemble]' + msg)
+
+     @_fit_context(prefer_skip_nested_validation=True)
+     def fit(self, X, y):
+         """ Fits the ensemble model to the given training data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             The input data.
+         y : array-like of shape (n_samples,)
+             The target values.
+
+         Returns
+         -------
+         self : Ensemble
+             The fitted ensemble model.
+         """
+         self.sout('Fit')
+
+         self.models = []
+         self.train_prevalences = []
+         self.train_distributions = []
+         self.posteriors_generator = []
+
+         self.classes = np.unique(y)
+         X, y = validate_data(self, X, y)
+
+         if self.selection_metric == 'ds' and not len(self.classes) == 2:
+             raise ValueError(f'ds selection_metric is only defined for binary quantification, but this dataset is not binary')
+         # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
+         # min_pos positive examples)
+         sample_size = len(y) if self.max_sample_size is None else min(self.max_sample_size, len(y))
+
+         protocol = get_protocol_sampler(
+             protocol_name=self.protocol,
+             batch_size=sample_size,
+             n_prevalences=self.size,
+             min_prev=self.min_prop,
+             max_prev=self.max_prop,
+             n_classes=len(self.classes)
+         )
+
+         posteriors = None
+         if self.selection_metric == 'ds':
+             # precompute the training posterior probabilities
+             posteriors, self.posteriors_generator = self.ds_get_posteriors(X, y)
+
+         for idx in protocol.split(X, y):
+             X_batch, y_batch = X[idx], y[idx]
+             model = deepcopy(self.quantifier)
+
+             model.fit(X_batch, y_batch)
+             tr_prev = get_prev_from_labels(y_batch)
+
+             if self.selection_metric == 'ds':
+                 self.train_distributions.append(getHist(posteriors[idx], 8))
+
+             self.train_prevalences.append(tr_prev)
+             self.models.append(model)
+
+         self.sout('Fit [Done]')
+         return self
+
+     def predict(self, X):
+         """ Predicts the class prevalences for the given test data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             The input data.
+
+         Returns
+         -------
+         prevalences : array-like of shape (n_samples, n_classes)
+             The predicted class prevalences.
+         """
+         self.sout('Predict')
+
+         test_prevalences = []
+
+         for model in tqdm(self.models, disable=not self.verbose):
+             pred = np.asarray(list(model.predict(X).values()))
+             test_prevalences.append(pred)
+
+         test_prevalences = np.asarray(test_prevalences)
+
+         if self.selection_metric == 'ptr':
+             test_prevalences = self.ptr_selection_metric(test_prevalences, self.train_prevalences)
+         elif self.selection_metric == 'ds':
+             test_prevalences = self.ds_selection_metric(X,
+                                                         test_prevalences,
+                                                         self.train_distributions,
+                                                         self.posteriors_generator)
+
+         if self.return_type == "median":
+             prevalences = np.median(test_prevalences, axis=0)
+         else:
+             prevalences = np.mean(test_prevalences, axis=0)
+
+
+         self.sout('Predict [Done]')
+         prevalences = validate_prevalences(self, prevalences, self.classes)
+         return prevalences
+
+
+     def ptr_selection_metric(self, prevalences, train_prevalences):
+         """
+         Selects the prevalence estimates from models trained on samples whose prevalence is most similar
+         to an initial approximation of the test prevalence as estimated by all models in the ensemble.
+
+         Parameters
+         ----------
+         prevalences : numpy.ndarray
+             An array of prevalence estimates provided by each model in the ensemble.
+
+         Returns
+         -------
+         numpy.ndarray
+             The selected prevalence estimates after applying the PTR selection metric.
+         """
+         test_prev_estim = prevalences.mean(axis=0)
+         ptr_differences = [MSE(test_prev_estim, ptr_i) for ptr_i in train_prevalences]
+         order = np.argsort(ptr_differences)
+         return _select_k(prevalences, order, k=self.p_metric)
+
+     def ds_get_posteriors(self, X, y):
+         """
+         Generate posterior probabilities using cross-validated logistic regression.
+         This method computes posterior probabilities for the training data via cross-validation,
+         using a logistic regression classifier with hyperparameters optimized through grid search.
+         It also returns a function to generate posterior probabilities for new data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             The feature matrix representing the training data.
+         y : array-like of shape (n_samples,)
+             The target vector representing class labels for the training data.
+
+         Returns
+         -------
+         posteriors : ndarray of shape (n_samples, n_classes)
+             Posterior probabilities for the training data obtained through cross-validation.
+         posteriors_generator : callable
+             A function that computes posterior probabilities for new input data.
+
+         Notes
+         -----
+         - In scenarios where the quantifier is not based on a probabilistic classifier, it's necessary
+           to train a separate probabilistic model to obtain posterior probabilities.
+         - Using cross-validation ensures that the posterior probabilities for the training data are unbiased,
+           as each data point is evaluated by a model not trained on that point.
+         - Hyperparameters for the logistic regression classifier are optimized using a grid search with
+           cross-validation to improve the model's performance.
+         """
+         lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
+
+         optim = GridSearchCV(
+             lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
+         ).fit(X, y)
+
+         posteriors = cross_val_predict(
+             optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
+         )
+         posteriors_generator = optim.best_estimator_.predict_proba
+
+         return posteriors, posteriors_generator
+
+
+     def ds_selection_metric(self, X, prevalences, train_distributions, posteriors_generator):
+         """
+         Selects the prevalence estimates from models trained on samples whose distribution of posterior
+         probabilities is most similar to the distribution of posterior probabilities for the test data.
+
+         Parameters
+         ----------
+         prevalences : numpy.ndarray
+             An array of prevalence estimates provided by each model in the ensemble.
+         test : array-like of shape (n_samples, n_features)
+             The feature matrix representing the test data.
+
+         Returns
+         -------
+         numpy.ndarray
+             The selected prevalence estimates after applying the DS selection metric.
+         """
+         test_posteriors = posteriors_generator(X)
+         test_distribution = getHist(test_posteriors, 8)
+         dist = [hellinger(tr_dist_i, test_distribution) for tr_dist_i in train_distributions]
+         order = np.argsort(dist)
+         return _select_k(prevalences, order, k=self.p_metric)
+
+ def _select_k(elements, order, k):
+     """
+     Selects the k elements from the list of elements based on the order.
+     If the list is empty, it returns the original list.
+
+     Parameters
+     ----------
+     elements : array-like
+         The array of elements to be selected from.
+     order : array-like
+         The order of the elements.
+     k : int
+         The number of elements to be selected.
+
+     Returns
+     -------
+     array-like
+         The selected elements.
+     """
+     elements_k = [elements[idx] for idx in order[:k]]
+     if elements_k:
+         return elements_k
+     print(f"Unable to take {k} for elements with size {len(elements)}")
+     return elements
+
+
+
+
+
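The PTR selection above boils down to: average the members' test estimates to get a rough test prevalence, rank members by how close their training prevalence is to that average, and keep the top-ranked ones. A self-contained NumPy sketch of that ranking (a plain mean squared error stands in for mlquantify's MSE, and an explicit member count stands in for p_metric):

    import numpy as np

    prevalences = np.array([[0.30, 0.70], [0.55, 0.45], [0.48, 0.52]])        # per-member test estimates
    train_prevalences = np.array([[0.50, 0.50], [0.20, 0.80], [0.45, 0.55]])  # per-member training prevalences

    test_prev_estim = prevalences.mean(axis=0)                           # rough test prevalence
    errors = ((train_prevalences - test_prev_estim) ** 2).mean(axis=1)   # distance of each member's training prevalence
    order = np.argsort(errors)                                           # best-matching members first
    selected = prevalences[order[:2]]                                    # keep the top-ranked members' estimates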
+ class AggregativeBootstrap(MetaquantifierMixin, BaseQuantifier):
+     """
+     Aggregative Bootstrap Quantifier to compute prevalence confidence regions.
+
+     This metaquantifier applies bootstrapping to both training and test data predictions
+     to generate multiple bootstrap prevalence estimates. These bootstrapped estimates
+     are used to construct confidence intervals or elliptical confidence regions for
+     prevalence predictions, improving uncertainty quantification.
+
+     Parameters
+     ----------
+     quantifier : BaseQuantifier
+         The base quantifier model, which must be aggregative.
+     n_train_bootstraps : int, default=1
+         Number of bootstrap samples to generate from training predictions.
+     n_test_bootstraps : int, default=1
+         Number of bootstrap samples to generate from test predictions.
+     random_state : int or None, optional
+         Random seed for reproducibility.
+     region_type : {'intervals', 'ellipse', 'ellipse-clr'}, default='intervals'
+         Type of confidence region to construct.
+     confidence_level : float between 0 and 1, default=0.95
+         Confidence level for intervals or regions.
+
+     Methods
+     -------
+     fit(X, y, val_split=None)
+         Fits base quantifier and generates training predictions (optionally splitting data).
+     predict(X)
+         Returns prevalence estimates and confidence regions aggregated from bootstrap samples.
+     aggregate(predictions, train_predictions, train_y_values)
+         Performs bootstrap resampling aggregation to obtain prevalence confidence regions.
+
+     Examples
+     --------
+     >>> agg_boot = AggregativeBootstrap(quantifier=SomeQuantifier, n_train_bootstraps=100, n_test_bootstraps=100)
+     >>> agg_boot.fit(X_train, y_train)
+     >>> prevalence, conf_region = agg_boot.predict(X_test)
+     """
+
+     _parameter_constraints = {
+         "quantifier": [BaseQuantifier],
+         "n_train_bootstraps": [Interval(left=1, right=None, discrete=True)],
+         "n_test_bootstraps": [Interval(left=1, right=None, discrete=True)],
+         "random_state": [Options([None, int])],
+         "region_type": [Options(['intervals', 'ellipse', 'ellipse-clr'])],
+         "confidence_level": [Interval(left=0.0, right=1.0)],
+     }
+
+     def __init__(self,
+                  quantifier,
+                  n_train_bootstraps=1,
+                  n_test_bootstraps=1,
+                  random_state=None,
+                  region_type='intervals',
+                  confidence_level=0.95):
+         self.quantifier = quantifier
+         self.n_train_bootstraps = n_train_bootstraps
+         self.n_test_bootstraps = n_test_bootstraps
+         self.random_state = random_state
+         self.region_type = region_type
+         self.confidence_level = confidence_level
+
+     def fit(self, X, y, val_split=None):
+         """ Fits the aggregative bootstrap model to the given training data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             The input data.
+         y : array-like of shape (n_samples,)
+             The target values.
+
+         Returns
+         -------
+         self : AggregativeBootstrap
+             The fitted aggregative bootstrap model.
+         """
+         X, y = validate_data(self, X, y)
+         self.classes = np.unique(y)
+
+         if not is_aggregative_quantifier(self.quantifier):
+             raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} is not an aggregative quantifier.")
+         self.quantifier_learner = deepcopy(self.quantifier)
+
+         learner_function = _get_learner_function(self.quantifier_learner)
+         model = self.quantifier_learner.learner
+
+         if val_split is None:
+             model.fit(X, y)
+             train_y_values = y
+             train_predictions = getattr(model, learner_function)(X)
+         else:
+             X_fit, y_fit, X_val, y_val = train_test_split(X, y, test_size=val_split, random_state=self.random_state)
+             model.fit(X_fit, y_fit)
+             train_y_values = y_val
+             train_predictions = getattr(model, learner_function)(X_val)
+         self.train_predictions = train_predictions
+         self.train_y_values = train_y_values
+
+         return self
+
+     def predict(self, X):
+         """ Predicts the class prevalences for the given test data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             The input data.
+
+         Returns
+         -------
+         prevalences : array-like of shape (n_samples, n_classes)
+             The predicted class prevalences.
+         """
+         X = validate_data(self, X, None)
+         learner_function = _get_learner_function(self.quantifier_learner)
+         model = self.quantifier_learner.learner
+
+         predictions = getattr(model, learner_function)(X)
+
+         return self.aggregate(predictions, self.train_predictions, self.train_y_values)
+
+
+     def aggregate(self, predictions, train_predictions, train_y_values):
+         """ Aggregates the predictions using bootstrap resampling.
+
+         Parameters
+         ----------
+         predictions : array-like of shape (n_samples, n_classes)
+             The input data.
+         train_predictions : array-like of shape (n_samples, n_classes)
+             The training predictions.
+         train_y_values : array-like of shape (n_samples,)
+             The training target values.
+
+         Returns
+         -------
+         prevalences : array-like of shape (n_samples, n_classes)
+             The predicted class prevalences.
+         """
+         prevalences = []
+
+         self.classes = np.unique(train_y_values)
+
+         for train_idx in bootstrap_sample_indices(
+             n_samples=len(train_predictions),
+             n_bootstraps=self.n_train_bootstraps,
+             batch_size=len(train_predictions),
+             random_state=self.random_state
+         ):
+             train_pred_boot = train_predictions[train_idx]
+             train_y_boot = train_y_values[train_idx]
+
+             for test_idx in bootstrap_sample_indices(
+                 n_samples=len(predictions),
+                 n_bootstraps=self.n_test_bootstraps,
+                 batch_size=len(predictions),
+                 random_state=self.random_state
+             ):
+                 test_pred_boot = predictions[test_idx]
+
+                 requirements = get_aggregation_requirements(self.quantifier)
+
+                 if requirements.requires_train_proba and requirements.requires_train_labels:
+                     prevalences_boot = self.quantifier.aggregate(test_pred_boot, train_pred_boot, train_y_boot)
+                 elif requirements.requires_train_labels:
+                     prevalences_boot = self.quantifier.aggregate(test_pred_boot, train_y_boot)
+                 else:
+                     prevalences_boot = self.quantifier.aggregate(test_pred_boot)
+
+                 prevalences_boot = np.asarray(list(prevalences_boot.values()))
+                 prevalences.append(prevalences_boot)
+
+         prevalences = np.asarray(prevalences)
+         confidence_region = construct_confidence_region(
+             prev_estims=prevalences,
+             method=self.region_type,
+             confidence_level=self.confidence_level,
+         )
+
+         prevalence = confidence_region.get_point_estimate()
+
+         prevalence = validate_prevalences(self, prevalence, self.classes)
+
+         return prevalence
+
+
+
+
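AggregativeBootstrap.aggregate collects one prevalence estimate per (train bootstrap, test bootstrap) pair and then summarizes that cloud of estimates as a confidence region. A rough standalone sketch of what an 'intervals'-style summary of such a cloud looks like (illustrative only; construct_confidence_region in mlquantify.confidence may compute its regions differently):

    import numpy as np

    # Suppose 1000 bootstrap prevalence estimates for a binary problem.
    rng = np.random.default_rng(0)
    boot_prevs = rng.dirichlet([30, 70], size=1000)

    point = boot_prevs.mean(axis=0)                  # point estimate over all bootstrap estimates
    lower = np.percentile(boot_prevs, 2.5, axis=0)   # per-class 95% percentile interval
    upper = np.percentile(boot_prevs, 97.5, axis=0)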
+ class QuaDapt(MetaquantifierMixin, BaseQuantifier):
+     r"""QuaDapt Metaquantifier: Adaptive quantification using score merging and distance measures.
+
+     This metaquantifier improves prevalence estimation by merging training samples
+     with different score distributions using a merging factor \( m \). It evaluates
+     candidate merging factors, chooses the best by minimizing a distribution distance
+     metric (Hellinger, Topsoe, ProbSymm, or SORD), and aggregates quantification accordingly.
+
+     Parameters
+     ----------
+     quantifier : BaseQuantifier
+         The base quantifier model to adapt.
+     measure : {'hellinger', 'topsoe', 'probsymm', 'sord'}, default='topsoe'
+         The distribution distance metric used to select the best merging factor.
+     merging_factor : array-like
+         Candidate merging factor values to evaluate.
+
+     Methods
+     -------
+     fit(X, y)
+         Fits the base learner on training data.
+     predict(X)
+         Predicts prevalence aggregating via the best merging factor.
+     aggregate(predictions, train_y_values)
+         Performs adaptation and aggregation based on merged score distributions.
+     _get_best_merging_factor(predictions)
+         Evaluates merging factors and selects the best based on minimum distance.
+     _get_best_distance(predictions, pos_scores, neg_scores)
+         Computes the distance metric between predicted and class score distributions.
+
+     Class Methods
+     -------------
+     MoSS(n, alpha, m)
+         Generates merged score samples modeling class conditional distributions
+         parameterized by mixing proportion alpha and merging factor m.
+
+     Examples
+     --------
+     >>> quadapt = QuaDapt(quantifier=SomeQuantifier, merging_factor=[0.1, 0.5, 1.0], measure='sord')
+     >>> quadapt.fit(X_train, y_train)
+     >>> prevalence = quadapt.predict(X_test)
+     """
+
+     _parameter_constraints = {
+         "quantifier": [BaseQuantifier],
+         "merging_factor": "array-like",
+         "measure": [Options(["hellinger", "topsoe", "probsymm", "sord"])],
+         "random_state": [Options([None, int])],
+     }
+
+     def __init__(self,
+                  quantifier,
+                  measure="topsoe",
+                  merging_factor=(0.1, 1.0, 0.2)):
+         self.quantifier = quantifier
+         self.measure = measure
+         self.merging_factor = merging_factor
+
+
+     def fit(self, X, y):
+         X, y = validate_data(self, X, y)
+         self.classes = np.unique(y)
+
+         if not uses_soft_predictions(self.quantifier):
+             raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} is not a soft (probabilistic) quantifier.")
+
+         requirements = get_aggregation_requirements(self.quantifier)
+         if not requirements.requires_train_proba:
+             raise ValueError(f"The quantifier {self.quantifier.__class__.__name__} does not use training probabilities, which are required for QuaDapt.")
+
+         self.quantifier.learner.fit(X, y)
+         self.train_y_values = y
+
+         return self
+
+     def predict(self, X):
+
+         X = validate_data(self, X, None)
+
+         model = self.quantifier.learner
+
+         predictions = getattr(model, "predict_proba")(X)
+
+         return self.aggregate(predictions, self.train_y_values)
+
+
+     def aggregate(self, predictions, train_y_values):
+
+         pos_predictions = predictions[:, 1]
+         m = self._get_best_merging_factor(pos_predictions)
+
+         self.classes = self.classes if hasattr(self, 'classes') else np.unique(train_y_values)
+
+         moss = QuaDapt.MoSS(1000, 0.5, m)
+
+         moss_scores = moss[:, :2]
+         moss_labels = moss[:, 2]
+
+         prevalences = self.quantifier.aggregate(predictions,
+                                                 moss_scores,
+                                                 moss_labels)
+
+         prevalences = {self.classes[i]: v for i, v in enumerate(prevalences.values())}
+         return prevalences
+
+
+     def _get_best_merging_factor(self, predictions):
+
+         MF = np.atleast_1d(np.round(self.merging_factor, 2)).astype(float)
+
+         distances = []
+
+         for mf in MF:
+             scores = QuaDapt.MoSS(1000, 0.5, mf)
+             pos_scores = scores[scores[:, 2] == 1][:, :2]
+             neg_scores = scores[scores[:, 2] == 0][:, :2]
+
+             best_distance = self._get_best_distance(predictions, pos_scores, neg_scores)
+
+             distances.append(best_distance)
+
+         best_m = MF[np.argmin(distances)]
+         return best_m
+
+     def _get_best_distance(self, predictions, pos_scores, neg_scores):
+
+         if self.measure in ["hellinger", "topsoe", "probsymm"]:
+             method = DyS(measure=self.measure)
+         elif self.measure == "sord":
+             method = SORD()
+
+         best_distance = method.get_best_distance(predictions, pos_scores, neg_scores)
+         return best_distance
+
+
+     @classmethod
+     def MoSS(cls, n, alpha, m):
+         p_score = np.random.uniform(size=int(n * alpha)) ** m
+         n_score = 1 - (np.random.uniform(size=int(round(n * (1 - alpha), 0))) ** m)
+         scores = np.column_stack(
+             (np.concatenate((p_score, n_score)),
+              np.concatenate((p_score, n_score)),
+              np.concatenate((
+                  np.ones(len(p_score)),
+                  np.full(len(n_score), 0))))
+         )
+         return scores
+
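In MoSS above, positive scores are drawn as U**m and negative scores as 1 - U**m for U uniform on [0, 1], so the merging factor m controls class overlap: small m pushes positive scores toward 1 and negative scores toward 0, while m = 1 gives fully overlapping uniform scores. A quick standalone check of that behavior (using numpy's Generator instead of the module-level np.random calls shown in the diff):

    import numpy as np

    rng = np.random.default_rng(0)
    for m in (0.1, 0.5, 1.0):
        pos = rng.uniform(size=10_000) ** m        # positive-class scores
        neg = 1 - rng.uniform(size=10_000) ** m    # negative-class scores
        # Small m: pos.mean() near 1 and neg.mean() near 0 (well separated);
        # m = 1: both means near 0.5 (full overlap).
        print(m, round(pos.mean(), 3), round(neg.mean(), 3))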