mlquantify 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. mlquantify/__init__.py +0 -29
  2. mlquantify/adjust_counting/__init__.py +14 -0
  3. mlquantify/adjust_counting/_adjustment.py +365 -0
  4. mlquantify/adjust_counting/_base.py +247 -0
  5. mlquantify/adjust_counting/_counting.py +145 -0
  6. mlquantify/adjust_counting/_utils.py +114 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +335 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +161 -0
  13. mlquantify/likelihood/_classes.py +414 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +761 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +153 -0
  22. mlquantify/mixture/_classes.py +400 -0
  23. mlquantify/mixture/_utils.py +112 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +198 -0
  31. mlquantify/neighbors/_classes.py +159 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
  33. mlquantify/neighbors/_kde.py +270 -0
  34. mlquantify/neighbors/_utils.py +135 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +61 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.9.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
@@ -1,1003 +0,0 @@
- from abc import abstractmethod
- import numpy as np
- from sklearn.base import BaseEstimator
-
- from ..base import AggregativeQuantifier
-
- from ..utils.general import get_real_prev
- from ..utils.method import *
- import mlquantify as mq
-
-
-
-
- class MixtureModel(AggregativeQuantifier):
-     """Mixtures of Score Distributions
-
-     MixtureModel is a generic class for methods based on mixture models.
-     The main idea is that the cumulative distribution of scores assigned
-     to data points in the test set is a mixture of the score distributions
-     from the training set (positive and negative classes).
-
-     Parameters
-     ----------
-     learner : BaseEstimator
-         A scikit-learn compatible classifier that supports `predict_proba`.
-
-     Attributes
-     ----------
-     learner : BaseEstimator
-         A scikit-learn compatible classifier that provides predictive probabilities.
-     pos_scores : np.ndarray
-         Score distribution for the positive class in the training data.
-     neg_scores : np.ndarray
-         Score distribution for the negative class in the training data.
-
-     Notes
-     -----
-     All methods that inherit from MixtureModel are binary quantifiers. For multiclass problems, a One-vs-All strategy is applied.
-
-     Examples
-     --------
-     >>> from mlquantify.methods.mixture_models import MixtureModel
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from mlquantify.utils.method import getHist
-     >>> from sklearn.model_selection import train_test_split
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> import numpy as np
-     >>>
-     >>> class MyMixtureModel(MixtureModel):
-     ...     def __init__(self, learner, param):
-     ...         super().__init__(learner)
-     ...         self.param = param
-     ...     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-     ...         hist_pos = getHist(self.pos_scores, self.param)
-     ...         hist_neg = getHist(self.neg_scores, self.param)
-     ...         hist_test = getHist(test_scores, self.param)
-     ...         mixture = hist_test * (hist_pos + hist_neg)
-     ...         return np.sum(mixture)
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-     >>>
-     >>> mm = MyMixtureModel(RandomForestClassifier(), 10)
-     >>> mm.fit(X_train, y_train)
-     >>> prevalence = mm.predict(X_test)
-     >>> prevalence
-     {0: 0.3622419419517543, 1: 0.6377580580482457}
-     >>> get_real_prev(y_test)
-     {0: 0.37719298245614036, 1: 0.6228070175438597}
-     """
-
-     def __init__(self, learner: BaseEstimator=None):
-         self.learner = learner
-         self.pos_scores = None
-         self.neg_scores = None
-
-     @property
-     def is_multiclass(self) -> bool:
-         """
-         Indicates whether the model supports multiclass classification.
-
-         Returns
-         -------
-         bool
-             Always returns False, as MixtureModel supports only binary classification.
-         """
-         return False
-
-     @property
-     def is_probabilistic(self) -> bool:
-         return True
-
-     def _fit_method(self, X, y):
-         """
-         Fits the positive and negative score distributions using cross-validation.
-
-         Parameters
-         ----------
-         X : np.ndarray
-             Training feature matrix.
-         y : np.ndarray
-             Training labels.
-
-         Returns
-         -------
-         self : MixtureModel
-             The fitted MixtureModel instance.
-         """
-         if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
-             y_labels = mq.arguments["y_labels"]
-             probabilities = mq.arguments["posteriors_train"]
-         else:
-             y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
-
-         # Separate positive and negative scores based on labels
-         self.pos_scores = probabilities[y_labels == self.classes[1]][:, 1]
-         self.neg_scores = probabilities[y_labels == self.classes[0]][:, 1]
-
-         return self
-
-     def _predict_method(self, X) -> dict:
-         """
-         Predicts class prevalences for the test data.
-
-         Parameters
-         ----------
-         X : np.ndarray
-             Test feature matrix.
-
-         Returns
-         -------
-         np.ndarray
-             An array containing the prevalence for each class.
-         """
-         # Get the predicted probabilities for the positive class
-         test_scores = self.predict_learner(X)[:, 1]
-
-         # Compute the prevalence using the mixture model
-         prevalence = np.clip(self._compute_prevalence(test_scores), 0, 1)
-
-         # Return the prevalence as a distribution over the classes
-         return np.asarray([1 - prevalence, prevalence])
-
-     @abstractmethod
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         """
-         Abstract method to compute prevalence using the test scores.
-         Subclasses must implement this method.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Probabilities for the positive class in the test set.
-
-         Returns
-         -------
-         float
-             The computed prevalence for the positive class.
-         """
-         pass
-
-     def get_distance(self, dist_train, dist_test, measure: str) -> float:
-         """
-         Computes the distance between training and test distributions using a specified metric.
-
-         Parameters
-         ----------
-         dist_train : np.ndarray
-             Distribution of scores for the training data.
-         dist_test : np.ndarray
-             Distribution of scores for the test data.
-         measure : str
-             The metric to use for distance calculation. Supported values are
-             'topsoe', 'probsymm', 'hellinger', and 'euclidean'.
-
-         Returns
-         -------
-         float
-             The computed distance between the two distributions.
-
-         Raises
-         ------
-         ValueError
-             If the input distributions have mismatched sizes or are zero vectors.
-         """
-         # Validate input distributions
-         if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
-             raise ValueError("One or both vectors are zero (empty)...")
-         if len(dist_train) != len(dist_test):
-             raise ValueError("Arrays need to be of equal size...")
-
-         # Avoid division by zero by replacing small values
-         dist_train = np.maximum(dist_train, 1e-20)
-         dist_test = np.maximum(dist_test, 1e-20)
-
-         # Compute the distance based on the selected metric
-         if measure == 'topsoe':
-             return topsoe(dist_train, dist_test)
-         elif measure == 'probsymm':
-             return probsymm(dist_train, dist_test)
-         elif measure == 'hellinger':
-             return hellinger(dist_train, dist_test)
-         elif measure == 'euclidean':
-             return sqEuclidean(dist_train, dist_test)
-         else:
-             return 100  # Default value for unknown metrics
-
-
-
-
-
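Editor's note: the MixtureModel subclasses below all share one idea, namely expressing the test score histogram as a convex combination of the training positive and negative score histograms and searching for the mixing weight that fits best. The following is a minimal numpy sketch of that idea, not mlquantify code; it assumes scores lie in [0, 1], and the names histogram_density and estimate_prevalence are hypothetical.

import numpy as np

def histogram_density(scores, bins):
    # Normalized histogram of classifier scores over [0, 1]
    counts, _ = np.histogram(scores, bins=int(bins), range=(0, 1))
    return counts / max(counts.sum(), 1)

def estimate_prevalence(pos_scores, neg_scores, test_scores, bins=10):
    pos_d = histogram_density(pos_scores, bins)
    neg_d = histogram_density(neg_scores, bins)
    test_d = histogram_density(test_scores, bins)
    alphas = np.linspace(0, 1, 101)
    # Pick the mixture weight whose combined training density is closest
    # (here: squared L2 distance) to the test density
    dists = [np.sum((a * pos_d + (1 - a) * neg_d - test_d) ** 2) for a in alphas]
    return alphas[int(np.argmin(dists))]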
- class DyS(MixtureModel):
-     """
-     Distribution y-Similarity (DyS) framework.
-
-     DyS is a method that generalizes the HDy approach by
-     considering the dissimilarity function DS as a parameter
-     of the model.
-
-     Parameters
-     ----------
-     learner : BaseEstimator
-         A probabilistic classifier implementing the `predict_proba` method.
-     measure : str, optional
-         The metric used to compare distributions. Options are:
-         - "hellinger"
-         - "topsoe"
-         - "probsymm"
-         Default is "topsoe".
-     bins_size : np.ndarray, optional
-         Array of bin sizes for histogram computation.
-         Default is np.append(np.linspace(2, 20, 10), 30).
-
-     Attributes
-     ----------
-     bins_size : np.ndarray
-         Bin sizes used for histogram calculations.
-     measure : str
-         Selected distance metric.
-     prevs : np.ndarray
-         Array of prevalences that minimize the distances.
-
-     References
-     ----------
-     VAN HASSELT, H.; GUEZ, A.; SILVER, D. Proceedings of the AAAI conference on artificial intelligence. 2016. Available at https://ojs.aaai.org/index.php/AAAI/article/view/4376
-
-     Examples
-     --------
-     >>> from mlquantify.methods.mixture_models import DyS
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-     >>>
-     >>> dys = DyS(RandomForestClassifier())
-     >>> dys.fit(X_train, y_train)
-     >>> prevalence = dys.predict(X_test)
-     >>> prevalence
-     {0: 0.3736714619191387, 1: 0.6263285380808613}
-     >>> get_real_prev(y_test)
-     {0: 0.37719298245614036, 1: 0.6228070175438597}
-     """
-
-     def __init__(self, learner: BaseEstimator=None, measure: str = "topsoe", bins_size: np.ndarray = None):
-         assert measure in ["hellinger", "topsoe", "probsymm"], "Invalid measure."
-         super().__init__(learner)
-
-         # Set up bins_size
-         if bins_size is None:
-             bins_size = np.append(np.linspace(2, 20, 10), 30)
-         if isinstance(bins_size, list):
-             bins_size = np.asarray(bins_size)
-
-         self.bins_size = bins_size
-         self.measure = measure
-         self.prevs = None  # Array of prevalences that minimizes the distances
-
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         """
-         Compute the prevalence estimate based on the test scores.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         prevalence : float
-             Estimated prevalence.
-         """
-         prevs = self.GetMinDistancesDyS(test_scores)
-         # Use the median of the prevalences as the final estimate
-         prevalence = np.median(prevs)
-
-         return prevalence
-
-     def best_distance(self, X_test: np.ndarray) -> float:
-         """
-         Calculate the minimum distance between test scores and train distributions.
-
-         Parameters
-         ----------
-         X_test : np.ndarray
-             Test data to evaluate.
-
-         Returns
-         -------
-         distance : float
-             The minimum distance value.
-         """
-         test_scores = self.predict_learner(X_test)
-         prevs = self.GetMinDistancesDyS(test_scores)
-
-         size = len(prevs)
-         best_prev = np.median(prevs)
-
-         if size % 2 != 0:  # Odd
-             index = np.argmax(prevs == best_prev)
-             bin_size = self.bins_size[index]
-         else:  # Even
-             # Sort the prevalences
-             ordered_prevs = np.sort(prevs)
-             # Get the two middle indices
-             middle1 = np.floor(size / 2).astype(int)
-             middle2 = np.ceil(size / 2).astype(int)
-             # Find the values corresponding to the median positions
-             median1 = ordered_prevs[middle1]
-             median2 = ordered_prevs[middle2]
-             # Find the indices of these medians
-             index1 = np.argmax(prevs == median1)
-             index2 = np.argmax(prevs == median2)
-             # Compute the average bin size
-             bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
-
-         # Compute histogram densities
-         pos_bin_density = getHist(self.pos_scores, bin_size)
-         neg_bin_density = getHist(self.neg_scores, bin_size)
-         test_bin_density = getHist(test_scores, bin_size)
-
-         # Combine densities
-         train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
-
-         # Compute the distance
-         distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-         return distance
-
-     def GetMinDistancesDyS(self, test_scores: np.ndarray) -> list:
-         """
-         Compute prevalence by evaluating the distance metric across bin sizes.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         prevs : list
-             List of prevalence estimates minimizing the distance for each bin size.
-         """
-         prevs = []
-
-         # Iterate over each bin size
-         for bins in self.bins_size:
-             # Compute histogram densities
-             pos_bin_density = getHist(self.pos_scores, bins)
-             neg_bin_density = getHist(self.neg_scores, bins)
-             test_bin_density = getHist(test_scores, bins)
-
-             # Define the function to minimize
-             def f(x):
-                 # Combine densities
-                 train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                 # Compute the distance
-                 return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-             # Use ternary search to minimize the distance
-             prevs.append(ternary_search(0, 1, f))
-
-         return prevs
-
-
-
-
-
-
-
-
-
-
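Editor's note: DyS repeats the histogram fit once per bin size and reports the median estimate. Below is a self-contained sketch of that selection step, using a squared distance as a stand-in for the configurable measure and a simple ternary search over the unimodal distance curve; all names are hypothetical, not package API.

import numpy as np

def _density(scores, bins):
    counts, _ = np.histogram(scores, bins=int(bins), range=(0, 1))
    return counts / max(counts.sum(), 1)

def _ternary_min(f, lo=0.0, hi=1.0, tol=1e-4):
    # Ternary search for the minimizer of a unimodal function on [lo, hi]
    while hi - lo > tol:
        m1, m2 = lo + (hi - lo) / 3, hi - (hi - lo) / 3
        lo, hi = (m1, hi) if f(m1) > f(m2) else (lo, m2)
    return (lo + hi) / 2

def dys_like_estimate(pos, neg, test, bins_sizes=(2, 4, 8, 16, 30)):
    prevs = []
    for b in bins_sizes:
        p, q, t = _density(pos, b), _density(neg, b), _density(test, b)
        # Best mixture weight for this bin size under a squared distance
        prevs.append(_ternary_min(lambda a: np.sum((a * p + (1 - a) * q - t) ** 2)))
    return float(np.median(prevs))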
- class DySsyn(MixtureModel):
-     """Synthetic Distribution y-Similarity (DySsyn).
-
-     This method works similarly to the DyS method, but instead of using the
-     train scores, it generates them via MoSS (Model for Synthetic Scores).
-     MoSS creates a spectrum of score distributions ranging from highly separated
-     to fully mixed scores.
-
-     Parameters
-     ----------
-     learner : BaseEstimator
-         A probabilistic classifier implementing the `predict_proba` method.
-     measure : str, optional
-         The metric used to compare distributions. Options are:
-         - "hellinger"
-         - "topsoe"
-         - "probsymm"
-         Default is "topsoe".
-     merge_factor : np.ndarray, optional
-         Array controlling the mixing level of synthetic distributions.
-         Default is np.linspace(0.1, 0.4, 10).
-     bins_size : np.ndarray, optional
-         Array of bin sizes for histogram computation.
-         Default is np.append(np.linspace(2, 20, 10), 30).
-     alpha_train : float, optional
-         Initial estimate of the training prevalence. Default is 0.5.
-     n : int, optional
-         Number of synthetic samples generated. Default is None.
-
-     Attributes
-     ----------
-     bins_size : np.ndarray
-         Bin sizes used for histogram calculations.
-     merge_factor : np.ndarray
-         Mixing factors for generating synthetic score distributions.
-     alpha_train : float
-         True training prevalence.
-     n : int
-         Number of samples generated during synthetic distribution creation.
-     measure : str
-         Selected distance metric.
-     m : None or float
-         Best mixing factor determined during computation.
-
-     References
-     ----------
-     MALETZKE, André et al. Accurately quantifying under score variability. In: 2021 IEEE International Conference on Data Mining (ICDM). IEEE, 2021. p. 1228-1233. Available at https://ieeexplore.ieee.org/abstract/document/9679104
-
-     Examples
-     --------
-     >>> from mlquantify.methods.mixture_models import DySsyn
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-     >>>
-     >>> dyssyn = DySsyn(RandomForestClassifier())
-     >>> dyssyn.fit(X_train, y_train)
-     >>> prevalence = dyssyn.predict(X_test)
-     >>> prevalence
-     {0: 0.3606413872681201, 1: 0.6393586127318799}
-     >>> get_real_prev(y_test)
-     {0: 0.37719298245614036, 1: 0.6228070175438597}
-     """
-
-
-     def __init__(self, learner: BaseEstimator=None, measure: str="topsoe", merge_factor: np.ndarray=None, bins_size: np.ndarray=None, alpha_train: float=0.5, n: int=None):
-         assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
-         super().__init__(learner)
-
-         # Set up bins_size
-         if not bins_size:
-             bins_size = np.append(np.linspace(2, 20, 10), 30)
-         if isinstance(bins_size, list):
-             bins_size = np.asarray(bins_size)
-
-         if not merge_factor:
-             merge_factor = np.linspace(0.1, 0.4, 10)
-
-         self.bins_size = bins_size
-         self.merge_factor = merge_factor
-         self.alpha_train = alpha_train
-         self.n = n
-         self.measure = measure
-         self.m = None
-
-
-
-     def _fit_method(self, X, y):
-         """
-         Fits the learner and calculates the training prevalence.
-
-         Parameters
-         ----------
-         X : array-like of shape (n_samples, n_features)
-             Training data.
-         y : array-like of shape (n_samples,)
-             Training labels.
-
-         Returns
-         -------
-         self : DySsyn
-             The fitted DySsyn instance.
-         """
-         self.fit_learner(X, y)
-
-         self.alpha_train = list(get_real_prev(y).values())[1]
-
-         return self
-
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         """
-         Computes the prevalence estimate using test scores.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         prevalence : float
-             Estimated prevalence based on the minimum distance
-             across synthetic distributions.
-         """
-         distances = self.GetMinDistancesDySsyn(test_scores)
-
-         # Keep the estimate from the merge factor whose synthetic mixture fits the test scores best
-         index = min(distances, key=lambda d: distances[d][0])
-         prevalence = distances[index][1]
-
-         return prevalence
-
-     def best_distance(self, X_test):
-         """
-         Computes the minimum distance between test scores and synthetic distributions of MoSS.
-
-         Parameters
-         ----------
-         X_test : array-like of shape (n_samples, n_features)
-             Test data.
-
-         Returns
-         -------
-         distance : float
-             Minimum distance value for the test data.
-         """
-         test_scores = self.predict_learner(X_test)
-
-         distances = self.GetMinDistancesDySsyn(test_scores)
-
-         index = min(distances, key=lambda d: distances[d][0])
-
-         distance = distances[index][0]
-
-         return distance
-
-     def GetMinDistancesDySsyn(self, test_scores: np.ndarray) -> dict:
-         """
-         Calculates the minimum distances between test scores and synthetic distributions of MoSS
-         across various bin sizes and merge factors.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         values : dict
-             Dictionary mapping each merge factor (m) to a tuple containing:
-             - The minimum distance value.
-             - The corresponding prevalence estimate.
-         """
-         if self.n is None:
-             self.n = len(test_scores)
-
-         values = {}
-
-         # Iterate over each merge factor
-         for m in self.merge_factor:
-             pos_scores, neg_scores = MoSS(self.n, self.alpha_train, m)
-             prevs = []
-             for bins in self.bins_size:
-                 # Compute histogram densities for positive, negative, and test scores
-                 pos_bin_density = getHist(pos_scores, bins)
-                 neg_bin_density = getHist(neg_scores, bins)
-                 test_bin_density = getHist(test_scores, bins)
-
-                 # Define the function to minimize
-                 def f(x):
-                     # Combine densities using a mixture of positive and negative densities
-                     train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                     # Calculate the distance between combined density and test density
-                     return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-                 # Use ternary search to find the best x that minimizes the distance
-                 prevs.append(ternary_search(0, 1, f))
-
-             size = len(prevs)
-             best_prev = np.median(prevs)
-
-             if size % 2 != 0:  # Odd
-                 index = np.argmax(prevs == best_prev)
-                 bin_size = self.bins_size[index]
-             else:  # Even
-                 # Sort the prevalences
-                 ordered_prevs = np.sort(prevs)
-
-                 # Find the two middle indices
-                 middle1 = np.floor(size / 2).astype(int)
-                 middle2 = np.ceil(size / 2).astype(int)
-
-                 # Get the values corresponding to the median positions
-                 median1 = ordered_prevs[middle1]
-                 median2 = ordered_prevs[middle2]
-
-                 # Find the indices of median1 and median2 in prevs
-                 index1 = np.argmax(prevs == median1)
-                 index2 = np.argmax(prevs == median2)
-
-                 # Calculate the average of the corresponding bin sizes
-                 bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
-
-             pos_bin_density = getHist(pos_scores, bin_size)
-             neg_bin_density = getHist(neg_scores, bin_size)
-             test_bin_density = getHist(test_scores, bin_size)
-
-             train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
-
-             distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-             values[m] = (distance, best_prev)
-
-         return values
-
-
-
-
-
-
-
-
-
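Editor's note: DySsyn replaces the training score distributions with synthetic ones and keeps the estimate from the best-fitting merge factor. The sketch below shows only that selection loop; the uniform-based generator inside it is a deliberately simplified, hypothetical stand-in for MoSS (not the published generator), and all function names are invented for illustration.

import numpy as np

def _density(scores, bins=10):
    c, _ = np.histogram(scores, bins=bins, range=(0, 1))
    return c / max(c.sum(), 1)

def dyssyn_like_estimate(test_scores, alpha_train=0.5, merge_factors=(0.1, 0.2, 0.3, 0.4), seed=0):
    rng = np.random.default_rng(seed)
    n = len(test_scores)
    best = (np.inf, alpha_train)
    for m in merge_factors:
        # Hypothetical stand-in for MoSS: larger m means more overlap
        # between the synthetic positive and negative score distributions.
        n_pos = int(round(n * alpha_train))
        pos = rng.uniform(1 - m, 1, n_pos)
        neg = rng.uniform(0, m, n - n_pos)
        p, q, t = _density(pos), _density(neg), _density(test_scores)
        alphas = np.linspace(0, 1, 101)
        dists = [np.sum((a * p + (1 - a) * q - t) ** 2) for a in alphas]
        i = int(np.argmin(dists))
        if dists[i] < best[0]:
            best = (dists[i], alphas[i])  # keep the estimate from the best-fitting m
    return best[1]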
- class HDy(MixtureModel):
-     """
-     Hellinger Distance Minimization (HDy) framework.
-
-     HDy is based on computing the Hellinger distance between two distributions:
-     the test distribution and the mixture of the positive and negative
-     distributions from the training data.
-
-     Parameters
-     ----------
-     learner : BaseEstimator
-         A supervised learning model implementing a `predict_proba` method.
-
-     Attributes
-     ----------
-     pos_scores : np.ndarray
-         Score distribution for the positive class in the training data.
-     neg_scores : np.ndarray
-         Score distribution for the negative class in the training data.
-
-     References
-     ----------
-     GONZÁLEZ-CASTRO, Víctor; ALAIZ-RODRÍGUEZ, Rocío; ALEGRE, Enrique. Class distribution estimation based on the Hellinger distance. Information Sciences, v. 218, p. 146-164, 2013. Available at https://www.sciencedirect.com/science/article/abs/pii/S0020025512004069
-
-     Examples
-     --------
-     >>> from mlquantify.methods.mixture_models import HDy
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-     >>>
-     >>> hdy = HDy(RandomForestClassifier())
-     >>> hdy.fit(X_train, y_train)
-     >>> prevalence = hdy.predict(X_test)
-     >>> prevalence
-     {0: 0.33999999999999997, 1: 0.66}
-     >>> get_real_prev(y_test)
-     {0: 0.37719298245614036, 1: 0.6228070175438597}
-     """
-
-     def __init__(self, learner: BaseEstimator=None):
-         super().__init__(learner)
-
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         """
-         Compute the prevalence estimate based on test scores.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         prevalence : float
-             Estimated prevalence.
-         """
-         best_alphas, _ = self.GetMinDistancesHDy(test_scores)
-         # Use the median of the best alpha values as the final prevalence estimate
-         prevalence = np.median(best_alphas)
-
-         return prevalence
-
-     def best_distance(self, X_test: np.ndarray) -> float:
-         """
-         Calculate the minimum Hellinger distance for the test data.
-
-         Parameters
-         ----------
-         X_test : np.ndarray
-             Test data to evaluate.
-
-         Returns
-         -------
-         distance : float
-             The minimum distance value.
-         """
-         test_scores = self.predict_learner(X_test)
-         _, distances = self.GetMinDistancesHDy(test_scores)
-
-         size = len(distances)
-
-         if size % 2 != 0:  # Odd
-             index = size // 2
-             distance = distances[index]
-         else:  # Even
-             # Find the two middle indices
-             middle1 = np.floor(size / 2).astype(int)
-             middle2 = np.ceil(size / 2).astype(int)
-             # Compute the average of the corresponding distances
-             distance = np.mean([distances[middle1], distances[middle2]])
-
-         return distance
-
-     def GetMinDistancesHDy(self, test_scores: np.ndarray) -> tuple:
-         """
-         Compute prevalence by minimizing the Hellinger distance across bins and alphas.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         best_alphas : list
-             List of alpha values that minimize the Hellinger distance for each bin size.
-         distances : list
-             List of minimum distances corresponding to the best alphas for each bin size.
-         """
-         # Define bin sizes and alpha values
-         bins_size = np.arange(10, 110, 11)  # Bin sizes from 10 to 109 in steps of 11
-         alpha_values = np.round(np.linspace(0, 1, 101), 2)  # Alpha values from 0 to 1, rounded to 2 decimal places
-
-         best_alphas = []
-         distances = []
-
-         for bins in bins_size:
-             # Compute histogram densities for positive, negative, and test scores
-             pos_bin_density = getHist(self.pos_scores, bins)
-             neg_bin_density = getHist(self.neg_scores, bins)
-             test_bin_density = getHist(test_scores, bins)
-
-             bin_distances = []
-
-             # Evaluate distance for each alpha value
-             for x in alpha_values:
-                 # Combine densities using a mixture of positive and negative densities
-                 train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                 # Compute the distance using the Hellinger measure
-                 bin_distances.append(self.get_distance(train_combined_density, test_bin_density, measure="hellinger"))
-
-             # Find the alpha value that minimizes the distance
-             best_alpha = alpha_values[np.argmin(bin_distances)]
-             min_distance = min(bin_distances)
-
-             best_alphas.append(best_alpha)
-             distances.append(min_distance)
-
-         return best_alphas, distances
-
-
-
-
-
-
-
-
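Editor's note: HDy fixes the measure to the Hellinger distance and grid-searches the mixture weight for each bin size, taking the median of the per-bin winners. A compact sketch under the same assumptions as the earlier sketches (scores in [0, 1], hypothetical names, plain numpy rather than package API):

import numpy as np

def hellinger(p, q):
    # Hellinger distance between two discrete densities
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def hdy_like_estimate(pos_scores, neg_scores, test_scores, bins_sizes=range(10, 110, 10)):
    alphas = np.round(np.linspace(0, 1, 101), 2)
    best_alphas = []
    for b in bins_sizes:
        def density(s):
            c, _ = np.histogram(s, bins=b, range=(0, 1))
            return c / max(c.sum(), 1)
        p, q, t = density(pos_scores), density(neg_scores), density(test_scores)
        d = [hellinger(a * p + (1 - a) * q, t) for a in alphas]
        best_alphas.append(alphas[int(np.argmin(d))])
    return float(np.median(best_alphas))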
- class SMM(MixtureModel):
-     """
-     Sample Mean Matching (SMM).
-
-     A member of the DyS framework that estimates the prevalence
-     of the positive class in a test dataset by leveraging simple
-     mean values to represent the score distributions for positive,
-     negative, and unlabeled data.
-
-     Parameters
-     ----------
-     learner : BaseEstimator
-         A supervised learning model implementing a `predict_proba` method.
-
-     Attributes
-     ----------
-     pos_scores : np.ndarray
-         Score distribution for the positive class in the training data.
-     neg_scores : np.ndarray
-         Score distribution for the negative class in the training data.
-
-     References
-     ----------
-     HASSAN, Waqar; MALETZKE, André; BATISTA, Gustavo. Accurately quantifying a billion instances per second. In: 2020 IEEE 7th International Conference on Data Science and Advanced Analytics (DSAA). IEEE, 2020. p. 1-10. Available at https://ieeexplore.ieee.org/document/9260028
-
-     Examples
-     --------
-     >>> from mlquantify.methods.mixture_models import SMM
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-     >>>
-     >>> smm = SMM(RandomForestClassifier())
-     >>> smm.fit(X_train, y_train)
-     >>> prevalence = smm.predict(X_test)
-     >>> prevalence
-     {0: 0.38358048188348526, 1: 0.6164195181165147}
-     >>> get_real_prev(y_test)
-     {0: 0.37719298245614036, 1: 0.6228070175438597}
-     """
-
-     def __init__(self, learner: BaseEstimator=None):
-         super().__init__(learner)
-
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         """
-         Compute the prevalence estimate based on mean scores.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         prevalence : float
-             Estimated prevalence.
-         """
-         mean_pos_score = np.mean(self.pos_scores)
-         mean_neg_score = np.mean(self.neg_scores)
-         mean_test_score = np.mean(test_scores)
-
-         # Calculate prevalence as the proportion of the positive class
-         prevalence = (mean_test_score - mean_neg_score) / (mean_pos_score - mean_neg_score)
-
-         return prevalence
-
-
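Editor's note: SMM needs no search at all; the closed form used in _compute_prevalence above can be written in a few lines. A hedged numpy sketch with a hypothetical name follows, adding only the clip to [0, 1] that the class itself applies later in _predict_method:

import numpy as np

def smm_like_estimate(pos_scores, neg_scores, test_scores):
    # Match the mean test score to a convex combination of the class means,
    # then clip, since sampling noise can push the ratio outside [0, 1].
    p_hat = (np.mean(test_scores) - np.mean(neg_scores)) / (np.mean(pos_scores) - np.mean(neg_scores))
    return float(np.clip(p_hat, 0, 1))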
- class SORD(MixtureModel):
-     """
-     Sample Ordinal Distance (SORD).
-
-     A method that estimates the prevalence of the positive class
-     in a test dataset by calculating and minimizing a sample ordinal
-     distance measure between test scores and known positive and
-     negative scores. This approach does not rely on distributional
-     assumptions.
-
-     Parameters
-     ----------
-     learner : BaseEstimator
-         A supervised learning model implementing a `predict_proba` method.
-
-     Attributes
-     ----------
-     pos_scores : np.ndarray
-         Score distribution for the positive class in the training data.
-     neg_scores : np.ndarray
-         Score distribution for the negative class in the training data.
-     best_distance_index : int
-         Index of the best alpha value.
-
-     References
-     ----------
-     VAN HASSELT, H.; GUEZ, A.; SILVER, D. Proceedings of the AAAI conference on artificial intelligence. 2016. Available at https://ojs.aaai.org/index.php/AAAI/article/view/4376
-
-     Examples
-     --------
-     >>> from mlquantify.methods.mixture_models import SORD
-     >>> from mlquantify.utils.general import get_real_prev
-     >>> from sklearn.ensemble import RandomForestClassifier
-     >>> from sklearn.datasets import load_breast_cancer
-     >>> from sklearn.model_selection import train_test_split
-     >>>
-     >>> features, target = load_breast_cancer(return_X_y=True)
-     >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-     >>>
-     >>> sord = SORD(RandomForestClassifier())
-     >>> sord.fit(X_train, y_train)
-     >>> prevalence = sord.predict(X_test)
-     >>> prevalence
-     {0: 0.38, 1: 0.62}
-     >>> get_real_prev(y_test)
-     {0: 0.37719298245614036, 1: 0.6228070175438597}
-     """
-
-     def __init__(self, learner: BaseEstimator=None):
-         super().__init__(learner)
-
-         self.best_distance_index = None  # Stores the index of the best alpha value
-
-     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-         """
-         Compute the prevalence estimate by minimizing the ordinal distance.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         prevalence : float
-             Estimated prevalence.
-         """
-         # Compute alpha values and corresponding distance measures
-         alpha_values, distance_measures = self._calculate_distances(test_scores)
-
-         # Find the index of the alpha value with the minimum distance measure
-         self.best_distance_index = np.argmin(distance_measures)
-         prevalence = alpha_values[self.best_distance_index]
-
-         return prevalence
-
-     def _calculate_distances(self, test_scores: np.ndarray) -> tuple:
-         """
-         Calculate distance measures for a range of alpha values.
-
-         Parameters
-         ----------
-         test_scores : np.ndarray
-             Array of predicted probabilities for the test data.
-
-         Returns
-         -------
-         alpha_values : np.ndarray
-             Array of alpha values (from 0 to 1) used for evaluation.
-         distance_measures : list
-             List of distance measures for each alpha value.
-         """
-         # Define a range of alpha values from 0 to 1
-         alpha_values = np.linspace(0, 1, 101)
-
-         # Get the number of positive, negative, and test scores
-         num_pos_scores = len(self.pos_scores)
-         num_neg_scores = len(self.neg_scores)
-         num_test_scores = len(test_scores)
-
-         distance_measures = []
-
-         # Iterate over each alpha value
-         for alpha in alpha_values:
-             # Compute weights for positive, negative, and test scores
-             pos_weight = alpha / num_pos_scores
-             neg_weight = (1 - alpha) / num_neg_scores
-             test_weight = -1 / num_test_scores
-
-             # Create arrays with weights
-             pos_weights = np.full(num_pos_scores, pos_weight)
-             neg_weights = np.full(num_neg_scores, neg_weight)
-             test_weights = np.full(num_test_scores, test_weight)
-
-             # Concatenate all scores and their corresponding weights
-             all_scores = np.concatenate([self.pos_scores, self.neg_scores, test_scores])
-             all_weights = np.concatenate([pos_weights, neg_weights, test_weights])
-
-             # Sort scores and weights based on scores
-             sorted_indices = np.argsort(all_scores)
-             sorted_scores = all_scores[sorted_indices]
-             sorted_weights = all_weights[sorted_indices]
-
-             # Compute the total cost for the current alpha
-             cumulative_weight = sorted_weights[0]
-             total_cost = 0
-
-             for i in range(1, len(sorted_scores)):
-                 # Calculate the cost for the segment between sorted scores
-                 segment_width = sorted_scores[i] - sorted_scores[i - 1]
-                 total_cost += abs(segment_width * cumulative_weight)
-                 cumulative_weight += sorted_weights[i]
-
-             distance_measures.append(total_cost)
-
-         return alpha_values, distance_measures
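Editor's note: SORD scores each candidate prevalence with a weighted ordinal cost and keeps the minimizer. The sketch below mirrors the loop in _calculate_distances above in vectorized numpy form; the name sord_like_estimate is hypothetical and not part of the package.

import numpy as np

def sord_like_estimate(pos_scores, neg_scores, test_scores):
    alphas = np.linspace(0, 1, 101)
    costs = []
    for a in alphas:
        scores = np.concatenate([pos_scores, neg_scores, test_scores])
        weights = np.concatenate([
            np.full(len(pos_scores), a / len(pos_scores)),
            np.full(len(neg_scores), (1 - a) / len(neg_scores)),
            np.full(len(test_scores), -1 / len(test_scores)),
        ])
        order = np.argsort(scores)
        s, w = scores[order], weights[order]
        cum = np.cumsum(w)[:-1]  # running weight between consecutive sorted scores
        costs.append(np.sum(np.abs(np.diff(s) * cum)))
    return float(alphas[int(np.argmin(costs))])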