mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. mlquantify/__init__.py +10 -29
  2. mlquantify/adjust_counting/__init__.py +24 -0
  3. mlquantify/adjust_counting/_adjustment.py +648 -0
  4. mlquantify/adjust_counting/_base.py +245 -0
  5. mlquantify/adjust_counting/_counting.py +153 -0
  6. mlquantify/adjust_counting/_utils.py +109 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +329 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +147 -0
  13. mlquantify/likelihood/_classes.py +430 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +785 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +147 -0
  22. mlquantify/mixture/_classes.py +458 -0
  23. mlquantify/mixture/_utils.py +163 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +168 -0
  31. mlquantify/neighbors/_classes.py +150 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
  33. mlquantify/neighbors/_kde.py +268 -0
  34. mlquantify/neighbors/_utils.py +131 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +64 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.10.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
mlquantify/methods/aggregative.py (deleted)
@@ -1,1159 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from scipy.optimize import minimize
4
- from ..base import AggregativeQuantifier
5
- from ..utils.method import *
6
-
7
- from sklearn.base import BaseEstimator
8
- from sklearn.metrics import confusion_matrix
9
- from sklearn.model_selection import train_test_split
10
- import mlquantify as mq
11
-
12
-
13
-
14
-
15
-
16
- class CC(AggregativeQuantifier):
17
- """Classify and Count (CC).
18
-
19
- The simplest quantification method involves classifying each instance
20
- and then counting the number of instances assigned to each class to
21
- estimate the class prevalence.
26
-
27
- Attributes
28
- ----------
29
- learner : BaseEstimator
30
- The machine learning model used to classify the instances.
31
- It must be an estimator from scikit-learn (e.g., LogisticRegression,
32
- RandomForestClassifier).
33
-
34
- See Also
35
- --------
36
- AggregativeQuantifier : Base class for aggregative quantification methods.
37
-
38
- References
39
- ----------
40
- FORMAN, George. Quantifying counts and costs via classification.
41
- Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008.
42
- Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y
43
-
44
- Parameters
45
- ----------
46
- learner : BaseEstimator
47
- A scikit-learn-compatible model that serves as the classifier.
48
-
49
- Methods
50
- -------
51
- fit(X, y)
52
- Fits the learner to the data.
53
-
54
- predict(X) -> dict
55
- Predicts the class labels for the given data and calculates
56
- the prevalence of each class based on the predictions.
57
-
58
- Examples
59
- --------
60
- >>> from mlquantify.utils.general import get_real_prev
61
- >>> from mlquantify.methods.aggregative import CC
62
- >>> from sklearn.ensemble import RandomForestClassifier
63
- >>> from sklearn.datasets import load_wine
64
- >>> from sklearn.model_selection import train_test_split
65
- >>>
66
- >>> features, target = load_wine(return_X_y=True)
67
- >>>
68
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
69
- >>>
70
- >>> cc = CC(RandomForestClassifier())
71
- >>> cc.fit(X_train, y_train)
72
- >>> y_pred = cc.predict(X_test)
73
- >>> y_pred
74
- {0: 0.4305555555555556, 1: 0.2916666666666667, 2: 0.2777777777777778}
75
- >>> get_real_prev(y_test)
76
- {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
77
- """
78
-
79
- def __init__(self, learner: BaseEstimator=None):
80
- self.learner = learner
81
-
82
- def _fit_method(self, X, y):
83
- """
84
- Fits the learner to the data. This method is used internally.
85
-
86
- Parameters
87
- ----------
88
- X : array-like
89
- Feature matrix.
90
- y : array-like
91
- Target labels.
92
-
93
- Returns
94
- -------
95
- self : CC
96
- The instance of the CC class.
97
- """
98
- self.fit_learner(X, y)
99
- return self
100
-
101
- def _predict_method(self, X) -> np.ndarray:
102
- """
103
- Predicts the class labels for the given data and calculates
104
- the prevalence of each class based on the predictions.
105
-
106
- Parameters
107
- ----------
108
- X : array-like
109
- Feature matrix for prediction.
110
-
111
- Returns
112
- -------
113
- array-like
114
- An array containing the prevalence of each class.
115
- """
116
- predicted_labels = self.predict_learner(X)
117
-
118
- # Count occurrences of each class in the predictions
119
- class_counts = np.array([np.count_nonzero(predicted_labels == _class) for _class in self.classes])
120
-
121
- # Calculate the prevalence of each class
122
- prevalences = class_counts / len(predicted_labels)
123
-
124
- return prevalences
125
-
126
-
127
-
128
-
129
-
130
-
131
-
132
-
133
-
134
- class EMQ(AggregativeQuantifier):
135
- """Expectation Maximisation Quantifier (EMQ).
136
-
137
- EMQ is a quantification method that iteratively adjusts the prior
138
- and posterior probabilities of a learner using the Expectation-Maximisation (EM) algorithm.
139
- It is particularly useful for scenarios where the class distribution in the test set
140
- differs from that in the training set.
141
-
142
- Attributes
143
- ----------
144
- learner : BaseEstimator
145
- A scikit-learn-compatible model used to classify the instances.
146
- priors : array-like
147
- Prior probabilities of the classes, estimated from the training data.
148
-
149
- References
150
- ----------
151
- SAERENS, Marco; LATINNE, Patrice; DECAESTECKER, Christine. Adjusting the outputs of a classifier
152
- to new a priori probabilities: a simple procedure. Neural Computation, v. 14, n. 1, p. 21-41, 2002.
153
- Available at: https://ieeexplore.ieee.org/abstract/document/6789744
154
-
155
- Examples
156
- --------
157
- >>> from mlquantify.methods.aggregative import EMQ
158
- >>> from mlquantify.utils.general import get_real_prev
159
- >>> from sklearn.ensemble import RandomForestClassifier
160
- >>> from sklearn.datasets import load_wine
161
- >>> from sklearn.model_selection import train_test_split
162
- >>>
163
- >>> features, target = load_wine(return_X_y=True)
164
- >>>
165
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
166
- >>>
167
- >>> emq = EMQ(RandomForestClassifier())
168
- >>> emq.fit(X_train, y_train)
169
- >>> prevalences = emq.predict(X_test)
170
- >>> print(prevalences)
171
- {0: 0.4466744706195974, 1: 0.29747794914814046, 2: 0.25584758023226206}
172
- >>> get_real_prev(y_test)
173
- {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
174
- """
175
-
176
- MAX_ITER = 1000
177
- EPSILON = 1e-6
178
-
179
- @property
180
- def is_probabilistic(self) -> bool:
181
- return True
182
-
183
- def __init__(self, learner: BaseEstimator=None):
184
- self.learner = learner
185
- self.priors = None
186
-
187
- def _fit_method(self, X, y):
188
- """
189
- Fits the learner to the training data and calculates prior probabilities.
190
-
191
- Parameters
192
- ----------
193
- X : array-like
194
- Feature matrix for training.
195
- y : array-like
196
- Target labels for training.
197
-
198
- Returns
199
- -------
200
- self : EMQ
201
- The fitted instance of EMQ.
202
- """
203
- self.fit_learner(X, y)
204
-
205
- counts = np.array([np.count_nonzero(y == _class) for _class in self.classes])
206
- self.priors = counts / len(y)
207
-
208
- return self
209
-
210
- def _predict_method(self, X) -> dict:
211
- """
212
- Predicts the prevalence of each class in the test data.
213
-
214
- Parameters
215
- ----------
216
- X : array-like
217
- Feature matrix for prediction.
218
-
219
- Returns
220
- -------
221
- dict
222
- A dictionary with class labels as keys and their prevalence as values.
223
- """
224
- posteriors = self.predict_learner(X)
225
- prevalences, _ = self.EM(self.priors, posteriors)
226
-
227
- return prevalences
228
-
229
- def predict_proba(self, X, epsilon: float = EPSILON, max_iter: int = MAX_ITER) -> np.ndarray:
230
- """
231
- Predicts the posterior probabilities for the test data after adjustment using EM.
232
-
233
- Parameters
234
- ----------
235
- X : array-like
236
- Feature matrix for prediction.
237
- epsilon : float, optional
238
- Convergence threshold for the EM algorithm (default: EPSILON).
239
- max_iter : int, optional
240
- Maximum number of iterations for the EM algorithm (default: MAX_ITER).
241
-
242
- Returns
243
- -------
244
- np.ndarray
245
- Adjusted posterior probabilities.
246
- """
247
- posteriors = self.predict_learner(X)
248
- _, posteriors = self.EM(self.priors, posteriors, epsilon, max_iter)
249
- return posteriors
250
-
251
- @classmethod
252
- def EM(cls, priors, posteriors, epsilon=EPSILON, max_iter=MAX_ITER):
253
- """
254
- Expectation-Maximisation (EM) algorithm for adjusting prior and posterior probabilities.
255
-
256
- The algorithm iterates over the data, adjusting the probabilities until convergence
257
- or reaching the maximum number of iterations. It estimates the class prevalence
258
- and adjusts the posterior probabilities for each class.
259
-
260
- Parameters
261
- ----------
262
- priors : array-like
263
- Initial prior probabilities for each class.
264
- posteriors : array-like
265
- Initial posterior probabilities for each test instance and class.
266
- epsilon : float, optional
267
- Convergence threshold (default: EPSILON).
268
- max_iter : int, optional
269
- Maximum number of iterations (default: MAX_ITER).
270
-
271
- Returns
272
- -------
273
- tuple
274
- Adjusted prevalence (array-like) and updated posterior probabilities (array-like).
275
- """
276
- Px = posteriors
277
- prev_prevalence = np.copy(priors)
278
- running_estimate = np.copy(prev_prevalence) # Initialized with the training prevalence
279
-
280
- iteration, converged = 0, False
281
- previous_estimate = None
282
-
283
- while not converged and iteration < max_iter:
284
- # E-step: Compute unnormalized posteriors
285
- posteriors_unnormalized = (running_estimate / prev_prevalence) * Px
286
- posteriors = posteriors_unnormalized / posteriors_unnormalized.sum(axis=1, keepdims=True)
287
-
288
- # M-step: Update the running prevalence estimate
289
- running_estimate = posteriors.mean(axis=0)
290
-
291
- if previous_estimate is not None and np.mean(np.abs(running_estimate - previous_estimate)) < epsilon and iteration > 10:
292
- converged = True
293
-
294
- previous_estimate = running_estimate
295
- iteration += 1
296
-
297
- if not converged:
298
- print('[Warning] The method has reached the maximum number of iterations; it might not have converged')
299
-
300
- return running_estimate, posteriors
301
-
302
-
303
-
304
-
305
-
306
-
307
-
308
-
309
-
310
-
311
- class FM(AggregativeQuantifier):
312
- """The Friedman Method (FM).
313
-
314
- FM is a quantification method similar to GPAC (Generalized Probabilistic Adjusted Count),
315
- but instead of averaging confidence scores from probabilistic classifiers,
316
- it uses the proportion of confidence scores that exceed the expected class frequencies
317
- estimated from the training data.
318
-
319
- This method leverages a confusion matrix computed during training to adjust
320
- class prevalences in the test set, solving an optimization problem to align
321
- predicted and actual distributions.
322
-
323
- Attributes
324
- ----------
325
- learner : BaseEstimator
326
- A scikit-learn-compatible model used for classification.
327
- CM : np.ndarray
328
- The confusion matrix, normalized by class counts.
329
- priors : array-like
330
- Prior probabilities of the classes, estimated from training data.
331
-
332
- References
333
- ----------
334
- Friedman, J. (2001). Quantification via Classification. Presentation.
335
- Available at: https://jerryfriedman.su.domains/talks/qc.pdf
336
-
337
- Examples
338
- --------
339
- >>> from mlquantify.utils.general import get_real_prev
340
- >>> from mlquantify.methods.aggregative import FM
341
- >>> from sklearn.ensemble import RandomForestClassifier
342
- >>> from sklearn.datasets import load_wine
343
- >>> from sklearn.model_selection import train_test_split
344
- >>>
345
- >>> features, target = load_wine(return_X_y=True)
346
- >>>
347
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
348
- >>>
349
- >>> fm = FM(RandomForestClassifier())
350
- >>> fm.fit(X_train, y_train)
351
- >>> y_pred = fm.predict(X_test)
352
- >>> y_pred
353
- {0: 0.4207283701943278, 1: 0.3049753216939303, 2: 0.27429630811174194}
354
- >>> get_real_prev(y_test)
355
- {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
356
- """
357
-
358
- @property
359
- def is_probabilistic(self) -> bool:
360
- return True
361
-
362
-
363
- def __init__(self, learner: BaseEstimator=None):
364
- self.learner = learner
365
- self.CM = None
366
-
367
- def _fit_method(self, X, y):
368
- """
369
- Fits the learner and computes the confusion matrix.
370
-
371
- The confusion matrix is computed based on cross-validated predicted labels
372
- and probabilities. It represents the proportions of confidence scores
373
- exceeding the priors for each class.
374
-
375
- Parameters
376
- ----------
377
- X : array-like
378
- Feature matrix for training.
379
- y : array-like
380
- Target labels for training.
381
-
382
- Returns
383
- -------
384
- self : FM
385
- The fitted instance of FM.
386
- """
387
- # Get predicted labels and probabilities using cross-validation
388
- if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
389
- y_labels = mq.arguments["y_labels"]
390
- probabilities = mq.arguments["posteriors_train"]
391
- else:
392
- y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
393
-
394
- # Fit the learner if it hasn't been fitted already
395
- self.fit_learner(X, y)
396
-
397
- # Initialize the confusion matrix
398
- CM = np.zeros((self.n_class, self.n_class))
399
-
400
- # Calculate the class priors
401
- class_counts = np.array([np.count_nonzero(y_labels == _class) for _class in self.classes])
402
- self.priors = class_counts / len(y_labels)
403
-
404
- # Populate the confusion matrix
405
- for i, _class in enumerate(self.classes):
406
- indices = np.where(y_labels == _class)[0]
407
- CM[:, i] = np.sum(probabilities[indices] > self.priors, axis=0)
408
-
409
- # Normalize the confusion matrix by class counts
410
- self.CM = CM / class_counts
411
-
412
- return self
413
-
414
- def _predict_method(self, X) -> dict:
415
- """
416
- Predicts class prevalences in the test set using the confusion matrix.
417
-
418
- Solves an optimization problem to find class prevalences that best
419
- align with the observed proportions in the test set.
420
-
421
- Parameters
422
- ----------
423
- X : array-like
424
- Feature matrix for prediction.
425
-
426
- Returns
427
- -------
428
- dict
429
- A dictionary with class labels as keys and their prevalence as values.
430
- """
431
- posteriors = self.predict_learner(X)
432
-
433
- # Calculate the estimated prevalences in the test set
434
- prevs_estim = np.sum(posteriors > self.priors, axis=0) / posteriors.shape[0]
435
-
436
- # Define the objective function for optimization
437
- def objective(prevs_pred):
438
- return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)
439
-
440
- # Constraints for the optimization problem
441
- constraints = [
442
- {'type': 'eq', 'fun': lambda prevs_pred: np.sum(prevs_pred) - 1.0},
443
- {'type': 'ineq', 'fun': lambda prevs_pred: prevs_pred}
444
- ]
445
-
446
- # Initial guess for the optimization
447
- initial_guess = np.ones(self.CM.shape[1]) / self.CM.shape[1]
448
-
449
- # Solve the optimization problem
450
- result = minimize(objective, initial_guess, constraints=constraints, bounds=[(0, 1)] * self.CM.shape[1])
451
-
452
- if result.success:
453
- prevalences = result.x
454
- else:
455
- print("Optimization did not converge")
456
- prevalences = self.priors
457
-
458
- return prevalences
459
-
460
-
461
-
462
-
463
-
464
-
465
-
466
- class GAC(AggregativeQuantifier):
467
- """
468
- Generalized Adjusted Count (GAC).
469
-
470
- GAC is a quantification method that applies a classifier to estimate the distribution
471
- of class labels in the test set by solving a system of linear equations. This system
472
- is constructed using a conditional probability matrix derived from training data and
473
- is solved via constrained least-squares regression.
474
-
475
- Parameters
476
- ----------
477
- learner : BaseEstimator
478
- A scikit-learn-compatible model used for classification.
479
- train_size : float, optional
480
- Proportion of the dataset to include in the training split, by default 0.6.
481
- random_state : int, optional
482
- Random seed for reproducibility of data splits, by default None.
483
-
484
- Attributes
485
- ----------
486
- learner : BaseEstimator
487
- A scikit-learn-compatible model used for classification.
488
- cond_prob_matrix : np.ndarray
489
- Conditional probability matrix, representing P(yi|yj).
490
- train_size : float, optional
491
- Proportion of the dataset to include in the training split, by default 0.6.
492
- random_state : int, optional
493
- Random seed for reproducibility of data splits, by default None.
494
-
495
-
496
- References
497
- ----------
498
- Firat, Aykut. Unified framework for quantification. arXiv preprint arXiv:1606.00868, 2016.
499
- Available at: https://arxiv.org/abs/1606.00868
500
-
501
- Examples
502
- --------
503
- >>> from mlquantify.utils.general import get_real_prev
504
- >>> from mlquantify.methods.aggregative import GAC
505
- >>> from sklearn.ensemble import RandomForestClassifier
506
- >>> from sklearn.datasets import load_wine
507
- >>> from sklearn.model_selection import train_test_split
508
- >>>
509
- >>> features, target = load_wine(return_X_y=True)
510
- >>>
511
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
512
- >>>
513
- >>> gac = GAC(RandomForestClassifier())
514
- >>> gac.fit(X_train, y_train)
515
- >>> y_pred = gac.predict(X_test)
516
- >>> y_pred
517
- {0: 0.4305555555555556, 1: 0.2916666666666667, 2: 0.2777777777777778}
518
- >>> get_real_prev(y_test)
519
- {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
520
- """
521
-
522
-
523
- def __init__(self, learner: BaseEstimator=None, train_size:float=0.6, random_state:int=None):
524
- self.learner = learner
525
- self.cond_prob_matrix = None
526
- self.train_size = train_size
527
- self.random_state = random_state
528
-
529
- def _fit_method(self, X, y):
530
- """
531
- Trains the model and computes the conditional probability matrix.
532
-
533
- Parameters
534
- ----------
535
- X : pd.DataFrame or np.ndarray
536
- Features of the dataset.
537
- y : pd.Series or np.ndarray
538
- Labels of the dataset.
539
-
540
- Returns
541
- -------
542
- self : GAC
543
- Fitted quantifier object.
544
- """
545
- if isinstance(X, np.ndarray):
546
- X = pd.DataFrame(X)
547
- if isinstance(y, np.ndarray):
548
- y = pd.Series(y)
549
-
550
- if self.learner_fitted or self.learner is None:
551
- y_pred = mq.arguments["y_pred_train"] if mq.arguments["y_pred_train"] is not None else self.predict_learner(X)
552
- y_label = y
553
- else:
554
- X_train, X_val, y_train, y_val = train_test_split(
555
- X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
556
- )
557
- self.fit_learner(X_train, y_train)
558
- y_label = y_val
559
- y_pred = self.learner.predict(X_val)
560
-
561
- self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_label, y_pred)
562
- return self
563
-
564
- def _predict_method(self, X) -> dict:
565
- """
566
- Predicts the class prevalences in the test set and adjusts them.
567
-
568
- Parameters
569
- ----------
570
- X : pd.DataFrame or np.ndarray
571
- Features of the test dataset.
572
-
573
- Returns
574
- -------
575
- dict
576
- Adjusted class prevalences.
577
- """
578
- y_pred = self.predict_learner(X)
579
- _, counts = np.unique(y_pred, return_counts=True)
580
- predicted_prevalences = counts / counts.sum()
581
- adjusted_prevalences = self.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)
582
- return adjusted_prevalences
583
-
584
- @classmethod
585
- def get_cond_prob_matrix(cls, classes: list, y_labels: np.ndarray, predictions: np.ndarray) -> np.ndarray:
586
- """
587
- Computes the conditional probability matrix P(yi|yj).
588
-
589
- Parameters
590
- ----------
591
- classes : list
592
- List of class labels.
593
- y_labels : np.ndarray
594
- True labels from the validation set.
595
- predictions : np.ndarray
596
- Predicted labels from the classifier.
597
-
598
- Returns
599
- -------
600
- np.ndarray
601
- Conditional probability matrix.
602
- """
603
- CM = confusion_matrix(y_labels, predictions, labels=classes).T
604
- CM = CM.astype(float)
605
- class_counts = CM.sum(axis=0)
606
- for i, _ in enumerate(classes):
607
- if class_counts[i] == 0:
608
- CM[i, i] = 1
609
- else:
610
- CM[:, i] /= class_counts[i]
611
- return CM
612
-
613
- @classmethod
614
- def solve_adjustment(cls, cond_prob_matrix: np.ndarray, predicted_prevalences: np.ndarray) -> np.ndarray:
615
- """
616
- Solves the linear system Ax = B to adjust predicted prevalences.
617
-
618
- Parameters
619
- ----------
620
- cond_prob_matrix : np.ndarray
621
- Conditional probability matrix (A).
622
- predicted_prevalences : np.ndarray
623
- Predicted class prevalences (B).
624
-
625
- Returns
626
- -------
627
- np.ndarray
628
- Adjusted class prevalences.
629
- """
630
- A = cond_prob_matrix
631
- B = predicted_prevalences
632
- try:
633
- adjusted_prevalences = np.linalg.solve(A, B)
634
- adjusted_prevalences = np.clip(adjusted_prevalences, 0, 1)
635
- adjusted_prevalences /= adjusted_prevalences.sum()
636
- except np.linalg.LinAlgError:
637
- adjusted_prevalences = predicted_prevalences # Return unadjusted if adjustment fails
638
- return adjusted_prevalences
639
-
640
-
641
-
642
-
643
-
644
-
645
-
646
-
647
-
648
-
649
-
650
-
651
-
652
- class GPAC(AggregativeQuantifier):
653
- """
654
- Generalized Probabilistic Adjusted Count (GPAC).
655
-
656
- GPAC is an extension of the Generalized Adjusted Count (GAC) method. It constructs a system of
657
- linear equations using the confidence scores from probabilistic classifiers, similar to the PAC method.
658
- The system is solved to estimate the prevalence of classes in a test dataset.
659
-
660
- Parameters
661
- ----------
662
- learner : BaseEstimator
663
- A scikit-learn-compatible model used for classification.
664
- train_size : float, optional
665
- Proportion of the dataset to include in the training split, by default 0.6.
666
- random_state : int, optional
667
- Random seed for reproducibility of data splits, by default None.
668
-
669
- Attributes
670
- ----------
671
- learner : BaseEstimator
672
- A scikit-learn-compatible model used for classification.
673
- cond_prob_matrix : np.ndarray
674
- Conditional probability matrix representing P(yi|yj).
675
- train_size : float, optional
676
- Proportion of the dataset to include in the training split, by default 0.6.
677
- random_state : int, optional
678
- Random seed for reproducibility of data splits, by default None.
679
-
680
- References
681
- ----------
682
- Firat, Aykut. Unified framework for quantification. arXiv preprint arXiv:1606.00868, 2016.
683
- Available at: https://arxiv.org/abs/1606.00868
684
-
685
- Examples
686
- --------
687
- >>> from mlquantify.utils.general import get_real_prev
688
- >>> from mlquantify.methods.aggregative import GPAC
689
- >>> from sklearn.ensemble import RandomForestClassifier
690
- >>> from sklearn.datasets import load_wine
691
- >>> from sklearn.model_selection import train_test_split
692
- >>>
693
- >>> features, target = load_wine(return_X_y=True)
694
- >>>
695
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
696
- >>>
697
- >>> gpac = GPAC(RandomForestClassifier())
698
- >>> gpac.fit(X_train, y_train)
699
- >>> y_pred = gpac.predict(X_test)
700
- >>> y_pred
701
- {0: 0.41435185185185186, 1: 0.3078703703703704, 2: 0.2777777777777778}
702
- >>> get_real_prev(y_test)
703
- {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
704
- """
705
-
706
- def __init__(self, learner: BaseEstimator=None, train_size: float = 0.6, random_state: int = None):
707
- self.learner = learner
708
- self.cond_prob_matrix = None
709
- self.train_size = train_size
710
- self.random_state = random_state
711
-
712
- def _fit_method(self, X, y):
713
- """
714
- Trains the model and computes the conditional probability matrix using validation data.
715
-
716
- Parameters
717
- ----------
718
- X : pd.DataFrame or np.ndarray
719
- Features of the dataset.
720
- y : pd.Series or np.ndarray
721
- Labels of the dataset.
722
-
723
- Returns
724
- -------
725
- self : GPAC
726
- Fitted quantifier object.
727
- """
728
- if isinstance(X, np.ndarray):
729
- X = pd.DataFrame(X)
730
- if isinstance(y, np.ndarray):
731
- y = pd.Series(y)
732
-
733
- if self.learner_fitted or self.learner is None:
734
- y_pred = mq.arguments["y_pred_train"] if mq.arguments["y_pred_train"] is not None else self.predict_learner(X)
735
- y_labels = y
736
- else:
737
- X_train, X_val, y_train, y_val = train_test_split(
738
- X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
739
- )
740
- self.fit_learner(X_train, y_train)
741
- y_labels = y_val
742
- y_pred = self.predict_learner(X_val)
743
-
744
- # Compute the conditional probability matrix
745
- self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_labels, y_pred)
746
- return self
747
-
748
- def _predict_method(self, X) -> dict:
749
- """
750
- Predicts class prevalences in the test set and adjusts them using the conditional probability matrix.
751
-
752
- Parameters
753
- ----------
754
- X : pd.DataFrame or np.ndarray
755
- Features of the test dataset.
756
-
757
- Returns
758
- -------
759
- dict
760
- Adjusted class prevalences.
761
- """
762
- predictions = self.predict_learner(X)
763
-
764
- # Compute the distribution of predictions
765
- predicted_prevalences = np.zeros(self.n_class)
766
- _, counts = np.unique(predictions, return_counts=True)
767
- predicted_prevalences[:len(counts)] = counts
768
- predicted_prevalences /= predicted_prevalences.sum()
769
-
770
- # Adjust prevalences using the conditional probability matrix
771
- adjusted_prevalences = GAC.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)
772
- return adjusted_prevalences
773
-
774
- @classmethod
775
- def get_cond_prob_matrix(cls, classes: list, y_labels: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
776
- """
777
- Computes the conditional probability matrix P(yi|yj).
778
-
779
- Parameters
780
- ----------
781
- classes : list
782
- List of class labels.
783
- y_labels : np.ndarray
784
- True labels from the validation set.
785
- y_pred : np.ndarray
786
- Predicted probabilities or labels from the classifier.
787
-
788
- Returns
789
- -------
790
- np.ndarray
791
- Conditional probability matrix with entry (i, j) representing P(yi|yj).
792
- """
793
- n_classes = len(classes)
794
- cond_prob_matrix = np.eye(n_classes)
795
-
796
- for i, class_ in enumerate(classes):
797
- class_indices = y_labels == class_
798
- if class_indices.any():
799
- cond_prob_matrix[i] = y_pred[class_indices].mean(axis=0)
800
-
801
- return cond_prob_matrix.T
802
-
803
-
804
-
805
-
806
-
807
-
808
-
809
-
810
-
811
-
812
- class PCC(AggregativeQuantifier):
813
- """
814
- Probabilistic Classify and Count (PCC).
815
-
816
- PCC is a quantification method that uses a probabilistic classifier to estimate
817
- class prevalences in a test dataset. It computes the mean of the predicted
818
- probabilities for each class to determine their prevalences.
819
-
820
- Parameters
821
- ----------
822
- learner : BaseEstimator
823
- A scikit-learn-compatible probabilistic classifier.
824
-
825
- Attributes
826
- ----------
827
- learner : BaseEstimator
828
- A scikit-learn-compatible probabilistic classifier.
829
-
830
- References
831
- ----------
832
- BELLA, Antonio et al. Quantification via probability estimators. In: 2010 IEEE International Conference on Data Mining. IEEE, 2010. p. 737-742. Available at: https://ieeexplore.ieee.org/abstract/document/5694031
833
-
834
- Examples
835
- --------
836
- >>> from mlquantify.utils.general import get_real_prev
837
- >>> from mlquantify.methods.aggregative import PCC
838
- >>> from sklearn.ensemble import RandomForestClassifier
839
- >>> from sklearn.datasets import load_wine
840
- >>> from sklearn.model_selection import train_test_split
841
- >>>
842
- >>> features, target = load_wine(return_X_y=True)
843
- >>>
844
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
845
- >>>
846
- >>> pcc = PCC(RandomForestClassifier())
847
- >>> pcc.fit(X_train, y_train)
848
- >>> y_pred = pcc.predict(X_test)
849
- >>> y_pred
850
- {0: 0.4036111111111111, 1: 0.3427777777777778, 2: 0.2536111111111111}
851
- >>> get_real_prev(y_test)
852
- {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
853
- """
854
- @property
855
- def is_probabilistic(self) -> bool:
856
- return True
857
-
858
- def __init__(self, learner: BaseEstimator=None):
859
- self.learner = learner
860
-
861
- def _fit_method(self, X, y):
862
- """
863
- Fits the learner to the training data.
864
-
865
- Parameters
866
- ----------
867
- X : pd.DataFrame or np.ndarray
868
- Features of the training dataset.
869
- y : pd.Series or np.ndarray
870
- Labels of the training dataset.
871
-
872
- Returns
873
- -------
874
- self : PCC
875
- Fitted quantifier object.
876
- """
877
- self.fit_learner(X, y)
878
- return self
879
-
880
- def _predict_method(self, X) -> np.ndarray:
881
- """
882
- Predicts class prevalences in the test dataset by averaging the predicted probabilities.
883
-
884
- Parameters
885
- ----------
886
- X : pd.DataFrame or np.ndarray
887
- Features of the test dataset.
888
-
889
- Returns
890
- -------
891
- np.ndarray
892
- Estimated prevalences for each class.
893
- """
894
- # Initialize a list to store the prevalence for each class
895
- prevalences = []
896
-
897
- # Calculate the prevalence for each class
898
- for class_index in range(self.n_class):
899
- # Get the predicted probabilities for the current class
900
- class_probabilities = self.predict_learner(X)[:, class_index]
901
-
902
- # Compute the average probability (prevalence) for the current class
903
- mean_prev = np.mean(class_probabilities)
904
- prevalences.append(mean_prev)
905
-
906
- return np.asarray(prevalences)
907
-
908
-
909
-
910
- class PACC(AggregativeQuantifier):
911
- """
912
- Probabilistic Adjusted Classify and Count (PACC).
913
- This method extends the Adjusted Classify and Count (AC) approach
914
- by leveraging the average class-conditional confidences obtained
915
- from a probabilistic classifier instead of relying solely on true
916
- positive and false positive rates.
917
-
918
- Parameters
919
- ----------
920
- learner : BaseEstimator
921
- A scikit-learn compatible classifier to be used for quantification.
922
- threshold : float, optional
923
- The decision threshold for classification. Default is 0.5.
924
-
925
- Attributes
926
- ----------
927
- learner : BaseEstimator
928
- A scikit-learn compatible classifier.
929
- threshold : float
930
- Decision threshold for classification. Default is 0.5.
931
- mean_pos : float
932
- Mean positive-class probability over validation instances of the positive class, computed during fitting.
933
- mean_neg : float
934
- Mean positive-class probability over validation instances of the negative class, computed during fitting.
935
-
936
- See Also
937
- --------
938
- ThresholdOptimization : Base class for threshold-based quantification methods.
939
- ACC : Adjusted Classify and Count quantification method.
940
- CC : Classify and Count quantification method.
941
-
942
- References
943
- ----------
944
- A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031
945
-
946
- Examples
947
- --------
948
- >>> from mlquantify.methods.aggregative import PACC
949
- >>> from mlquantify.utils.general import get_real_prev
950
- >>> from sklearn.datasets import load_breast_cancer
951
- >>> from sklearn.svm import SVC
952
- >>> from sklearn.model_selection import train_test_split
953
- >>>
954
- >>> features, target = load_breast_cancer(return_X_y=True)
955
- >>>
956
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
957
- >>>
958
- >>> pacc = PACC(learner=SVC(probability=True))
959
- >>> pacc.fit(X_train, y_train)
960
- >>> y_pred = pacc.predict(X_test)
961
- >>> y_pred
962
- {0: 0.4664886119311328, 1: 0.5335113880688672}
963
- >>> get_real_prev(y_test)
964
- {0: 0.3991228070175439, 1: 0.6008771929824561}
965
- """
966
-
967
- def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
968
- self.learner = learner
969
- self.threshold = threshold
970
- self.mean_pos = None
971
- self.mean_neg = None
972
-
973
- @property
974
- def is_probabilistic(self) -> bool:
975
- return True
976
-
977
- @property
978
- def is_multiclass(self) -> bool:
979
- return False
980
-
981
- def _fit_method(self, X, y):
982
- # Get predicted labels and probabilities
983
- if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
984
- y_labels = mq.arguments["y_labels"]
985
- probabilities = mq.arguments["posteriors_train"]
986
- else:
987
- y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
988
-
989
- # Adjust thresholds and compute true and false positive rates
990
-
991
- self.mean_pos = np.mean(probabilities[y_labels == self.classes[1], 1])
992
- self.mean_neg = np.mean(probabilities[y_labels != self.classes[1], 1])
993
-
994
- return self
995
-
996
-
997
- def _predict_method(self, X):
998
- """
999
- Predicts the class prevalence using the mean class-conditional
1000
- probabilities from a probabilistic classifier.
1001
-
1002
- Parameters
1003
- ----------
1004
- X : array-like or sparse matrix of shape (n_samples, n_features)
1005
- The input data for prediction.
1006
-
1007
- Returns
1008
- -------
1009
- dict
1010
- A dictionary with class labels as keys and their respective
1011
- prevalence estimates as values.
1012
-
1013
- Notes
1014
- -----
1015
- The prevalence is adjusted using the formula:
1016
- prevalence = |mean_score - mean_neg| / (mean_pos - mean_neg),
1017
- where mean_score is the average positive-class probability over the test set.
1018
-
1019
- If `mean_pos - mean_neg` equals zero (the classifier's mean
1020
- positive-class probability does not differ between the two classes),
1021
- the unadjusted mean score is returned as the prevalence estimate.
1024
- """
1025
- prevalences = {}
1026
-
1027
- # Calculate probabilities for the positive class
1028
- probabilities = self.predict_learner(X)[:, 1]
1029
-
1030
- # Compute the mean score for the positive class
1031
- mean_scores = np.mean(probabilities)
1032
-
1033
- # Adjust prevalence based on TPR and FPR
1034
- if self.mean_pos - self.mean_neg == 0:
1035
- prevalence = mean_scores
1036
- else:
1037
- prevalence = np.clip(abs(mean_scores - self.mean_neg) / (self.mean_pos - self.mean_neg), 0, 1)
1038
-
1039
- # Map the computed prevalence to the class labels
1040
- prevalences[self.classes[0]] = 1 - prevalence
1041
- prevalences[self.classes[1]] = prevalence
1042
-
1043
- return prevalences
1044
-
1045
-
1046
- class PWK(AggregativeQuantifier):
1047
- """
1048
- Nearest-Neighbor Based Quantification (PWK).
1049
-
1050
- PWK extends nearest-neighbor classification to the quantification setting.
1051
- This k-NN approach uses a weighting scheme that reduces the influence of
1052
- neighbors from the majority class to better estimate class prevalences.
1053
-
1054
- Attributes
1055
- ----------
1056
- learner : BaseEstimator
1057
- A scikit-learn-compatible classifier that implements a k-NN approach.
1058
-
1059
- Notes
1060
- -----
1061
- For optimal functionality, use the `PWKCLF` classifier, a k-NN classifier that implements the class-weighting scheme described above.
1062
-
1063
- References
1064
- ----------
1065
- BARRANQUERO, Jose et al. On the study of nearest neighbor algorithms for prevalence estimation in binary problems. Pattern Recognition, v. 46, n. 2, p. 472-482, 2013. Available at: https://www.sciencedirect.com/science/article/pii/S0031320312003391
1066
-
1067
- Examples
1068
- --------
1069
- >>> from mlquantify.utils.general import get_real_prev
1070
- >>> from mlquantify.methods.aggregative import PWK
1071
- >>> from sklearn.ensemble import RandomForestClassifier
1072
- >>> from sklearn.datasets import load_wine
1073
- >>> from sklearn.model_selection import train_test_split
1074
- >>>
1075
- >>> features, target = load_wine(return_X_y=True)
1076
- >>>
1077
- >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
1078
- >>>
1079
- >>> pwk = PWK(RandomForestClassifier())
1080
- >>> pwk.fit(X_train, y_train)
1081
- >>> y_pred = pwk.predict(X_test)
1082
- >>> y_pred
1083
- {0: 0.4305555555555556, 1: 0.2916666666666667, 2: 0.2777777777777778}
1084
- >>> get_real_prev(y_test)
1085
- {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
1086
- """
1087
-
1088
- def __init__(self, learner: BaseEstimator=None):
1089
- self.learner = learner
1090
-
1091
- def _fit_method(self, X, y):
1092
- """
1093
- Fits the k-NN learner to the training data.
1094
-
1095
- Parameters
1096
- ----------
1097
- X : pd.DataFrame or np.ndarray
1098
- Features of the training dataset.
1099
- y : pd.Series or np.ndarray
1100
- Labels of the training dataset.
1101
-
1102
- Returns
1103
- -------
1104
- self : PWK
1105
- Fitted quantifier object.
1106
- """
1107
- self.fit_learner(X, y)
1108
- return self
1109
-
1110
- def _predict_method(self, X) -> dict:
1111
- """
1112
- Predicts class prevalences in the test dataset by analyzing the distribution of predicted labels.
1113
-
1114
- Parameters
1115
- ----------
1116
- X : pd.DataFrame or np.ndarray
1117
- Features of the test dataset.
1118
-
1119
- Returns
1120
- -------
1121
- dict
1122
- A dictionary mapping each class label to its estimated prevalence.
1123
- """
1124
- # Predict class labels for the given data
1125
- predicted_labels = self.predict_learner(X)
1126
-
1127
- # Compute the distribution of predicted labels
1128
- unique_labels, label_counts = np.unique(predicted_labels, return_counts=True)
1129
-
1130
- # Calculate the prevalence for each class
1131
- class_prevalences = label_counts / label_counts.sum()
1132
-
1133
- # Map each class label to its prevalence
1134
- prevalences = {label: prevalence for label, prevalence in zip(unique_labels, class_prevalences)}
1135
-
1136
- return prevalences
1137
-
1138
-
1139
-
1140
-
1141
-
1142
- from . import threshold_optimization
1143
-
1144
- ACC = threshold_optimization.ACC
1145
- T50 = threshold_optimization.T50
1146
- MAX = threshold_optimization.MAX
1147
- X_method = threshold_optimization.X_method
1148
- MS = threshold_optimization.MS
1149
- MS2 = threshold_optimization.MS2
1150
-
1151
-
1152
-
1153
- from . import mixture_models
1154
-
1155
- DySsyn = mixture_models.DySsyn
1156
- DyS = mixture_models.DyS
1157
- HDy = mixture_models.HDy
1158
- SMM = mixture_models.SMM
1159
- SORD = mixture_models.SORD
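
For reference, the expectation-maximisation adjustment removed with the EMQ class (Saerens, Latinne & Decaestecker, 2002) can be restated independently of mlquantify. The following is a minimal NumPy sketch of the loop implemented by the removed EMQ.EM classmethod; the name em_adjust and its signature are illustrative, not part of the package API, and the sketch assumes all training priors are non-zero.

import numpy as np

def em_adjust(train_priors, test_posteriors, epsilon=1e-6, max_iter=1000):
    # priors estimated on the training set; posteriors predicted on the test set
    priors = np.asarray(train_priors, dtype=float)
    P = np.asarray(test_posteriors, dtype=float)      # shape (n_samples, n_classes)
    estimate = priors.copy()
    for _ in range(max_iter):
        # E-step: re-weight each posterior by the ratio of the current
        # prevalence estimate to the training priors, then renormalise per row
        weighted = (estimate / priors) * P
        posteriors = weighted / weighted.sum(axis=1, keepdims=True)
        # M-step: the new prevalence estimate is the mean posterior per class
        new_estimate = posteriors.mean(axis=0)
        if np.mean(np.abs(new_estimate - estimate)) < epsilon:
            estimate = new_estimate
            break
        estimate = new_estimate
    return estimate, posteriors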
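
Similarly, the adjustment performed by the removed GAC class amounts to estimating a conditional-probability matrix from held-out predictions and solving a small linear system. A minimal sketch under the same assumptions, using scikit-learn's confusion_matrix and the same clip-and-renormalise fallback as the removed code; gac_adjust is an illustrative name only.

import numpy as np
from sklearn.metrics import confusion_matrix

def gac_adjust(y_val, y_val_pred, y_test_pred, classes):
    # Column j of CM approximates P(prediction = i | true class = j),
    # estimated from held-out validation labels and predictions
    CM = confusion_matrix(y_val, y_val_pred, labels=classes).T.astype(float)
    counts = CM.sum(axis=0)
    for j in range(len(classes)):
        if counts[j] > 0:
            CM[:, j] /= counts[j]
        else:
            CM[j, j] = 1.0                       # identity fallback for classes unseen in validation
    # Classify-and-count estimate on the test predictions
    p_hat = np.array([(np.asarray(y_test_pred) == c).mean() for c in classes])
    # Solve CM @ p = p_hat, then clip to [0, 1] and renormalise
    try:
        p = np.linalg.solve(CM, p_hat)
    except np.linalg.LinAlgError:
        p = p_hat                                # keep the unadjusted estimate if CM is singular
    p = np.clip(p, 0.0, 1.0)
    total = p.sum()
    return p / total if total > 0 else p_hat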
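
Finally, the binary adjustment in the removed PACC class replaces the true/false positive rates of ACC with the mean positive-class probabilities observed on each class of a validation split. A minimal sketch of that formula; pacc_adjust is an illustrative name only.

import numpy as np

def pacc_adjust(val_scores_pos, val_scores_neg, test_scores):
    # val_scores_pos / val_scores_neg: positive-class probabilities on
    # validation instances whose true label is positive / negative
    mean_pos = float(np.mean(val_scores_pos))
    mean_neg = float(np.mean(val_scores_neg))
    mean_test = float(np.mean(test_scores))      # average positive-class probability on the test set
    if mean_pos == mean_neg:
        return mean_test                         # degenerate classifier: return the unadjusted mean
    return float(np.clip((mean_test - mean_neg) / (mean_pos - mean_neg), 0.0, 1.0))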