mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. mlquantify/__init__.py +10 -29
  2. mlquantify/adjust_counting/__init__.py +24 -0
  3. mlquantify/adjust_counting/_adjustment.py +648 -0
  4. mlquantify/adjust_counting/_base.py +245 -0
  5. mlquantify/adjust_counting/_counting.py +153 -0
  6. mlquantify/adjust_counting/_utils.py +109 -0
  7. mlquantify/base.py +117 -519
  8. mlquantify/base_aggregative.py +209 -0
  9. mlquantify/calibration.py +1 -0
  10. mlquantify/confidence.py +329 -0
  11. mlquantify/likelihood/__init__.py +5 -0
  12. mlquantify/likelihood/_base.py +147 -0
  13. mlquantify/likelihood/_classes.py +430 -0
  14. mlquantify/meta/__init__.py +1 -0
  15. mlquantify/meta/_classes.py +785 -0
  16. mlquantify/metrics/__init__.py +21 -0
  17. mlquantify/metrics/_oq.py +109 -0
  18. mlquantify/metrics/_rq.py +98 -0
  19. mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
  20. mlquantify/mixture/__init__.py +7 -0
  21. mlquantify/mixture/_base.py +147 -0
  22. mlquantify/mixture/_classes.py +458 -0
  23. mlquantify/mixture/_utils.py +163 -0
  24. mlquantify/model_selection/__init__.py +9 -0
  25. mlquantify/model_selection/_protocol.py +358 -0
  26. mlquantify/model_selection/_search.py +315 -0
  27. mlquantify/model_selection/_split.py +1 -0
  28. mlquantify/multiclass.py +350 -0
  29. mlquantify/neighbors/__init__.py +9 -0
  30. mlquantify/neighbors/_base.py +168 -0
  31. mlquantify/neighbors/_classes.py +150 -0
  32. mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
  33. mlquantify/neighbors/_kde.py +268 -0
  34. mlquantify/neighbors/_utils.py +131 -0
  35. mlquantify/neural/__init__.py +1 -0
  36. mlquantify/utils/__init__.py +47 -2
  37. mlquantify/utils/_artificial.py +27 -0
  38. mlquantify/utils/_constraints.py +219 -0
  39. mlquantify/utils/_context.py +21 -0
  40. mlquantify/utils/_decorators.py +36 -0
  41. mlquantify/utils/_exceptions.py +12 -0
  42. mlquantify/utils/_get_scores.py +159 -0
  43. mlquantify/utils/_load.py +18 -0
  44. mlquantify/utils/_parallel.py +6 -0
  45. mlquantify/utils/_random.py +36 -0
  46. mlquantify/utils/_sampling.py +273 -0
  47. mlquantify/utils/_tags.py +44 -0
  48. mlquantify/utils/_validation.py +447 -0
  49. mlquantify/utils/prevalence.py +64 -0
  50. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
  51. mlquantify-0.1.10.dist-info/RECORD +53 -0
  52. mlquantify/classification/__init__.py +0 -1
  53. mlquantify/evaluation/__init__.py +0 -14
  54. mlquantify/evaluation/protocol.py +0 -289
  55. mlquantify/methods/__init__.py +0 -37
  56. mlquantify/methods/aggregative.py +0 -1159
  57. mlquantify/methods/meta.py +0 -472
  58. mlquantify/methods/mixture_models.py +0 -1003
  59. mlquantify/methods/non_aggregative.py +0 -136
  60. mlquantify/methods/threshold_optimization.py +0 -869
  61. mlquantify/model_selection.py +0 -377
  62. mlquantify/plots.py +0 -367
  63. mlquantify/utils/general.py +0 -371
  64. mlquantify/utils/method.py +0 -449
  65. mlquantify-0.1.8.dist-info/RECORD +0 -22
  66. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
  67. {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
@@ -1,869 +0,0 @@
1
- from abc import abstractmethod
2
- import numpy as np
3
- import warnings
4
- from sklearn.base import BaseEstimator
5
-
6
- from ..base import AggregativeQuantifier
7
- from ..utils.method import adjust_threshold, get_scores
8
- import mlquantify as mq
9
-
10
-
11
-
12
-
13
class ThresholdOptimization(AggregativeQuantifier):
    """
    Generic class for methods that adjust the decision boundary of the underlying classifier
    to make the ACC (base method for threshold methods) estimation more numerically stable.
    Most strategies involve altering the denominator of the ACC equation.

    This class serves as a base for implementing threshold optimization techniques in classification
    tasks. It is designed to adjust thresholds based on true positive and false positive rates,
    ensuring better quantification performance.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for threshold optimization.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float, optional
        The threshold selected by ``best_tprfpr`` during fitting. ``None`` until the
        quantifier is fitted, unless a subclass presets it in its own constructor.
    cc_output : float, optional
        The classification count output, representing the proportion of instances classified
        as positive based on the threshold. Set by ``_predict_method``.
    tpr : float, optional
        The true positive rate corresponding to the best threshold.
    fpr : float, optional
        The false positive rate corresponding to the best threshold.

    Notes
    -----
    All methods that inherit from this class will be binary quantifiers. In case of multiclass problems, it will be made One vs All.

    Examples
    --------
    >>> from mlquantify.methods.threshold_optimization import ThresholdOptimization
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> class MyThrMethod(ThresholdOptimization):
    ...     def __init__(self, learner, threshold=0.5):
    ...         super().__init__(learner)
    ...         self.threshold = threshold
    ...     def best_tprfpr(self, thresholds, tpr, fpr):
    ...         return thresholds[20], tpr[20], fpr[20]
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> mtm = MyThrMethod(learner=SVC(probability=True), threshold=0.5)
    >>> mtm.fit(X_train, y_train)
    >>> y_pred = mtm.predict(X_test)
    """

    def __init__(self, learner: BaseEstimator = None):
        self.learner = learner
        # Filled in by _fit_method (threshold, tpr, fpr) and _predict_method (cc_output).
        self.threshold = None
        self.cc_output = None
        self.tpr = None
        self.fpr = None

    @property
    def is_probabilistic(self) -> bool:
        """
        Returns whether the method is probabilistic.

        This method is used to determine whether the quantification method is probabilistic,
        meaning it uses class-conditional probabilities to estimate class prevalences.

        Returns
        -------
        bool
            True, indicating that this method is probabilistic.
        """
        return True

    @property
    def is_multiclass(self) -> bool:
        """
        Returns whether the method is applicable to multiclass quantification.

        Threshold-based methods are typically binary classifiers, so this method
        returns False.

        Returns
        -------
        bool
            False, indicating that this method does not support multiclass quantification.
        """
        return False

    def _fit_method(self, X, y):
        """
        Fits the classifier and adjusts thresholds based on true positive rate (TPR) and false positive rate (FPR).

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input features for training.
        y : pd.Series or np.ndarray
            The target labels for training.

        Returns
        -------
        self : ThresholdOptimization
            The fitted quantifier object with the best threshold, TPR, and FPR.
        """
        # Reuse labels/posteriors supplied through the package-level arguments when
        # both are present; otherwise obtain them from the learner via get_scores.
        if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
            y_labels = mq.arguments["y_labels"]
            probabilities = mq.arguments["posteriors_train"]
        else:
            y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)

        # Adjust thresholds and compute true and false positive rates
        # (column 1 holds the positive-class scores).
        thresholds, tprs, fprs = adjust_threshold(y_labels, probabilities[:, 1], self.classes)

        # Let the concrete strategy pick the threshold/TPR/FPR to use at prediction time.
        self.threshold, self.tpr, self.fpr = self.best_tprfpr(thresholds, tprs, fprs)

        return self

    def _predict_method(self, X) -> np.ndarray:
        """
        Predicts class prevalences using the adjusted threshold.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input features for prediction.

        Returns
        -------
        np.ndarray
            An array ``[1 - prevalence, prevalence]`` with the predicted prevalences
            of the negative and positive class.
        """
        # Get predicted probabilities for the positive class
        probabilities = self.predict_learner(X)[:, 1]

        # Classify-and-count output: fraction of instances scored at or above the threshold.
        self.cc_output = np.count_nonzero(probabilities >= self.threshold) / len(probabilities)

        if self.tpr - self.fpr == 0:
            # The adjustment is undefined when TPR equals FPR; fall back to the raw count.
            prevalence = self.cc_output
        else:
            # Equation of threshold methods to compute prevalence, clipped to [0, 1].
            prevalence = np.clip((self.cc_output - self.fpr) / (self.tpr - self.fpr), 0, 1)

        prevalences = [1 - prevalence, prevalence]

        return np.asarray(prevalences)

    @abstractmethod
    def best_tprfpr(self, thresholds: np.ndarray, tpr: np.ndarray, fpr: np.ndarray) -> tuple:
        """
        Abstract method for determining the best TPR (True Positive Rate) and FPR (False Positive Rate)
        to use in the equation for threshold optimization.

        This method needs to be implemented by subclasses to define how the best threshold
        is chosen based on TPR and FPR.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tpr : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fpr : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A ``(threshold, tpr, fpr)`` triple; ``_fit_method`` stores its three items
            in ``self.threshold``, ``self.tpr`` and ``self.fpr``.
        """
        ...
195
-
196
-
197
-
198
-
199
-
200
-
201
class ACC(ThresholdOptimization):
    """
    Adjusted Classify and Count (ACC). This method is a base approach for threshold-based
    quantification methods.

    As described in the ThresholdOptimization base class, this method estimates the true
    positive rate (TPR) and false positive rate (FPR) from the training data. It then uses
    these values to adjust the output of the Classify and Count (CC) method, making the
    quantification process more accurate and stable.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.
    threshold : float, optional
        The decision threshold for classifying instances. Default is 0.5.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        The decision threshold used to classify instances as positive or negative. Default is 0.5.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import ACC
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> acc = ACC(learner=SVC(probability=True), threshold=0.5)
    >>> acc.fit(X_train, y_train)
    >>> y_pred = acc.predict(X_test)
    >>> y_pred
    {0: 0.3968506555196656, 1: 0.6031493444803344}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None, threshold: float = 0.5):
        super().__init__(learner)
        # The base constructor resets threshold to None; restore the caller's value.
        self.threshold = threshold

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Determines the true positive rate (TPR) and false positive rate (FPR) for the specified threshold.

        This method identifies the TPR and FPR corresponding to the threshold provided
        during initialization. It assumes that the `thresholds`, `tprs`, and `fprs` arrays
        are aligned, meaning the `i-th` element of each array corresponds to the same threshold.

        The threshold is matched with ``np.isclose`` rather than ``==`` so that a grid
        value that differs from ``self.threshold`` only by floating-point rounding is
        still found. The first matching entry is used.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing the threshold, the true positive rate (TPR), and the false
            positive rate (FPR) for the specified threshold.

        Raises
        ------
        IndexError
            If the specified threshold is not found in the `thresholds` array.
        """
        # Locate the threshold once and reuse the index for both rate arrays.
        matches = np.flatnonzero(np.isclose(thresholds, self.threshold))
        if matches.size == 0:
            raise IndexError(
                f"Threshold {self.threshold!r} was not found in the thresholds array."
            )

        index = matches[0]
        return (self.threshold, tprs[index], fprs[index])
291
-
292
-
293
-
294
-
295
-
296
-
297
-
298
-
299
-
300
class MAX(ThresholdOptimization):
    """
    Threshold MAX quantifier.

    Selects the threshold at which the absolute difference between the true
    positive rate (TPR) and the false positive rate (FPR) is largest. The TPR and
    FPR at that threshold are then used in the denominator of the equation for
    adjusted prevalence estimation.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Counting positives accurately despite inaccurate classification. In: European conference on machine learning. Berlin, Heidelberg: Springer Berlin Heidelberg, 2005. p. 564-575. Available at: https://link.springer.com/chapter/10.1007/11564096_56

    Examples
    --------
    >>> from mlquantify.methods.aggregative import MAX
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> maxq = MAX(learner=SVC(probability=True))
    >>> maxq.fit(X_train, y_train)
    >>> y_pred = maxq.predict(X_test)
    >>> y_pred
    {0: 0.3920664352842359, 1: 0.6079335647157641}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Pick the threshold where `|TPR - FPR|` is largest.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing:
            - The threshold that maximizes `|TPR - FPR|`.
            - The true positive rate (TPR) at the selected threshold.
            - The false positive rate (FPR) at the selected threshold.

        Raises
        ------
        ValueError
            If `thresholds`, `tprs`, or `fprs` are empty or have mismatched lengths.
        """
        # Separation between the two rates at every candidate threshold.
        separation = np.abs(tprs - fprs)
        best = np.argmax(separation)

        return (thresholds[best], tprs[best], fprs[best])
388
-
389
-
390
-
391
-
392
-
393
-
394
-
395
-
396
-
397
class MS(ThresholdOptimization):
    """
    Median Sweep (MS). This quantification method uses an ensemble
    of threshold-based methods: an adjusted prevalence is computed for
    every candidate threshold from its true positive rate (TPR) and
    false positive rate (FPR), and the median of those estimates is
    reported.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : np.ndarray
        After fitting, all candidate thresholds (see ``best_tprfpr``).
    tpr : np.ndarray
        After fitting, the true positive rate at each candidate threshold.
    fpr : np.ndarray
        After fitting, the false positive rate at each candidate threshold.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    MAX : Threshold MAX quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import MS
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> ms = MS(learner=SVC(probability=True))
    >>> ms.fit(X_train, y_train)
    >>> y_pred = ms.predict(X_test)
    >>> y_pred
    {0: 0.41287676595138967, 1: 0.5871232340486103}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Keeps every candidate threshold together with its TPR and FPR.

        No selection happens here: the three arrays are returned unchanged so that
        ``_predict_method`` can compute one prevalence estimate per threshold and
        take the median of those estimates.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing:
            - All threshold values (np.ndarray).
            - The true positive rate at each threshold (np.ndarray).
            - The false positive rate at each threshold (np.ndarray).
        """
        return (thresholds, tprs, fprs)

    def _predict_method(self, X) -> np.ndarray:
        """
        Predicts class prevalences as the median of the per-threshold estimates.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input features for prediction.

        Returns
        -------
        np.ndarray
            An array ``[1 - prevalence, prevalence]`` with the predicted prevalences
            of the negative and positive class.
        """
        # Get predicted probabilities for the positive class
        probabilities = self.predict_learner(X)[:, 1]

        prevs = []

        for thr, tpr, fpr in zip(self.threshold, self.tpr, self.fpr):
            # Fraction of instances scored at or above this threshold.
            cc_output = np.count_nonzero(probabilities >= thr) / len(probabilities)

            if tpr - fpr == 0:
                # The adjustment is undefined when TPR equals FPR. Use the raw count
                # as this threshold's estimate (as the base class does) instead of
                # dropping it, so the median is never taken over an empty list.
                prev = cc_output
            else:
                prev = np.clip((cc_output - fpr) / (tpr - fpr), 0, 1)
            prevs.append(prev)

        prevalence = np.median(prevs)

        prevalences = [1 - prevalence, prevalence]

        return np.asarray(prevalences)
520
-
521
-
522
-
523
-
524
-
525
-
526
-
527
-
528
class MS2(ThresholdOptimization):
    """
    Median Sweep 2 (MS2). This method is an extension of the
    Median Sweep strategy, but it focuses only on cases where
    the absolute difference between the true positive rate (TPR)
    and the false positive rate (FPR) exceeds a threshold (0.25).
    An adjusted prevalence is computed for each of these selected
    thresholds and the median of those estimates is reported.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : np.ndarray
        After fitting, the selected thresholds (see ``best_tprfpr``).
    tpr : np.ndarray
        After fitting, the true positive rate at each selected threshold.
    fpr : np.ndarray
        After fitting, the false positive rate at each selected threshold.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    MS : Median Sweep quantification method.
    CC : Classify and Count quantification method.

    Examples
    --------
    >>> from mlquantify.methods.aggregative import MS2
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> ms2 = MS2(learner=SVC(probability=True))
    >>> ms2.fit(X_train, y_train)
    >>> y_pred = ms2.predict(X_test)
    >>> y_pred
    {0: 0.41287676595138967, 1: 0.5871232340486103}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Selects the thresholds where the absolute difference between TPR and FPR
        is greater than 0.25, together with their TPR and FPR values.

        If no threshold meets the condition, a warning is issued and every
        threshold with a non-negative `|TPR - FPR|` is kept instead.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing:
            - The selected threshold values (np.ndarray).
            - The true positive rates at the selected thresholds (np.ndarray).
            - The false positive rates at the selected thresholds (np.ndarray).

        Warns
        -----
        UserWarning
            If all TPR or all FPR values are zero, or if no cases satisfy the
            condition `|TPR - FPR| > 0.25`.
        """
        # Check if all TPR or FPR values are zero
        if np.all(tprs == 0) or np.all(fprs == 0):
            warnings.warn("All TPR or FPR values are zero.")

        # Compute the separation once; it drives both the filter and the fallback.
        separation = np.abs(tprs - fprs)

        # Identify indices where the condition is satisfied
        indices = np.where(separation > 0.25)[0]
        if len(indices) == 0:
            warnings.warn("No cases satisfy the condition |TPR - FPR| > 0.25.")
            # Fallback: keep every threshold whose separation is a valid number.
            indices = np.where(separation >= 0)[0]

        thresholds_ = thresholds[indices]
        tprs_ = tprs[indices]
        fprs_ = fprs[indices]

        return (thresholds_, tprs_, fprs_)

    def _predict_method(self, X) -> np.ndarray:
        """
        Predicts class prevalences as the median of the per-threshold estimates.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            The input features for prediction.

        Returns
        -------
        np.ndarray
            An array ``[1 - prevalence, prevalence]`` with the predicted prevalences
            of the negative and positive class.
        """
        # Get predicted probabilities for the positive class
        probabilities = self.predict_learner(X)[:, 1]

        prevs = []

        for thr, tpr, fpr in zip(self.threshold, self.tpr, self.fpr):
            # Fraction of instances scored at or above this threshold.
            cc_output = np.count_nonzero(probabilities >= thr) / len(probabilities)

            if tpr - fpr == 0:
                # Only reachable through the fallback selection in best_tprfpr.
                # Use the raw count as this threshold's estimate (as the base class
                # does) instead of dropping it, so the median is never taken over
                # an empty list.
                prev = cc_output
            else:
                prev = np.clip((cc_output - fpr) / (tpr - fpr), 0, 1)
            prevs.append(prev)

        prevalence = np.median(prevs)

        prevalences = [1 - prevalence, prevalence]

        return np.asarray(prevalences)
661
-
662
-
663
-
664
class T50(ThresholdOptimization):
    """
    Threshold 50 (T50) quantifier.

    Moves the decision threshold to the point where the true positive rate (TPR)
    is approximately equal to 0.5. This approach is particularly useful for
    balancing sensitivity and specificity in binary classification tasks.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        Decision threshold determined during training.
    tpr : float
        True positive rate corresponding to the selected threshold.
    fpr : float
        False positive rate corresponding to the selected threshold.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import T50
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> t50 = T50(learner=SVC(probability=True))
    >>> t50.fit(X_train, y_train)
    >>> y_pred = t50.predict(X_test)
    >>> y_pred
    {0: 0.49563196626070505, 1: 0.504368033739295}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Pick the threshold whose true positive rate (TPR) is closest to 0.5.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing the selected threshold, TPR, and FPR.

        Notes
        -----
        - The method identifies the index where the absolute difference
          between TPR and 0.5 is minimized.

        Raises
        ------
        ValueError
            If the arrays `thresholds`, `tprs`, or `fprs` are empty or
            misaligned in length.
        """
        # Distance of every candidate's TPR from the 0.5 target.
        distance_to_half = np.abs(tprs - 0.5)
        pick = np.argmin(distance_to_half)

        return (thresholds[pick], tprs[pick], fprs[pick])
762
-
763
-
764
-
765
-
766
-
767
-
768
-
769
-
770
-
771
-
772
class X_method(ThresholdOptimization):
    """
    Threshold X quantifier.

    Identifies the decision threshold where the false positive rate (FPR) is
    approximately equal to 1 - true positive rate (TPR). This criterion is useful
    for identifying thresholds that align with a balance point on the ROC curve.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        Decision threshold determined during training.
    tpr : float
        True positive rate corresponding to the selected threshold.
    fpr : float
        False positive rate corresponding to the selected threshold.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification. Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008. Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Examples
    --------
    >>> from mlquantify.methods.aggregative import X_method
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> x_method = X_method(learner=SVC(probability=True))
    >>> x_method.fit(X_train, y_train)
    >>> y_pred = x_method.predict(X_test)
    >>> y_pred
    {0: 0.40523495782808205, 1: 0.594765042171918}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator = None):
        super().__init__(learner)

    def best_tprfpr(self, thresholds: np.ndarray, tprs: np.ndarray, fprs: np.ndarray) -> tuple:
        """
        Pick the threshold whose false positive rate (FPR) is closest to 1 - TPR.

        Parameters
        ----------
        thresholds : np.ndarray
            An array of threshold values.
        tprs : np.ndarray
            An array of true positive rates corresponding to the thresholds.
        fprs : np.ndarray
            An array of false positive rates corresponding to the thresholds.

        Returns
        -------
        tuple
            A tuple containing the selected threshold, TPR, and FPR.

        Notes
        -----
        - The method identifies the index where the absolute difference
          between FPR and 1 - TPR is minimized.

        Raises
        ------
        ValueError
            If the arrays `thresholds`, `tprs`, or `fprs` are empty or
            misaligned in length.
        """
        # |1 - (TPR + FPR)| is zero exactly where FPR equals 1 - TPR.
        imbalance = np.abs(1 - (tprs + fprs))
        pick = np.argmin(imbalance)

        return (thresholds[pick], tprs[pick], fprs[pick])