mlquantify 0.1.7__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. {mlquantify-0.1.7/mlquantify.egg-info → mlquantify-0.1.9}/PKG-INFO +2 -1
  2. mlquantify-0.1.9/VERSION.txt +1 -0
  3. mlquantify-0.1.9/mlquantify/__init__.py +3 -0
  4. mlquantify-0.1.9/mlquantify/adjust_counting/__init__.py +14 -0
  5. mlquantify-0.1.9/mlquantify/adjust_counting/_adjustment.py +365 -0
  6. mlquantify-0.1.9/mlquantify/adjust_counting/_base.py +247 -0
  7. mlquantify-0.1.9/mlquantify/adjust_counting/_counting.py +145 -0
  8. mlquantify-0.1.9/mlquantify/adjust_counting/_utils.py +114 -0
  9. mlquantify-0.1.9/mlquantify/base.py +157 -0
  10. mlquantify-0.1.9/mlquantify/base_aggregative.py +209 -0
  11. mlquantify-0.1.9/mlquantify/calibration.py +1 -0
  12. mlquantify-0.1.9/mlquantify/confidence.py +335 -0
  13. mlquantify-0.1.9/mlquantify/likelihood/__init__.py +5 -0
  14. mlquantify-0.1.9/mlquantify/likelihood/_base.py +161 -0
  15. mlquantify-0.1.9/mlquantify/likelihood/_classes.py +414 -0
  16. mlquantify-0.1.9/mlquantify/meta/__init__.py +1 -0
  17. mlquantify-0.1.9/mlquantify/meta/_classes.py +761 -0
  18. mlquantify-0.1.9/mlquantify/metrics/__init__.py +21 -0
  19. mlquantify-0.1.9/mlquantify/metrics/_oq.py +109 -0
  20. mlquantify-0.1.9/mlquantify/metrics/_rq.py +98 -0
  21. mlquantify-0.1.7/mlquantify/evaluation/measures.py → mlquantify-0.1.9/mlquantify/metrics/_slq.py +43 -28
  22. mlquantify-0.1.9/mlquantify/mixture/__init__.py +7 -0
  23. mlquantify-0.1.9/mlquantify/mixture/_base.py +153 -0
  24. mlquantify-0.1.9/mlquantify/mixture/_classes.py +400 -0
  25. mlquantify-0.1.9/mlquantify/mixture/_utils.py +112 -0
  26. mlquantify-0.1.9/mlquantify/model_selection/__init__.py +9 -0
  27. mlquantify-0.1.9/mlquantify/model_selection/_protocol.py +358 -0
  28. mlquantify-0.1.9/mlquantify/model_selection/_search.py +315 -0
  29. mlquantify-0.1.9/mlquantify/model_selection/_split.py +1 -0
  30. mlquantify-0.1.9/mlquantify/multiclass.py +350 -0
  31. mlquantify-0.1.9/mlquantify/neighbors/__init__.py +9 -0
  32. mlquantify-0.1.9/mlquantify/neighbors/_base.py +198 -0
  33. mlquantify-0.1.9/mlquantify/neighbors/_classes.py +159 -0
  34. mlquantify-0.1.7/mlquantify/classification/methods.py → mlquantify-0.1.9/mlquantify/neighbors/_classification.py +48 -66
  35. mlquantify-0.1.9/mlquantify/neighbors/_kde.py +270 -0
  36. mlquantify-0.1.9/mlquantify/neighbors/_utils.py +135 -0
  37. mlquantify-0.1.9/mlquantify/neural/__init__.py +1 -0
  38. mlquantify-0.1.9/mlquantify/utils/__init__.py +47 -0
  39. mlquantify-0.1.9/mlquantify/utils/_artificial.py +27 -0
  40. mlquantify-0.1.9/mlquantify/utils/_constraints.py +219 -0
  41. mlquantify-0.1.9/mlquantify/utils/_context.py +21 -0
  42. mlquantify-0.1.9/mlquantify/utils/_decorators.py +36 -0
  43. mlquantify-0.1.9/mlquantify/utils/_exceptions.py +12 -0
  44. mlquantify-0.1.9/mlquantify/utils/_get_scores.py +159 -0
  45. mlquantify-0.1.9/mlquantify/utils/_load.py +18 -0
  46. mlquantify-0.1.9/mlquantify/utils/_parallel.py +6 -0
  47. mlquantify-0.1.9/mlquantify/utils/_random.py +36 -0
  48. mlquantify-0.1.9/mlquantify/utils/_sampling.py +273 -0
  49. mlquantify-0.1.9/mlquantify/utils/_tags.py +44 -0
  50. mlquantify-0.1.9/mlquantify/utils/_validation.py +447 -0
  51. mlquantify-0.1.9/mlquantify/utils/prevalence.py +61 -0
  52. {mlquantify-0.1.7 → mlquantify-0.1.9/mlquantify.egg-info}/PKG-INFO +2 -1
  53. mlquantify-0.1.9/mlquantify.egg-info/SOURCES.txt +58 -0
  54. {mlquantify-0.1.7 → mlquantify-0.1.9}/mlquantify.egg-info/requires.txt +1 -0
  55. {mlquantify-0.1.7 → mlquantify-0.1.9}/setup.py +1 -1
  56. mlquantify-0.1.7/VERSION.txt +0 -1
  57. mlquantify-0.1.7/mlquantify/__init__.py +0 -32
  58. mlquantify-0.1.7/mlquantify/base.py +0 -559
  59. mlquantify-0.1.7/mlquantify/classification/__init__.py +0 -1
  60. mlquantify-0.1.7/mlquantify/evaluation/__init__.py +0 -14
  61. mlquantify-0.1.7/mlquantify/evaluation/protocol.py +0 -291
  62. mlquantify-0.1.7/mlquantify/methods/__init__.py +0 -37
  63. mlquantify-0.1.7/mlquantify/methods/aggregative.py +0 -1159
  64. mlquantify-0.1.7/mlquantify/methods/meta.py +0 -472
  65. mlquantify-0.1.7/mlquantify/methods/mixture_models.py +0 -1003
  66. mlquantify-0.1.7/mlquantify/methods/non_aggregative.py +0 -136
  67. mlquantify-0.1.7/mlquantify/methods/threshold_optimization.py +0 -869
  68. mlquantify-0.1.7/mlquantify/model_selection.py +0 -377
  69. mlquantify-0.1.7/mlquantify/plots.py +0 -367
  70. mlquantify-0.1.7/mlquantify/utils/__init__.py +0 -2
  71. mlquantify-0.1.7/mlquantify/utils/general.py +0 -371
  72. mlquantify-0.1.7/mlquantify/utils/method.py +0 -449
  73. mlquantify-0.1.7/mlquantify.egg-info/SOURCES.txt +0 -27
  74. {mlquantify-0.1.7 → mlquantify-0.1.9}/MANIFEST.in +0 -0
  75. {mlquantify-0.1.7 → mlquantify-0.1.9}/README.md +0 -0
  76. {mlquantify-0.1.7 → mlquantify-0.1.9}/mlquantify.egg-info/dependency_links.txt +0 -0
  77. {mlquantify-0.1.7 → mlquantify-0.1.9}/mlquantify.egg-info/top_level.txt +0 -0
  78. {mlquantify-0.1.7 → mlquantify-0.1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlquantify
3
- Version: 0.1.7
3
+ Version: 0.1.9
4
4
  Summary: Quantification Library
5
5
  Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
6
6
  Maintainer: Luiz Fernando Luth Junior
@@ -20,6 +20,7 @@ Requires-Dist: tqdm
20
20
  Requires-Dist: pandas
21
21
  Requires-Dist: xlrd
22
22
  Requires-Dist: matplotlib
23
+ Requires-Dist: abstention
23
24
  Dynamic: classifier
24
25
  Dynamic: description
25
26
  Dynamic: description-content-type
@@ -0,0 +1 @@
1
+ 0.1.9
@@ -0,0 +1,3 @@
1
+ "mlquantify, a Python package for quantification"
2
+
3
+
@@ -0,0 +1,14 @@
1
+ from ._counting import CC, PCC
2
+ from ._adjustment import (
3
+ ThresholdAdjustment,
4
+ MatrixAdjustment,
5
+ FM,
6
+ GAC,
7
+ GPAC,
8
+ ACC,
9
+ X_method,
10
+ MAX,
11
+ T50,
12
+ MS,
13
+ MS2,
14
+ )
@@ -0,0 +1,365 @@
1
+ import numpy as np
2
+ from abc import abstractmethod
3
+ from scipy.optimize import minimize
4
+ import warnings
5
+
6
+ from mlquantify.adjust_counting._base import BaseAdjustCount
7
+ from mlquantify.adjust_counting._counting import CC, PCC
8
+ from mlquantify.base_aggregative import (
9
+ CrispLearnerQMixin,
10
+ SoftLearnerQMixin,
11
+ uses_soft_predictions,
12
+ )
13
+ from mlquantify.multiclass import define_binary
14
+ from mlquantify.adjust_counting._utils import evaluate_thresholds
15
+ from mlquantify.utils._constraints import Interval, Options
16
+
17
+
18
@define_binary
class ThresholdAdjustment(SoftLearnerQMixin, BaseAdjustCount):
    r"""
    Applies threshold-based adjustment methods for quantification.

    This is the base class for methods such as ACC, X, MAX, T50, MS, and MS2,
    which adjust prevalence estimates based on the classifier's ROC curve, as proposed by
    Forman (2005, 2008).

    These methods correct the bias in *Classify & Count (CC)* estimates caused by differences
    in class distributions between the training and test datasets.

    Mathematical formulation

    Given:
    - \( p' \): observed positive proportion from CC,
    - \( \text{TPR} = P(\hat{y}=1|y=1) \),
    - \( \text{FPR} = P(\hat{y}=1|y=0) \),

    the adjusted prevalence is given by:

    \[
    \hat{p} = \frac{p' - \text{FPR}}{\text{TPR} - \text{FPR}}
    \]

    (Forman, *Counting Positives Accurately Despite Inaccurate Classification*, ECML 2005;
    *Quantifying Counts and Costs via Classification*, DMKD 2008).


    Notes
    -----
    - Defined only for binary quantification tasks.
    - When applied to multiclass problems, the one-vs-rest strategy (`ovr`) is used automatically.


    Parameters
    ----------
    learner : estimator, optional
        A supervised learning model with `fit` and `predict_proba` methods.
    threshold : float, default=0.5
        Classification threshold in [0, 1].
    strategy : {'ovr'}, default='ovr'
        Strategy used for multiclass adaptation.


    Attributes
    ----------
    learner : estimator
        The underlying classification model.
    classes_ : ndarray of shape (n_classes,)
        Unique class labels observed during training.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import ThresholdAdjustment
    >>> import numpy as np
    >>> class CustomThreshold(ThresholdAdjustment):
    ...     def _get_best_threshold(self, thresholds, tprs, fprs):
    ...         idx = np.argmax(tprs - fprs)
    ...         return thresholds[idx], tprs[idx], fprs[idx]
    >>> X = np.random.randn(100, 4)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CustomThreshold(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.49, 1: 0.51}
    """

    # The threshold may be given as a float in [0.0, 1.0] or as the
    # integer endpoints 0/1 (discrete interval).
    _parameter_constraints = {
        "threshold": [
            Interval(0.0, 1.0),
            Interval(0, 1, discrete=True),
        ],
    }

    def __init__(self, learner=None, threshold=0.5, strategy="ovr"):
        super().__init__(learner=learner)
        self.threshold = threshold
        self.strategy = strategy

    def _adjust(self, predictions, train_y_scores, train_y_values):
        """Internal adjustment computation based on selected ROC threshold."""
        # Column 1 of the score matrix holds the positive-class scores
        # (binary task; see @define_binary above).
        positive_scores = train_y_scores[:, 1]

        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores, self.classes_)
        threshold, tpr, fpr = self._get_best_threshold(thresholds, tprs, fprs)

        # CC(...).aggregate returns a mapping class -> prevalence; indexing
        # with [1] extracts the observed positive proportion p'.
        cc_predictions = CC(threshold).aggregate(predictions)[1]

        if tpr - fpr == 0:
            # Degenerate ROC point: Forman's correction is undefined, so
            # fall back to the raw CC estimate.
            prevalence = cc_predictions
        else:
            # Forman's adjustment (p' - FPR) / (TPR - FPR), clipped into
            # the valid prevalence range.
            prevalence = np.clip((cc_predictions - fpr) / (tpr - fpr), 0, 1)

        return np.asarray([1 - prevalence, prevalence])

    @abstractmethod
    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Select the best threshold according to the specific method."""
        ...
120
+
121
+
122
class MatrixAdjustment(BaseAdjustCount):
    r"""
    Base class for matrix-based quantification adjustments (FM, GAC, GPAC).

    This class implements the matrix correction model for quantification
    as formulated in Firat (2016), which expresses the observed prevalences as
    a linear combination of true prevalences through the confusion matrix.

    Mathematical model

    The system is given by:

    \[
    \mathbf{y} = \mathbf{C}\hat{\pi}_F + \varepsilon
    \]

    subject to:

    \[
    \hat{\pi}_F \ge 0, \quad \sum_k \hat{\pi}_{F,k} = 1
    \]

    where:
    - \( \mathbf{y} \): vector of predicted prevalences in test set,
    - \( \mathbf{C} \): confusion matrix,
    - \( \hat{\pi}_F \): true class prevalence vector (unknown),
    - \( \varepsilon \): residual error.

    The model can be solved either via:
    - Linear algebraic solution, or
    - Constrained optimization (quadratic or least-squares).


    Parameters
    ----------
    learner : estimator, optional
        Classifier with `fit` and `predict` methods.
    solver : {'optim', 'linear'}, optional
        Solver for the adjustment system:
        - `'linear'`: uses matrix inversion (e.g., GAC, GPAC)
        - `'optim'`: uses optimization (e.g., FM)


    Attributes
    ----------
    CM : ndarray of shape (n_classes, n_classes)
        Confusion matrix used for correction. Initialized to zeros in
        ``_adjust`` and filled in place by the subclass's
        ``_compute_confusion_matrix``.
    classes_ : ndarray
        Class labels observed in training.


    References
    ----------
    - Firat, A. (2016). *Unified Framework for Quantification.* AAAI, pp. 1-8.


    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mlquantify.adjust_counting import MatrixAdjustment
    >>> class MyMatrix(MatrixAdjustment):
    ...     def _compute_confusion_matrix(self, preds, y):
    ...         cm = np.ones((2, 2))
    ...         return cm / cm.sum(axis=1, keepdims=True)
    >>> q = MyMatrix(learner=LogisticRegression(), solver='linear')
    >>> X = np.random.randn(50, 4)
    >>> y = np.random.randint(0, 2, 50)
    >>> q.fit(X, y)
    >>> q.predict(X)
    {0: 0.5, 1: 0.5}
    """

    _parameter_constraints = {"solver": Options(["optim", "linear"])}

    def __init__(self, learner=None, solver=None):
        super().__init__(learner=learner)
        self.solver = solver

    def _adjust(self, predictions, train_y_pred, train_y_values):
        # self.CM starts as zeros; the subclass hook fills it (it both
        # mutates self.CM in place and returns it).
        n_class = len(np.unique(train_y_values))
        self.CM = np.zeros((n_class, n_class))

        if self.solver == 'optim':
            # NOTE(review): for soft learners (e.g. FM) train_y_pred holds
            # posterior probabilities, and CC().aggregate is applied to them
            # directly to obtain class priors — confirm this matches CC's
            # expected input.
            priors = np.array(list(CC().aggregate(train_y_pred).values()))
            self.CM = self._compute_confusion_matrix(train_y_pred, train_y_values, priors)
            # Binarize test posteriors against the class priors before
            # counting — presumably Forman-style prior thresholding; verify.
            prevs_estim = self._get_estimations(predictions > priors)
            prevalence = self._solve_optimization(prevs_estim, priors)
        else:
            self.CM = self._compute_confusion_matrix(train_y_pred)
            prevs_estim = self._get_estimations(predictions)
            prevalence = self._solve_linear(prevs_estim)

        return prevalence

    def _solve_linear(self, prevs_estim):
        r"""
        Solve the system linearly:

        \[
        \hat{\pi}_F = \mathbf{C}^{-1} \mathbf{p}
        \]
        """
        try:
            adjusted = np.linalg.solve(self.CM, prevs_estim)
            # Project back onto the probability simplex.
            adjusted = np.clip(adjusted, 0, 1)
            adjusted /= adjusted.sum()
        except np.linalg.LinAlgError:
            # Singular confusion matrix: fall back to the raw estimates.
            adjusted = prevs_estim
        return adjusted

    def _solve_optimization(self, prevs_estim, priors):
        r"""
        Solve via constrained least squares:

        \[
        \min_{\hat{\pi}_F} \| \mathbf{C}\hat{\pi}_F - \mathbf{p} \|_2^2
        \quad \text{s.t. } \hat{\pi}_F \ge 0, \ \sum_k \hat{\pi}_{F,k} = 1
        \]
        """
        def objective(prevs_pred):
            return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)

        # Equality constraint enforces unit sum; inequality plus bounds
        # keep each component in [0, 1].
        constraints = [
            {'type': 'eq', 'fun': lambda x: np.sum(x) - 1},
            {'type': 'ineq', 'fun': lambda x: x}
        ]
        bounds = [(0, 1)] * self.CM.shape[1]
        # Start from the uniform distribution.
        init = np.full(self.CM.shape[1], 1 / self.CM.shape[1])
        result = minimize(objective, init, constraints=constraints, bounds=bounds)
        # If the optimizer fails, fall back to the training priors.
        return result.x if result.success else priors

    def _get_estimations(self, predictions):
        """Return prevalence estimates using CC (crisp) or PCC (probabilistic)."""
        if uses_soft_predictions(self):
            return np.array(list(PCC().aggregate(predictions).values()))
        return np.array(list(CC().aggregate(predictions).values()))

    @abstractmethod
    def _compute_confusion_matrix(self, predictions, *args):
        ...
262
+
263
+
264
class FM(SoftLearnerQMixin, MatrixAdjustment):
    """Forman's Matrix Adjustment (FM), solved via constrained optimization.

    Column ``i`` of the confusion matrix holds the prevalence estimates
    computed from the training posteriors of class ``i`` binarized against
    the class priors.
    """

    def __init__(self, learner=None):
        # FM always resolves the adjustment system through the optimizer.
        super().__init__(learner=learner, solver='optim')

    def _compute_confusion_matrix(self, posteriors, y_true, priors):
        for col, label in enumerate(self.classes_):
            class_mask = y_true == label
            binarized = posteriors[class_mask] > priors
            self.CM[:, col] = self._get_estimations(binarized)
        return self.CM
274
+
275
+
276
class GAC(CrispLearnerQMixin, MatrixAdjustment):
    """Gonzalez-Castro's Generalized Adjusted Count (GAC).

    Solves the adjustment system linearly, normalizing each column of the
    confusion matrix by the estimated prevalence of the matching class.
    """

    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, predictions):
        estimated = self._get_estimations(predictions)
        for col in range(len(self.classes_)):
            if estimated[col] == 0:
                # Class never predicted: keep an identity column so the
                # system stays solvable.
                self.CM[col, col] = 1
            else:
                self.CM[:, col] /= estimated[col]
        return self.CM
289
+
290
+
291
class GPAC(SoftLearnerQMixin, MatrixAdjustment):
    """Probabilistic GAC (GPAC) — the soft-prediction counterpart of GAC.

    Identical column normalization to GAC, but operating on posterior
    probabilities instead of crisp labels.
    """

    def __init__(self, learner=None):
        super().__init__(learner=learner, solver='linear')

    def _compute_confusion_matrix(self, posteriors):
        estimated = self._get_estimations(posteriors)
        for col in range(len(self.classes_)):
            if estimated[col] == 0:
                # Class with zero estimated mass: identity column fallback.
                self.CM[col, col] = 1
            else:
                self.CM[:, col] /= estimated[col]
        return self.CM
304
+
305
+
306
class ACC(ThresholdAdjustment):
    """Adjusted Count (ACC) — baseline threshold correction.

    Uses the TPR/FPR measured at the configured ``threshold`` to correct
    the raw Classify & Count estimate (Forman, 2005/2008).
    """

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """Return (threshold, tpr, fpr) at the grid point nearest ``self.threshold``.

        The previous implementation indexed with ``thresholds == self.threshold``,
        an exact float-equality test that raises IndexError whenever the
        configured threshold does not lie exactly on the evaluated grid.
        Selecting the closest grid point is robust and yields the identical
        result whenever an exact match exists.
        """
        idx = int(np.argmin(np.abs(np.asarray(thresholds) - self.threshold)))
        return (thresholds[idx], tprs[idx], fprs[idx])
312
+
313
+
314
class X_method(ThresholdAdjustment):
    r"""X method.

    Selects the ROC point where sensitivity and false-positive rate are
    complementary, i.e. :math:`\text{TPR} + \text{FPR} = 1`
    (equivalently :math:`1 - \text{TPR} = \text{FPR}`).
    """

    def _get_best_threshold(self, thresholds, tprs, fprs):
        distance_to_target = np.abs(1 - (tprs + fprs))
        best = int(np.argmin(distance_to_target))
        return thresholds[best], tprs[best], fprs[best]
319
+
320
+
321
class MAX(ThresholdAdjustment):
    r"""MAX method — selects the threshold maximizing :math:`|\text{TPR} - \text{FPR}|`."""

    def _get_best_threshold(self, thresholds, tprs, fprs):
        # Largest absolute separation between the two rates, matching the
        # original implementation's use of np.abs.
        separation = np.abs(tprs - fprs)
        best = int(np.argmax(separation))
        return thresholds[best], tprs[best], fprs[best]
326
+
327
+
328
class T50(ThresholdAdjustment):
    r"""T50 — selects the threshold whose TPR is closest to 0.5."""

    def _get_best_threshold(self, thresholds, tprs, fprs):
        gap_to_half = np.abs(tprs - 0.5)
        best = int(np.argmin(gap_to_half))
        return thresholds[best], tprs[best], fprs[best]
333
+
334
+
335
class MS(ThresholdAdjustment):
    r"""Median Sweep (MS) — median of adjusted prevalences across all thresholds.

    For every evaluated threshold the ACC-style correction
    :math:`(p' - \text{FPR}) / (\text{TPR} - \text{FPR})` is applied and the
    median over thresholds is reported (Forman, 2008).
    """

    def _adjust(self, predictions, train_y_scores, train_y_values):
        # Column 1 holds the positive-class scores (binary task).
        positive_scores = train_y_scores[:, 1]

        thresholds, tprs, fprs = evaluate_thresholds(train_y_values, positive_scores, self.classes_)
        thresholds, tprs, fprs = self._get_best_threshold(thresholds, tprs, fprs)

        prevs = []
        for thr, tpr, fpr in zip(thresholds, tprs, fprs):
            # CC.aggregate returns a mapping class -> prevalence; [1] picks
            # the positive-class proportion.
            cc_predictions = CC(thr).aggregate(predictions)[1]
            # Degenerate ROC point: correction undefined, keep raw estimate.
            prevalence = cc_predictions if tpr - fpr == 0 else (cc_predictions - fpr) / (tpr - fpr)
            prevs.append(prevalence)

        # BUG FIX: the per-threshold ratios are deliberately unclipped, so the
        # median can fall outside [0, 1], producing an invalid prevalence
        # vector (negative component). Clip the final median, consistent with
        # the clipping applied in ThresholdAdjustment._adjust.
        prevalence = float(np.clip(np.median(prevs), 0, 1))
        return np.asarray([1 - prevalence, prevalence])

    def _get_best_threshold(self, thresholds, tprs, fprs):
        """MS sweeps every threshold; subclasses may filter (see MS2)."""
        return thresholds, tprs, fprs
354
+
355
+
356
class MS2(MS):
    r"""MS2 — Median Sweep restricted to thresholds with :math:`|\text{TPR} - \text{FPR}| > 0.25`."""

    def _get_best_threshold(self, thresholds, tprs, fprs):
        if np.all(tprs == 0) or np.all(fprs == 0):
            warnings.warn("All TPR or FPR values are zero.")

        separation = np.abs(tprs - fprs)
        selected = np.flatnonzero(separation > 0.25)
        if selected.size == 0:
            warnings.warn("No cases satisfy |TPR - FPR| > 0.25.")
            # Fall back to the full sweep (the >= 0 condition always holds).
            selected = np.flatnonzero(separation >= 0)
        return thresholds[selected], tprs[selected], fprs[selected]
@@ -0,0 +1,247 @@
1
+ import numpy as np
2
+ from abc import abstractmethod
3
+
4
+ from mlquantify.base import BaseQuantifier
5
+
6
+ from mlquantify.base_aggregative import (
7
+ AggregationMixin,
8
+ _get_learner_function
9
+ )
10
+ from mlquantify.utils._decorators import _fit_context
11
+ from mlquantify.utils._validation import check_classes_attribute, validate_predictions, validate_y, validate_data, validate_prevalences
12
+ from mlquantify.utils._get_scores import apply_cross_validation
13
+
14
+
15
+
16
+
17
class BaseCount(AggregationMixin, BaseQuantifier):
    r"""Base class for count-based quantifiers.

    Implements the foundation for *count-based quantification* methods,
    where class prevalences are estimated directly from classifier outputs
    without any correction.

    The method assumes a classifier :math:`f(x)` producing either hard or
    probabilistic predictions. The prevalence of each class :math:`c` in
    the unlabeled test set is estimated as:

    .. math::
        \hat{\pi}_c = \frac{1}{N} \sum_{i=1}^{N} I(f(x_i) = c)

    for *hard* classifiers, or equivalently as:

    .. math::
        \hat{\pi}_c = \frac{1}{N} \sum_{i=1}^{N} f_c(x_i)

    for *soft* classifiers where :math:`f_c(x)` denotes the posterior
    probability of class :math:`c`.

    This is the classical *Classify and Count (CC)* and *Probabilistic
    Classify and Count (PCC)* approach, introduced by Forman (2005, 2008)
    and unified in the constrained regression framework of Firat et al. (2016).

    Parameters
    ----------
    learner : object, optional
        A supervised learning model implementing `fit` and `predict`
        or `predict_proba`.

    Attributes
    ----------
    learner : object
        Underlying classification model.
    classes_ : ndarray of shape (n_classes,)
        Unique class labels observed during training.

    Examples
    --------
    >>> from mlquantify.adjust_counting import BaseCount
    >>> from mlquantify.utils._validation import validate_prevalences
    >>> import numpy as np

    >>> class CC(CrispLearnerQMixin, BaseCount):
    ...     def __init__(self, learner=None, threshold=0.5):
    ...         super().__init__(learner)
    ...         self.threshold = threshold
    ...     def aggregate(self, predictions):
    ...         predictions = validate_predictions(self, predictions)
    ...         self.classes = self.classes if hasattr(self, 'classes') else np.unique(predictions)
    ...         counts = np.array([np.count_nonzero(predictions == c) for c in self.classes])
    ...         prevalences = counts / len(predictions)
    ...         return validate_prevalences(self, prevalences, self.classes)

    >>> from sklearn.linear_model import LogisticRegression
    >>> X = np.random.randn(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> q = CC(learner=LogisticRegression())
    >>> q.fit(X, y)
    >>> q.predict(X).round(3)
    array([0.47, 0.53])

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.*
        ECML, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.
    """

    @abstractmethod
    def __init__(self, learner=None):
        self.learner = learner

    def __mlquantify_tags__(self):
        # Pure counting needs neither training posteriors nor training
        # labels at prediction time.
        tags = super().__mlquantify_tags__()
        tags.prediction_requirements.requires_train_proba = False
        tags.prediction_requirements.requires_train_labels = False
        return tags

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, learner_fitted=False, *args, **kwargs):
        """Fit the quantifier using the provided data and learner.

        Assumes ``self.learner`` is set; when ``learner_fitted`` is True the
        learner's own fit step is skipped.
        """
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        if not learner_fitted:
            self.learner.fit(X, y, *args, **kwargs)
        return self

    def predict(self, X):
        """Predict class prevalences for the given data."""
        # The mixin decides whether predict or predict_proba is used.
        estimator_function = _get_learner_function(self)
        predictions = getattr(self.learner, estimator_function)(X)
        prevalences = self.aggregate(predictions)
        return prevalences

    @abstractmethod
    def aggregate(self, predictions):
        """Aggregate predictions into class prevalence estimates."""
        ...
120
+
121
+
122
class BaseAdjustCount(AggregationMixin, BaseQuantifier):
    r"""Base class for adjustment-based quantifiers.

    This class generalizes *adjusted count* quantification methods,
    providing a framework for correcting bias in raw classifier outputs
    based on estimated confusion matrices or rate statistics.

    Following Forman (2005, 2008), in the binary case the correction
    uses true positive (TPR) and false positive (FPR) rates to adjust
    the observed positive proportion :math:`\hat{p}'_{+}`:

    .. math::
        \hat{p}_{+} = \frac{\hat{p}'_{+} - \text{FPR}}{\text{TPR} - \text{FPR}}

    In the multiclass extension (Firat et al., 2016), the same principle
    can be expressed using matrix algebra. Let :math:`C` denote the
    normalized confusion matrix where :math:`C_{ij} = P(\hat{y}=i|y=j)`
    estimated via cross-validation. Then, given the observed distribution
    of predictions :math:`\hat{\pi}'`, the corrected prevalence vector
    :math:`\hat{\pi}` is obtained as:

    .. math::
        \hat{\pi}' = C \hat{\pi}
        \quad \Rightarrow \quad
        \hat{\pi} = C^{-1} \hat{\pi}'

    subject to non-negativity and unit-sum constraints:

    .. math::
        \hat{\pi}_c \ge 0, \quad \sum_c \hat{\pi}_c = 1

    Parameters
    ----------
    learner : object, optional
        Supervised learner implementing `fit`, `predict`, or `predict_proba`.

    Attributes
    ----------
    learner : object
        Underlying classification model.
    train_predictions : ndarray of shape (n_samples_train, n_classes)
        Predictions on training data from cross-validation.
    train_y_values : ndarray of shape (n_samples_train,)
        True labels corresponding to training predictions.
    classes_ : ndarray of shape (n_classes,)
        Unique class labels.

    References
    ----------
    [1] Forman, G. (2005). *Counting Positives Accurately Despite Inaccurate Classification.*
        ECML 2005, LNAI 3720, pp. 564-575.
    [2] Forman, G. (2008). *Quantifying Counts and Costs via Classification.*
        Data Mining and Knowledge Discovery, 17(2), 164-206.
    [3] Firat, A. (2016). *Unified Framework for Quantification.*
        Proceedings of the AAAI Conference on Artificial Intelligence, Sections 3.2-3.3.
    """

    @abstractmethod
    def __init__(self, learner=None):
        self.learner = learner

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, learner_fitted=False):
        """Fit the quantifier using the provided data and learner.

        When ``learner_fitted`` is False, training predictions are obtained
        through 5-fold stratified cross-validation so the correction
        statistics are estimated out-of-fold.
        """
        X, y = validate_data(self, X, y)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        # The mixin decides whether predict or predict_proba is used.
        learner_function = _get_learner_function(self)

        if learner_fitted:
            # Learner already trained: reuse it on the training data as-is.
            train_predictions = getattr(self.learner, learner_function)(X)
            y_train_labels = y
        else:
            train_predictions, y_train_labels = apply_cross_validation(
                self.learner,
                X,
                y,
                function=learner_function,
                cv=5,
                stratified=True,
                random_state=None,
                shuffle=True
            )

        self.train_predictions = train_predictions
        self.train_y_values = y_train_labels
        return self

    def predict(self, X):
        """Predict class prevalences for the given data."""
        predictions = getattr(self.learner, _get_learner_function(self))(X)
        prevalences = self.aggregate(predictions, self.train_predictions, self.train_y_values)
        return prevalences

    def aggregate(self, predictions, train_predictions, y_train_values):
        """Aggregate predictions and apply matrix- or rate-based bias correction."""
        self.classes_ = check_classes_attribute(self, np.unique(y_train_values))
        # BUG FIX: the test-set predictions must be validated and used here.
        # The previous code called validate_predictions(self, train_predictions),
        # which silently discarded the ``predictions`` argument and computed
        # prevalences from the training predictions instead.
        predictions = validate_predictions(self, predictions)
        prevalences = self._adjust(predictions, train_predictions, y_train_values)
        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences