mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +10 -29
- mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify/adjust_counting/_base.py +245 -0
- mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +329 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +147 -0
- mlquantify/likelihood/_classes.py +430 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +785 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +147 -0
- mlquantify/mixture/_classes.py +458 -0
- mlquantify/mixture/_utils.py +163 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +168 -0
- mlquantify/neighbors/_classes.py +150 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
- mlquantify/neighbors/_kde.py +268 -0
- mlquantify/neighbors/_utils.py +131 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
mlquantify/multiclass.py
ADDED
@@ -0,0 +1,350 @@
from copy import deepcopy
from itertools import combinations
from abc import abstractmethod

import numpy as np

from mlquantify.base import BaseQuantifier, MetaquantifierMixin
from mlquantify.base_aggregative import get_aggregation_requirements
from mlquantify.utils._decorators import _fit_context
from mlquantify.utils._validation import validate_prevalences, check_has_method


# ============================================================
# Decorator for enabling binary quantification behavior
# ============================================================
def define_binary(cls):
    """Decorator to enable binary quantification extensions (One-vs-Rest or One-vs-One).

    This decorator dynamically extends a quantifier class to handle multiclass
    quantification tasks by decomposing them into multiple binary subproblems,
    following either the One-vs-Rest (OvR) or One-vs-One (OvO) strategy.

    It automatically replaces the class methods `fit`, `predict`, and `aggregate`
    with binary-aware versions from `BinaryQuantifier`, while preserving access
    to the original implementations via `_original_fit`, `_original_predict`,
    and `_original_aggregate`.

    Parameters
    ----------
    cls : class
        A subclass of `BaseQuantifier` implementing standard binary quantification
        methods (`fit`, `predict`, and `aggregate`).

    Returns
    -------
    class
        The same class with binary quantification capabilities added.

    Examples
    --------
    >>> import numpy as np
    >>> from mlquantify.base import BaseQuantifier
    >>> from mlquantify.multiclass import define_binary

    >>> @define_binary
    ... class MyQuantifier(BaseQuantifier):
    ...     def fit(self, X, y):
    ...         # Custom binary training logic
    ...         self.classes_ = np.unique(y)
    ...         return self
    ...
    ...     def predict(self, X):
    ...         # Return dummy prevalences
    ...         return np.array([0.4, 0.6])
    ...
    ...     def aggregate(self, preds, y_train):
    ...         # Example aggregation method
    ...         return np.mean(preds, axis=0)

    >>> qtf = MyQuantifier()
    >>> qtf.strategy = 'ovr'  # or 'ovo'
    >>> X = np.random.randn(10, 5)
    >>> y = np.random.randint(0, 3, 10)
    >>> qtf.fit(X, y)
    MyQuantifier(...)
    >>> qtf.predict(X)
    array([...])
    """
    if check_has_method(cls, "fit"):
        cls._original_fit = cls.fit
    if check_has_method(cls, "predict"):
        cls._original_predict = cls.predict
    if check_has_method(cls, "aggregate"):
        cls._original_aggregate = cls.aggregate

    cls.fit = BinaryQuantifier.fit
    cls.predict = BinaryQuantifier.predict
    cls.aggregate = BinaryQuantifier.aggregate

    return cls


# ============================================================
# Fitting strategies
# ============================================================
def _fit_ovr(quantifier, X, y):
    """Fit using the One-vs-Rest (OvR) strategy.

    Creates a binary quantifier for each class, trained to distinguish that class
    from all others.

    Parameters
    ----------
    quantifier : BaseQuantifier
        The quantifier instance being trained.
    X : array-like of shape (n_samples, n_features)
        Training feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.

    Returns
    -------
    dict
        A mapping from class label to fitted binary quantifier.
    """
    quantifiers = {}
    for cls in np.unique(y):
        qtf = deepcopy(quantifier)
        y_bin = (y == cls).astype(int)
        qtf._original_fit(X, y_bin)
        quantifiers[cls] = qtf
    return quantifiers


def _fit_ovo(quantifier, X, y):
    """Fit using the One-vs-One (OvO) strategy.

    Creates a binary quantifier for every pair of classes, trained to distinguish
    one class from the other.

    Parameters
    ----------
    quantifier : BaseQuantifier
        The quantifier instance being trained.
    X : array-like of shape (n_samples, n_features)
        Training feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.

    Returns
    -------
    dict
        A mapping from (class1, class2) tuples to fitted binary quantifiers.
    """
    quantifiers = {}
    for cls1, cls2 in combinations(np.unique(y), 2):
        qtf = deepcopy(quantifier)
        mask = (y == cls1) | (y == cls2)
        y_bin = (y[mask] == cls1).astype(int)
        qtf._original_fit(X[mask], y_bin)
        quantifiers[(cls1, cls2)] = qtf
    return quantifiers


# ============================================================
# Prediction strategies
# ============================================================
def _predict_ovr(quantifier, X):
    """Predict using the One-vs-Rest (OvR) strategy.

    Each binary quantifier produces a prevalence estimate for its corresponding class.

    Parameters
    ----------
    quantifier : BinaryQuantifier
        Fitted quantifier containing binary models.
    X : array-like of shape (n_samples, n_features)
        Test feature matrix.

    Returns
    -------
    np.ndarray
        Predicted prevalences for each class.
    """
    preds = np.zeros(len(quantifier.qtfs_))
    for i, qtf in enumerate(quantifier.qtfs_.values()):
        preds[i] = qtf._original_predict(X)[1]
    return preds


def _predict_ovo(quantifier, X):
    """Predict using the One-vs-One (OvO) strategy.

    Each binary quantifier outputs a prevalence estimate for the pair of classes
    it was trained on.

    Parameters
    ----------
    quantifier : BinaryQuantifier
        Fitted quantifier containing binary models.
    X : array-like of shape (n_samples, n_features)
        Test feature matrix.

    Returns
    -------
    np.ndarray
        Pairwise prevalence predictions.
    """
    preds = np.zeros(len(quantifier.qtfs_))
    # The keys of qtfs_ are already (cls1, cls2) pairs, so iterate over the
    # fitted pairwise quantifiers directly rather than re-combining the keys.
    for i, qtf in enumerate(quantifier.qtfs_.values()):
        preds[i] = qtf._original_predict(X)[1]
    return preds


# ============================================================
# Aggregation strategies
# ============================================================
def _aggregate_ovr(quantifier, preds, y_train, train_preds=None):
    """Aggregate binary predictions using One-vs-Rest (OvR).

    Parameters
    ----------
    quantifier : BinaryQuantifier
        Quantifier performing the aggregation.
    preds : ndarray of shape (n_samples, n_classes)
        Model predictions.
    y_train : ndarray of shape (n_samples,)
        Training labels.
    train_preds : ndarray of shape (n_samples, n_classes), optional
        Predictions on the training set.

    Returns
    -------
    dict
        Class-wise prevalence estimates.
    """
    prevalences = {}
    for i, cls in enumerate(np.unique(y_train)):
        bin_preds = np.column_stack([1 - preds[:, i], preds[:, i]])
        y_bin = (y_train == cls).astype(int)
        args = [bin_preds]

        if train_preds is not None:
            bin_train_preds = np.column_stack([1 - train_preds[:, i], train_preds[:, i]])
            args.append(bin_train_preds)

        args.append(y_bin)
        prevalences[cls] = quantifier._original_aggregate(*args)[1]
    return prevalences


def _aggregate_ovo(quantifier, preds, y_train, train_preds=None):
    """Aggregate binary predictions using One-vs-One (OvO).

    Parameters
    ----------
    quantifier : BinaryQuantifier
        Quantifier performing the aggregation.
    preds : ndarray
        Model predictions.
    y_train : ndarray
        Training labels.
    train_preds : ndarray, optional
        Predictions on the training set.

    Returns
    -------
    dict
        Pairwise prevalence estimates.
    """
    prevalences = {}
    for cls1, cls2 in combinations(np.unique(y_train), 2):
        bin_preds = np.column_stack([1 - preds[:, (cls1, cls2)], preds[:, (cls1, cls2)]])
        mask = (y_train == cls1) | (y_train == cls2)
        y_bin = (y_train[mask] == cls1).astype(int)

        args = [bin_preds]
        if train_preds is not None:
            bin_train_preds = np.column_stack([1 - train_preds[:, (cls1, cls2)], train_preds[:, (cls1, cls2)]])
            args.append(bin_train_preds)

        args.append(y_bin)
        prevalences[(cls1, cls2)] = quantifier._original_aggregate(*args)[1]
    return prevalences


# ============================================================
# Main class
# ============================================================
class BinaryQuantifier(MetaquantifierMixin, BaseQuantifier):
    """Meta-quantifier enabling One-vs-Rest and One-vs-One strategies.

    This class extends a base quantifier to handle multiclass problems by
    decomposing them into binary subproblems. It automatically delegates fitting,
    prediction, and aggregation to the appropriate binary quantifiers.

    Attributes
    ----------
    qtfs_ : dict
        Dictionary mapping class labels or label pairs to fitted binary quantifiers.
    strategy : {'ovr', 'ovo'}
        Defines how multiclass quantification is decomposed.
    """

    @_fit_context(prefer_skip_nested_validation=False)
    def fit(qtf, X, y):
        """Fit the quantifier under a binary decomposition strategy."""
        if len(np.unique(y)) <= 2:
            qtf.binary = True
            return qtf._original_fit(X, y)

        qtf.strategy = getattr(qtf, "strategy", "ovr")

        if qtf.strategy == "ovr":
            qtf.qtfs_ = _fit_ovr(qtf, X, y)
        elif qtf.strategy == "ovo":
            qtf.qtfs_ = _fit_ovo(qtf, X, y)
        else:
            raise ValueError("Strategy must be 'ovr' or 'ovo'")

        return qtf

    def predict(qtf, X):
        """Predict class prevalences using the trained binary quantifiers."""
        if hasattr(qtf, "binary") and qtf.binary:
            return qtf._original_predict(X)

        if qtf.strategy == "ovr":
            preds = _predict_ovr(qtf, X)
        elif qtf.strategy == "ovo":
            preds = _predict_ovo(qtf, X)
        else:
            raise ValueError("Strategy must be 'ovr' or 'ovo'")

        return validate_prevalences(qtf, preds, qtf.qtfs_.keys())

    def aggregate(qtf, *args):
        """Aggregate binary predictions to obtain multiclass prevalence estimates."""
        requirements = get_aggregation_requirements(qtf)

        if requirements.requires_train_proba and requirements.requires_train_labels:
            preds, train_preds, y_train = args
            args_dict = dict(preds=preds, train_preds=train_preds, y_train=y_train)
        elif requirements.requires_train_labels:
            preds, y_train = args
            args_dict = dict(preds=preds, y_train=y_train)
        else:
            raise ValueError("Binary aggregation requires at least train labels")

        classes = np.unique(args_dict["y_train"])
        qtf.strategy = getattr(qtf, "strategy", "ovr")

        if hasattr(qtf, "binary") and qtf.binary:
            return qtf._original_aggregate(*args_dict.values())

        if qtf.strategy == "ovr":
            prevalences = _aggregate_ovr(qtf, **args_dict)
        elif qtf.strategy == "ovo":
            prevalences = _aggregate_ovo(qtf, **args_dict)
        else:
            raise ValueError("Strategy must be 'ovr' or 'ovo'")

        return validate_prevalences(qtf, prevalences, classes)
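
For intuition, the OvR path above reduces to: obtain one binary prevalence estimate per class, then map the estimates back onto the probability simplex. The following minimal, self-contained numpy sketch illustrates that idea; `binary_prevalence` is a hypothetical stand-in for a fitted binary quantifier (here simple Classify & Count on thresholded scores), and the final normalization plays the role that `validate_prevalences` appears to play in the package:

import numpy as np

def binary_prevalence(scores):
    # Hypothetical stand-in for one fitted binary quantifier:
    # Classify & Count on scores thresholded at 0.5.
    return float(np.mean(scores >= 0.5))

def ovr_quantify(score_columns):
    # score_columns: (n_samples, n_classes), one score column per class.
    # One binary estimate per class, as in _predict_ovr, then a
    # normalization so the estimates sum to one.
    raw = np.array([binary_prevalence(col) for col in score_columns.T])
    return raw / max(raw.sum(), 1e-12)

rng = np.random.default_rng(0)
scores = rng.random((100, 3))
print(ovr_quantify(scores))  # three prevalences summing to 1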
mlquantify/neighbors/_base.py
ADDED
@@ -0,0 +1,168 @@
import numpy as np
from abc import abstractmethod
from sklearn.neighbors import KernelDensity

from mlquantify.utils._decorators import _fit_context
from mlquantify.base import BaseQuantifier
from mlquantify.utils import validate_y, validate_predictions, validate_data, check_classes_attribute
from mlquantify.base_aggregative import AggregationMixin, SoftLearnerQMixin, _get_learner_function
from mlquantify.utils._constraints import Interval, Options
from mlquantify.utils._get_scores import apply_cross_validation
from mlquantify.utils._validation import validate_prevalences

EPS = 1e-12

class BaseKDE(SoftLearnerQMixin, AggregationMixin, BaseQuantifier):
    r"""Base class for KDEy quantification methods.

    KDEy models the class-conditional densities of posterior probabilities using
    Kernel Density Estimation (KDE) on the probability simplex. Given posterior
    outputs from a probabilistic classifier, each class distribution is
    approximated as a smooth KDE. Test-set class prevalences correspond to the
    mixture weights that best explain the overall test posterior distribution.

    Mathematically, the test posterior distribution is approximated as:

    .. math::

        p_{\mathrm{test}}(x) \approx \sum_{k=1}^K \alpha_k p_k(x),

    where \(p_k(x)\) is the KDE of class \(k\) posteriors from training data, and
    \(\alpha_k\) are the unknown class prevalences subject to:

    .. math::

        \alpha_k \geq 0, \quad \sum_{k=1}^K \alpha_k = 1.

    Quantification minimizes an objective \(\mathcal{L}\) over
    \(\boldsymbol{\alpha} = (\alpha_1, \dots, \alpha_K)\) in the simplex:

    .. math::

        \min_{\boldsymbol{\alpha} \in \Delta^{K-1}} \mathcal{L} \left( \sum_{k=1}^K \alpha_k p_k(x), \hat{p}(x) \right),

    where \(\hat{p}(x)\) is the test posterior distribution (an empirical KDE or
    direct predictions). This problem is typically solved with numerical
    constrained optimization.

    Attributes
    ----------
    learner : estimator
        Probabilistic classifier generating posterior predictions.
    bandwidth : float
        KDE bandwidth (smoothing parameter).
    kernel : str
        KDE kernel type (e.g., 'gaussian').
    _precomputed : bool
        Indicates whether the KDE models have been fitted.
    best_distance_ : float or None
        Best objective value found during estimation.

    Examples
    --------
    Subclasses should implement the `_solve_prevalences` method, returning the
    estimated prevalences and the objective value:

    >>> class KDEyExample(BaseKDE):
    ...     def _solve_prevalences(self, predictions):
    ...         n_classes = len(self._class_kdes)
    ...         alpha = np.ones(n_classes) / n_classes
    ...         obj_val = 0.0  # Placeholder, replace with actual objective
    ...         return alpha, obj_val

    References
    ----------
    .. [1] Moreo, A., et al. (2023). Kernel Density Quantification methods and
       applications. In *Learning to Quantify*, Springer.
    """

    _parameter_constraints = {
        "bandwidth": [Interval(0, None, inclusive_right=False)],
        "kernel": [Options(["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"])],
    }

    def __init__(self, learner=None, bandwidth: float = 0.1, kernel: str = "gaussian"):
        self.learner = learner
        self.bandwidth = bandwidth
        self.kernel = kernel
        self._precomputed = False
        self.best_distance_ = None

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, learner_fitted=False):
        X, y = validate_data(self, X, y, ensure_2d=True, ensure_min_samples=2)
        validate_y(self, y)

        self.classes_ = np.unique(y)

        learner_function = _get_learner_function(self)

        if learner_fitted:
            train_predictions = getattr(self.learner, learner_function)(X)
            train_y_values = y
        else:
            train_predictions, train_y_values = apply_cross_validation(
                self.learner, X, y,
                function=learner_function, cv=5,
                stratified=True, shuffle=True
            )

        self.train_predictions = train_predictions
        self.train_y_values = train_y_values
        self._precompute_training(train_predictions, train_y_values)
        return self

    def _fit_kde_models(self, train_predictions, train_y_values):
        P = np.atleast_2d(train_predictions)
        y = np.asarray(train_y_values)
        self._class_kdes = []

        for c in self.classes_:
            Xi = P[y == c]
            if Xi.shape[0] == 0:
                Xi = np.ones((1, P.shape[1])) / P.shape[1]  # fallback for an empty class
            kde = KernelDensity(bandwidth=self.bandwidth, kernel=self.kernel)
            kde.fit(Xi)
            self._class_kdes.append(kde)

        self._precomputed = True

    def predict(self, X):
        predictions = getattr(self.learner, _get_learner_function(self))(X)
        return self.aggregate(predictions, self.train_predictions, self.train_y_values)

    def aggregate(self, predictions, train_predictions, train_y_values):
        predictions = validate_predictions(self, predictions)

        if hasattr(self, "classes_") and len(np.unique(train_y_values)) != len(self.classes_):
            self._precomputed = False

        self.classes_ = check_classes_attribute(self, np.unique(train_y_values))

        if not self._precomputed:
            self._precompute_training(train_predictions, train_y_values)
            self._precomputed = True

        prevalence, _ = self._solve_prevalences(predictions)
        prevalence = np.clip(prevalence, EPS, None)
        prevalence = validate_prevalences(self, prevalence, self.classes_)
        return prevalence

    def best_distance(self, predictions, train_predictions, train_y_values):
        """Return the best distance found during fitting."""
        # Cached under best_distance_ so the attribute does not shadow this method.
        if self.best_distance_ is not None:
            return self.best_distance_

        self.classes_ = check_classes_attribute(self, np.unique(train_y_values))

        if not self._precomputed:
            self._precompute_training(train_predictions, train_y_values)
            self._precomputed = True

        _, self.best_distance_ = self._solve_prevalences(predictions)
        return self.best_distance_

    @abstractmethod
    def _precompute_training(self, train_predictions, train_y_values):
        raise NotImplementedError

    @abstractmethod
    def _solve_prevalences(self, predictions):
        raise NotImplementedError
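
The docstring leaves `_solve_prevalences` to subclasses. Below is a minimal sketch of one plausible instantiation, assuming a maximum-likelihood objective (the actual subclasses in 0.1.10 may optimize a different divergence): given per-class KDEs such as those fitted by `_fit_kde_models`, it finds the mixture weights on the simplex that maximize the average log-density of the test posteriors.

import numpy as np
from scipy.optimize import minimize

def solve_mixture_weights(class_kdes, test_posteriors, eps=1e-12):
    # Density of every test point under each class KDE: shape (n, K).
    dens = np.exp(np.column_stack(
        [kde.score_samples(test_posteriors) for kde in class_kdes]))

    def neg_log_likelihood(alpha):
        # Mixture density per test point, clipped away from zero.
        return -np.mean(np.log(np.clip(dens @ alpha, eps, None)))

    k = dens.shape[1]
    res = minimize(
        neg_log_likelihood,
        x0=np.full(k, 1.0 / k),                           # uniform start
        method="SLSQP",
        bounds=[(0.0, 1.0)] * k,                          # alpha_k >= 0
        constraints=[{"type": "eq",
                      "fun": lambda a: np.sum(a) - 1.0}],  # sum to 1
    )
    return res.x, res.fun

Under this assumption, a subclass's `_solve_prevalences` could simply return `solve_mixture_weights(self._class_kdes, predictions)`.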
mlquantify/neighbors/_classes.py
ADDED
@@ -0,0 +1,150 @@
import numpy as np
from mlquantify.utils._constraints import Interval, Options
from mlquantify.neighbors._classification import PWKCLF
from mlquantify.base_aggregative import AggregationMixin, CrispLearnerQMixin
from mlquantify.base import BaseQuantifier
from mlquantify.utils._decorators import _fit_context
from mlquantify.adjust_counting import CC
from mlquantify.utils import validate_y, validate_data
from mlquantify.utils._validation import validate_prevalences


class PWK(BaseQuantifier):
    r"""
    Probabilistic Weighted k-Nearest Neighbor (PWK) Quantifier.

    This quantifier leverages the PWKCLF classifier to perform quantification by
    estimating class prevalences through a probabilistically weighted k-nearest
    neighbor approach.

    The method internally uses a weighted k-NN classifier in which each
    neighbor's contribution is adjusted by class-specific weights designed to
    correct for class imbalance, controlled by the hyperparameter `alpha`.

    Parameters
    ----------
    alpha : float, default=1
        Imbalance correction exponent for class weights. Higher values increase
        the influence of minority classes.
    n_neighbors : int, default=10
        Number of nearest neighbors considered.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute nearest neighbors.
    metric : str, default='euclidean'
        Distance metric for nearest neighbor search.
    leaf_size : int, default=30
        Leaf size for tree-based neighbors algorithms.
    p : int, default=2
        Power parameter for the Minkowski metric (when metric='minkowski').
    metric_params : dict or None, default=None
        Additional parameters for the metric function.
    n_jobs : int or None, default=None
        Number of parallel jobs for the neighbors search.

    Attributes
    ----------
    cc : object
        Internally used Classify & Count quantifier wrapping PWKCLF.
    learner : PWKCLF
        Underlying probabilistic weighted k-NN classifier.

    Examples
    --------
    >>> q = PWK(alpha=1.5, n_neighbors=5)
    >>> q.fit(X_train, y_train)
    >>> prevalences = q.predict(X_test)
    """

    _parameter_constraints = {
        "alpha": [Interval(1, None, inclusive_right=False)],
        "n_neighbors": [Interval(1, None, inclusive_right=False)],
        "algorithm": [Options(["auto", "ball_tree", "kd_tree", "brute"])],
        "metric": [str],
        "leaf_size": [Interval(1, None, inclusive_right=False)],
        "p": [Interval(1, None, inclusive_right=False)],
        "metric_params": [dict, type(None)],
        "n_jobs": [Interval(1, None, inclusive_right=False), type(None)],
    }

    def __init__(self,
                 alpha=1,
                 n_neighbors=10,
                 algorithm="auto",
                 metric="euclidean",
                 leaf_size=30,
                 p=2,
                 metric_params=None,
                 n_jobs=None):
        learner = PWKCLF(alpha=alpha,
                         n_neighbors=n_neighbors,
                         algorithm=algorithm,
                         metric=metric,
                         leaf_size=leaf_size,
                         p=p,
                         metric_params=metric_params,
                         n_jobs=n_jobs)
        self.algorithm = algorithm
        self.alpha = alpha
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.leaf_size = leaf_size
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs
        self.learner = learner

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the PWK quantifier to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.

        y : array-like of shape (n_samples,)
            Training labels.

        Returns
        -------
        self : object
            The fitted instance.
        """
        X, y = validate_data(self, X, y, ensure_2d=True, ensure_min_samples=2)
        validate_y(self, y)
        self.classes_ = np.unique(y)
        self.cc = CC(self.learner)
        self.cc.fit(X, y)
        return self

    def predict(self, X):
        """Predict prevalences for the given data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Features for which to predict prevalences.

        Returns
        -------
        prevalences : array of shape (n_classes,)
            Predicted class prevalences.
        """
        prevalences = self.cc.predict(X)
        prevalences = validate_prevalences(self, prevalences, self.classes_)
        return prevalences

    def classify(self, X):
        """Classify samples using the underlying learner.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Features to classify.

        Returns
        -------
        labels : array of shape (n_samples,)
            Predicted class labels.
        """
        return self.learner.predict(X)
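
An end-to-end usage sketch of PWK on synthetic data, assuming PWK is re-exported from mlquantify.neighbors (the new mlquantify/neighbors/__init__.py in the file list suggests it is; otherwise import from mlquantify.neighbors._classes):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from mlquantify.neighbors import PWK  # assumed re-export

# Three-class synthetic problem.
X, y = make_classification(n_samples=600, n_classes=3, n_informative=6, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

q = PWK(alpha=2, n_neighbors=10)
q.fit(X_tr, y_tr)

print(q.predict(X_te))                # estimated class prevalences
print(np.bincount(y_te) / len(y_te))  # true prevalences, for comparison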