mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +10 -29
- mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify/adjust_counting/_base.py +245 -0
- mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +329 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +147 -0
- mlquantify/likelihood/_classes.py +430 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +785 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +147 -0
- mlquantify/mixture/_classes.py +458 -0
- mlquantify/mixture/_utils.py +163 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +168 -0
- mlquantify/neighbors/_classes.py +150 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
- mlquantify/neighbors/_kde.py +268 -0
- mlquantify/neighbors/_utils.py +131 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
mlquantify/methods/aggregative.py (removed in 0.1.10) — `@@ -1,1159 +0,0 @@`; its content in 0.1.8 was:

```python
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from ..base import AggregativeQuantifier
from ..utils.method import *

from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import mlquantify as mq


class CC(AggregativeQuantifier):
    """Classify and Count (CC).

    The simplest quantification method involves classifying each instance
    and then counting the number of instances assigned to each class to
    estimate the class prevalence.

    This method is based on the concept of classification and counting the
    number of instances for each class, which is used to estimate the
    class prevalence.

    Attributes
    ----------
    learner : BaseEstimator
        The machine learning model used to classify the instances.
        It must be an estimator from scikit-learn (e.g., LogisticRegression,
        RandomForestClassifier).

    See Also
    --------
    AggregativeQuantifier : Base class for aggregative quantification methods.

    References
    ----------
    FORMAN, George. Quantifying counts and costs via classification.
    Data Mining and Knowledge Discovery, v. 17, p. 164-206, 2008.
    Available at: https://link.springer.com/article/10.1007/s10618-008-0097-y

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible model that serves as the classifier.

    Methods
    -------
    fit(X, y)
        Fits the learner to the data.

    predict(X) -> dict
        Predicts the class labels for the given data and calculates
        the prevalence of each class based on the predictions.

    Examples
    --------
    >>> from mlquantify.utils.general import get_real_prev
    >>> from mlquantify.methods.aggregative import CC
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_wine(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
    >>>
    >>> cc = CC(RandomForestClassifier())
    >>> cc.fit(X_train, y_train)
    >>> y_pred = cc.predict(X_test)
    >>> y_pred
    {0: 0.4305555555555556, 1: 0.2916666666666667, 2: 0.2777777777777778}
    >>> get_real_prev(y_test)
    {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
    """

    def __init__(self, learner: BaseEstimator=None):
        self.learner = learner

    def _fit_method(self, X, y):
        """
        Fits the learner to the data. This method is used internally.

        Parameters
        ----------
        X : array-like
            Feature matrix.
        y : array-like
            Target labels.

        Returns
        -------
        self : CC
            The instance of the CC class.
        """
        self.fit_learner(X, y)
        return self

    def _predict_method(self, X) -> np.ndarray:
        """
        Predicts the class labels for the given data and calculates
        the prevalence of each class based on the predictions.

        Parameters
        ----------
        X : array-like
            Feature matrix for prediction.

        Returns
        -------
        array-like
            An array containing the prevalence of each class.
        """
        predicted_labels = self.predict_learner(X)

        # Count occurrences of each class in the predictions
        class_counts = np.array([np.count_nonzero(predicted_labels == _class) for _class in self.classes])

        # Calculate the prevalence of each class
        prevalences = class_counts / len(predicted_labels)

        return prevalences


class EMQ(AggregativeQuantifier):
    """Expectation Maximisation Quantifier (EMQ).

    EMQ is a quantification method that iteratively adjusts the prior
    and posterior probabilities of a learner using the Expectation-Maximisation (EM) algorithm.
    It is particularly useful for scenarios where the class distribution in the test set
    differs from that in the training set.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible model used to classify the instances.
    priors : array-like
        Prior probabilities of the classes, estimated from the training data.

    References
    ----------
    SAERENS, Marco; LATINNE, Patrice; DECAESTECKER, Christine. Adjusting the outputs of a classifier
    to new a priori probabilities: a simple procedure. Neural Computation, v. 14, n. 1, p. 21-41, 2002.
    Available at: https://ieeexplore.ieee.org/abstract/document/6789744

    Examples
    --------
    >>> from mlquantify.methods.aggregative import EMQ
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_wine(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
    >>>
    >>> emq = EMQ(RandomForestClassifier())
    >>> emq.fit(X_train, y_train)
    >>> prevalences = emq.predict(X_test)
    >>> print(prevalences)
    {0: 0.4466744706195974, 1: 0.29747794914814046, 2: 0.25584758023226206}
    >>> get_real_prev(y_test)
    {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
    """

    MAX_ITER = 1000
    EPSILON = 1e-6

    @property
    def is_probabilistic(self) -> bool:
        return True

    def __init__(self, learner: BaseEstimator=None):
        self.learner = learner
        self.priors = None

    def _fit_method(self, X, y):
        """
        Fits the learner to the training data and calculates prior probabilities.

        Parameters
        ----------
        X : array-like
            Feature matrix for training.
        y : array-like
            Target labels for training.

        Returns
        -------
        self : EMQ
            The fitted instance of EMQ.
        """
        self.fit_learner(X, y)

        counts = np.array([np.count_nonzero(y == _class) for _class in self.classes])
        self.priors = counts / len(y)

        return self

    def _predict_method(self, X) -> dict:
        """
        Predicts the prevalence of each class in the test data.

        Parameters
        ----------
        X : array-like
            Feature matrix for prediction.

        Returns
        -------
        dict
            A dictionary with class labels as keys and their prevalence as values.
        """
        posteriors = self.predict_learner(X)
        prevalences, _ = self.EM(self.priors, posteriors)

        return prevalences

    def predict_proba(self, X, epsilon: float = EPSILON, max_iter: int = MAX_ITER) -> np.ndarray:
        """
        Predicts the posterior probabilities for the test data after adjustment using EM.

        Parameters
        ----------
        X : array-like
            Feature matrix for prediction.
        epsilon : float, optional
            Convergence threshold for the EM algorithm (default: EPSILON).
        max_iter : int, optional
            Maximum number of iterations for the EM algorithm (default: MAX_ITER).

        Returns
        -------
        np.ndarray
            Adjusted posterior probabilities.
        """
        posteriors = self.predict_learner(X)
        _, posteriors = self.EM(self.priors, posteriors, epsilon, max_iter)
        return posteriors

    @classmethod
    def EM(cls, priors, posteriors, epsilon=EPSILON, max_iter=MAX_ITER):
        """
        Expectation-Maximisation (EM) algorithm for adjusting prior and posterior probabilities.

        The algorithm iterates over the data, adjusting the probabilities until convergence
        or reaching the maximum number of iterations. It estimates the class prevalence
        and adjusts the posterior probabilities for each class.

        Parameters
        ----------
        priors : array-like
            Initial prior probabilities for each class.
        posteriors : array-like
            Initial posterior probabilities for each test instance and class.
        epsilon : float, optional
            Convergence threshold (default: EPSILON).
        max_iter : int, optional
            Maximum number of iterations (default: MAX_ITER).

        Returns
        -------
        tuple
            Adjusted prevalence (array-like) and updated posterior probabilities (array-like).
        """
        Px = posteriors
        prev_prevalence = np.copy(priors)
        running_estimate = np.copy(prev_prevalence)  # Initialized with the training prevalence

        iteration, converged = 0, False
        previous_estimate = None

        while not converged and iteration < max_iter:
            # E-step: Compute unnormalized posteriors
            posteriors_unnormalized = (running_estimate / prev_prevalence) * Px
            posteriors = posteriors_unnormalized / posteriors_unnormalized.sum(axis=1, keepdims=True)

            # M-step: Update the running prevalence estimate
            running_estimate = posteriors.mean(axis=0)

            if previous_estimate is not None and np.mean(np.abs(running_estimate - previous_estimate)) < epsilon and iteration > 10:
                converged = True

            previous_estimate = running_estimate
            iteration += 1

        if not converged:
            print('[Warning] The method has reached the maximum number of iterations; it might not have converged')

        return running_estimate, posteriors


class FM(AggregativeQuantifier):
    """The Friedman Method (FM).

    FM is a quantification method similar to GPAC (General Probabilistic Aggregative Classifier),
    but instead of averaging confidence scores from probabilistic classifiers,
    it uses the proportion of confidence scores that exceed the expected class frequencies
    estimated from the training data.

    This method leverages a confusion matrix computed during training to adjust
    class prevalences in the test set, solving an optimization problem to align
    predicted and actual distributions.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible model used for classification.
    CM : np.ndarray
        The confusion matrix, normalized by class counts.
    priors : array-like
        Prior probabilities of the classes, estimated from training data.

    References
    ----------
    Friedman, J. (2001). Quantification via Classification. Presentation.
    Available at: https://jerryfriedman.su.domains/talks/qc.pdf

    Examples
    --------
    >>> from mlquantify.utils.general import get_real_prev
    >>> from mlquantify.methods.aggregative import FM
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_wine(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
    >>>
    >>> fm = FM(RandomForestClassifier())
    >>> fm.fit(X_train, y_train)
    >>> y_pred = fm.predict(X_test)
    >>> y_pred
    {0: 0.4207283701943278, 1: 0.3049753216939303, 2: 0.27429630811174194}
    >>> get_real_prev(y_test)
    {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
    """

    @property
    def is_probabilistic(self) -> bool:
        return True

    def __init__(self, learner: BaseEstimator=None):
        self.learner = learner
        self.CM = None

    def _fit_method(self, X, y):
        """
        Fits the learner and computes the confusion matrix.

        The confusion matrix is computed based on cross-validated predicted labels
        and probabilities. It represents the proportions of confidence scores
        exceeding the priors for each class.

        Parameters
        ----------
        X : array-like
            Feature matrix for training.
        y : array-like
            Target labels for training.

        Returns
        -------
        self : FM
            The fitted instance of FM.
        """
        # Get predicted labels and probabilities using cross-validation
        if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
            y_labels = mq.arguments["y_labels"]
            probabilities = mq.arguments["posteriors_train"]
        else:
            y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)

        # Fit the learner if it hasn't been fitted already
        self.fit_learner(X, y)

        # Initialize the confusion matrix
        CM = np.zeros((self.n_class, self.n_class))

        # Calculate the class priors
        class_counts = np.array([np.count_nonzero(y_labels == _class) for _class in self.classes])
        self.priors = class_counts / len(y_labels)

        # Populate the confusion matrix
        for i, _class in enumerate(self.classes):
            indices = np.where(y_labels == _class)[0]
            CM[:, i] = np.sum(probabilities[indices] > self.priors, axis=0)

        # Normalize the confusion matrix by class counts
        self.CM = CM / class_counts

        return self

    def _predict_method(self, X) -> dict:
        """
        Predicts class prevalences in the test set using the confusion matrix.

        Solves an optimization problem to find class prevalences that best
        align with the observed proportions in the test set.

        Parameters
        ----------
        X : array-like
            Feature matrix for prediction.

        Returns
        -------
        dict
            A dictionary with class labels as keys and their prevalence as values.
        """
        posteriors = self.predict_learner(X)

        # Calculate the estimated prevalences in the test set
        prevs_estim = np.sum(posteriors > self.priors, axis=0) / posteriors.shape[0]

        # Define the objective function for optimization
        def objective(prevs_pred):
            return np.linalg.norm(self.CM @ prevs_pred - prevs_estim)

        # Constraints for the optimization problem
        constraints = [
            {'type': 'eq', 'fun': lambda prevs_pred: np.sum(prevs_pred) - 1.0},
            {'type': 'ineq', 'fun': lambda prevs_pred: prevs_pred}
        ]

        # Initial guess for the optimization
        initial_guess = np.ones(self.CM.shape[1]) / self.CM.shape[1]

        # Solve the optimization problem
        result = minimize(objective, initial_guess, constraints=constraints, bounds=[(0, 1)] * self.CM.shape[1])

        if result.success:
            prevalences = result.x
        else:
            print("Optimization did not converge")
            prevalences = self.priors

        return prevalences


class GAC(AggregativeQuantifier):
    """
    Generalized Adjusted Count (GAC).

    GAC is a quantification method that applies a classifier to estimate the distribution
    of class labels in the test set by solving a system of linear equations. This system
    is constructed using a conditional probability matrix derived from training data and
    is solved via constrained least-squares regression.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible model used for classification.
    train_size : float, optional
        Proportion of the dataset to include in the training split, by default 0.6.
    random_state : int, optional
        Random seed for reproducibility of data splits, by default None.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible model used for classification.
    cond_prob_matrix : np.ndarray
        Conditional probability matrix, representing P(yi|yj).
    train_size : float, optional
        Proportion of the dataset to include in the training split, by default 0.6.
    random_state : int, optional
        Random seed for reproducibility of data splits, by default None.

    References
    ----------
    Firat, Aykut. Unified framework for quantification. arXiv preprint arXiv:1606.00868, 2016.
    Available at: https://arxiv.org/abs/1606.00868

    Examples
    --------
    >>> from mlquantify.utils.general import get_real_prev
    >>> from mlquantify.methods.aggregative import GAC
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_wine(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
    >>>
    >>> gac = GAC(RandomForestClassifier())
    >>> gac.fit(X_train, y_train)
    >>> y_pred = gac.predict(X_test)
    >>> y_pred
    {0: 0.4305555555555556, 1: 0.2916666666666667, 2: 0.2777777777777778}
    >>> get_real_prev(y_test)
    {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
    """

    def __init__(self, learner: BaseEstimator=None, train_size: float = 0.6, random_state: int = None):
        self.learner = learner
        self.cond_prob_matrix = None
        self.train_size = train_size
        self.random_state = random_state

    def _fit_method(self, X, y):
        """
        Trains the model and computes the conditional probability matrix.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the dataset.
        y : pd.Series or np.ndarray
            Labels of the dataset.

        Returns
        -------
        self : GAC
            Fitted quantifier object.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        if self.learner_fitted or self.learner is None:
            y_pred = mq.arguments["y_pred_train"] if mq.arguments["y_pred_train"] is not None else self.predict_learner(X)
            y_label = y
        else:
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
            )
            self.fit_learner(X_train, y_train)
            y_label = y_val
            y_pred = self.learner.predict(X_val)

        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_label, y_pred)
        return self

    def _predict_method(self, X) -> dict:
        """
        Predicts the class prevalences in the test set and adjusts them.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the test dataset.

        Returns
        -------
        dict
            Adjusted class prevalences.
        """
        y_pred = self.predict_learner(X)
        _, counts = np.unique(y_pred, return_counts=True)
        predicted_prevalences = counts / counts.sum()
        adjusted_prevalences = self.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)
        return adjusted_prevalences

    @classmethod
    def get_cond_prob_matrix(cls, classes: list, y_labels: np.ndarray, predictions: np.ndarray) -> np.ndarray:
        """
        Computes the conditional probability matrix P(yi|yj).

        Parameters
        ----------
        classes : list
            List of class labels.
        y_labels : np.ndarray
            True labels from the validation set.
        predictions : np.ndarray
            Predicted labels from the classifier.

        Returns
        -------
        np.ndarray
            Conditional probability matrix.
        """
        CM = confusion_matrix(y_labels, predictions, labels=classes).T
        CM = CM.astype(float)
        class_counts = CM.sum(axis=0)
        for i, _ in enumerate(classes):
            if class_counts[i] == 0:
                CM[i, i] = 1
            else:
                CM[:, i] /= class_counts[i]
        return CM

    @classmethod
    def solve_adjustment(cls, cond_prob_matrix: np.ndarray, predicted_prevalences: np.ndarray) -> np.ndarray:
        """
        Solves the linear system Ax = B to adjust predicted prevalences.

        Parameters
        ----------
        cond_prob_matrix : np.ndarray
            Conditional probability matrix (A).
        predicted_prevalences : np.ndarray
            Predicted class prevalences (B).

        Returns
        -------
        np.ndarray
            Adjusted class prevalences.
        """
        A = cond_prob_matrix
        B = predicted_prevalences
        try:
            adjusted_prevalences = np.linalg.solve(A, B)
            adjusted_prevalences = np.clip(adjusted_prevalences, 0, 1)
            adjusted_prevalences /= adjusted_prevalences.sum()
        except np.linalg.LinAlgError:
            adjusted_prevalences = predicted_prevalences  # Return unadjusted if adjustment fails
        return adjusted_prevalences


class GPAC(AggregativeQuantifier):
    """
    Generalized Probabilistic Adjusted Count (GPAC).

    GPAC is an extension of the Generalized Adjusted Count (GAC) method. It constructs a system of
    linear equations using the confidence scores from probabilistic classifiers, similar to the PAC method.
    The system is solved to estimate the prevalence of classes in a test dataset.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible model used for classification.
    train_size : float, optional
        Proportion of the dataset to include in the training split, by default 0.6.
    random_state : int, optional
        Random seed for reproducibility of data splits, by default None.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible model used for classification.
    cond_prob_matrix : np.ndarray
        Conditional probability matrix representing P(yi|yj).
    train_size : float, optional
        Proportion of the dataset to include in the training split, by default 0.6.
    random_state : int, optional
        Random seed for reproducibility of data splits, by default None.

    References
    ----------
    Firat, Aykut. Unified framework for quantification. arXiv preprint arXiv:1606.00868, 2016.
    Available at: https://arxiv.org/abs/1606.00868

    Examples
    --------
    >>> from mlquantify.utils.general import get_real_prev
    >>> from mlquantify.methods.aggregative import GPAC
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_wine(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
    >>>
    >>> gpac = GPAC(RandomForestClassifier())
    >>> gpac.fit(X_train, y_train)
    >>> y_pred = gpac.predict(X_test)
    >>> y_pred
    {0: 0.41435185185185186, 1: 0.3078703703703704, 2: 0.2777777777777778}
    >>> get_real_prev(y_test)
    {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
    """

    def __init__(self, learner: BaseEstimator=None, train_size: float = 0.6, random_state: int = None):
        self.learner = learner
        self.cond_prob_matrix = None
        self.train_size = train_size
        self.random_state = random_state

    def _fit_method(self, X, y):
        """
        Trains the model and computes the conditional probability matrix using validation data.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the dataset.
        y : pd.Series or np.ndarray
            Labels of the dataset.

        Returns
        -------
        self : GPAC
            Fitted quantifier object.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        if self.learner_fitted or self.learner is None:
            y_pred = mq.arguments["y_pred_train"] if mq.arguments["y_pred_train"] is not None else self.predict_learner(X)
            y_labels = y
        else:
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
            )
            self.fit_learner(X_train, y_train)
            y_labels = y_val
            y_pred = self.predict_learner(X_val)

        # Compute the conditional probability matrix
        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_labels, y_pred)
        return self

    def _predict_method(self, X) -> dict:
        """
        Predicts class prevalences in the test set and adjusts them using the conditional probability matrix.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the test dataset.

        Returns
        -------
        dict
            Adjusted class prevalences.
        """
        predictions = self.predict_learner(X)

        # Compute the distribution of predictions
        predicted_prevalences = np.zeros(self.n_class)
        _, counts = np.unique(predictions, return_counts=True)
        predicted_prevalences[:len(counts)] = counts
        predicted_prevalences /= predicted_prevalences.sum()

        # Adjust prevalences using the conditional probability matrix
        adjusted_prevalences = GAC.solve_adjustment(self.cond_prob_matrix, predicted_prevalences)
        return adjusted_prevalences

    @classmethod
    def get_cond_prob_matrix(cls, classes: list, y_labels: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
        """
        Computes the conditional probability matrix P(yi|yj).

        Parameters
        ----------
        classes : list
            List of class labels.
        y_labels : np.ndarray
            True labels from the validation set.
        y_pred : np.ndarray
            Predicted probabilities or labels from the classifier.

        Returns
        -------
        np.ndarray
            Conditional probability matrix with entry (i, j) representing P(yi|yj).
        """
        n_classes = len(classes)
        cond_prob_matrix = np.eye(n_classes)

        for i, class_ in enumerate(classes):
            class_indices = y_labels == class_
            if class_indices.any():
                cond_prob_matrix[i] = y_pred[class_indices].mean(axis=0)

        return cond_prob_matrix.T


class PCC(AggregativeQuantifier):
    """
    Probabilistic Classify and Count (PCC).

    PCC is a quantification method that uses a probabilistic classifier to estimate
    class prevalences in a test dataset. It computes the mean of the predicted
    probabilities for each class to determine their prevalences.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible probabilistic classifier.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible probabilistic classifier.

    References
    ----------
    BELLA, Antonio et al. Quantification via probability estimators. In: 2010 IEEE International Conference on Data Mining. IEEE, 2010. p. 737-742. Available at: https://ieeexplore.ieee.org/abstract/document/5694031

    Examples
    --------
    >>> from mlquantify.utils.general import get_real_prev
    >>> from mlquantify.methods.aggregative import PCC
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_wine(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
    >>>
    >>> pcc = PCC(RandomForestClassifier())
    >>> pcc.fit(X_train, y_train)
    >>> y_pred = pcc.predict(X_test)
    >>> y_pred
    {0: 0.4036111111111111, 1: 0.3427777777777778, 2: 0.2536111111111111}
    >>> get_real_prev(y_test)
    {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
    """

    @property
    def is_probabilistic(self) -> bool:
        return True

    def __init__(self, learner: BaseEstimator=None):
        self.learner = learner

    def _fit_method(self, X, y):
        """
        Fits the learner to the training data.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the training dataset.
        y : pd.Series or np.ndarray
            Labels of the training dataset.

        Returns
        -------
        self : PCC
            Fitted quantifier object.
        """
        self.fit_learner(X, y)
        return self

    def _predict_method(self, X) -> np.ndarray:
        """
        Predicts class prevalences in the test dataset by averaging the predicted probabilities.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the test dataset.

        Returns
        -------
        np.ndarray
            Estimated prevalences for each class.
        """
        # Initialize a list to store the prevalence for each class
        prevalences = []

        # Calculate the prevalence for each class
        for class_index in range(self.n_class):
            # Get the predicted probabilities for the current class
            class_probabilities = self.predict_learner(X)[:, class_index]

            # Compute the average probability (prevalence) for the current class
            mean_prev = np.mean(class_probabilities)
            prevalences.append(mean_prev)

        return np.asarray(prevalences)


class PACC(AggregativeQuantifier):
    """
    Probabilistic Adjusted Classify and Count (PACC).

    This method extends the Adjusted Classify and Count (AC) approach
    by leveraging the average class-conditional confidences obtained
    from a probabilistic classifier instead of relying solely on true
    positive and false positive rates.

    Parameters
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier to be used for quantification.
    threshold : float, optional
        The decision threshold for classification. Default is 0.5.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn compatible classifier.
    threshold : float
        Decision threshold for classification. Default is 0.5.
    tpr : float
        True positive rate computed during the fitting process.
    fpr : float
        False positive rate computed during the fitting process.

    See Also
    --------
    ThresholdOptimization : Base class for threshold-based quantification methods.
    ACC : Adjusted Classify and Count quantification method.
    CC : Classify and Count quantification method.

    References
    ----------
    A. Bella, C. Ferri, J. Hernández-Orallo and M. J. Ramírez-Quintana, "Quantification via Probability Estimators," 2010 IEEE International Conference on Data Mining, Sydney, NSW, Australia, 2010, pp. 737-742, doi: 10.1109/ICDM.2010.75. Available at: https://ieeexplore.ieee.org/abstract/document/5694031

    Examples
    --------
    >>> from mlquantify.methods.aggregative import PACC
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.svm import SVC
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    >>>
    >>> pacc = PACC(learner=SVC(probability=True))
    >>> pacc.fit(X_train, y_train)
    >>> y_pred = pacc.predict(X_test)
    >>> y_pred
    {0: 0.4664886119311328, 1: 0.5335113880688672}
    >>> get_real_prev(y_test)
    {0: 0.3991228070175439, 1: 0.6008771929824561}
    """

    def __init__(self, learner: BaseEstimator=None, threshold: float = 0.5):
        self.learner = learner
        self.threshold = threshold
        self.mean_pos = None
        self.mean_neg = None

    @property
    def is_probabilistic(self) -> bool:
        return True

    @property
    def is_multiclass(self) -> bool:
        return False

    def _fit_method(self, X, y):
        # Get predicted labels and probabilities
        if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
            y_labels = mq.arguments["y_labels"]
            probabilities = mq.arguments["posteriors_train"]
        else:
            y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)

        # Adjust thresholds and compute true and false positive rates
        self.mean_pos = np.mean(probabilities[y_labels == self.classes[1], 1])
        self.mean_neg = np.mean(probabilities[y_labels != self.classes[1], 1])

        return self

    def _predict_method(self, X):
        """
        Predicts the class prevalence using the mean class-conditional
        probabilities from a probabilistic classifier.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            The input data for prediction.

        Returns
        -------
        dict
            A dictionary with class labels as keys and their respective
            prevalence estimates as values.

        Notes
        -----
        The prevalence is adjusted using the formula:
        prevalence = |mean_score - FPR| / (TPR - FPR),
        where mean_score is the average probability for the positive class.

        Raises
        ------
        ZeroDivisionError
            If `TPR - FPR` equals zero, indicating that the classifier's
            performance does not vary across the threshold range.
        """
        prevalences = {}

        # Calculate probabilities for the positive class
        probabilities = self.predict_learner(X)[:, 1]

        # Compute the mean score for the positive class
        mean_scores = np.mean(probabilities)

        # Adjust prevalence based on TPR and FPR
        if self.mean_pos - self.mean_neg == 0:
            prevalence = mean_scores
        else:
            prevalence = np.clip(abs(mean_scores - self.mean_neg) / (self.mean_pos - self.mean_neg), 0, 1)

        # Map the computed prevalence to the class labels
        prevalences[self.classes[0]] = 1 - prevalence
        prevalences[self.classes[1]] = prevalence

        return prevalences


class PWK(AggregativeQuantifier):
    """
    Nearest-Neighbor Based Quantification (PWK).

    PWK extends nearest-neighbor classification to the quantification setting.
    This k-NN approach uses a weighting scheme that reduces the influence of
    neighbors from the majority class to better estimate class prevalences.

    Attributes
    ----------
    learner : BaseEstimator
        A scikit-learn-compatible classifier that implements a k-NN approach.

    Notes
    -----
    For the intended behaviour, use the `PWKCLF` classifier, a k-NN classifier
    that applies the class-weighting scheme described above.

    References
    ----------
    BARRANQUERO, Jose et al. On the study of nearest neighbor algorithms for prevalence estimation in binary problems. Pattern Recognition, v. 46, n. 2, p. 472-482, 2013. Available at: https://www.sciencedirect.com/science/article/pii/S0031320312003391

    Examples
    --------
    >>> from mlquantify.utils.general import get_real_prev
    >>> from mlquantify.methods.aggregative import PWK
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_wine
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_wine(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=32)
    >>>
    >>> pwk = PWK(RandomForestClassifier())
    >>> pwk.fit(X_train, y_train)
    >>> y_pred = pwk.predict(X_test)
    >>> y_pred
    {0: 0.4305555555555556, 1: 0.2916666666666667, 2: 0.2777777777777778}
    >>> get_real_prev(y_test)
    {0: 0.4166666666666667, 1: 0.3194444444444444, 2: 0.2638888888888889}
    """

    def __init__(self, learner: BaseEstimator=None):
        self.learner = learner

    def _fit_method(self, X, y):
        """
        Fits the k-NN learner to the training data.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the training dataset.
        y : pd.Series or np.ndarray
            Labels of the training dataset.

        Returns
        -------
        self : PWK
            Fitted quantifier object.
        """
        self.fit_learner(X, y)
        return self

    def _predict_method(self, X) -> dict:
        """
        Predicts class prevalences in the test dataset by analyzing the distribution of predicted labels.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray
            Features of the test dataset.

        Returns
        -------
        dict
            A dictionary mapping each class label to its estimated prevalence.
        """
        # Predict class labels for the given data
        predicted_labels = self.predict_learner(X)

        # Compute the distribution of predicted labels
        unique_labels, label_counts = np.unique(predicted_labels, return_counts=True)

        # Calculate the prevalence for each class
        class_prevalences = label_counts / label_counts.sum()

        # Map each class label to its prevalence
        prevalences = {label: prevalence for label, prevalence in zip(unique_labels, class_prevalences)}

        return prevalences


from . import threshold_optimization

ACC = threshold_optimization.ACC
T50 = threshold_optimization.T50
MAX = threshold_optimization.MAX
X_method = threshold_optimization.X_method
MS = threshold_optimization.MS
MS2 = threshold_optimization.MS2


from . import mixture_models

DySsyn = mixture_models.DySsyn
DyS = mixture_models.DyS
HDy = mixture_models.HDy
SMM = mixture_models.SMM
SORD = mixture_models.SORD
```