mlquantify 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +0 -29
- mlquantify/adjust_counting/__init__.py +14 -0
- mlquantify/adjust_counting/_adjustment.py +365 -0
- mlquantify/adjust_counting/_base.py +247 -0
- mlquantify/adjust_counting/_counting.py +145 -0
- mlquantify/adjust_counting/_utils.py +114 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +335 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +161 -0
- mlquantify/likelihood/_classes.py +414 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +761 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +153 -0
- mlquantify/mixture/_classes.py +400 -0
- mlquantify/mixture/_utils.py +112 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +198 -0
- mlquantify/neighbors/_classes.py +159 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
- mlquantify/neighbors/_kde.py +270 -0
- mlquantify/neighbors/_utils.py +135 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +61 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
- mlquantify-0.1.9.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
mlquantify/methods/mixture_models.py
@@ -1,1003 +0,0 @@
-from abc import abstractmethod
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ..base import AggregativeQuantifier
-
-from ..utils.general import get_real_prev
-from ..utils.method import *
-import mlquantify as mq
-
-
-
-
-class MixtureModel(AggregativeQuantifier):
-    """Mixtures of Score Distributions
-
-    MixtureModel is a generic class for methods based on mixture models.
-    The main idea is that the cumulative distribution of scores assigned
-    to data points in the test set is a mixture of the score distributions
-    from the training set (positive and negative classes).
-
-    Parameters
-    ----------
-    learner : BaseEstimator
-        A scikit-learn compatible classifier that supports `predict_proba`.
-
-    Attributes
-    ----------
-    learner : BaseEstimator
-        A scikit-learn compatible classifier that provides predictive probabilities.
-    pos_scores : np.ndarray
-        Score distribution for the positive class in the training data.
-    neg_scores : np.ndarray
-        Score distribution for the negative class in the training data.
-
-    Notes
-    -----
-    All methods that inherit from MixtureModel are binary quantifiers. For multiclass problems, a One-vs-All strategy is applied.
-
-    Examples
-    --------
-    >>> from mlquantify.methods.mixture_models import MixtureModel
-    >>> from mlquantify.utils.general import get_real_prev
-    >>> from mlquantify.utils.method import getHist
-    >>> from sklearn.model_selection import train_test_split
-    >>> from sklearn.datasets import load_breast_cancer
-    >>> from sklearn.ensemble import RandomForestClassifier
-    >>> import numpy as np
-    >>>
-    >>> class MyMixtureModel(MixtureModel):
-    ...     def __init__(self, learner, param):
-    ...         super().__init__(learner)
-    ...         self.param = param
-    ...     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-    ...         hist_pos = getHist(self.pos_scores, self.param)
-    ...         hist_neg = getHist(self.neg_scores, self.param)
-    ...         hist_test = getHist(test_scores, self.param)
-    ...         mixture = hist_test * (hist_pos + hist_neg)
-    ...         return np.sum(mixture)
-    >>>
-    >>> features, target = load_breast_cancer(return_X_y=True)
-    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-    >>>
-    >>> mm = MyMixtureModel(RandomForestClassifier(), 10)
-    >>> mm.fit(X_train, y_train)
-    >>> prevalence = mm.predict(X_test)
-    >>> prevalence
-    {0: 0.3622419419517543, 1: 0.6377580580482457}
-    >>> get_real_prev(y_test)
-    {0: 0.37719298245614036, 1: 0.6228070175438597}
-    """
-
-    def __init__(self, learner: BaseEstimator = None):
-        self.learner = learner
-        self.pos_scores = None
-        self.neg_scores = None
-
-    @property
-    def is_multiclass(self) -> bool:
-        """
-        Indicates whether the model supports multiclass classification.
-
-        Returns
-        -------
-        bool
-            Always returns False, as MixtureModel supports only binary classification.
-        """
-        return False
-
-    @property
-    def is_probabilistic(self) -> bool:
-        return True
-
-    def _fit_method(self, X, y):
-        """
-        Fits the positive and negative score distributions using cross-validation.
-
-        Parameters
-        ----------
-        X : np.ndarray
-            Training feature matrix.
-        y : np.ndarray
-            Training labels.
-
-        Returns
-        -------
-        self : MixtureModel
-            The fitted MixtureModel instance.
-        """
-        if mq.arguments["y_labels"] is not None and mq.arguments["posteriors_train"] is not None:
-            y_labels = mq.arguments["y_labels"]
-            probabilities = mq.arguments["posteriors_train"]
-        else:
-            y_labels, probabilities = get_scores(X, y, self.learner, self.cv_folds, self.learner_fitted)
-
-        # Separate positive and negative scores based on labels
-        self.pos_scores = probabilities[y_labels == self.classes[1]][:, 1]
-        self.neg_scores = probabilities[y_labels == self.classes[0]][:, 1]
-
-        return self
-
-    def _predict_method(self, X) -> dict:
-        """
-        Predicts class prevalences for the test data.
-
-        Parameters
-        ----------
-        X : np.ndarray
-            Test feature matrix.
-
-        Returns
-        -------
-        np.ndarray
-            An array containing the prevalence for each class.
-        """
-        # Get the predicted probabilities for the positive class
-        test_scores = self.predict_learner(X)[:, 1]
-
-        # Compute the prevalence using the mixture model
-        prevalence = np.clip(self._compute_prevalence(test_scores), 0, 1)
-
-        # Return the prevalence as a distribution over the classes
-        return np.asarray([1 - prevalence, prevalence])
-
-    @abstractmethod
-    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-        """
-        Abstract method to compute prevalence using the test scores.
-        Subclasses must implement this method.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Probabilities for the positive class in the test set.
-
-        Returns
-        -------
-        float
-            The computed prevalence for the positive class.
-        """
-        pass
-
-    def get_distance(self, dist_train, dist_test, measure: str) -> float:
-        """
-        Computes the distance between training and test distributions using a specified metric.
-
-        Parameters
-        ----------
-        dist_train : np.ndarray
-            Distribution of scores for the training data.
-        dist_test : np.ndarray
-            Distribution of scores for the test data.
-        measure : str
-            The metric to use for distance calculation. Supported values are
-            'topsoe', 'probsymm', 'hellinger', and 'euclidean'.
-
-        Returns
-        -------
-        float
-            The computed distance between the two distributions.
-
-        Raises
-        ------
-        ValueError
-            If the input distributions have mismatched sizes or are zero vectors.
-        """
-        # Validate input distributions
-        if np.sum(dist_train) < 1e-20 or np.sum(dist_test) < 1e-20:
-            raise ValueError("One or both vectors are zero (empty)...")
-        if len(dist_train) != len(dist_test):
-            raise ValueError("Arrays need to be of equal size...")
-
-        # Avoid division by zero by replacing small values
-        dist_train = np.maximum(dist_train, 1e-20)
-        dist_test = np.maximum(dist_test, 1e-20)
-
-        # Compute the distance based on the selected metric
-        if measure == 'topsoe':
-            return topsoe(dist_train, dist_test)
-        elif measure == 'probsymm':
-            return probsymm(dist_train, dist_test)
-        elif measure == 'hellinger':
-            return hellinger(dist_train, dist_test)
-        elif measure == 'euclidean':
-            return sqEuclidean(dist_train, dist_test)
-        else:
-            return 100  # Default value for unknown metrics
-
-
-
-
-
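The get_distance dispatcher above relies on topsoe, probsymm, hellinger, and sqEuclidean, which come in through the star import from ..utils.method and are not part of this diff. As a reference point only, a minimal sketch assuming the textbook definitions of these dissimilarities (the inputs are already clamped to at least 1e-20 by get_distance, so the logs and divisions are safe):

    import numpy as np

    def topsoe(p, q):
        # Topsoe divergence: a bounded, symmetrized relative entropy.
        return np.sum(p * np.log(2 * p / (p + q)) + q * np.log(2 * q / (p + q)))

    def probsymm(p, q):
        # Probabilistic symmetric chi-square distance.
        return 2 * np.sum((p - q) ** 2 / (p + q))

    def hellinger(p, q):
        # Hellinger distance via the Bhattacharyya coefficient
        # (assumes p and q are normalized binned densities).
        return np.sqrt(1 - np.sum(np.sqrt(p * q)))

    def sqEuclidean(p, q):
        # Squared Euclidean distance between the binned densities.
        return np.sum((p - q) ** 2)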
-class DyS(MixtureModel):
-    """
-    Distribution y-Similarity (DyS) framework.
-
-    DyS is a method that generalizes the HDy approach by
-    considering the dissimilarity function DS as a parameter
-    of the model.
-
-    Parameters
-    ----------
-    learner : BaseEstimator
-        A probabilistic classifier implementing the `predict_proba` method.
-    measure : str, optional
-        The metric used to compare distributions. Options are:
-        - "hellinger"
-        - "topsoe"
-        - "probsymm"
-        Default is "topsoe".
-    bins_size : np.ndarray, optional
-        Array of bin sizes for histogram computation.
-        Default is np.append(np.linspace(2, 20, 10), 30).
-
-    Attributes
-    ----------
-    bins_size : np.ndarray
-        Bin sizes used for histogram calculations.
-    measure : str
-        Selected distance metric.
-    prevs : np.ndarray
-        Array of prevalences that minimize the distances.
-
-    References
-    ----------
-    MALETZKE, André; DOS REIS, Denis; CHERMAN, Everton; BATISTA, Gustavo. DyS: A framework for mixture models in quantification. In: Proceedings of the AAAI Conference on Artificial Intelligence. 2019. Available at https://ojs.aaai.org/index.php/AAAI/article/view/4376
-
-    Examples
-    --------
-    >>> from mlquantify.methods.mixture_models import DyS
-    >>> from mlquantify.utils.general import get_real_prev
-    >>> from sklearn.ensemble import RandomForestClassifier
-    >>> from sklearn.datasets import load_breast_cancer
-    >>> from sklearn.model_selection import train_test_split
-    >>>
-    >>> features, target = load_breast_cancer(return_X_y=True)
-    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-    >>>
-    >>> dys = DyS(RandomForestClassifier())
-    >>> dys.fit(X_train, y_train)
-    >>> prevalence = dys.predict(X_test)
-    >>> prevalence
-    {0: 0.3736714619191387, 1: 0.6263285380808613}
-    >>> get_real_prev(y_test)
-    {0: 0.37719298245614036, 1: 0.6228070175438597}
-    """
-
-    def __init__(self, learner: BaseEstimator = None, measure: str = "topsoe", bins_size: np.ndarray = None):
-        assert measure in ["hellinger", "topsoe", "probsymm"], "Invalid measure."
-        super().__init__(learner)
-
-        # Set up bins_size
-        if bins_size is None:
-            bins_size = np.append(np.linspace(2, 20, 10), 30)
-        if isinstance(bins_size, list):
-            bins_size = np.asarray(bins_size)
-
-        self.bins_size = bins_size
-        self.measure = measure
-        self.prevs = None  # Array of prevalences that minimize the distances
-
-    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-        """
-        Compute the prevalence estimate based on the test scores.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        prevalence : float
-            Estimated prevalence.
-        """
-        prevs = self.GetMinDistancesDyS(test_scores)
-        # Use the median of the prevalences as the final estimate
-        prevalence = np.median(prevs)
-
-        return prevalence
-
-    def best_distance(self, X_test: np.ndarray) -> float:
-        """
-        Calculate the minimum distance between test scores and train distributions.
-
-        Parameters
-        ----------
-        X_test : np.ndarray
-            Test data to evaluate.
-
-        Returns
-        -------
-        distance : float
-            The minimum distance value.
-        """
-        test_scores = self.predict_learner(X_test)[:, 1]
-        prevs = np.asarray(self.GetMinDistancesDyS(test_scores))
-
-        size = len(prevs)
-        best_prev = np.median(prevs)
-
-        if size % 2 != 0:  # Odd
-            index = np.argmax(prevs == best_prev)
-            bin_size = self.bins_size[index]
-        else:  # Even
-            # Sort the prevalences
-            ordered_prevs = np.sort(prevs)
-            # Get the two middle indices
-            middle1 = size // 2 - 1
-            middle2 = size // 2
-            # Find the values corresponding to the median positions
-            median1 = ordered_prevs[middle1]
-            median2 = ordered_prevs[middle2]
-            # Find the indices of these medians
-            index1 = np.argmax(prevs == median1)
-            index2 = np.argmax(prevs == median2)
-            # Compute the average bin size
-            bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
-
-        # Compute histogram densities
-        pos_bin_density = getHist(self.pos_scores, bin_size)
-        neg_bin_density = getHist(self.neg_scores, bin_size)
-        test_bin_density = getHist(test_scores, bin_size)
-
-        # Combine densities
-        train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
-
-        # Compute the distance
-        distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-        return distance
-
-    def GetMinDistancesDyS(self, test_scores: np.ndarray) -> list:
-        """
-        Compute prevalence by evaluating the distance metric across bin sizes.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        prevs : list
-            List of prevalence estimates minimizing the distance for each bin size.
-        """
-        prevs = []
-
-        # Iterate over each bin size
-        for bins in self.bins_size:
-            # Compute histogram densities
-            pos_bin_density = getHist(self.pos_scores, bins)
-            neg_bin_density = getHist(self.neg_scores, bins)
-            test_bin_density = getHist(test_scores, bins)
-
-            # Define the function to minimize
-            def f(x):
-                # Combine densities
-                train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                # Compute the distance
-                return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-            # Use ternary search to minimize the distance
-            prevs.append(ternary_search(0, 1, f))
-
-        return prevs
-
-
-
-
-
-
-
-
-
-
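GetMinDistancesDyS leans on ternary_search, another helper imported from ..utils.method and not shown in this diff. For orientation, a minimal sketch assuming the helper follows the usual contract of minimizing a unimodal function over an interval:

    def ternary_search(left, right, f, tol=1e-4):
        # Shrink [left, right] around the minimum of a unimodal f.
        while right - left > tol:
            m1 = left + (right - left) / 3
            m2 = right - (right - left) / 3
            if f(m1) > f(m2):
                left = m1   # the minimum cannot lie in [left, m1)
            else:
                right = m2  # the minimum cannot lie in (m2, right]
        return (left + right) / 2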
-class DySsyn(MixtureModel):
-    """Synthetic Distribution y-Similarity (DySsyn).
-
-    This method works similarly to the DyS method, but instead of using the
-    train scores, it generates them via MoSS (Model for Synthetic Scores).
-    MoSS creates a spectrum of score distributions ranging from highly separated
-    to fully mixed scores.
-
-    Parameters
-    ----------
-    learner : BaseEstimator
-        A probabilistic classifier implementing the `predict_proba` method.
-    measure : str, optional
-        The metric used to compare distributions. Options are:
-        - "hellinger"
-        - "topsoe"
-        - "probsymm"
-        Default is "topsoe".
-    merge_factor : np.ndarray, optional
-        Array controlling the mixing level of synthetic distributions.
-        Default is np.linspace(0.1, 0.4, 10).
-    bins_size : np.ndarray, optional
-        Array of bin sizes for histogram computation.
-        Default is np.append(np.linspace(2, 20, 10), 30).
-    alpha_train : float, optional
-        Initial estimate of the training prevalence. Default is 0.5.
-    n : int, optional
-        Number of synthetic samples generated. Default is None, in which
-        case the test set size is used.
-
-    Attributes
-    ----------
-    bins_size : np.ndarray
-        Bin sizes used for histogram calculations.
-    merge_factor : np.ndarray
-        Mixing factors for generating synthetic score distributions.
-    alpha_train : float
-        Training prevalence of the positive class, computed from y during fit.
-    n : int
-        Number of samples generated during synthetic distribution creation.
-    measure : str
-        Selected distance metric.
-    m : None or float
-        Best mixing factor determined during computation.
-
-    References
-    ----------
-    MALETZKE, André et al. Accurately quantifying under score variability. In: 2021 IEEE International Conference on Data Mining (ICDM). IEEE, 2021. p. 1228-1233. Available at https://ieeexplore.ieee.org/abstract/document/9679104
-
-    Examples
-    --------
-    >>> from mlquantify.methods.mixture_models import DySsyn
-    >>> from mlquantify.utils.general import get_real_prev
-    >>> from sklearn.ensemble import RandomForestClassifier
-    >>> from sklearn.datasets import load_breast_cancer
-    >>> from sklearn.model_selection import train_test_split
-    >>>
-    >>> features, target = load_breast_cancer(return_X_y=True)
-    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-    >>>
-    >>> dyssyn = DySsyn(RandomForestClassifier())
-    >>> dyssyn.fit(X_train, y_train)
-    >>> prevalence = dyssyn.predict(X_test)
-    >>> prevalence
-    {0: 0.3606413872681201, 1: 0.6393586127318799}
-    >>> get_real_prev(y_test)
-    {0: 0.37719298245614036, 1: 0.6228070175438597}
-    """
-
-    def __init__(self, learner: BaseEstimator = None, measure: str = "topsoe", merge_factor: np.ndarray = None, bins_size: np.ndarray = None, alpha_train: float = 0.5, n: int = None):
-        assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
-        super().__init__(learner)
-
-        # Set up bins_size (explicit None checks, so ndarray inputs are not coerced to bool)
-        if bins_size is None:
-            bins_size = np.append(np.linspace(2, 20, 10), 30)
-        if isinstance(bins_size, list):
-            bins_size = np.asarray(bins_size)
-
-        if merge_factor is None:
-            merge_factor = np.linspace(0.1, 0.4, 10)
-
-        self.bins_size = bins_size
-        self.merge_factor = merge_factor
-        self.alpha_train = alpha_train
-        self.n = n
-        self.measure = measure
-        self.m = None
-
-    def _fit_method(self, X, y):
-        """
-        Fits the learner and calculates the training prevalence.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Training data.
-        y : array-like of shape (n_samples,)
-            Training labels.
-
-        Returns
-        -------
-        self : DySsyn
-            The fitted DySsyn instance.
-        """
-        self.fit_learner(X, y)
-
-        self.alpha_train = list(get_real_prev(y).values())[1]
-
-        return self
-
-    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-        """
-        Computes the prevalence estimate using test scores.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        prevalence : float
-            Estimated prevalence based on the minimum distance
-            across synthetic distributions.
-        """
-        distances = self.GetMinDistancesDySsyn(test_scores)
-
-        # Select the merge factor whose best mixture yields the smallest distance
-        index = min(distances, key=lambda d: distances[d][0])
-        prevalence = distances[index][1]
-
-        return prevalence
-
-    def best_distance(self, X_test):
-        """
-        Computes the minimum distance between test scores and synthetic distributions of MoSS.
-
-        Parameters
-        ----------
-        X_test : array-like of shape (n_samples, n_features)
-            Test data.
-
-        Returns
-        -------
-        distance : float
-            Minimum distance value for the test data.
-        """
-        test_scores = self.predict_learner(X_test)[:, 1]
-
-        distances = self.GetMinDistancesDySsyn(test_scores)
-
-        index = min(distances, key=lambda d: distances[d][0])
-
-        distance = distances[index][0]
-
-        return distance
-
-    def GetMinDistancesDySsyn(self, test_scores: np.ndarray) -> dict:
-        """
-        Calculates the minimum distances between test scores and synthetic distributions of MoSS
-        across various bin sizes and merge factors.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        values : dict
-            Dictionary mapping each merge factor (m) to a tuple containing:
-            - The minimum distance value.
-            - The corresponding prevalence estimate.
-        """
-        if self.n is None:
-            self.n = len(test_scores)
-
-        values = {}
-
-        # Iterate over each merge factor
-        for m in self.merge_factor:
-            pos_scores, neg_scores = MoSS(self.n, self.alpha_train, m)
-            prevs = []
-            for bins in self.bins_size:
-                # Compute histogram densities for positive, negative, and test scores
-                pos_bin_density = getHist(pos_scores, bins)
-                neg_bin_density = getHist(neg_scores, bins)
-                test_bin_density = getHist(test_scores, bins)
-
-                # Define the function to minimize
-                def f(x):
-                    # Combine densities using a mixture of positive and negative densities
-                    train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                    # Calculate the distance between combined density and test density
-                    return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-                # Use ternary search to find the best x that minimizes the distance
-                prevs.append(ternary_search(0, 1, f))
-
-            prevs = np.asarray(prevs)
-            size = len(prevs)
-            best_prev = np.median(prevs)
-
-            if size % 2 != 0:  # Odd
-                index = np.argmax(prevs == best_prev)
-                bin_size = self.bins_size[index]
-            else:  # Even
-                # Sort the values in prevs
-                ordered_prevs = np.sort(prevs)
-
-                # Find the two middle indices
-                middle1 = size // 2 - 1
-                middle2 = size // 2
-
-                # Get the values corresponding to the median positions
-                median1 = ordered_prevs[middle1]
-                median2 = ordered_prevs[middle2]
-
-                # Find the indices of median1 and median2 in prevs
-                index1 = np.argmax(prevs == median1)
-                index2 = np.argmax(prevs == median2)
-
-                # Calculate the average of the corresponding bin sizes
-                bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
-
-            pos_bin_density = getHist(pos_scores, bin_size)
-            neg_bin_density = getHist(neg_scores, bin_size)
-            test_bin_density = getHist(test_scores, bin_size)
-
-            train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
-
-            distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-            values[m] = (distance, best_prev)
-
-        return values
-
-
-
-
-
-
-
-
-
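DySsyn's only departure from DyS is that pos_scores and neg_scores come from MoSS(self.n, self.alpha_train, m) rather than from cross-validated classifier scores. The MoSS generator lives in ..utils.method and is not part of this diff; the sketch below is one plausible construction (the exact parameterization in mlquantify and in the ICDM 2021 paper may differ), meant only to convey how a single merge factor m can sweep from separated to mixed scores:

    import numpy as np

    def moss_like(n, alpha, m, rng=None):
        # Hypothetical MoSS-style generator using power-transformed uniforms.
        # A small merge factor m gives a large exponent 1/m, pushing negative
        # scores toward 0 and positive scores toward 1 (well separated);
        # as m grows toward 1 both tend to uniform (fully mixed).
        rng = np.random.default_rng(rng)
        n_pos = int(round(n * alpha))
        pos_scores = 1 - rng.uniform(size=n_pos) ** (1 / m)
        neg_scores = rng.uniform(size=n - n_pos) ** (1 / m)
        return pos_scores, neg_scores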
-class HDy(MixtureModel):
-    """
-    Hellinger Distance Minimization (HDy) framework.
-
-    HDy is based on computing the Hellinger distance between two distributions:
-    the test distribution and the mixture of the positive and negative
-    distributions from the training data.
-
-    Parameters
-    ----------
-    learner : BaseEstimator
-        A supervised learning model implementing a `predict_proba` method.
-
-    Attributes
-    ----------
-    pos_scores : np.ndarray
-        Score distribution for the positive class in the training data.
-    neg_scores : np.ndarray
-        Score distribution for the negative class in the training data.
-
-    References
-    ----------
-    GONZÁLEZ-CASTRO, Víctor; ALAIZ-RODRÍGUEZ, Rocío; ALEGRE, Enrique. Class distribution estimation based on the Hellinger distance. Information Sciences, v. 218, p. 146-164, 2013. Available at https://www.sciencedirect.com/science/article/abs/pii/S0020025512004069
-
-    Examples
-    --------
-    >>> from mlquantify.methods.mixture_models import HDy
-    >>> from mlquantify.utils.general import get_real_prev
-    >>> from sklearn.ensemble import RandomForestClassifier
-    >>> from sklearn.datasets import load_breast_cancer
-    >>> from sklearn.model_selection import train_test_split
-    >>>
-    >>> features, target = load_breast_cancer(return_X_y=True)
-    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-    >>>
-    >>> hdy = HDy(RandomForestClassifier())
-    >>> hdy.fit(X_train, y_train)
-    >>> prevalence = hdy.predict(X_test)
-    >>> prevalence
-    {0: 0.33999999999999997, 1: 0.66}
-    >>> get_real_prev(y_test)
-    {0: 0.37719298245614036, 1: 0.6228070175438597}
-    """
-
-    def __init__(self, learner: BaseEstimator = None):
-        super().__init__(learner)
-
-    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-        """
-        Compute the prevalence estimate based on test scores.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        prevalence : float
-            Estimated prevalence.
-        """
-        best_alphas, _ = self.GetMinDistancesHDy(test_scores)
-        # Use the median of the best alpha values as the final prevalence estimate
-        prevalence = np.median(best_alphas)
-
-        return prevalence
-
-    def best_distance(self, X_test: np.ndarray) -> float:
-        """
-        Calculate the minimum Hellinger distance for the test data.
-
-        Parameters
-        ----------
-        X_test : np.ndarray
-            Test data to evaluate.
-
-        Returns
-        -------
-        distance : float
-            The minimum distance value.
-        """
-        test_scores = self.predict_learner(X_test)[:, 1]
-        _, distances = self.GetMinDistancesHDy(test_scores)
-
-        size = len(distances)
-
-        if size % 2 != 0:  # Odd
-            index = size // 2
-            distance = distances[index]
-        else:  # Even
-            # Find the two middle indices
-            middle1 = size // 2 - 1
-            middle2 = size // 2
-            # Compute the average of the corresponding distances
-            distance = np.mean([distances[middle1], distances[middle2]])
-
-        return distance
-
-    def GetMinDistancesHDy(self, test_scores: np.ndarray) -> tuple:
-        """
-        Compute prevalence by minimizing the Hellinger distance across bins and alphas.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        best_alphas : list
-            List of alpha values that minimize the Hellinger distance for each bin size.
-        distances : list
-            List of minimum distances corresponding to the best alphas for each bin size.
-        """
-        # Define bin sizes and alpha values
-        bins_size = np.arange(10, 110, 11)  # Bin sizes from 10 to 109 in steps of 11
-        alpha_values = np.round(np.linspace(0, 1, 101), 2)  # Alpha values from 0 to 1, rounded to 2 decimal places
-
-        best_alphas = []
-        distances = []
-
-        for bins in bins_size:
-            # Compute histogram densities for positive, negative, and test scores
-            pos_bin_density = getHist(self.pos_scores, bins)
-            neg_bin_density = getHist(self.neg_scores, bins)
-            test_bin_density = getHist(test_scores, bins)
-
-            bin_distances = []
-
-            # Evaluate distance for each alpha value
-            for x in alpha_values:
-                # Combine densities using a mixture of positive and negative densities
-                train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                # Compute the distance using the Hellinger measure
-                bin_distances.append(self.get_distance(train_combined_density, test_bin_density, measure="hellinger"))
-
-            # Find the alpha value that minimizes the distance
-            best_alpha = alpha_values[np.argmin(bin_distances)]
-            min_distance = min(bin_distances)
-
-            best_alphas.append(best_alpha)
-            distances.append(min_distance)
-
-        return best_alphas, distances
-
-
-
-
-
-
-
-
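Every method in this hunk funnels scores through getHist before comparing distributions. That helper is defined in ..utils.method and isn't part of this diff; a minimal sketch under the assumption that it returns a normalized histogram over the score range [0, 1]:

    import numpy as np

    def getHist(scores, n_bins):
        # Hypothetical stand-in: bin the scores over [0, 1] and normalize
        # the counts so the returned densities sum to 1.
        counts, _ = np.histogram(scores, bins=int(n_bins), range=(0.0, 1.0))
        total = counts.sum()
        return counts / total if total > 0 else counts.astype(float)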
-class SMM(MixtureModel):
-    """
-    Sample Mean Matching (SMM).
-
-    A member of the DyS framework that estimates the prevalence
-    of the positive class in a test dataset by leveraging simple
-    mean values to represent the score distributions for positive,
-    negative, and unlabeled data.
-
-    Parameters
-    ----------
-    learner : BaseEstimator
-        A supervised learning model implementing a `predict_proba` method.
-
-    Attributes
-    ----------
-    pos_scores : np.ndarray
-        Score distribution for the positive class in the training data.
-    neg_scores : np.ndarray
-        Score distribution for the negative class in the training data.
-
-    References
-    ----------
-    HASSAN, Waqar; MALETZKE, André; BATISTA, Gustavo. Accurately quantifying a billion instances per second. In: 2020 IEEE 7th International Conference on Data Science and Advanced Analytics (DSAA). IEEE, 2020. p. 1-10. Available at https://ieeexplore.ieee.org/document/9260028
-
-    Examples
-    --------
-    >>> from mlquantify.methods.mixture_models import SMM
-    >>> from mlquantify.utils.general import get_real_prev
-    >>> from sklearn.ensemble import RandomForestClassifier
-    >>> from sklearn.datasets import load_breast_cancer
-    >>> from sklearn.model_selection import train_test_split
-    >>>
-    >>> features, target = load_breast_cancer(return_X_y=True)
-    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-    >>>
-    >>> smm = SMM(RandomForestClassifier())
-    >>> smm.fit(X_train, y_train)
-    >>> prevalence = smm.predict(X_test)
-    >>> prevalence
-    {0: 0.38358048188348526, 1: 0.6164195181165147}
-    >>> get_real_prev(y_test)
-    {0: 0.37719298245614036, 1: 0.6228070175438597}
-    """
-
-    def __init__(self, learner: BaseEstimator = None):
-        super().__init__(learner)
-
-    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-        """
-        Compute the prevalence estimate based on mean scores.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        prevalence : float
-            Estimated prevalence.
-        """
-        mean_pos_score = np.mean(self.pos_scores)
-        mean_neg_score = np.mean(self.neg_scores)
-        mean_test_score = np.mean(test_scores)
-
-        # Calculate prevalence as the proportion of the positive class
-        prevalence = (mean_test_score - mean_neg_score) / (mean_pos_score - mean_neg_score)
-
-        return prevalence
-
-
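SMM's one-liner follows from modeling the test mean as a convex combination of the class-conditional training means; a quick check of the algebra on illustrative values:

    # mean_test = alpha * mean_pos + (1 - alpha) * mean_neg
    # => alpha = (mean_test - mean_neg) / (mean_pos - mean_neg)
    mean_pos, mean_neg, mean_test = 0.8, 0.2, 0.6  # illustrative values
    alpha = (mean_test - mean_neg) / (mean_pos - mean_neg)
    assert abs(alpha - 2 / 3) < 1e-12  # two thirds of the test set positive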
-class SORD(MixtureModel):
-    """
-    Sample Ordinal Distance (SORD).
-
-    A method that estimates the prevalence of the positive class
-    in a test dataset by calculating and minimizing a sample ordinal
-    distance measure between test scores and known positive and
-    negative scores. This approach does not rely on distributional
-    assumptions.
-
-    Parameters
-    ----------
-    learner : BaseEstimator
-        A supervised learning model implementing a `predict_proba` method.
-
-    Attributes
-    ----------
-    pos_scores : np.ndarray
-        Score distribution for the positive class in the training data.
-    neg_scores : np.ndarray
-        Score distribution for the negative class in the training data.
-    best_distance_index : int
-        Index of the best alpha value.
-
-    References
-    ----------
-    MALETZKE, André; DOS REIS, Denis; CHERMAN, Everton; BATISTA, Gustavo. DyS: A framework for mixture models in quantification. In: Proceedings of the AAAI Conference on Artificial Intelligence. 2019. Available at https://ojs.aaai.org/index.php/AAAI/article/view/4376
-
-    Examples
-    --------
-    >>> from mlquantify.methods.mixture_models import SORD
-    >>> from mlquantify.utils.general import get_real_prev
-    >>> from sklearn.ensemble import RandomForestClassifier
-    >>> from sklearn.datasets import load_breast_cancer
-    >>> from sklearn.model_selection import train_test_split
-    >>>
-    >>> features, target = load_breast_cancer(return_X_y=True)
-    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
-    >>>
-    >>> sord = SORD(RandomForestClassifier())
-    >>> sord.fit(X_train, y_train)
-    >>> prevalence = sord.predict(X_test)
-    >>> prevalence
-    {0: 0.38, 1: 0.62}
-    >>> get_real_prev(y_test)
-    {0: 0.37719298245614036, 1: 0.6228070175438597}
-    """
-
-    def __init__(self, learner: BaseEstimator = None):
-        super().__init__(learner)
-
-        self.best_distance_index = None  # Stores the index of the best alpha value
-
-    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
-        """
-        Compute the prevalence estimate by minimizing the ordinal distance.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        prevalence : float
-            Estimated prevalence.
-        """
-        # Compute alpha values and corresponding distance measures
-        alpha_values, distance_measures = self._calculate_distances(test_scores)
-
-        # Find the index of the alpha value with the minimum distance measure
-        self.best_distance_index = np.argmin(distance_measures)
-        prevalence = alpha_values[self.best_distance_index]
-
-        return prevalence
-
-    def _calculate_distances(self, test_scores: np.ndarray) -> tuple:
-        """
-        Calculate distance measures for a range of alpha values.
-
-        Parameters
-        ----------
-        test_scores : np.ndarray
-            Array of predicted probabilities for the test data.
-
-        Returns
-        -------
-        alpha_values : np.ndarray
-            Array of alpha values (from 0 to 1) used for evaluation.
-        distance_measures : list
-            List of distance measures for each alpha value.
-        """
-        # Define a range of alpha values from 0 to 1
-        alpha_values = np.linspace(0, 1, 101)
-
-        # Get the number of positive, negative, and test scores
-        num_pos_scores = len(self.pos_scores)
-        num_neg_scores = len(self.neg_scores)
-        num_test_scores = len(test_scores)
-
-        distance_measures = []
-
-        # Iterate over each alpha value
-        for alpha in alpha_values:
-            # Compute weights for positive, negative, and test scores
-            pos_weight = alpha / num_pos_scores
-            neg_weight = (1 - alpha) / num_neg_scores
-            test_weight = -1 / num_test_scores
-
-            # Create arrays with weights
-            pos_weights = np.full(num_pos_scores, pos_weight)
-            neg_weights = np.full(num_neg_scores, neg_weight)
-            test_weights = np.full(num_test_scores, test_weight)
-
-            # Concatenate all scores and their corresponding weights
-            all_scores = np.concatenate([self.pos_scores, self.neg_scores, test_scores])
-            all_weights = np.concatenate([pos_weights, neg_weights, test_weights])
-
-            # Sort scores and weights based on scores
-            sorted_indices = np.argsort(all_scores)
-            sorted_scores = all_scores[sorted_indices]
-            sorted_weights = all_weights[sorted_indices]
-
-            # Compute the total cost for the current alpha
-            cumulative_weight = sorted_weights[0]
-            total_cost = 0
-
-            for i in range(1, len(sorted_scores)):
-                # Calculate the cost for the segment between sorted scores
-                segment_width = sorted_scores[i] - sorted_scores[i - 1]
-                total_cost += abs(segment_width * cumulative_weight)
-                cumulative_weight += sorted_weights[i]
-
-            distance_measures.append(total_cost)
-
-        return alpha_values, distance_measures