mlquantify 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +0 -29
- mlquantify/adjust_counting/__init__.py +14 -0
- mlquantify/adjust_counting/_adjustment.py +365 -0
- mlquantify/adjust_counting/_base.py +247 -0
- mlquantify/adjust_counting/_counting.py +145 -0
- mlquantify/adjust_counting/_utils.py +114 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +335 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +161 -0
- mlquantify/likelihood/_classes.py +414 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +761 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +153 -0
- mlquantify/mixture/_classes.py +400 -0
- mlquantify/mixture/_utils.py +112 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +198 -0
- mlquantify/neighbors/_classes.py +159 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
- mlquantify/neighbors/_kde.py +270 -0
- mlquantify/neighbors/_utils.py +135 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +61 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
- mlquantify-0.1.9.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,400 @@ mlquantify/mixture/_classes.py (new file)
import numpy as np
from abc import abstractmethod

from mlquantify.base import BaseQuantifier
from mlquantify.base_aggregative import AggregationMixin, SoftLearnerQMixin, _get_learner_function
from mlquantify.mixture._base import BaseMixture
from mlquantify.multiclass import define_binary
from mlquantify.utils._constraints import Interval, Options
from mlquantify.utils._decorators import _fit_context
from mlquantify.utils._get_scores import apply_cross_validation
from mlquantify.utils._validation import check_classes_attribute, validate_predictions, validate_prevalences, validate_y
from mlquantify.mixture._utils import (
    getHist,
    ternary_search,
)


# =====================================================
# Base class
# =====================================================
@define_binary
class AggregativeMixture(SoftLearnerQMixin, AggregationMixin, BaseMixture):
    """
    Base class for Mixture-based Quantification Methods.

    These methods assume that the test score distribution is a mixture
    of the positive and negative score distributions from the training data.
    """

    _parameter_constraints = {
        "strategy": [Options(["ovr", "ovo"])]
    }

    def __init__(self, learner=None, strategy="ovr"):
        super().__init__()
        self.learner = learner
        self.pos_scores = None
        self.neg_scores = None
        self.distances = None
        self.strategy = strategy

    def _fit(self, X, y, learner_fitted=False, *args, **kwargs):
        learner_function = _get_learner_function(self)

        if learner_fitted:
            train_predictions = getattr(self.learner, learner_function)(X)
            train_y_values = y
        else:
            train_predictions, train_y_values = apply_cross_validation(
                self.learner,
                X,
                y,
                function=learner_function,
                cv=5,
                stratified=True,
                random_state=None,
                shuffle=True
            )

        self.train_predictions = train_predictions
        self.train_y_values = train_y_values

        self._precompute_training(train_predictions, train_y_values)
        return self

    def _precompute_training(self, train_predictions, train_y_values):
        """
        Store the score distributions of the positive and negative classes.
        """
        # Store scores for positive and negative classes
        self.pos_scores = train_predictions[train_y_values == self.classes_[1], 1]
        self.neg_scores = train_predictions[train_y_values == self.classes_[0], 1]
        self._precomputed = True
        return self

    def _predict(self, X):
        """Predict class prevalences for the given data."""
        predictions = getattr(self.learner, _get_learner_function(self))(X)
        prevalences = self.aggregate(predictions, self.train_predictions, self.train_y_values)

        return prevalences

    def aggregate(self, predictions, train_predictions, train_y_values):
        predictions = validate_predictions(self, predictions)
        self.classes_ = check_classes_attribute(self, np.unique(train_y_values))

        if not self._precomputed:
            self._precompute_training(train_predictions, train_y_values)
            self._precomputed = True

        pos_test_scores = predictions[:, 1]

        best_alpha, _ = self.best_mixture(pos_test_scores, self.pos_scores, self.neg_scores)
        prevalence = np.array([1 - best_alpha, best_alpha])
        prevalence = validate_prevalences(self, prevalence, self.classes_)
        return prevalence

    @abstractmethod
    def best_mixture(self, predictions, pos_scores, neg_scores):
        ...


# =====================================================
# DyS
# =====================================================

class DyS(AggregativeMixture):
    """Distribution y-Similarity (DyS) quantification method.

    Uses mixture modeling with a dissimilarity measure between distributions
    computed on histograms of classifier scores. This method optimizes mixture
    weights by minimizing a chosen distance measure: Hellinger, Topsoe, or ProbSymm.

    Parameters
    ----------
    learner : estimator, optional
        Base probabilistic classifier.
    measure : {'hellinger', 'topsoe', 'probsymm'}, default='topsoe'
        Distance function to minimize.
    bins_size : array-like or None
        Histogram bin sizes to try for score representation. Defaults to a set of
        bin sizes between 2 and 30.

    References
    ----------
    [1] Maletzke et al. (2019). DyS: A Framework for Mixture Models in Quantification. AAAI 2019.
    [2] Esuli et al. (2023). Learning to Quantify. Springer.

    Examples
    --------
    >>> q = DyS(learner=my_learner, measure="hellinger")
    >>> q.fit(X_train, y_train)
    >>> prevalences = q.predict(X_test)
    """

    _parameter_constraints = {
        "measure": [Options(["hellinger", "topsoe", "probsymm"])],
        "bins_size": ["array-like", None]
    }

    def __init__(self, learner=None, measure="topsoe", bins_size=None):
        super().__init__(learner)
        if bins_size is None:
            bins_size = np.append(np.linspace(2, 20, 10), 30)

        self.measure = measure
        self.bins_size = np.asarray(bins_size, dtype=int)

    def best_mixture(self, predictions, pos_scores, neg_scores):
        prevs = []
        self.distances = []
        for bins in self.bins_size:
            pos = getHist(pos_scores, bins)
            neg = getHist(neg_scores, bins)
            test = getHist(predictions, bins)

            def f(alpha):
                mix = self._mix(pos, neg, alpha)
                return BaseMixture.get_distance(mix, test, measure=self.measure)

            alpha = ternary_search(0, 1, f)
            prevs.append(alpha)
            self.distances.append(f(alpha))
        alpha = np.median(prevs)
        best_distance = np.median(self.distances)
        return alpha, best_distance

    def _mix(self, pos_hist, neg_hist, alpha):
        return alpha * pos_hist + (1 - alpha) * neg_hist


# =====================================================
# HDy
# =====================================================

class HDy(AggregativeMixture):
    """Hellinger Distance Minimization (HDy) quantification method.

    Estimates class prevalences by finding mixture weights that minimize
    the Hellinger distance between the histogram of test scores and the mixture
    of positive and negative class score histograms, evaluated over multiple bin sizes.

    Parameters
    ----------
    learner : estimator, optional
        Base probabilistic classifier.

    References
    ----------
    [2] Esuli et al. (2023). Learning to Quantify. Springer.
    """

    def best_mixture(self, predictions, pos_scores, neg_scores):
        bins_size = np.arange(10, 110, 11)
        alpha_values = np.round(np.linspace(0, 1, 101), 2)

        alphas, self.distances = [], []
        for bins in bins_size:
            pos = getHist(pos_scores, bins)
            neg = getHist(neg_scores, bins)
            test = getHist(predictions, bins)
            dists = []
            for a in alpha_values:
                mix = self._mix(pos, neg, a)
                dists.append(BaseMixture.get_distance(mix, test, measure="hellinger"))
            a = alpha_values[np.argmin(dists)]
            alphas.append(a)
            self.distances.append(np.min(dists))

        best_alpha = np.median(alphas)
        best_distance = np.median(self.distances)

        return best_alpha, best_distance

    def _mix(self, pos_hist, neg_hist, alpha):
        return alpha * pos_hist + (1 - alpha) * neg_hist


# =====================================================
# SMM
# =====================================================

class SMM(AggregativeMixture):
    r"""Sample Mean Matching (SMM) quantification method.

    Estimates class prevalence by matching the mean score of the test samples
    to a convex combination of positive and negative training scores. The mixture
    weight \( \alpha \) is computed as:

    \[
    \alpha = \frac{\bar{s}_{test} - \bar{s}_{neg}}{\bar{s}_{pos} - \bar{s}_{neg}}
    \]

    where \( \bar{s} \) denotes the sample mean.

    Parameters
    ----------
    learner : estimator, optional
        Base probabilistic classifier.

    References
    ----------
    [2] Esuli et al. (2023). Learning to Quantify. Springer.
    """

    def best_mixture(self, predictions, pos_scores, neg_scores):
        mean_pos = np.mean(pos_scores)
        mean_neg = np.mean(neg_scores)
        mean_test = np.mean(predictions)

        alpha = (mean_test - mean_neg) / (mean_pos - mean_neg)
        return alpha, None


# =====================================================
# SORD
# =====================================================

class SORD(AggregativeMixture):
    """Sample Ordinal Distance (SORD) quantification method.

    Estimates prevalence by minimizing the weighted sum of absolute score differences
    between test data and training classes. The method creates weighted score
    vectors for positive, negative, and test samples, sorts them, and computes
    a cumulative absolute difference as the distance measure.

    Parameters
    ----------
    learner : estimator, optional
        Base probabilistic classifier.

    References
    ----------
    [2] Esuli et al. (2023). Learning to Quantify. Springer.
    """

    def best_mixture(self, predictions, pos_scores, neg_scores):
        alphas = np.linspace(0, 1, 101)
        self.distances = []

        pos, neg, test = pos_scores, neg_scores, predictions
        n_pos, n_neg, n_test = len(pos), len(neg), len(test)
        for a in alphas:
            pos_w = np.full(n_pos, a / n_pos)
            neg_w = np.full(n_neg, (1 - a) / n_neg)
            test_w = np.full(n_test, -1 / n_test)
            scores = np.concatenate([pos, neg, test])
            weights = np.concatenate([pos_w, neg_w, test_w])
            idx = np.argsort(scores)
            sorted_scores = scores[idx]
            sorted_weights = weights[idx]
            cum_w = sorted_weights[0]
            total = 0
            for i in range(1, len(sorted_scores)):
                seg = sorted_scores[i] - sorted_scores[i - 1]
                total += abs(seg * cum_w)
                cum_w += sorted_weights[i]
            self.distances.append(total)

        best_distance_index = np.argmin(self.distances)
        best_alpha = alphas[best_distance_index]
        best_distance = self.distances[best_distance_index]
        return best_alpha, best_distance


# =====================================================
# Non-aggregative Mixture-based Quantifiers
# =====================================================

class HDx(BaseMixture):
    """
    Hellinger Distance-based Quantifier (HDx).

    A non-aggregative mixture quantifier that estimates class prevalences by
    minimizing the average Hellinger distance between class-wise feature histograms
    of training data and test data. It iterates over mixture weights and histogram
    bin sizes, evaluating the distance per feature and aggregating the results.

    Parameters
    ----------
    bins_size : array-like, optional
        Histogram bin sizes to consider for discretizing features.
    strategy : {'ovr', 'ovo'}, default='ovr'
        Multiclass quantification strategy.

    Attributes
    ----------
    pos_features : ndarray
        Training samples of the positive class.
    neg_features : ndarray
        Training samples of the negative class.

    References
    ----------
    [2] Esuli et al. (2023). Learning to Quantify. Springer.
    """

    _parameter_constraints = {
        "bins_size": ["array-like", None],
        "strategy": [Options(["ovr", "ovo"])]
    }

    def __init__(self, bins_size=None, strategy="ovr"):
        super().__init__()
        if bins_size is None:
            bins_size = np.append(np.linspace(2, 20, 10), 30)

        self.bins_size = bins_size
        self.neg_features = None
        self.pos_features = None
        self.strategy = strategy

    def _fit(self, X, y, *args, **kwargs):
        self.pos_features = X[y == self.classes_[1]]
        self.neg_features = X[y == self.classes_[0]]
        return self

    def _predict(self, X) -> np.ndarray:
        alpha, _ = self.best_mixture(X, self.pos_features, self.neg_features)
        prevalence = np.array([1 - alpha, alpha])
        prevalence = validate_prevalences(self, prevalence, self.classes_)
        return prevalence

    def best_mixture(self, X, pos, neg):
        alpha_values = np.round(np.linspace(0, 1, 101), 2)
        self.distances = []

        # Iterate over alpha values to compute the prevalence
        for alpha in alpha_values:
            distances = []

            # For each feature, compute the Hellinger distance
            for feature_idx in range(X.shape[1]):

                for bins in self.bins_size:

                    pos_feature = pos[:, feature_idx]
                    neg_feature = neg[:, feature_idx]
                    test_feature = X[:, feature_idx]

                    pos_hist = getHist(pos_feature, bins)
                    neg_hist = getHist(neg_feature, bins)
                    test_hist = getHist(test_feature, bins)

                    mix_hist = alpha * pos_hist + (1 - alpha) * neg_hist
                    distance = BaseMixture.get_distance(mix_hist, test_hist, measure="hellinger")
                    distances.append(distance)

            avg_distance = np.mean(distances)
            self.distances.append(avg_distance)
        best_alpha = alpha_values[np.argmin(self.distances)]
        best_distance = np.min(self.distances)
        return best_alpha, best_distance
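
The hunk above adds the aggregative mixture quantifiers (DyS, HDy, SMM, SORD) and the non-aggregative HDx. A minimal usage sketch follows, based on the fit/predict pattern shown in the DyS docstring; the scikit-learn learner, the synthetic data, and the import path mlquantify.mixture._classes (rather than a shorter public re-export) are assumptions for illustration, not part of this diff.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Assumed import path: these classes are defined in mlquantify/mixture/_classes.py per this diff;
# a public re-export (e.g. mlquantify.mixture) may also exist but is not shown here.
from mlquantify.mixture._classes import DyS, HDy, SMM

# Binary task with a known class imbalance
X, y = make_classification(n_samples=2000, weights=[0.7, 0.3], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

for quantifier in (DyS(learner=LogisticRegression(), measure="topsoe"),
                   HDy(learner=LogisticRegression()),
                   SMM(learner=LogisticRegression())):
    quantifier.fit(X_train, y_train)          # training scores come from 5-fold cross-validation (see _fit above)
    prevalences = quantifier.predict(X_test)  # estimated class prevalences for the test sample
    print(type(quantifier).__name__, prevalences)
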
@@ -0,0 +1,112 @@ mlquantify/mixture/_utils.py (new file)
import numpy as np


# =====================================================
# Utility functions
# =====================================================

def getHist(scores, nbins):
    """
    Calculate histogram-like bin probabilities for a given set of scores.

    This function divides the score range into equal bins and computes the proportion
    of scores in each bin, normalized by the total count.

    Parameters
    ----------
    scores : np.ndarray
        A 1-dimensional array of scores.
    nbins : int
        Number of bins for dividing the score range.

    Returns
    -------
    np.ndarray
        An array containing the normalized bin probabilities.

    Notes
    -----
    - The bins are equally spaced between 0 and 1, with an additional upper boundary
      to include the maximum score.
    - The returned probabilities are normalized to account for the total number of scores.
    """
    breaks = np.linspace(0, 1, int(nbins) + 1)
    breaks = np.delete(breaks, -1)
    breaks = np.append(breaks, 1.1)

    re = np.repeat(1 / (len(breaks) - 1), (len(breaks) - 1))
    for i in range(1, len(breaks)):
        re[i - 1] = (re[i - 1] + len(np.where((scores >= breaks[i - 1]) & (scores < breaks[i]))[0])) / (len(scores) + 1)

    return re


def ternary_search(left: float, right: float, func, tol: float = 1e-4) -> float:
    """
    Ternary search to find the minimum of a unimodal function in [left, right].

    Parameters
    ----------
    left : float
        Left bound.
    right : float
        Right bound.
    func : callable
        Function to minimize.
    tol : float
        Tolerance for termination.

    Returns
    -------
    float
        Approximate position of the minimum.
    """
    while right - left > tol:
        m1 = left + (right - left) / 3
        m2 = right - (right - left) / 3
        f1, f2 = func(m1), func(m2)
        if f1 < f2:
            right = m2
        else:
            left = m1
    return (left + right) / 2


def topsoe(p: np.ndarray, q: np.ndarray) -> float:
    """
    Topsoe distance between two probability distributions.

    D_T(p, q) = sum( p*log(2p/(p+q)) + q*log(2q/(p+q)) )
    """
    p = np.maximum(p, 1e-20)
    q = np.maximum(q, 1e-20)
    return np.sum(p * np.log(2 * p / (p + q)) + q * np.log(2 * q / (p + q)))


def probsymm(p: np.ndarray, q: np.ndarray) -> float:
    """
    Probabilistic Symmetric distance.

    D_PS(p, q) = sum( (p - q) * log(p / q) )
    """
    p = np.maximum(p, 1e-20)
    q = np.maximum(q, 1e-20)
    return np.sum((p - q) * np.log(p / q))


def hellinger(p: np.ndarray, q: np.ndarray) -> float:
    """
    Hellinger distance between two probability distributions.

    H(p, q) = (1/sqrt(2)) * sqrt( sum( (sqrt(p) - sqrt(q))^2 ) )
    """
    p = np.maximum(p, 1e-20)
    q = np.maximum(q, 1e-20)
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))


def sqEuclidean(p: np.ndarray, q: np.ndarray) -> float:
    """
    Squared Euclidean distance between two vectors.
    """
    return np.sum((p - q) ** 2)
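
The functions above are the building blocks the mixture quantifiers use: getHist turns classifier scores into bin probabilities, ternary_search minimizes a unimodal distance over the mixture weight, and hellinger/topsoe/probsymm/sqEuclidean are the candidate distances. The following is a self-contained sketch of that core mixture-fitting step on synthetic scores, assuming these helpers are importable from mlquantify.mixture._utils as this hunk suggests.

import numpy as np
# Assumed import path, matching the new file added in this diff
from mlquantify.mixture._utils import getHist, hellinger, ternary_search

rng = np.random.default_rng(0)

# Synthetic classifier scores in [0, 1]: negatives centred near 0.3, positives near 0.7
neg_scores = np.clip(rng.normal(0.3, 0.1, 1000), 0, 1)
pos_scores = np.clip(rng.normal(0.7, 0.1, 1000), 0, 1)

# Test sample drawn as a 30/70 negative/positive mixture
true_alpha = 0.7
test_scores = np.concatenate([rng.choice(pos_scores, 350), rng.choice(neg_scores, 150)])

# Histogram representation of each score distribution (10 bins)
pos_hist, neg_hist, test_hist = (getHist(s, 10) for s in (pos_scores, neg_scores, test_scores))

# Hellinger distance between the alpha-mixture of training histograms and the test histogram
def distance(alpha):
    return hellinger(alpha * pos_hist + (1 - alpha) * neg_hist, test_hist)

alpha_hat = ternary_search(0, 1, distance)
print(f"estimated positive prevalence ~ {alpha_hat:.2f} (true value {true_alpha})")
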