mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +10 -29
- mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify/adjust_counting/_base.py +245 -0
- mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +329 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +147 -0
- mlquantify/likelihood/_classes.py +430 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +785 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +147 -0
- mlquantify/mixture/_classes.py +458 -0
- mlquantify/mixture/_utils.py +163 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +168 -0
- mlquantify/neighbors/_classes.py +150 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
- mlquantify/neighbors/_kde.py +268 -0
- mlquantify/neighbors/_utils.py +131 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
mlquantify/{classification/methods.py → neighbors/_classification.py}
RENAMED
@@ -1,73 +1,54 @@
-
-from sklearn.base import BaseEstimator
+
 import numpy as np
-
+from sklearn.neighbors import NearestNeighbors
 
-class PWKCLF(BaseEstimator):
-    """
-    Learner based on k-Nearest Neighbors (KNN) to use in the PWK method.
-
-    This classifier adjusts the influence of neighbors using class weights
-    derived from the `alpha` parameter. The `alpha` parameter controls the
-    influence of class imbalance.
 
-    Parameters
-    ----------
-    alpha : float, default=1
-        Controls the influence of class imbalance. Must be >= 1.
 
-
-
+class PWKCLF:
+    r"""Probabilistic Weighted k-Nearest Neighbor Classifier (PWKCLF).
 
-
-
+    A weighted k-nearest neighbor classifier that assigns class probabilities to
+    instances based on neighbor counts weighted by class-specific inverse frequency
+    factors adjusted by a hyperparameter alpha controlling imbalance compensation.
 
-
-
+    Attributes
+    ----------
+    alpha : float
+        Exponent controlling the degree of imbalance compensation.
+    n_neighbors : int
+        Number of nearest neighbors considered.
+    nbrs : sklearn.neighbors.NearestNeighbors
+        The underlying k-NN structure used for neighbor queries.
+    classes_ : ndarray
+        Unique classes observed during training.
+    class_to_index : dict
+        Mapping from class label to index used in internal arrays.
+    class_weights : ndarray
+        Per-class weights computed based on class frequency and alpha.
+    y_train : ndarray
+        Labels of training samples.
 
-    leaf_size : int, default=30
-        Leaf size passed to the tree-based algorithms.
 
-
-
+    Notes
+    -----
+    The class weights are defined as:
 
-
-        Additional keyword arguments for the metric function.
+    .. math::
 
-
-
+        w_c = \left( \frac{N_c}{\min_{c'} N_{c'}} \right)^{-\frac{1}{\alpha}},
+
+    where :math:`N_c` is the count of class :math:`c` in the training set.
+
+    This weighting scheme reduces bias towards majority classes by downweighting them
+    in the voting process.
 
     Examples
     --------
-    >>>
-    >>>
-    >>>
-    >>> from mlquantify.utils.general import get_real_prev
-    >>> from mlquantify.classification import PWKCLF
-    >>>
-    >>> # Load dataset
-    >>> features, target = load_breast_cancer(return_X_y=True)
-    >>>
-    >>> # Split into training and testing sets
-    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=32)
-    >>>
-    >>> # Create and configure the PWKCLF learner
-    >>> learner = PWKCLF(alpha=1, n_neighbors=10)
-    >>>
-    >>> # Create the PWK quantifier
-    >>> model = PWK(learner=learner)
-    >>>
-    >>> # Train the model
-    >>> model.fit(X_train, y_train)
-    >>>
-    >>> # Predict prevalences
-    >>> y_pred = model.predict(X_test)
-    >>>
-    >>> # Display results
-    >>> print("Real:", get_real_prev(y_test))
-    >>> print("PWK:", y_pred)
+    >>> clf = PWKCLF(alpha=2.0, n_neighbors=7)
+    >>> clf.fit(X_train, y_train)
+    >>> labels = clf.predict(X_test)
     """
-
+
     def __init__(self,
                  alpha=1,
                  n_neighbors=10,
@@ -77,9 +58,6 @@ class PWKCLF(BaseEstimator):
                  p=2,
                  metric_params=None,
                  n_jobs=None):
-        if alpha < 1:
-            raise ValueError("alpha must not be smaller than 1")
-
         self.alpha = alpha
         self.n_neighbors = n_neighbors
 
@@ -119,9 +97,6 @@ class PWKCLF(BaseEstimator):
 
         self.y_train = y
 
-        if isinstance(y, pd.DataFrame):
-            self.y_train = y.reset_index(drop=True)
-
         unique_classes, class_counts = np.unique(y, return_counts=True)
         self.classes_ = unique_classes
         self.class_to_index = dict(zip(self.classes_, range(len(self.classes_))))
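The Notes section added above defines the class weights as w_c = (N_c / min_{c'} N_{c'})^(-1/alpha). A minimal standalone sketch of that formula (illustration only; pwk_class_weights is a hypothetical helper written here, not the package's internal code):

import numpy as np

def pwk_class_weights(y, alpha=2.0):
    # w_c = (N_c / min_c' N_c')^(-1/alpha): more frequent classes get weights below 1.
    _, counts = np.unique(y, return_counts=True)
    return (counts / counts.min()) ** (-1.0 / alpha)

y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 2])
print(pwk_class_weights(y, alpha=2.0))   # the majority class 0 receives the smallest weight

Larger alpha pushes all weights back toward 1, i.e. weaker imbalance compensation.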
mlquantify/neighbors/_kde.py
ADDED
@@ -0,0 +1,268 @@
+import numpy as np
+from sklearn.neighbors import KernelDensity
+from mlquantify.utils._constraints import Interval
+from mlquantify.neighbors._base import BaseKDE
+from mlquantify.neighbors._utils import (
+    gaussian_kernel,
+    negative_log_likelihood,
+    EPS,
+)
+from mlquantify.utils import check_random_state
+from scipy.optimize import minimize
+
+
+# ============================================================
+# Auxiliary functions
+# ============================================================
+
+def _optimize_on_simplex(objective, n_classes, x0=None):
+    r"""Optimize an objective function over the probability simplex.
+
+    This function performs constrained optimization to find the mixture weights
+    :math:`\alpha` on the simplex :math:`\Delta^{n-1} = \{ \alpha \in \mathbb{R}^n : \alpha_i \geq 0, \sum_i \alpha_i = 1 \}`
+    that minimize the given objective function.
+
+    Parameters
+    ----------
+    objective : callable
+        Function from :math:`\mathbb{R}^n \to \mathbb{R}` to minimize.
+    n_classes : int
+        Dimensionality of the simplex (number of classes).
+    x0 : array-like, optional
+        Initial guess for the optimization, defaults to uniform vector.
+
+    Returns
+    -------
+    alpha_opt : ndarray of shape (n_classes,)
+        Optimized weights summing to 1.
+    min_loss : float
+        Objective function value at optimum.
+
+    Notes
+    -----
+    The optimization uses scipy's `minimize` with bounds and equality constraint.
+    """
+    if x0 is None:
+        x0 = np.ones(n_classes) / n_classes
+
+    constraints = {'type': 'eq', 'fun': lambda x: np.sum(x) - 1}
+    bounds = [(0, 1)] * n_classes
+
+    res = minimize(objective, x0, bounds=bounds, constraints=constraints)
+    alpha_opt = res.x / np.sum(res.x)
+    return alpha_opt, res.fun
+
+
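For context, the simplex-constrained step that _optimize_on_simplex wraps can be reproduced directly with SciPy. A self-contained sketch (illustrative only) mirroring the helper's uniform start, bounds, equality constraint and final renormalization:

import numpy as np
from scipy.optimize import minimize

target = np.array([0.2, 0.5, 0.3])             # weights the optimizer should recover

def objective(alpha):                           # any scalar loss over alpha works here
    return np.sum((alpha - target) ** 2)

n = 3
res = minimize(objective, np.ones(n) / n,
               bounds=[(0, 1)] * n,
               constraints={'type': 'eq', 'fun': lambda a: np.sum(a) - 1})
alpha_opt = res.x / np.sum(res.x)               # renormalize, as the helper does
print(alpha_opt, res.fun)                       # ~[0.2, 0.5, 0.3], loss ~0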
+# ============================================================
+# KDEy-ML — Maximum Likelihood
+# ============================================================
+
+class KDEyML(BaseKDE):
+    r"""KDEy Maximum Likelihood quantifier.
+
+    Models class-conditional densities of posterior probabilities via Kernel Density
+    Estimation (KDE) and estimates class prevalences by maximizing the likelihood of
+    test data under a mixture model of these KDEs.
+
+    The mixture weights correspond to class prevalences, optimized under the simplex
+    constraint. The optimization minimizes the negative log-likelihood of the mixture
+    density evaluated at test posteriors.
+
+    This approach generalizes EM-based quantification methods by using KDE instead
+    of discrete histograms, allowing smooth multivariate density estimation over
+    the probability simplex.
+
+    References
+    ----------
+    The method is based on ideas presented by Moreo et al. (2023), extending KDE-based
+    approaches for distribution matching and maximum likelihood estimation.
+    """
+
+    def _precompute_training(self, train_predictions, train_y_values):
+        r"""
+        Fit KDE models on class-specific training posterior predictions.
+        """
+        super()._fit_kde_models(train_predictions, train_y_values)
+
+    def _solve_prevalences(self, predictions):
+        r"""
+        Estimate class prevalences by maximizing log-likelihood under KDE mixture.
+
+        Parameters
+        ----------
+        predictions : ndarray, shape (n_samples, n_features)
+            Posterior probabilities of test set instances.
+
+        Returns
+        -------
+        alpha_opt : ndarray, shape (n_classes,)
+            Estimated class prevalences.
+        min_loss : float
+            Minimum negative log-likelihood achieved.
+
+        Notes
+        -----
+        The optimization is solved over the probability simplex.
+        """
+        n_classes = len(self._class_kdes)
+        class_likelihoods = np.array([
+            np.exp(kde.score_samples(predictions)) + EPS for kde in self._class_kdes
+        ])  # (n_classes, n_samples)
+
+        def objective(alpha):
+            mixture = np.dot(alpha, class_likelihoods)
+            return negative_log_likelihood(mixture)
+
+        alpha_opt, min_loss = _optimize_on_simplex(objective, n_classes)
+
+        self.best_distance = min_loss
+
+        return alpha_opt, min_loss
+
+
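The KDEy-ML procedure above can be demonstrated outside the class on synthetic one-dimensional scores. The sketch below (illustrative only, not mlquantify's public API) fits one KernelDensity per class and recovers the test prevalence as the mixture weights minimizing the negative log-likelihood:

import numpy as np
from sklearn.neighbors import KernelDensity
from scipy.optimize import minimize

rng = np.random.default_rng(0)

# Training "posterior scores" for two classes, one KDE per class.
train_scores = [rng.normal(0.2, 0.1, 500), rng.normal(0.8, 0.1, 500)]
kdes = [KernelDensity(bandwidth=0.05).fit(s.reshape(-1, 1)) for s in train_scores]

# Test sample drawn with true prevalence [0.3, 0.7].
test = np.concatenate([rng.normal(0.2, 0.1, 300), rng.normal(0.8, 0.1, 700)]).reshape(-1, 1)
L = np.array([np.exp(k.score_samples(test)) + 1e-12 for k in kdes])   # (n_classes, n_samples)

def nll(alpha):
    # Negative log-likelihood of the test scores under the alpha-weighted KDE mixture.
    return -np.sum(np.log(np.clip(alpha @ L, 1e-12, None)))

res = minimize(nll, np.array([0.5, 0.5]), bounds=[(0, 1)] * 2,
               constraints={'type': 'eq', 'fun': lambda a: np.sum(a) - 1})
print(res.x / np.sum(res.x))   # close to the true prevalence [0.3, 0.7]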
+# ============================================================
+# KDEy-HD — Hellinger Distance Minimization
+# ============================================================
+
+class KDEyHD(BaseKDE):
+    r"""KDEy Hellinger Distance Minimization quantifier.
+
+    Estimates class prevalences by minimizing the Hellinger distance \( HD \) between
+    the KDE mixture of class-conditional densities and the KDE of test data, estimated
+    via Monte Carlo sampling and importance weighting.
+
+    This stochastic approximation enables practical optimization of complex divergence
+    measures otherwise lacking closed-form expressions for Gaussian Mixture Models.
+
+    Parameters
+    ----------
+    montecarlo_trials : int
+        Number of Monte Carlo samples used in approximation.
+    random_state : int or None
+        Seed or random state for reproducibility.
+
+    References
+    ----------
+    Builds on f-divergence Monte Carlo approximations for KDE mixtures as detailed
+    by Moreo et al. (2023) and importance sampling techniques.
+    """
+
+    _parameter_constraints = {
+        "montecarlo_trials": [Interval(1, None)],
+    }
+
+    def __init__(self, learner=None, bandwidth=0.1, kernel="gaussian", montecarlo_trials=1000, random_state=None):
+        super().__init__(learner, bandwidth, kernel)
+        self.montecarlo_trials = montecarlo_trials
+        self.random_state = random_state
+
+    def _precompute_training(self, train_predictions, train_y_values):
+        """
+        Precompute reference samples from class KDEs and their densities.
+        """
+        super()._fit_kde_models(train_predictions, train_y_values)
+        n_class = len(self._class_kdes)
+        trials = int(self.montecarlo_trials)
+        rng = check_random_state(self.random_state)
+        # Convert to integer seed for sklearn compatibility
+        seed = rng.integers(0, 2**31 - 1) if hasattr(rng, 'integers') else self.random_state
+
+        samples = np.vstack([
+            kde.sample(max(1, trials // n_class), random_state=seed)
+            for kde in self._class_kdes
+        ])
+
+        ref_classwise = np.array([np.exp(k.score_samples(samples)) + EPS for k in self._class_kdes])
+        ref_density = np.mean(ref_classwise, axis=0) + EPS
+
+        self._ref_samples = samples
+        self._ref_classwise = ref_classwise
+        self._ref_density = ref_density
+
+    def _solve_prevalences(self, predictions):
+        """
+        Minimize Hellinger distance between test KDE and mixture KDE via importance sampling.
+        """
+        test_kde = KernelDensity(bandwidth=self.bandwidth).fit(predictions)
+        qs = np.exp(test_kde.score_samples(self._ref_samples)) + EPS
+        iw = qs / self._ref_density
+        fracs = self._ref_classwise / qs
+
+        def objective(alpha):
+            alpha = np.clip(alpha, EPS, None)
+            alpha /= np.sum(alpha)
+            ps_div_qs = np.dot(alpha, fracs)
+            vals = (np.sqrt(ps_div_qs) - 1.0) ** 2 * iw
+            return np.mean(vals)
+
+        alpha_opt, min_loss = _optimize_on_simplex(objective, len(self._class_kdes))
+
+        self.best_distance = min_loss
+
+        return alpha_opt, min_loss
+
+
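The objective above is a Monte Carlo importance-sampling estimate: drawing samples from the reference mixture r and averaging (sqrt(p/q) - 1)^2 * (q/r) approximates the integral of (sqrt(p) - sqrt(q))^2, i.e. twice the squared Hellinger distance. A self-contained sketch with plain Gaussians (illustrative only; p, q, r here stand in for the mixture, test and reference densities):

import numpy as np
from scipy.stats import norm

p, q, r = norm(0.0, 1.0), norm(1.0, 1.0), norm(0.5, 1.5)    # r must cover both p and q

x = r.rvs(size=200_000, random_state=0)
ps, qs, rs = p.pdf(x), q.pdf(x), r.pdf(x)
mc_estimate = np.mean((np.sqrt(ps / qs) - 1.0) ** 2 * (qs / rs))   # same form as `vals` above

# Closed form for two equal-variance Gaussians: 2 * (1 - exp(-(mu1 - mu2)^2 / (8 * sigma^2)))
exact = 2.0 * (1.0 - np.exp(-1.0 / 8.0))
print(mc_estimate, exact)   # both are approximately 0.235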
+# ============================================================
+# KDEy-CS — Cauchy–Schwarz Divergence
+# ============================================================
+
+class KDEyCS(BaseKDE):
+    r"""KDEy Cauchy-Schwarz Divergence quantifier.
+
+    Uses a closed-form solution for minimizing the Cauchy-Schwarz (CS) divergence between
+    Gaussian Mixture Models representing class-conditional densities fitted via KDE.
+
+    This mathematically efficient approach leverages precomputed kernel Gram matrices
+    of train-train, train-test, and test-test instances for fast divergence evaluation,
+    enabling scalable multiclass quantification.
+
+    References
+    ----------
+    Based on closed-form CS divergence derivations by Kampa et al. (2011) and KDEy
+    density representations, as discussed by Moreo et al. (2023).
+    """
+
+    def _precompute_training(self, train_predictions, train_y_values):
+        """
+        Precompute kernel sums and Gram matrices needed for CS divergence evaluation.
+        """
+        P = np.atleast_2d(train_predictions)
+        y = np.asarray(train_y_values)
+        centers = [P[y == c] for c in self.classes_]
+        counts = np.array([len(x) if len(x) > 0 else 1 for x in centers])
+        h_eff = np.sqrt(2) * self.bandwidth
+
+        B_bar = np.zeros((len(self.classes_), len(self.classes_)))
+        for i, Xi in enumerate(centers):
+            for j, Xj in enumerate(centers[i:], start=i):
+                val = np.sum(gaussian_kernel(Xi, Xj, h_eff))
+                B_bar[i, j] = B_bar[j, i] = val
+        self._centers = centers
+        self._counts = counts
+        self._B_bar = B_bar
+        self._h_eff = h_eff
+
+    def _solve_prevalences(self, predictions):
+        """
+        Minimize Cauchy-Schwarz divergence over class mixture weights on the probability simplex.
+        """
+        Pte = np.atleast_2d(predictions)
+        n = len(self.classes_)
+        a_bar = np.array([np.sum(gaussian_kernel(Xi, Pte, self._h_eff)) for Xi in self._centers])
+        counts = self._counts + EPS
+        B_bar = self._B_bar + EPS
+        t = 1.0 / max(1, Pte.shape[0])
+
+        def objective(alpha):
+            alpha = np.clip(alpha, EPS, None)
+            alpha /= np.sum(alpha)
+            rbar = alpha / counts
+            partA = -np.log(np.dot(rbar, a_bar) * t + EPS)
+            partB = 0.5 * np.log(rbar @ (B_bar @ rbar) + EPS)
+            return partA + partB
+
+        alpha_opt, min_loss = _optimize_on_simplex(objective, n)
+
+        self.best_distance = min_loss
+
+        return alpha_opt, min_loss
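The effective bandwidth h_eff = sqrt(2) * bandwidth comes from a Gaussian convolution identity: the overlap integral of two Gaussian KDEs with bandwidth h equals the average of a Gaussian kernel with bandwidth sqrt(2)*h over all pairs of centres, which is what a_bar and B_bar accumulate above (as sums, with the counts divided out later through rbar and t). A minimal one-dimensional check (gaussian_kernel_1d is written here for the check and only restates the kernel definition from mlquantify/neighbors/_utils.py):

import numpy as np

def gaussian_kernel_1d(x, y, h):
    # 1-D Gaussian kernel matrix with bandwidth h.
    d2 = (x[:, None] - y[None, :]) ** 2
    return np.exp(-d2 / (2 * h ** 2)) / (h * np.sqrt(2 * np.pi))

rng = np.random.default_rng(0)
xs, ys, h = rng.normal(0.0, 1.0, 50), rng.normal(0.5, 1.0, 80), 0.2

grid = np.linspace(-6, 7, 20_001)
dx = grid[1] - grid[0]
p = gaussian_kernel_1d(grid, xs, h).mean(axis=1)     # KDE of xs evaluated on the grid
q = gaussian_kernel_1d(grid, ys, h).mean(axis=1)     # KDE of ys evaluated on the grid
overlap_numeric = np.sum(p * q) * dx                 # brute-force integral of p(x) q(x)

overlap_gram = gaussian_kernel_1d(xs, ys, np.sqrt(2) * h).mean()
print(overlap_numeric, overlap_gram)                 # the two values agree closely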
mlquantify/neighbors/_utils.py
ADDED
@@ -0,0 +1,131 @@
+import numpy as np
+from sklearn.metrics import pairwise_distances
+from math import pi
+from scipy.optimize import minimize
+
+
+EPS = 1e-12
+
+# ============================================================
+# Utilities
+# ============================================================
+
+def gaussian_kernel(X, Y, bandwidth):
+    r"""Compute the Gaussian kernel matrix K(x, y) with specified bandwidth.
+
+    This kernel matrix represents the similarity between each pair of points in X and Y,
+    computed using the Gaussian (RBF) kernel function:
+
+    .. math::
+
+        K(x, y) = \frac{1}{(2 \pi)^{D/2} h^D} \exp\left(- \frac{\|x - y\|^2}{2 h^2}\right)
+
+    where :math:`h` is the bandwidth (smoothing parameter), and :math:`D` is the dimensionality
+    of the input feature space.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples_X, n_features)
+        Input data points.
+    Y : array-like of shape (n_samples_Y, n_features) or None
+        Input data points for kernel computation. If None, defaults to X.
+    bandwidth : float
+        Kernel bandwidth parameter :math:`h`.
+
+    Returns
+    -------
+    K : ndarray of shape (n_samples_X, n_samples_Y)
+        Gaussian kernel matrix.
+    """
+    X = np.atleast_2d(X)
+    if Y is None:
+        Y = X
+    else:
+        Y = np.atleast_2d(Y)
+    sqd = pairwise_distances(X, Y, metric="euclidean") ** 2
+    D = X.shape[1]
+    norm = (bandwidth ** D) * ((2 * pi) ** (D / 2))
+    return np.exp(-sqd / (2 * (bandwidth ** 2))) / (norm + EPS)
+
+
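A quick sanity check of the docstring formula for the one-dimensional case: with D = 1, K(x, y) reduces to a normal density with mean y and standard deviation h, so it should match scipy.stats.norm.pdf (standalone check that only restates the formula above):

import numpy as np
from scipy.stats import norm

h, x, y = 0.3, 0.1, 0.4
k_formula = np.exp(-((x - y) ** 2) / (2 * h ** 2)) / (((2 * np.pi) ** 0.5) * h)
print(k_formula, norm.pdf(x, loc=y, scale=h))   # identical up to floating point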
+def negative_log_likelihood(mixture_likelihoods):
+    r"""Compute the negative log-likelihood of given mixture likelihoods in a numerically stable way.
+
+    Given mixture likelihood values :math:`p_i` for samples, the negative log-likelihood is:
+
+    .. math::
+
+        - \sum_i \log(p_i)
+
+    Numerical stability is achieved by clipping likelihoods below a small epsilon.
+
+    Parameters
+    ----------
+    mixture_likelihoods : array-like
+        Likelihood values for the mixture distribution evaluated at samples.
+
+    Returns
+    -------
+    nll : float
+        Negative log-likelihood value.
+    """
+    mixture_likelihoods = np.clip(mixture_likelihoods, EPS, None)
+    return -np.sum(np.log(mixture_likelihoods))
+
+
+def _simplex_constraints(n):
+    r"""Define constraints and bounds for optimization over the probability simplex.
+
+    The simplex is defined as all vectors :math:`\alpha \in \mathbb{R}^n` such that:
+
+    .. math::
+
+        \alpha_i \geq 0, \quad \sum_{i=1}^n \alpha_i = 1
+
+    Parameters
+    ----------
+    n : int
+        Dimensionality of the simplex (number of mixture components).
+
+    Returns
+    -------
+    constraints : list of dict
+        List containing equality constraint for sum of elements equaling 1.
+    bounds : list of tuple
+        Bounds for each element to lie between 0 and 1.
+    """
+    cons = [{"type": "eq", "fun": lambda a: np.sum(a) - 1.0}]
+    bounds = [(0.0, 1.0) for _ in range(n)]
+    return cons, bounds
+
+
+def _optimize_on_simplex(objective, n, x0=None):
+    r"""Minimize an objective function over the probability simplex.
+
+    This function solves for mixture weights \( \boldsymbol{\alpha} \) that minimize the
+    objective function under the constraints \(\alpha_i \geq 0\) and \(\sum_i \alpha_i = 1\).
+
+    The optimization uses Sequential Least SQuares Programming (SLSQP).
+
+    Parameters
+    ----------
+    objective : callable
+        The objective function to minimize. It should accept a vector of length n and
+        return a scalar loss.
+    n : int
+        Number of mixture components (dimension of \( \boldsymbol{\alpha} \)).
+    x0 : array-like, optional
+        Initial guess for \( \boldsymbol{\alpha} \). If None, defaults to uniform.
+
+    Returns
+    -------
+    alpha_opt : ndarray of shape (n,)
+        Optimized mixture weights summing to one.
+    """
+    if x0 is None:
+        x0 = np.ones(n) / n
+    cons, bounds = _simplex_constraints(n)
+    res = minimize(objective, x0, method="SLSQP", constraints=cons, bounds=bounds)
+    x = np.clip(getattr(res, "x", x0), 0.0, None)
+    s = np.sum(x)
+    return x / s if s > 0 else np.ones(n) / n
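The EPS clipping in negative_log_likelihood is what keeps the simplex optimization finite when a candidate mixture assigns (numerically) zero likelihood to some test point. A minimal standalone illustration of the same clipping pattern:

import numpy as np

EPS = 1e-12
likelihoods = np.array([0.2, 0.5, 0.0, 0.3])        # one degenerate value

unclipped = -np.sum(np.log(likelihoods))             # inf (plus a runtime warning)
clipped = -np.sum(np.log(np.clip(likelihoods, EPS, None)))
print(unclipped, clipped)                            # inf vs. a large but finite penalty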
mlquantify/neural/__init__.py
ADDED
@@ -0,0 +1 @@
+# TODO
mlquantify/utils/__init__.py
CHANGED
@@ -1,2 +1,47 @@
-from .
-
+from mlquantify.utils._tags import (
+    Tags,
+    TargetInputTags,
+    get_tags
+)
+from mlquantify.utils._constraints import (
+    Interval,
+    Options,
+    CallableConstraint
+)
+from mlquantify.utils.prevalence import (
+    get_prev_from_labels,
+    normalize_prevalence
+)
+from mlquantify.utils._load import load_quantifier
+from mlquantify.utils._artificial import make_prevs
+from mlquantify.utils._context import validation_context, is_validation_skipped
+from mlquantify.utils._decorators import _fit_context
+from mlquantify.utils._exceptions import (
+    InputValidationError,
+    InvalidParameterError,
+    NotFittedError
+)
+from mlquantify.utils._get_scores import apply_cross_validation
+from mlquantify.utils._parallel import resolve_n_jobs
+from mlquantify.utils._random import check_random_state
+from mlquantify.utils._sampling import (
+    simplex_uniform_kraemer,
+    simplex_grid_sampling,
+    simplex_uniform_sampling,
+    get_indexes_with_prevalence
+)
+from mlquantify.utils._validation import (
+    _validate_is_numpy_array,
+    _validate_2d_predictions,
+    _validate_1d_predictions,
+    validate_y,
+    validate_predictions,
+    validate_parameter_constraints,
+    validate_learner_contraints,
+    _is_fitted,
+    check_is_fitted,
+    _is_arraylike_not_scalar,
+    _is_arraylike,
+    validate_data,
+    check_classes_attribute
+)
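After this change the helpers re-exported here can be imported from the mlquantify.utils namespace directly. A small usage sketch, relying only on functions whose definitions or call patterns appear elsewhere in this diff (check_random_state, make_prevs):

from mlquantify.utils import check_random_state, make_prevs

rng = check_random_state(42)    # RNG handling, used the same way as in KDEyHD above
prevs = make_prevs(3)           # random prevalence vector of length 3
print(prevs, prevs.sum())       # the components sum to 1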
mlquantify/utils/_artificial.py
ADDED
@@ -0,0 +1,27 @@
+import numpy as np
+
+
+def make_prevs(ndim:int) -> list:
+    """
+    Generate a list of n_dim values uniformly distributed between 0 and 1 that sum exactly to 1.
+
+    Parameters
+    ----------
+    ndim : int
+        Number of dimensions.
+
+    Returns
+    -------
+    list
+        List of n_dim values uniformly distributed between 0 and 1 that sum exactly to 1.
+    """
+    # Generate n_dim-1 random u_dist uniformly distributed between 0 and 1
+    u_dist = np.random.uniform(0, 1, ndim - 1)
+    # Add 0 and 1 to the u_dist
+    u_dist = np.append(u_dist, [0, 1])
+    # Sort the u_dist
+    u_dist.sort()
+    # Calculate the differences between consecutive u_dist
+    prevs = np.diff(u_dist)
+
+    return prevs
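The construction in make_prevs is the classical uniform-spacings trick: the gaps between sorted Uniform(0, 1) draws, with 0 and 1 appended, are uniformly distributed over the probability simplex, the same law as a flat Dirichlet. A standalone check of that equivalence (make_prevs_like is a local re-implementation written for the comparison, not the package function):

import numpy as np

rng = np.random.default_rng(0)

def make_prevs_like(ndim, rng):
    # Spacings of sorted uniforms, including the endpoints 0 and 1.
    u = np.sort(np.concatenate([rng.uniform(0, 1, ndim - 1), [0.0, 1.0]]))
    return np.diff(u)

spacings = np.array([make_prevs_like(3, rng) for _ in range(20_000)])
dirichlet = rng.dirichlet(np.ones(3), size=20_000)
print(spacings.mean(axis=0), dirichlet.mean(axis=0))   # both are approximately [1/3, 1/3, 1/3]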