mlquantify 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +11 -1
- mlquantify/adjust_counting/__init__.py +11 -1
- mlquantify/adjust_counting/_adjustment.py +370 -87
- mlquantify/adjust_counting/_base.py +1 -3
- mlquantify/adjust_counting/_counting.py +27 -19
- mlquantify/adjust_counting/_utils.py +23 -28
- mlquantify/confidence.py +16 -22
- mlquantify/likelihood/_base.py +38 -52
- mlquantify/likelihood/_classes.py +88 -72
- mlquantify/meta/_classes.py +86 -62
- mlquantify/metrics/_oq.py +2 -2
- mlquantify/metrics/_rq.py +2 -2
- mlquantify/metrics/_slq.py +9 -9
- mlquantify/mixture/_base.py +13 -19
- mlquantify/mixture/_classes.py +68 -10
- mlquantify/mixture/_utils.py +62 -11
- mlquantify/model_selection/_protocol.py +6 -6
- mlquantify/model_selection/_search.py +1 -1
- mlquantify/neighbors/_base.py +35 -65
- mlquantify/neighbors/_classes.py +1 -10
- mlquantify/neighbors/_classification.py +5 -12
- mlquantify/neighbors/_kde.py +7 -9
- mlquantify/neighbors/_utils.py +17 -21
- mlquantify/utils/_validation.py +3 -3
- mlquantify/utils/prevalence.py +4 -1
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/METADATA +10 -18
- mlquantify-0.1.11.dist-info/RECORD +53 -0
- mlquantify-0.1.9.dist-info/RECORD +0 -53
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/top_level.txt +0 -0
mlquantify/mixture/_utils.py
CHANGED
@@ -6,7 +6,7 @@ import numpy as np
 # =====================================================

 def getHist(scores, nbins):
-    """
+    r"""
     Calculate histogram-like bin probabilities for a given set of scores.

     This function divides the score range into equal bins and computes the proportion
@@ -42,7 +42,7 @@ def getHist(scores, nbins):


 def ternary_search(left: float, right: float, func, tol: float = 1e-4) -> float:
-    """
+    r"""
     Ternary search to find the minimum of a unimodal function in [left, right].

     Parameters
@@ -53,8 +53,8 @@ def ternary_search(left: float, right: float, func, tol: float = 1e-4) -> float:
         Right bound.
     func : callable
         Function to minimize.
-    tol : float
-        Tolerance for termination.
+    tol : float, optional
+        Tolerance for termination. Default is 1e-4.

     Returns
     -------
@@ -73,10 +73,23 @@ def ternary_search(left: float, right: float, func, tol: float = 1e-4) -> float:


 def topsoe(p: np.ndarray, q: np.ndarray) -> float:
-    """
+    r"""
     Topsoe distance between two probability distributions.

-
+    .. math::
+        D_T(p, q) = \sum \left( p \log \frac{2p}{p + q} + q \log \frac{2q}{p + q} \right)
+
+    Parameters
+    ----------
+    p : np.ndarray
+        First probability distribution.
+    q : np.ndarray
+        Second probability distribution.
+
+    Returns
+    -------
+    float
+        The Topsoe distance.
     """
     p = np.maximum(p, 1e-20)
     q = np.maximum(q, 1e-20)
@@ -84,10 +97,23 @@ def topsoe(p: np.ndarray, q: np.ndarray) -> float:


 def probsymm(p: np.ndarray, q: np.ndarray) -> float:
-    """
+    r"""
     Probabilistic Symmetric distance.

-
+    .. math::
+        D_{PS}(p, q) = \sum (p - q) \log \frac{p}{q}
+
+    Parameters
+    ----------
+    p : np.ndarray
+        First probability distribution.
+    q : np.ndarray
+        Second probability distribution.
+
+    Returns
+    -------
+    float
+        The Probabilistic Symmetric distance.
     """
     p = np.maximum(p, 1e-20)
     q = np.maximum(q, 1e-20)
@@ -95,10 +121,23 @@ def probsymm(p: np.ndarray, q: np.ndarray) -> float:


 def hellinger(p: np.ndarray, q: np.ndarray) -> float:
-    """
+    r"""
     Hellinger distance between two probability distributions.

-
+    .. math::
+        H(p, q) = \frac{1}{\sqrt{2}} \sqrt{\sum \left( \sqrt{p} - \sqrt{q} \right)^2}
+
+    Parameters
+    ----------
+    p : np.ndarray
+        First probability distribution.
+    q : np.ndarray
+        Second probability distribution.
+
+    Returns
+    -------
+    float
+        The Hellinger distance.
     """
     p = np.maximum(p, 1e-20)
     q = np.maximum(q, 1e-20)
@@ -106,7 +145,19 @@ def hellinger(p: np.ndarray, q: np.ndarray) -> float:


 def sqEuclidean(p: np.ndarray, q: np.ndarray) -> float:
-    """
+    r"""
     Squared Euclidean distance between two vectors.
+
+    Parameters
+    ----------
+    p : np.ndarray
+        First vector.
+    q : np.ndarray
+        Second vector.
+
+    Returns
+    -------
+    float
+        The squared Euclidean distance.
     """
     return np.sum((p - q) ** 2)
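The distance helpers above now document their formulas explicitly. For orientation, here is a minimal sketch of two of them written directly from those math blocks, including the same 1e-20 clipping shown in the context lines; it is an illustration, not the package's own module:

```python
# Illustrative sketch of the documented formulas (not mlquantify's own code).
import numpy as np

def topsoe(p, q):
    # D_T(p, q) = sum( p*log(2p/(p+q)) + q*log(2q/(p+q)) )
    p, q = np.maximum(p, 1e-20), np.maximum(q, 1e-20)
    return np.sum(p * np.log(2 * p / (p + q)) + q * np.log(2 * q / (p + q)))

def hellinger(p, q):
    # H(p, q) = sqrt( sum( (sqrt(p) - sqrt(q))^2 ) ) / sqrt(2)
    p, q = np.maximum(p, 1e-20), np.maximum(q, 1e-20)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

p = np.array([0.2, 0.3, 0.5])
q = np.array([0.1, 0.4, 0.5])
print(topsoe(p, q), hellinger(p, q))
```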
mlquantify/model_selection/_protocol.py
CHANGED
@@ -15,7 +15,7 @@ import numpy as np


 class BaseProtocol(ProtocolMixin, BaseQuantifier):
-    """Base class for evaluation protocols.
+    r"""Base class for evaluation protocols.

     Parameters
     ----------
@@ -76,7 +76,7 @@ class BaseProtocol(ProtocolMixin, BaseQuantifier):


     def split(self, X: np.ndarray, y: np.ndarray):
-        """
+        r"""
         Split the data into samples for evaluation.

         Parameters
@@ -117,7 +117,7 @@ class BaseProtocol(ProtocolMixin, BaseQuantifier):


 class APP(BaseProtocol):
-    """
+    r"""
     Artificial Prevalence Protocol (APP) for exhaustive prevalent batch evaluation.

     Generates batches with artificially imposed prevalences across all possible
@@ -185,7 +185,7 @@ class APP(BaseProtocol):


 class NPP(BaseProtocol):
-    """
+    r"""
     Natural Prevalence Protocol (NPP) that samples data without imposing prevalence constraints.

     This protocol simply samples batches randomly with replacement,
@@ -230,7 +230,7 @@ class NPP(BaseProtocol):


 class UPP(BaseProtocol):
-    """
+    r"""
     Uniform Prevalence Protocol (UPP) for uniform sampling of artificial prevalences.

     Similar to APP, but uses uniform prevalence distribution generation
@@ -310,7 +310,7 @@ class UPP(BaseProtocol):


 class PPP(BaseProtocol):
-    """
+    r"""
     Personalized Prevalence Protocol (PPP) for targeted prevalence batch generation.

     Generates batches with user-specified prevalence distributions, allowing for
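APP, as described above, walks an exhaustive grid of artificial prevalences. A hedged sketch of such a grid (the idea only; the package's APP constructor and sampling details are not shown in this diff and may differ):

```python
# Enumerate all prevalence vectors on a fixed step that sum to 1 (3 classes, step 0.25).
from itertools import product
import numpy as np

def prevalence_grid(n_classes, step):
    ticks = np.arange(0.0, 1.0 + step / 2, step)
    return np.array([p for p in product(ticks, repeat=n_classes)
                     if abs(sum(p) - 1.0) < 1e-9])

print(prevalence_grid(3, 0.25))
```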
mlquantify/neighbors/_base.py
CHANGED
@@ -13,94 +13,64 @@ from mlquantify.utils._validation import validate_prevalences
 EPS = 1e-12

 class BaseKDE(SoftLearnerQMixin, AggregationMixin, BaseQuantifier):
-    r"""
-    Base class for KDEy quantification methods.
-
-    KDEy methods model the class-conditional densities of posterior probabilities
-    using Kernel Density Estimation (KDE) in the probability simplex space.
-    Given a probabilistic classifier's posterior outputs, each class distribution
-    is approximated as a smooth density function via KDE. Class prevalences in
-    the test set are estimated as the mixture weights of these densities that best
-    explain the test posterior distribution.
-
-    Formally, KDEy approximates the test posterior distribution as:
+    r"""Base class for KDEy quantification methods.

-
-
-
+    KDEy models the class-conditional densities of posterior probabilities using Kernel Density Estimation (KDE)
+    on the probability simplex. Given posterior outputs from a probabilistic classifier, each class distribution
+    is approximated as a smooth KDE. Test set class prevalences correspond to mixture weights that best explain
+    the overall test posterior distribution.

-
-    and \( \alpha_k \) are the unknown class prevalences to be estimated under:
+    Mathematically, the test posterior distribution is approximated as:

-
-
-
-
-
-
-
+    .. math::
+
+        p_{\mathrm{test}}(x) \approx \sum_{k=1}^K \alpha_k p_k(x),
+
+    where \(p_k(x)\) is the KDE of class \(k\) posteriors from training data, and \(\alpha_k\) are the unknown class
+    prevalences subject to:
+
+    .. math::
+
+        \alpha_k \geq 0, \quad \sum_{k=1}^K \alpha_k = 1.
+
+    The quantification minimizes an objective \(\mathcal{L}\) over \(\boldsymbol{\alpha} = (\alpha_1, \dots, \alpha_K)\) in the simplex:
+
+    .. math::
+
+        \min_{\boldsymbol{\alpha} \in \Delta^{K-1}} \mathcal{L} \left( \sum_{k=1}^K \alpha_k p_k(x), \hat{p}(x) \right),
+
+    where \(\hat{p}(x)\) is the test posterior distribution (empirical KDE or direct predictions).
+
+    This problem is typically solved using numerical constrained optimization methods.

     Attributes
     ----------
     learner : estimator
-
+        Probabilistic classifier generating posterior predictions.
     bandwidth : float
-
+        KDE bandwidth (smoothing parameter).
     kernel : str
-
+        KDE kernel type (e.g., 'gaussian').
     _precomputed : bool
-        Indicates
+        Indicates if KDE models have been fitted.
     best_distance : float or None
-
-
-    Methods
-    -------
-    fit(X, y, learner_fitted=False)
-        Fits KDE models for each class using posterior predictions of the learner.
-    predict(X)
-        Aggregates learner’s posterior predictions on X to estimate class prevalences.
-    aggregate(predictions, train_predictions, train_y_values)
-        Core estimation method that validates inputs, ensures KDE precomputation,
-        and calls `_solve_prevalences` implemented by subclasses.
-    _fit_kde_models(train_predictions, train_y_values)
-        Fits KDE model per class on training data posteriors.
-    _solve_prevalences(predictions)
-        Abstract method to estimate prevalence vector \( \boldsymbol{\alpha} \) for given posteriors.
-        Must be implemented by subclasses.
+        Best objective value found during estimation.

     Examples
     --------
-
-    `_solve_prevalences`, which receives posterior predictions and returns a tuple
-
-    (estimated prevalences \(\boldsymbol{\alpha}\), objective value).
+    Subclasses should implement `_solve_prevalences` method returning estimated prevalences and objective value:

     >>> class KDEyExample(BaseKDE):
     ...     def _solve_prevalences(self, predictions):
-    ...         # Example: simple uniform prevalences, replace with actual optimization
     ...         n_classes = len(self._class_kdes)
     ...         alpha = np.ones(n_classes) / n_classes
-    ...         obj_val = 0.0 #
+    ...         obj_val = 0.0  # Placeholder, replace with actual objective
     ...         return alpha, obj_val

-    Mathematical formulation for prevalence estimation typically involves optimizing:
-
-    \[
-    \min_{\boldsymbol{\alpha} \in \Delta^{K-1}} \mathcal{L} \bigg( \sum_{k=1}^K \alpha_k p_k(x), \hat{p}(x) \bigg),
-    \]
-
-    where \(\hat{p}(x)\) is the test posterior distribution (empirical KDE or direct predictions),
-    \(\Delta^{K-1}\) is the probability simplex defined by the constraints on \(\boldsymbol{\alpha}\),
-    and \(\mathcal{L}\) is an appropriate divergence or loss function, e.g., negative log-likelihood,
-    Hellinger distance, or Cauchy–Schwarz divergence.
-
-    This optimization is typically solved numerically with constrained methods such as
-    sequential quadratic programming or projected gradient descent.
-
     References
     ----------
-    [1] Moreo, A., et al. (2023). Kernel Density Quantification methods and applications.
-
+    .. [1] Moreo, A., et al. (2023). Kernel Density Quantification methods and applications.
+           In *Learning to Quantify*, Springer.
     """

     _parameter_constraints = {
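The rewritten BaseKDE docstring describes the model precisely: one KDE per class fitted on training posteriors, and a test distribution approximated by the α-weighted mixture of those densities. A minimal sketch of that model using scikit-learn's KernelDensity (the helper names and toy data below are illustrative, not the package's API):

```python
# Fit p_k via KDE per class, then evaluate -log of the mixture sum_k alpha_k p_k(x)
# on test posteriors for a candidate alpha (the objective a subclass would minimize).
import numpy as np
from sklearn.neighbors import KernelDensity

def fit_class_kdes(train_posteriors, y, bandwidth=0.1):
    classes = np.unique(y)
    return [KernelDensity(bandwidth=bandwidth, kernel="gaussian").fit(train_posteriors[y == c])
            for c in classes]

def mixture_neg_log_likelihood(alpha, kdes, test_posteriors):
    dens = np.stack([np.exp(k.score_samples(test_posteriors)) for k in kdes], axis=1)
    mixture = dens @ np.asarray(alpha)              # sum_k alpha_k p_k(x)
    return -np.sum(np.log(np.maximum(mixture, 1e-12)))

rng = np.random.default_rng(0)
train_post = rng.dirichlet([2, 2], size=100)        # toy 2-class posteriors
y = rng.integers(0, 2, size=100)
kdes = fit_class_kdes(train_post, y)
print(mixture_neg_log_likelihood([0.5, 0.5], kdes, rng.dirichlet([2, 2], size=20)))
```

A concrete subclass such as KDEyML would minimize this objective over α constrained to the simplex; a sketch of that optimization step follows the _kde.py diff below.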
mlquantify/neighbors/_classes.py
CHANGED
@@ -10,7 +10,7 @@ from mlquantify.utils._validation import validate_prevalences


 class PWK(BaseQuantifier):
-    """
+    r"""
     Probabilistic Weighted k-Nearest Neighbor (PWK) Quantifier.

     This quantifier leverages the PWKCLF classifier to perform quantification by estimating
@@ -47,15 +47,6 @@ class PWK(BaseQuantifier):
     learner : PWKCLF
         Underlying probabilistic weighted k-NN classifier.

-    Methods
-    -------
-    fit(X, y)
-        Fits the quantifier by training the internal PWKCLF and wrapping it with
-        Classify & Count quantification.
-    predict(X)
-        Predicts class prevalences for input data using the trained model.
-    classify(X)
-        Returns label predictions by applying the trained PWKCLF classifier.

     Examples
     --------
mlquantify/neighbors/_classification.py
CHANGED
@@ -5,8 +5,7 @@ from sklearn.neighbors import NearestNeighbors


 class PWKCLF:
-    """
-    Probabilistic Weighted k-Nearest Neighbor Classifier (PWKCLF).
+    r"""Probabilistic Weighted k-Nearest Neighbor Classifier (PWKCLF).

     A weighted k-nearest neighbor classifier that assigns class probabilities to
     instances based on neighbor counts weighted by class-specific inverse frequency
@@ -29,22 +28,16 @@ class PWKCLF:
     y_train : ndarray
         Labels of training samples.

-    Methods
-    -------
-    fit(X, y)
-        Fits the k-NN structure and computes class weights.
-    predict(X)
-        Predicts class labels by weighted voting among neighbors.

     Notes
     -----
     The class weights are defined as:

-
-
-
+    .. math::
+
+        w_c = \left( \frac{N_c}{\min_{c'} N_{c'}} \right)^{-\frac{1}{\alpha}},

-    where
+    where :math:`N_c` is the count of class :math:`c` in the training set.

     This weighting scheme reduces bias towards majority classes by downweighting them
     in the voting process.
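The class-weight formula added to the Notes above is easy to check in isolation (α here is the classifier's weighting exponent; this is an illustration, not the package's code):

```python
# w_c = (N_c / min_c' N_c') ** (-1/alpha): majority classes get downweighted.
import numpy as np

def pwk_class_weights(y_train, alpha=1.0):
    classes, counts = np.unique(y_train, return_counts=True)
    weights = (counts / counts.min()) ** (-1.0 / alpha)
    return dict(zip(classes, weights))

print(pwk_class_weights(np.array([0, 0, 0, 0, 1, 1, 2]), alpha=2.0))
```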
mlquantify/neighbors/_kde.py
CHANGED
@@ -16,17 +16,16 @@ from scipy.optimize import minimize
 # ============================================================

 def _optimize_on_simplex(objective, n_classes, x0=None):
-    """
-    Optimize an objective function over the probability simplex.
+    r"""Optimize an objective function over the probability simplex.

     This function performs constrained optimization to find the mixture weights
-
+    :math:`\alpha` on the simplex :math:`\Delta^{n-1} = \{ \alpha \in \mathbb{R}^n : \alpha_i \geq 0, \sum_i \alpha_i = 1 \}`
     that minimize the given objective function.

     Parameters
     ----------
     objective : callable
-        Function from
+        Function from :math:`\mathbb{R}^n \to \mathbb{R}` to minimize.
     n_classes : int
         Dimensionality of the simplex (number of classes).
     x0 : array-like, optional
@@ -59,7 +58,7 @@ def _optimize_on_simplex(objective, n_classes, x0=None):
 # ============================================================

 class KDEyML(BaseKDE):
-    """KDEy Maximum Likelihood quantifier.
+    r"""KDEy Maximum Likelihood quantifier.

     Models class-conditional densities of posterior probabilities via Kernel Density
     Estimation (KDE) and estimates class prevalences by maximizing the likelihood of
@@ -80,13 +79,13 @@ class KDEyML(BaseKDE):
     """

     def _precompute_training(self, train_predictions, train_y_values):
-        """
+        r"""
         Fit KDE models on class-specific training posterior predictions.
         """
         super()._fit_kde_models(train_predictions, train_y_values)

     def _solve_prevalences(self, predictions):
-        """
+        r"""
         Estimate class prevalences by maximizing log-likelihood under KDE mixture.

         Parameters
@@ -208,8 +207,7 @@ class KDEyHD(BaseKDE):
 # ============================================================

 class KDEyCS(BaseKDE):
-    """
-    KDEy Cauchy-Schwarz Divergence quantifier.
+    r"""KDEy Cauchy-Schwarz Divergence quantifier.

     Uses a closed-form solution for minimizing the Cauchy-Schwarz (CS) divergence between
     Gaussian Mixture Models representing class-conditional densities fitted via KDE.
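The `_optimize_on_simplex` helper documented above performs constrained minimization over the simplex. A hedged sketch of what such an optimizer can look like with SciPy, following the documented constraints; the defaults and solver choice are assumptions, not necessarily the package's exact implementation:

```python
# Minimize an objective over {alpha : alpha_i >= 0, sum_i alpha_i = 1} with SLSQP.
import numpy as np
from scipy.optimize import minimize

def optimize_on_simplex(objective, n_classes, x0=None):
    if x0 is None:
        x0 = np.full(n_classes, 1.0 / n_classes)        # start at the uniform point
    res = minimize(
        objective,
        x0,
        method="SLSQP",
        bounds=[(0.0, 1.0)] * n_classes,
        constraints=[{"type": "eq", "fun": lambda a: np.sum(a) - 1.0}],
    )
    alpha = np.clip(res.x, 0.0, None)
    return alpha / alpha.sum()

# The minimizer of the squared distance to a point already in the simplex is that point.
target = np.array([0.2, 0.5, 0.3])
print(optimize_on_simplex(lambda a: np.sum((a - target) ** 2), 3))
```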
mlquantify/neighbors/_utils.py
CHANGED
@@ -11,17 +11,16 @@ EPS = 1e-12
 # ============================================================

 def gaussian_kernel(X, Y, bandwidth):
-    """
-    Compute the Gaussian kernel matrix K(x, y) with specified bandwidth.
+    r"""Compute the Gaussian kernel matrix K(x, y) with specified bandwidth.

     This kernel matrix represents the similarity between each pair of points in X and Y,
     computed using the Gaussian (RBF) kernel function:

-
-
-
+    .. math::
+
+        K(x, y) = \frac{1}{(2 \pi)^{D/2} h^D} \exp\left(- \frac{\|x - y\|^2}{2 h^2}\right)

-    where
+    where :math:`h` is the bandwidth (smoothing parameter), and :math:`D` is the dimensionality
     of the input feature space.

     Parameters
@@ -31,7 +30,7 @@ def gaussian_kernel(X, Y, bandwidth):
     Y : array-like of shape (n_samples_Y, n_features) or None
         Input data points for kernel computation. If None, defaults to X.
     bandwidth : float
-        Kernel bandwidth parameter
+        Kernel bandwidth parameter :math:`h`.

     Returns
     -------
@@ -50,14 +49,13 @@ def gaussian_kernel(X, Y, bandwidth):


 def negative_log_likelihood(mixture_likelihoods):
-    """
-
+    r"""Compute the negative log-likelihood of given mixture likelihoods in a numerically stable way.
+
+    Given mixture likelihood values :math:`p_i` for samples, the negative log-likelihood is:

-
+    .. math::

-
-        - \sum_i \log(p_i)
-    \]
+        - \sum_i \log(p_i)

     Numerical stability is achieved by clipping likelihoods below a small epsilon.

@@ -76,14 +74,13 @@ def negative_log_likelihood(mixture_likelihoods):


 def _simplex_constraints(n):
-    """
-
+    r"""Define constraints and bounds for optimization over the probability simplex.
+
+    The simplex is defined as all vectors :math:`\alpha \in \mathbb{R}^n` such that:

-
+    .. math::

-
-        \alpha_i \geq 0, \quad \sum_{i=1}^n \alpha_i = 1
-    \]
+        \alpha_i \geq 0, \quad \sum_{i=1}^n \alpha_i = 1

     Parameters
     ----------
@@ -103,8 +100,7 @@ def _simplex_constraints(n):


 def _optimize_on_simplex(objective, n, x0=None):
-    """
-    Minimize an objective function over the probability simplex.
+    r"""Minimize an objective function over the probability simplex.

     This function solves for mixture weights \( \boldsymbol{\alpha} \) that minimize the
     objective function under the constraints \(\alpha_i \geq 0\) and \(\sum_i \alpha_i = 1\).
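Both helpers documented above follow directly from the stated formulas; a short sketch under those assumptions (not the package's code):

```python
# Gaussian kernel matrix K(x, y) = exp(-||x-y||^2 / (2h^2)) / ((2*pi)^(D/2) * h^D)
# and the clipped negative log-likelihood -sum_i log(p_i).
import numpy as np

def gaussian_kernel(X, Y, bandwidth):
    X = np.asarray(X, dtype=float)
    Y = X if Y is None else np.asarray(Y, dtype=float)
    d2 = np.sum((X[:, None, :] - Y[None, :, :]) ** 2, axis=-1)   # pairwise squared distances
    D = X.shape[1]
    return np.exp(-d2 / (2 * bandwidth ** 2)) / ((2 * np.pi) ** (D / 2) * bandwidth ** D)

def negative_log_likelihood(mixture_likelihoods, eps=1e-12):
    return -np.sum(np.log(np.maximum(mixture_likelihoods, eps)))

X = np.random.default_rng(0).random((5, 3))
K = gaussian_kernel(X, None, bandwidth=0.5)
print(K.shape, negative_log_likelihood(K.mean(axis=1)))
```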
mlquantify/utils/_validation.py
CHANGED
@@ -274,7 +274,7 @@ def validate_data(quantifier,
         else:
             out = X, y
     elif not no_val_X and no_val_y:
-        out = check_array(X, input_name="X", **check_params)
+        out = check_array(X, input_name="X", dtype=None, **check_params)
     elif no_val_X and not no_val_y:
         out = _check_y(y, **check_params)
     else:
@@ -286,12 +286,12 @@ def validate_data(quantifier,
             check_X_params, check_y_params = validate_separately
             if "estimator" not in check_X_params:
                 check_X_params = {**default_check_params, **check_X_params}
-            X = check_array(X, input_name="X", **check_X_params)
+            X = check_array(X, input_name="X", dtype=None, **check_X_params)
             if "estimator" not in check_y_params:
                 check_y_params = {**default_check_params, **check_y_params}
             y = check_array(y, input_name="y", **check_y_params)
         else:
-            X, y = check_X_y(X, y, **check_params)
+            X, y = check_X_y(X, y, dtype=None, **check_params)
             out = X, y

     return out
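The only change here is passing dtype=None to scikit-learn's validators, which preserves the caller's dtype instead of forcing a numeric conversion. A small illustration of that behavior (assuming scikit-learn's check_array semantics):

```python
import numpy as np
from sklearn.utils import check_array

X_obj = np.array([["a", "x"], ["b", "y"]], dtype=object)
X_checked = check_array(X_obj, dtype=None)   # accepted, dtype preserved
print(X_checked.dtype)                       # object
# With the default dtype="numeric", check_array would try to convert these
# strings to float and raise a ValueError instead.
```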
mlquantify/utils/prevalence.py
CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 from collections import defaultdict


-def get_prev_from_labels(y) -> dict:
+def get_prev_from_labels(y, format="dict") -> dict:
     """
     Get the real prevalence of each class in the target array.

@@ -19,6 +19,9 @@ def get_prev_from_labels(y) -> dict:
     """
     if isinstance(y, np.ndarray):
         y = pd.Series(y)
+    if format == "array":
+        prevalences = y.value_counts(normalize=True).sort_index().values
+        return prevalences
     real_prevs = y.value_counts(normalize=True).to_dict()
     real_prevs = dict(sorted(real_prevs.items()))
     return real_prevs
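For reference, the behavior added by the new `format` parameter, reconstructed from the hunk above (the default remains the sorted dict):

```python
import numpy as np
import pandas as pd

def get_prev_from_labels(y, format="dict"):
    if isinstance(y, np.ndarray):
        y = pd.Series(y)
    if format == "array":
        return y.value_counts(normalize=True).sort_index().values
    return dict(sorted(y.value_counts(normalize=True).to_dict().items()))

y = np.array([0, 0, 1, 1, 1, 2])
print(get_prev_from_labels(y))             # {0: 0.333..., 1: 0.5, 2: 0.166...}
print(get_prev_from_labels(y, "array"))    # [0.333... 0.5 0.166...]
```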
{mlquantify-0.1.9.dist-info → mlquantify-0.1.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlquantify
-Version: 0.1.
+Version: 0.1.11
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -30,7 +30,7 @@ Dynamic: maintainer
 Dynamic: requires-dist
 Dynamic: summary

-<
+<img src="assets/logo_mlquantify-white.svg" alt="mlquantify logo">
 <h4 align="center">A Python Package for Quantification</h4>

 ___
@@ -41,7 +41,7 @@ ___

 ## Latest Release

-- **Version 0.1.
+- **Version 0.1.11**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
 - In case you need any help, refer to the [User Guide](https://luizfernandolj.github.io/mlquantify/user_guide.html).
 - Explore the [API documentation](https://luizfernandolj.github.io/mlquantify/api/index.html) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
@@ -73,7 +73,6 @@ ___
 | **Model Selection** | Criteria and processes used to select the best model, such as grid-search for the case of quantification|
 | **Evaluation Metrics** | Specific metrics used to evaluate quantification performance, (e.g., AE, MAE, NAE, SE, KLD, etc.). |
 | **Evaluation Protocols** | Evaluation protocols used, based on sampling generation (e.g., APP, NPP, etc.).. |
-| **Plotting Results** | Tools and techniques used to visualize results, such as the protocol results.|
 | **Comprehensive Documentation** | Complete documentation of the project, including code, data, and results. |

 ___
@@ -84,8 +83,8 @@ This code first loads the breast cancer dataset from _sklearn_, which is then sp

 ```python
 from mlquantify.methods import EMQ
-from mlquantify.
-from mlquantify.utils import
+from mlquantify.metrics import MAE, NRAE
+from mlquantify.utils import get_prev_from_labels

 from sklearn.ensemble import RandomForestClassifier
 from sklearn.datasets import load_breast_cancer
@@ -103,14 +102,14 @@ model.fit(X_train, y_train)

 #Predict the class prevalence for X_test
 pred_prevalence = model.predict(X_test)
-real_prevalence =
+real_prevalence = get_prev_from_labels(y_test)

 #Get the error for the prediction
-
-
+mae = MAE(real_prevalence, pred_prevalence)
+nrae = NRAE(real_prevalence, pred_prevalence)

-print(f"Absolute Error -> {ae}")
 print(f"Mean Absolute Error -> {mae}")
+print(f"Normalized Relative Absolute Error -> {nrae}")
 ```
@@ -129,13 +128,6 @@ ___

 ## Documentation

-##### API is avaliable [here](https://luizfernandolj.github.io/mlquantify/api/
-
-- [Methods](https://github.com/luizfernandolj/mlquantify/wiki/Methods)
-- [Model Selection](https://github.com/luizfernandolj/mlquantify/wiki/Model-Selection)
-- [Evaluation](https://github.com/luizfernandolj/mlquantify/wiki/Evaluation)
-- [Plotting](https://github.com/luizfernandolj/mlquantify/wiki/Plotting)
-- [Utilities](https://github.com/luizfernandolj/mlquantify/wiki/Utilities)
-
+##### API is avaliable [here](https://luizfernandolj.github.io/mlquantify/api/)

 ___