mlquantify 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +10 -29
- mlquantify/adjust_counting/__init__.py +24 -0
- mlquantify/adjust_counting/_adjustment.py +648 -0
- mlquantify/adjust_counting/_base.py +245 -0
- mlquantify/adjust_counting/_counting.py +153 -0
- mlquantify/adjust_counting/_utils.py +109 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +329 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +147 -0
- mlquantify/likelihood/_classes.py +430 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +785 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +51 -36
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +147 -0
- mlquantify/mixture/_classes.py +458 -0
- mlquantify/mixture/_utils.py +163 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +168 -0
- mlquantify/neighbors/_classes.py +150 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +37 -62
- mlquantify/neighbors/_kde.py +268 -0
- mlquantify/neighbors/_utils.py +131 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +64 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/METADATA +2 -1
- mlquantify-0.1.10.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -289
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.8.dist-info/RECORD +0 -22
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.8.dist-info → mlquantify-0.1.10.dist-info}/top_level.txt +0 -0
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
|
|
3
|
-
from ..base import NonAggregativeQuantifier
|
|
4
|
-
from ..utils.method import getHist, hellinger
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class HDx(NonAggregativeQuantifier):
    """
    Hellinger Distance Minimization (HDx).

    This method estimates class prevalence by calculating the Hellinger
    distance for each feature in the dataset, as opposed to HDy, which
    computes the distance for classifier-generated scores.

    Parameters
    ----------
    bins_size : np.ndarray, optional
        An array of bin sizes for histogram calculations. Defaults to an array
        combining linearly spaced values between 2 and 20 with an additional
        bin size of 30.

    Attributes
    ----------
    bins_size : np.ndarray
        An array of bin sizes for histogram calculations.
    neg_features : np.ndarray
        Features from the negative class.
    pos_features : np.ndarray
        Features from the positive class.

    References
    ----------
    .. [1] GONZÁLEZ-CASTRO, Víctor; ALAIZ-RODRÍGUEZ, Rocío; ALEGRE, Enrique. Class distribution estimation based on the Hellinger distance. Information Sciences, v. 218, p. 146-164, 2013. Available at https://www.sciencedirect.com/science/article/abs/pii/S0020025512004069?casa_token=W6UksOigmp4AAAAA:ap8FK5mtpAzG-s8k2ygfRVgdIBYDGWjEi70ueJ546coP9F-VNaCKE5W_gsAv0bWQiwzt2QoAuLjP

    Examples
    --------
    >>> from mlquantify.methods.non_aggregative import HDx
    >>> from mlquantify.utils.general import get_real_prev
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> features, target = load_breast_cancer(return_X_y=True)
    >>>
    >>> X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)
    >>>
    >>> model = HDx()
    >>> model.fit(X_train, y_train)
    >>>
    >>> predictions = model.predict(X_test)
    >>> predictions
    {0: 0.39, 1: 0.61}
    >>> get_real_prev(y_test)
    {0: 0.3684210526315789, 1: 0.631578947368421}
    """

    def __init__(self, bins_size: np.ndarray = None):
        # Default grid of bin counts: 10 linearly spaced values in [2, 20]
        # plus a coarser 30-bin option, as in the reference implementation.
        if bins_size is None:
            bins_size = np.append(np.linspace(2, 20, 10), 30)

        self.bins_size = bins_size
        self.neg_features = None
        self.pos_features = None

    def _fit_method(self, X, y):
        """
        Fit the HDx model by separating the features into positive and negative classes.

        Parameters
        ----------
        X : array-like
            Feature matrix.
        y : array-like
            Target labels.

        Returns
        -------
        self : HDx
            The fitted instance of the class.
        """
        self.pos_features = X[y == self.classes[1]]
        self.neg_features = X[y == self.classes[0]]

        # BUG FIX: the original gated the conversion of neg_features on the
        # type of ``y`` rather than ``X``. Both splits are slices of X, so
        # whether they need ``.to_numpy()`` (e.g. pandas DataFrame input)
        # depends solely on X's type; otherwise ``[:, i]`` indexing in
        # ``_predict_method`` fails on a non-ndarray slice.
        if not isinstance(X, np.ndarray):
            self.pos_features = self.pos_features.to_numpy()
            self.neg_features = self.neg_features.to_numpy()

        return self

    def _predict_method(self, X) -> np.ndarray:
        """
        Predict the prevalence of the positive and negative classes.

        Parameters
        ----------
        X : array-like
            Feature matrix for the test data.

        Returns
        -------
        prevalence : np.ndarray
            A 2-element array representing the prevalence of the negative
            and positive classes, respectively.
        """
        if not isinstance(X, np.ndarray):
            X = X.to_numpy()

        # Candidate positive-class prevalences: 0.00, 0.01, ..., 1.00.
        alpha_values = np.round(np.linspace(0, 1, 101), 2)

        # The per-feature histograms do not depend on alpha, so compute them
        # once per (feature, bin-count) pair instead of inside the alpha loop
        # (the original recomputed each histogram 101 times).
        histograms = []
        for i in range(X.shape[1]):
            for bins in self.bins_size:
                histograms.append((
                    getHist(self.pos_features[:, i], bins),
                    getHist(self.neg_features[:, i], bins),
                    getHist(X[:, i], bins),
                ))

        best_distances = {}
        for alpha in alpha_values:
            # Mix the class-conditional densities with weight alpha and
            # measure how close the mixture is to the test density.
            distances = [
                hellinger((pos * alpha) + (neg * (1 - alpha)), test)
                for pos, neg, test in histograms
            ]
            # Store the mean distance over all features/bin sizes for this alpha.
            best_distances[alpha] = np.mean(distances)

        # The estimated positive prevalence is the alpha minimizing the
        # mean Hellinger distance.
        prevalence = min(best_distances, key=best_distances.get)

        return np.asarray([1 - prevalence, prevalence])