mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +32 -6
- mlquantify/base.py +559 -257
- mlquantify/classification/__init__.py +1 -1
- mlquantify/classification/methods.py +160 -0
- mlquantify/evaluation/__init__.py +14 -2
- mlquantify/evaluation/measures.py +215 -0
- mlquantify/evaluation/protocol.py +647 -0
- mlquantify/methods/__init__.py +37 -40
- mlquantify/methods/aggregative.py +1030 -0
- mlquantify/methods/meta.py +472 -0
- mlquantify/methods/mixture_models.py +1003 -0
- mlquantify/methods/non_aggregative.py +136 -0
- mlquantify/methods/threshold_optimization.py +957 -0
- mlquantify/model_selection.py +377 -232
- mlquantify/plots.py +367 -0
- mlquantify/utils/__init__.py +2 -2
- mlquantify/utils/general.py +334 -0
- mlquantify/utils/method.py +449 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
- mlquantify-0.1.1.dist-info/RECORD +22 -0
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
- mlquantify/classification/pwkclf.py +0 -73
- mlquantify/evaluation/measures/__init__.py +0 -26
- mlquantify/evaluation/measures/ae.py +0 -11
- mlquantify/evaluation/measures/bias.py +0 -16
- mlquantify/evaluation/measures/kld.py +0 -8
- mlquantify/evaluation/measures/mse.py +0 -12
- mlquantify/evaluation/measures/nae.py +0 -16
- mlquantify/evaluation/measures/nkld.py +0 -13
- mlquantify/evaluation/measures/nrae.py +0 -16
- mlquantify/evaluation/measures/rae.py +0 -12
- mlquantify/evaluation/measures/se.py +0 -12
- mlquantify/evaluation/protocol/_Protocol.py +0 -202
- mlquantify/evaluation/protocol/__init__.py +0 -2
- mlquantify/evaluation/protocol/app.py +0 -146
- mlquantify/evaluation/protocol/npp.py +0 -34
- mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
- mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
- mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
- mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
- mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
- mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
- mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
- mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
- mlquantify/methods/aggregative/__init__.py +0 -9
- mlquantify/methods/aggregative/cc.py +0 -32
- mlquantify/methods/aggregative/emq.py +0 -86
- mlquantify/methods/aggregative/fm.py +0 -72
- mlquantify/methods/aggregative/gac.py +0 -96
- mlquantify/methods/aggregative/gpac.py +0 -87
- mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
- mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
- mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
- mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
- mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
- mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
- mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
- mlquantify/methods/aggregative/pcc.py +0 -33
- mlquantify/methods/aggregative/pwk.py +0 -38
- mlquantify/methods/meta/__init__.py +0 -1
- mlquantify/methods/meta/ensemble.py +0 -236
- mlquantify/methods/non_aggregative/__init__.py +0 -1
- mlquantify/methods/non_aggregative/hdx.py +0 -71
- mlquantify/plots/__init__.py +0 -2
- mlquantify/plots/distribution_plot.py +0 -109
- mlquantify/plots/protocol_plot.py +0 -193
- mlquantify/utils/general_purposes/__init__.py +0 -8
- mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
- mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
- mlquantify/utils/general_purposes/get_real_prev.py +0 -9
- mlquantify/utils/general_purposes/load_quantifier.py +0 -4
- mlquantify/utils/general_purposes/make_prevs.py +0 -23
- mlquantify/utils/general_purposes/normalize.py +0 -20
- mlquantify/utils/general_purposes/parallel.py +0 -10
- mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
- mlquantify/utils/method_purposes/__init__.py +0 -6
- mlquantify/utils/method_purposes/distances.py +0 -21
- mlquantify/utils/method_purposes/getHist.py +0 -13
- mlquantify/utils/method_purposes/get_scores.py +0 -33
- mlquantify/utils/method_purposes/moss.py +0 -16
- mlquantify/utils/method_purposes/ternary_search.py +0 -14
- mlquantify/utils/method_purposes/tprfpr.py +0 -42
- mlquantify-0.0.11.2.dist-info/RECORD +0 -73
- {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
|
|
4
|
-
from ._MixtureModel import MixtureModel
|
|
5
|
-
|
|
6
|
-
class SORD(MixtureModel):
    """Sample Ordinal Distance (SORD) quantifier.

    SORD does not rely on binned distributions. It estimates the prevalence
    of the positive class in a test set by searching a grid of mixture
    weights ``alpha`` for the one minimizing a sample ordinal distance
    between the test scores and the alpha-weighted mixture of the known
    positive and negative scores.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        super().__init__(learner)

        # Position (in the alpha grid) of the best alpha found by the most
        # recent call to ``_compute_prevalence``; ``None`` before any call.
        self.best_distance_index = None

    def _compute_prevalence(self, test_scores: np.ndarray) -> float:
        """Return the alpha of the grid with the smallest distance.

        Also records the index of that alpha in ``self.best_distance_index``.
        """
        alpha_values, distance_measures = self._calculate_distances(test_scores)

        self.best_distance_index = np.argmin(distance_measures)
        return alpha_values[self.best_distance_index]

    def _calculate_distances(self, test_scores: np.ndarray):
        """Compute the SORD distance for every alpha in ``linspace(0, 1, 101)``.

        Reads ``self.pos_scores`` and ``self.neg_scores`` (presumably set by
        the ``MixtureModel`` base class during fitting -- confirm there).

        Returns
        -------
        tuple
            ``(alpha_values, distance_measures)`` where ``alpha_values`` is
            the ndarray grid and ``distance_measures`` is a list with one
            distance per alpha.
        """
        alpha_values = np.linspace(0, 1, 101)

        num_pos_scores = len(self.pos_scores)
        num_neg_scores = len(self.neg_scores)
        num_test_scores = len(test_scores)

        # The pooled scores, their ordering and the gaps between consecutive
        # sorted scores do not depend on alpha, so compute them only once.
        all_scores = np.concatenate([self.pos_scores, self.neg_scores, test_scores])
        sorted_indices = np.argsort(all_scores)
        segment_widths = np.diff(all_scores[sorted_indices])

        # Test points always carry the same (negative) weight.
        test_weights = np.full(num_test_scores, -1 / num_test_scores)

        distance_measures = []
        for alpha in alpha_values:
            all_weights = np.concatenate([
                np.full(num_pos_scores, alpha / num_pos_scores),
                np.full(num_neg_scores, (1 - alpha) / num_neg_scores),
                test_weights,
            ])

            # cumulative[j] is the total weight of the j + 1 smallest scores,
            # i.e. the weight "in transit" across the gap segment_widths[j].
            cumulative = np.cumsum(all_weights[sorted_indices])[:-1]
            distance_measures.append(np.sum(np.abs(segment_widths * cumulative)))

        return alpha_values, distance_measures
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
from ...base import AggregativeQuantifier
|
|
4
|
-
|
|
5
|
-
class PCC(AggregativeQuantifier):
    """Probabilistic Classify and Count.

    The prevalence of each class is estimated as the mean of the posterior
    probabilities that the learner assigns to that class.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        self.learner = learner

    def _fit_method(self, X, y):
        """Fit the learner on ``(X, y)`` unless it is flagged as already fitted."""
        # ``learner_fitted`` is not defined in this class; presumably provided
        # by AggregativeQuantifier -- confirm there.
        if not self.learner_fitted:
            self.learner.fit(X, y)
        return self

    def _predict_method(self, X) -> np.ndarray:
        """Return one mean posterior probability per class, as an ndarray."""
        # Score the sample once; the original re-ran predict_proba for each
        # class even though the result does not depend on the class index.
        probabilities = self.learner.predict_proba(X)

        return np.asarray([
            np.mean(probabilities[:, class_index])
            for class_index in range(self.n_class)
        ])
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from sklearn.base import BaseEstimator
|
|
3
|
-
from collections import defaultdict
|
|
4
|
-
|
|
5
|
-
from ...base import AggregativeQuantifier
|
|
6
|
-
|
|
7
|
-
class PWK(AggregativeQuantifier):
    """Nearest-neighbor based quantification.

    Adapts nearest-neighbor classification to the quantification setting.
    The k-NN approach uses a weighting scheme that gives less weight to
    neighbors from the majority class.
    Must be used with PWKCLF to work as expected.
    """

    def __init__(self, learner: BaseEstimator):
        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
        self.learner = learner

    def _fit_method(self, X, y):
        """Fit the learner on ``(X, y)`` unless it is flagged as already fitted."""
        if self.learner_fitted:
            return self
        self.learner.fit(X, y)
        return self

    def _predict_method(self, X) -> dict:
        """Return ``{label: fraction of predictions equal to label}``.

        Only labels that the learner actually predicts for ``X`` appear as keys.
        """
        labels, counts = np.unique(self.learner.predict(X), return_counts=True)
        shares = counts / counts.sum()
        return dict(zip(labels, shares))
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from .ensemble import Ensemble
|
|
@@ -1,236 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from copy import deepcopy
|
|
4
|
-
from tqdm import tqdm
|
|
5
|
-
from sklearn.linear_model import LogisticRegression
|
|
6
|
-
from sklearn.model_selection import GridSearchCV, cross_val_predict
|
|
7
|
-
from ...evaluation import measures
|
|
8
|
-
from ...base import Quantifier
|
|
9
|
-
from ...utils import make_prevs, getHist, normalize_prevalence, parallel, hellinger, generate_artificial_indexes
|
|
10
|
-
|
|
11
|
-
class Ensemble(Quantifier):
    """Ensemble of quantifiers trained on samples with varied class prevalences.

    Based on the articles:
        Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
        Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
        Information Fusion, 34, 87-100.
    and
        Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
        Dynamic ensemble selection for quantification tasks.
        Information Fusion, 45, 1-15.

    Several copies of a base quantifier are fitted, each on a sample drawn
    with a different class proportion. At prediction time the estimates of
    the members are aggregated; which members take part is controlled by
    the selection metric:
        - all -> use the predictions of every member
        - ptr -> keep the members whose training prevalence is closest (mean
                 squared error) to a first estimate of the test prevalence
        - ds  -> keep the members whose training posterior distribution is
                 closest (Hellinger distance) to the test one; binary only

    Parameters
    ----------
    quantifier : Quantifier
        Base quantifier; a deep copy is fitted for every ensemble member.
    size : int
        Number of ensemble members.
    min_prop : float
        Minimum prevalence allowed for any class in a training sample.
    selection_metric : str
        One of ``SELECTION_METRICS``.
    p_metric : float
        Fraction of the ensemble kept by the 'ptr' and 'ds' selections.
    return_type : str
        ``"median"`` aggregates with the median; any other value uses the mean.
    max_sample_size : int or None
        Upper bound for the size of each training sample (``None``: ``len(y)``).
    max_trials : int
        Maximum attempts when drawing each random prevalence vector.
    n_jobs : int
        Number of parallel jobs.
    verbose : bool
        Whether to print progress messages.
    """

    SELECTION_METRICS = {'all', 'ptr', 'ds'}

    def __init__(self,
                 quantifier: Quantifier,
                 size: int = 50,
                 min_prop: float = 0.1,
                 selection_metric: str = 'all',
                 p_metric: float = 0.25,
                 return_type: str = "mean",
                 max_sample_size: int = None,
                 max_trials: int = 100,
                 n_jobs: int = 1,
                 verbose: bool = False):

        assert selection_metric in Ensemble.SELECTION_METRICS, \
            f'unknown selection_metric={selection_metric}; valid are {Ensemble.SELECTION_METRICS}'
        assert max_sample_size is None or max_sample_size > 0, \
            'wrong value for max_sample_size; set it to a positive number or None'

        self.base_quantifier = quantifier
        self.size = size
        self.min_prop = min_prop
        self.p_metric = p_metric
        self.selection_metric = selection_metric
        self.return_type = return_type
        self.n_jobs = n_jobs
        # Callable producing posteriors for test data; only set by ``fit``
        # when selection_metric == 'ds'.
        self.proba_generator = None
        self.verbose = verbose
        self.max_sample_size = max_sample_size
        self.max_trials = max_trials

    def sout(self, msg):
        """Print ``msg`` with an '[Ensemble]' prefix when verbose is enabled."""
        if self.verbose:
            print('[Ensemble]' + msg)

    def fit(self, X, y):
        """Fit ``size`` copies of the base quantifier on resampled data.

        Each member is stored in ``self.ensemble`` as the tuple returned by
        ``_delayed_new_sample``.

        Raises
        ------
        ValueError
            If selection_metric is 'ds' and the data is not binary.
        """
        self.sout('Fit')

        self.classes = np.unique(y)

        # ``binary_data`` and ``n_class`` are not defined in this class;
        # presumably provided by the Quantifier base class -- confirm there.
        if self.selection_metric == 'ds' and not self.binary_data:
            raise ValueError('ds selection_metric is only defined for binary quantification, but this dataset is not binary')

        # Randomly choose the prevalences for each member of the ensemble
        # (no class gets a prevalence below min_prop).
        sample_size = len(y) if self.max_sample_size is None else min(self.max_sample_size, len(y))
        prevs = [_draw_simplex(ndim=self.n_class, min_val=self.min_prop, max_trials=self.max_trials) for _ in range(self.size)]

        posteriors = None
        if self.selection_metric == 'ds':
            # Precompute the training posterior probabilities.
            posteriors, self.proba_generator = self.ds_get_posteriors(X, y)

        args = (
            (X, y, self.base_quantifier, prev, posteriors, self.verbose, sample_size)
            for prev in prevs
        )

        self.ensemble = parallel(
            _delayed_new_sample,
            tqdm(args, desc='fitting ensemble', total=self.size) if self.verbose else args,
            n_jobs=self.n_jobs)

        self.sout('Fit [Done]')
        return self

    def predict(self, X):
        """Aggregate the prevalence estimates of the (selected) members for ``X``."""
        self.sout('Predict')

        args = ((Qi, X) for Qi in self.ensemble)

        prevalences = np.asarray(
            parallel(_delayed_predict,
                     tqdm(args, desc="Predicting Ensemble", total=len(self.ensemble)) if self.verbose else args,
                     n_jobs=self.n_jobs)
        )

        prevalences = pd.DataFrame(prevalences).to_numpy()

        # NOTE: the number of members to keep is derived inside the selection
        # methods. The original overwrote ``self.p_metric`` here, which
        # corrupted the hyperparameter for every later ``predict`` call.
        if self.selection_metric == 'ptr':
            prevalences = self.ptr_selection_metric(prevalences)
        elif self.selection_metric == 'ds':
            prevalences = self.ds_selection_metric(prevalences, X)

        if self.return_type == "median":
            prevalences = np.median(prevalences, axis=0)
        else:
            prevalences = np.mean(prevalences, axis=0)

        self.sout('Predict [Done]')
        return normalize_prevalence(prevalences, self.classes)

    def _n_selected(self, n_members):
        """Return how many of ``n_members`` predictions the selection keeps."""
        return int(n_members * self.p_metric)

    def ptr_selection_metric(self, prevalences):
        """
        Selects the prevalences made by models that have been trained on samples with a prevalence that is most similar
        to a first approximation of the test prevalence as made by all models in the ensemble.
        """
        test_prev_estim = prevalences.mean(axis=0)
        tr_prevs = [m[1] for m in self.ensemble]
        ptr_differences = [measures.mean_squared_error(test_prev_estim, ptr_i) for ptr_i in tr_prevs]
        order = np.argsort(ptr_differences)
        return _select_k(prevalences, order, k=self._n_selected(len(prevalences)))

    def ds_get_posteriors(self, X, y):
        """
        In the original article, this procedure is not described in a sufficient level of detail. The paper only says
        that the distribution of posterior probabilities from training and test examples is compared by means of the
        Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article,
        a Logistic Regressor (LR) is used as the classifier device and that could be used for this purpose. However, in
        general, a Quantifier is not necessarily an instance of Aggregative Probabilistic Quantifiers, and so, that the
        quantifier builds on top of a probabilistic classifier cannot be taken for granted. Additionally, it would not
        be correct to generate the posterior probabilities for training documents that have concurred in training the
        classifier that generates them.
        This function thus generates the posterior probabilities for all training documents in a cross-validation way,
        using a LR with hyperparameters that have previously been optimized via grid search in 5FCV.
        :return P,f, where P is a ndarray containing the posterior probabilities of the training data, generated via
            cross-validation and using an optimized LR, and the function to be used in order to generate posterior
            probabilities for test X.
        """
        lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)

        optim = GridSearchCV(
            lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
        ).fit(X, y)

        posteriors = cross_val_predict(
            optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
        )
        posteriors_generator = optim.best_estimator_.predict_proba

        return posteriors, posteriors_generator

    def ds_selection_metric(self, prevalences, test):
        """
        Selects the prevalences made by models whose training posterior distribution (8-bin histogram) is closest,
        in Hellinger distance, to the posterior distribution of the test data.
        """
        test_posteriors = self.proba_generator(test)
        test_distribution = getHist(test_posteriors, 8)
        tr_distributions = [m[2] for m in self.ensemble]
        dist = [hellinger(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
        order = np.argsort(dist)
        return _select_k(prevalences, order, k=self._n_selected(len(prevalences)))
|
|
178
|
-
|
|
179
|
-
def _select_k(elements, order, k):
    """Pick the elements at the first ``k`` positions listed in ``order``.

    Returns the picked elements as a list. When nothing is picked (for
    example ``k == 0``), a notice is printed and ``elements`` is returned
    unchanged.
    """
    picked = []
    for position in order[:k]:
        picked.append(elements[position])

    if not picked:
        print(f"Unable to take {k} for elements with size {len(elements)}")
        return elements
    return picked
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
def _delayed_new_sample(args):
    """Fit one ensemble member on a sample drawn with a given prevalence.

    ``args`` is the tuple ``(X, y, base_quantifier, prev, posteriors,
    verbose, sample_size)``. A deep copy of ``base_quantifier`` is fitted on
    the rows selected by ``generate_artificial_indexes``.

    Returns the tuple ``(model, prev, tr_distribution, X, y)`` where
    ``tr_distribution`` is the 8-bin histogram of the sampled posteriors, or
    ``None`` when no posteriors were given.
    """
    X, y, base_quantifier, prev, posteriors, verbose, sample_size = args

    if verbose:
        print(f'\tfit-start for prev {str(np.round(prev, 3))}, sample_size={sample_size}')

    member = deepcopy(base_quantifier)

    picked = generate_artificial_indexes(y, prev, sample_size, np.unique(y))
    member.fit(np.take(X, picked, axis=0), np.take(y, picked, axis=0))

    if posteriors is None:
        tr_distribution = None
    else:
        tr_distribution = getHist(posteriors[picked], 8)

    if verbose:
        print(f'\t \\--fit-ended for prev {str(np.round(prev, 3))}')

    return (member, prev, tr_distribution, X, y)
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
def _delayed_predict(args):
    """Return the prevalence values predicted by one ensemble member.

    ``args`` is ``(member, X)`` where ``member[0]`` is the fitted quantifier;
    its ``predict`` result must offer ``.values()`` (e.g. a dict).
    """
    member, X = args
    fitted_model = member[0]
    return [*fitted_model.predict(X).values()]
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
def _draw_simplex(ndim, min_val, max_trials=100):
    """
    Returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
    are >= min_val (for min_val>0, this makes the sampling not truly uniform).

    Candidates are drawn with ``make_prevs`` and rejected until one satisfies the constraint.

    :param ndim: number of dimensions of the simplex
    :param min_val: minimum class prevalence allowed. If greater than or equal to 1/ndim a ValueError is raised,
        since no sample can be expected to satisfy the constraint.
    :param max_trials: maximum number of candidates drawn before giving up
    :return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R where S(ndim) is the simplex
        and R is the simplex subset containing dimensions lower than min_val
    :raises ValueError: if min_val >= 1/ndim, or if no valid sample is found within max_trials candidates
    """
    if min_val >= 1 / ndim:
        # The constraint is too tight, so the remedy is a SMALLER min_val.
        raise ValueError(f'no sample can be drawn from the {ndim}-dimensional simplex so that '
                         f'all its values are >={min_val} (try with a smaller value for min_val)')
    trials = 0
    while True:
        u = make_prevs(ndim)
        if all(u >= min_val):
            return u
        trials += 1
        if trials >= max_trials:
            raise ValueError(f'it looks like finding a random simplex with all its dimensions being '
                             f'>= {min_val} is unlikely (it failed after {max_trials} trials)')
|
|
236
|
-
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
from .hdx import HDx
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import numpy as np
|
|
3
|
-
|
|
4
|
-
from ...base import NonAggregativeQuantifier
|
|
5
|
-
from ...utils import getHist, hellinger
|
|
6
|
-
|
|
7
|
-
class HDx(NonAggregativeQuantifier):
    """Hellinger Distance Minimization over features (HDx).

    Similar to the HDy method, but instead of computing the Hellinger
    distance between distributions of classifier scores, HDx computes it
    for each one of the features of the dataset and averages the results.

    Parameters
    ----------
    bins_size : array-like of numbers, optional
        Bin counts to evaluate for every feature. Defaults to
        ``np.append(np.linspace(2, 20, 10), 30)``.
    """

    def __init__(self, bins_size: np.ndarray = None):
        # ``not bins_size`` raises ValueError for any ndarray with more than
        # one element, so test explicitly for "missing or empty" instead.
        if bins_size is None or np.size(bins_size) == 0:
            bins_size = np.append(np.linspace(2, 20, 10), 30)

        self.bins_size = bins_size
        # Feature rows of each class, stored by ``_fit_method``.
        self.neg_features = None
        self.pos_features = None

    def _fit_method(self, X, y):
        """Store the feature rows of the positive and negative class as ndarrays."""
        # Convert once, up front. The original converted ``neg_features``
        # depending on the type of ``y`` rather than ``X``, which left a
        # DataFrame behind (or called ``to_numpy`` on an ndarray) whenever
        # X and y were of different kinds.
        if not isinstance(X, np.ndarray):
            X = X.to_numpy()
        y = np.asarray(y)

        # ``self.classes`` is not defined in this class; presumably set by
        # the base class, with classes[1] the positive label -- confirm there.
        self.pos_features = X[y == self.classes[1]]
        self.neg_features = X[y == self.classes[0]]

        return self

    def _predict_method(self, X) -> np.ndarray:
        """Return ``[1 - p, p]`` where ``p`` is the estimated positive prevalence.

        ``p`` is the value in {0.00, 0.01, ..., 1.00} whose mixture of
        positive and negative training histograms has the smallest mean
        Hellinger distance to the test histograms, averaged over every
        feature and every bin count in ``self.bins_size``.
        """
        if not isinstance(X, np.ndarray):
            X = X.to_numpy()

        alpha_values = np.round(np.linspace(0, 1, 101), 2)

        # The histograms do not depend on alpha, so build them once instead
        # of once per alpha (assumes getHist is a pure function -- confirm).
        histograms = [
            (getHist(self.pos_features[:, i], bins),
             getHist(self.neg_features[:, i], bins),
             getHist(X[:, i], bins))
            for i in range(X.shape[1])
            for bins in self.bins_size
        ]

        best_distances = {}
        for alpha in alpha_values:
            # Mix the positive and negative densities with weight alpha and
            # measure how far the mixture is from the test density.
            distances = [
                hellinger((dist_pos * alpha) + (dist_neg * (1 - alpha)), dist_test)
                for dist_pos, dist_neg, dist_test in histograms
            ]
            best_distances[alpha] = np.mean(distances)

        prevalence = min(best_distances, key=best_distances.get)

        return np.asarray([1 - prevalence, prevalence])
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
mlquantify/plots/__init__.py
DELETED
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import matplotlib.pyplot as plt
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import List, Optional, Dict, Any, Union
|
|
5
|
-
|
|
6
|
-
# Library-wide matplotlib defaults; applied globally when this module is imported.
plt.rcParams.update({
    # Backgrounds and layout
    'figure.facecolor': "#F8F8F8",
    'axes.facecolor': "#F8F8F8",
    'figure.subplot.bottom': 0.2,

    # Fonts
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 12,
    'font.weight': 'light',

    # Axes labels and titles
    'axes.labelsize': 14,
    'axes.labelweight': 'light',
    'axes.titlesize': 16,
    'axes.titleweight': 'normal',

    # Box plots
    'boxplot.boxprops.linewidth': 0.3,
    'boxplot.whiskerprops.linewidth': 0.3,
    'boxplot.capprops.linewidth': 0.3,
    'boxplot.flierprops.linewidth': 0.3,
    'boxplot.flierprops.markersize': 0.9,
    'boxplot.medianprops.linewidth': 0.6,
    'boxplot.medianprops.color': 'black',

    # Grid
    'axes.grid': True,
    'grid.color': 'black',
    'grid.alpha': 0.1,
    'grid.linewidth': 0.5,
    'grid.linestyle': '--',
})
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# Colour cycle for plotted series (31 entries; a few hex codes appear twice).
COLORS = [
    '#FFAB91', '#FFE082', '#A5D6A7', '#4DD0E1', '#FF6F61',
    '#FF8C94', '#D4A5A5', '#FF677D', '#B9FBC0', '#C2C2F0',
    '#E3F9A6', '#E2A8F7', '#F7B7A3', '#F7C6C7', '#8D9BFC',
    '#B4E6FF', '#FF8A65', '#FFC3A0', '#FFCCBC', '#F8BBD0',
    '#FF9AA2', '#FFB3B3', '#FFDDC1', '#FFE0B2', '#E2A8F7',
    '#F7C6C7', '#E57373', '#BA68C8', '#4FC3F7', '#FFB3B3',
    '#FF6F61',
]
|
|
42
|
-
|
|
43
|
-
def class_distribution_plot(values: Union[List, np.ndarray],
                            labels: Union[List, np.ndarray],
                            bins: int = 30,
                            title: Optional[str] = None,
                            legend: bool = True,
                            save_path: Optional[str] = None,
                            plot_params: Optional[Dict[str, Any]] = None):
    """Plot overlaid histograms of class distributions.

    One semi-transparent histogram is drawn per entry of `values`, coloured
    from the module-level `COLORS` cycle, then the figure is optionally
    saved and finally shown.

    Args:
        values (Union[List, np.ndarray]):
            A list of arrays or a single array containing values for specific classes or categories.
        labels (Union[List, np.ndarray]):
            A list or an array of labels corresponding to each value set in `values`.
            Must be the same length as `values`.
        bins (int, optional):
            Number of bins to use in the histograms. Default is 30.
        title (Optional[str], optional):
            Title of the plot. If not provided, no title will be displayed.
        legend (bool, optional):
            Whether to display a legend. Default is True.
        save_path (Optional[str], optional):
            File path to save the plot image. If not provided, the plot will not be saved.
        plot_params (Optional[Dict[str, Any]], optional):
            Matplotlib rcParams applied to this plot only. Default is None.

    Raises:
        AssertionError:
            If the number of labels does not match the number of value sets.

    """
    # Ensure the number of labels matches the number of value sets
    assert len(values) == len(labels), "The number of value sets must match the number of labels."

    # Apply the custom parameters only while this plot is built and shown.
    # rc_context restores the previous rcParams on exit, so they no longer
    # leak into every later figure as with a bare plt.rcParams.update().
    with plt.rc_context(plot_params or {}):
        # Create the overlaid histogram
        for i, (value_set, label) in enumerate(zip(values, labels)):
            plt.hist(value_set, bins=bins, color=COLORS[i % len(COLORS)], edgecolor='black', alpha=0.5, label=label)

        if title:
            plt.title(title)

        if legend:
            plt.legend(loc='upper right')

        plt.xlabel('Values')
        plt.ylabel('Frequency')

        # Save before showing the figure.
        if save_path:
            plt.savefig(save_path, bbox_inches='tight')

        plt.show()
|