mlquantify 0.0.11.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. mlquantify/__init__.py +32 -6
  2. mlquantify/base.py +559 -257
  3. mlquantify/classification/__init__.py +1 -1
  4. mlquantify/classification/methods.py +160 -0
  5. mlquantify/evaluation/__init__.py +14 -2
  6. mlquantify/evaluation/measures.py +215 -0
  7. mlquantify/evaluation/protocol.py +647 -0
  8. mlquantify/methods/__init__.py +37 -40
  9. mlquantify/methods/aggregative.py +1030 -0
  10. mlquantify/methods/meta.py +472 -0
  11. mlquantify/methods/mixture_models.py +1003 -0
  12. mlquantify/methods/non_aggregative.py +136 -0
  13. mlquantify/methods/threshold_optimization.py +957 -0
  14. mlquantify/model_selection.py +377 -232
  15. mlquantify/plots.py +367 -0
  16. mlquantify/utils/__init__.py +2 -2
  17. mlquantify/utils/general.py +334 -0
  18. mlquantify/utils/method.py +449 -0
  19. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/METADATA +137 -122
  20. mlquantify-0.1.1.dist-info/RECORD +22 -0
  21. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/WHEEL +1 -1
  22. mlquantify/classification/pwkclf.py +0 -73
  23. mlquantify/evaluation/measures/__init__.py +0 -26
  24. mlquantify/evaluation/measures/ae.py +0 -11
  25. mlquantify/evaluation/measures/bias.py +0 -16
  26. mlquantify/evaluation/measures/kld.py +0 -8
  27. mlquantify/evaluation/measures/mse.py +0 -12
  28. mlquantify/evaluation/measures/nae.py +0 -16
  29. mlquantify/evaluation/measures/nkld.py +0 -13
  30. mlquantify/evaluation/measures/nrae.py +0 -16
  31. mlquantify/evaluation/measures/rae.py +0 -12
  32. mlquantify/evaluation/measures/se.py +0 -12
  33. mlquantify/evaluation/protocol/_Protocol.py +0 -202
  34. mlquantify/evaluation/protocol/__init__.py +0 -2
  35. mlquantify/evaluation/protocol/app.py +0 -146
  36. mlquantify/evaluation/protocol/npp.py +0 -34
  37. mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -62
  38. mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -7
  39. mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -27
  40. mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -23
  41. mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -21
  42. mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -25
  43. mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -41
  44. mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -21
  45. mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -23
  46. mlquantify/methods/aggregative/__init__.py +0 -9
  47. mlquantify/methods/aggregative/cc.py +0 -32
  48. mlquantify/methods/aggregative/emq.py +0 -86
  49. mlquantify/methods/aggregative/fm.py +0 -72
  50. mlquantify/methods/aggregative/gac.py +0 -96
  51. mlquantify/methods/aggregative/gpac.py +0 -87
  52. mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -81
  53. mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -5
  54. mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
  55. mlquantify/methods/aggregative/mixtureModels/dys_syn.py +0 -89
  56. mlquantify/methods/aggregative/mixtureModels/hdy.py +0 -46
  57. mlquantify/methods/aggregative/mixtureModels/smm.py +0 -27
  58. mlquantify/methods/aggregative/mixtureModels/sord.py +0 -77
  59. mlquantify/methods/aggregative/pcc.py +0 -33
  60. mlquantify/methods/aggregative/pwk.py +0 -38
  61. mlquantify/methods/meta/__init__.py +0 -1
  62. mlquantify/methods/meta/ensemble.py +0 -236
  63. mlquantify/methods/non_aggregative/__init__.py +0 -1
  64. mlquantify/methods/non_aggregative/hdx.py +0 -71
  65. mlquantify/plots/__init__.py +0 -2
  66. mlquantify/plots/distribution_plot.py +0 -109
  67. mlquantify/plots/protocol_plot.py +0 -193
  68. mlquantify/utils/general_purposes/__init__.py +0 -8
  69. mlquantify/utils/general_purposes/convert_col_to_array.py +0 -13
  70. mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -29
  71. mlquantify/utils/general_purposes/get_real_prev.py +0 -9
  72. mlquantify/utils/general_purposes/load_quantifier.py +0 -4
  73. mlquantify/utils/general_purposes/make_prevs.py +0 -23
  74. mlquantify/utils/general_purposes/normalize.py +0 -20
  75. mlquantify/utils/general_purposes/parallel.py +0 -10
  76. mlquantify/utils/general_purposes/round_protocol_df.py +0 -14
  77. mlquantify/utils/method_purposes/__init__.py +0 -6
  78. mlquantify/utils/method_purposes/distances.py +0 -21
  79. mlquantify/utils/method_purposes/getHist.py +0 -13
  80. mlquantify/utils/method_purposes/get_scores.py +0 -33
  81. mlquantify/utils/method_purposes/moss.py +0 -16
  82. mlquantify/utils/method_purposes/ternary_search.py +0 -14
  83. mlquantify/utils/method_purposes/tprfpr.py +0 -42
  84. mlquantify-0.0.11.2.dist-info/RECORD +0 -73
  85. {mlquantify-0.0.11.2.dist-info → mlquantify-0.1.1.dist-info}/top_level.txt +0 -0
@@ -1,77 +0,0 @@
1
- import numpy as np
2
- from sklearn.base import BaseEstimator
3
-
4
- from ._MixtureModel import MixtureModel
5
-
6
- class SORD(MixtureModel):
7
- """Sample Ordinal Distance. Is a method
8
- that does not rely on distributions, but
9
- estimates the prevalence of the positive
10
- class in a test dataset by calculating and
11
- minimizing a sample ordinal distance measure
12
- between the test scores and known positive
13
- and negative scores.
14
- """
15
-
16
- def __init__(self, learner: BaseEstimator):
17
- assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
18
- super().__init__(learner)
19
-
20
- self.best_distance_index = None
21
-
22
- def _compute_prevalence(self, test_scores: np.ndarray) -> float:
23
- # Compute alpha values and corresponding distance measures
24
- alpha_values, distance_measures = self._calculate_distances(test_scores)
25
-
26
- # Find the index of the alpha value with the minimum distance measure
27
- self.best_distance_index = np.argmin(distance_measures)
28
- prevalence = alpha_values[self.best_distance_index]
29
-
30
- return prevalence
31
-
32
-
33
- def _calculate_distances(self, test_scores: np.ndarray):
34
- # Define a range of alpha values from 0 to 1
35
- alpha_values = np.linspace(0, 1, 101)
36
-
37
- # Get the number of positive, negative, and test scores
38
- num_pos_scores = len(self.pos_scores)
39
- num_neg_scores = len(self.neg_scores)
40
- num_test_scores = len(test_scores)
41
-
42
- distance_measures = []
43
-
44
- # Iterate over each alpha value
45
- for alpha in alpha_values:
46
- # Compute weights for positive, negative, and test scores
47
- pos_weight = alpha / num_pos_scores
48
- neg_weight = (1 - alpha) / num_neg_scores
49
- test_weight = -1 / num_test_scores
50
-
51
- # Create arrays with weights
52
- pos_weights = np.full(num_pos_scores, pos_weight)
53
- neg_weights = np.full(num_neg_scores, neg_weight)
54
- test_weights = np.full(num_test_scores, test_weight)
55
-
56
- # Concatenate all scores and their corresponding weights
57
- all_scores = np.concatenate([self.pos_scores, self.neg_scores, test_scores])
58
- all_weights = np.concatenate([pos_weights, neg_weights, test_weights])
59
-
60
- # Sort scores and weights based on scores
61
- sorted_indices = np.argsort(all_scores)
62
- sorted_scores = all_scores[sorted_indices]
63
- sorted_weights = all_weights[sorted_indices]
64
-
65
- # Compute the total cost for the current alpha
66
- cumulative_weight = sorted_weights[0]
67
- total_cost = 0
68
-
69
- for i in range(1, len(sorted_scores)):
70
- # Calculate the cost for the segment between sorted scores
71
- segment_width = sorted_scores[i] - sorted_scores[i - 1]
72
- total_cost += abs(segment_width * cumulative_weight)
73
- cumulative_weight += sorted_weights[i]
74
-
75
- distance_measures.append(total_cost)
76
-
77
- return alpha_values, distance_measures
@@ -1,33 +0,0 @@
1
- import numpy as np
2
- from sklearn.base import BaseEstimator
3
- from ...base import AggregativeQuantifier
4
-
5
- class PCC(AggregativeQuantifier):
6
- """Probabilistic Classify and Count. This method
7
- takes the probabilistic predictions and takes the
8
- mean of them for each class.
9
- """
10
-
11
- def __init__(self, learner: BaseEstimator):
12
- assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
13
- self.learner = learner
14
-
15
- def _fit_method(self, X, y):
16
- if not self.learner_fitted:
17
- self.learner.fit(X, y)
18
- return self
19
-
20
- def _predict_method(self, X) -> dict:
21
- # Initialize a dictionary to store the prevalence for each class
22
- prevalences = []
23
-
24
- # Calculate the prevalence for each class
25
- for class_index in range(self.n_class):
26
- # Get the predicted probabilities for the current class
27
- class_probabilities = self.learner.predict_proba(X)[:, class_index]
28
-
29
- # Compute the average probability (prevalence) for the current class
30
- mean_prev = np.mean(class_probabilities)
31
- prevalences.append(mean_prev)
32
-
33
- return np.asarray(prevalences)
@@ -1,38 +0,0 @@
1
- import numpy as np
2
- from sklearn.base import BaseEstimator
3
- from collections import defaultdict
4
-
5
- from ...base import AggregativeQuantifier
6
-
7
- class PWK(AggregativeQuantifier):
8
- """ Nearest-Neighbor based Quantification. This method
9
- is based on nearest-neighbor based classification to the
10
- setting of quantification. In this k-NN approach, it applies
11
- a weighting scheme which applies less weight on neighbors
12
- from the majority class.
13
- Must be used with PWKCLF to work as expected.
14
- """
15
-
16
- def __init__(self, learner: BaseEstimator):
17
- assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
18
- self.learner = learner
19
-
20
- def _fit_method(self, X, y):
21
- if not self.learner_fitted:
22
- self.learner.fit(X, y)
23
- return self
24
-
25
- def _predict_method(self, X) -> dict:
26
- # Predict class labels for the given data
27
- predicted_labels = self.learner.predict(X)
28
-
29
- # Compute the distribution of predicted labels
30
- unique_labels, label_counts = np.unique(predicted_labels, return_counts=True)
31
-
32
- # Calculate the prevalence for each class
33
- class_prevalences = label_counts / label_counts.sum()
34
-
35
- # Map each class label to its prevalence
36
- prevalences = {label: prevalence for label, prevalence in zip(unique_labels, class_prevalences)}
37
-
38
- return prevalences
@@ -1 +0,0 @@
1
- from .ensemble import Ensemble
@@ -1,236 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from copy import deepcopy
4
- from tqdm import tqdm
5
- from sklearn.linear_model import LogisticRegression
6
- from sklearn.model_selection import GridSearchCV, cross_val_predict
7
- from ...evaluation import measures
8
- from ...base import Quantifier
9
- from ...utils import make_prevs, getHist, normalize_prevalence, parallel, hellinger, generate_artificial_indexes
10
-
11
- class Ensemble(Quantifier):
12
- SELECTION_METRICS = {'all', 'ptr', 'ds'}
13
-
14
- """Ensemble method, based on the articles:
15
- Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
16
- Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
17
- Information Fusion, 34, 87-100.
18
- and
19
- Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
20
- Dynamic ensemble selection for quantification tasks.
21
- Information Fusion, 45, 1-15.
22
-
23
- This approach of Ensemble is made of taking multiple
24
- samples varying class proportions on each, and for the
25
- predictions, it takes the k models which as the minimum
26
- seletion metric, which are:
27
- - all -> return all the predictions
28
- - ptr -> computes the selected error measure
29
- - ds -> computes the hellinger distance of the train and test
30
- distributions for each model
31
-
32
- """
33
-
34
- def __init__(self,
35
- quantifier:Quantifier,
36
- size:int=50,
37
- min_prop:float=0.1,
38
- selection_metric:str='all',
39
- p_metric:float=0.25,
40
- return_type:str="mean",
41
- max_sample_size:int=None,
42
- max_trials:int=100,
43
- n_jobs:int=1,
44
- verbose:bool=False):
45
-
46
- assert selection_metric in Ensemble.SELECTION_METRICS, \
47
- f'unknown selection_metric={selection_metric}; valid are {Ensemble.SELECTION_METRICS}'
48
- assert max_sample_size is None or max_sample_size > 0, \
49
- 'wrong value for max_sample_size; set it to a positive number or None'
50
-
51
- self.base_quantifier = quantifier
52
- self.size = size
53
- self.min_prop = min_prop
54
- self.p_metric = p_metric
55
- self.selection_metric = selection_metric
56
- self.return_type = return_type
57
- self.n_jobs = n_jobs
58
- self.proba_generator = None
59
- self.verbose = verbose
60
- self.max_sample_size = max_sample_size
61
- self.max_trials = max_trials
62
-
63
- def sout(self, msg):
64
- if self.verbose:
65
- print('[Ensemble]' + msg)
66
-
67
- def fit(self, X, y):
68
- self.sout('Fit')
69
-
70
- self.classes = np.unique(y)
71
-
72
- if self.selection_metric == 'ds' and not self.binary_data:
73
- raise ValueError(f'ds selection_metric is only defined for binary quantification, but this dataset is not binary')
74
- # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
75
- # min_pos positive examples)
76
- sample_size = len(y) if self.max_sample_size is None else min(self.max_sample_size, len(y))
77
- prevs = [_draw_simplex(ndim=self.n_class, min_val=self.min_prop, max_trials=self.max_trials) for _ in range(self.size)]
78
-
79
-
80
- posteriors = None
81
- if self.selection_metric == 'ds':
82
- # precompute the training posterior probabilities
83
- posteriors, self.proba_generator = self.ds_get_posteriors(X, y)
84
-
85
-
86
- args = (
87
- (X, y, self.base_quantifier, prev, posteriors, self.verbose, sample_size)
88
- for prev in prevs
89
- )
90
-
91
- self.ensemble = parallel(
92
- _delayed_new_sample,
93
- tqdm(args, desc='fitting ensemble', total=self.size) if self.verbose else args,
94
- n_jobs=self.n_jobs)
95
-
96
- self.sout('Fit [Done]')
97
- return self
98
-
99
- def predict(self, X):
100
- self.sout('Predict')
101
-
102
- args = ((Qi, X) for Qi in self.ensemble)
103
-
104
- prevalences = np.asarray(
105
- parallel(_delayed_predict,
106
- tqdm(args, desc="Predicting Ensemble", total=len(self.ensemble)) if self.verbose else args,
107
- n_jobs=self.n_jobs)
108
- )
109
-
110
- prevalences = pd.DataFrame(prevalences).to_numpy()
111
-
112
- self.p_metric = int(len(prevalences) * self.p_metric)
113
-
114
- if self.selection_metric == 'ptr':
115
- prevalences = self.ptr_selection_metric(prevalences)
116
- elif self.selection_metric == 'ds':
117
- prevalences = self.ds_selection_metric(prevalences, X)
118
-
119
-
120
- if self.return_type == "median":
121
- prevalences = np.median(prevalences, axis=0)
122
- else:
123
- prevalences = np.mean(prevalences, axis=0)
124
-
125
-
126
- self.sout('Predict [Done]')
127
- return normalize_prevalence(prevalences, self.classes)
128
-
129
-
130
- def ptr_selection_metric(self, prevalences):
131
- """
132
- Selects the prevalences made by models that have been trained on samples with a prevalence that is most similar
133
- to a first approximation of the test prevalence as made by all models in the ensemble.
134
- """
135
- test_prev_estim = prevalences.mean(axis=0)
136
- tr_prevs = [m[1] for m in self.ensemble]
137
- ptr_differences = [measures.mean_squared_error(test_prev_estim, ptr_i) for ptr_i in tr_prevs]
138
- order = np.argsort(ptr_differences)
139
- return _select_k(prevalences, order, k=self.p_metric)
140
-
141
- def ds_get_posteriors(self, X, y):
142
- """
143
- In the original article, this procedure is not described in a sufficient level of detail. The paper only says
144
- that the distribution of posterior probabilities from training and test examples is compared by means of the
145
- Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article,
146
- a Logistic Regressor (LR) is used as the classifier device and that could be used for this purpose. However, in
147
- general, a Quantifier is not necessarily an instance of Aggreggative Probabilistic Quantifiers, and so, that the
148
- quantifier builds on top of a probabilistic classifier cannot be given for granted. Additionally, it would not
149
- be correct to generate the posterior probabilities for training documents that have concurred in training the
150
- classifier that generates them.
151
- This function thus generates the posterior probabilities for all training documents in a cross-validation way,
152
- using a LR with hyperparameters that have previously been optimized via grid search in 5FCV.
153
- :return P,f, where P is a ndarray containing the posterior probabilities of the training data, generated via
154
- cross-validation and using an optimized LR, and the function to be used in order to generate posterior
155
- probabilities for test X.
156
- """
157
- lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
158
-
159
- optim = GridSearchCV(
160
- lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
161
- ).fit(X, y)
162
-
163
- posteriors = cross_val_predict(
164
- optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
165
- )
166
- posteriors_generator = optim.best_estimator_.predict_proba
167
-
168
- return posteriors, posteriors_generator
169
-
170
-
171
- def ds_selection_metric(self, prevalences, test):
172
- test_posteriors = self.proba_generator(test)
173
- test_distribution = getHist(test_posteriors, 8)
174
- tr_distributions = [m[2] for m in self.ensemble]
175
- dist = [hellinger(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
176
- order = np.argsort(dist)
177
- return _select_k(prevalences, order, k=self.p_metric)
178
-
179
- def _select_k(elements, order, k):
180
- elements_k = [elements[idx] for idx in order[:k]]
181
- if elements_k:
182
- return elements_k
183
- print(f"Unable to take {k} for elements with size {len(elements)}")
184
- return elements
185
-
186
-
187
-
188
- def _delayed_new_sample(args):
189
- X, y, base_quantifier, prev, posteriors, verbose, sample_size = args
190
- if verbose:
191
- print(f'\tfit-start for prev {str(np.round(prev, 3))}, sample_size={sample_size}')
192
- model = deepcopy(base_quantifier)
193
-
194
- sample_index = generate_artificial_indexes(y, prev, sample_size, np.unique(y))
195
- X_sample = np.take(X, sample_index, axis=0)
196
- y_sample = np.take(y, sample_index, axis=0)
197
- #print(X_sample)
198
-
199
- model.fit(X_sample, y_sample)
200
-
201
- tr_prevalence = prev
202
- tr_distribution = getHist(posteriors[sample_index], 8) if (posteriors is not None) else None
203
- if verbose:
204
- print(f'\t \\--fit-ended for prev {str(np.round(prev, 3))}')
205
- return (model, tr_prevalence, tr_distribution, X, y)
206
-
207
-
208
- def _delayed_predict(args):
209
- quantifier, X = args
210
- #print(np.asarray(list(quantifier[0].predict(X).values())))
211
- return list(quantifier[0].predict(X).values())
212
-
213
-
214
- def _draw_simplex(ndim, min_val, max_trials=100):
215
- """
216
- returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
217
- are >= min_class_prev (for min_val>0, this makes the sampling not truly uniform)
218
- :param ndim: number of dimensions of the simplex
219
- :param min_val: minimum class prevalence allowed. If less than 1/ndim a ValueError will be throw since
220
- there is no possible solution.
221
- :return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R where S(ndim) is the simplex
222
- and R is the simplex subset containing dimensions lower than min_val
223
- """
224
- if min_val >= 1 / ndim:
225
- raise ValueError(f'no sample can be draw from the {ndim}-dimensional simplex so that '
226
- f'all its values are >={min_val} (try with a larger value for min_pos)')
227
- trials = 0
228
- while True:
229
- u = make_prevs(ndim)
230
- if all(u >= min_val):
231
- return u
232
- trials += 1
233
- if trials >= max_trials:
234
- raise ValueError(f'it looks like finding a random simplex with all its dimensions being'
235
- f'>= {min_val} is unlikely (it failed after {max_trials} trials)')
236
-
@@ -1 +0,0 @@
1
- from .hdx import HDx
@@ -1,71 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
-
4
- from ...base import NonAggregativeQuantifier
5
- from ...utils import getHist, hellinger
6
-
7
- class HDx(NonAggregativeQuantifier):
8
- """Hellinger Distance Minimization. The method is similar
9
- to the HDy method, but istead of computing the hellinger
10
- distance of the scores (generated via classifier), HDx
11
- computes the distance of each one of the features of the
12
- dataset
13
- """
14
-
15
- def __init__(self, bins_size:np.ndarray=None):
16
- if not bins_size:
17
- bins_size = np.append(np.linspace(2,20,10), 30)
18
-
19
- self.bins_size = bins_size
20
- self.neg_features = None
21
- self.pos_features = None
22
-
23
-
24
- def _fit_method(self, X, y):
25
-
26
-
27
- self.pos_features = X[y == self.classes[1]]
28
- self.neg_features = X[y == self.classes[0]]
29
-
30
-
31
- if not isinstance(X, np.ndarray):
32
- self.pos_features = self.pos_features.to_numpy()
33
- if not isinstance(y, np.ndarray):
34
- self.neg_features = self.neg_features.to_numpy()
35
-
36
- return self
37
-
38
- def _predict_method(self, X) -> dict:
39
-
40
- if not isinstance(X, np.ndarray):
41
- X = X.to_numpy()
42
-
43
- alpha_values = np.round(np.linspace(0, 1, 101), 2)
44
-
45
- best_distances = {}
46
-
47
- for x in alpha_values:
48
-
49
- distances = []
50
-
51
- for i in range(X.shape[1]):
52
- for bins in self.bins_size:
53
-
54
- dist_feature_pos = getHist(self.pos_features[:, i], bins)
55
- dist_feature_neg = getHist(self.neg_features[:, i], bins)
56
- dist_feature_test = getHist(X[:, i], bins)
57
-
58
- # Combine densities using a mixture of positive and negative densities
59
- train_combined_density = (dist_feature_pos * x) + (dist_feature_neg * (1 - x))
60
- # Compute the distance using the Hellinger measure
61
- distances.append(hellinger(train_combined_density, dist_feature_test))
62
-
63
- best_distances[x] = np.mean(distances)
64
-
65
- prevalence = min(best_distances, key=best_distances.get)
66
-
67
- return np.asarray([1- prevalence, prevalence])
68
-
69
-
70
-
71
-
@@ -1,2 +0,0 @@
1
- from .protocol_plot import protocol_boxplot, protocol_lineplot
2
- from .distribution_plot import class_distribution_plot
@@ -1,109 +0,0 @@
1
- import numpy as np
2
- import matplotlib.pyplot as plt
3
- from pathlib import Path
4
- from typing import List, Optional, Dict, Any, Union
5
-
6
- plt.rcParams.update({
7
- 'axes.facecolor': "#F8F8F8",
8
- 'figure.facecolor': "#F8F8F8",
9
- 'font.family': 'sans-serif',
10
- 'font.sans-serif': 'Arial',
11
- 'font.size': 12,
12
- 'font.weight': 'light',
13
- 'axes.labelsize': 14,
14
- 'axes.labelweight': 'light',
15
- 'axes.titlesize': 16,
16
- 'axes.titleweight': 'normal',
17
- 'boxplot.boxprops.linewidth': 0.3,
18
- 'boxplot.whiskerprops.linewidth': 0.3,
19
- 'boxplot.capprops.linewidth': 0.3,
20
- 'boxplot.medianprops.linewidth': 0.6,
21
- 'boxplot.flierprops.linewidth': 0.3,
22
- 'boxplot.flierprops.markersize': 0.9,
23
- 'boxplot.medianprops.color': 'black',
24
- 'figure.subplot.bottom': 0.2,
25
- 'axes.grid': True,
26
- 'grid.color': 'black',
27
- 'grid.alpha': 0.1,
28
- 'grid.linewidth': 0.5,
29
- 'grid.linestyle': '--'
30
- })
31
-
32
-
33
-
34
-
35
- COLORS = [
36
- '#FFAB91', '#FFE082', '#A5D6A7', '#4DD0E1', '#FF6F61', '#FF8C94', '#D4A5A5',
37
- '#FF677D', '#B9FBC0', '#C2C2F0', '#E3F9A6', '#E2A8F7', '#F7B7A3', '#F7C6C7',
38
- '#8D9BFC', '#B4E6FF', '#FF8A65', '#FFC3A0', '#FFCCBC', '#F8BBD0', '#FF9AA2',
39
- '#FFB3B3', '#FFDDC1', '#FFE0B2', '#E2A8F7', '#F7C6C7', '#E57373', '#BA68C8',
40
- '#4FC3F7', '#FFB3B3', '#FF6F61'
41
- ]
42
-
43
- def class_distribution_plot(values: Union[List, np.ndarray],
44
- labels: Union[List, np.ndarray],
45
- bins: int = 30,
46
- title: Optional[str] = None,
47
- legend: bool = True,
48
- save_path: Optional[str] = None,
49
- plot_params: Optional[Dict[str, Any]] = None):
50
-
51
- """Plot overlaid histograms of class distributions.
52
-
53
- This function creates a plot with overlaid histograms, each representing the distribution
54
- of a different class or category. Custom colors, titles, legends, and other plot parameters
55
- can be applied to enhance visualization.
56
-
57
- Args:
58
- values (Union[List, np.ndarray]):
59
- A list of arrays or a single array containing values for specific classes or categories.
60
- labels (Union[List, np.ndarray]):
61
- A list or an array of labels corresponding to each value set in `values`.
62
- Must be the same length as `values`.
63
- bins (int, optional):
64
- Number of bins to use in the histograms. Default is 30.
65
- title (Optional[str], optional):
66
- Title of the plot. If not provided, no title will be displayed.
67
- legend (bool, optional):
68
- Whether to display a legend. Default is True.
69
- save_path (Optional[str], optional):
70
- File path to save the plot image. If not provided, the plot will not be saved.
71
- plot_params (Optional[Dict[str, Any]], optional):
72
- Dictionary of custom plotting parameters to apply. Default is None.
73
-
74
- Raises:
75
- AssertionError:
76
- If the number of labels does not match the number of value sets.
77
-
78
- """
79
-
80
-
81
- # Apply custom plotting parameters if provided
82
- if plot_params:
83
- plt.rcParams.update(plot_params)
84
-
85
- # Ensure the number of labels matches the number of value sets
86
- assert len(values) == len(labels), "The number of value sets must match the number of labels."
87
-
88
- # Create the overlaid histogram
89
- for i, (value_set, label) in enumerate(zip(values, labels)):
90
- plt.hist(value_set, bins=bins, color=COLORS[i % len(COLORS)], edgecolor='black', alpha=0.5, label=label)
91
-
92
- # Add title to the plot if provided
93
- if title:
94
- plt.title(title)
95
-
96
- # Add legend to the plot if enabled
97
- if legend:
98
- plt.legend(loc='upper right')
99
-
100
- # Set axis labels
101
- plt.xlabel('Values')
102
- plt.ylabel('Frequency')
103
-
104
- # Save the figure if a path is specified
105
- if save_path:
106
- plt.savefig(save_path, bbox_inches='tight')
107
-
108
- # Show the plot
109
- plt.show()