mlquantify 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlquantify/__init__.py +0 -29
- mlquantify/adjust_counting/__init__.py +14 -0
- mlquantify/adjust_counting/_adjustment.py +365 -0
- mlquantify/adjust_counting/_base.py +247 -0
- mlquantify/adjust_counting/_counting.py +145 -0
- mlquantify/adjust_counting/_utils.py +114 -0
- mlquantify/base.py +117 -519
- mlquantify/base_aggregative.py +209 -0
- mlquantify/calibration.py +1 -0
- mlquantify/confidence.py +335 -0
- mlquantify/likelihood/__init__.py +5 -0
- mlquantify/likelihood/_base.py +161 -0
- mlquantify/likelihood/_classes.py +414 -0
- mlquantify/meta/__init__.py +1 -0
- mlquantify/meta/_classes.py +761 -0
- mlquantify/metrics/__init__.py +21 -0
- mlquantify/metrics/_oq.py +109 -0
- mlquantify/metrics/_rq.py +98 -0
- mlquantify/{evaluation/measures.py → metrics/_slq.py} +43 -28
- mlquantify/mixture/__init__.py +7 -0
- mlquantify/mixture/_base.py +153 -0
- mlquantify/mixture/_classes.py +400 -0
- mlquantify/mixture/_utils.py +112 -0
- mlquantify/model_selection/__init__.py +9 -0
- mlquantify/model_selection/_protocol.py +358 -0
- mlquantify/model_selection/_search.py +315 -0
- mlquantify/model_selection/_split.py +1 -0
- mlquantify/multiclass.py +350 -0
- mlquantify/neighbors/__init__.py +9 -0
- mlquantify/neighbors/_base.py +198 -0
- mlquantify/neighbors/_classes.py +159 -0
- mlquantify/{classification/methods.py → neighbors/_classification.py} +48 -66
- mlquantify/neighbors/_kde.py +270 -0
- mlquantify/neighbors/_utils.py +135 -0
- mlquantify/neural/__init__.py +1 -0
- mlquantify/utils/__init__.py +47 -2
- mlquantify/utils/_artificial.py +27 -0
- mlquantify/utils/_constraints.py +219 -0
- mlquantify/utils/_context.py +21 -0
- mlquantify/utils/_decorators.py +36 -0
- mlquantify/utils/_exceptions.py +12 -0
- mlquantify/utils/_get_scores.py +159 -0
- mlquantify/utils/_load.py +18 -0
- mlquantify/utils/_parallel.py +6 -0
- mlquantify/utils/_random.py +36 -0
- mlquantify/utils/_sampling.py +273 -0
- mlquantify/utils/_tags.py +44 -0
- mlquantify/utils/_validation.py +447 -0
- mlquantify/utils/prevalence.py +61 -0
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/METADATA +2 -1
- mlquantify-0.1.9.dist-info/RECORD +53 -0
- mlquantify/classification/__init__.py +0 -1
- mlquantify/evaluation/__init__.py +0 -14
- mlquantify/evaluation/protocol.py +0 -291
- mlquantify/methods/__init__.py +0 -37
- mlquantify/methods/aggregative.py +0 -1159
- mlquantify/methods/meta.py +0 -472
- mlquantify/methods/mixture_models.py +0 -1003
- mlquantify/methods/non_aggregative.py +0 -136
- mlquantify/methods/threshold_optimization.py +0 -869
- mlquantify/model_selection.py +0 -377
- mlquantify/plots.py +0 -367
- mlquantify/utils/general.py +0 -371
- mlquantify/utils/method.py +0 -449
- mlquantify-0.1.7.dist-info/RECORD +0 -22
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/WHEEL +0 -0
- {mlquantify-0.1.7.dist-info → mlquantify-0.1.9.dist-info}/top_level.txt +0 -0

mlquantify/utils/_sampling.py
@@ -0,0 +1,273 @@
import numpy as np


def get_indexes_with_prevalence(y, prevalence: list, sample_size: int):
    """
    Get indexes for a stratified sample based on the prevalence of each class.

    Parameters
    ----------
    y : np.ndarray
        Array of class labels.
    prevalence : list
        List of prevalences, one per class.
    sample_size : int
        Number of samples to generate.

    Returns
    -------
    list
        List of indexes for the stratified sample.
    """
    classes = np.unique(y)

    # Ensure the prevalences sum to 1
    assert np.isclose(sum(prevalence), 1), "The sum of prevalences must be 1"
    # Ensure the number of prevalences matches the number of classes
    assert len(prevalence) == len(classes), "The number of prevalences must match the number of classes"

    sampled_indexes = []
    total_sampled = 0

    for i, class_ in enumerate(classes):

        # Assign the rounding remainder to the last class so the total is exactly sample_size
        if i == len(classes) - 1:
            num_samples = sample_size - total_sampled
        else:
            num_samples = int(sample_size * prevalence[i])

        # Get the indexes of the current class
        class_indexes = np.where(y == class_)[0]

        # Sample the indexes for the current class (with replacement)
        sampled_class_indexes = np.random.choice(class_indexes, size=num_samples, replace=True)

        sampled_indexes.extend(sampled_class_indexes)
        total_sampled += num_samples

    np.random.shuffle(sampled_indexes)  # Shuffle after collecting all indexes

    return sampled_indexes
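
For example, the following sketch (toy labels chosen here for illustration, not part of the package) draws a fixed-prevalence sample from a binary label array:

import numpy as np

y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])  # hypothetical toy labels

# Draw 20 indexes in which 75% point at class 0 and 25% at class 1
idx = get_indexes_with_prevalence(y, prevalence=[0.75, 0.25], sample_size=20)

print(len(idx))              # 20
print(np.mean(y[idx] == 0))  # 0.75 exactly (15 of the 20 indexes point at class 0)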


def simplex_uniform_kraemer(n_dim: int,
                            n_prev: int,
                            n_iter: int,
                            min_val: float = 0.0,
                            max_val: float = 1.0,
                            max_tries: int = 1000) -> np.ndarray:
    """
    Generates n_prev prevalence vectors of n_dim classes uniformly
    distributed on the simplex, with optional lower and upper bounds.

    Based on Kraemer's algorithm for uniform sampling on a simplex.

    Parameters
    ----------
    n_dim : int
        Number of dimensions (classes).
    n_prev : int
        Number of prevalence vectors to generate.
    n_iter : int
        Number of times the set of prevalence vectors is repeated.
    min_val : float, optional
        Minimum allowed prevalence for each class (default=0.0).
    max_val : float, optional
        Maximum allowed prevalence for each class (default=1.0).
    max_tries : int, optional
        Maximum number of sampling iterations to reach the target n_prev.

    Returns
    -------
    np.ndarray
        Array of shape (n_prev * n_iter, n_dim) with valid prevalence vectors.
    """
    if n_dim < 2:
        raise ValueError("n_dim must be >= 2.")
    if not (0 <= min_val < 1) or not (0 < max_val <= 1):
        raise ValueError("min_val and max_val must be between 0 and 1.")
    if min_val * n_dim > 1 or max_val * n_dim < 1:
        raise ValueError("Invalid bounds: they make it impossible to sum to 1.")

    effective_simplex_size = 1 - n_dim * min_val
    prevs = []
    n_valid = 0

    # Sample in batches until n_prev valid vectors have been collected
    tries = 0
    batch_size = max(n_prev, 1000)  # Generate in large batches for efficiency

    while n_valid < n_prev and tries < max_tries:
        tries += 1

        # Kraemer's construction: pad sorted uniforms with 0 and 1; the
        # consecutive differences are uniformly distributed on the simplex
        u = np.random.uniform(0, 1, (batch_size, n_dim - 1))
        u.sort(axis=1)
        simplex = np.diff(np.concatenate([np.zeros((batch_size, 1)), u, np.ones((batch_size, 1))], axis=1), axis=1)

        # Scale into [min_val, max_val]
        scaled = min_val + simplex * effective_simplex_size

        # Normalize to guarantee each row sums to 1
        scaled /= scaled.sum(axis=1, keepdims=True)

        # Keep only the vectors that respect the bounds
        mask = np.all((scaled >= min_val) & (scaled <= max_val), axis=1)
        valid = scaled[mask]

        if valid.size > 0:
            prevs.append(valid)
            n_valid += len(valid)

    if not prevs:
        raise RuntimeError("No valid prevalences found with given constraints. Try adjusting min_val/max_val.")

    result = np.vstack(prevs)[:n_prev]

    # Repeat the whole set of vectors n_iter times if requested
    if n_iter > 1:
        result = np.tile(result, (n_iter, 1))

    return result
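
A quick sanity check of the generator (a sketch; the expected shapes follow the docstring above):

prevs = simplex_uniform_kraemer(n_dim=3, n_prev=5, n_iter=2, min_val=0.1, max_val=0.8)

print(prevs.shape)                        # (10, 3): the 5 vectors repeated twice
print(np.allclose(prevs.sum(axis=1), 1))  # True: every row sums to 1
print(bool(((prevs >= 0.1) & (prevs <= 0.8)).all()))  # True: bounds are respected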


def simplex_grid_sampling(
    n_dim: int,
    n_prev: int,
    n_iter: int,
    min_val: float,
    max_val: float
) -> np.ndarray:
    """
    Efficiently generates artificial prevalence vectors that sum to 1
    and respect min_val ≤ p_i ≤ max_val for all i.

    Parameters
    ----------
    n_dim : int
        Number of dimensions (classes).
    n_prev : int
        Number of prevalence points per dimension (grid density).
    n_iter : int
        Number of repetitions.
    min_val : float
        Minimum allowed value for each prevalence component.
    max_val : float
        Maximum allowed value for each prevalence component.

    Returns
    -------
    np.ndarray
        Array of shape (n_samples, n_dim) with all valid prevalence vectors.
    """
    if n_dim < 2:
        raise ValueError("n_dim must be at least 2.")
    if not (0 <= min_val < max_val <= 1):
        raise ValueError("min_val and max_val must satisfy 0 <= min_val < max_val <= 1.")
    if min_val * n_dim > 1 or max_val * n_dim < 1:
        raise ValueError("Impossible combination of min_val, max_val, and n_dim — cannot sum to 1.")

    # Range of candidate values for each dimension (except the last)
    s = np.linspace(min_val, max_val, n_prev)
    grids = np.stack(np.meshgrid(*([s] * (n_dim - 1)), indexing="ij"), axis=-1)
    grid_flat = grids.reshape(-1, n_dim - 1)

    # Compute the last component so that each row sums to 1
    last_col = 1.0 - np.sum(grid_flat, axis=1)
    prevs = np.hstack([grid_flat, last_col[:, None]])

    # Validity filter: keep only rows whose components all lie within the bounds
    mask = np.all((prevs >= min_val) & (prevs <= max_val), axis=1)
    prevs = prevs[mask]

    # Repeat the grid n_iter times if requested
    if n_iter > 1:
        prevs = np.tile(prevs, (n_iter, 1))

    return prevs
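
For instance, with two classes and a grid of 11 points (a sketch; note that the grid enumerates n_prev ** (n_dim - 1) candidate points, so the density should be kept small when there are many classes):

grid = simplex_grid_sampling(n_dim=2, n_prev=11, n_iter=1, min_val=0.0, max_val=1.0)

print(grid.shape)  # (11, 2)
print(grid[:3])    # [[0.  1. ]
                   #  [0.1 0.9]
                   #  [0.2 0.8]]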


def simplex_uniform_sampling(
    n_dim: int,
    n_prev: int,
    n_iter: int,
    min_val: float,
    max_val: float
) -> np.ndarray:
    """
    Generates uniformly distributed prevalence vectors within the simplex,
    constrained by min_val ≤ p_i ≤ max_val.

    Parameters
    ----------
    n_dim : int
        Number of dimensions.
    n_prev : int
        Number of prevalence samples to generate.
    n_iter : int
        Number of repetitions.
    min_val : float
        Minimum allowed value for each prevalence component.
    max_val : float
        Maximum allowed value for each prevalence component.

    Returns
    -------
    np.ndarray
        Array of shape (n_prev * n_iter, n_dim) with uniformly distributed prevalences.
    """
    if min_val * n_dim > 1 or max_val * n_dim < 1:
        raise ValueError("Invalid min_val/max_val for simplex constraints.")

    total_samples = n_prev * n_iter
    samples = []
    n_valid = 0

    # Rejection sampling: draw candidates until enough valid vectors are collected
    while n_valid < total_samples:
        # Generate candidates via a symmetric Dirichlet (uniform on the simplex)
        x = np.random.dirichlet(np.ones(n_dim), size=total_samples * 2)
        # Keep only the candidates that respect the bounds
        mask = np.all((x >= min_val) & (x <= max_val), axis=1)
        valid = x[mask]
        if len(valid) > 0:
            samples.append(valid)
            n_valid += len(valid)

    return np.concatenate(samples, axis=0)[:total_samples]
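
Usage follows the same pattern as the other samplers (a sketch; note that very tight bounds raise the rejection rate and hence the sampling time):

prevs = simplex_uniform_sampling(n_dim=4, n_prev=100, n_iter=1, min_val=0.05, max_val=0.9)

print(prevs.shape)                        # (100, 4)
print(np.allclose(prevs.sum(axis=1), 1))  # True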


def bootstrap_sample_indices(
    n_samples: int,
    batch_size: int,
    n_bootstraps: int,
    random_state: int | None = None
):
    """
    Generate bootstrap sample indices for a dataset.

    Parameters
    ----------
    n_samples : int
        Total number of samples in the dataset.
    batch_size : int
        Number of samples in each bootstrap sample.
    n_bootstraps : int
        Number of bootstrap samples to generate.
    random_state : int, optional
        Random seed for reproducibility.

    Yields
    ------
    np.ndarray
        Array containing indices for a bootstrap sample.
    """
    if random_state is not None:
        np.random.seed(random_state)

    # Each bootstrap draws batch_size indices with replacement from range(n_samples)
    for _ in range(n_bootstraps):
        indices = np.random.choice(n_samples, size=batch_size, replace=True)
        yield indices
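
For example (a sketch; the generator yields one index array per bootstrap):

for indices in bootstrap_sample_indices(n_samples=1000, batch_size=200,
                                        n_bootstraps=3, random_state=42):
    print(indices.shape)  # (200,): 200 indices drawn with replacement from range(1000)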

mlquantify/utils/_tags.py
@@ -0,0 +1,44 @@
from dataclasses import dataclass


@dataclass
class TargetInputTags:
    """Tags describing the accepted target (y) input."""

    one_d: bool = True
    two_d: bool = False
    continuous: bool = False
    categorical: bool = True
    multi_class: bool = True
    required: bool = False


@dataclass
class PredictionRequirements:
    """Flags for the predictions and labels a quantifier requires."""

    requires_train_proba: bool = True
    requires_train_labels: bool = True
    requires_test_predictions: bool = True


@dataclass
class Tags:
    """Container for the tags describing a quantifier's behavior."""

    estimation_type: str | None
    estimator_function: str | None
    estimator_type: str | None
    aggregation_type: str | None
    target_input_tags: TargetInputTags
    prediction_requirements: PredictionRequirements
    has_estimator: bool = False
    requires_fit: bool = True


def get_tags(quantifier):
    """Return the Tags object exposed by a quantifier via __mlquantify_tags__."""
    try:
        tags = quantifier.__mlquantify_tags__()
    except AttributeError as ext:
        if "has no attribute '__mlquantify_tags__'" in str(ext):
            raise AttributeError(
                "Quantifier is missing __mlquantify_tags__ method, "
                "ensure your class is inheriting from BaseQuantifier."
            ) from ext
        raise
    return tags