mlsampler 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlsampler-0.3.0/LICENSE +6 -0
- mlsampler-0.3.0/PKG-INFO +21 -0
- mlsampler-0.3.0/README.md +8 -0
- mlsampler-0.3.0/pyproject.toml +22 -0
- mlsampler-0.3.0/setup.cfg +4 -0
- mlsampler-0.3.0/src/constraints/__init__.py +25 -0
- mlsampler-0.3.0/src/constraints/base.py +70 -0
- mlsampler-0.3.0/src/constraints/constraints.py +270 -0
- mlsampler-0.3.0/src/engine/__init__.py +10 -0
- mlsampler-0.3.0/src/engine/random.py +236 -0
- mlsampler-0.3.0/src/mlsampler.egg-info/PKG-INFO +21 -0
- mlsampler-0.3.0/src/mlsampler.egg-info/SOURCES.txt +13 -0
- mlsampler-0.3.0/src/mlsampler.egg-info/dependency_links.txt +1 -0
- mlsampler-0.3.0/src/mlsampler.egg-info/requires.txt +2 -0
- mlsampler-0.3.0/src/mlsampler.egg-info/top_level.txt +2 -0
mlsampler-0.3.0/LICENSE
ADDED
mlsampler-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mlsampler
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Flexible constrained random sampler for reverse analysis in machine learning
|
|
5
|
+
Author: Jugai O
|
|
6
|
+
License: LICENSE
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: joblib
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# RandomSampler
|
|
15
|
+
|
|
16
|
+
Flexible constrained random sampler for reverse analysis in machine learning.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install mlsampler
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mlsampler"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "Flexible constrained random sampler for reverse analysis in machine learning"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name="Jugai O" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.11"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"numpy",
|
|
16
|
+
"joblib"
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
license = { text= "LICENSE" }
|
|
20
|
+
|
|
21
|
+
[tool.setuptools.packages.find]
|
|
22
|
+
where = ["src"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from .constraints import (
|
|
2
|
+
SumConstraint,
|
|
3
|
+
SumIntConstraint,
|
|
4
|
+
MultihotConstraint,
|
|
5
|
+
RandomSelectConstraint,
|
|
6
|
+
RangeConstraint,
|
|
7
|
+
CategoriesConstraint,
|
|
8
|
+
StepConstraint,
|
|
9
|
+
StepSumConstraint,
|
|
10
|
+
FunctionConstraint
|
|
11
|
+
)
|
|
12
|
+
from .base import Constraints
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"Constraints",
|
|
16
|
+
"SumConstraint",
|
|
17
|
+
"SumIntConstraint",
|
|
18
|
+
"MultihotConstraint",
|
|
19
|
+
"RandomSelectConstraint",
|
|
20
|
+
"RangeConstraint",
|
|
21
|
+
"CategoriesConstraint",
|
|
22
|
+
"StepConstraint",
|
|
23
|
+
"StepSumConstraint",
|
|
24
|
+
"FunctionConstraint"
|
|
25
|
+
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from .. import validate as v
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class Constraints(ABC):
    """Abstract base class for constraints applied to a single sample row.

    Instances are callable: ``constraint(row, rng)`` forwards to the subclass
    hook ``_constrain`` and returns whatever that hook returns.
    """

    def __init__(self, cols: list[int]):
        """Check ``cols`` with ``validate_cols`` and keep them on the instance."""
        v.validate_cols(cols)
        self.cols = cols

    def _rng(
        self,
        rng: Optional[np.random.Generator] = None
    ) -> np.random.Generator:
        """Return ``rng`` itself, or a new ``default_rng()`` when it is None."""
        if rng is None:
            return np.random.default_rng()
        return rng

    def __call__(self, row: np.ndarray, rng: Optional[np.random.Generator] = None):
        """Apply the constraint to ``row`` by delegating to ``_constrain``."""
        outcome = self._constrain(row, rng)
        return outcome

    def __repr__(self):
        """Show the class name, followed by the columns when any are set."""
        label = f"{self.__class__.__name__}"
        if self.cols:
            return f"{label}(cols={self.cols})"
        return label

    @abstractmethod
    def _constrain(self, row: np.ndarray, rng: Optional[np.random.Generator] = None):
        """Subclass hook: receive ``row`` (and optionally ``rng``) and return the result."""
        pass
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SelectConstraint(Constraints):
    """Base class for constraints that act on a random subset of ``cols``.

    Every call draws how many columns to use (an integer between ``min_used``
    and ``max_used``, both inclusive), picks that many distinct columns from
    ``cols`` and hands them to the subclass hook ``_constrain_selected``.
    """

    def __init__(
        self,
        cols: list[int],
        min_used: int = 1,
        max_used: Optional[int] = None,
        reset_cols: bool = True,
        **kwargs
    ):
        """
        Args:
            cols: Column indices this constraint may touch.
            min_used: Smallest number of columns selected per call.
            max_used: Largest number of columns selected per call.
                Defaults to ``len(cols)``.
            reset_cols: When True, every column in ``cols`` is set to 0
                before the selection is applied.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If ``max_used`` is larger than ``len(cols)``.
        """
        # NOTE(review): extra keyword arguments are silently dropped here, so
        # a misspelled option goes unnoticed - confirm this is intended.
        if max_used is None:
            max_used = len(cols)
        v.validate_usage(min_used, max_used)

        # Columns are drawn without replacement, so a request for more columns
        # than exist can never be satisfied. Reject it up front instead of
        # failing intermittently inside ``rng.choice`` while sampling.
        if max_used > len(cols):
            raise ValueError(
                f"max_used ({max_used}) cannot exceed the number of cols ({len(cols)})"
            )

        super().__init__(cols)
        self.min_used = min_used
        self.max_used = max_used
        self.reset_cols = reset_cols

    def _reset_cols(self, row: np.ndarray, cols: list, value: float = 0):
        """Overwrite ``row[cols]`` with ``value`` in place and return ``row``."""
        row[cols] = value
        return row

    def _constrain(
        self,
        row: np.ndarray,
        rng: Optional[np.random.Generator] = None
    ) -> np.ndarray:
        """Select a random subset of ``cols`` and apply ``_constrain_selected``.

        Args:
            row: Row to modify in place.
            rng: Generator used for the draw; a new default generator is
                created when None.

        Returns:
            Whatever ``_constrain_selected`` returns for the chosen columns.
        """
        rng = self._rng(rng)
        # Subclasses read ``self.rng`` inside ``_constrain_selected``, so the
        # generator used for this call is published on the instance.
        # NOTE(review): this replaces any generator a subclass stored in its
        # constructor - confirm that is the intended precedence.
        self.rng = rng
        if self.reset_cols:
            row = self._reset_cols(row, self.cols)

        # ``integers`` excludes its upper bound, hence the ``+ 1``.
        used = rng.integers(self.min_used, self.max_used + 1)
        selected = rng.choice(self.cols, size=used, replace=False)

        return self._constrain_selected(row, selected)

    @abstractmethod
    def _constrain_selected(self, row: np.ndarray, selected: np.ndarray) -> np.ndarray:
        """Subclass hook: apply the constraint to the ``selected`` columns of ``row``."""
        pass
|
|
69
|
+
|
|
70
|
+
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from numpy.random import Generator
|
|
3
|
+
from .base import Constraints, SelectConstraint
|
|
4
|
+
from .. import validate as v
|
|
5
|
+
from ..types import Numeric, ArrayLike, Bool, ConstraintFn
|
|
6
|
+
from typing import Optional, Callable
|
|
7
|
+
from ..errors import ConstraintViolationError, ConstraintError
|
|
8
|
+
|
|
9
|
+
class MultihotConstraint(SelectConstraint):
    """Mark exactly ``n_hot`` randomly chosen columns of ``cols`` with 1."""

    def __init__(self, cols: list[int], n_hot: int = 1, **kwargs):
        """
        Args:
            cols: Candidate column indices.
            n_hot: Number of columns set to 1 on every call.
            **kwargs: Forwarded to ``SelectConstraint``.
        """
        # Equal lower and upper bounds make the base class always select
        # exactly ``n_hot`` columns.
        usage = {"min_used": n_hot, "max_used": n_hot}
        super().__init__(cols, **usage, **kwargs)
        self.n_hot = n_hot

    def _constrain_selected(
        self,
        row: np.ndarray,
        selected: np.ndarray,
        rng: Optional[np.random.Generator] = None,
    ) -> np.ndarray:
        """Write 1 into the selected columns of ``row`` and return ``row``.

        ``rng`` is accepted but not used.
        """
        row[selected] = 1
        return row
|
|
32
|
+
|
|
33
|
+
class RandomSelectConstraint(SelectConstraint):
    """Keep a random subset of ``cols`` untouched and zero the remaining ones."""

    def __init__(
        self,
        cols: list[int],
        rng: Optional[np.random.Generator] = None,
        **kwargs
    ):
        """
        Args:
            cols: Candidate column indices.
            rng: Generator stored on the instance as ``self.rng``.
            **kwargs: Forwarded to ``SelectConstraint`` (for example
                ``min_used`` / ``max_used``).
        """
        # The selected columns must keep their current values, so the base
        # class is told not to reset ``cols`` before selecting.
        super().__init__(cols, reset_cols=False, **kwargs)
        self.rng = rng

    def _constrain_selected(
        self,
        row: np.ndarray,
        selected: np.ndarray
    ) -> np.ndarray:
        """Zero every column of ``cols`` that is absent from ``selected``."""
        dropped = np.setdiff1d(self.cols, selected)
        row[dropped] = 0
        return row
|
|
52
|
+
|
|
53
|
+
class SumConstraint(SelectConstraint):
    """Fill the selected columns with Dirichlet weights scaled to ``sum_value``."""

    def __init__(self,
                 cols: list[int],
                 sum_value: Numeric = 1,
                 alpha: Optional[np.ndarray] = None,
                 rng: Optional[np.random.Generator] = None,
                 **kwargs
                 ):
        """
        Args:
            cols: Candidate column indices.
            sum_value: Total the selected columns add up to. Checked with
                ``validate_values``.
            alpha: Dirichlet concentration parameters. Either one entry per
                column in ``cols`` (matched to columns by position) or one
                entry per selected column. None means all ones.
            rng: Generator stored as ``self.rng``; a new default generator is
                created when None.
            **kwargs: Forwarded to ``SelectConstraint``.
        """
        v.validate_values(sum_value)

        super().__init__(cols, **kwargs)
        self.sum_value = sum_value
        self.alpha = alpha
        self.rng = self._rng(rng)

    def _alpha_for(self, selected: np.ndarray) -> np.ndarray:
        """Return the concentration vector matching ``selected``, in order."""
        k = len(selected)
        if self.alpha is None:
            return np.ones(k)

        alpha = np.asarray(self.alpha, dtype=float)
        if alpha.ndim == 1 and len(alpha) == len(self.cols):
            # One entry per column: pick the entries of the chosen columns so
            # each column keeps its own concentration whatever the subset.
            position = {col: i for i, col in enumerate(self.cols)}
            picked = [position[col] for col in np.asarray(selected).tolist()]
            return alpha[picked]
        if alpha.ndim == 1 and len(alpha) == k:
            return alpha
        raise ValueError(
            f"alpha has shape {alpha.shape}; expected one entry per column "
            f"({len(self.cols)}) or per selected column ({k})"
        )

    def _constrain_selected(self,
                            row: np.ndarray,
                            selected: np.ndarray
                            ) -> np.ndarray:
        """
        Args:
            row (np.ndarray): numpy array representing the row to be modified
            selected (np.ndarray): numpy array of selected column indices

        Returns:
            np.ndarray: The modified row

        Raises:
            ValueError: If ``alpha`` matches neither the number of columns nor
                the number of selected columns.
        """
        weights = self.rng.dirichlet(self._alpha_for(selected))

        # Dirichlet weights sum to 1, so scaling gives the requested total.
        row[selected] = weights * self.sum_value
        return row
|
|
92
|
+
|
|
93
|
+
class SumIntConstraint(SumConstraint):
    """Split an integer total into non-negative integer parts over the selected columns."""

    def __init__(self, cols: list[int], sum_value: int = 100, rng: Generator | None = None, **kwargs):
        """
        Args:
            cols: Candidate column indices.
            sum_value: Non-negative integer total. A float with an integral
                value (such as ``100.0``) is converted to ``int``.
            rng: Generator forwarded to ``SumConstraint``.
            **kwargs: Forwarded to ``SumConstraint``.

        Raises:
            TypeError: If ``sum_value`` is not a non-negative integer.
        """
        if isinstance(sum_value, (float, np.floating)) and float(sum_value).is_integer():
            sum_value = int(sum_value)
        # The partition below passes ``sum_value`` to ``rng.choice`` as a
        # population size, which only works for integers; reject anything
        # else here instead of failing later while sampling.
        if not isinstance(sum_value, (int, np.integer)):
            raise TypeError("sum_value must be a non-negative integer")
        if sum_value < 0:
            raise TypeError("sum_value must be a non-negative integer")
        super().__init__(cols, sum_value=int(sum_value), rng=rng, **kwargs)

    def _constrain_selected(self, row: np.ndarray, selected: np.ndarray) -> np.ndarray:
        """Write a random integer composition of ``sum_value`` into ``row[selected]``.

        Returns:
            The modified row.
        """
        k = len(selected)
        if k == 1:
            row[selected[0]] = self.sum_value
            return row

        # Stars and bars: place k - 1 distinct "bars" among sum_value + k - 1
        # slots; the counts of slots between consecutive bars are k
        # non-negative integers that add up to sum_value.
        slots = self.sum_value + k - 1
        cuts = np.sort(self.rng.choice(slots, k - 1, replace=False))
        parts = np.diff(np.concatenate(([-1], cuts, [slots]))) - 1

        row[selected] = parts
        return row
|
|
110
|
+
|
|
111
|
+
class CategoriesConstraint(Constraints):
    """Assign ``cols`` one pattern drawn at random from a list of allowed patterns."""

    def __init__(self, cols: list[int], values: list[list], strength: str = "hard", **kwargs):
        """
        Args:
            cols: Column indices the patterns are written to.
            values: Allowed patterns; each must hold one value per column.
            strength: ``"hard"`` keeps only patterns that agree with the
                row's current non-None values in ``cols``; ``"soft"`` allows
                every pattern.
            **kwargs: Forwarded to ``Constraints``.

        Raises:
            ConstraintViolationError: If ``values`` contains duplicates.
            ConstraintError: If a pattern's length differs from ``len(cols)``.
        """
        super().__init__(cols, **kwargs)
        self.strength = strength

        patterns = [tuple(pattern) for pattern in values]
        if len(set(patterns)) != len(values):
            raise ConstraintViolationError("values must be unique")

        width = len(cols)
        if any(len(pattern) != width for pattern in patterns):
            raise ConstraintError(
                f"every entry of values must contain {width} item(s), one per column"
            )

        # Fill the table cell by cell so it is always 2-D with shape
        # (n_patterns, n_cols), even when ``values`` is empty or an item is
        # itself a sequence; ``np.array(values, dtype=object)`` would not
        # guarantee that.
        table = np.empty((len(patterns), width), dtype=object)
        for i, pattern in enumerate(patterns):
            for j, item in enumerate(pattern):
                table[i, j] = item
        self.values = table

    def _constrain(
        self,
        row: np.ndarray,
        rng: Optional[np.random.Generator] = None,
    ) -> np.ndarray:
        """Write one randomly chosen admissible pattern into ``row[cols]``.

        Returns:
            The modified row.

        Raises:
            ConstraintError: If ``strength`` is neither "hard" nor "soft".
            ConstraintViolationError: If no pattern is admissible.
        """
        rng = self._rng(rng)

        current_row = row[self.cols]
        mask = np.ones(len(self.values), dtype=bool)
        if self.strength == "hard":
            # Keep only patterns that agree with every value already present.
            for i, present in enumerate(current_row):
                if present is not None:
                    mask &= (self.values[:, i] == present)
        elif self.strength != "soft":
            raise ConstraintError(f"{self.strength} was not supported")

        valid_patterns = self.values[mask]
        if len(valid_patterns) == 0:
            raise ConstraintViolationError("No patterns found in categories")

        idx = rng.integers(len(valid_patterns))
        row[self.cols] = valid_patterns[idx]
        return row
|
|
148
|
+
|
|
149
|
+
class RangeConstraint(Constraints):
    """Overwrite ``cols`` with independent uniform draws between ``low`` and ``high``."""

    def __init__(self, cols: list[int], low: float = 0, high: float = 1, **kwargs):
        """
        Args:
            cols: Column indices to fill.
            low: First bound passed to ``rng.uniform``.
            high: Second bound passed to ``rng.uniform``.
            **kwargs: Forwarded to ``Constraints``.
        """
        super().__init__(cols, **kwargs)
        self.low, self.high = low, high

    def _constrain(self, row: np.ndarray, rng: Optional[np.random.Generator] = None) -> np.ndarray:
        """Draw one value per column and store the draws in ``row[cols]``."""
        # Every target column is overwritten, so no reset is required first.
        generator = self._rng(rng)
        draws = generator.uniform(self.low, self.high, size=len(self.cols))
        row[self.cols] = draws
        return row
|
|
160
|
+
|
|
161
|
+
class StepConstraint(Constraints):
    """Restrict one column to the grid ``low, low + step, ...`` not exceeding ``high``."""

    def __init__(self, col: int, low: float, high: float, step: float, **kwargs):
        """
        Args:
            col: Index of the constrained column.
            low: First grid value.
            high: Upper bound of the grid (included when it lies on the grid).
            step: Positive spacing between grid values.
            **kwargs: Forwarded to ``Constraints``.

        Raises:
            ValueError: If ``step`` is not positive.
        """
        super().__init__(cols=[col], **kwargs)
        if not step > 0:
            raise ValueError(f"step must be positive, got {step}")

        # Count the grid points explicitly instead of calling
        # ``np.arange(low, high + step, step)``: that form can yield a value
        # above ``high`` when (high - low) is not a multiple of ``step`` or
        # when floating-point rounding adds an element. The small tolerance
        # keeps ``high`` itself when it sits on the grid up to rounding.
        n_intervals = int(np.floor((high - low) / step + 1e-9))
        self.values = low + step * np.arange(max(n_intervals + 1, 0))
        self.col = col
        self.step = step

    def _constrain(
        self,
        row: np.ndarray,
        rng: Optional[np.random.Generator] = None
    ) -> np.ndarray:
        """Snap ``row[col]`` onto the grid.

        A current value that is already close to a grid value (``np.isclose``)
        is kept; otherwise a grid value is drawn uniformly at random.

        Returns:
            The modified row.
        """
        rng = self._rng(rng)

        current = row[self.col]
        if current is not None and np.any(np.isclose(current, self.values)):
            return row

        row[self.col] = rng.choice(self.values)
        return row
|
|
181
|
+
|
|
182
|
+
class StepSumConstraint(StepConstraint):
    """Fill ``cols`` with step-aligned values that add up to ``sum_value``.

    Every column starts at its ``low``; the remaining amount is handed out one
    ``step`` at a time to randomly chosen columns that still have room below
    their ``high``.
    """

    def __init__(
        self,
        cols: list[int],
        sum_value: float,
        lows: Optional[ArrayLike] = None,
        highs: Optional[ArrayLike] = None,
        step: float = 1,
        **kwargs
    ):
        """
        Args:
            cols: Column indices to fill.
            sum_value: Required total of the filled columns.
            lows: Per-column minimum; defaults to 0 for every column.
            highs: Per-column maximum; defaults to 100 for every column.
            step: Increment used when distributing the total.
            **kwargs: Forwarded to ``StepConstraint``.

        Raises:
            ConstraintError: If ``lows`` or ``highs`` does not have one entry
                per column.
        """
        lows = lows if lows is not None else np.zeros(len(cols))
        highs = highs if highs is not None else np.ones(len(cols)) * 100

        # A length mismatch would otherwise only surface while sampling, as
        # an indexing or broadcasting error.
        if len(lows) != len(cols) or len(highs) != len(cols):
            raise ConstraintError("lows and highs must have one entry per column")

        # Initialize parent with the first column's range
        super().__init__(
            col=cols[0],
            low=lows[0],
            high=highs[0],
            step=step,
            **kwargs
        )
        # The parent stored ``[cols[0]]``; restore the full column list.
        self.cols = cols

        self.lows = np.array(lows)
        self.highs = np.array(highs)
        self.sum_value = sum_value
        self.step = step

    def _constrain(self, row: np.ndarray, rng: Optional[np.random.Generator] = None) -> np.ndarray:
        """Write a random step-aligned split of ``sum_value`` into ``row[cols]``.

        Returns:
            The modified row.

        Raises:
            ConstraintViolationError: If the lows already exceed ``sum_value``,
                the remainder is not a multiple of ``step``, or the highs
                leave no room to reach ``sum_value``.
        """
        rng = self._rng(rng)

        # 1. Initialize all target columns with their respective minimum (low) values
        current_values = self.lows.copy().astype(float)

        # 2. Calculate the difference to reach the target sum_value
        current_sum = np.sum(current_values)
        residual = self.sum_value - current_sum

        # Basic validation for feasibility
        if residual < -1e-9:
            raise ConstraintViolationError(f"Sum of lows ({current_sum}) exceeds sum_value ({self.sum_value}).")

        # Floating-point remainders can land just below ``step`` instead of
        # at 0, so both ends are accepted.
        remainder = residual % self.step
        if not np.isclose(remainder, 0) and not np.isclose(remainder, self.step):
            raise ConstraintViolationError(f"Residual ({residual}) is not a multiple of step ({self.step}).")

        # 3. Randomly distribute the residual in 'step' increments
        num_steps = int(round(residual / self.step))

        for _ in range(num_steps):
            # Indices where adding a step won't exceed the column's high limit
            eligible_indices = np.flatnonzero(
                current_values + self.step <= self.highs + 1e-9
            )

            if eligible_indices.size == 0:
                raise ConstraintViolationError("Target sum_value is unreachable within defined highs.")

            # Pick a random column and increment it
            target_idx = rng.choice(eligible_indices)
            current_values[target_idx] += self.step

        # 4. Final assignment to the row
        row[self.cols] = current_values
        return row
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class FunctionConstraint(Constraints):
    """Wrap a user-supplied function as a constraint with no fixed columns."""

    def __init__(self, fn: ConstraintFn):
        """Store ``fn``; the constraint is created with an empty column list."""
        super().__init__(cols=[])
        self.fn = fn

    def _constrain(
        self,
        row: np.ndarray,
        rng: Optional[np.random.Generator] = None
    ) -> Optional[np.ndarray]:
        """Call ``fn(row)`` and interpret what it returns.

        Returns:
            ``row`` when ``fn`` returns a truthy boolean, None when it returns
            a falsy boolean, or the array itself when ``fn`` returns a numpy
            array. ``rng`` is not used.

        Raises:
            ConstraintTypeError: If ``fn`` returns anything else.
        """
        outcome = self.fn(row)

        if isinstance(outcome, Bool):
            return row if outcome else None
        if isinstance(outcome, np.ndarray):
            return outcome

        from ..errors import ConstraintTypeError
        raise ConstraintTypeError("Constraint function must return either a boolean or a numpy array")
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from ..base import BaseSampler, SamplerConfig, DtypeMeta as dm
|
|
4
|
+
from ..constraints import *
|
|
5
|
+
from ..errors import (
|
|
6
|
+
ConstraintViolationError,
|
|
7
|
+
ConstraintTypeError,
|
|
8
|
+
DuplicateColumnWarning
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from typing import Optional, Callable
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from joblib import Parallel, delayed
|
|
14
|
+
|
|
15
|
+
import warnings
|
|
16
|
+
|
|
17
|
+
class RandomSampler(BaseSampler):
    """
    Random constraint-based sampler.

    This sampler generates samples based on feature metadata
    inferred from training data. Users can register constraints
    that are applied during sample generation.

    Parameters
    ----------
    config : SamplerConfig
        Configuration object containing feature metadata and sampling settings.

    Notes
    -----
    Two types of constraints are supported:

    - Validation constraints: return a boolean.
    - Constructive constraints: return a modified numpy array.

    Sampling is performed with retry logic up to `max_retries`.
    Parallel generation is supported via joblib.

    When ``config.random_state`` is set, the i-th row of a ``sample`` call is
    drawn from a generator seeded with ``random_state + i``, and that
    generator is also handed to the constraints.
    """

    __qualname__ = "RandomSampler"

    def __init__(self, config: SamplerConfig):
        super().__init__(config)
        self.rng = np.random.default_rng(config.random_state)
        self.features = config.features
        self.batch_size = config.batch_size
        self.n_jobs = config.n_jobs
        self.seed = config.random_state
        self.max_retries = config.max_retries

        self.dim = len(self.features)
        self.constraints: list[Constraints] = []
        # Maps the string accepted by ``set_constraints`` to its class.
        self._registry = {
            "sum": SumConstraint,
            "sumint": SumIntConstraint,
            "multihot": MultihotConstraint,
            "random": RandomSelectConstraint,
            "range": RangeConstraint,
            "categories": CategoriesConstraint,
            "step": StepConstraint,
            "stepsum": StepSumConstraint
        }

    def reset_constraints(self):
        """Clear all registered constraints."""
        self.constraints = []

    def set_constraints(
        self,
        constraint_fn: str | Callable = "sum",
        replace=False,
        **kwargs
    ):
        """
        Set a constraint to the sampler.

        Parameters
        ----------
        constraint_fn : str or callable, default="sum"
            Type of constraint. Supported types:
            - callable: user-defined function that takes a row and returns a
              boolean or a numpy array. ``kwargs`` are not used.
            - "sum": selected columns add up to `sum_value`. `cols` is required; `sum_value` and `alpha` are optional.
            - "sumint": like "sum" but with non-negative integer parts. `cols` is required; `sum_value` is optional.
            - "multihot": a fixed number of columns in a set are active. `cols` is required; `n_hot` is optional.
            - "random": keeps a random subset of columns. `cols` is required; `min_used` and `max_used` are optional.
            - "range": uniform values within a range. `cols` is required; `low` and `high` are optional.
            - "categories": picks one of the listed patterns. `cols` and `values` are required; `strength` is optional.
            - "step": restricts one column to a grid. `col`, `low`, `high` and `step` are required.
            - "stepsum": step-aligned values with a fixed total. `cols` and `sum_value` are required; `lows`, `highs` and `step` are optional.
        replace : bool, default=False
            If True, clears existing constraints before adding the new one.
        **kwargs
            Additional parameters specific to the constraint type.

        Raises
        ------
        ConstraintTypeError
            If `constraint_fn` is neither callable nor a registered name.

        Examples
        --------
        >>> from mlsampler import RandomSampler
        >>> sampler = RandomSampler.setup(X_train)
        >>> # Custom function constraint
        >>> sampler.set_constraints(lambda x: (0 < x[0] < 1) and (0 < x[1] < 1))
        >>> # Sum constraint
        >>> sampler.set_constraints("sum", sum_value=1, cols=[2, 3, 4], max_used=3)
        """

        if replace:
            self.constraints = []

        if callable(constraint_fn):
            self.constraints.append(FunctionConstraint(fn=constraint_fn))
        elif isinstance(constraint_fn, str) and constraint_fn in self._registry:
            self.constraints.append(self._registry[constraint_fn](**kwargs))
        else:
            raise ConstraintTypeError(
                f"Unsupported constraint type: {constraint_fn}"
            )

    def _base_sample(self, rng: np.random.Generator):
        """Draw an unconstrained row: one value per feature, by feature dtype.

        Binary features get 0 or 1, integer features an integer in
        ``[low, high]``, float features a uniform draw between ``low`` and
        ``high``; every other dtype is initialised to 0.
        """
        # Object dtype lets numeric and categorical values share one row.
        x = np.empty(self.dim, dtype=object)

        for i, f in enumerate(self.features):
            if f.dtype == dm.bin:
                x[i] = rng.integers(0, 2)
            elif f.dtype == dm.integer:
                # ``integers`` excludes the upper bound, hence the ``+ 1``.
                x[i] = rng.integers(int(f.low), int(f.high) + 1)
            elif f.dtype == dm.flt:
                x[i] = rng.uniform(f.low, f.high)
            else:
                x[i] = 0

        return x

    def _fill_categorical_features(self, x, rng: np.random.Generator):
        """Fill categorical features with a random category, or "unknown" when none are listed."""
        for i, f in enumerate(self.features):
            if f.dtype == dm.cat:
                if f.categories:
                    x[i] = rng.choice(f.categories)
                else:
                    x[i] = "unknown"
        return x

    def _apply_constraints(
        self,
        row: np.ndarray,
        rng: Optional[np.random.Generator] = None,
    ) -> Optional[np.ndarray]:
        """
        Apply registered constraints to a generated row.

        Parameters
        ----------
        row : np.ndarray
            The generated sample to validate/modify.
        rng : np.random.Generator, optional
            Generator forwarded to every constraint so that constrained
            values follow the sampler's seed. When None, each constraint
            falls back to its own default.

        Returns
        -------
        np.ndarray or None
            The modified row if constructive constraints are applied,
            or the original row if only validation constraints are present.
            Returns None if any validation constraint fails.
        """
        # NOTE(review): iterating ``self`` relies on the base sampler being
        # iterable over its constraints - confirm against BaseSampler.
        for constraint in self:
            row = constraint(row, rng)
            if row is None:
                return None  # Constraint violation
        return row

    def _detect_conflicts(self):
        """Warn with ``DuplicateColumnWarning`` for columns used by several constraints."""
        col_usage = defaultdict(list)

        for i, c in enumerate(self.constraints):
            for col in getattr(c, "cols", []):
                col_usage[col].append(i)

        for col, ids in col_usage.items():
            if len(ids) > 1:
                warnings.warn(
                    f"Column {col} used in multiple constraints {ids}",
                    DuplicateColumnWarning
                )

    def _generate_one(self, seed_offset: int = 0):
        """Generate one row that passes every constraint.

        Parameters
        ----------
        seed_offset : int, default=0
            Added to the configured seed so that each row gets its own
            generator. Ignored when no seed is configured.

        Raises
        ------
        ConstraintViolationError
            If no valid row is found within ``max_retries`` attempts.
        """
        seed = None if self.seed is None else self.seed + seed_offset
        rng = np.random.default_rng(seed)

        for _ in range(self.max_retries):
            x = self._base_sample(rng)
            x = self._fill_categorical_features(x, rng)
            # Pass the per-row generator on, so constraint draws are
            # reproducible too.
            x = self._apply_constraints(x, rng)
            if x is not None:
                return x

        raise ConstraintViolationError("Max retries exceeded.")

    def _generate_batch(self, batch_size, seed_offset: int = 0):
        """Generate ``batch_size`` rows using offsets ``seed_offset``, ``seed_offset + 1``, ..."""
        return np.array(
            [self._generate_one(seed_offset + i) for i in range(batch_size)]
        )

    def sample(self, n_samples: int):
        """
        Generate samples satisfying all registered constraints.

        Parameters
        ----------
        n_samples : int
            Total number of samples to generate.

        Returns
        -------
        np.ndarray
            Array of shape (n_samples, n_features).

        Raises
        ------
        ValueError
            If the configured batch size is not positive.
        """

        self._detect_conflicts()

        # A non-positive batch size would never reduce ``remaining`` and the
        # loop below would not terminate.
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        batches = []
        remaining = n_samples

        print("sampling...", flush=True)

        while remaining > 0:
            current_batch = min(self.batch_size, remaining)
            # Index of the first row of this batch. Using it as the seed
            # offset gives every row a distinct generator; without it, a
            # configured seed makes all rows start from the same state.
            start = n_samples - remaining

            if self.n_jobs == 1:
                batch = self._generate_batch(current_batch, start)
            else:
                results = Parallel(n_jobs=self.n_jobs)(
                    delayed(self._generate_one)(start + i)
                    for i in range(current_batch)
                )
                batch = np.array(results)

            batches.append(batch)

            remaining -= current_batch

            print(f"{n_samples - remaining}/{n_samples}", flush=True)

        if not batches:
            # ``np.vstack`` rejects an empty list; return an empty result.
            return np.empty((0, self.dim), dtype=object)

        return np.vstack(batches)
|
|
236
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mlsampler
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Flexible constrained random sampler for reverse analysis in machine learning
|
|
5
|
+
Author: Jugai O
|
|
6
|
+
License: LICENSE
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: numpy
|
|
11
|
+
Requires-Dist: joblib
|
|
12
|
+
Dynamic: license-file
|
|
13
|
+
|
|
14
|
+
# RandomSampler
|
|
15
|
+
|
|
16
|
+
Flexible constrained random sampler for reverse analysis in machine learning.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install mlsampler
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/constraints/__init__.py
|
|
5
|
+
src/constraints/base.py
|
|
6
|
+
src/constraints/constraints.py
|
|
7
|
+
src/engine/__init__.py
|
|
8
|
+
src/engine/random.py
|
|
9
|
+
src/mlsampler.egg-info/PKG-INFO
|
|
10
|
+
src/mlsampler.egg-info/SOURCES.txt
|
|
11
|
+
src/mlsampler.egg-info/dependency_links.txt
|
|
12
|
+
src/mlsampler.egg-info/requires.txt
|
|
13
|
+
src/mlsampler.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|