synthbench 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
synthbench/__init__.py ADDED
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ from synthbench._version import __version__
4
+ from synthbench.corruptors import (
5
+ BaseCorruptor,
6
+ BaseLabelCorruptor,
7
+ CategoricalCorruptor,
8
+ CollinearityCorruptor,
9
+ LabelNoiseCorruptor,
10
+ MeasurementNoiseCorruptor,
11
+ MissingDataCorruptor,
12
+ OutlierCorruptor,
13
+ )
14
+ from synthbench.dgps import (
15
+ AdditiveDGP,
16
+ BaseDGP,
17
+ FriedmanDGP,
18
+ GeometricDGP,
19
+ LinearDGP,
20
+ PolynomialDGP,
21
+ SparseDGP,
22
+ TreeDGP,
23
+ )
24
+ from synthbench.pipeline import BenchPipeline, BenchResult
25
+ from synthbench.suite import BenchSuite
26
+ from synthbench.sweeps import difficulty_sweep, experiment_grid, severity_sweep
27
+
28
__all__ = [
    "__version__",
    "AdditiveDGP",
    "BaseDGP",
    "BaseCorruptor",
    "BaseLabelCorruptor",
    "BenchPipeline",
    "BenchResult",
    "BenchSuite",
    "CategoricalCorruptor",
    "CollinearityCorruptor",
    "difficulty_sweep",
    "experiment_grid",
    "FriedmanDGP",
    "GeometricDGP",
    "LabelNoiseCorruptor",
    "LinearDGP",
    "MeasurementNoiseCorruptor",
    "MissingDataCorruptor",
    "OutlierCorruptor",
    "PolynomialDGP",
    "severity_sweep",
    "SparseDGP",
    "TreeDGP",
    # RandomNeuralDGP needs the synthbench[neural] extra (torch). It is
    # resolved lazily through the module-level __getattr__ defined below,
    # so a plain "import synthbench" never pulls torch into sys.modules.
    "RandomNeuralDGP",
]
57
+
58
# Names that are only available with the neural extra (torch).
_NEURAL_NAMES = {"RandomNeuralDGP"}


def __getattr__(name: str):
    """Lazy loader for optional neural symbols (requires synthbench[neural]).

    Implements a PEP 562 module-level ``__getattr__``: the torch-backed names
    listed in ``_NEURAL_NAMES`` are imported only on first attribute access,
    so importing synthbench never unconditionally loads torch.

    Raises
    ------
    ImportError
        If *name* is a neural symbol but torch is not installed.
    AttributeError
        If *name* is not an attribute of this module.
    """
    if name in _NEURAL_NAMES:
        try:
            from synthbench.dgps import neural as _neural
        except ImportError as exc:
            raise ImportError(
                f"{name} requires PyTorch. "
                "Install it with: pip install synthbench[neural]"
            ) from exc
        # Look the symbol up by name rather than hard-coding RandomNeuralDGP:
        # the previous implementation always imported and returned
        # RandomNeuralDGP, so any second entry in _NEURAL_NAMES would have
        # silently resolved to the wrong object.
        attr = getattr(_neural, name)
        # Cache in the module globals so subsequent lookups are O(1) and skip
        # __getattr__ entirely (no "import synthbench" self-import needed).
        globals()[name] = attr
        return attr
    raise AttributeError(f"module 'synthbench' has no attribute {name!r}")
synthbench/_seed.py ADDED
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ from numpy.random import SeedSequence
5
+
6
+
7
def derive_seeds(master_state: int | SeedSequence, n: int) -> list[int]:
    """Derive *n* independent integer seeds from a master state.

    Child seeds come from ``numpy.random.SeedSequence.spawn``, which
    guarantees statistical independence and full reproducibility from the
    master state. The global numpy RNG state is never touched.

    Parameters
    ----------
    master_state:
        A plain integer used to seed the root ``SeedSequence``, or an
        already-built ``SeedSequence`` (e.g. handed down by a parent context).
    n:
        How many independent child seeds to derive.

    Returns
    -------
    list[int]
        Plain Python ints (JSON-serializable, not numpy dtypes).
    """
    root = (
        master_state
        if isinstance(master_state, SeedSequence)
        else SeedSequence(int(master_state))
    )
    # Each child contributes a single uint64 via generate_state(1); cast to a
    # built-in int so the result is JSON-serializable.
    return [int(child.generate_state(1)[0]) for child in root.spawn(n)]
35
+
36
+
37
def make_rng(seed: int) -> np.random.RandomState:
    """Build a fresh, locally scoped ``numpy.random.RandomState`` from *seed*.

    For public APIs that accept ``int | None | RandomState``, prefer
    ``sklearn.utils.check_random_state`` instead.
    """
    rng = np.random.RandomState(seed)
    return rng
synthbench/_version.py ADDED
@@ -0,0 +1,8 @@
1
from __future__ import annotations

# Resolve the installed distribution's version at import time; fall back to a
# dev sentinel when synthbench is not installed (e.g. running from a checkout).
try:
    import importlib.metadata as _metadata

    __version__ = _metadata.version("synthbench")
except _metadata.PackageNotFoundError:
    __version__ = "0.0.0+dev"
@@ -0,0 +1,23 @@
1
from __future__ import annotations

from synthbench.corruptors.base import BaseCorruptor
from synthbench.corruptors.categorical import CategoricalCorruptor
from synthbench.corruptors.collinearity import CollinearityCorruptor
from synthbench.corruptors.label_base import BaseLabelCorruptor
from synthbench.corruptors.label_noise import LabelNoiseCorruptor
from synthbench.corruptors.measurement_noise import MeasurementNoiseCorruptor
from synthbench.corruptors.missing_data import MissingDataCorruptor
from synthbench.corruptors.outlier import OutlierCorruptor

# Public corruptor API, alphabetically ordered.
__all__ = [
    "BaseCorruptor",
    "BaseLabelCorruptor",
    "CategoricalCorruptor",
    "CollinearityCorruptor",
    "LabelNoiseCorruptor",
    "MeasurementNoiseCorruptor",
    "MissingDataCorruptor",
    "OutlierCorruptor",
]
@@ -0,0 +1,78 @@
1
"""Feature-space corruptor interface and its auto-registration registry."""

from __future__ import annotations

from abc import ABC, abstractmethod

import numpy as np

# key -> corruptor class; populated by BaseCorruptor.__init_subclass__.
_CORRUPTOR_REGISTRY: dict[str, type] = {}


class BaseCorruptor(ABC):
    """Abstract base class for all corruptors.

    Concrete subclasses must:
    - Declare ``key="some_key"`` in their class signature to be auto-registered.
    - Implement [corrupt][synthbench.corruptors.base.BaseCorruptor.corrupt].

    Corruptors transform X only; y is never passed in or mutated.

    Example::

        class CollinearityCorruptor(BaseCorruptor, key="collinearity"):
            def corrupt(self, X, metadata, random_state):
                ...
    """

    def __init_subclass__(cls, key: str | None = None, **kwargs: object) -> None:
        super().__init_subclass__(**kwargs)
        if key is None:
            # Subclasses without a key (e.g. intermediate bases) skip the
            # registry entirely.
            return
        registered = _CORRUPTOR_REGISTRY.get(key)
        if registered is not None:
            raise ValueError(
                f"Duplicate corruptor key '{key}': already registered by "
                f"{registered.__qualname__}"
            )
        _CORRUPTOR_REGISTRY[key] = cls

    @abstractmethod
    def corrupt(
        self,
        X: np.ndarray,
        metadata: dict,
        random_state: int,
    ) -> tuple[np.ndarray, dict]:
        """Apply a structural transformation to X.

        Parameters
        ----------
        X:
            Feature matrix of shape (n_samples, n_features). Must not be
            modified in-place; return a new array.
        metadata:
            The BenchResult metadata dict produced by the DGP. Corruptors
            may add keys (e.g. effective_feature_importances) but must not
            remove or overwrite existing keys set by the DGP.
        random_state:
            Integer seed. Each corruptor derives its own RNG from this value
            so that results are fully reproducible.

        Returns
        -------
        X_corrupted : np.ndarray
            Transformed feature matrix, same shape as X.
        updated_metadata : dict
            Metadata dict with any corruptor-specific fields added.
        """

    def get_params(self) -> dict[str, object]:
        """Return the corruptor's current configuration as a plain dict.

        Concrete corruptors should override this to return their __init__
        parameters. Used by BenchPipeline to record component provenance.
        """
        raise NotImplementedError(
            f"{type(self).__qualname__}.get_params() is not implemented. "
            "Concrete corruptor subclasses must override this method."
        )


__all__ = ["BaseCorruptor", "_CORRUPTOR_REGISTRY"]
@@ -0,0 +1,154 @@
1
+ """CategoricalCorruptor -- bins continuous features into integer categories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+ from synthbench.corruptors.base import BaseCorruptor
8
+
9
+ _SEVERITY_N_BINS: dict[str, int] = {
10
+ "low": 10,
11
+ "medium": 5,
12
+ "high": 2,
13
+ }
14
+
15
+
16
+ class CategoricalCorruptor(BaseCorruptor, key="categorical"):
17
+ """Converts continuous features to integer-encoded bins.
18
+
19
+ For each targeted column, quantile-based bin edges are computed and
20
+ ``np.digitize`` is used to assign each sample to a bin index
21
+ ``0, 1, ..., n_bins-1``. The column values are replaced with these
22
+ integer indices cast to ``float``.
23
+
24
+ Feature importances are discounted by the factor ``(1 - 1/n_bins)``
25
+ for each targeted column, then re-normalized to sum to 1.
26
+
27
+ Parameters
28
+ ----------
29
+ severity:
30
+ Controls default ``n_bins`` when ``n_bins`` is not provided:
31
+ ``"low"`` -> 10, ``"medium"`` -> 5, ``"high"`` -> 2.
32
+ n_bins:
33
+ If provided, overrides the severity-derived number of bins.
34
+ columns:
35
+ Indices of columns to target. ``None`` targets all columns.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ severity: str = "medium",
41
+ n_bins: int | None = None,
42
+ columns: list[int] | None = None,
43
+ ) -> None:
44
+ if severity not in _SEVERITY_N_BINS:
45
+ raise ValueError(
46
+ f"Invalid severity '{severity}'. "
47
+ f"Must be one of {list(_SEVERITY_N_BINS)}."
48
+ )
49
+ self.severity = severity
50
+ self.n_bins = n_bins
51
+ self.columns = columns
52
+
53
+ # ------------------------------------------------------------------
54
+ # Private helpers
55
+ # ------------------------------------------------------------------
56
+
57
+ def _effective_n_bins(self) -> int:
58
+ """Return n_bins, falling back to severity default."""
59
+ if self.n_bins is not None:
60
+ return int(self.n_bins)
61
+ return _SEVERITY_N_BINS[self.severity]
62
+
63
+ @staticmethod
64
+ def _bin_column(col_values: np.ndarray, n_bins: int) -> np.ndarray:
65
+ """Quantile-bin a 1-D array into integer indices 0 .. n_bins-1."""
66
+ edges = np.quantile(col_values, np.linspace(0.0, 1.0, n_bins + 1))
67
+ # Handle constant columns: all edges are identical -> all go to bin 0
68
+ if np.all(edges == edges[0]):
69
+ return np.zeros(len(col_values), dtype=float)
70
+ # np.digitize with edges[1:-1] gives indices 0 to n_bins-1
71
+ bins = np.digitize(col_values, edges[1:-1])
72
+ return bins.astype(float)
73
+
74
+ # ------------------------------------------------------------------
75
+ # Public API
76
+ # ------------------------------------------------------------------
77
+
78
+ def corrupt(
79
+ self,
80
+ X: np.ndarray,
81
+ metadata: dict,
82
+ random_state: int,
83
+ ) -> tuple[np.ndarray, dict]:
84
+ """Bin targeted columns and update importances.
85
+
86
+ Parameters
87
+ ----------
88
+ X:
89
+ Feature matrix of shape ``(n_samples, n_features)``.
90
+ metadata:
91
+ Metadata dict from the DGP. Must contain either
92
+ ``effective_feature_importances`` or
93
+ ``signal_feature_importances``.
94
+ random_state:
95
+ Integer seed. Binning is fully deterministic given X, so
96
+ ``random_state`` does not affect output; it is accepted for
97
+ interface consistency.
98
+
99
+ Returns
100
+ -------
101
+ X_corrupted:
102
+ Same shape as X; targeted columns contain integer-valued floats.
103
+ updated_metadata:
104
+ Metadata with updated ``effective_feature_importances``.
105
+ """
106
+ # random_state accepted for contract consistency; binning is deterministic
107
+ _ = random_state
108
+
109
+ n_features = X.shape[1]
110
+ n_bins = self._effective_n_bins()
111
+
112
+ # Determine targeted columns
113
+ targeted = (
114
+ list(range(n_features)) if self.columns is None else list(self.columns)
115
+ )
116
+
117
+ # Copy X -- do not modify in-place
118
+ X_out = X.copy()
119
+ for col in targeted:
120
+ X_out[:, col] = self._bin_column(X[:, col], n_bins)
121
+
122
+ # Retrieve effective importances (fall back to signal importances)
123
+ eff = metadata.get("effective_feature_importances")
124
+ if eff is None:
125
+ eff = metadata.get("signal_feature_importances", {})
126
+ importances: dict[str, float] = dict(eff)
127
+
128
+ # Discount targeted column importances
129
+ discount = 1.0 - 1.0 / n_bins
130
+ for col in targeted:
131
+ key = f"feature_{col}"
132
+ importances[key] = importances.get(key, 0.0) * discount
133
+
134
+ # Re-normalize to sum to 1.0
135
+ total = sum(importances.values())
136
+ if total > 0:
137
+ importances = {k: v / total for k, v in importances.items()}
138
+
139
+ # Build output metadata (no in-place mutation)
140
+ meta_out = dict(metadata)
141
+ meta_out["effective_feature_importances"] = importances
142
+
143
+ return X_out, meta_out
144
+
145
+ def get_params(self) -> dict[str, object]:
146
+ """Return constructor parameters as a plain dict."""
147
+ return {
148
+ "severity": self.severity,
149
+ "n_bins": self.n_bins,
150
+ "columns": self.columns,
151
+ }
152
+
153
+
154
+ __all__ = ["CategoricalCorruptor"]
@@ -0,0 +1,174 @@
1
+ """CollinearityCorruptor -- appends correlated proxy columns to X."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+ from synthbench._seed import make_rng
8
+ from synthbench.corruptors.base import BaseCorruptor
9
+
10
+ _SEVERITY_NOISE_STD: dict[str, float] = {
11
+ "low": 0.05,
12
+ "medium": 0.3,
13
+ "high": 0.8,
14
+ }
15
+
16
+
17
+ class CollinearityCorruptor(BaseCorruptor, key="collinearity"):
18
+ """Appends proxy columns that are noisy copies of targeted features.
19
+
20
+ For each targeted column ``c``, a proxy column is generated as::
21
+
22
+ proxy = X[:, c] * scale + N(0, noise_std, n_samples)
23
+
24
+ The proxy is appended at the end of X, expanding the feature matrix
25
+ from ``(n_samples, n_features)`` to
26
+ ``(n_samples, n_features + n_targeted)``.
27
+
28
+ Feature importances are split between the original and proxy columns
29
+ using the coefficient of determination (r^2) of the proxy relative to
30
+ the original signal.
31
+
32
+ Parameters
33
+ ----------
34
+ severity:
35
+ Controls default ``noise_std`` when ``noise_std`` is not provided:
36
+ ``"low"`` -> 0.05, ``"medium"`` -> 0.3, ``"high"`` -> 0.8.
37
+ noise_std:
38
+ If provided, overrides the severity-derived noise standard deviation.
39
+ scale:
40
+ Multiplicative factor applied to the source column when generating
41
+ the proxy. Defaults to 1.0.
42
+ columns:
43
+ Indices of columns to target. ``None`` targets all columns.
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ severity: str = "medium",
49
+ noise_std: float | None = None,
50
+ scale: float = 1.0,
51
+ columns: list[int] | None = None,
52
+ ) -> None:
53
+ if severity not in _SEVERITY_NOISE_STD:
54
+ raise ValueError(
55
+ f"Invalid severity '{severity}'. "
56
+ f"Must be one of {list(_SEVERITY_NOISE_STD)}."
57
+ )
58
+ self.severity = severity
59
+ self.noise_std = noise_std
60
+ self.scale = scale
61
+ self.columns = columns
62
+
63
+ # ------------------------------------------------------------------
64
+ # Private helpers
65
+ # ------------------------------------------------------------------
66
+
67
+ def _effective_noise_std(self) -> float:
68
+ """Return noise_std, falling back to severity default."""
69
+ if self.noise_std is not None:
70
+ return float(self.noise_std)
71
+ return _SEVERITY_NOISE_STD[self.severity]
72
+
73
+ # ------------------------------------------------------------------
74
+ # Public API
75
+ # ------------------------------------------------------------------
76
+
77
+ def corrupt(
78
+ self,
79
+ X: np.ndarray,
80
+ metadata: dict,
81
+ random_state: int,
82
+ ) -> tuple[np.ndarray, dict]:
83
+ """Append proxy columns and update importances.
84
+
85
+ Parameters
86
+ ----------
87
+ X:
88
+ Feature matrix of shape ``(n_samples, n_features)``.
89
+ metadata:
90
+ Metadata dict from the DGP. Must contain either
91
+ ``effective_feature_importances`` or
92
+ ``signal_feature_importances``.
93
+ random_state:
94
+ Integer seed for reproducibility.
95
+
96
+ Returns
97
+ -------
98
+ X_corrupted:
99
+ Shape ``(n_samples, n_features + n_targeted)``.
100
+ updated_metadata:
101
+ Metadata with updated ``effective_feature_importances`` and
102
+ ``proxy_source_map``.
103
+ """
104
+ rng = make_rng(random_state)
105
+ n_samples, n_features = X.shape
106
+ noise_std = self._effective_noise_std()
107
+
108
+ # Determine targeted columns
109
+ targeted = (
110
+ list(range(n_features)) if self.columns is None else list(self.columns)
111
+ )
112
+
113
+ # Build proxy columns
114
+ proxy_cols: list[np.ndarray] = []
115
+ for col in targeted:
116
+ signal = X[:, col] * self.scale
117
+ noise = rng.normal(0.0, noise_std, size=n_samples)
118
+ proxy_cols.append(signal + noise)
119
+
120
+ # Append proxies
121
+ if proxy_cols:
122
+ X_out = np.hstack([X, np.column_stack(proxy_cols)])
123
+ else:
124
+ X_out = X.copy()
125
+
126
+ # Build proxy_source_map
127
+ proxy_source_map: dict[str, str] = {}
128
+ for proxy_idx, source_col in enumerate(targeted):
129
+ proxy_key = f"feature_{n_features + proxy_idx}"
130
+ source_key = f"feature_{source_col}"
131
+ proxy_source_map[proxy_key] = source_key
132
+
133
+ # Retrieve effective importances (fall back to signal importances)
134
+ eff = metadata.get("effective_feature_importances")
135
+ if eff is None:
136
+ eff = metadata.get("signal_feature_importances", {})
137
+ importances: dict[str, float] = dict(eff)
138
+
139
+ # Split importances: original keeps (1 - r2), proxy gets r2
140
+ for proxy_idx, source_col in enumerate(targeted):
141
+ source_key = f"feature_{source_col}"
142
+ proxy_key = f"feature_{n_features + proxy_idx}"
143
+
144
+ col_var = float(np.var(X[:, source_col]))
145
+ signal_var = col_var * (self.scale**2)
146
+ r2 = signal_var / (signal_var + noise_std**2 + 1e-12)
147
+
148
+ original_imp = importances.get(source_key, 0.0)
149
+ importances[source_key] = original_imp * (1.0 - r2)
150
+ importances[proxy_key] = original_imp * r2
151
+
152
+ # Re-normalize to sum to 1.0
153
+ total = sum(importances.values())
154
+ if total > 0:
155
+ importances = {k: v / total for k, v in importances.items()}
156
+
157
+ # Build output metadata (no in-place mutation)
158
+ meta_out = dict(metadata)
159
+ meta_out["effective_feature_importances"] = importances
160
+ meta_out["proxy_source_map"] = proxy_source_map
161
+
162
+ return X_out, meta_out
163
+
164
+ def get_params(self) -> dict[str, object]:
165
+ """Return constructor parameters as a plain dict."""
166
+ return {
167
+ "severity": self.severity,
168
+ "noise_std": self.noise_std,
169
+ "scale": self.scale,
170
+ "columns": self.columns,
171
+ }
172
+
173
+
174
+ __all__ = ["CollinearityCorruptor"]
@@ -0,0 +1,102 @@
1
+ """Base class for label-space corruptors.
2
+
3
+ Corruptors that transform y (targets). X is read-only reference, never mutated.
4
+
5
+ Concrete subclasses must:
6
+ - Declare ``key="some_key"`` in their class signature to be auto-registered.
7
+ - Implement
8
+ [corrupt_labels][synthbench.corruptors.label_base.BaseLabelCorruptor.corrupt_labels].
9
+ - Override [get_params][synthbench.corruptors.label_base.BaseLabelCorruptor.get_params].
10
+
11
+ Example::
12
+
13
+ class LabelNoiseCorruptor(BaseLabelCorruptor, key="label_noise"):
14
+ def corrupt_labels(self, X, y, metadata, random_state):
15
+ ...
16
+ def get_params(self):
17
+ return {"noise_rate": self.noise_rate}
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from abc import ABC, abstractmethod
23
+
24
+ import numpy as np
25
+
26
+ _LABEL_CORRUPTOR_REGISTRY: dict[str, type] = {}
27
+
28
+
29
+ class BaseLabelCorruptor(ABC):
30
+ """Abstract base class for all label-space corruptors.
31
+
32
+ Concrete subclasses must:
33
+ - Declare ``key="some_key"`` in their class signature to be auto-registered.
34
+ - Implement
35
+ [corrupt_labels][synthbench.corruptors.label_base.BaseLabelCorruptor.corrupt_labels].
36
+
37
+ Label corruptors transform y only; X is passed as a read-only reference
38
+ and must never be mutated.
39
+
40
+ Example::
41
+
42
+ class LabelNoiseCorruptor(BaseLabelCorruptor, key="label_noise"):
43
+ def corrupt_labels(self, X, y, metadata, random_state):
44
+ ...
45
+ """
46
+
47
+ def __init_subclass__(cls, key: str | None = None, **kwargs: object) -> None:
48
+ super().__init_subclass__(**kwargs)
49
+ if key is not None:
50
+ if key in _LABEL_CORRUPTOR_REGISTRY:
51
+ raise ValueError(
52
+ f"Duplicate label corruptor key '{key}': already registered by "
53
+ f"{_LABEL_CORRUPTOR_REGISTRY[key].__qualname__}"
54
+ )
55
+ _LABEL_CORRUPTOR_REGISTRY[key] = cls
56
+
57
+ @abstractmethod
58
+ def corrupt_labels(
59
+ self,
60
+ X: np.ndarray,
61
+ y: np.ndarray,
62
+ metadata: dict,
63
+ random_state: int,
64
+ ) -> tuple[np.ndarray, dict]:
65
+ """Apply a label-space transformation to y.
66
+
67
+ Parameters
68
+ ----------
69
+ X:
70
+ Feature matrix of shape (n_samples, n_features). Read-only
71
+ reference; must not be modified in-place.
72
+ y:
73
+ Target array of shape (n_samples,). Must not be modified in-place;
74
+ return a new array.
75
+ metadata:
76
+ The BenchResult metadata dict. Label corruptors may add keys but
77
+ must not remove or overwrite existing keys.
78
+ random_state:
79
+ Integer seed. Each corruptor derives its own RNG from this value
80
+ so that results are fully reproducible.
81
+
82
+ Returns
83
+ -------
84
+ y_corrupted : np.ndarray
85
+ Transformed target array, same shape as y.
86
+ updated_metadata : dict
87
+ Metadata dict with any label-corruptor-specific fields added.
88
+ """
89
+
90
+ def get_params(self) -> dict[str, object]:
91
+ """Return the corruptor's current configuration as a plain dict.
92
+
93
+ Concrete corruptors should override this to return their __init__
94
+ parameters. Used by BenchPipeline to record component provenance.
95
+ """
96
+ raise NotImplementedError(
97
+ f"{type(self).__qualname__}.get_params() is not implemented. "
98
+ "Concrete label corruptor subclasses must override this method."
99
+ )
100
+
101
+
102
+ __all__ = ["BaseLabelCorruptor", "_LABEL_CORRUPTOR_REGISTRY"]