synthbench 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synthbench/__init__.py +77 -0
- synthbench/_seed.py +43 -0
- synthbench/_version.py +8 -0
- synthbench/corruptors/__init__.py +23 -0
- synthbench/corruptors/base.py +78 -0
- synthbench/corruptors/categorical.py +154 -0
- synthbench/corruptors/collinearity.py +174 -0
- synthbench/corruptors/label_base.py +102 -0
- synthbench/corruptors/label_noise.py +105 -0
- synthbench/corruptors/measurement_noise.py +98 -0
- synthbench/corruptors/missing_data.py +211 -0
- synthbench/corruptors/outlier.py +114 -0
- synthbench/data/suites/easy-classification.json +36 -0
- synthbench/data/suites/hard-regression.json +36 -0
- synthbench/dgps/__init__.py +45 -0
- synthbench/dgps/_utils.py +87 -0
- synthbench/dgps/additive.py +217 -0
- synthbench/dgps/base.py +72 -0
- synthbench/dgps/friedman.py +274 -0
- synthbench/dgps/geometric.py +291 -0
- synthbench/dgps/linear.py +165 -0
- synthbench/dgps/neural.py +209 -0
- synthbench/dgps/polynomial.py +200 -0
- synthbench/dgps/sparse.py +165 -0
- synthbench/dgps/tree.py +234 -0
- synthbench/pipeline.py +644 -0
- synthbench/suite.py +113 -0
- synthbench/sweeps.py +223 -0
- synthbench-0.1.0.dist-info/METADATA +127 -0
- synthbench-0.1.0.dist-info/RECORD +32 -0
- synthbench-0.1.0.dist-info/WHEEL +4 -0
- synthbench-0.1.0.dist-info/licenses/LICENSE +21 -0
synthbench/__init__.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from synthbench._version import __version__
|
|
4
|
+
from synthbench.corruptors import (
|
|
5
|
+
BaseCorruptor,
|
|
6
|
+
BaseLabelCorruptor,
|
|
7
|
+
CategoricalCorruptor,
|
|
8
|
+
CollinearityCorruptor,
|
|
9
|
+
LabelNoiseCorruptor,
|
|
10
|
+
MeasurementNoiseCorruptor,
|
|
11
|
+
MissingDataCorruptor,
|
|
12
|
+
OutlierCorruptor,
|
|
13
|
+
)
|
|
14
|
+
from synthbench.dgps import (
|
|
15
|
+
AdditiveDGP,
|
|
16
|
+
BaseDGP,
|
|
17
|
+
FriedmanDGP,
|
|
18
|
+
GeometricDGP,
|
|
19
|
+
LinearDGP,
|
|
20
|
+
PolynomialDGP,
|
|
21
|
+
SparseDGP,
|
|
22
|
+
TreeDGP,
|
|
23
|
+
)
|
|
24
|
+
from synthbench.pipeline import BenchPipeline, BenchResult
|
|
25
|
+
from synthbench.suite import BenchSuite
|
|
26
|
+
from synthbench.sweeps import difficulty_sweep, experiment_grid, severity_sweep
|
|
27
|
+
|
|
28
|
+
# Public, flat re-export surface of the synthbench package.
__all__ = [
    "__version__",
    "AdditiveDGP",
    "BaseDGP",
    "BaseCorruptor",
    "BaseLabelCorruptor",
    "BenchPipeline",
    "BenchResult",
    "BenchSuite",
    "CategoricalCorruptor",
    "CollinearityCorruptor",
    "difficulty_sweep",
    "experiment_grid",
    "FriedmanDGP",
    "GeometricDGP",
    "LabelNoiseCorruptor",
    "LinearDGP",
    "MeasurementNoiseCorruptor",
    "MissingDataCorruptor",
    "OutlierCorruptor",
    "PolynomialDGP",
    "severity_sweep",
    "SparseDGP",
    "TreeDGP",
    # RandomNeuralDGP is available only with synthbench[neural] (torch required).
    # It is exposed lazily via __getattr__ below so that importing synthbench
    # does NOT unconditionally load torch into sys.modules.
    "RandomNeuralDGP",
]
|
|
57
|
+
|
|
58
|
+
# Names that are only available with the neural extra (torch).
|
|
59
|
+
_NEURAL_NAMES = {"RandomNeuralDGP"}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def __getattr__(name: str):
|
|
63
|
+
"""Lazy loader for optional neural symbols (requires synthbench[neural])."""
|
|
64
|
+
if name in _NEURAL_NAMES:
|
|
65
|
+
try:
|
|
66
|
+
from synthbench.dgps.neural import RandomNeuralDGP
|
|
67
|
+
except ImportError as exc:
|
|
68
|
+
raise ImportError(
|
|
69
|
+
f"{name} requires PyTorch. "
|
|
70
|
+
"Install it with: pip install synthbench[neural]"
|
|
71
|
+
) from exc
|
|
72
|
+
# Cache in module namespace so subsequent lookups are O(1).
|
|
73
|
+
import synthbench as _self
|
|
74
|
+
|
|
75
|
+
setattr(_self, name, RandomNeuralDGP)
|
|
76
|
+
return RandomNeuralDGP
|
|
77
|
+
raise AttributeError(f"module 'synthbench' has no attribute {name!r}")
|
synthbench/_seed.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from numpy.random import SeedSequence
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def derive_seeds(master_state: int | SeedSequence, n: int) -> list[int]:
    """Derive *n* independent integer seeds from a master state.

    Built on ``numpy.random.SeedSequence.spawn``, so the child seeds are
    statistically independent of each other and fully reproducible from the
    master state. The global numpy RNG state is never touched.

    Parameters
    ----------
    master_state:
        A plain integer used to seed the root ``SeedSequence``, or an
        already-constructed ``SeedSequence`` (e.g. from a parent context).
    n:
        Number of independent child seeds to derive.

    Returns
    -------
    list[int]
        Plain Python ints (JSON-serializable, not numpy dtypes).
    """
    root = (
        master_state
        if isinstance(master_state, SeedSequence)
        else SeedSequence(int(master_state))
    )
    # Each spawned child contributes a single uint64 word, converted to a
    # builtin int so the result serializes cleanly.
    return [int(child.generate_state(1)[0]) for child in root.spawn(n)]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def make_rng(seed: int) -> np.random.RandomState:
    """Create a fresh, locally-scoped legacy ``RandomState`` from *seed*.

    Public APIs that accept ``int | None | RandomState`` should instead go
    through ``sklearn.utils.check_random_state``; this helper is for internal
    call sites that already hold a concrete integer seed.
    """
    rng = np.random.RandomState(seed)
    return rng
|
synthbench/_version.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from synthbench.corruptors.base import BaseCorruptor
|
|
4
|
+
from synthbench.corruptors.categorical import CategoricalCorruptor
|
|
5
|
+
from synthbench.corruptors.collinearity import CollinearityCorruptor
|
|
6
|
+
from synthbench.corruptors.label_base import (
|
|
7
|
+
BaseLabelCorruptor,
|
|
8
|
+
)
|
|
9
|
+
from synthbench.corruptors.label_noise import LabelNoiseCorruptor
|
|
10
|
+
from synthbench.corruptors.measurement_noise import MeasurementNoiseCorruptor
|
|
11
|
+
from synthbench.corruptors.missing_data import MissingDataCorruptor
|
|
12
|
+
from synthbench.corruptors.outlier import OutlierCorruptor
|
|
13
|
+
|
|
14
|
+
# Public corruptor API re-exported by the synthbench.corruptors package.
__all__ = [
    "BaseCorruptor",
    "BaseLabelCorruptor",
    "CategoricalCorruptor",
    "CollinearityCorruptor",
    "LabelNoiseCorruptor",
    "MeasurementNoiseCorruptor",
    "MissingDataCorruptor",
    "OutlierCorruptor",
]
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
_CORRUPTOR_REGISTRY: dict[str, type] = {}


class BaseCorruptor(ABC):
    """Abstract base for feature-space corruptors.

    A concrete corruptor:

    - declares ``key="some_key"`` in its class signature to be auto-registered
      in ``_CORRUPTOR_REGISTRY``;
    - implements [corrupt][synthbench.corruptors.base.BaseCorruptor.corrupt].

    Corruptors transform X only; y is never passed in or mutated.

    Example::

        class CollinearityCorruptor(BaseCorruptor, key="collinearity"):
            def corrupt(self, X, metadata, random_state):
                ...
    """

    def __init_subclass__(cls, key: str | None = None, **kwargs: object) -> None:
        super().__init_subclass__(**kwargs)
        if key is None:
            # Intermediate/abstract subclasses may opt out of registration.
            return
        already = _CORRUPTOR_REGISTRY.get(key)
        if already is not None:
            raise ValueError(
                f"Duplicate corruptor key '{key}': already registered by "
                f"{already.__qualname__}"
            )
        _CORRUPTOR_REGISTRY[key] = cls

    @abstractmethod
    def corrupt(
        self,
        X: np.ndarray,
        metadata: dict,
        random_state: int,
    ) -> tuple[np.ndarray, dict]:
        """Apply a structural transformation to X.

        Parameters
        ----------
        X:
            Feature matrix of shape (n_samples, n_features). Must not be
            modified in-place; implementations return a new array.
        metadata:
            The BenchResult metadata dict produced by the DGP. Corruptors
            may add keys (e.g. effective_feature_importances) but must not
            remove or overwrite existing keys set by the DGP.
        random_state:
            Integer seed. Each corruptor derives its own RNG from this value
            so that results are fully reproducible.

        Returns
        -------
        X_corrupted : np.ndarray
            Transformed feature matrix, same shape as X.
        updated_metadata : dict
            Metadata dict with any corruptor-specific fields added.
        """

    def get_params(self) -> dict[str, object]:
        """Return the corruptor's current configuration as a plain dict.

        Concrete corruptors override this with their __init__ parameters;
        BenchPipeline uses it to record component provenance.
        """
        raise NotImplementedError(
            f"{type(self).__qualname__}.get_params() is not implemented. "
            "Concrete corruptor subclasses must override this method."
        )


__all__ = ["BaseCorruptor", "_CORRUPTOR_REGISTRY"]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""CategoricalCorruptor -- bins continuous features into integer categories."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from synthbench.corruptors.base import BaseCorruptor
|
|
8
|
+
|
|
9
|
+
_SEVERITY_N_BINS: dict[str, int] = {
    "low": 10,
    "medium": 5,
    "high": 2,
}


class CategoricalCorruptor(BaseCorruptor, key="categorical"):
    """Converts continuous features to integer-encoded bins.

    For each targeted column, quantile-based bin edges are computed and
    ``np.digitize`` is used to assign each sample to a bin index
    ``0, 1, ..., n_bins-1``. The column values are replaced with these
    integer indices cast to ``float``.

    Feature importances are discounted by the factor ``(1 - 1/n_bins)``
    for each targeted column, then re-normalized to sum to 1.

    Parameters
    ----------
    severity:
        Controls default ``n_bins`` when ``n_bins`` is not provided:
        ``"low"`` -> 10, ``"medium"`` -> 5, ``"high"`` -> 2.
    n_bins:
        If provided, overrides the severity-derived number of bins.
        Must be >= 1.
    columns:
        Indices of columns to target. ``None`` targets all columns.

    Raises
    ------
    ValueError
        If ``severity`` is not a recognized level, or if ``n_bins`` is
        provided and is less than 1.
    """

    def __init__(
        self,
        severity: str = "medium",
        n_bins: int | None = None,
        columns: list[int] | None = None,
    ) -> None:
        if severity not in _SEVERITY_N_BINS:
            raise ValueError(
                f"Invalid severity '{severity}'. "
                f"Must be one of {list(_SEVERITY_N_BINS)}."
            )
        # Fail fast: n_bins=0 (or negative) would otherwise surface much
        # later as an opaque ZeroDivisionError when the importance discount
        # (1 - 1/n_bins) is computed inside corrupt().
        if n_bins is not None and int(n_bins) < 1:
            raise ValueError(f"n_bins must be >= 1, got {n_bins}.")
        self.severity = severity
        self.n_bins = n_bins
        self.columns = columns

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _effective_n_bins(self) -> int:
        """Return n_bins, falling back to the severity default."""
        if self.n_bins is not None:
            return int(self.n_bins)
        return _SEVERITY_N_BINS[self.severity]

    @staticmethod
    def _bin_column(col_values: np.ndarray, n_bins: int) -> np.ndarray:
        """Quantile-bin a 1-D array into integer indices 0 .. n_bins-1."""
        edges = np.quantile(col_values, np.linspace(0.0, 1.0, n_bins + 1))
        # Handle constant columns: all edges are identical -> all go to bin 0
        if np.all(edges == edges[0]):
            return np.zeros(len(col_values), dtype=float)
        # np.digitize with the interior edges edges[1:-1] yields 0 .. n_bins-1.
        # NOTE(review): heavily tied (but non-constant) data can produce
        # duplicate quantile edges, leaving some bins empty -- confirm this
        # is acceptable for downstream consumers.
        bins = np.digitize(col_values, edges[1:-1])
        return bins.astype(float)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def corrupt(
        self,
        X: np.ndarray,
        metadata: dict,
        random_state: int,
    ) -> tuple[np.ndarray, dict]:
        """Bin targeted columns and update importances.

        Parameters
        ----------
        X:
            Feature matrix of shape ``(n_samples, n_features)``.
        metadata:
            Metadata dict from the DGP. Should contain either
            ``effective_feature_importances`` or
            ``signal_feature_importances``; if both are absent the
            importances default to empty.
        random_state:
            Integer seed. Binning is fully deterministic given X, so
            ``random_state`` does not affect output; it is accepted for
            interface consistency.

        Returns
        -------
        X_corrupted:
            Same shape as X; targeted columns contain integer-valued floats.
        updated_metadata:
            Metadata with updated ``effective_feature_importances``.
        """
        # random_state accepted for contract consistency; binning is deterministic
        _ = random_state

        n_features = X.shape[1]
        n_bins = self._effective_n_bins()

        # Determine targeted columns
        targeted = (
            list(range(n_features)) if self.columns is None else list(self.columns)
        )

        # Copy X -- do not modify in-place
        X_out = X.copy()
        for col in targeted:
            X_out[:, col] = self._bin_column(X[:, col], n_bins)

        # Retrieve effective importances (fall back to signal importances)
        eff = metadata.get("effective_feature_importances")
        if eff is None:
            eff = metadata.get("signal_feature_importances", {})
        importances: dict[str, float] = dict(eff)

        # Discount targeted column importances by the information lost to binning
        discount = 1.0 - 1.0 / n_bins
        for col in targeted:
            key = f"feature_{col}"
            importances[key] = importances.get(key, 0.0) * discount

        # Re-normalize to sum to 1.0 (skip when everything was zeroed out)
        total = sum(importances.values())
        if total > 0:
            importances = {k: v / total for k, v in importances.items()}

        # Build output metadata (no in-place mutation)
        meta_out = dict(metadata)
        meta_out["effective_feature_importances"] = importances

        return X_out, meta_out

    def get_params(self) -> dict[str, object]:
        """Return constructor parameters as a plain dict."""
        return {
            "severity": self.severity,
            "n_bins": self.n_bins,
            "columns": self.columns,
        }


__all__ = ["CategoricalCorruptor"]
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""CollinearityCorruptor -- appends correlated proxy columns to X."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from synthbench._seed import make_rng
|
|
8
|
+
from synthbench.corruptors.base import BaseCorruptor
|
|
9
|
+
|
|
10
|
+
_SEVERITY_NOISE_STD: dict[str, float] = {
    "low": 0.05,
    "medium": 0.3,
    "high": 0.8,
}


class CollinearityCorruptor(BaseCorruptor, key="collinearity"):
    """Appends proxy columns that are noisy copies of targeted features.

    Each targeted column ``c`` spawns one proxy column::

        proxy = X[:, c] * scale + N(0, noise_std, n_samples)

    Proxies are appended at the end of X, growing the feature matrix from
    ``(n_samples, n_features)`` to ``(n_samples, n_features + n_targeted)``.

    The original importance of each targeted column is split between it and
    its proxy using the proxy's coefficient of determination (r^2) with
    respect to the original signal.

    Parameters
    ----------
    severity:
        Controls default ``noise_std`` when ``noise_std`` is not provided:
        ``"low"`` -> 0.05, ``"medium"`` -> 0.3, ``"high"`` -> 0.8.
    noise_std:
        If provided, overrides the severity-derived noise standard deviation.
    scale:
        Multiplicative factor applied to the source column when generating
        the proxy. Defaults to 1.0.
    columns:
        Indices of columns to target. ``None`` targets all columns.
    """

    def __init__(
        self,
        severity: str = "medium",
        noise_std: float | None = None,
        scale: float = 1.0,
        columns: list[int] | None = None,
    ) -> None:
        if severity not in _SEVERITY_NOISE_STD:
            raise ValueError(
                f"Invalid severity '{severity}'. "
                f"Must be one of {list(_SEVERITY_NOISE_STD)}."
            )
        self.severity = severity
        self.noise_std = noise_std
        self.scale = scale
        self.columns = columns

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _effective_noise_std(self) -> float:
        """Resolve the noise level: explicit noise_std wins over severity."""
        return (
            float(self.noise_std)
            if self.noise_std is not None
            else _SEVERITY_NOISE_STD[self.severity]
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def corrupt(
        self,
        X: np.ndarray,
        metadata: dict,
        random_state: int,
    ) -> tuple[np.ndarray, dict]:
        """Append proxy columns and update importances.

        Parameters
        ----------
        X:
            Feature matrix of shape ``(n_samples, n_features)``.
        metadata:
            Metadata dict from the DGP; ``effective_feature_importances``
            is read when present, otherwise ``signal_feature_importances``.
        random_state:
            Integer seed for reproducibility.

        Returns
        -------
        X_corrupted:
            Shape ``(n_samples, n_features + n_targeted)``.
        updated_metadata:
            Metadata with updated ``effective_feature_importances`` and
            ``proxy_source_map``.
        """
        rng = make_rng(random_state)
        n_samples, n_features = X.shape
        sigma = self._effective_noise_std()

        target_cols = (
            list(range(n_features)) if self.columns is None else list(self.columns)
        )

        # Start from effective importances, falling back to signal importances.
        base = metadata.get("effective_feature_importances")
        if base is None:
            base = metadata.get("signal_feature_importances", {})
        importances: dict[str, float] = dict(base)

        extra_cols: list[np.ndarray] = []
        proxy_source_map: dict[str, str] = {}

        for offset, src in enumerate(target_cols):
            extra_cols.append(
                X[:, src] * self.scale + rng.normal(0.0, sigma, size=n_samples)
            )

            src_key = f"feature_{src}"
            new_key = f"feature_{n_features + offset}"
            proxy_source_map[new_key] = src_key

            # r^2 of the proxy against the (scaled) source signal; the tiny
            # epsilon guards against a zero-variance, zero-noise division.
            sig_var = float(np.var(X[:, src])) * (self.scale**2)
            r2 = sig_var / (sig_var + sigma**2 + 1e-12)

            # Original keeps (1 - r2) of its importance; the proxy takes r2.
            src_imp = importances.get(src_key, 0.0)
            importances[src_key] = src_imp * (1.0 - r2)
            importances[new_key] = src_imp * r2

        X_out = (
            np.hstack([X, np.column_stack(extra_cols)]) if extra_cols else X.copy()
        )

        # Re-normalize to sum to 1.0
        total = sum(importances.values())
        if total > 0:
            importances = {k: v / total for k, v in importances.items()}

        # Fresh metadata dict -- never mutate the caller's copy.
        meta_out = dict(metadata)
        meta_out["effective_feature_importances"] = importances
        meta_out["proxy_source_map"] = proxy_source_map

        return X_out, meta_out

    def get_params(self) -> dict[str, object]:
        """Return constructor parameters as a plain dict."""
        return {
            "severity": self.severity,
            "noise_std": self.noise_std,
            "scale": self.scale,
            "columns": self.columns,
        }


__all__ = ["CollinearityCorruptor"]
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Base class for label-space corruptors.
|
|
2
|
+
|
|
3
|
+
Corruptors that transform y (targets). X is read-only reference, never mutated.
|
|
4
|
+
|
|
5
|
+
Concrete subclasses must:
|
|
6
|
+
- Declare ``key="some_key"`` in their class signature to be auto-registered.
|
|
7
|
+
- Implement
|
|
8
|
+
[corrupt_labels][synthbench.corruptors.label_base.BaseLabelCorruptor.corrupt_labels].
|
|
9
|
+
- Override [get_params][synthbench.corruptors.label_base.BaseLabelCorruptor.get_params].
|
|
10
|
+
|
|
11
|
+
Example::
|
|
12
|
+
|
|
13
|
+
class LabelNoiseCorruptor(BaseLabelCorruptor, key="label_noise"):
|
|
14
|
+
def corrupt_labels(self, X, y, metadata, random_state):
|
|
15
|
+
...
|
|
16
|
+
def get_params(self):
|
|
17
|
+
return {"noise_rate": self.noise_rate}
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from abc import ABC, abstractmethod
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
|
|
26
|
+
_LABEL_CORRUPTOR_REGISTRY: dict[str, type] = {}


class BaseLabelCorruptor(ABC):
    """Abstract base for label-space corruptors.

    A concrete label corruptor:

    - declares ``key="some_key"`` in its class signature to be auto-registered
      in ``_LABEL_CORRUPTOR_REGISTRY``;
    - implements
      [corrupt_labels][synthbench.corruptors.label_base.BaseLabelCorruptor.corrupt_labels].

    Label corruptors transform y only; X is passed as a read-only reference
    and must never be mutated.

    Example::

        class LabelNoiseCorruptor(BaseLabelCorruptor, key="label_noise"):
            def corrupt_labels(self, X, y, metadata, random_state):
                ...
    """

    def __init_subclass__(cls, key: str | None = None, **kwargs: object) -> None:
        super().__init_subclass__(**kwargs)
        if key is None:
            # Intermediate/abstract subclasses may opt out of registration.
            return
        already = _LABEL_CORRUPTOR_REGISTRY.get(key)
        if already is not None:
            raise ValueError(
                f"Duplicate label corruptor key '{key}': already registered by "
                f"{already.__qualname__}"
            )
        _LABEL_CORRUPTOR_REGISTRY[key] = cls

    @abstractmethod
    def corrupt_labels(
        self,
        X: np.ndarray,
        y: np.ndarray,
        metadata: dict,
        random_state: int,
    ) -> tuple[np.ndarray, dict]:
        """Apply a label-space transformation to y.

        Parameters
        ----------
        X:
            Feature matrix of shape (n_samples, n_features). Read-only
            reference; must not be modified in-place.
        y:
            Target array of shape (n_samples,). Must not be modified
            in-place; implementations return a new array.
        metadata:
            The BenchResult metadata dict. Label corruptors may add keys but
            must not remove or overwrite existing keys.
        random_state:
            Integer seed. Each corruptor derives its own RNG from this value
            so that results are fully reproducible.

        Returns
        -------
        y_corrupted : np.ndarray
            Transformed target array, same shape as y.
        updated_metadata : dict
            Metadata dict with any label-corruptor-specific fields added.
        """

    def get_params(self) -> dict[str, object]:
        """Return the corruptor's current configuration as a plain dict.

        Concrete corruptors override this with their __init__ parameters;
        BenchPipeline uses it to record component provenance.
        """
        raise NotImplementedError(
            f"{type(self).__qualname__}.get_params() is not implemented. "
            "Concrete label corruptor subclasses must override this method."
        )


__all__ = ["BaseLabelCorruptor", "_LABEL_CORRUPTOR_REGISTRY"]
|