synthbench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. synthbench-0.1.0/.gitignore +13 -0
  2. synthbench-0.1.0/CHANGELOG.md +5 -0
  3. synthbench-0.1.0/LICENSE +21 -0
  4. synthbench-0.1.0/PKG-INFO +127 -0
  5. synthbench-0.1.0/README.md +81 -0
  6. synthbench-0.1.0/icon.png +0 -0
  7. synthbench-0.1.0/pyproject.toml +99 -0
  8. synthbench-0.1.0/src/synthbench/__init__.py +77 -0
  9. synthbench-0.1.0/src/synthbench/_seed.py +43 -0
  10. synthbench-0.1.0/src/synthbench/_version.py +8 -0
  11. synthbench-0.1.0/src/synthbench/corruptors/__init__.py +23 -0
  12. synthbench-0.1.0/src/synthbench/corruptors/base.py +78 -0
  13. synthbench-0.1.0/src/synthbench/corruptors/categorical.py +154 -0
  14. synthbench-0.1.0/src/synthbench/corruptors/collinearity.py +174 -0
  15. synthbench-0.1.0/src/synthbench/corruptors/label_base.py +102 -0
  16. synthbench-0.1.0/src/synthbench/corruptors/label_noise.py +105 -0
  17. synthbench-0.1.0/src/synthbench/corruptors/measurement_noise.py +98 -0
  18. synthbench-0.1.0/src/synthbench/corruptors/missing_data.py +211 -0
  19. synthbench-0.1.0/src/synthbench/corruptors/outlier.py +114 -0
  20. synthbench-0.1.0/src/synthbench/data/suites/easy-classification.json +36 -0
  21. synthbench-0.1.0/src/synthbench/data/suites/hard-regression.json +36 -0
  22. synthbench-0.1.0/src/synthbench/dgps/__init__.py +45 -0
  23. synthbench-0.1.0/src/synthbench/dgps/_utils.py +87 -0
  24. synthbench-0.1.0/src/synthbench/dgps/additive.py +217 -0
  25. synthbench-0.1.0/src/synthbench/dgps/base.py +72 -0
  26. synthbench-0.1.0/src/synthbench/dgps/friedman.py +274 -0
  27. synthbench-0.1.0/src/synthbench/dgps/geometric.py +291 -0
  28. synthbench-0.1.0/src/synthbench/dgps/linear.py +165 -0
  29. synthbench-0.1.0/src/synthbench/dgps/neural.py +209 -0
  30. synthbench-0.1.0/src/synthbench/dgps/polynomial.py +200 -0
  31. synthbench-0.1.0/src/synthbench/dgps/sparse.py +165 -0
  32. synthbench-0.1.0/src/synthbench/dgps/tree.py +234 -0
  33. synthbench-0.1.0/src/synthbench/pipeline.py +644 -0
  34. synthbench-0.1.0/src/synthbench/suite.py +113 -0
  35. synthbench-0.1.0/src/synthbench/sweeps.py +223 -0
  36. synthbench-0.1.0/tests/conftest.py +59 -0
  37. synthbench-0.1.0/tests/test_additive_dgp.py +214 -0
  38. synthbench-0.1.0/tests/test_base_corruptor.py +86 -0
  39. synthbench-0.1.0/tests/test_base_dgp.py +68 -0
  40. synthbench-0.1.0/tests/test_base_label_corruptor.py +143 -0
  41. synthbench-0.1.0/tests/test_bench_result.py +280 -0
  42. synthbench-0.1.0/tests/test_categorical_corruptor.py +277 -0
  43. synthbench-0.1.0/tests/test_ci_smoke.py +53 -0
  44. synthbench-0.1.0/tests/test_class_weight.py +240 -0
  45. synthbench-0.1.0/tests/test_collinearity_corruptor.py +303 -0
  46. synthbench-0.1.0/tests/test_dgp_integration.py +326 -0
  47. synthbench-0.1.0/tests/test_friedman_dgp.py +361 -0
  48. synthbench-0.1.0/tests/test_geometric_dgp.py +259 -0
  49. synthbench-0.1.0/tests/test_label_noise_corruptor.py +270 -0
  50. synthbench-0.1.0/tests/test_linear_dgp.py +276 -0
  51. synthbench-0.1.0/tests/test_measurement_noise_corruptor.py +202 -0
  52. synthbench-0.1.0/tests/test_missing_data_corruptor.py +450 -0
  53. synthbench-0.1.0/tests/test_neural_dgp.py +179 -0
  54. synthbench-0.1.0/tests/test_outlier_corruptor.py +218 -0
  55. synthbench-0.1.0/tests/test_pipeline.py +662 -0
  56. synthbench-0.1.0/tests/test_polynomial_dgp.py +180 -0
  57. synthbench-0.1.0/tests/test_rng_isolation.py +153 -0
  58. synthbench-0.1.0/tests/test_seed.py +60 -0
  59. synthbench-0.1.0/tests/test_sparse_dgp.py +215 -0
  60. synthbench-0.1.0/tests/test_suite.py +110 -0
  61. synthbench-0.1.0/tests/test_sweeps.py +195 -0
  62. synthbench-0.1.0/tests/test_tree_dgp.py +208 -0
@@ -0,0 +1,13 @@
1
+ .planning/
2
+ .pytest_cache/
3
+ .ruff_cache/
4
+ .venv/
5
+ .cache/
6
+ *.pyc
7
+ quality_reports/
8
+
9
+ # Build artifacts
10
+ dist/
11
+ build/
12
+ *.egg-info/
13
+ .coverage
@@ -0,0 +1,5 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 — 2026-05-06
4
+
5
+ Initial public release on PyPI.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jan Teichert-Kluge
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,127 @@
1
+ Metadata-Version: 2.4
2
+ Name: synthbench
3
+ Version: 0.1.0
4
+ Summary: Synthetic datasets for ML benchmarking with controllable complexity, configurable corruptions, and full provenance.
5
+ Project-URL: Homepage, https://github.com/JanTeichertKluge/synth-bench
6
+ Project-URL: Documentation, https://JanTeichertKluge.github.io/synth-bench
7
+ Project-URL: Repository, https://github.com/JanTeichertKluge/synth-bench.git
8
+ Project-URL: Issues, https://github.com/JanTeichertKluge/synth-bench/issues
9
+ Project-URL: Changelog, https://github.com/JanTeichertKluge/synth-bench/releases
10
+ Author-email: Jan Teichert-Kluge <janteiklu@gmail.com>
11
+ License-Expression: MIT
12
+ License-File: LICENSE
13
+ Keywords: benchmarking,data-generating-process,dataset-generation,machine-learning,reproducibility,synthetic-data
14
+ Classifier: Development Status :: 3 - Alpha
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.12
25
+ Requires-Dist: numpy>=2.0
26
+ Requires-Dist: scikit-learn>=1.5
27
+ Requires-Dist: scipy>=1.12
28
+ Provides-Extra: dev
29
+ Requires-Dist: pre-commit>=3.5; extra == 'dev'
30
+ Requires-Dist: pytest-cov>=7.0; extra == 'dev'
31
+ Requires-Dist: pytest>=8.0; extra == 'dev'
32
+ Requires-Dist: ruff>=0.5; extra == 'dev'
33
+ Provides-Extra: docs
34
+ Requires-Dist: matplotlib>=3.7; extra == 'docs'
35
+ Requires-Dist: mkdocs-jupyter>=0.26; extra == 'docs'
36
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
37
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
38
+ Requires-Dist: mkdocstrings[python]>=1.0; extra == 'docs'
39
+ Requires-Dist: openml>=0.14; extra == 'docs'
40
+ Requires-Dist: pandas>=2.0; extra == 'docs'
41
+ Provides-Extra: io
42
+ Requires-Dist: pyarrow>=14.0; extra == 'io'
43
+ Provides-Extra: neural
44
+ Requires-Dist: torch>=2.1; extra == 'neural'
45
+ Description-Content-Type: text/markdown
46
+
47
+ <div align="center">
48
+ <img src="icon.png" alt="synthbench" width="420">
49
+ </div>
50
+
51
+ ---
52
+
53
+ synthbench is a small Python library for generating synthetic datasets that are actually useful for benchmarking. You control the signal complexity, add noise or missing data on top, and get back a dataset with full provenance so you know exactly what you generated and why. Every result is reproducible from a single integer seed.
54
+
55
+ It covers eight DGP families, five corruptors, metadata enrichment (Bayes error, effective rank), Parquet/CSV serialization, and sweep helpers for running ablation grids.
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ pip install synthbench
61
+ ```
62
+
63
+ For Parquet support:
64
+
65
+ ```bash
66
+ pip install "synthbench[io]"
67
+ ```
68
+
69
+ For `RandomNeuralDGP` (needs PyTorch):
70
+
71
+ ```bash
72
+ pip install "synthbench[neural]"
73
+ ```
74
+
75
+ ## Basic usage
76
+
77
+ ```python
78
+ from synthbench import BenchPipeline, LinearDGP, MissingDataCorruptor
79
+
80
+ pipeline = BenchPipeline(
81
+ LinearDGP(complexity="medium", task_type="classification"),
82
+ corruptors=[MissingDataCorruptor(proportion=0.1, mechanism="mar")],
83
+ )
84
+ result = pipeline.run(n_samples=500, n_features=10, random_state=42)
85
+
86
+ print(result.X.shape) # (500, 10)
87
+ print(result.metadata["bayes_error"]) # empirical difficulty estimate
88
+ print(result.metadata["effective_rank"]) # feature space dimensionality
89
+ ```
90
+
91
+ ## What it does
92
+
93
+ **Data-generating processes** — Linear, Polynomial, Tree, Friedman (variants 1/2/3), Additive, Sparse, Geometric, and RandomNeural. Each takes a `complexity` parameter and records ground-truth feature importances alongside the data.
94
+
95
+ **Corruptors** — MeasurementNoise, Outlier, MissingData, Collinearity, and Categorical corruptors for the feature matrix, plus `LabelNoiseCorruptor` for flipping labels or injecting regression noise. They chain together in a canonical order and track how much signal they degrade.
96
+
97
+ **Metadata** — every result carries `bayes_error`, `effective_rank`, corruptor parameters, and version provenance. Enough to reconstruct the generating pipeline from scratch.
98
+
99
+ **Sweeps** — `severity_sweep` and `difficulty_sweep` for single-axis ablations, and `experiment_grid` for full factorial runs across sample size, complexity, and severity. Seeds are derived hierarchically so cells are independent but deterministic.
100
+
101
+ **Named suites** — `BenchSuite("easy-classification").run()` returns a labelled dict of results for a curated collection. Good for quick sanity checks or as a shared benchmark baseline.
102
+
103
+ **Serialization** — `to_parquet` / `from_parquet` and `to_csv` / `from_csv` round-trip everything including metadata. `BenchPipeline.from_metadata` reconstructs and re-runs the pipeline for bit-identical replay.
104
+
105
+ ## Ablation example
106
+
107
+ ```python
108
+ from synthbench import LinearDGP, OutlierCorruptor, experiment_grid
109
+
110
+ grid = experiment_grid(
111
+ LinearDGP,
112
+ OutlierCorruptor,
113
+ n_samples_list=[200, 500, 1000],
114
+ complexities=["low", "medium", "high"],
115
+ severities=["low", "medium", "high"],
116
+ n_features=10,
117
+ random_state=0,
118
+ task_type="classification",
119
+ )
120
+
121
+ result = grid[(500, "high", "medium")]
122
+ print(result.metadata["bayes_error"])
123
+ ```
124
+
125
+ ## Docs
126
+
127
+ Full reference at [JanTeichertKluge.github.io/synth-bench](https://JanTeichertKluge.github.io/synth-bench).
@@ -0,0 +1,81 @@
1
+ <div align="center">
2
+ <img src="icon.png" alt="synthbench" width="420">
3
+ </div>
4
+
5
+ ---
6
+
7
+ synthbench is a small Python library for generating synthetic datasets that are actually useful for benchmarking. You control the signal complexity, add noise or missing data on top, and get back a dataset with full provenance so you know exactly what you generated and why. Every result is reproducible from a single integer seed.
8
+
9
+ It covers eight DGP families, five corruptors, metadata enrichment (Bayes error, effective rank), Parquet/CSV serialization, and sweep helpers for running ablation grids.
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ pip install synthbench
15
+ ```
16
+
17
+ For Parquet support:
18
+
19
+ ```bash
20
+ pip install "synthbench[io]"
21
+ ```
22
+
23
+ For `RandomNeuralDGP` (needs PyTorch):
24
+
25
+ ```bash
26
+ pip install "synthbench[neural]"
27
+ ```
28
+
29
+ ## Basic usage
30
+
31
+ ```python
32
+ from synthbench import BenchPipeline, LinearDGP, MissingDataCorruptor
33
+
34
+ pipeline = BenchPipeline(
35
+ LinearDGP(complexity="medium", task_type="classification"),
36
+ corruptors=[MissingDataCorruptor(proportion=0.1, mechanism="mar")],
37
+ )
38
+ result = pipeline.run(n_samples=500, n_features=10, random_state=42)
39
+
40
+ print(result.X.shape) # (500, 10)
41
+ print(result.metadata["bayes_error"]) # empirical difficulty estimate
42
+ print(result.metadata["effective_rank"]) # feature space dimensionality
43
+ ```
44
+
45
+ ## What it does
46
+
47
+ **Data-generating processes** — Linear, Polynomial, Tree, Friedman (variants 1/2/3), Additive, Sparse, Geometric, and RandomNeural. Each takes a `complexity` parameter and records ground-truth feature importances alongside the data.
48
+
49
+ **Corruptors** — MeasurementNoise, Outlier, MissingData, Collinearity, and Categorical corruptors for the feature matrix, plus `LabelNoiseCorruptor` for flipping labels or injecting regression noise. They chain together in a canonical order and track how much signal they degrade.
50
+
51
+ **Metadata** — every result carries `bayes_error`, `effective_rank`, corruptor parameters, and version provenance. Enough to reconstruct the generating pipeline from scratch.
52
+
53
+ **Sweeps** — `severity_sweep` and `difficulty_sweep` for single-axis ablations, and `experiment_grid` for full factorial runs across sample size, complexity, and severity. Seeds are derived hierarchically so cells are independent but deterministic.
54
+
55
+ **Named suites** — `BenchSuite("easy-classification").run()` returns a labelled dict of results for a curated collection. Good for quick sanity checks or as a shared benchmark baseline.
56
+
57
+ **Serialization** — `to_parquet` / `from_parquet` and `to_csv` / `from_csv` round-trip everything including metadata. `BenchPipeline.from_metadata` reconstructs and re-runs the pipeline for bit-identical replay.
58
+
59
+ ## Ablation example
60
+
61
+ ```python
62
+ from synthbench import LinearDGP, OutlierCorruptor, experiment_grid
63
+
64
+ grid = experiment_grid(
65
+ LinearDGP,
66
+ OutlierCorruptor,
67
+ n_samples_list=[200, 500, 1000],
68
+ complexities=["low", "medium", "high"],
69
+ severities=["low", "medium", "high"],
70
+ n_features=10,
71
+ random_state=0,
72
+ task_type="classification",
73
+ )
74
+
75
+ result = grid[(500, "high", "medium")]
76
+ print(result.metadata["bayes_error"])
77
+ ```
78
+
79
+ ## Docs
80
+
81
+ Full reference at [JanTeichertKluge.github.io/synth-bench](https://JanTeichertKluge.github.io/synth-bench).
Binary file
@@ -0,0 +1,99 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.27"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "synthbench"
7
+ version = "0.1.0"
8
+ description = "Synthetic datasets for ML benchmarking with controllable complexity, configurable corruptions, and full provenance."
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [
14
+ { name = "Jan Teichert-Kluge", email = "janteiklu@gmail.com" },
15
+ ]
16
+ keywords = [
17
+ "benchmarking",
18
+ "synthetic-data",
19
+ "machine-learning",
20
+ "dataset-generation",
21
+ "reproducibility",
22
+ "data-generating-process",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 3 - Alpha",
26
+ "Intended Audience :: Science/Research",
27
+ "Intended Audience :: Developers",
28
+ "Operating System :: OS Independent",
29
+ "Programming Language :: Python :: 3",
30
+ "Programming Language :: Python :: 3.12",
31
+ "Programming Language :: Python :: 3.13",
32
+ "Topic :: Scientific/Engineering",
33
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
34
+ "Typing :: Typed",
35
+ ]
36
+ dependencies = [
37
+ "numpy>=2.0",
38
+ "scikit-learn>=1.5",
39
+ "scipy>=1.12",
40
+ ]
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/JanTeichertKluge/synth-bench"
44
+ Documentation = "https://JanTeichertKluge.github.io/synth-bench"
45
+ Repository = "https://github.com/JanTeichertKluge/synth-bench.git"
46
+ Issues = "https://github.com/JanTeichertKluge/synth-bench/issues"
47
+ Changelog = "https://github.com/JanTeichertKluge/synth-bench/releases"
48
+
49
+ [project.optional-dependencies]
50
+ io = ["pyarrow>=14.0"]
51
+ neural = ["torch>=2.1"]
52
+ docs = [
53
+ "mkdocs>=1.6",
54
+ "mkdocs-material>=9.5",
55
+ "mkdocstrings[python]>=1.0",
56
+ "mkdocs-jupyter>=0.26",
57
+ "openml>=0.14",
58
+ "matplotlib>=3.7",
59
+ "pandas>=2.0",
60
+ ]
61
+ dev = [
62
+ "pytest>=8.0",
63
+ "pytest-cov>=7.0",
64
+ "ruff>=0.5",
65
+ "pre-commit>=3.5",
66
+ ]
67
+
68
+ [tool.hatch.build.targets.wheel]
69
+ packages = ["src/synthbench"]
70
+
71
+ [tool.hatch.build.targets.sdist]
72
+ exclude = [
73
+ ".coverage",
74
+ ".gitattributes",
75
+ ".pre-commit-config.yaml",
76
+ "uv.lock",
77
+ ".github/",
78
+ "docs/",
79
+ "mkdocs.yml",
80
+ ]
81
+
82
+ [tool.ruff]
83
+ line-length = 88
84
+ target-version = "py312"
85
+
86
+ [tool.ruff.lint]
87
+ select = ["E", "W", "F", "I", "UP", "B"]
88
+
89
+ [tool.pytest.ini_options]
90
+ testpaths = ["tests"]
91
+ markers = [
92
+ "neural: marks tests requiring torch (deselect with '-m not neural')",
93
+ ]
94
+
95
+ [tool.coverage.report]
96
+ fail_under = 90
97
+ omit = [
98
+ "src/synthbench/dgps/neural.py",
99
+ ]
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ from synthbench._version import __version__
4
+ from synthbench.corruptors import (
5
+ BaseCorruptor,
6
+ BaseLabelCorruptor,
7
+ CategoricalCorruptor,
8
+ CollinearityCorruptor,
9
+ LabelNoiseCorruptor,
10
+ MeasurementNoiseCorruptor,
11
+ MissingDataCorruptor,
12
+ OutlierCorruptor,
13
+ )
14
+ from synthbench.dgps import (
15
+ AdditiveDGP,
16
+ BaseDGP,
17
+ FriedmanDGP,
18
+ GeometricDGP,
19
+ LinearDGP,
20
+ PolynomialDGP,
21
+ SparseDGP,
22
+ TreeDGP,
23
+ )
24
+ from synthbench.pipeline import BenchPipeline, BenchResult
25
+ from synthbench.suite import BenchSuite
26
+ from synthbench.sweeps import difficulty_sweep, experiment_grid, severity_sweep
27
+
28
+ __all__ = [
29
+ "__version__",
30
+ "AdditiveDGP",
31
+ "BaseDGP",
32
+ "BaseCorruptor",
33
+ "BaseLabelCorruptor",
34
+ "BenchPipeline",
35
+ "BenchResult",
36
+ "BenchSuite",
37
+ "CategoricalCorruptor",
38
+ "CollinearityCorruptor",
39
+ "difficulty_sweep",
40
+ "experiment_grid",
41
+ "FriedmanDGP",
42
+ "GeometricDGP",
43
+ "LabelNoiseCorruptor",
44
+ "LinearDGP",
45
+ "MeasurementNoiseCorruptor",
46
+ "MissingDataCorruptor",
47
+ "OutlierCorruptor",
48
+ "PolynomialDGP",
49
+ "severity_sweep",
50
+ "SparseDGP",
51
+ "TreeDGP",
52
+ # RandomNeuralDGP is available only with synthbench[neural] (torch required).
53
+ # It is exposed lazily via __getattr__ below so that importing synthbench
54
+ # does NOT unconditionally load torch into sys.modules.
55
+ "RandomNeuralDGP",
56
+ ]
57
+
58
# Names that are only available with the neural extra (torch).
_NEURAL_NAMES = {"RandomNeuralDGP"}


def __getattr__(name: str):
    """Lazily resolve optional neural symbols (requires synthbench[neural]).

    This is the module-level ``__getattr__`` hook (PEP 562): Python calls it
    only when *name* is not found in the module namespace, which lets the
    torch-backed symbols stay unimported until they are first accessed.

    Parameters
    ----------
    name:
        Attribute being looked up on the package.

    Returns
    -------
    object
        The attribute called *name* from ``synthbench.dgps.neural``.

    Raises
    ------
    ImportError
        If *name* is a neural symbol but ``synthbench.dgps.neural`` cannot
        be imported.
    AttributeError
        If *name* is not one of the lazily exposed names.
    """
    if name not in _NEURAL_NAMES:
        raise AttributeError(f"module 'synthbench' has no attribute {name!r}")

    import importlib

    try:
        neural = importlib.import_module("synthbench.dgps.neural")
    except ImportError as exc:
        raise ImportError(
            f"{name} requires PyTorch. "
            "Install it with: pip install synthbench[neural]"
        ) from exc

    # Look up the symbol that was actually requested, so every entry of
    # _NEURAL_NAMES maps to its own object rather than to RandomNeuralDGP.
    value = getattr(neural, name)
    # Cache in this module's namespace so later lookups never reach
    # __getattr__ again.
    globals()[name] = value
    return value
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ from numpy.random import SeedSequence
5
+
6
+
7
+ def derive_seeds(master_state: int | SeedSequence, n: int) -> list[int]:
8
+ """Derive *n* independent integer seeds from a master state.
9
+
10
+ Uses ``numpy.random.SeedSequence.spawn`` so that child seeds are
11
+ statistically independent and fully reproducible from the master state.
12
+ The global numpy RNG state is never touched.
13
+
14
+ Parameters
15
+ ----------
16
+ master_state:
17
+ Either a plain integer that seeds the root ``SeedSequence``, or an
18
+ already-constructed ``SeedSequence`` (e.g. from a parent context).
19
+ n:
20
+ Number of independent child seeds to derive.
21
+
22
+ Returns
23
+ -------
24
+ list[int]
25
+ Plain Python ints (JSON-serializable, not numpy dtypes).
26
+ """
27
+ if isinstance(master_state, SeedSequence):
28
+ seq = master_state
29
+ else:
30
+ seq = SeedSequence(int(master_state))
31
+
32
+ children = seq.spawn(n)
33
+ # generate_state(1)[0] gives a single uint64 from the child sequence
34
+ return [int(child.generate_state(1)[0]) for child in children]
35
+
36
+
37
def make_rng(seed: int) -> np.random.RandomState:
    """Build a fresh, local ``numpy.random.RandomState`` seeded with *seed*.

    Public APIs that accept ``int | None | RandomState`` should go through
    ``sklearn.utils.check_random_state`` rather than this helper.
    """
    rng = np.random.RandomState(seed)
    return rng
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
# Imported outside the ``try`` so that the ``except`` clause below can always
# resolve PackageNotFoundError; only the metadata lookup itself is guarded.
from importlib.metadata import PackageNotFoundError, version

try:
    # Read the version from the installed distribution's metadata.
    __version__ = version("synthbench")
except PackageNotFoundError:
    # No installed distribution named "synthbench" was found, so fall back
    # to a development placeholder.
    __version__ = "0.0.0+dev"
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+
3
+ from synthbench.corruptors.base import BaseCorruptor
4
+ from synthbench.corruptors.categorical import CategoricalCorruptor
5
+ from synthbench.corruptors.collinearity import CollinearityCorruptor
6
+ from synthbench.corruptors.label_base import (
7
+ BaseLabelCorruptor,
8
+ )
9
+ from synthbench.corruptors.label_noise import LabelNoiseCorruptor
10
+ from synthbench.corruptors.measurement_noise import MeasurementNoiseCorruptor
11
+ from synthbench.corruptors.missing_data import MissingDataCorruptor
12
+ from synthbench.corruptors.outlier import OutlierCorruptor
13
+
14
+ __all__ = [
15
+ "BaseCorruptor",
16
+ "BaseLabelCorruptor",
17
+ "CategoricalCorruptor",
18
+ "CollinearityCorruptor",
19
+ "LabelNoiseCorruptor",
20
+ "MeasurementNoiseCorruptor",
21
+ "MissingDataCorruptor",
22
+ "OutlierCorruptor",
23
+ ]
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ import numpy as np
6
+
7
+ _CORRUPTOR_REGISTRY: dict[str, type] = {}
8
+
9
+
10
+ class BaseCorruptor(ABC):
11
+ """Abstract base class for all corruptors.
12
+
13
+ Concrete subclasses must:
14
+ - Declare ``key="some_key"`` in their class signature to be auto-registered.
15
+ - Implement [corrupt][synthbench.corruptors.base.BaseCorruptor.corrupt].
16
+
17
+ Corruptors transform X only; y is never passed in or mutated.
18
+
19
+ Example::
20
+
21
+ class CollinearityCorruptor(BaseCorruptor, key="collinearity"):
22
+ def corrupt(self, X, metadata, random_state):
23
+ ...
24
+ """
25
+
26
+ def __init_subclass__(cls, key: str | None = None, **kwargs: object) -> None:
27
+ super().__init_subclass__(**kwargs)
28
+ if key is not None:
29
+ if key in _CORRUPTOR_REGISTRY:
30
+ raise ValueError(
31
+ f"Duplicate corruptor key '{key}': already registered by "
32
+ f"{_CORRUPTOR_REGISTRY[key].__qualname__}"
33
+ )
34
+ _CORRUPTOR_REGISTRY[key] = cls
35
+
36
+ @abstractmethod
37
+ def corrupt(
38
+ self,
39
+ X: np.ndarray,
40
+ metadata: dict,
41
+ random_state: int,
42
+ ) -> tuple[np.ndarray, dict]:
43
+ """Apply a structural transformation to X.
44
+
45
+ Parameters
46
+ ----------
47
+ X:
48
+ Feature matrix of shape (n_samples, n_features). Must not be
49
+ modified in-place; return a new array.
50
+ metadata:
51
+ The BenchResult metadata dict produced by the DGP. Corruptors
52
+ may add keys (e.g. effective_feature_importances) but must not
53
+ remove or overwrite existing keys set by the DGP.
54
+ random_state:
55
+ Integer seed. Each corruptor derives its own RNG from this value
56
+ so that results are fully reproducible.
57
+
58
+ Returns
59
+ -------
60
+ X_corrupted : np.ndarray
61
+ Transformed feature matrix, same shape as X.
62
+ updated_metadata : dict
63
+ Metadata dict with any corruptor-specific fields added.
64
+ """
65
+
66
+ def get_params(self) -> dict[str, object]:
67
+ """Return the corruptor's current configuration as a plain dict.
68
+
69
+ Concrete corruptors should override this to return their __init__
70
+ parameters. Used by BenchPipeline to record component provenance.
71
+ """
72
+ raise NotImplementedError(
73
+ f"{type(self).__qualname__}.get_params() is not implemented. "
74
+ "Concrete corruptor subclasses must override this method."
75
+ )
76
+
77
+
78
+ __all__ = ["BaseCorruptor", "_CORRUPTOR_REGISTRY"]