synthbench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synthbench-0.1.0/.gitignore +13 -0
- synthbench-0.1.0/CHANGELOG.md +5 -0
- synthbench-0.1.0/LICENSE +21 -0
- synthbench-0.1.0/PKG-INFO +127 -0
- synthbench-0.1.0/README.md +81 -0
- synthbench-0.1.0/icon.png +0 -0
- synthbench-0.1.0/pyproject.toml +99 -0
- synthbench-0.1.0/src/synthbench/__init__.py +77 -0
- synthbench-0.1.0/src/synthbench/_seed.py +43 -0
- synthbench-0.1.0/src/synthbench/_version.py +8 -0
- synthbench-0.1.0/src/synthbench/corruptors/__init__.py +23 -0
- synthbench-0.1.0/src/synthbench/corruptors/base.py +78 -0
- synthbench-0.1.0/src/synthbench/corruptors/categorical.py +154 -0
- synthbench-0.1.0/src/synthbench/corruptors/collinearity.py +174 -0
- synthbench-0.1.0/src/synthbench/corruptors/label_base.py +102 -0
- synthbench-0.1.0/src/synthbench/corruptors/label_noise.py +105 -0
- synthbench-0.1.0/src/synthbench/corruptors/measurement_noise.py +98 -0
- synthbench-0.1.0/src/synthbench/corruptors/missing_data.py +211 -0
- synthbench-0.1.0/src/synthbench/corruptors/outlier.py +114 -0
- synthbench-0.1.0/src/synthbench/data/suites/easy-classification.json +36 -0
- synthbench-0.1.0/src/synthbench/data/suites/hard-regression.json +36 -0
- synthbench-0.1.0/src/synthbench/dgps/__init__.py +45 -0
- synthbench-0.1.0/src/synthbench/dgps/_utils.py +87 -0
- synthbench-0.1.0/src/synthbench/dgps/additive.py +217 -0
- synthbench-0.1.0/src/synthbench/dgps/base.py +72 -0
- synthbench-0.1.0/src/synthbench/dgps/friedman.py +274 -0
- synthbench-0.1.0/src/synthbench/dgps/geometric.py +291 -0
- synthbench-0.1.0/src/synthbench/dgps/linear.py +165 -0
- synthbench-0.1.0/src/synthbench/dgps/neural.py +209 -0
- synthbench-0.1.0/src/synthbench/dgps/polynomial.py +200 -0
- synthbench-0.1.0/src/synthbench/dgps/sparse.py +165 -0
- synthbench-0.1.0/src/synthbench/dgps/tree.py +234 -0
- synthbench-0.1.0/src/synthbench/pipeline.py +644 -0
- synthbench-0.1.0/src/synthbench/suite.py +113 -0
- synthbench-0.1.0/src/synthbench/sweeps.py +223 -0
- synthbench-0.1.0/tests/conftest.py +59 -0
- synthbench-0.1.0/tests/test_additive_dgp.py +214 -0
- synthbench-0.1.0/tests/test_base_corruptor.py +86 -0
- synthbench-0.1.0/tests/test_base_dgp.py +68 -0
- synthbench-0.1.0/tests/test_base_label_corruptor.py +143 -0
- synthbench-0.1.0/tests/test_bench_result.py +280 -0
- synthbench-0.1.0/tests/test_categorical_corruptor.py +277 -0
- synthbench-0.1.0/tests/test_ci_smoke.py +53 -0
- synthbench-0.1.0/tests/test_class_weight.py +240 -0
- synthbench-0.1.0/tests/test_collinearity_corruptor.py +303 -0
- synthbench-0.1.0/tests/test_dgp_integration.py +326 -0
- synthbench-0.1.0/tests/test_friedman_dgp.py +361 -0
- synthbench-0.1.0/tests/test_geometric_dgp.py +259 -0
- synthbench-0.1.0/tests/test_label_noise_corruptor.py +270 -0
- synthbench-0.1.0/tests/test_linear_dgp.py +276 -0
- synthbench-0.1.0/tests/test_measurement_noise_corruptor.py +202 -0
- synthbench-0.1.0/tests/test_missing_data_corruptor.py +450 -0
- synthbench-0.1.0/tests/test_neural_dgp.py +179 -0
- synthbench-0.1.0/tests/test_outlier_corruptor.py +218 -0
- synthbench-0.1.0/tests/test_pipeline.py +662 -0
- synthbench-0.1.0/tests/test_polynomial_dgp.py +180 -0
- synthbench-0.1.0/tests/test_rng_isolation.py +153 -0
- synthbench-0.1.0/tests/test_seed.py +60 -0
- synthbench-0.1.0/tests/test_sparse_dgp.py +215 -0
- synthbench-0.1.0/tests/test_suite.py +110 -0
- synthbench-0.1.0/tests/test_sweeps.py +195 -0
- synthbench-0.1.0/tests/test_tree_dgp.py +208 -0
synthbench-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jan Teichert-Kluge
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: synthbench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Synthetic datasets for ML benchmarking with controllable complexity, configurable corruptions, and full provenance.
|
|
5
|
+
Project-URL: Homepage, https://github.com/JanTeichertKluge/synth-bench
|
|
6
|
+
Project-URL: Documentation, https://JanTeichertKluge.github.io/synth-bench
|
|
7
|
+
Project-URL: Repository, https://github.com/JanTeichertKluge/synth-bench.git
|
|
8
|
+
Project-URL: Issues, https://github.com/JanTeichertKluge/synth-bench/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/JanTeichertKluge/synth-bench/releases
|
|
10
|
+
Author-email: Jan Teichert-Kluge <janteiklu@gmail.com>
|
|
11
|
+
License-Expression: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: benchmarking,data-generating-process,dataset-generation,machine-learning,reproducibility,synthetic-data
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.12
|
|
25
|
+
Requires-Dist: numpy>=2.0
|
|
26
|
+
Requires-Dist: scikit-learn>=1.5
|
|
27
|
+
Requires-Dist: scipy>=1.12
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pre-commit>=3.5; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest-cov>=7.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
33
|
+
Provides-Extra: docs
|
|
34
|
+
Requires-Dist: matplotlib>=3.7; extra == 'docs'
|
|
35
|
+
Requires-Dist: mkdocs-jupyter>=0.26; extra == 'docs'
|
|
36
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
37
|
+
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
38
|
+
Requires-Dist: mkdocstrings[python]>=1.0; extra == 'docs'
|
|
39
|
+
Requires-Dist: openml>=0.14; extra == 'docs'
|
|
40
|
+
Requires-Dist: pandas>=2.0; extra == 'docs'
|
|
41
|
+
Provides-Extra: io
|
|
42
|
+
Requires-Dist: pyarrow>=14.0; extra == 'io'
|
|
43
|
+
Provides-Extra: neural
|
|
44
|
+
Requires-Dist: torch>=2.1; extra == 'neural'
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
<div align="center">
|
|
48
|
+
<img src="icon.png" alt="synthbench" width="420">
|
|
49
|
+
</div>
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
synthbench is a small Python library for generating synthetic datasets that are actually useful for benchmarking. You control the signal complexity, add noise or missing data on top, and get back a dataset with full provenance so you know exactly what you generated and why. Every result is reproducible from a single integer seed.
|
|
54
|
+
|
|
55
|
+
It covers eight DGP families, five corruptors, metadata enrichment (Bayes error, effective rank), Parquet/CSV serialization, and sweep helpers for running ablation grids.
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install synthbench
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
For Parquet support:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install "synthbench[io]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
For `RandomNeuralDGP` (needs PyTorch):
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install "synthbench[neural]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Basic usage
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from synthbench import BenchPipeline, LinearDGP, MissingDataCorruptor
|
|
79
|
+
|
|
80
|
+
pipeline = BenchPipeline(
|
|
81
|
+
LinearDGP(complexity="medium", task_type="classification"),
|
|
82
|
+
corruptors=[MissingDataCorruptor(proportion=0.1, mechanism="mar")],
|
|
83
|
+
)
|
|
84
|
+
result = pipeline.run(n_samples=500, n_features=10, random_state=42)
|
|
85
|
+
|
|
86
|
+
print(result.X.shape) # (500, 10)
|
|
87
|
+
print(result.metadata["bayes_error"]) # empirical difficulty estimate
|
|
88
|
+
print(result.metadata["effective_rank"]) # feature space dimensionality
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## What it does
|
|
92
|
+
|
|
93
|
+
**Data-generating processes** — Linear, Polynomial, Tree, Friedman (variants 1/2/3), Additive, Sparse, Geometric, and RandomNeural. Each takes a `complexity` parameter and records ground-truth feature importances alongside the data.
|
|
94
|
+
|
|
95
|
+
**Corruptors** — MeasurementNoise, Outlier, MissingData, Collinearity, and Categorical corruptors for the feature matrix, plus `LabelNoiseCorruptor` for flipping labels or injecting regression noise. They chain together in a canonical order and track how much signal they degrade.
|
|
96
|
+
|
|
97
|
+
**Metadata** — every result carries `bayes_error`, `effective_rank`, corruptor parameters, and version provenance. Enough to reconstruct the generating pipeline from scratch.
|
|
98
|
+
|
|
99
|
+
**Sweeps** — `severity_sweep` and `difficulty_sweep` for single-axis ablations, and `experiment_grid` for full factorial runs across sample size, complexity, and severity. Seeds are derived hierarchically so cells are independent but deterministic.
|
|
100
|
+
|
|
101
|
+
**Named suites** — `BenchSuite("easy-classification").run()` returns a labelled dict of results for a curated collection. Good for quick sanity checks or as a shared benchmark baseline.
|
|
102
|
+
|
|
103
|
+
**Serialization** — `to_parquet` / `from_parquet` and `to_csv` / `from_csv` round-trip everything including metadata. `BenchPipeline.from_metadata` reconstructs and re-runs the pipeline for bit-identical replay.
|
|
104
|
+
|
|
105
|
+
## Ablation example
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from synthbench import LinearDGP, OutlierCorruptor, experiment_grid
|
|
109
|
+
|
|
110
|
+
grid = experiment_grid(
|
|
111
|
+
LinearDGP,
|
|
112
|
+
OutlierCorruptor,
|
|
113
|
+
n_samples_list=[200, 500, 1000],
|
|
114
|
+
complexities=["low", "medium", "high"],
|
|
115
|
+
severities=["low", "medium", "high"],
|
|
116
|
+
n_features=10,
|
|
117
|
+
random_state=0,
|
|
118
|
+
task_type="classification",
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
result = grid[(500, "high", "medium")]
|
|
122
|
+
print(result.metadata["bayes_error"])
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Docs
|
|
126
|
+
|
|
127
|
+
Full reference at [JanTeichertKluge.github.io/synth-bench](https://JanTeichertKluge.github.io/synth-bench).
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="icon.png" alt="synthbench" width="420">
|
|
3
|
+
</div>
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
synthbench is a small Python library for generating synthetic datasets that are actually useful for benchmarking. You control the signal complexity, add noise or missing data on top, and get back a dataset with full provenance so you know exactly what you generated and why. Every result is reproducible from a single integer seed.
|
|
8
|
+
|
|
9
|
+
It covers eight DGP families, five corruptors, metadata enrichment (Bayes error, effective rank), Parquet/CSV serialization, and sweep helpers for running ablation grids.
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install synthbench
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
For Parquet support:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install "synthbench[io]"
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
For `RandomNeuralDGP` (needs PyTorch):
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install "synthbench[neural]"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Basic usage
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from synthbench import BenchPipeline, LinearDGP, MissingDataCorruptor
|
|
33
|
+
|
|
34
|
+
pipeline = BenchPipeline(
|
|
35
|
+
LinearDGP(complexity="medium", task_type="classification"),
|
|
36
|
+
corruptors=[MissingDataCorruptor(proportion=0.1, mechanism="mar")],
|
|
37
|
+
)
|
|
38
|
+
result = pipeline.run(n_samples=500, n_features=10, random_state=42)
|
|
39
|
+
|
|
40
|
+
print(result.X.shape) # (500, 10)
|
|
41
|
+
print(result.metadata["bayes_error"]) # empirical difficulty estimate
|
|
42
|
+
print(result.metadata["effective_rank"]) # feature space dimensionality
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## What it does
|
|
46
|
+
|
|
47
|
+
**Data-generating processes** — Linear, Polynomial, Tree, Friedman (variants 1/2/3), Additive, Sparse, Geometric, and RandomNeural. Each takes a `complexity` parameter and records ground-truth feature importances alongside the data.
|
|
48
|
+
|
|
49
|
+
**Corruptors** — MeasurementNoise, Outlier, MissingData, Collinearity, and Categorical corruptors for the feature matrix, plus `LabelNoiseCorruptor` for flipping labels or injecting regression noise. They chain together in a canonical order and track how much signal they degrade.
|
|
50
|
+
|
|
51
|
+
**Metadata** — every result carries `bayes_error`, `effective_rank`, corruptor parameters, and version provenance. Enough to reconstruct the generating pipeline from scratch.
|
|
52
|
+
|
|
53
|
+
**Sweeps** — `severity_sweep` and `difficulty_sweep` for single-axis ablations, and `experiment_grid` for full factorial runs across sample size, complexity, and severity. Seeds are derived hierarchically so cells are independent but deterministic.
|
|
54
|
+
|
|
55
|
+
**Named suites** — `BenchSuite("easy-classification").run()` returns a labelled dict of results for a curated collection. Good for quick sanity checks or as a shared benchmark baseline.
|
|
56
|
+
|
|
57
|
+
**Serialization** — `to_parquet` / `from_parquet` and `to_csv` / `from_csv` round-trip everything including metadata. `BenchPipeline.from_metadata` reconstructs and re-runs the pipeline for bit-identical replay.
|
|
58
|
+
|
|
59
|
+
## Ablation example
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from synthbench import LinearDGP, OutlierCorruptor, experiment_grid
|
|
63
|
+
|
|
64
|
+
grid = experiment_grid(
|
|
65
|
+
LinearDGP,
|
|
66
|
+
OutlierCorruptor,
|
|
67
|
+
n_samples_list=[200, 500, 1000],
|
|
68
|
+
complexities=["low", "medium", "high"],
|
|
69
|
+
severities=["low", "medium", "high"],
|
|
70
|
+
n_features=10,
|
|
71
|
+
random_state=0,
|
|
72
|
+
task_type="classification",
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
result = grid[(500, "high", "medium")]
|
|
76
|
+
print(result.metadata["bayes_error"])
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Docs
|
|
80
|
+
|
|
81
|
+
Full reference at [JanTeichertKluge.github.io/synth-bench](https://JanTeichertKluge.github.io/synth-bench).
|
|
Binary file
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.27"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "synthbench"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Synthetic datasets for ML benchmarking with controllable complexity, configurable corruptions, and full provenance."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Jan Teichert-Kluge", email = "janteiklu@gmail.com" },
|
|
15
|
+
]
|
|
16
|
+
keywords = [
|
|
17
|
+
"benchmarking",
|
|
18
|
+
"synthetic-data",
|
|
19
|
+
"machine-learning",
|
|
20
|
+
"dataset-generation",
|
|
21
|
+
"reproducibility",
|
|
22
|
+
"data-generating-process",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"Intended Audience :: Developers",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
"Programming Language :: Python :: 3",
|
|
30
|
+
"Programming Language :: Python :: 3.12",
|
|
31
|
+
"Programming Language :: Python :: 3.13",
|
|
32
|
+
"Topic :: Scientific/Engineering",
|
|
33
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
34
|
+
"Typing :: Typed",
|
|
35
|
+
]
|
|
36
|
+
dependencies = [
|
|
37
|
+
"numpy>=2.0",
|
|
38
|
+
"scikit-learn>=1.5",
|
|
39
|
+
"scipy>=1.12",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://github.com/JanTeichertKluge/synth-bench"
|
|
44
|
+
Documentation = "https://JanTeichertKluge.github.io/synth-bench"
|
|
45
|
+
Repository = "https://github.com/JanTeichertKluge/synth-bench.git"
|
|
46
|
+
Issues = "https://github.com/JanTeichertKluge/synth-bench/issues"
|
|
47
|
+
Changelog = "https://github.com/JanTeichertKluge/synth-bench/releases"
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
io = ["pyarrow>=14.0"]
|
|
51
|
+
neural = ["torch>=2.1"]
|
|
52
|
+
docs = [
|
|
53
|
+
"mkdocs>=1.6",
|
|
54
|
+
"mkdocs-material>=9.5",
|
|
55
|
+
"mkdocstrings[python]>=1.0",
|
|
56
|
+
"mkdocs-jupyter>=0.26",
|
|
57
|
+
"openml>=0.14",
|
|
58
|
+
"matplotlib>=3.7",
|
|
59
|
+
"pandas>=2.0",
|
|
60
|
+
]
|
|
61
|
+
dev = [
|
|
62
|
+
"pytest>=8.0",
|
|
63
|
+
"pytest-cov>=7.0",
|
|
64
|
+
"ruff>=0.5",
|
|
65
|
+
"pre-commit>=3.5",
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
[tool.hatch.build.targets.wheel]
|
|
69
|
+
packages = ["src/synthbench"]
|
|
70
|
+
|
|
71
|
+
[tool.hatch.build.targets.sdist]
|
|
72
|
+
exclude = [
|
|
73
|
+
".coverage",
|
|
74
|
+
".gitattributes",
|
|
75
|
+
".pre-commit-config.yaml",
|
|
76
|
+
"uv.lock",
|
|
77
|
+
".github/",
|
|
78
|
+
"docs/",
|
|
79
|
+
"mkdocs.yml",
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
[tool.ruff]
|
|
83
|
+
line-length = 88
|
|
84
|
+
target-version = "py312"
|
|
85
|
+
|
|
86
|
+
[tool.ruff.lint]
|
|
87
|
+
select = ["E", "W", "F", "I", "UP", "B"]
|
|
88
|
+
|
|
89
|
+
[tool.pytest.ini_options]
|
|
90
|
+
testpaths = ["tests"]
|
|
91
|
+
markers = [
|
|
92
|
+
"neural: marks tests requiring torch (deselect with '-m not neural')",
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
[tool.coverage.report]
|
|
96
|
+
fail_under = 90
|
|
97
|
+
omit = [
|
|
98
|
+
"src/synthbench/dgps/neural.py",
|
|
99
|
+
]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from synthbench._version import __version__
|
|
4
|
+
from synthbench.corruptors import (
|
|
5
|
+
BaseCorruptor,
|
|
6
|
+
BaseLabelCorruptor,
|
|
7
|
+
CategoricalCorruptor,
|
|
8
|
+
CollinearityCorruptor,
|
|
9
|
+
LabelNoiseCorruptor,
|
|
10
|
+
MeasurementNoiseCorruptor,
|
|
11
|
+
MissingDataCorruptor,
|
|
12
|
+
OutlierCorruptor,
|
|
13
|
+
)
|
|
14
|
+
from synthbench.dgps import (
|
|
15
|
+
AdditiveDGP,
|
|
16
|
+
BaseDGP,
|
|
17
|
+
FriedmanDGP,
|
|
18
|
+
GeometricDGP,
|
|
19
|
+
LinearDGP,
|
|
20
|
+
PolynomialDGP,
|
|
21
|
+
SparseDGP,
|
|
22
|
+
TreeDGP,
|
|
23
|
+
)
|
|
24
|
+
from synthbench.pipeline import BenchPipeline, BenchResult
|
|
25
|
+
from synthbench.suite import BenchSuite
|
|
26
|
+
from synthbench.sweeps import difficulty_sweep, experiment_grid, severity_sweep
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"__version__",
|
|
30
|
+
"AdditiveDGP",
|
|
31
|
+
"BaseDGP",
|
|
32
|
+
"BaseCorruptor",
|
|
33
|
+
"BaseLabelCorruptor",
|
|
34
|
+
"BenchPipeline",
|
|
35
|
+
"BenchResult",
|
|
36
|
+
"BenchSuite",
|
|
37
|
+
"CategoricalCorruptor",
|
|
38
|
+
"CollinearityCorruptor",
|
|
39
|
+
"difficulty_sweep",
|
|
40
|
+
"experiment_grid",
|
|
41
|
+
"FriedmanDGP",
|
|
42
|
+
"GeometricDGP",
|
|
43
|
+
"LabelNoiseCorruptor",
|
|
44
|
+
"LinearDGP",
|
|
45
|
+
"MeasurementNoiseCorruptor",
|
|
46
|
+
"MissingDataCorruptor",
|
|
47
|
+
"OutlierCorruptor",
|
|
48
|
+
"PolynomialDGP",
|
|
49
|
+
"severity_sweep",
|
|
50
|
+
"SparseDGP",
|
|
51
|
+
"TreeDGP",
|
|
52
|
+
# RandomNeuralDGP is available only with synthbench[neural] (torch required).
|
|
53
|
+
# It is exposed lazily via __getattr__ below so that importing synthbench
|
|
54
|
+
# does NOT unconditionally load torch into sys.modules.
|
|
55
|
+
"RandomNeuralDGP",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
# Names that are only available with the neural extra (torch).
|
|
59
|
+
_NEURAL_NAMES = {"RandomNeuralDGP"}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def __getattr__(name: str):
    """Resolve the optional, torch-backed symbols on first attribute access.

    Only names listed in ``_NEURAL_NAMES`` are handled here; every other name
    raises ``AttributeError``.  The import of ``synthbench.dgps.neural`` is
    deferred to this hook so that a plain ``import synthbench`` does not load
    it.

    NOTE(review): ``RandomNeuralDGP`` is also listed in ``__all__``, so
    ``from synthbench import *`` presumably reaches this hook as well and would
    surface the ImportError below when the neural extra is not installed --
    confirm that this is intended.

    Parameters
    ----------
    name:
        Attribute name that was not found in the module namespace.

    Raises
    ------
    AttributeError
        If *name* is not one of the lazily provided names.
    ImportError
        If ``synthbench.dgps.neural`` cannot be imported; the original error
        is chained as the cause.
    """
    # Guard clause: anything that is not a lazily provided name is a genuine miss.
    if name not in _NEURAL_NAMES:
        raise AttributeError(f"module 'synthbench' has no attribute {name!r}")

    try:
        from synthbench.dgps.neural import RandomNeuralDGP
    except ImportError as exc:
        raise ImportError(
            f"{name} requires PyTorch. "
            "Install it with: pip install synthbench[neural]"
        ) from exc

    # Store the class on the package so that later lookups find it in the
    # module namespace and never reach this hook again.
    import synthbench as package

    setattr(package, name, RandomNeuralDGP)
    return RandomNeuralDGP
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from numpy.random import SeedSequence
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def derive_seeds(master_state: int | SeedSequence, n: int) -> list[int]:
    """Spawn *n* child seeds from a master state and return them as ints.

    The children come from ``numpy.random.SeedSequence.spawn``, so the result
    is reproducible from the master state alone and the global numpy RNG is
    never consulted or modified.

    Parameters
    ----------
    master_state:
        A plain integer used to build the root ``SeedSequence``, or an
        existing ``SeedSequence``.  An existing sequence is used directly
        (not copied), so spawning advances that object's own spawn state.
    n:
        How many child seeds to derive.

    Returns
    -------
    list[int]
        Built-in Python ints (JSON-serializable, not numpy scalars), one per
        spawned child.
    """
    root = (
        master_state
        if isinstance(master_state, SeedSequence)
        else SeedSequence(int(master_state))
    )

    seeds: list[int] = []
    for child in root.spawn(n):
        # generate_state(1) is called with its default dtype (uint32 per the
        # NumPy documentation), so each child contributes one 32-bit word.
        seeds.append(int(child.generate_state(1)[0]))
    return seeds
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def make_rng(seed: int) -> np.random.RandomState:
    """Build a local ``numpy.random.RandomState`` from *seed*.

    Each call constructs a fresh generator object.  Public APIs that accept
    ``int | None | RandomState`` should go through
    ``sklearn.utils.check_random_state`` instead of this helper.

    Parameters
    ----------
    seed:
        Value passed straight to the ``RandomState`` constructor.

    Returns
    -------
    numpy.random.RandomState
        A new generator seeded with *seed*.
    """
    rng = np.random.RandomState(seed)
    return rng
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from synthbench.corruptors.base import BaseCorruptor
|
|
4
|
+
from synthbench.corruptors.categorical import CategoricalCorruptor
|
|
5
|
+
from synthbench.corruptors.collinearity import CollinearityCorruptor
|
|
6
|
+
from synthbench.corruptors.label_base import (
|
|
7
|
+
BaseLabelCorruptor,
|
|
8
|
+
)
|
|
9
|
+
from synthbench.corruptors.label_noise import LabelNoiseCorruptor
|
|
10
|
+
from synthbench.corruptors.measurement_noise import MeasurementNoiseCorruptor
|
|
11
|
+
from synthbench.corruptors.missing_data import MissingDataCorruptor
|
|
12
|
+
from synthbench.corruptors.outlier import OutlierCorruptor
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"BaseCorruptor",
|
|
16
|
+
"BaseLabelCorruptor",
|
|
17
|
+
"CategoricalCorruptor",
|
|
18
|
+
"CollinearityCorruptor",
|
|
19
|
+
"LabelNoiseCorruptor",
|
|
20
|
+
"MeasurementNoiseCorruptor",
|
|
21
|
+
"MissingDataCorruptor",
|
|
22
|
+
"OutlierCorruptor",
|
|
23
|
+
]
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
_CORRUPTOR_REGISTRY: dict[str, type] = {}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseCorruptor(ABC):
    """Common interface shared by all feature-matrix corruptors.

    A concrete corruptor must:

    - Pass ``key="some_key"`` in its class statement to be auto-registered
      under that key.
    - Implement [corrupt][synthbench.corruptors.base.BaseCorruptor.corrupt].

    Only X is handed to a corruptor; y is never passed in or mutated.

    Example::

        class CollinearityCorruptor(BaseCorruptor, key="collinearity"):
            def corrupt(self, X, metadata, random_state):
                ...
    """

    def __init_subclass__(cls, key: str | None = None, **kwargs: object) -> None:
        super().__init_subclass__(**kwargs)
        # Subclasses declared without a key are not registered.
        if key is None:
            return
        try:
            holder = _CORRUPTOR_REGISTRY[key]
        except KeyError:
            # First class to claim this key: record it.
            _CORRUPTOR_REGISTRY[key] = cls
            return
        # The key is already taken; refuse instead of silently replacing it.
        raise ValueError(
            f"Duplicate corruptor key '{key}': already registered by "
            f"{holder.__qualname__}"
        )

    @abstractmethod
    def corrupt(
        self,
        X: np.ndarray,
        metadata: dict,
        random_state: int,
    ) -> tuple[np.ndarray, dict]:
        """Return a structurally transformed copy of X plus updated metadata.

        Parameters
        ----------
        X:
            Feature matrix of shape (n_samples, n_features).  Implementations
            must return a new array rather than modifying X in-place.
        metadata:
            The BenchResult metadata dict produced by the DGP.  Implementations
            may add keys (e.g. effective_feature_importances) but must not
            remove or overwrite keys already set by the DGP.
        random_state:
            Integer seed from which the corruptor derives its own RNG, so that
            results are fully reproducible.

        Returns
        -------
        X_corrupted : np.ndarray
            Transformed feature matrix, same shape as X.
        updated_metadata : dict
            Metadata dict with any corruptor-specific fields added.
        """

    def get_params(self) -> dict[str, object]:
        """Return this corruptor's configuration as a plain dict.

        The base implementation always raises; concrete corruptors should
        override it to return their __init__ parameters.  BenchPipeline uses
        the result to record component provenance.

        Raises
        ------
        NotImplementedError
            Always, unless overridden by the subclass.
        """
        owner = type(self).__qualname__
        raise NotImplementedError(
            f"{owner}.get_params() is not implemented. "
            "Concrete corruptor subclasses must override this method."
        )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
__all__ = ["BaseCorruptor", "_CORRUPTOR_REGISTRY"]
|