sdgym 0.10.0.dev0__tar.gz → 0.10.1.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdgym-0.10.0.dev0/sdgym.egg-info → sdgym-0.10.1.dev0}/PKG-INFO +31 -24
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/pyproject.toml +31 -25
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/__init__.py +9 -5
- sdgym-0.10.1.dev0/sdgym/_dataset_utils.py +107 -0
- sdgym-0.10.1.dev0/sdgym/benchmark.py +1606 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/__main__.py +19 -8
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/collect.py +11 -9
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/summary.py +7 -3
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/utils.py +17 -13
- sdgym-0.10.1.dev0/sdgym/dataset_explorer.py +277 -0
- sdgym-0.10.1.dev0/sdgym/datasets.py +330 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/progress.py +1 -3
- sdgym-0.10.1.dev0/sdgym/result_explorer/__init__.py +5 -0
- sdgym-0.10.1.dev0/sdgym/result_explorer/result_explorer.py +121 -0
- sdgym-0.10.1.dev0/sdgym/result_explorer/result_handler.py +398 -0
- sdgym-0.10.1.dev0/sdgym/result_writer.py +147 -0
- sdgym-0.10.1.dev0/sdgym/s3.py +240 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/generate.py +4 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/realtabformer.py +2 -2
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/uniform.py +9 -1
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/utils.py +32 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0/sdgym.egg-info}/PKG-INFO +31 -24
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/SOURCES.txt +7 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/requires.txt +30 -21
- sdgym-0.10.1.dev0/tests/test_scripts.py +37 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/tests/test_tasks.py +1 -1
- sdgym-0.10.0.dev0/sdgym/benchmark.py +0 -839
- sdgym-0.10.0.dev0/sdgym/datasets.py +0 -227
- sdgym-0.10.0.dev0/sdgym/s3.py +0 -146
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/LICENSE +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/README.md +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/__init__.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/errors.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/metrics.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/__init__.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/base.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/column.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/identity.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/sdv.py +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/entry_points.txt +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/top_level.txt +0 -0
- {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.10.
|
|
3
|
+
Version: 0.10.1.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License: BSL-1.1
|
|
@@ -15,13 +15,13 @@ Classifier: Intended Audience :: Developers
|
|
|
15
15
|
Classifier: License :: Free for non-commercial use
|
|
16
16
|
Classifier: Natural Language :: English
|
|
17
17
|
Classifier: Programming Language :: Python :: 3
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.9
|
|
20
19
|
Classifier: Programming Language :: Python :: 3.10
|
|
21
20
|
Classifier: Programming Language :: Python :: 3.11
|
|
22
21
|
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
-
Requires-Python: <3.
|
|
24
|
+
Requires-Python: <3.14,>=3.9
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
27
|
Requires-Dist: appdirs>=1.3
|
|
@@ -29,59 +29,66 @@ Requires-Dist: boto3<2,>=1.28
|
|
|
29
29
|
Requires-Dist: botocore<2,>=1.31
|
|
30
30
|
Requires-Dist: cloudpickle>=2.1.0
|
|
31
31
|
Requires-Dist: compress-pickle>=1.2.0
|
|
32
|
-
Requires-Dist: humanfriendly>=
|
|
33
|
-
Requires-Dist: numpy>=1.
|
|
34
|
-
Requires-Dist: numpy>=1.
|
|
35
|
-
Requires-Dist: numpy>=1.26.0; python_version >= "3.12"
|
|
32
|
+
Requires-Dist: humanfriendly>=10.0
|
|
33
|
+
Requires-Dist: numpy>=1.22.2; python_version < "3.10"
|
|
34
|
+
Requires-Dist: numpy>=1.24.0; python_version >= "3.10" and python_version < "3.12"
|
|
35
|
+
Requires-Dist: numpy>=1.26.0; python_version >= "3.12" and python_version < "3.13"
|
|
36
|
+
Requires-Dist: numpy>=2.1.0; python_version >= "3.13"
|
|
36
37
|
Requires-Dist: pandas>=1.4.0; python_version < "3.11"
|
|
37
38
|
Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
|
|
38
|
-
Requires-Dist: pandas>=2.1.1; python_version >= "3.12"
|
|
39
|
+
Requires-Dist: pandas>=2.1.1; python_version >= "3.12" and python_version < "3.13"
|
|
40
|
+
Requires-Dist: pandas>=2.2.3; python_version >= "3.13"
|
|
39
41
|
Requires-Dist: psutil>=5.7
|
|
40
42
|
Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
|
|
41
43
|
Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
|
|
42
44
|
Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
|
|
43
|
-
Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12"
|
|
45
|
+
Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12" and python_version < "3.13"
|
|
46
|
+
Requires-Dist: scikit-learn>=1.5.2; python_version >= "3.13"
|
|
44
47
|
Requires-Dist: scipy>=1.7.3; python_version < "3.10"
|
|
45
48
|
Requires-Dist: scipy>=1.9.2; python_version >= "3.10" and python_version < "3.12"
|
|
46
|
-
Requires-Dist: scipy>=1.12.0; python_version >= "3.12"
|
|
49
|
+
Requires-Dist: scipy>=1.12.0; python_version >= "3.12" and python_version < "3.13"
|
|
50
|
+
Requires-Dist: scipy>=1.14.1; python_version >= "3.13"
|
|
47
51
|
Requires-Dist: tabulate<0.9,>=0.8.3
|
|
48
|
-
Requires-Dist: torch>=
|
|
49
|
-
Requires-Dist: torch>=2.0.0; python_version >= "3.10" and python_version < "3.12"
|
|
50
|
-
Requires-Dist: torch>=2.2.0; python_version >= "3.12"
|
|
52
|
+
Requires-Dist: torch>=2.6.0
|
|
51
53
|
Requires-Dist: tqdm>=4.66.3
|
|
52
54
|
Requires-Dist: XlsxWriter>=1.2.8
|
|
53
|
-
Requires-Dist: rdt>=1.
|
|
54
|
-
Requires-Dist: sdmetrics>=0.
|
|
55
|
-
Requires-Dist: sdv>=1.
|
|
55
|
+
Requires-Dist: rdt>=1.17.0
|
|
56
|
+
Requires-Dist: sdmetrics>=0.20.1
|
|
57
|
+
Requires-Dist: sdv>=1.21.0
|
|
56
58
|
Provides-Extra: dask
|
|
57
59
|
Requires-Dist: dask; extra == "dask"
|
|
58
60
|
Requires-Dist: distributed; extra == "dask"
|
|
59
61
|
Provides-Extra: realtabformer
|
|
60
|
-
Requires-Dist: realtabformer>=0.2.
|
|
61
|
-
Requires-Dist: torch>=2.
|
|
62
|
-
Requires-Dist:
|
|
62
|
+
Requires-Dist: realtabformer>=0.2.3; extra == "realtabformer"
|
|
63
|
+
Requires-Dist: torch>=2.6.0; extra == "realtabformer"
|
|
64
|
+
Requires-Dist: transformers<4.51; extra == "realtabformer"
|
|
63
65
|
Provides-Extra: test
|
|
64
66
|
Requires-Dist: sdgym[realtabformer]; extra == "test"
|
|
65
67
|
Requires-Dist: pytest>=6.2.5; extra == "test"
|
|
66
68
|
Requires-Dist: pytest-cov>=2.6.0; extra == "test"
|
|
67
69
|
Requires-Dist: jupyter<2,>=1.0.0; extra == "test"
|
|
68
|
-
Requires-Dist: rundoc<0.5,>=0.4.3; extra == "test"
|
|
69
70
|
Requires-Dist: tomli<3,>=2.0.0; extra == "test"
|
|
71
|
+
Requires-Dist: slack-sdk<4.0,>=3.23; extra == "test"
|
|
72
|
+
Requires-Dist: openpyxl>=3.0.0; python_version < "3.9" and extra == "test"
|
|
73
|
+
Requires-Dist: openpyxl>=3.1.2; python_version >= "3.9" and extra == "test"
|
|
74
|
+
Requires-Dist: pydrive2<2.0.0,>=1.4.0; extra == "test"
|
|
70
75
|
Provides-Extra: dev
|
|
71
76
|
Requires-Dist: sdgym[dask,test]; extra == "dev"
|
|
72
77
|
Requires-Dist: build<2,>=1.0.0; extra == "dev"
|
|
73
|
-
Requires-Dist: bump-my-version
|
|
78
|
+
Requires-Dist: bump-my-version>=0.18.3; extra == "dev"
|
|
74
79
|
Requires-Dist: pip>=9.0.1; extra == "dev"
|
|
75
80
|
Requires-Dist: watchdog<5,>=1.0.1; extra == "dev"
|
|
76
81
|
Requires-Dist: ruff<1,>=0.4.5; extra == "dev"
|
|
77
|
-
Requires-Dist: twine
|
|
82
|
+
Requires-Dist: twine>=1.10.0; extra == "dev"
|
|
78
83
|
Requires-Dist: wheel>=0.30.0; extra == "dev"
|
|
79
84
|
Requires-Dist: coverage<8,>=4.5.12; extra == "dev"
|
|
80
|
-
Requires-Dist: tox<5,>=2.9.1; extra == "dev"
|
|
81
85
|
Requires-Dist: importlib-metadata>=3.6; extra == "dev"
|
|
82
86
|
Requires-Dist: invoke; extra == "dev"
|
|
87
|
+
Provides-Extra: readme
|
|
88
|
+
Requires-Dist: rundoc<0.5,>=0.4.3; extra == "readme"
|
|
83
89
|
Provides-Extra: all
|
|
84
90
|
Requires-Dist: sdgym[dask,dev,test]; extra == "all"
|
|
91
|
+
Dynamic: license-file
|
|
85
92
|
|
|
86
93
|
<div align="center">
|
|
87
94
|
<br/>
|
|
@@ -8,17 +8,17 @@ classifiers = [
|
|
|
8
8
|
'License :: Free for non-commercial use',
|
|
9
9
|
'Natural Language :: English',
|
|
10
10
|
'Programming Language :: Python :: 3',
|
|
11
|
-
'Programming Language :: Python :: 3.8',
|
|
12
11
|
'Programming Language :: Python :: 3.9',
|
|
13
12
|
'Programming Language :: Python :: 3.10',
|
|
14
13
|
'Programming Language :: Python :: 3.11',
|
|
15
14
|
'Programming Language :: Python :: 3.12',
|
|
15
|
+
'Programming Language :: Python :: 3.13',
|
|
16
16
|
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
|
17
17
|
]
|
|
18
18
|
keywords = ['machine learning', 'synthetic data generation', 'benchmark', 'generative models']
|
|
19
19
|
dynamic = ['version']
|
|
20
20
|
license = { text = 'BSL-1.1' }
|
|
21
|
-
requires-python = '>=3.
|
|
21
|
+
requires-python = '>=3.9,<3.14'
|
|
22
22
|
readme = 'README.md'
|
|
23
23
|
dependencies = [
|
|
24
24
|
'appdirs>=1.3',
|
|
@@ -26,30 +26,32 @@ dependencies = [
|
|
|
26
26
|
'botocore>=1.31,<2',
|
|
27
27
|
'cloudpickle>=2.1.0',
|
|
28
28
|
'compress-pickle>=1.2.0',
|
|
29
|
-
'humanfriendly>=
|
|
30
|
-
"numpy>=1.
|
|
31
|
-
"numpy>=1.
|
|
32
|
-
"numpy>=1.26.0;python_version>='3.12'",
|
|
29
|
+
'humanfriendly>=10.0',
|
|
30
|
+
"numpy>=1.22.2;python_version<'3.10'",
|
|
31
|
+
"numpy>=1.24.0;python_version>='3.10' and python_version<'3.12'",
|
|
32
|
+
"numpy>=1.26.0;python_version>='3.12' and python_version<'3.13'",
|
|
33
|
+
"numpy>=2.1.0;python_version>='3.13'",
|
|
33
34
|
"pandas>=1.4.0;python_version<'3.11'",
|
|
34
35
|
"pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
|
|
35
|
-
"pandas>=2.1.1;python_version>='3.12'",
|
|
36
|
+
"pandas>=2.1.1;python_version>='3.12' and python_version<'3.13'",
|
|
37
|
+
"pandas>=2.2.3;python_version>='3.13'",
|
|
36
38
|
'psutil>=5.7',
|
|
37
39
|
"scikit-learn>=1.0.2;python_version<'3.10'",
|
|
38
40
|
"scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
|
|
39
41
|
"scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
|
|
40
|
-
"scikit-learn>=1.3.1;python_version>='3.12'",
|
|
42
|
+
"scikit-learn>=1.3.1;python_version>='3.12' and python_version<'3.13'",
|
|
43
|
+
"scikit-learn>=1.5.2;python_version>='3.13'",
|
|
41
44
|
"scipy>=1.7.3;python_version<'3.10'",
|
|
42
45
|
"scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'",
|
|
43
|
-
"scipy>=1.12.0;python_version>='3.12'",
|
|
46
|
+
"scipy>=1.12.0;python_version>='3.12' and python_version<'3.13'",
|
|
47
|
+
"scipy>=1.14.1;python_version>='3.13'",
|
|
44
48
|
'tabulate>=0.8.3,<0.9',
|
|
45
|
-
"torch>=
|
|
46
|
-
"torch>=2.0.0;python_version>='3.10' and python_version<'3.12'",
|
|
47
|
-
"torch>=2.2.0;python_version>='3.12'",
|
|
49
|
+
"torch>=2.6.0",
|
|
48
50
|
'tqdm>=4.66.3',
|
|
49
51
|
'XlsxWriter>=1.2.8',
|
|
50
|
-
'rdt>=1.
|
|
51
|
-
'sdmetrics>=0.
|
|
52
|
-
'sdv>=1.
|
|
52
|
+
'rdt>=1.17.0',
|
|
53
|
+
'sdmetrics>=0.20.1',
|
|
54
|
+
'sdv>=1.21.0',
|
|
53
55
|
]
|
|
54
56
|
|
|
55
57
|
[project.urls]
|
|
@@ -65,24 +67,27 @@ sdgym = { main = 'sdgym.cli.__main__:main' }
|
|
|
65
67
|
[project.optional-dependencies]
|
|
66
68
|
dask = ['dask', 'distributed']
|
|
67
69
|
realtabformer = [
|
|
68
|
-
'realtabformer>=0.2.
|
|
69
|
-
"torch>=2.
|
|
70
|
-
|
|
70
|
+
'realtabformer>=0.2.3',
|
|
71
|
+
"torch>=2.6.0",
|
|
72
|
+
'transformers<4.51',
|
|
71
73
|
]
|
|
72
74
|
test = [
|
|
73
75
|
'sdgym[realtabformer]',
|
|
74
76
|
'pytest>=6.2.5',
|
|
75
77
|
'pytest-cov>=2.6.0',
|
|
76
78
|
'jupyter>=1.0.0,<2',
|
|
77
|
-
'rundoc>=0.4.3,<0.5',
|
|
78
79
|
'tomli>=2.0.0,<3',
|
|
80
|
+
'slack-sdk>=3.23,<4.0',
|
|
81
|
+
"openpyxl>=3.0.0; python_version<'3.9'",
|
|
82
|
+
"openpyxl>=3.1.2; python_version>='3.9'",
|
|
83
|
+
'pydrive2>=1.4.0,<2.0.0'
|
|
79
84
|
]
|
|
80
85
|
dev = [
|
|
81
86
|
'sdgym[dask, test]',
|
|
82
87
|
|
|
83
88
|
# general
|
|
84
89
|
'build>=1.0.0,<2',
|
|
85
|
-
'bump-my-version>=0.18.3
|
|
90
|
+
'bump-my-version>=0.18.3',
|
|
86
91
|
'pip>=9.0.1',
|
|
87
92
|
'watchdog>=1.0.1,<5',
|
|
88
93
|
|
|
@@ -90,17 +95,17 @@ dev = [
|
|
|
90
95
|
'ruff>=0.4.5,<1',
|
|
91
96
|
|
|
92
97
|
# distribute on PyPI
|
|
93
|
-
'twine>=1.10.0
|
|
98
|
+
'twine>=1.10.0',
|
|
94
99
|
'wheel>=0.30.0',
|
|
95
100
|
|
|
96
101
|
# Advanced testing
|
|
97
102
|
'coverage>=4.5.12,<8',
|
|
98
|
-
'tox>=2.9.1,<5',
|
|
99
103
|
'importlib-metadata>=3.6',
|
|
100
104
|
|
|
101
105
|
# Invoke
|
|
102
106
|
'invoke',
|
|
103
107
|
]
|
|
108
|
+
readme = ['rundoc>=0.4.3,<0.5',]
|
|
104
109
|
all = [
|
|
105
110
|
'sdgym[dask, test, dev]',
|
|
106
111
|
]
|
|
@@ -140,7 +145,7 @@ namespaces = false
|
|
|
140
145
|
version = {attr = 'sdgym.__version__'}
|
|
141
146
|
|
|
142
147
|
[tool.bumpversion]
|
|
143
|
-
current_version = "0.10.
|
|
148
|
+
current_version = "0.10.1.dev0"
|
|
144
149
|
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
|
|
145
150
|
serialize = [
|
|
146
151
|
'{major}.{minor}.{patch}.{release}{candidate}',
|
|
@@ -188,6 +193,8 @@ exclude = [
|
|
|
188
193
|
"__pycache__",
|
|
189
194
|
".ipynb_checkpoints",
|
|
190
195
|
"tasks.py",
|
|
196
|
+
"static_code_analysis.txt",
|
|
197
|
+
"*.ipynb"
|
|
191
198
|
]
|
|
192
199
|
|
|
193
200
|
[tool.ruff.lint]
|
|
@@ -212,7 +219,6 @@ ignore = [
|
|
|
212
219
|
# pydocstyle
|
|
213
220
|
"D107", # Missing docstring in __init__
|
|
214
221
|
"D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
|
|
215
|
-
"PD901",
|
|
216
222
|
"PD101",
|
|
217
223
|
]
|
|
218
224
|
|
|
@@ -237,4 +243,4 @@ convention = "google"
|
|
|
237
243
|
|
|
238
244
|
[tool.ruff.lint.pycodestyle]
|
|
239
245
|
max-doc-length = 100
|
|
240
|
-
max-line-length = 100
|
|
246
|
+
max-line-length = 100
|
|
@@ -8,26 +8,30 @@ __author__ = 'DataCebo, Inc.'
|
|
|
8
8
|
__copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
|
|
9
9
|
__email__ = 'info@sdv.dev'
|
|
10
10
|
__license__ = 'BSL-1.1'
|
|
11
|
-
__version__ = '0.10.
|
|
11
|
+
__version__ = '0.10.1.dev0'
|
|
12
12
|
|
|
13
13
|
import logging
|
|
14
14
|
|
|
15
15
|
from sdgym.benchmark import benchmark_single_table
|
|
16
16
|
from sdgym.cli.collect import collect_results
|
|
17
17
|
from sdgym.cli.summary import make_summary_spreadsheet
|
|
18
|
+
from sdgym.dataset_explorer import DatasetExplorer
|
|
18
19
|
from sdgym.datasets import get_available_datasets, load_dataset
|
|
19
20
|
from sdgym.synthesizers import create_sdv_synthesizer_variant, create_single_table_synthesizer
|
|
21
|
+
from sdgym.result_explorer import ResultsExplorer
|
|
20
22
|
|
|
21
23
|
# Clear the logging wrongfully configured by tensorflow/absl
|
|
22
24
|
list(map(logging.root.removeHandler, logging.root.handlers))
|
|
23
25
|
list(map(logging.root.removeFilter, logging.root.filters))
|
|
24
26
|
|
|
25
27
|
__all__ = [
|
|
26
|
-
'
|
|
27
|
-
'
|
|
28
|
-
'make_summary_spreadsheet',
|
|
28
|
+
'DatasetExplorer',
|
|
29
|
+
'ResultsExplorer',
|
|
29
30
|
'benchmark_single_table',
|
|
30
|
-
'
|
|
31
|
+
'collect_results',
|
|
31
32
|
'create_sdv_synthesizer_variant',
|
|
32
33
|
'create_single_table_synthesizer',
|
|
34
|
+
'get_available_datasets',
|
|
35
|
+
'load_dataset',
|
|
36
|
+
'make_summary_spreadsheet',
|
|
33
37
|
]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Utility functions for handling datasets."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from zipfile import ZipFile
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
LOGGER = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _parse_numeric_value(value, dataset_name, field_name, target_type=float):
    """Cast ``value`` to ``target_type``, falling back to NaN when the cast fails.

    Args:
        value (object):
            The raw value to convert.
        dataset_name (str):
            Name of the dataset the value belongs to. Only used in the log message.
        field_name (str):
            Name of the field being parsed. Only used in the log message.
        target_type (callable):
            Callable used to perform the conversion. Defaults to ``float``.

    Returns:
        The result of ``target_type(value)``, or ``np.nan`` if the conversion raised
        ``ValueError``, ``TypeError`` or ``OverflowError``.
    """
    try:
        return target_type(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised by casts such as ``int(float('inf'))``; it must take
        # the same NaN fallback instead of escaping to the caller.
        LOGGER.info(
            f"Could not cast {field_name} '{value}' to {target_type.__name__} for dataset "
            f"'{dataset_name}' defaulting to NaN."
        )
        return np.nan
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _get_dataset_subset(data, metadata_dict, modality):
    """Limit the size of a dataset for faster evaluation or testing.

    The dataset is reduced to at most 1000 randomly sampled rows and at most 10
    columns. For sequential datasets, the sequence index and sequence key columns
    declared in the metadata are always retained. Only the first table listed in the
    metadata is considered.

    Args:
        data (pd.DataFrame):
            The dataset to be reduced.
        metadata_dict (dict):
            A dictionary containing the dataset's metadata. When columns are dropped,
            the ``columns`` entry of its first table is replaced in place.
        modality (str):
            The dataset modality. Must be one of: ``'single_table'``, ``'sequential'``.

    Returns:
        tuple[pd.DataFrame, dict]:
            A tuple containing:
            - The reduced dataset as a DataFrame.
            - The updated metadata dictionary reflecting any removed columns.

    Raises:
        ValueError:
            If the provided modality is ``'multi_table'``.
    """
    if modality == 'multi_table':
        raise ValueError('limit_dataset_size is not supported for multi-table datasets.')

    max_rows, max_columns = (1000, 10)
    tables = metadata_dict.get('tables', {})
    mandatory_columns = []
    table_info = next(iter(tables.values()))

    columns = table_info.get('columns', {})
    keep_columns = list(columns)
    if modality == 'sequential':
        seq_index = table_info.get('sequence_index')
        seq_key = table_info.get('sequence_key')
        mandatory_columns = [col for col in (seq_index, seq_key) if col]

    optional_columns = [col for col in columns if col not in mandatory_columns]

    # If we have too many columns, drop extras but never mandatory ones
    if len(columns) > max_columns:
        keep_count = max_columns - len(mandatory_columns)
        keep_columns = mandatory_columns + optional_columns[:keep_count]
        kept = set(keep_columns)
        table_info['columns'] = {
            column_name: column_definition
            for column_name, column_definition in columns.items()
            if column_name in kept
        }

    data = data[keep_columns]
    # ``DataFrame.sample`` without replacement raises when asked for more rows than
    # exist, so cap the sample size at the number of available rows.
    data = data.sample(min(max_rows, len(data)))
    return data, metadata_dict
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _read_zipped_data(zip_file_path, modality):
    """Load the CSV members of a zip archive as DataFrames.

    Args:
        zip_file_path (str or Path):
            Location of the zip archive to read.
        modality (str):
            The dataset modality. ``'multi_table'`` returns every table; any other
            value returns a single table.

    Returns:
        dict[str, pd.DataFrame] or pd.DataFrame:
            For ``'multi_table'``, a dictionary mapping each CSV file stem to its
            DataFrame. Otherwise, the DataFrame of the first CSV found in the archive.
    """
    with ZipFile(zip_file_path, 'r') as archive:
        tables = {
            Path(member).stem: _read_csv_from_zip(archive, csv_file_name=member)
            for member in archive.namelist()
            if member.endswith('.csv')
        }

    if modality == 'multi_table':
        return tables

    # Single-table and sequential archives only need their first CSV.
    return next(iter(tables.values()))
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _read_csv_from_zip(zip_file, csv_file_name):
    """Load one CSV member of an already-open zip archive into a DataFrame.

    Args:
        zip_file (zipfile.ZipFile):
            The open archive that contains the CSV.
        csv_file_name (str):
            Name of the archive member to read.

    Returns:
        pd.DataFrame:
            The parsed contents of the CSV member.
    """
    with zip_file.open(csv_file_name) as member:
        frame = pd.read_csv(member, low_memory=False)

    return frame
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _read_metadata_json(metadata_path):
    """Load a metadata JSON file into a dictionary.

    Args:
        metadata_path (str or Path):
            Location of the JSON file to read.

    Returns:
        dict:
            The parsed contents of the file.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(metadata_path, encoding='utf-8') as metadata_file:
        metadata_dict = json.load(metadata_file)

    return metadata_dict
|