sdgym 0.14.3.dev0__tar.gz → 0.14.4.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdgym-0.14.3.dev0/sdgym.egg-info → sdgym-0.14.4.dev0}/PKG-INFO +4 -6
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/pyproject.toml +4 -6
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/__init__.py +1 -1
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/script.py +15 -102
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/utils.py +119 -8
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/benchmark.py +190 -96
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/dataset_explorer.py +16 -4
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/datasets.py +99 -4
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_explorer/result_explorer.py +1 -1
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_explorer/result_handler.py +11 -1
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/upload_benchmark_results.py +7 -7
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0/sdgym.egg-info}/PKG-INFO +4 -6
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/SOURCES.txt +0 -2
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/requires.txt +3 -5
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/tests/test_tasks.py +0 -2
- sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/benchmark_multi_table.yaml +0 -180
- sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/benchmark_single_table.yaml +0 -131
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/LICENSE +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/README.md +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark/__init__.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark/benchmark.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark/credentials_utils.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/__init__.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/_instance_manager.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/_storage_manager.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/_validation.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/benchmark_base.yaml +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/benchmark_config.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/benchmark_launcher.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_dataset_utils.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/__init__.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/__main__.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/collect.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/summary.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/utils.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/errors.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/metrics.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/progress.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_explorer/__init__.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_writer.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/__init__.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/run_benchmark.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/utils.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/s3.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizer_descriptions.yaml +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/__init__.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/base.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/column.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/generate.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/identity.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/realtabformer.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/sdv.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/uniform.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/utils.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/utils.py +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/entry_points.txt +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/top_level.txt +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/setup.cfg +0 -0
- {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/tests/test_scripts.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sdgym
|
|
3
|
-
Version: 0.14.3.dev0
|
|
3
|
+
Version: 0.14.4.dev0
|
|
4
4
|
Summary: Benchmark tabular synthetic data generators using a variety of datasets
|
|
5
5
|
Author-email: "DataCebo, Inc." <info@sdv.dev>
|
|
6
6
|
License-Expression: BUSL-1.1
|
|
@@ -44,7 +44,7 @@ Requires-Dist: pandas<3,>=1.5.0; python_version >= "3.11" and python_version < "
|
|
|
44
44
|
Requires-Dist: pandas<3,>=2.1.1; python_version >= "3.12" and python_version < "3.13"
|
|
45
45
|
Requires-Dist: pandas<3,>=2.2.3; python_version >= "3.13" and python_version < "3.14"
|
|
46
46
|
Requires-Dist: pandas<3,>=2.3.3; python_version >= "3.14"
|
|
47
|
-
Requires-Dist: psutil>=
|
|
47
|
+
Requires-Dist: psutil>=7.0.0
|
|
48
48
|
Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
|
|
49
49
|
Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
|
|
50
50
|
Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
|
|
@@ -66,10 +66,8 @@ Requires-Dist: tqdm>=4.66.3
|
|
|
66
66
|
Requires-Dist: XlsxWriter>=1.2.8
|
|
67
67
|
Requires-Dist: rdt>=1.18.2; python_version < "3.14"
|
|
68
68
|
Requires-Dist: rdt>=1.20.0; python_version >= "3.14"
|
|
69
|
-
Requires-Dist: sdmetrics>=0.
|
|
70
|
-
Requires-Dist:
|
|
71
|
-
Requires-Dist: sdv>=1.21.0; python_version < "3.14"
|
|
72
|
-
Requires-Dist: sdv>=1.33.0; python_version >= "3.14"
|
|
69
|
+
Requires-Dist: sdmetrics>=0.28.0
|
|
70
|
+
Requires-Dist: sdv>=1.37.0
|
|
73
71
|
Provides-Extra: dask
|
|
74
72
|
Requires-Dist: dask; extra == "dask"
|
|
75
73
|
Requires-Dist: distributed; extra == "dask"
|
|
@@ -42,7 +42,7 @@ dependencies = [
|
|
|
42
42
|
"pandas>=2.1.1,<3;python_version>='3.12' and python_version<'3.13'",
|
|
43
43
|
"pandas>=2.2.3,<3;python_version>='3.13' and python_version<'3.14'",
|
|
44
44
|
"pandas>=2.3.3,<3;python_version>='3.14'",
|
|
45
|
-
'psutil>=
|
|
45
|
+
'psutil>=7.0.0',
|
|
46
46
|
"scikit-learn>=1.0.2;python_version<'3.10'",
|
|
47
47
|
"scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
|
|
48
48
|
"scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
|
|
@@ -64,10 +64,8 @@ dependencies = [
|
|
|
64
64
|
'XlsxWriter>=1.2.8',
|
|
65
65
|
"rdt>=1.18.2;python_version<'3.14'",
|
|
66
66
|
"rdt>=1.20.0;python_version>='3.14'",
|
|
67
|
-
"sdmetrics>=0.
|
|
68
|
-
"
|
|
69
|
-
"sdv>=1.21.0;python_version<'3.14'",
|
|
70
|
-
"sdv>=1.33.0;python_version>='3.14'",
|
|
67
|
+
"sdmetrics>=0.28.0",
|
|
68
|
+
"sdv>=1.37.0",
|
|
71
69
|
]
|
|
72
70
|
|
|
73
71
|
[project.urls]
|
|
@@ -163,7 +161,7 @@ namespaces = false
|
|
|
163
161
|
version = {attr = 'sdgym.__version__'}
|
|
164
162
|
|
|
165
163
|
[tool.bumpversion]
|
|
166
|
-
current_version = "0.14.3.dev0"
|
|
164
|
+
current_version = "0.14.4.dev0"
|
|
167
165
|
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
|
|
168
166
|
serialize = [
|
|
169
167
|
'{major}.{minor}.{patch}.{release}{candidate}',
|
|
@@ -1,17 +1,16 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
import warnings
|
|
2
|
+
from itertools import product
|
|
3
3
|
|
|
4
4
|
from sdgym._benchmark_launcher.benchmark_config import BenchmarkConfig
|
|
5
5
|
from sdgym._benchmark_launcher.benchmark_launcher import BenchmarkLauncher
|
|
6
6
|
from sdgym._benchmark_launcher.utils import (
|
|
7
7
|
_deep_merge,
|
|
8
8
|
_load_merged_modality_config,
|
|
9
|
+
_resolve_datasets,
|
|
9
10
|
_resolve_modality_config,
|
|
10
11
|
)
|
|
11
12
|
from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS
|
|
12
13
|
|
|
13
|
-
DEFAULT_NUM_INSTANCES = 1
|
|
14
|
-
|
|
15
14
|
|
|
16
15
|
def _parse_args():
|
|
17
16
|
"""Parse CLI arguments for launching a benchmark."""
|
|
@@ -45,12 +44,6 @@ def _parse_args():
|
|
|
45
44
|
'benchmark for the given modality.'
|
|
46
45
|
),
|
|
47
46
|
)
|
|
48
|
-
parser.add_argument(
|
|
49
|
-
'--num-instances',
|
|
50
|
-
type=int,
|
|
51
|
-
default=None,
|
|
52
|
-
help='Number of benchmark instances to create. Defaults to 1.',
|
|
53
|
-
)
|
|
54
47
|
parser.add_argument(
|
|
55
48
|
'--timeout',
|
|
56
49
|
type=int,
|
|
@@ -77,7 +70,6 @@ def _validate_args(args):
|
|
|
77
70
|
args.synthesizers,
|
|
78
71
|
args.output_destination,
|
|
79
72
|
args.timeout,
|
|
80
|
-
args.num_instances,
|
|
81
73
|
)
|
|
82
74
|
):
|
|
83
75
|
raise ValueError(
|
|
@@ -94,9 +86,6 @@ def _validate_args(args):
|
|
|
94
86
|
"'--output-destination' is required when '--config-filepath' is not provided."
|
|
95
87
|
)
|
|
96
88
|
|
|
97
|
-
if args.num_instances is not None and args.num_instances < 1:
|
|
98
|
-
raise ValueError("'--num-instances' must be greater than or equal to 1.")
|
|
99
|
-
|
|
100
89
|
if args.output_destination == OUTPUT_DESTINATION_AWS:
|
|
101
90
|
raise ValueError(
|
|
102
91
|
f"'--output-destination' cannot be {OUTPUT_DESTINATION_AWS!r} that is reserved "
|
|
@@ -104,97 +93,25 @@ def _validate_args(args):
|
|
|
104
93
|
)
|
|
105
94
|
|
|
106
95
|
|
|
107
|
-
def _split_list(values):
|
|
108
|
-
"""
|
|
109
|
-
|
|
110
|
-
return values[:midpoint], values[midpoint:]
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def _instance_job_size(instance_job):
|
|
114
|
-
"""Return the number of synthesizer and dataset combinations."""
|
|
115
|
-
return len(instance_job['synthesizers']) * len(instance_job['datasets'])
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def _split_instance_jobs(instance_job):
|
|
119
|
-
"""Split an instance job into two smaller instance jobs.
|
|
120
|
-
|
|
121
|
-
Prefer splitting synthesizers. If there is only one synthesizer,
|
|
122
|
-
split datasets instead.
|
|
123
|
-
"""
|
|
124
|
-
synthesizers = instance_job['synthesizers']
|
|
125
|
-
datasets = instance_job['datasets']
|
|
126
|
-
if len(synthesizers) > 1:
|
|
127
|
-
left_synthesizers, right_synthesizers = _split_list(synthesizers)
|
|
128
|
-
return [
|
|
129
|
-
{
|
|
130
|
-
'synthesizers': left_synthesizers,
|
|
131
|
-
'datasets': datasets,
|
|
132
|
-
'output_destination': instance_job['output_destination'],
|
|
133
|
-
},
|
|
134
|
-
{
|
|
135
|
-
'synthesizers': right_synthesizers,
|
|
136
|
-
'datasets': datasets,
|
|
137
|
-
'output_destination': instance_job['output_destination'],
|
|
138
|
-
},
|
|
139
|
-
]
|
|
140
|
-
|
|
141
|
-
if len(datasets) > 1:
|
|
142
|
-
left_datasets, right_datasets = _split_list(datasets)
|
|
143
|
-
return [
|
|
144
|
-
{
|
|
145
|
-
'synthesizers': synthesizers,
|
|
146
|
-
'datasets': left_datasets,
|
|
147
|
-
'output_destination': instance_job['output_destination'],
|
|
148
|
-
},
|
|
149
|
-
{
|
|
150
|
-
'synthesizers': synthesizers,
|
|
151
|
-
'datasets': right_datasets,
|
|
152
|
-
'output_destination': instance_job['output_destination'],
|
|
153
|
-
},
|
|
154
|
-
]
|
|
155
|
-
|
|
156
|
-
raise ValueError('Cannot split the instance job any further.')
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def _build_instance_artifacts(datasets, synthesizers, num_instances, output_destination):
|
|
160
|
-
"""Build exactly ``num_instances`` instance jobs."""
|
|
161
|
-
max_jobs = len(synthesizers) * len(datasets)
|
|
162
|
-
if num_instances > max_jobs:
|
|
163
|
-
num_instances = max_jobs
|
|
164
|
-
warnings.warn(
|
|
165
|
-
f'num_instances is too high for the number of synthesizers and datasets. '
|
|
166
|
-
f'Maximum number of instances is {max_jobs}. Setting num_instances to {max_jobs}.'
|
|
167
|
-
)
|
|
168
|
-
|
|
169
|
-
instance_jobs = [
|
|
96
|
+
def _build_instance_jobs(datasets, synthesizers, output_destination):
|
|
97
|
+
"""Build one instance job per dataset and synthesizer pair."""
|
|
98
|
+
return [
|
|
170
99
|
{
|
|
171
|
-
'synthesizers':
|
|
172
|
-
'datasets':
|
|
100
|
+
'synthesizers': [synthesizer],
|
|
101
|
+
'datasets': [dataset],
|
|
173
102
|
'output_destination': output_destination,
|
|
174
103
|
}
|
|
104
|
+
for dataset, synthesizer in product(datasets, synthesizers)
|
|
175
105
|
]
|
|
176
|
-
while len(instance_jobs) < num_instances:
|
|
177
|
-
split_index = None
|
|
178
|
-
split_size = -1
|
|
179
|
-
for index, instance_job in enumerate(instance_jobs):
|
|
180
|
-
if (_instance_job_size(instance_job) > 1) and (
|
|
181
|
-
_instance_job_size(instance_job) > split_size
|
|
182
|
-
):
|
|
183
|
-
split_index = index
|
|
184
|
-
split_size = _instance_job_size(instance_job)
|
|
185
|
-
|
|
186
|
-
instance_job = instance_jobs.pop(split_index)
|
|
187
|
-
instance_jobs.extend(_split_instance_jobs(instance_job))
|
|
188
|
-
|
|
189
|
-
return instance_jobs
|
|
190
106
|
|
|
191
107
|
|
|
192
108
|
def _get_default_datasets_and_synthesizers(modality):
|
|
193
109
|
"""Get the default datasets and synthesizers for a modality config."""
|
|
194
110
|
base_dict = _load_merged_modality_config(modality)
|
|
195
|
-
datasets =
|
|
111
|
+
datasets = []
|
|
196
112
|
synthesizers = []
|
|
197
113
|
for instance_job in base_dict.get('instance_jobs', []):
|
|
114
|
+
datasets.extend(_resolve_datasets(instance_job.get('datasets', [])))
|
|
198
115
|
synthesizers.extend(instance_job.get('synthesizers', []))
|
|
199
116
|
|
|
200
117
|
return sorted(set(datasets)), sorted(set(synthesizers))
|
|
@@ -208,8 +125,7 @@ def build_dict_from_args(args):
|
|
|
208
125
|
|
|
209
126
|
datasets = args.datasets
|
|
210
127
|
synthesizers = args.synthesizers
|
|
211
|
-
|
|
212
|
-
if all(value is None for value in (datasets, synthesizers, num_instances)):
|
|
128
|
+
if all(value is None for value in (datasets, synthesizers)):
|
|
213
129
|
config = _resolve_modality_config(args.modality)
|
|
214
130
|
config['method_params'] = method_params
|
|
215
131
|
for config_instance_job in config.get('instance_jobs', []):
|
|
@@ -220,13 +136,11 @@ def build_dict_from_args(args):
|
|
|
220
136
|
default_datasets, default_synthesizers = _get_default_datasets_and_synthesizers(args.modality)
|
|
221
137
|
datasets = datasets if datasets is not None else default_datasets
|
|
222
138
|
synthesizers = synthesizers if synthesizers is not None else default_synthesizers
|
|
223
|
-
num_instances = num_instances if num_instances is not None else DEFAULT_NUM_INSTANCES
|
|
224
139
|
return {
|
|
225
140
|
'method_params': method_params,
|
|
226
|
-
'instance_jobs': _build_instance_artifacts(
|
|
141
|
+
'instance_jobs': _build_instance_jobs(
|
|
227
142
|
datasets=datasets,
|
|
228
143
|
synthesizers=synthesizers,
|
|
229
|
-
num_instances=num_instances,
|
|
230
144
|
output_destination=args.output_destination,
|
|
231
145
|
),
|
|
232
146
|
}
|
|
@@ -257,12 +171,11 @@ def launch_from_args():
|
|
|
257
171
|
|
|
258
172
|
When building the configuration from command-line arguments:
|
|
259
173
|
|
|
260
|
-
- If ``--datasets
|
|
261
|
-
|
|
262
|
-
modality is used.
|
|
263
|
-
- If ``--num-instances`` is omitted, it defaults to ``1``.
|
|
174
|
+
- If ``--datasets`` and ``--synthesizers`` are both omitted, the default
|
|
175
|
+
monthly benchmark configuration for the selected modality is used.
|
|
264
176
|
- If ``--datasets`` or ``--synthesizers`` is omitted, the corresponding
|
|
265
177
|
default values from the monthly benchmark configuration are used.
|
|
178
|
+
- One instance job is created for each dataset and synthesizer pair.
|
|
266
179
|
|
|
267
180
|
Once the configuration is resolved, the benchmark is launched.
|
|
268
181
|
"""
|
|
@@ -5,6 +5,7 @@ import os
|
|
|
5
5
|
import uuid
|
|
6
6
|
from datetime import datetime
|
|
7
7
|
from importlib.resources import files
|
|
8
|
+
from itertools import product
|
|
8
9
|
from urllib.parse import quote_plus
|
|
9
10
|
|
|
10
11
|
import yaml
|
|
@@ -13,13 +14,111 @@ from sdgym._benchmark.benchmark import (
|
|
|
13
14
|
_benchmark_multi_table_compute_gcp,
|
|
14
15
|
_benchmark_single_table_compute_gcp,
|
|
15
16
|
)
|
|
16
|
-
from sdgym.run_benchmark.utils import get_s3_console_link
|
|
17
|
+
from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, get_s3_console_link
|
|
17
18
|
from sdgym.s3 import parse_s3_path
|
|
18
19
|
|
|
19
20
|
_YAML_PKG = 'sdgym._benchmark_launcher'
|
|
20
|
-
|
|
21
|
-
'single_table':
|
|
22
|
-
|
|
21
|
+
MODALITY_TO_JOB_SETUP = {
|
|
22
|
+
'single_table': {
|
|
23
|
+
'output_destination': OUTPUT_DESTINATION_AWS,
|
|
24
|
+
'datasets': [
|
|
25
|
+
'adult',
|
|
26
|
+
'alarm',
|
|
27
|
+
'census',
|
|
28
|
+
'child',
|
|
29
|
+
'covtype',
|
|
30
|
+
'expedia_hotel_logs',
|
|
31
|
+
'insurance',
|
|
32
|
+
'intrusion',
|
|
33
|
+
'news',
|
|
34
|
+
],
|
|
35
|
+
'synthesizers': [
|
|
36
|
+
'ColumnSynthesizer',
|
|
37
|
+
'GaussianCopulaSynthesizer',
|
|
38
|
+
'CTGANSynthesizer',
|
|
39
|
+
'CopulaGANSynthesizer',
|
|
40
|
+
'TVAESynthesizer',
|
|
41
|
+
'SegmentSynthesizer',
|
|
42
|
+
'XGCSynthesizer',
|
|
43
|
+
'BootstrapSynthesizer',
|
|
44
|
+
'RealTabFormerSynthesizer',
|
|
45
|
+
],
|
|
46
|
+
},
|
|
47
|
+
'multi_table': {
|
|
48
|
+
'output_destination': OUTPUT_DESTINATION_AWS,
|
|
49
|
+
'datasets': [
|
|
50
|
+
'rel-amazon',
|
|
51
|
+
'rel-arxiv',
|
|
52
|
+
'rel-avito',
|
|
53
|
+
'rel-event',
|
|
54
|
+
'rel-f1',
|
|
55
|
+
'rel-hm',
|
|
56
|
+
'rel-ratebeer',
|
|
57
|
+
'rel-salt',
|
|
58
|
+
'rel-stack',
|
|
59
|
+
'rel-trial',
|
|
60
|
+
'instacart_marketbasket_ml',
|
|
61
|
+
'MovieLens',
|
|
62
|
+
'rossmann',
|
|
63
|
+
'Telstra',
|
|
64
|
+
'walmart',
|
|
65
|
+
'WebKP',
|
|
66
|
+
'DCG',
|
|
67
|
+
'UW_std',
|
|
68
|
+
'Same_gen',
|
|
69
|
+
'CORA',
|
|
70
|
+
'got_families',
|
|
71
|
+
'SalesDB',
|
|
72
|
+
'UTube',
|
|
73
|
+
'Student_loan',
|
|
74
|
+
'Hepatitis_std',
|
|
75
|
+
'Elti',
|
|
76
|
+
'Bupa',
|
|
77
|
+
'Toxicology',
|
|
78
|
+
'imdb_ijs',
|
|
79
|
+
'ftp',
|
|
80
|
+
'imdb_small',
|
|
81
|
+
'imdb_MovieLens',
|
|
82
|
+
'Pima',
|
|
83
|
+
'university',
|
|
84
|
+
'legalActs',
|
|
85
|
+
'Dunur',
|
|
86
|
+
'Mesh',
|
|
87
|
+
'world',
|
|
88
|
+
'airbnb-simplified',
|
|
89
|
+
'trains',
|
|
90
|
+
'FNHK',
|
|
91
|
+
'fake_hotels',
|
|
92
|
+
'SAT',
|
|
93
|
+
'genes',
|
|
94
|
+
'Biodegradability',
|
|
95
|
+
'Pyrimidine',
|
|
96
|
+
'mutagenesis',
|
|
97
|
+
'restbase',
|
|
98
|
+
'Triazine',
|
|
99
|
+
'Carcinogenesis',
|
|
100
|
+
'fake_hotels_extended',
|
|
101
|
+
'Mooney_Family',
|
|
102
|
+
'PTE',
|
|
103
|
+
'Facebook',
|
|
104
|
+
'multi_table_ID_demo_dataset',
|
|
105
|
+
'SAP',
|
|
106
|
+
'Chess',
|
|
107
|
+
'Countries',
|
|
108
|
+
'NCAA',
|
|
109
|
+
'Atherosclerosis',
|
|
110
|
+
'nations',
|
|
111
|
+
'TubePricing',
|
|
112
|
+
'financial',
|
|
113
|
+
'Accidents',
|
|
114
|
+
'MuskSmall',
|
|
115
|
+
'NBA',
|
|
116
|
+
'AustralianFootball',
|
|
117
|
+
'PremierLeague',
|
|
118
|
+
'OMOP_CDM_dayz',
|
|
119
|
+
],
|
|
120
|
+
'synthesizers': ['HMASynthesizer', 'HSASynthesizer', 'IndependentSynthesizer'],
|
|
121
|
+
},
|
|
23
122
|
}
|
|
24
123
|
CONFIG_KEYS = {
|
|
25
124
|
'modality',
|
|
@@ -84,10 +183,24 @@ def resolve_compute(compute):
|
|
|
84
183
|
raise ValueError(f"compute.service must be one of: 'gcp'. Found: {service}")
|
|
85
184
|
|
|
86
185
|
|
|
186
|
+
def _get_modality_config(modality):
|
|
187
|
+
"""Get the launchable benchmark config for a modality."""
|
|
188
|
+
result = []
|
|
189
|
+
job_setup = MODALITY_TO_JOB_SETUP.get(modality)
|
|
190
|
+
for dataset, synthesizer in product(job_setup['datasets'], job_setup['synthesizers']):
|
|
191
|
+
result.append({
|
|
192
|
+
'datasets': [dataset],
|
|
193
|
+
'synthesizers': [synthesizer],
|
|
194
|
+
'output_destination': job_setup['output_destination'],
|
|
195
|
+
})
|
|
196
|
+
|
|
197
|
+
return {'modality': modality, 'instance_jobs': result}
|
|
198
|
+
|
|
199
|
+
|
|
87
200
|
def _load_merged_modality_config(modality):
|
|
88
201
|
"""Load and merge the base and modality-specific benchmark configs."""
|
|
89
202
|
base_config = _load_yaml_resource('benchmark_base.yaml')
|
|
90
|
-
modality_config =
|
|
203
|
+
modality_config = _get_modality_config(modality)
|
|
91
204
|
return _deep_merge(base_config, modality_config)
|
|
92
205
|
|
|
93
206
|
|
|
@@ -285,9 +398,7 @@ def _build_instance_artifact_filepaths(
|
|
|
285
398
|
return (
|
|
286
399
|
_build_s3_uri(output_destination, f'{artifact_key_prefix}/{metainfo_name}.yaml'),
|
|
287
400
|
_build_s3_uri(output_destination, f'{artifact_key_prefix}/{results_name}.csv'),
|
|
288
|
-
_build_s3_uri(
|
|
289
|
-
output_destination, f'{modality_prefix}/job_args_list_{metainfo_name}.pkl.gz'
|
|
290
|
-
),
|
|
401
|
+
_build_s3_uri(output_destination, f'{modality_prefix}/job_args_list_{metainfo_name}.pkl'),
|
|
291
402
|
)
|
|
292
403
|
|
|
293
404
|
|