sdgym 0.14.3.dev0__tar.gz → 0.14.4.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {sdgym-0.14.3.dev0/sdgym.egg-info → sdgym-0.14.4.dev0}/PKG-INFO +4 -6
  2. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/pyproject.toml +4 -6
  3. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/__init__.py +1 -1
  4. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/script.py +15 -102
  5. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/utils.py +119 -8
  6. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/benchmark.py +190 -96
  7. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/dataset_explorer.py +16 -4
  8. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/datasets.py +99 -4
  9. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_explorer/result_explorer.py +1 -1
  10. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_explorer/result_handler.py +11 -1
  11. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/upload_benchmark_results.py +7 -7
  12. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0/sdgym.egg-info}/PKG-INFO +4 -6
  13. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/SOURCES.txt +0 -2
  14. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/requires.txt +3 -5
  15. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/tests/test_tasks.py +0 -2
  16. sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/benchmark_multi_table.yaml +0 -180
  17. sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/benchmark_single_table.yaml +0 -131
  18. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/LICENSE +0 -0
  19. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/README.md +0 -0
  20. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark/__init__.py +0 -0
  21. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark/benchmark.py +0 -0
  22. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark/credentials_utils.py +0 -0
  23. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/__init__.py +0 -0
  24. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/_instance_manager.py +0 -0
  25. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/_storage_manager.py +0 -0
  26. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/_validation.py +0 -0
  27. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/benchmark_base.yaml +0 -0
  28. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/benchmark_config.py +0 -0
  29. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/benchmark_launcher.py +0 -0
  30. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_dataset_utils.py +0 -0
  31. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/__init__.py +0 -0
  32. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/__main__.py +0 -0
  33. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/collect.py +0 -0
  34. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/summary.py +0 -0
  35. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/cli/utils.py +0 -0
  36. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/errors.py +0 -0
  37. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/metrics.py +0 -0
  38. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/progress.py +0 -0
  39. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_explorer/__init__.py +0 -0
  40. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/result_writer.py +0 -0
  41. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/__init__.py +0 -0
  42. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/run_benchmark.py +0 -0
  43. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/run_benchmark/utils.py +0 -0
  44. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/s3.py +0 -0
  45. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizer_descriptions.yaml +0 -0
  46. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/__init__.py +0 -0
  47. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/base.py +0 -0
  48. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/column.py +0 -0
  49. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/generate.py +0 -0
  50. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/identity.py +0 -0
  51. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/realtabformer.py +0 -0
  52. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/sdv.py +0 -0
  53. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/uniform.py +0 -0
  54. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/synthesizers/utils.py +0 -0
  55. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/utils.py +0 -0
  56. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  57. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  58. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym.egg-info/top_level.txt +0 -0
  59. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/setup.cfg +0 -0
  60. {sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/tests/test_scripts.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.14.3.dev0
3
+ Version: 0.14.4.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
@@ -44,7 +44,7 @@ Requires-Dist: pandas<3,>=1.5.0; python_version >= "3.11" and python_version < "
44
44
  Requires-Dist: pandas<3,>=2.1.1; python_version >= "3.12" and python_version < "3.13"
45
45
  Requires-Dist: pandas<3,>=2.2.3; python_version >= "3.13" and python_version < "3.14"
46
46
  Requires-Dist: pandas<3,>=2.3.3; python_version >= "3.14"
47
- Requires-Dist: psutil>=5.8
47
+ Requires-Dist: psutil>=7.0.0
48
48
  Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
49
49
  Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
50
50
  Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
@@ -66,10 +66,8 @@ Requires-Dist: tqdm>=4.66.3
66
66
  Requires-Dist: XlsxWriter>=1.2.8
67
67
  Requires-Dist: rdt>=1.18.2; python_version < "3.14"
68
68
  Requires-Dist: rdt>=1.20.0; python_version >= "3.14"
69
- Requires-Dist: sdmetrics>=0.21.0; python_version < "3.14"
70
- Requires-Dist: sdmetrics>=0.26.0; python_version >= "3.14"
71
- Requires-Dist: sdv>=1.21.0; python_version < "3.14"
72
- Requires-Dist: sdv>=1.33.0; python_version >= "3.14"
69
+ Requires-Dist: sdmetrics>=0.28.0
70
+ Requires-Dist: sdv>=1.37.0
73
71
  Provides-Extra: dask
74
72
  Requires-Dist: dask; extra == "dask"
75
73
  Requires-Dist: distributed; extra == "dask"
@@ -42,7 +42,7 @@ dependencies = [
42
42
  "pandas>=2.1.1,<3;python_version>='3.12' and python_version<'3.13'",
43
43
  "pandas>=2.2.3,<3;python_version>='3.13' and python_version<'3.14'",
44
44
  "pandas>=2.3.3,<3;python_version>='3.14'",
45
- 'psutil>=5.8',
45
+ 'psutil>=7.0.0',
46
46
  "scikit-learn>=1.0.2;python_version<'3.10'",
47
47
  "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
48
48
  "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
@@ -64,10 +64,8 @@ dependencies = [
64
64
  'XlsxWriter>=1.2.8',
65
65
  "rdt>=1.18.2;python_version<'3.14'",
66
66
  "rdt>=1.20.0;python_version>='3.14'",
67
- "sdmetrics>=0.21.0;python_version<'3.14'",
68
- "sdmetrics>=0.26.0;python_version>='3.14'",
69
- "sdv>=1.21.0;python_version<'3.14'",
70
- "sdv>=1.33.0;python_version>='3.14'",
67
+ "sdmetrics>=0.28.0",
68
+ "sdv>=1.37.0",
71
69
  ]
72
70
 
73
71
  [project.urls]
@@ -163,7 +161,7 @@ namespaces = false
163
161
  version = {attr = 'sdgym.__version__'}
164
162
 
165
163
  [tool.bumpversion]
166
- current_version = "0.14.3.dev0"
164
+ current_version = "0.14.4.dev0"
167
165
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
168
166
  serialize = [
169
167
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.14.3.dev0'
11
+ __version__ = '0.14.4.dev0'
12
12
 
13
13
  import logging
14
14
 
@@ -1,17 +1,16 @@
1
1
  import argparse
2
- import warnings
2
+ from itertools import product
3
3
 
4
4
  from sdgym._benchmark_launcher.benchmark_config import BenchmarkConfig
5
5
  from sdgym._benchmark_launcher.benchmark_launcher import BenchmarkLauncher
6
6
  from sdgym._benchmark_launcher.utils import (
7
7
  _deep_merge,
8
8
  _load_merged_modality_config,
9
+ _resolve_datasets,
9
10
  _resolve_modality_config,
10
11
  )
11
12
  from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS
12
13
 
13
- DEFAULT_NUM_INSTANCES = 1
14
-
15
14
 
16
15
  def _parse_args():
17
16
  """Parse CLI arguments for launching a benchmark."""
@@ -45,12 +44,6 @@ def _parse_args():
45
44
  'benchmark for the given modality.'
46
45
  ),
47
46
  )
48
- parser.add_argument(
49
- '--num-instances',
50
- type=int,
51
- default=None,
52
- help='Number of benchmark instances to create. Defaults to 1.',
53
- )
54
47
  parser.add_argument(
55
48
  '--timeout',
56
49
  type=int,
@@ -77,7 +70,6 @@ def _validate_args(args):
77
70
  args.synthesizers,
78
71
  args.output_destination,
79
72
  args.timeout,
80
- args.num_instances,
81
73
  )
82
74
  ):
83
75
  raise ValueError(
@@ -94,9 +86,6 @@ def _validate_args(args):
94
86
  "'--output-destination' is required when '--config-filepath' is not provided."
95
87
  )
96
88
 
97
- if args.num_instances is not None and args.num_instances < 1:
98
- raise ValueError("'--num-instances' must be greater than or equal to 1.")
99
-
100
89
  if args.output_destination == OUTPUT_DESTINATION_AWS:
101
90
  raise ValueError(
102
91
  f"'--output-destination' cannot be {OUTPUT_DESTINATION_AWS!r} that is reserved "
@@ -104,97 +93,25 @@ def _validate_args(args):
104
93
  )
105
94
 
106
95
 
107
- def _split_list(values):
108
- """Split a list into two non-empty parts, as evenly as possible."""
109
- midpoint = len(values) // 2
110
- return values[:midpoint], values[midpoint:]
111
-
112
-
113
- def _instance_job_size(instance_job):
114
- """Return the number of synthesizer and dataset combinations."""
115
- return len(instance_job['synthesizers']) * len(instance_job['datasets'])
116
-
117
-
118
- def _split_instance_jobs(instance_job):
119
- """Split an instance job into two smaller instance jobs.
120
-
121
- Prefer splitting synthesizers. If there is only one synthesizer,
122
- split datasets instead.
123
- """
124
- synthesizers = instance_job['synthesizers']
125
- datasets = instance_job['datasets']
126
- if len(synthesizers) > 1:
127
- left_synthesizers, right_synthesizers = _split_list(synthesizers)
128
- return [
129
- {
130
- 'synthesizers': left_synthesizers,
131
- 'datasets': datasets,
132
- 'output_destination': instance_job['output_destination'],
133
- },
134
- {
135
- 'synthesizers': right_synthesizers,
136
- 'datasets': datasets,
137
- 'output_destination': instance_job['output_destination'],
138
- },
139
- ]
140
-
141
- if len(datasets) > 1:
142
- left_datasets, right_datasets = _split_list(datasets)
143
- return [
144
- {
145
- 'synthesizers': synthesizers,
146
- 'datasets': left_datasets,
147
- 'output_destination': instance_job['output_destination'],
148
- },
149
- {
150
- 'synthesizers': synthesizers,
151
- 'datasets': right_datasets,
152
- 'output_destination': instance_job['output_destination'],
153
- },
154
- ]
155
-
156
- raise ValueError('Cannot split the instance job any further.')
157
-
158
-
159
- def _build_instance_artifacts(datasets, synthesizers, num_instances, output_destination):
160
- """Build exactly ``num_instances`` instance jobs."""
161
- max_jobs = len(synthesizers) * len(datasets)
162
- if num_instances > max_jobs:
163
- num_instances = max_jobs
164
- warnings.warn(
165
- f'num_instances is too high for the number of synthesizers and datasets. '
166
- f'Maximum number of instances is {max_jobs}. Setting num_instances to {max_jobs}.'
167
- )
168
-
169
- instance_jobs = [
96
+ def _build_instance_jobs(datasets, synthesizers, output_destination):
97
+ """Build one instance job per dataset and synthesizer pair."""
98
+ return [
170
99
  {
171
- 'synthesizers': list(synthesizers),
172
- 'datasets': list(datasets),
100
+ 'synthesizers': [synthesizer],
101
+ 'datasets': [dataset],
173
102
  'output_destination': output_destination,
174
103
  }
104
+ for dataset, synthesizer in product(datasets, synthesizers)
175
105
  ]
176
- while len(instance_jobs) < num_instances:
177
- split_index = None
178
- split_size = -1
179
- for index, instance_job in enumerate(instance_jobs):
180
- if (_instance_job_size(instance_job) > 1) and (
181
- _instance_job_size(instance_job) > split_size
182
- ):
183
- split_index = index
184
- split_size = _instance_job_size(instance_job)
185
-
186
- instance_job = instance_jobs.pop(split_index)
187
- instance_jobs.extend(_split_instance_jobs(instance_job))
188
-
189
- return instance_jobs
190
106
 
191
107
 
192
108
  def _get_default_datasets_and_synthesizers(modality):
193
109
  """Get the default datasets and synthesizers for a modality config."""
194
110
  base_dict = _load_merged_modality_config(modality)
195
- datasets = base_dict.get(f'datasets_{modality}', [])
111
+ datasets = []
196
112
  synthesizers = []
197
113
  for instance_job in base_dict.get('instance_jobs', []):
114
+ datasets.extend(_resolve_datasets(instance_job.get('datasets', [])))
198
115
  synthesizers.extend(instance_job.get('synthesizers', []))
199
116
 
200
117
  return sorted(set(datasets)), sorted(set(synthesizers))
@@ -208,8 +125,7 @@ def build_dict_from_args(args):
208
125
 
209
126
  datasets = args.datasets
210
127
  synthesizers = args.synthesizers
211
- num_instances = args.num_instances
212
- if all(value is None for value in (datasets, synthesizers, num_instances)):
128
+ if all(value is None for value in (datasets, synthesizers)):
213
129
  config = _resolve_modality_config(args.modality)
214
130
  config['method_params'] = method_params
215
131
  for config_instance_job in config.get('instance_jobs', []):
@@ -220,13 +136,11 @@ def build_dict_from_args(args):
220
136
  default_datasets, default_synthesizers = _get_default_datasets_and_synthesizers(args.modality)
221
137
  datasets = datasets if datasets is not None else default_datasets
222
138
  synthesizers = synthesizers if synthesizers is not None else default_synthesizers
223
- num_instances = num_instances if num_instances is not None else DEFAULT_NUM_INSTANCES
224
139
  return {
225
140
  'method_params': method_params,
226
- 'instance_jobs': _build_instance_artifacts(
141
+ 'instance_jobs': _build_instance_jobs(
227
142
  datasets=datasets,
228
143
  synthesizers=synthesizers,
229
- num_instances=num_instances,
230
144
  output_destination=args.output_destination,
231
145
  ),
232
146
  }
@@ -257,12 +171,11 @@ def launch_from_args():
257
171
 
258
172
  When building the configuration from command-line arguments:
259
173
 
260
- - If ``--datasets``, ``--synthesizers``, and ``--num-instances`` are all
261
- omitted, the default monthly benchmark configuration for the selected
262
- modality is used.
263
- - If ``--num-instances`` is omitted, it defaults to ``1``.
174
+ - If ``--datasets`` and ``--synthesizers`` are both omitted, the default
175
+ monthly benchmark configuration for the selected modality is used.
264
176
  - If ``--datasets`` or ``--synthesizers`` is omitted, the corresponding
265
177
  default values from the monthly benchmark configuration are used.
178
+ - One instance job is created for each dataset and synthesizer pair.
266
179
 
267
180
  Once the configuration is resolved, the benchmark is launched.
268
181
  """
@@ -5,6 +5,7 @@ import os
5
5
  import uuid
6
6
  from datetime import datetime
7
7
  from importlib.resources import files
8
+ from itertools import product
8
9
  from urllib.parse import quote_plus
9
10
 
10
11
  import yaml
@@ -13,13 +14,111 @@ from sdgym._benchmark.benchmark import (
13
14
  _benchmark_multi_table_compute_gcp,
14
15
  _benchmark_single_table_compute_gcp,
15
16
  )
16
- from sdgym.run_benchmark.utils import get_s3_console_link
17
+ from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, get_s3_console_link
17
18
  from sdgym.s3 import parse_s3_path
18
19
 
19
20
  _YAML_PKG = 'sdgym._benchmark_launcher'
20
- MODALITY_TO_CONFIG_FILE = {
21
- 'single_table': 'benchmark_single_table.yaml',
22
- 'multi_table': 'benchmark_multi_table.yaml',
21
+ MODALITY_TO_JOB_SETUP = {
22
+ 'single_table': {
23
+ 'output_destination': OUTPUT_DESTINATION_AWS,
24
+ 'datasets': [
25
+ 'adult',
26
+ 'alarm',
27
+ 'census',
28
+ 'child',
29
+ 'covtype',
30
+ 'expedia_hotel_logs',
31
+ 'insurance',
32
+ 'intrusion',
33
+ 'news',
34
+ ],
35
+ 'synthesizers': [
36
+ 'ColumnSynthesizer',
37
+ 'GaussianCopulaSynthesizer',
38
+ 'CTGANSynthesizer',
39
+ 'CopulaGANSynthesizer',
40
+ 'TVAESynthesizer',
41
+ 'SegmentSynthesizer',
42
+ 'XGCSynthesizer',
43
+ 'BootstrapSynthesizer',
44
+ 'RealTabFormerSynthesizer',
45
+ ],
46
+ },
47
+ 'multi_table': {
48
+ 'output_destination': OUTPUT_DESTINATION_AWS,
49
+ 'datasets': [
50
+ 'rel-amazon',
51
+ 'rel-arxiv',
52
+ 'rel-avito',
53
+ 'rel-event',
54
+ 'rel-f1',
55
+ 'rel-hm',
56
+ 'rel-ratebeer',
57
+ 'rel-salt',
58
+ 'rel-stack',
59
+ 'rel-trial',
60
+ 'instacart_marketbasket_ml',
61
+ 'MovieLens',
62
+ 'rossmann',
63
+ 'Telstra',
64
+ 'walmart',
65
+ 'WebKP',
66
+ 'DCG',
67
+ 'UW_std',
68
+ 'Same_gen',
69
+ 'CORA',
70
+ 'got_families',
71
+ 'SalesDB',
72
+ 'UTube',
73
+ 'Student_loan',
74
+ 'Hepatitis_std',
75
+ 'Elti',
76
+ 'Bupa',
77
+ 'Toxicology',
78
+ 'imdb_ijs',
79
+ 'ftp',
80
+ 'imdb_small',
81
+ 'imdb_MovieLens',
82
+ 'Pima',
83
+ 'university',
84
+ 'legalActs',
85
+ 'Dunur',
86
+ 'Mesh',
87
+ 'world',
88
+ 'airbnb-simplified',
89
+ 'trains',
90
+ 'FNHK',
91
+ 'fake_hotels',
92
+ 'SAT',
93
+ 'genes',
94
+ 'Biodegradability',
95
+ 'Pyrimidine',
96
+ 'mutagenesis',
97
+ 'restbase',
98
+ 'Triazine',
99
+ 'Carcinogenesis',
100
+ 'fake_hotels_extended',
101
+ 'Mooney_Family',
102
+ 'PTE',
103
+ 'Facebook',
104
+ 'multi_table_ID_demo_dataset',
105
+ 'SAP',
106
+ 'Chess',
107
+ 'Countries',
108
+ 'NCAA',
109
+ 'Atherosclerosis',
110
+ 'nations',
111
+ 'TubePricing',
112
+ 'financial',
113
+ 'Accidents',
114
+ 'MuskSmall',
115
+ 'NBA',
116
+ 'AustralianFootball',
117
+ 'PremierLeague',
118
+ 'OMOP_CDM_dayz',
119
+ ],
120
+ 'synthesizers': ['HMASynthesizer', 'HSASynthesizer', 'IndependentSynthesizer'],
121
+ },
23
122
  }
24
123
  CONFIG_KEYS = {
25
124
  'modality',
@@ -84,10 +183,24 @@ def resolve_compute(compute):
84
183
  raise ValueError(f"compute.service must be one of: 'gcp'. Found: {service}")
85
184
 
86
185
 
186
+ def _get_modality_config(modality):
187
+ """Get the launchable benchmark config for a modality."""
188
+ result = []
189
+ job_setup = MODALITY_TO_JOB_SETUP.get(modality)
190
+ for dataset, synthesizer in product(job_setup['datasets'], job_setup['synthesizers']):
191
+ result.append({
192
+ 'datasets': [dataset],
193
+ 'synthesizers': [synthesizer],
194
+ 'output_destination': job_setup['output_destination'],
195
+ })
196
+
197
+ return {'modality': modality, 'instance_jobs': result}
198
+
199
+
87
200
  def _load_merged_modality_config(modality):
88
201
  """Load and merge the base and modality-specific benchmark configs."""
89
202
  base_config = _load_yaml_resource('benchmark_base.yaml')
90
- modality_config = _load_yaml_resource(MODALITY_TO_CONFIG_FILE[modality])
203
+ modality_config = _get_modality_config(modality)
91
204
  return _deep_merge(base_config, modality_config)
92
205
 
93
206
 
@@ -285,9 +398,7 @@ def _build_instance_artifact_filepaths(
285
398
  return (
286
399
  _build_s3_uri(output_destination, f'{artifact_key_prefix}/{metainfo_name}.yaml'),
287
400
  _build_s3_uri(output_destination, f'{artifact_key_prefix}/{results_name}.csv'),
288
- _build_s3_uri(
289
- output_destination, f'{modality_prefix}/job_args_list_{metainfo_name}.pkl.gz'
290
- ),
401
+ _build_s3_uri(output_destination, f'{modality_prefix}/job_args_list_{metainfo_name}.pkl'),
291
402
  )
292
403
 
293
404