sdgym 0.14.2.dev0__tar.gz → 0.14.3.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {sdgym-0.14.2.dev0/sdgym.egg-info → sdgym-0.14.3.dev0}/PKG-INFO +1 -1
  2. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/pyproject.toml +1 -1
  3. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/__init__.py +1 -1
  4. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark/benchmark.py +10 -11
  5. sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/_storage_manager.py +175 -0
  6. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/_validation.py +25 -3
  7. sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/benchmark_base.yaml +16 -0
  8. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/benchmark_config.py +2 -0
  9. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/benchmark_launcher.py +134 -24
  10. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/script.py +2 -2
  11. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/utils.py +75 -5
  12. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/result_explorer/result_handler.py +9 -8
  13. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/run_benchmark/run_benchmark.py +3 -11
  14. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/run_benchmark/upload_benchmark_results.py +25 -8
  15. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/s3.py +8 -0
  16. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizer_descriptions.yaml +31 -0
  17. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0/sdgym.egg-info}/PKG-INFO +1 -1
  18. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym.egg-info/SOURCES.txt +0 -1
  19. sdgym-0.14.2.dev0/sdgym/_benchmark/config_utils.py +0 -123
  20. sdgym-0.14.2.dev0/sdgym/_benchmark_launcher/_storage_manager.py +0 -64
  21. sdgym-0.14.2.dev0/sdgym/_benchmark_launcher/benchmark_base.yaml +0 -9
  22. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/LICENSE +0 -0
  23. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/README.md +0 -0
  24. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark/__init__.py +0 -0
  25. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark/credentials_utils.py +0 -0
  26. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/__init__.py +0 -0
  27. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/_instance_manager.py +0 -0
  28. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/benchmark_multi_table.yaml +0 -0
  29. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/benchmark_single_table.yaml +0 -0
  30. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_dataset_utils.py +0 -0
  31. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/benchmark.py +0 -0
  32. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/cli/__init__.py +0 -0
  33. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/cli/__main__.py +0 -0
  34. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/cli/collect.py +0 -0
  35. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/cli/summary.py +0 -0
  36. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/cli/utils.py +0 -0
  37. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/dataset_explorer.py +0 -0
  38. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/datasets.py +0 -0
  39. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/errors.py +0 -0
  40. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/metrics.py +0 -0
  41. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/progress.py +0 -0
  42. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/result_explorer/__init__.py +0 -0
  43. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/result_explorer/result_explorer.py +0 -0
  44. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/result_writer.py +0 -0
  45. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/run_benchmark/__init__.py +0 -0
  46. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/run_benchmark/utils.py +0 -0
  47. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/__init__.py +0 -0
  48. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/base.py +0 -0
  49. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/column.py +0 -0
  50. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/generate.py +0 -0
  51. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/identity.py +0 -0
  52. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/realtabformer.py +0 -0
  53. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/sdv.py +0 -0
  54. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/uniform.py +0 -0
  55. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/synthesizers/utils.py +0 -0
  56. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/utils.py +0 -0
  57. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  58. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  59. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym.egg-info/requires.txt +0 -0
  60. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym.egg-info/top_level.txt +0 -0
  61. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/setup.cfg +0 -0
  62. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/tests/test_scripts.py +0 -0
  63. {sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/tests/test_tasks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.14.2.dev0
3
+ Version: 0.14.3.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
@@ -163,7 +163,7 @@ namespaces = false
163
163
  version = {attr = 'sdgym.__version__'}
164
164
 
165
165
  [tool.bumpversion]
166
- current_version = "0.14.2.dev0"
166
+ current_version = "0.14.3.dev0"
167
167
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
168
168
  serialize = [
169
169
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.14.2.dev0'
11
+ __version__ = '0.14.3.dev0'
12
12
 
13
13
  import logging
14
14
 
@@ -1,14 +1,11 @@
1
1
  import textwrap
2
+ import uuid
3
+ from datetime import datetime, timezone
2
4
  from urllib.parse import urlparse
3
5
 
4
6
  from google.cloud import compute_v1
5
7
  from google.oauth2 import service_account
6
8
 
7
- from sdgym._benchmark.config_utils import (
8
- _make_instance_name,
9
- resolve_compute_config,
10
- validate_compute_config,
11
- )
12
9
  from sdgym._benchmark.credentials_utils import sdv_install_cmd
13
10
  from sdgym.benchmark import (
14
11
  DEFAULT_MULTI_TABLE_DATASETS,
@@ -26,6 +23,12 @@ from sdgym.benchmark import (
26
23
  )
27
24
 
28
25
 
26
+ def _make_instance_name(prefix):
27
+ day = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M')
28
+ suffix = uuid.uuid4().hex[:6]
29
+ return f'{prefix}-{day}-{suffix}'
30
+
31
+
29
32
  def _get_logs_s3_uri(output_destination, instance_name):
30
33
  """Store logs next to output destination prefix.
31
34
 
@@ -144,12 +147,11 @@ def _get_user_data_script(
144
147
  or int(config.get('gpu_count', 0)) > 0
145
148
  or bool(config.get('gpu_type'))
146
149
  )
147
- upload_logs = bool(config.get('upload_logs', True))
148
150
 
149
151
  aws_key = credentials['aws']['aws_access_key_id']
150
152
  aws_secret = credentials['aws']['aws_secret_access_key']
151
153
 
152
- log_uri = _get_logs_s3_uri(output_destination, instance_name) if upload_logs else ''
154
+ log_uri = _get_logs_s3_uri(output_destination, instance_name)
153
155
 
154
156
  sdv_install = sdv_install_cmd(credentials).rstrip()
155
157
  sdv_install = textwrap.indent(sdv_install, ' ') if sdv_install else ''
@@ -363,9 +365,6 @@ def _benchmark_compute_gcp(
363
365
  modality,
364
366
  ):
365
367
  """Run the SDGym benchmark on datasets for the given modality."""
366
- compute_config = resolve_compute_config('gcp', compute_config)
367
- validate_compute_config(compute_config)
368
-
369
368
  s3_client = _validate_output_destination(
370
369
  output_destination,
371
370
  aws_keys={
@@ -454,7 +453,7 @@ def _benchmark_single_table_compute_gcp(
454
453
  compute_diagnostic_score (bool, optional):
455
454
  Whether to compute the diagnostic score. Defaults to True.
456
455
  compute_privacy_score (bool, optional):
457
- Whether to compute the privacy score. Defaults to True.
456
+ Whether to compute the privacy score. Defaults to False.
458
457
  sdmetrics (list of str, optional):
459
458
  The sdmetrics to use for evaluation. If None, default metrics will be used.
460
459
  timeout (int, optional):
@@ -0,0 +1,175 @@
1
+ import io
2
+ import logging
3
+
4
+ import pandas as pd
5
+ from botocore.exceptions import BotoCoreError, ClientError
6
+
7
+ from sdgym._benchmark_launcher.utils import resolve_credentials
8
+ from sdgym.result_writer import S3ResultsWriter
9
+ from sdgym.s3 import _list_s3_bucket_contents, get_s3_client, is_s3_path, parse_s3_path
10
+
11
+ LOGGER = logging.getLogger(__name__)
12
+
13
+
14
+ def _validate_s3_output_destinations(instance_jobs):
15
+ """Validate that all output destinations are S3 paths."""
16
+ for instance_job in instance_jobs:
17
+ output_destination = instance_job['output_destination']
18
+ if not is_s3_path(output_destination):
19
+ raise ValueError(
20
+ f'Only S3 storage is currently supported. Found: {output_destination!r}.'
21
+ )
22
+
23
+
24
+ class BaseStorageManager:
25
+ """Base class for storage-specific managers."""
26
+
27
+ def handles_destination(self, output_destination):
28
+ """Return whether this manager supports the given destination."""
29
+ raise NotImplementedError
30
+
31
+ def list_files(self, output_destination):
32
+ """Return the files currently stored under the given destination."""
33
+ raise NotImplementedError
34
+
35
+ def get_existing_filenames(self, output_destination):
36
+ """Return the existing filenames for the given destination."""
37
+ raise NotImplementedError
38
+
39
+ def file_exists(self, filepath):
40
+ """Return whether the provided key exists in the destination."""
41
+ raise NotImplementedError
42
+
43
+ def read_csv(self, filepath):
44
+ """Read a CSV artifact from storage."""
45
+ raise NotImplementedError
46
+
47
+ def write_csv(self, result, filepath):
48
+ """Write a CSV artifact to storage."""
49
+ raise NotImplementedError
50
+
51
+ def _load_job_result(self, filepath):
52
+ """Load a per-job result CSV if it exists, otherwise return None."""
53
+ raise NotImplementedError
54
+
55
+ def update_metainfo(self, filepath, content):
56
+ """Update metainfo for an artifact."""
57
+ raise NotImplementedError
58
+
59
+ def delete(self, filepath):
60
+ """Delete an artifact from storage."""
61
+ raise NotImplementedError
62
+
63
+ def save_pickle(self, object, filepath):
64
+ """Save a picklable object to storage."""
65
+ raise NotImplementedError
66
+
67
+
68
+ class S3StorageManager(BaseStorageManager):
69
+ """Manage benchmark artifacts stored in S3."""
70
+
71
+ def __init__(self, credentials_filepath, instance_jobs):
72
+ _validate_s3_output_destinations(instance_jobs)
73
+ self.credentials_filepath = credentials_filepath
74
+ self._existing_files = {}
75
+ self._writer = None
76
+
77
+ def __getstate__(self):
78
+ """Return the picklable state."""
79
+ state = self.__dict__.copy()
80
+ state['_writer'] = None
81
+ return state
82
+
83
+ def __setstate__(self, state):
84
+ """Restore the state after unpickling."""
85
+ self.__dict__.update(state)
86
+
87
+ def _get_writer(self):
88
+ """Build the results writer."""
89
+ if self._writer is None:
90
+ self._writer = S3ResultsWriter(self._get_client())
91
+
92
+ return self._writer
93
+
94
+ def handles_destination(self, output_destination):
95
+ """Return whether the destination is an S3 path."""
96
+ return is_s3_path(output_destination)
97
+
98
+ def _get_client(self):
99
+ """Build and return the S3 client."""
100
+ credentials = resolve_credentials(self.credentials_filepath)
101
+ aws_credentials = credentials.get('aws', {})
102
+ return get_s3_client(
103
+ aws_access_key_id=aws_credentials.get('aws_access_key_id'),
104
+ aws_secret_access_key=aws_credentials.get('aws_secret_access_key'),
105
+ )
106
+
107
+ def _get_s3_resources(self, filepath):
108
+ """Return the S3 client and bucket name for a destination."""
109
+ if not is_s3_path(filepath):
110
+ raise ValueError(f'S3StorageManager only supports S3 paths. Found: {filepath!r}.')
111
+
112
+ s3_client = self._get_client()
113
+ bucket_name, key = parse_s3_path(filepath)
114
+ return s3_client, bucket_name, key
115
+
116
+ def list_files(self, output_destination):
117
+ """List files under the provided S3 output destination."""
118
+ if not self.handles_destination(output_destination):
119
+ raise ValueError(
120
+ f'S3StorageManager only supports S3 paths. Found: {output_destination!r}.'
121
+ )
122
+
123
+ s3_client = self._get_client()
124
+ bucket_name, key_prefix = parse_s3_path(output_destination)
125
+ return _list_s3_bucket_contents(s3_client, bucket_name, key_prefix)
126
+
127
+ def get_existing_filenames(self, output_destination):
128
+ """Return the existing filenames for the given destination."""
129
+ return {obj['Key'] for obj in self.list_files(output_destination)}
130
+
131
+ def file_exists(self, filepath):
132
+ """Check if a file exists in S3."""
133
+ s3_client, bucket_name, key = self._get_s3_resources(filepath)
134
+ try:
135
+ s3_client.head_object(Bucket=bucket_name, Key=key)
136
+ return True
137
+ except ClientError as error:
138
+ if error.response['Error']['Code'] == '404':
139
+ return False
140
+
141
+ raise
142
+
143
+ def read_csv(self, filepath):
144
+ """Read a CSV artifact from S3."""
145
+ s3_client, bucket_name, key = self._get_s3_resources(filepath)
146
+ response = s3_client.get_object(Bucket=bucket_name, Key=key)
147
+ return pd.read_csv(io.BytesIO(response['Body'].read()))
148
+
149
+ def write_csv(self, result, filepath):
150
+ self._get_writer().write_dataframe(result, filepath, index=False)
151
+
152
+ def _load_job_result(self, filepath):
153
+ if not self.file_exists(filepath):
154
+ return None
155
+
156
+ return self.read_csv(filepath)
157
+
158
+ def update_metainfo(self, filepath, content):
159
+ """Update metainfo for an artifact."""
160
+ self._get_writer().write_yaml(data=content, file_path=filepath, append=True)
161
+
162
+ def delete(self, filepath):
163
+ """Delete an artifact from storage."""
164
+ s3_client, bucket_name, key = self._get_s3_resources(filepath)
165
+ try:
166
+ s3_client.delete_object(Bucket=bucket_name, Key=key)
167
+ LOGGER.info(f'Deleted S3 object {filepath} successfully.')
168
+
169
+ except (ClientError, BotoCoreError):
170
+ LOGGER.exception(f'Failed to delete S3 object {filepath}.')
171
+ raise
172
+
173
+ def save_pickle(self, object, filepath):
174
+ """Save a picklable object to S3."""
175
+ self._get_writer().write_pickle(object, filepath)
@@ -1,6 +1,7 @@
1
1
  from sdgym._benchmark_launcher.utils import (
2
2
  _AWS_CREDENTIAL_KEYS,
3
3
  _GCP_SERVICE_ACCOUNT_REQUIRED_KEYS,
4
+ _REQUIRED_CANONICAL_KEYS,
4
5
  _is_unique_string_list,
5
6
  resolve_credentials,
6
7
  )
@@ -9,7 +10,7 @@ _INJECTED_PARAMS = {
9
10
  'credentials',
10
11
  'synthesizers',
11
12
  'sdv_datasets',
12
- 'compute_config',
13
+ 'compute',
13
14
  'output_destination',
14
15
  }
15
16
 
@@ -61,12 +62,33 @@ def _validate_structure(config):
61
62
  compute = getattr(config, 'compute', None)
62
63
  if isinstance(compute, dict):
63
64
  service = compute.get('service')
64
- if service not in ('gcp',):
65
- errors.append(f"compute.service: must be 'gcp'. Found: {service!r}")
65
+ if service is None:
66
+ errors.append('compute.service: is required but missing.')
66
67
 
67
68
  return sorted(errors)
68
69
 
69
70
 
71
+ def _validate_compute_canonical(compute):
72
+ errors = []
73
+ for key in _REQUIRED_CANONICAL_KEYS:
74
+ if not compute.get(key):
75
+ errors.append(f'compute.{key} is required but missing.')
76
+
77
+ gpu_count = int(compute.get('gpu_count') or 0)
78
+ if gpu_count > 0 and not compute.get('gpu_type'):
79
+ errors.append('compute.gpu_type is required when compute.gpu_count > 0.')
80
+
81
+ return sorted(errors)
82
+
83
+
84
+ def _validate_compute(compute):
85
+ """Validate the 'compute' section of the config.
86
+
87
+ This includes validating the canonical compute keys and any service-specific requirements.
88
+ """
89
+ return _validate_compute_canonical(compute)
90
+
91
+
70
92
  def _validate_method_params(method_params, method_to_run):
71
93
  errors = []
72
94
  timeout = method_params.get('timeout')
@@ -0,0 +1,16 @@
1
+ method_params:
2
+ timeout: 345600
3
+ compute_quality_score: true
4
+ compute_diagnostic_score: true
5
+
6
+ compute:
7
+ service: gcp
8
+ instance_type: n1-highmem-16
9
+ boot_image: projects/deeplearning-platform-release/global/images/family/common-cu129-ubuntu-2204-nvidia-580
10
+ root_disk_gb: 300
11
+ gpu_type: nvidia-tesla-t4
12
+ gpu_count: 1
13
+ swap_gb: 64
14
+ name_prefix: sdgym-run
15
+
16
+ credentials_filepath: null
@@ -7,6 +7,7 @@ import yaml
7
7
 
8
8
  from sdgym._benchmark_launcher._validation import (
9
9
  _format_sectioned_errors,
10
+ _validate_compute,
10
11
  _validate_credentials,
11
12
  _validate_instance_jobs,
12
13
  _validate_method_params,
@@ -75,6 +76,7 @@ class BenchmarkConfig:
75
76
  section_errors = {
76
77
  'method_params': _validate_method_params(self.method_params, method_to_run),
77
78
  'credentials_filepath': _validate_credentials(self.credentials_filepath),
79
+ 'compute': _validate_compute(self.compute),
78
80
  'instance_jobs': _validate_instance_jobs(self.instance_jobs),
79
81
  }
80
82
  if any(section_errors.values()):
@@ -11,11 +11,14 @@ from sdgym._benchmark_launcher._storage_manager import S3StorageManager
11
11
  from sdgym._benchmark_launcher.utils import (
12
12
  _METHODS,
13
13
  _add_dataset_suffix,
14
+ _build_instance_artifact_filepaths,
15
+ _build_job_artifact_filepaths,
14
16
  _build_job_artifact_keys,
15
17
  _build_job_output_destination,
16
18
  _get_top_folder_prefix,
17
19
  _resolve_datasets,
18
20
  generate_ids,
21
+ resolve_compute,
19
22
  resolve_credentials,
20
23
  )
21
24
 
@@ -59,9 +62,10 @@ class BenchmarkLauncher:
59
62
  ])
60
63
  self._launch_to_instance_names = {}
61
64
  self._instance_name_to_status = {}
62
- self._instance_name_to_jobs = {}
65
+ self._instance_name_to_artifacts = {}
63
66
  self._instance_manager = self._build_instance_manager()
64
67
  self._storage_manager = self._build_storage_manager()
68
+ self._timestamp = None
65
69
 
66
70
  def _build_storage_manager(self):
67
71
  """Build the storage manager."""
@@ -84,39 +88,71 @@ class BenchmarkLauncher:
84
88
 
85
89
  raise NotImplementedError(f'Compute service {self.compute_service!r} is not supported.')
86
90
 
87
- def _add_synthesizer_suffix(self, synthesizer, suffix):
88
- """Return the synthesizer name with the instance suffix."""
89
- return synthesizer if suffix == 0 else f'{synthesizer}({suffix})'
91
+ def _add_filename_suffix(self, filename, suffix):
92
+ """Return the filename with the instance suffix."""
93
+ return filename if suffix == 0 else f'{filename}({suffix})'
90
94
 
91
- def _build_instance_jobs(self, datasets, synthesizers, output_destination, instance_idx):
92
- """Build the job metadata for a launched instance."""
93
- artifact_key_prefix = _get_top_folder_prefix(output_destination, self.modality)
95
+ def _build_instance_artifacts(self, datasets, synthesizers, output_destination, instance_idx):
96
+ """Build the artifact information for one instance."""
97
+ artifact_key_prefix, modality_prefix = _get_top_folder_prefix(
98
+ output_destination, self.modality
99
+ )
94
100
  jobs = []
101
+
95
102
  for dataset in datasets:
96
103
  artifact_dataset = _add_dataset_suffix(dataset)
97
104
  for synthesizer in synthesizers:
98
- artifact_synthesizer = self._add_synthesizer_suffix(synthesizer, instance_idx)
105
+ artifact_synthesizer = self._add_filename_suffix(synthesizer, instance_idx)
106
+ job_output_destination = _build_job_output_destination(
107
+ output_destination=output_destination,
108
+ artifact_key_prefix=artifact_key_prefix,
109
+ artifact_dataset=artifact_dataset,
110
+ artifact_synthesizer=artifact_synthesizer,
111
+ )
112
+ benchmark_fp, synthetic_data_fp, synthesizer_fp = _build_job_artifact_filepaths(
113
+ artifact_key_prefix=artifact_key_prefix,
114
+ artifact_dataset=artifact_dataset,
115
+ artifact_synthesizer=artifact_synthesizer,
116
+ modality=self.modality,
117
+ output_destination=output_destination,
118
+ )
119
+
99
120
  jobs.append({
100
121
  'dataset': dataset,
101
122
  'synthesizer': synthesizer,
102
123
  'artifact_dataset': artifact_dataset,
103
124
  'artifact_synthesizer': artifact_synthesizer,
104
125
  'artifact_key_prefix': artifact_key_prefix,
105
- 'output_destination': output_destination,
106
- 'job_output_destination': _build_job_output_destination(
107
- output_destination=output_destination,
108
- artifact_key_prefix=artifact_key_prefix,
109
- artifact_dataset=artifact_dataset,
110
- artifact_synthesizer=artifact_synthesizer,
111
- ),
126
+ 'job_output_destination': job_output_destination,
127
+ 'benchmark_result_filepath': benchmark_fp,
128
+ 'synthetic_data_filepath': synthetic_data_fp,
129
+ 'synthesizer_filepath': synthesizer_fp,
112
130
  })
113
131
 
114
- return jobs
132
+ metainfo_name = self._add_filename_suffix('metainfo', instance_idx)
133
+ results_name = self._add_filename_suffix('results', instance_idx)
134
+ metainfo_fp, result_fp, job_arg_fp = _build_instance_artifact_filepaths(
135
+ output_destination=output_destination,
136
+ artifact_key_prefix=artifact_key_prefix,
137
+ modality_prefix=modality_prefix,
138
+ metainfo_name=metainfo_name,
139
+ results_name=results_name,
140
+ )
141
+ results = {
142
+ 'jobs': jobs,
143
+ 'output_destination': output_destination,
144
+ 'result_filepath': result_fp,
145
+ 'metainfo_filepath': metainfo_fp,
146
+ 'job_arg_filepath': job_arg_fp,
147
+ }
148
+
149
+ return results
115
150
 
116
151
  def _launch(self):
117
152
  launch_id = generate_ids(['LAUNCH_ID'])
118
153
  self._launch_to_instance_names[launch_id] = []
119
154
  credentials = resolve_credentials(self.benchmark_config.credentials_filepath)
155
+ compute = resolve_compute(self.benchmark_config.compute)
120
156
 
121
157
  for instance_idx, instance_job in enumerate(self.benchmark_config.instance_jobs):
122
158
  datasets = _resolve_datasets(instance_job['datasets'])
@@ -128,19 +164,20 @@ class BenchmarkLauncher:
128
164
  synthesizers=synthesizers,
129
165
  sdv_datasets=datasets,
130
166
  credentials=credentials,
131
- compute_config=self.benchmark_config.compute,
167
+ compute_config=compute,
132
168
  **self.benchmark_config.method_params,
133
169
  )
134
-
135
170
  self._launch_to_instance_names[launch_id].append(instance_name)
136
171
  self._instance_name_to_status[instance_name] = 'running'
137
- self._instance_name_to_jobs[instance_name] = self._build_instance_jobs(
172
+ self._instance_name_to_artifacts[instance_name] = self._build_instance_artifacts(
138
173
  datasets=datasets,
139
174
  synthesizers=synthesizers,
140
175
  output_destination=output_destination,
141
176
  instance_idx=instance_idx,
142
177
  )
143
178
 
179
+ self._timestamp = pd.Timestamp.now().strftime('%d_%m_%Y %H:%M:%S')
180
+
144
181
  def launch(self):
145
182
  """Run the BenchmarkConfig: validate it and then execute the specified benchmark method."""
146
183
  if not self.benchmark_config._is_validated:
@@ -274,10 +311,10 @@ class BenchmarkLauncher:
274
311
  instances = self._validate_instance_names(instance_names)
275
312
  output_destinations = []
276
313
  for instance_name in instances:
277
- for job in self._instance_name_to_jobs.get(instance_name, []):
278
- output_destination = job['output_destination']
279
- if output_destination not in output_destinations:
280
- output_destinations.append(output_destination)
314
+ instance_artifacts = self._instance_name_to_artifacts.get(instance_name, {})
315
+ output_destination = instance_artifacts.get('output_destination')
316
+ if output_destination not in output_destinations:
317
+ output_destinations.append(output_destination)
281
318
 
282
319
  return output_destinations
283
320
 
@@ -387,7 +424,7 @@ class BenchmarkLauncher:
387
424
  }
388
425
  rows = []
389
426
  for instance_name in instances:
390
- jobs = self._instance_name_to_jobs.get(instance_name, [])
427
+ jobs = self._instance_name_to_artifacts.get(instance_name, {}).get('jobs', [])
391
428
  instance_rows = self._get_instance_job_rows(
392
429
  instance_name=instance_name,
393
430
  jobs=jobs,
@@ -400,6 +437,79 @@ class BenchmarkLauncher:
400
437
 
401
438
  return pd.DataFrame(rows)
402
439
 
440
+ def _build_missing_result_row(self, job):
441
+ """Build a result row for a job missing its benchmark_result.csv."""
442
+ return pd.DataFrame([
443
+ {
444
+ 'Dataset': job['dataset'],
445
+ 'Synthesizer': job['synthesizer'],
446
+ 'Dataset_Size_MB': None,
447
+ 'Train_Time': None,
448
+ 'Peak_Memory_MB': None,
449
+ 'Synthesizer_Size_MB': None,
450
+ 'Sample_Time': None,
451
+ 'Evaluate_Time': None,
452
+ 'Error': 'Instance Deadline Error',
453
+ }
454
+ ])
455
+
456
+ def _build_or_load_instance_results(self, instance_name):
457
+ """Get instance result table.
458
+
459
+ If the instance's result file exists in storage, load and return it.
460
+ Otherwise, build the result table by loading each job's result file if it exists,
461
+ or adding a row with an error if it doesn't.
462
+ """
463
+ jobs = self._instance_name_to_artifacts[instance_name]['jobs']
464
+ results_filepath = self._instance_name_to_artifacts[instance_name]['result_filepath']
465
+ if self._storage_manager.file_exists(results_filepath):
466
+ return self._storage_manager.read_csv(results_filepath)
467
+
468
+ frames = []
469
+ for job in jobs:
470
+ job_result = self._storage_manager._load_job_result(job['benchmark_result_filepath'])
471
+ if job_result is None:
472
+ frames.append(self._build_missing_result_row(job))
473
+ else:
474
+ frames.append(job_result)
475
+
476
+ return pd.concat(frames, ignore_index=True)
477
+
478
+ def _update_instance_metainfo(self, instance_name):
479
+ """Update the instance metainfo file with the completion date."""
480
+ metainfo_filepath = self._instance_name_to_artifacts[instance_name]['metainfo_filepath']
481
+ content = {'completed_date': pd.Timestamp.now().strftime('%d_%m_%Y %H:%M:%S')}
482
+ self._storage_manager.update_metainfo(metainfo_filepath, content)
483
+
484
+ def finalize(self):
485
+ """Finalize the benchmark using the results available so far.
486
+
487
+ This method is used for an early stop scenario. For each launched instance,
488
+ it builds or loads the instance-level results file from the available job
489
+ artifacts, updates the metainfo file, and removes temporary job argument
490
+ artifacts. Missing job results are preserved as incomplete or failed entries
491
+ in the final output.
492
+
493
+ Once the available artifacts have been saved, all remaining running
494
+ instances are terminated.
495
+ """
496
+ self._validate_compute_service()
497
+ self._update_instance_statuses()
498
+ for instance_name in self._get_all_instance_names():
499
+ instance_artifacts = self._instance_name_to_artifacts[instance_name]
500
+ result_filepath = instance_artifacts['result_filepath']
501
+ job_arg_filepath = instance_artifacts['job_arg_filepath']
502
+ result_df = self._build_or_load_instance_results(instance_name)
503
+ self._storage_manager.write_csv(result=result_df, filepath=result_filepath)
504
+ self._storage_manager.delete(job_arg_filepath)
505
+ self._update_instance_metainfo(instance_name)
506
+
507
+ self.terminate(verbose=True)
508
+
509
+ def save_to_cloud(self, filepath):
510
+ """Save the benchmark launcher in the cloud using the storage manager."""
511
+ self._storage_manager.save_pickle(self, filepath)
512
+
403
513
  def save(self, filepath):
404
514
  """Save the benchmark configuration to a file."""
405
515
  with open(filepath, 'wb') as output:
@@ -156,7 +156,7 @@ def _split_instance_jobs(instance_job):
156
156
  raise ValueError('Cannot split the instance job any further.')
157
157
 
158
158
 
159
- def _build_instance_jobs(datasets, synthesizers, num_instances, output_destination):
159
+ def _build_instance_artifacts(datasets, synthesizers, num_instances, output_destination):
160
160
  """Build exactly ``num_instances`` instance jobs."""
161
161
  max_jobs = len(synthesizers) * len(datasets)
162
162
  if num_instances > max_jobs:
@@ -223,7 +223,7 @@ def build_dict_from_args(args):
223
223
  num_instances = num_instances if num_instances is not None else DEFAULT_NUM_INSTANCES
224
224
  return {
225
225
  'method_params': method_params,
226
- 'instance_jobs': _build_instance_jobs(
226
+ 'instance_jobs': _build_instance_artifacts(
227
227
  datasets=datasets,
228
228
  synthesizers=synthesizers,
229
229
  num_instances=num_instances,