PyPI - sdgym - Versions diffs - 0.14.2.dev0__tar.gz → 0.14.3.dev0__tar.gz - Mend

sdgym 0.14.2.dev0__tar.gz → 0.14.3.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

{sdgym-0.14.2.dev0/sdgym.egg-info → sdgym-0.14.3.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdgym
-Version: 0.14.2.dev0
+Version: 0.14.3.dev0
 Summary: Benchmark tabular synthetic data generators using a variety of datasets
 Author-email: "DataCebo, Inc." <info@sdv.dev>
 License-Expression: BUSL-1.1

{sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/pyproject.toml RENAMED Viewed

@@ -163,7 +163,7 @@ namespaces = false
 version = {attr = 'sdgym.__version__'}
 [tool.bumpversion]
-current_version = "0.14.2.dev0"
+current_version = "0.14.3.dev0"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',

{sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
 __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
 __license__ = 'BSL-1.1'
-__version__ = '0.14.2.dev0'
+__version__ = '0.14.3.dev0'
 import logging

{sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark/benchmark.py RENAMED Viewed

@@ -1,14 +1,11 @@
 import textwrap
+import uuid
+from datetime import datetime, timezone
 from urllib.parse import urlparse
 from google.cloud import compute_v1
 from google.oauth2 import service_account
-from sdgym._benchmark.config_utils import (
-    _make_instance_name,
-    resolve_compute_config,
-    validate_compute_config,
-)
 from sdgym._benchmark.credentials_utils import sdv_install_cmd
 from sdgym.benchmark import (
     DEFAULT_MULTI_TABLE_DATASETS,
@@ -26,6 +23,12 @@ from sdgym.benchmark import (
 )
+def _make_instance_name(prefix):
+    day = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M')
+    suffix = uuid.uuid4().hex[:6]
+    return f'{prefix}-{day}-{suffix}'
 def _get_logs_s3_uri(output_destination, instance_name):
     """Store logs next to output destination prefix.
@@ -144,12 +147,11 @@ def _get_user_data_script(
         or int(config.get('gpu_count', 0)) > 0
         or bool(config.get('gpu_type'))
     )
-    upload_logs = bool(config.get('upload_logs', True))
     aws_key = credentials['aws']['aws_access_key_id']
     aws_secret = credentials['aws']['aws_secret_access_key']
-    log_uri = _get_logs_s3_uri(output_destination, instance_name) if upload_logs else ''
+    log_uri = _get_logs_s3_uri(output_destination, instance_name)
     sdv_install = sdv_install_cmd(credentials).rstrip()
     sdv_install = textwrap.indent(sdv_install, '        ') if sdv_install else ''
@@ -363,9 +365,6 @@ def _benchmark_compute_gcp(
     modality,
 ):
     """Run the SDGym benchmark on datasets for the given modality."""
-    compute_config = resolve_compute_config('gcp', compute_config)
-    validate_compute_config(compute_config)
     s3_client = _validate_output_destination(
         output_destination,
         aws_keys={
@@ -454,7 +453,7 @@ def _benchmark_single_table_compute_gcp(
         compute_diagnostic_score (bool, optional):
             Whether to compute the diagnostic score. Defaults to True.
         compute_privacy_score (bool, optional):
-            Whether to compute the privacy score. Defaults to True.
+            Whether to compute the privacy score. Defaults to False.
         sdmetrics (list of str, optional):
             The sdmetrics to use for evaluation. If None, default metrics will be used.
         timeout (int, optional):

sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/_storage_manager.py ADDED Viewed

@@ -0,0 +1,175 @@
+import io
+import logging
+import pandas as pd
+from botocore.exceptions import BotoCoreError, ClientError
+from sdgym._benchmark_launcher.utils import resolve_credentials
+from sdgym.result_writer import S3ResultsWriter
+from sdgym.s3 import _list_s3_bucket_contents, get_s3_client, is_s3_path, parse_s3_path
+LOGGER = logging.getLogger(__name__)
+def _validate_s3_output_destinations(instance_jobs):
+    """Validate that all output destinations are S3 paths."""
+    for instance_job in instance_jobs:
+        output_destination = instance_job['output_destination']
+        if not is_s3_path(output_destination):
+            raise ValueError(
+                f'Only S3 storage is currently supported. Found: {output_destination!r}.'
+            )
+class BaseStorageManager:
+    """Base class for storage-specific managers."""
+    def handles_destination(self, output_destination):
+        """Return whether this manager supports the given destination."""
+        raise NotImplementedError
+    def list_files(self, output_destination):
+        """Return the files currently stored under the given destination."""
+        raise NotImplementedError
+    def get_existing_filenames(self, output_destination):
+        """Return the existing filenames for the given destination."""
+        raise NotImplementedError
+    def file_exists(self, filepath):
+        """Return whether the provided key exists in the destination."""
+        raise NotImplementedError
+    def read_csv(self, filepath):
+        """Read a CSV artifact from storage."""
+        raise NotImplementedError
+    def write_csv(self, result, filepath):
+        """Write a CSV artifact to storage."""
+        raise NotImplementedError
+    def _load_job_result(self, filepath):
+        """Load a per-job result CSV if it exists, otherwise return None."""
+        raise NotImplementedError
+    def update_metainfo(self, filepath, content):
+        """Update metainfo for an artifact."""
+        raise NotImplementedError
+    def delete(self, filepath):
+        """Delete an artifact from storage."""
+        raise NotImplementedError
+    def save_pickle(self, object, filepath):
+        """Save a picklable object to storage."""
+        raise NotImplementedError
+class S3StorageManager(BaseStorageManager):
+    """Manage benchmark artifacts stored in S3."""
+    def __init__(self, credentials_filepath, instance_jobs):
+        _validate_s3_output_destinations(instance_jobs)
+        self.credentials_filepath = credentials_filepath
+        self._existing_files = {}
+        self._writer = None
+    def __getstate__(self):
+        """Return the picklable state."""
+        state = self.__dict__.copy()
+        state['_writer'] = None
+        return state
+    def __setstate__(self, state):
+        """Restore the state after unpickling."""
+        self.__dict__.update(state)
+    def _get_writer(self):
+        """Build the results writer."""
+        if self._writer is None:
+            self._writer = S3ResultsWriter(self._get_client())
+        return self._writer
+    def handles_destination(self, output_destination):
+        """Return whether the destination is an S3 path."""
+        return is_s3_path(output_destination)
+    def _get_client(self):
+        """Build and return the S3 client."""
+        credentials = resolve_credentials(self.credentials_filepath)
+        aws_credentials = credentials.get('aws', {})
+        return get_s3_client(
+            aws_access_key_id=aws_credentials.get('aws_access_key_id'),
+            aws_secret_access_key=aws_credentials.get('aws_secret_access_key'),
+        )
+    def _get_s3_resources(self, filepath):
+        """Return the S3 client and bucket name for a destination."""
+        if not is_s3_path(filepath):
+            raise ValueError(f'S3StorageManager only supports S3 paths. Found: {filepath!r}.')
+        s3_client = self._get_client()
+        bucket_name, key = parse_s3_path(filepath)
+        return s3_client, bucket_name, key
+    def list_files(self, output_destination):
+        """List files under the provided S3 output destination."""
+        if not self.handles_destination(output_destination):
+            raise ValueError(
+                f'S3StorageManager only supports S3 paths. Found: {output_destination!r}.'
+            )
+        s3_client = self._get_client()
+        bucket_name, key_prefix = parse_s3_path(output_destination)
+        return _list_s3_bucket_contents(s3_client, bucket_name, key_prefix)
+    def get_existing_filenames(self, output_destination):
+        """Return the existing filenames for the given destination."""
+        return {obj['Key'] for obj in self.list_files(output_destination)}
+    def file_exists(self, filepath):
+        """Check if a file exists in S3."""
+        s3_client, bucket_name, key = self._get_s3_resources(filepath)
+        try:
+            s3_client.head_object(Bucket=bucket_name, Key=key)
+            return True
+        except ClientError as error:
+            if error.response['Error']['Code'] == '404':
+                return False
+            raise
+    def read_csv(self, filepath):
+        """Read a CSV artifact from S3."""
+        s3_client, bucket_name, key = self._get_s3_resources(filepath)
+        response = s3_client.get_object(Bucket=bucket_name, Key=key)
+        return pd.read_csv(io.BytesIO(response['Body'].read()))
+    def write_csv(self, result, filepath):
+        self._get_writer().write_dataframe(result, filepath, index=False)
+    def _load_job_result(self, filepath):
+        if not self.file_exists(filepath):
+            return None
+        return self.read_csv(filepath)
+    def update_metainfo(self, filepath, content):
+        """Update metainfo for an artifact."""
+        self._get_writer().write_yaml(data=content, file_path=filepath, append=True)
+    def delete(self, filepath):
+        """Delete an artifact from storage."""
+        s3_client, bucket_name, key = self._get_s3_resources(filepath)
+        try:
+            s3_client.delete_object(Bucket=bucket_name, Key=key)
+            LOGGER.info(f'Deleted S3 object {filepath} successfully.')
+        except (ClientError, BotoCoreError):
+            LOGGER.exception(f'Failed to delete S3 object {filepath}.')
+            raise
+    def save_pickle(self, object, filepath):
+        """Save a picklable object to S3."""
+        self._get_writer().write_pickle(object, filepath)

{sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/_validation.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from sdgym._benchmark_launcher.utils import (
     _AWS_CREDENTIAL_KEYS,
     _GCP_SERVICE_ACCOUNT_REQUIRED_KEYS,
+    _REQUIRED_CANONICAL_KEYS,
     _is_unique_string_list,
     resolve_credentials,
 )
@@ -9,7 +10,7 @@ _INJECTED_PARAMS = {
     'credentials',
     'synthesizers',
     'sdv_datasets',
-    'compute_config',
+    'compute',
     'output_destination',
 }
@@ -61,12 +62,33 @@ def _validate_structure(config):
     compute = getattr(config, 'compute', None)
     if isinstance(compute, dict):
         service = compute.get('service')
-        if service not in ('gcp',):
-            errors.append(f"compute.service: must be 'gcp'. Found: {service!r}")
+        if service is None:
+            errors.append('compute.service: is required but missing.')
     return sorted(errors)
+def _validate_compute_canonical(compute):
+    errors = []
+    for key in _REQUIRED_CANONICAL_KEYS:
+        if not compute.get(key):
+            errors.append(f'compute.{key} is required but missing.')
+    gpu_count = int(compute.get('gpu_count') or 0)
+    if gpu_count > 0 and not compute.get('gpu_type'):
+        errors.append('compute.gpu_type is required when compute.gpu_count > 0.')
+    return sorted(errors)
+def _validate_compute(compute):
+    """Validate the 'compute' section of the config.
+    This includes validating the canonical compute keys and any service-specific requirements.
+    """
+    return _validate_compute_canonical(compute)
 def _validate_method_params(method_params, method_to_run):
     errors = []
     timeout = method_params.get('timeout')

sdgym-0.14.3.dev0/sdgym/_benchmark_launcher/benchmark_base.yaml ADDED Viewed

@@ -0,0 +1,16 @@
+method_params:
+  timeout: 345600
+  compute_quality_score: true
+  compute_diagnostic_score: true
+compute:
+  service: gcp
+  instance_type: n1-highmem-16
+  boot_image: projects/deeplearning-platform-release/global/images/family/common-cu129-ubuntu-2204-nvidia-580
+  root_disk_gb: 300
+  gpu_type: nvidia-tesla-t4
+  gpu_count: 1
+  swap_gb: 64
+  name_prefix: sdgym-run
+credentials_filepath: null

{sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/benchmark_config.py RENAMED Viewed

@@ -7,6 +7,7 @@ import yaml
 from sdgym._benchmark_launcher._validation import (
     _format_sectioned_errors,
+    _validate_compute,
     _validate_credentials,
     _validate_instance_jobs,
     _validate_method_params,
@@ -75,6 +76,7 @@ class BenchmarkConfig:
         section_errors = {
             'method_params': _validate_method_params(self.method_params, method_to_run),
             'credentials_filepath': _validate_credentials(self.credentials_filepath),
+            'compute': _validate_compute(self.compute),
             'instance_jobs': _validate_instance_jobs(self.instance_jobs),
         }
         if any(section_errors.values()):

{sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/benchmark_launcher.py RENAMED Viewed

@@ -11,11 +11,14 @@ from sdgym._benchmark_launcher._storage_manager import S3StorageManager
 from sdgym._benchmark_launcher.utils import (
     _METHODS,
     _add_dataset_suffix,
+    _build_instance_artifact_filepaths,
+    _build_job_artifact_filepaths,
     _build_job_artifact_keys,
     _build_job_output_destination,
     _get_top_folder_prefix,
     _resolve_datasets,
     generate_ids,
+    resolve_compute,
     resolve_credentials,
 )
@@ -59,9 +62,10 @@ class BenchmarkLauncher:
         ])
         self._launch_to_instance_names = {}
         self._instance_name_to_status = {}
-        self._instance_name_to_jobs = {}
+        self._instance_name_to_artifacts = {}
         self._instance_manager = self._build_instance_manager()
         self._storage_manager = self._build_storage_manager()
+        self._timestamp = None
     def _build_storage_manager(self):
         """Build the storage manager."""
@@ -84,39 +88,71 @@ class BenchmarkLauncher:
         raise NotImplementedError(f'Compute service {self.compute_service!r} is not supported.')
-    def _add_synthesizer_suffix(self, synthesizer, suffix):
-        """Return the synthesizer name with the instance suffix."""
-        return synthesizer if suffix == 0 else f'{synthesizer}({suffix})'
+    def _add_filename_suffix(self, filename, suffix):
+        """Return the filename with the instance suffix."""
+        return filename if suffix == 0 else f'{filename}({suffix})'
-    def _build_instance_jobs(self, datasets, synthesizers, output_destination, instance_idx):
-        """Build the job metadata for a launched instance."""
-        artifact_key_prefix = _get_top_folder_prefix(output_destination, self.modality)
+    def _build_instance_artifacts(self, datasets, synthesizers, output_destination, instance_idx):
+        """Build the artifact information for one instance."""
+        artifact_key_prefix, modality_prefix = _get_top_folder_prefix(
+            output_destination, self.modality
+        )
         jobs = []
         for dataset in datasets:
             artifact_dataset = _add_dataset_suffix(dataset)
             for synthesizer in synthesizers:
-                artifact_synthesizer = self._add_synthesizer_suffix(synthesizer, instance_idx)
+                artifact_synthesizer = self._add_filename_suffix(synthesizer, instance_idx)
+                job_output_destination = _build_job_output_destination(
+                    output_destination=output_destination,
+                    artifact_key_prefix=artifact_key_prefix,
+                    artifact_dataset=artifact_dataset,
+                    artifact_synthesizer=artifact_synthesizer,
+                )
+                benchmark_fp, synthetic_data_fp, synthesizer_fp = _build_job_artifact_filepaths(
+                    artifact_key_prefix=artifact_key_prefix,
+                    artifact_dataset=artifact_dataset,
+                    artifact_synthesizer=artifact_synthesizer,
+                    modality=self.modality,
+                    output_destination=output_destination,
+                )
                 jobs.append({
                     'dataset': dataset,
                     'synthesizer': synthesizer,
                     'artifact_dataset': artifact_dataset,
                     'artifact_synthesizer': artifact_synthesizer,
                     'artifact_key_prefix': artifact_key_prefix,
-                    'output_destination': output_destination,
-                    'job_output_destination': _build_job_output_destination(
-                        output_destination=output_destination,
-                        artifact_key_prefix=artifact_key_prefix,
-                        artifact_dataset=artifact_dataset,
-                        artifact_synthesizer=artifact_synthesizer,
-                    ),
+                    'job_output_destination': job_output_destination,
+                    'benchmark_result_filepath': benchmark_fp,
+                    'synthetic_data_filepath': synthetic_data_fp,
+                    'synthesizer_filepath': synthesizer_fp,
                 })
-        return jobs
+        metainfo_name = self._add_filename_suffix('metainfo', instance_idx)
+        results_name = self._add_filename_suffix('results', instance_idx)
+        metainfo_fp, result_fp, job_arg_fp = _build_instance_artifact_filepaths(
+            output_destination=output_destination,
+            artifact_key_prefix=artifact_key_prefix,
+            modality_prefix=modality_prefix,
+            metainfo_name=metainfo_name,
+            results_name=results_name,
+        )
+        results = {
+            'jobs': jobs,
+            'output_destination': output_destination,
+            'result_filepath': result_fp,
+            'metainfo_filepath': metainfo_fp,
+            'job_arg_filepath': job_arg_fp,
+        }
+        return results
     def _launch(self):
         launch_id = generate_ids(['LAUNCH_ID'])
         self._launch_to_instance_names[launch_id] = []
         credentials = resolve_credentials(self.benchmark_config.credentials_filepath)
+        compute = resolve_compute(self.benchmark_config.compute)
         for instance_idx, instance_job in enumerate(self.benchmark_config.instance_jobs):
             datasets = _resolve_datasets(instance_job['datasets'])
@@ -128,19 +164,20 @@ class BenchmarkLauncher:
                 synthesizers=synthesizers,
                 sdv_datasets=datasets,
                 credentials=credentials,
-                compute_config=self.benchmark_config.compute,
+                compute_config=compute,
                 **self.benchmark_config.method_params,
             )
             self._launch_to_instance_names[launch_id].append(instance_name)
             self._instance_name_to_status[instance_name] = 'running'
-            self._instance_name_to_jobs[instance_name] = self._build_instance_jobs(
+            self._instance_name_to_artifacts[instance_name] = self._build_instance_artifacts(
                 datasets=datasets,
                 synthesizers=synthesizers,
                 output_destination=output_destination,
                 instance_idx=instance_idx,
             )
+        self._timestamp = pd.Timestamp.now().strftime('%d_%m_%Y %H:%M:%S')
     def launch(self):
         """Run the BenchmarkConfig: validate it and then execute the specified benchmark method."""
         if not self.benchmark_config._is_validated:
@@ -274,10 +311,10 @@ class BenchmarkLauncher:
         instances = self._validate_instance_names(instance_names)
         output_destinations = []
         for instance_name in instances:
-            for job in self._instance_name_to_jobs.get(instance_name, []):
-                output_destination = job['output_destination']
-                if output_destination not in output_destinations:
-                    output_destinations.append(output_destination)
+            instance_artifacts = self._instance_name_to_artifacts.get(instance_name, {})
+            output_destination = instance_artifacts.get('output_destination')
+            if output_destination not in output_destinations:
+                output_destinations.append(output_destination)
         return output_destinations
@@ -387,7 +424,7 @@ class BenchmarkLauncher:
         }
         rows = []
         for instance_name in instances:
-            jobs = self._instance_name_to_jobs.get(instance_name, [])
+            jobs = self._instance_name_to_artifacts.get(instance_name, {}).get('jobs', [])
             instance_rows = self._get_instance_job_rows(
                 instance_name=instance_name,
                 jobs=jobs,
@@ -400,6 +437,79 @@ class BenchmarkLauncher:
         return pd.DataFrame(rows)
+    def _build_missing_result_row(self, job):
+        """Build a result row for a job missing its benchmark_result.csv."""
+        return pd.DataFrame([
+            {
+                'Dataset': job['dataset'],
+                'Synthesizer': job['synthesizer'],
+                'Dataset_Size_MB': None,
+                'Train_Time': None,
+                'Peak_Memory_MB': None,
+                'Synthesizer_Size_MB': None,
+                'Sample_Time': None,
+                'Evaluate_Time': None,
+                'Error': 'Instance Deadline Error',
+            }
+        ])
+    def _build_or_load_instance_results(self, instance_name):
+        """Get instance result table.
+        If the instance's result file exists in storage, load and return it.
+        Otherwise, build the result table by loading each job's result file if it exists,
+        or adding a row with an error if it doesn't.
+        """
+        jobs = self._instance_name_to_artifacts[instance_name]['jobs']
+        results_filepath = self._instance_name_to_artifacts[instance_name]['result_filepath']
+        if self._storage_manager.file_exists(results_filepath):
+            return self._storage_manager.read_csv(results_filepath)
+        frames = []
+        for job in jobs:
+            job_result = self._storage_manager._load_job_result(job['benchmark_result_filepath'])
+            if job_result is None:
+                frames.append(self._build_missing_result_row(job))
+            else:
+                frames.append(job_result)
+        return pd.concat(frames, ignore_index=True)
+    def _update_instance_metainfo(self, instance_name):
+        """Update the instance metainfo file with the completion date."""
+        metainfo_filepath = self._instance_name_to_artifacts[instance_name]['metainfo_filepath']
+        content = {'completed_date': pd.Timestamp.now().strftime('%d_%m_%Y %H:%M:%S')}
+        self._storage_manager.update_metainfo(metainfo_filepath, content)
+    def finalize(self):
+        """Finalize the benchmark using the results available so far.
+        This method is used for an early stop scenario. For each launched instance,
+        it builds or loads the instance-level results file from the available job
+        artifacts, updates the metainfo file, and removes temporary job argument
+        artifacts. Missing job results are preserved as incomplete or failed entries
+        in the final output.
+        Once the available artifacts have been saved, all remaining running
+        instances are terminated.
+        """
+        self._validate_compute_service()
+        self._update_instance_statuses()
+        for instance_name in self._get_all_instance_names():
+            instance_artifacts = self._instance_name_to_artifacts[instance_name]
+            result_filepath = instance_artifacts['result_filepath']
+            job_arg_filepath = instance_artifacts['job_arg_filepath']
+            result_df = self._build_or_load_instance_results(instance_name)
+            self._storage_manager.write_csv(result=result_df, filepath=result_filepath)
+            self._storage_manager.delete(job_arg_filepath)
+            self._update_instance_metainfo(instance_name)
+        self.terminate(verbose=True)
+    def save_to_cloud(self, filepath):
+        """Save the benchmark launcher in the cloud using the storage manager."""
+        self._storage_manager.save_pickle(self, filepath)
     def save(self, filepath):
         """Save the benchmark configuration to a file."""
         with open(filepath, 'wb') as output:

{sdgym-0.14.2.dev0 → sdgym-0.14.3.dev0}/sdgym/_benchmark_launcher/script.py RENAMED Viewed

@@ -156,7 +156,7 @@ def _split_instance_jobs(instance_job):
     raise ValueError('Cannot split the instance job any further.')
-def _build_instance_jobs(datasets, synthesizers, num_instances, output_destination):
+def _build_instance_artifacts(datasets, synthesizers, num_instances, output_destination):
     """Build exactly ``num_instances`` instance jobs."""
     max_jobs = len(synthesizers) * len(datasets)
     if num_instances > max_jobs:
@@ -223,7 +223,7 @@ def build_dict_from_args(args):
     num_instances = num_instances if num_instances is not None else DEFAULT_NUM_INSTANCES
     return {
         'method_params': method_params,
-        'instance_jobs': _build_instance_jobs(
+        'instance_jobs': _build_instance_artifacts(
             datasets=datasets,
             synthesizers=synthesizers,
             num_instances=num_instances,

sdgym 0.14.2.dev0__tar.gz → 0.14.3.dev0__tar.gz

sdgym 0.14.2.dev0__tar.gz → 0.14.3.dev0__tar.gz