PyPI - sdgym - Versions diffs - 0.14.3.dev0__tar.gz → 0.14.4.dev0__tar.gz - Mend

sdgym 0.14.3.dev0__tar.gz → 0.14.4.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{sdgym-0.14.3.dev0/sdgym.egg-info → sdgym-0.14.4.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdgym
-Version: 0.14.3.dev0
+Version: 0.14.4.dev0
 Summary: Benchmark tabular synthetic data generators using a variety of datasets
 Author-email: "DataCebo, Inc." <info@sdv.dev>
 License-Expression: BUSL-1.1
@@ -44,7 +44,7 @@ Requires-Dist: pandas<3,>=1.5.0; python_version >= "3.11" and python_version < "
 Requires-Dist: pandas<3,>=2.1.1; python_version >= "3.12" and python_version < "3.13"
 Requires-Dist: pandas<3,>=2.2.3; python_version >= "3.13" and python_version < "3.14"
 Requires-Dist: pandas<3,>=2.3.3; python_version >= "3.14"
-Requires-Dist: psutil>=5.8
+Requires-Dist: psutil>=7.0.0
 Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
 Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
 Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
@@ -66,10 +66,8 @@ Requires-Dist: tqdm>=4.66.3
 Requires-Dist: XlsxWriter>=1.2.8
 Requires-Dist: rdt>=1.18.2; python_version < "3.14"
 Requires-Dist: rdt>=1.20.0; python_version >= "3.14"
-Requires-Dist: sdmetrics>=0.21.0; python_version < "3.14"
-Requires-Dist: sdmetrics>=0.26.0; python_version >= "3.14"
-Requires-Dist: sdv>=1.21.0; python_version < "3.14"
-Requires-Dist: sdv>=1.33.0; python_version >= "3.14"
+Requires-Dist: sdmetrics>=0.28.0
+Requires-Dist: sdv>=1.37.0
 Provides-Extra: dask
 Requires-Dist: dask; extra == "dask"
 Requires-Dist: distributed; extra == "dask"

{sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/pyproject.toml RENAMED Viewed

@@ -42,7 +42,7 @@ dependencies = [
     "pandas>=2.1.1,<3;python_version>='3.12' and python_version<'3.13'",
     "pandas>=2.2.3,<3;python_version>='3.13' and python_version<'3.14'",
     "pandas>=2.3.3,<3;python_version>='3.14'",
-    'psutil>=5.8',
+    'psutil>=7.0.0',
     "scikit-learn>=1.0.2;python_version<'3.10'",
     "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
     "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
@@ -64,10 +64,8 @@ dependencies = [
     'XlsxWriter>=1.2.8',
     "rdt>=1.18.2;python_version<'3.14'",
     "rdt>=1.20.0;python_version>='3.14'",
-    "sdmetrics>=0.21.0;python_version<'3.14'",
-    "sdmetrics>=0.26.0;python_version>='3.14'",
-    "sdv>=1.21.0;python_version<'3.14'",
-    "sdv>=1.33.0;python_version>='3.14'",
+    "sdmetrics>=0.28.0",
+    "sdv>=1.37.0",
 ]
 [project.urls]
@@ -163,7 +161,7 @@ namespaces = false
 version = {attr = 'sdgym.__version__'}
 [tool.bumpversion]
-current_version = "0.14.3.dev0"
+current_version = "0.14.4.dev0"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',

{sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
 __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
 __license__ = 'BSL-1.1'
-__version__ = '0.14.3.dev0'
+__version__ = '0.14.4.dev0'
 import logging

{sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/script.py RENAMED Viewed

@@ -1,17 +1,16 @@
 import argparse
-import warnings
+from itertools import product
 from sdgym._benchmark_launcher.benchmark_config import BenchmarkConfig
 from sdgym._benchmark_launcher.benchmark_launcher import BenchmarkLauncher
 from sdgym._benchmark_launcher.utils import (
     _deep_merge,
     _load_merged_modality_config,
+    _resolve_datasets,
     _resolve_modality_config,
 )
 from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS
-DEFAULT_NUM_INSTANCES = 1
 def _parse_args():
     """Parse CLI arguments for launching a benchmark."""
@@ -45,12 +44,6 @@ def _parse_args():
             'benchmark for the given modality.'
         ),
     )
-    parser.add_argument(
-        '--num-instances',
-        type=int,
-        default=None,
-        help='Number of benchmark instances to create. Defaults to 1.',
-    )
     parser.add_argument(
         '--timeout',
         type=int,
@@ -77,7 +70,6 @@ def _validate_args(args):
                 args.synthesizers,
                 args.output_destination,
                 args.timeout,
-                args.num_instances,
             )
         ):
             raise ValueError(
@@ -94,9 +86,6 @@ def _validate_args(args):
             "'--output-destination' is required when '--config-filepath' is not provided."
         )
-    if args.num_instances is not None and args.num_instances < 1:
-        raise ValueError("'--num-instances' must be greater than or equal to 1.")
     if args.output_destination == OUTPUT_DESTINATION_AWS:
         raise ValueError(
             f"'--output-destination' cannot be {OUTPUT_DESTINATION_AWS!r} that is reserved "
@@ -104,97 +93,25 @@ def _validate_args(args):
         )
-def _split_list(values):
-    """Split a list into two non-empty parts, as evenly as possible."""
-    midpoint = len(values) // 2
-    return values[:midpoint], values[midpoint:]
-def _instance_job_size(instance_job):
-    """Return the number of synthesizer and dataset combinations."""
-    return len(instance_job['synthesizers']) * len(instance_job['datasets'])
-def _split_instance_jobs(instance_job):
-    """Split an instance job into two smaller instance jobs.
-    Prefer splitting synthesizers. If there is only one synthesizer,
-    split datasets instead.
-    """
-    synthesizers = instance_job['synthesizers']
-    datasets = instance_job['datasets']
-    if len(synthesizers) > 1:
-        left_synthesizers, right_synthesizers = _split_list(synthesizers)
-        return [
-            {
-                'synthesizers': left_synthesizers,
-                'datasets': datasets,
-                'output_destination': instance_job['output_destination'],
-            },
-            {
-                'synthesizers': right_synthesizers,
-                'datasets': datasets,
-                'output_destination': instance_job['output_destination'],
-            },
-        ]
-    if len(datasets) > 1:
-        left_datasets, right_datasets = _split_list(datasets)
-        return [
-            {
-                'synthesizers': synthesizers,
-                'datasets': left_datasets,
-                'output_destination': instance_job['output_destination'],
-            },
-            {
-                'synthesizers': synthesizers,
-                'datasets': right_datasets,
-                'output_destination': instance_job['output_destination'],
-            },
-        ]
-    raise ValueError('Cannot split the instance job any further.')
-def _build_instance_artifacts(datasets, synthesizers, num_instances, output_destination):
-    """Build exactly ``num_instances`` instance jobs."""
-    max_jobs = len(synthesizers) * len(datasets)
-    if num_instances > max_jobs:
-        num_instances = max_jobs
-        warnings.warn(
-            f'num_instances is too high for the number of synthesizers and datasets. '
-            f'Maximum number of instances is {max_jobs}. Setting num_instances to {max_jobs}.'
-        )
-    instance_jobs = [
+def _build_instance_jobs(datasets, synthesizers, output_destination):
+    """Build one instance job per dataset and synthesizer pair."""
+    return [
         {
-            'synthesizers': list(synthesizers),
-            'datasets': list(datasets),
+            'synthesizers': [synthesizer],
+            'datasets': [dataset],
             'output_destination': output_destination,
         }
+        for dataset, synthesizer in product(datasets, synthesizers)
     ]
-    while len(instance_jobs) < num_instances:
-        split_index = None
-        split_size = -1
-        for index, instance_job in enumerate(instance_jobs):
-            if (_instance_job_size(instance_job) > 1) and (
-                _instance_job_size(instance_job) > split_size
-            ):
-                split_index = index
-                split_size = _instance_job_size(instance_job)
-        instance_job = instance_jobs.pop(split_index)
-        instance_jobs.extend(_split_instance_jobs(instance_job))
-    return instance_jobs
 def _get_default_datasets_and_synthesizers(modality):
     """Get the default datasets and synthesizers for a modality config."""
     base_dict = _load_merged_modality_config(modality)
-    datasets = base_dict.get(f'datasets_{modality}', [])
+    datasets = []
     synthesizers = []
     for instance_job in base_dict.get('instance_jobs', []):
+        datasets.extend(_resolve_datasets(instance_job.get('datasets', [])))
         synthesizers.extend(instance_job.get('synthesizers', []))
     return sorted(set(datasets)), sorted(set(synthesizers))
@@ -208,8 +125,7 @@ def build_dict_from_args(args):
     datasets = args.datasets
     synthesizers = args.synthesizers
-    num_instances = args.num_instances
-    if all(value is None for value in (datasets, synthesizers, num_instances)):
+    if all(value is None for value in (datasets, synthesizers)):
         config = _resolve_modality_config(args.modality)
         config['method_params'] = method_params
         for config_instance_job in config.get('instance_jobs', []):
@@ -220,13 +136,11 @@ def build_dict_from_args(args):
     default_datasets, default_synthesizers = _get_default_datasets_and_synthesizers(args.modality)
     datasets = datasets if datasets is not None else default_datasets
     synthesizers = synthesizers if synthesizers is not None else default_synthesizers
-    num_instances = num_instances if num_instances is not None else DEFAULT_NUM_INSTANCES
     return {
         'method_params': method_params,
-        'instance_jobs': _build_instance_artifacts(
+        'instance_jobs': _build_instance_jobs(
             datasets=datasets,
             synthesizers=synthesizers,
-            num_instances=num_instances,
             output_destination=args.output_destination,
         ),
     }
@@ -257,12 +171,11 @@ def launch_from_args():
     When building the configuration from command-line arguments:
-    - If ``--datasets``, ``--synthesizers``, and ``--num-instances`` are all
-      omitted, the default monthly benchmark configuration for the selected
-      modality is used.
-    - If ``--num-instances`` is omitted, it defaults to ``1``.
+    - If ``--datasets`` and ``--synthesizers`` are both omitted, the default
+      monthly benchmark configuration for the selected modality is used.
     - If ``--datasets`` or ``--synthesizers`` is omitted, the corresponding
       default values from the monthly benchmark configuration are used.
+    - One instance job is created for each dataset and synthesizer pair.
     Once the configuration is resolved, the benchmark is launched.
     """

{sdgym-0.14.3.dev0 → sdgym-0.14.4.dev0}/sdgym/_benchmark_launcher/utils.py RENAMED Viewed

@@ -5,6 +5,7 @@ import os
 import uuid
 from datetime import datetime
 from importlib.resources import files
+from itertools import product
 from urllib.parse import quote_plus
 import yaml
@@ -13,13 +14,111 @@ from sdgym._benchmark.benchmark import (
     _benchmark_multi_table_compute_gcp,
     _benchmark_single_table_compute_gcp,
 )
-from sdgym.run_benchmark.utils import get_s3_console_link
+from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, get_s3_console_link
 from sdgym.s3 import parse_s3_path
 _YAML_PKG = 'sdgym._benchmark_launcher'
-MODALITY_TO_CONFIG_FILE = {
-    'single_table': 'benchmark_single_table.yaml',
-    'multi_table': 'benchmark_multi_table.yaml',
+MODALITY_TO_JOB_SETUP = {
+    'single_table': {
+        'output_destination': OUTPUT_DESTINATION_AWS,
+        'datasets': [
+            'adult',
+            'alarm',
+            'census',
+            'child',
+            'covtype',
+            'expedia_hotel_logs',
+            'insurance',
+            'intrusion',
+            'news',
+        ],
+        'synthesizers': [
+            'ColumnSynthesizer',
+            'GaussianCopulaSynthesizer',
+            'CTGANSynthesizer',
+            'CopulaGANSynthesizer',
+            'TVAESynthesizer',
+            'SegmentSynthesizer',
+            'XGCSynthesizer',
+            'BootstrapSynthesizer',
+            'RealTabFormerSynthesizer',
+        ],
+    },
+    'multi_table': {
+        'output_destination': OUTPUT_DESTINATION_AWS,
+        'datasets': [
+            'rel-amazon',
+            'rel-arxiv',
+            'rel-avito',
+            'rel-event',
+            'rel-f1',
+            'rel-hm',
+            'rel-ratebeer',
+            'rel-salt',
+            'rel-stack',
+            'rel-trial',
+            'instacart_marketbasket_ml',
+            'MovieLens',
+            'rossmann',
+            'Telstra',
+            'walmart',
+            'WebKP',
+            'DCG',
+            'UW_std',
+            'Same_gen',
+            'CORA',
+            'got_families',
+            'SalesDB',
+            'UTube',
+            'Student_loan',
+            'Hepatitis_std',
+            'Elti',
+            'Bupa',
+            'Toxicology',
+            'imdb_ijs',
+            'ftp',
+            'imdb_small',
+            'imdb_MovieLens',
+            'Pima',
+            'university',
+            'legalActs',
+            'Dunur',
+            'Mesh',
+            'world',
+            'airbnb-simplified',
+            'trains',
+            'FNHK',
+            'fake_hotels',
+            'SAT',
+            'genes',
+            'Biodegradability',
+            'Pyrimidine',
+            'mutagenesis',
+            'restbase',
+            'Triazine',
+            'Carcinogenesis',
+            'fake_hotels_extended',
+            'Mooney_Family',
+            'PTE',
+            'Facebook',
+            'multi_table_ID_demo_dataset',
+            'SAP',
+            'Chess',
+            'Countries',
+            'NCAA',
+            'Atherosclerosis',
+            'nations',
+            'TubePricing',
+            'financial',
+            'Accidents',
+            'MuskSmall',
+            'NBA',
+            'AustralianFootball',
+            'PremierLeague',
+            'OMOP_CDM_dayz',
+        ],
+        'synthesizers': ['HMASynthesizer', 'HSASynthesizer', 'IndependentSynthesizer'],
+    },
 }
 CONFIG_KEYS = {
     'modality',
@@ -84,10 +183,24 @@ def resolve_compute(compute):
     raise ValueError(f"compute.service must be one of: 'gcp'. Found: {service}")
+def _get_modality_config(modality):
+    """Get the launchable benchmark config for a modality."""
+    result = []
+    job_setup = MODALITY_TO_JOB_SETUP.get(modality)
+    for dataset, synthesizer in product(job_setup['datasets'], job_setup['synthesizers']):
+        result.append({
+            'datasets': [dataset],
+            'synthesizers': [synthesizer],
+            'output_destination': job_setup['output_destination'],
+        })
+    return {'modality': modality, 'instance_jobs': result}
 def _load_merged_modality_config(modality):
     """Load and merge the base and modality-specific benchmark configs."""
     base_config = _load_yaml_resource('benchmark_base.yaml')
-    modality_config = _load_yaml_resource(MODALITY_TO_CONFIG_FILE[modality])
+    modality_config = _get_modality_config(modality)
     return _deep_merge(base_config, modality_config)
@@ -285,9 +398,7 @@ def _build_instance_artifact_filepaths(
     return (
         _build_s3_uri(output_destination, f'{artifact_key_prefix}/{metainfo_name}.yaml'),
         _build_s3_uri(output_destination, f'{artifact_key_prefix}/{results_name}.csv'),
-        _build_s3_uri(
-            output_destination, f'{modality_prefix}/job_args_list_{metainfo_name}.pkl.gz'
-        ),
+        _build_s3_uri(output_destination, f'{modality_prefix}/job_args_list_{metainfo_name}.pkl'),
     )

sdgym 0.14.3.dev0__tar.gz → 0.14.4.dev0__tar.gz

sdgym 0.14.3.dev0__tar.gz → 0.14.4.dev0__tar.gz