PyPI - sdgym - Versions diffs - 0.9.0.dev0__tar.gz → 0.9.1.dev0__tar.gz - Mend

sdgym 0.9.0.dev0__tar.gz → 0.9.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sdgym
-Version: 0.9.0.dev0
+Version: 0.9.1.dev0
 Summary: Benchmark tabular synthetic data generators using a variety of datasets
 Author-email: "DataCebo, Inc." <info@sdv.dev>
 License: BSL-1.1
@@ -27,6 +27,7 @@ License-File: LICENSE
 Requires-Dist: appdirs>=1.3
 Requires-Dist: boto3<2,>=1.28
 Requires-Dist: botocore<2,>=1.31
+Requires-Dist: cloudpickle>=2.1.0
 Requires-Dist: compress-pickle>=1.2.0
 Requires-Dist: humanfriendly>=8.2
 Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"

{sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/pyproject.toml RENAMED Viewed

@@ -24,6 +24,7 @@ dependencies = [
     'appdirs>=1.3',
     'boto3>=1.28,<2',
     'botocore>=1.31,<2',
+    'cloudpickle>=2.1.0',
     'compress-pickle>=1.2.0',
     'humanfriendly>=8.2',
     "numpy>=1.21.0,<2.0.0;python_version<'3.10'",
@@ -133,7 +134,7 @@ namespaces = false
 version = {attr = 'sdgym.__version__'}
 [tool.bumpversion]
-current_version = "0.9.0.dev0"
+current_version = "0.9.1.dev0"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',

{sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
 __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
 __license__ = 'BSL-1.1'
-__version__ = '0.9.0.dev0'
+__version__ = '0.9.1.dev0'
 import logging

{sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/benchmark.py RENAMED Viewed

@@ -7,10 +7,12 @@ import os
 import pickle
 import tracemalloc
 import warnings
+from contextlib import contextmanager
 from datetime import datetime
 from pathlib import Path
 import boto3
+import cloudpickle
 import compress_pickle
 import numpy as np
 import pandas as pd
@@ -146,11 +148,18 @@ def _generate_job_args_list(
 def _synthesize(synthesizer_dict, real_data, metadata):
     synthesizer = synthesizer_dict['synthesizer']
-    assert issubclass(synthesizer, BaselineSynthesizer), '`synthesizer` must be a synthesizer class'
+    if isinstance(synthesizer, type):
+        assert issubclass(
+            synthesizer, BaselineSynthesizer
+        ), '`synthesizer` must be a synthesizer class'
+        synthesizer = synthesizer()
+    else:
+        assert issubclass(
+            type(synthesizer), BaselineSynthesizer
+        ), '`synthesizer` must be an instance of a synthesizer class.'
-    synthesizer_object = synthesizer()
-    get_synthesizer = synthesizer_object.get_trained_synthesizer
-    sample_from_synthesizer = synthesizer_object.sample_from_synthesizer
+    get_synthesizer = synthesizer.get_trained_synthesizer
+    sample_from_synthesizer = synthesizer.sample_from_synthesizer
     data = real_data.copy()
     num_samples = len(data)
@@ -190,7 +199,8 @@ def _compute_scores(
                 'metric': metric_name,
                 'error': 'Metric Timeout',
             })
-            output['scores'] = scores  # re-inject list to multiprocessing output
+            # re-inject list to multiprocessing output
+            output['scores'] = scores
             error = None
             score = None
@@ -213,7 +223,8 @@ def _compute_scores(
                 'error': error,
                 'metric_time': (datetime.utcnow() - start).total_seconds(),
             })
-            output['scores'] = scores  # re-inject list to multiprocessing output
+            # re-inject list to multiprocessing output
+            output['scores'] = scores
     if compute_diagnostic_score:
         start = datetime.utcnow()
@@ -264,7 +275,8 @@ def _score(
         )
         output['dataset_size'] = get_size_of(data) / N_BYTES_IN_MB
-        output['error'] = 'Synthesizer Timeout'  # To be deleted if there is no error
+        # To be deleted if there is no error
+        output['error'] = 'Synthesizer Timeout'
         synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize(
             synthesizer, data.copy(), metadata
         )
@@ -283,7 +295,8 @@ def _score(
             used_memory(),
         )
-        del output['error']  # No error so far. _compute_scores tracks its own errors by metric
+        # No error so far. _compute_scores tracks its own errors by metric
+        del output['error']
         _compute_scores(
             metrics,
             data,
@@ -314,6 +327,26 @@ def _score(
     return output
+@contextmanager
+def multiprocessing_context():
+    """Override multiprocessing ForkingPickler to use cloudpickle."""
+    original_dump = multiprocessing.reduction.ForkingPickler.dumps
+    original_load = multiprocessing.reduction.ForkingPickler.loads
+    original_method = multiprocessing.get_start_method()
+    multiprocessing.set_start_method('spawn', force=True)
+    multiprocessing.reduction.ForkingPickler.dumps = cloudpickle.dumps
+    multiprocessing.reduction.ForkingPickler.loads = cloudpickle.loads
+    try:
+        yield
+    finally:
+        # Restore original methods
+        multiprocessing.set_start_method(original_method, force=True)
+        multiprocessing.reduction.ForkingPickler.dumps = original_dump
+        multiprocessing.reduction.ForkingPickler.loads = original_load
 def _score_with_timeout(
     timeout,
     synthesizer,
@@ -325,32 +358,33 @@ def _score_with_timeout(
     modality=None,
     dataset_name=None,
 ):
-    with multiprocessing.Manager() as manager:
-        output = manager.dict()
-        process = multiprocessing.Process(
-            target=_score,
-            args=(
-                synthesizer,
-                data,
-                metadata,
-                metrics,
-                output,
-                compute_quality_score,
-                compute_diagnostic_score,
-                modality,
-                dataset_name,
-            ),
-        )
+    with multiprocessing_context():
+        with multiprocessing.Manager() as manager:
+            output = manager.dict()
+            process = multiprocessing.Process(
+                target=_score,
+                args=(
+                    synthesizer,
+                    data,
+                    metadata,
+                    metrics,
+                    output,
+                    compute_quality_score,
+                    compute_diagnostic_score,
+                    modality,
+                    dataset_name,
+                ),
+            )
-        process.start()
-        process.join(timeout)
-        process.terminate()
+            process.start()
+            process.join(timeout)
+            process.terminate()
-        output = dict(output)
-        if output.get('timeout'):
-            LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name)
+            output = dict(output)
+            if output.get('timeout'):
+                LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name)
-        return output
+            return output
 def _format_output(
@@ -756,6 +790,11 @@ def benchmark_single_table(
              'package_name': 'dask' or 'multiprocessing',
              'num_workers': 4
             }
+        run_on_ec2 (bool):
+            The flag is used to run the benchmark on an EC2 instance that will be created
+            by a script using the authentication of the current user. The EC2 instance
+            uses the LATEST released version of sdgym. Local changes or changes NOT
+            in the released version will NOT be used in the ec2 instance.
     Returns:
         pandas.DataFrame:

{sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/synthesizers/generate.py RENAMED Viewed

@@ -124,7 +124,7 @@ def create_single_table_synthesizer(
                 obj:
                     The trained synthesizer.
             """
-            return get_trained_synthesizer_fn(data, metadata)
+            return self.synthesizer_fn['get_trained_synthesizer_fn'](data, metadata)
         def sample_from_synthesizer(self, synthesizer, num_samples):
             """Sample the desired number of samples from the given synthesizer.
@@ -139,11 +139,22 @@ def create_single_table_synthesizer(
                 pandas.DataFrame:
                     The synthetic data.
             """
-            return sample_from_synthesizer_fn(synthesizer, num_samples)
-    NewSynthesizer.__name__ = f'Custom:{display_name}'
-    return NewSynthesizer
+            return self.synthesizer_fn['sample_from_synthesizer_fn'](synthesizer, num_samples)
+    CustomSynthesizer = type(
+        f'Custom:{display_name}',
+        (NewSynthesizer,),
+        {
+            'synthesizer_fn': {
+                'get_trained_synthesizer_fn': get_trained_synthesizer_fn,
+                'sample_from_synthesizer_fn': sample_from_synthesizer_fn,
+            },
+        },
+    )
+    CustomSynthesizer.__name__ = f'Custom:{display_name}'
+    CustomSynthesizer.__module__ = 'sdgym.synthesizers.generate'
+    globals()[f'Custom:{display_name}'] = CustomSynthesizer
+    return CustomSynthesizer
 def create_multi_table_synthesizer(

{sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym/utils.py RENAMED Viewed

@@ -60,8 +60,12 @@ def get_synthesizers(synthesizers):
             else:
                 raise SDGymError(f'Unknown synthesizer {synthesizer}') from None
+        if isinstance(synthesizer, type) or hasattr(synthesizer, '__name__'):
+            synthesizer_name = getattr(synthesizer, '__name__', 'undefined')
+        else:
+            synthesizer_name = getattr(type(synthesizer), '__name__', 'undefined')
         synthesizers_dicts.append({
-            'name': getattr(synthesizer, '__name__', 'undefined'),
+            'name': synthesizer_name,
             'synthesizer': synthesizer,
         })

{sdgym-0.9.0.dev0 → sdgym-0.9.1.dev0}/sdgym.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sdgym
-Version: 0.9.0.dev0
+Version: 0.9.1.dev0
 Summary: Benchmark tabular synthetic data generators using a variety of datasets
 Author-email: "DataCebo, Inc." <info@sdv.dev>
 License: BSL-1.1
@@ -27,6 +27,7 @@ License-File: LICENSE
 Requires-Dist: appdirs>=1.3
 Requires-Dist: boto3<2,>=1.28
 Requires-Dist: botocore<2,>=1.31
+Requires-Dist: cloudpickle>=2.1.0
 Requires-Dist: compress-pickle>=1.2.0
 Requires-Dist: humanfriendly>=8.2
 Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"