PyPI - sdgym - Versions diffs - 0.8.0.dev1__tar.gz → 0.9.0.dev0__tar.gz - Mend

sdgym 0.8.0.dev1__tar.gz → 0.9.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sdgym
-Version: 0.8.0.dev1
+Version: 0.9.0.dev0
 Summary: Benchmark tabular synthetic data generators using a variety of datasets
 Author-email: "DataCebo, Inc." <info@sdv.dev>
 License: BSL-1.1
@@ -29,9 +29,9 @@ Requires-Dist: boto3<2,>=1.28
 Requires-Dist: botocore<2,>=1.31
 Requires-Dist: compress-pickle>=1.2.0
 Requires-Dist: humanfriendly>=8.2
-Requires-Dist: numpy>=1.21.0; python_version < "3.10"
-Requires-Dist: numpy<2,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
-Requires-Dist: numpy<2,>=1.26.0; python_version >= "3.12"
+Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
+Requires-Dist: numpy<2.0.0,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
+Requires-Dist: numpy<2.0.0,>=1.26.0; python_version >= "3.12"
 Requires-Dist: pandas>=1.4.0; python_version < "3.11"
 Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
 Requires-Dist: pandas>=2.1.1; python_version >= "3.12"

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/pyproject.toml RENAMED Viewed

@@ -26,9 +26,9 @@ dependencies = [
     'botocore>=1.31,<2',
     'compress-pickle>=1.2.0',
     'humanfriendly>=8.2',
-    "numpy>=1.21.0;python_version<'3.10'",
-    "numpy>=1.23.3,<2;python_version>='3.10' and python_version<'3.12'",
-    "numpy>=1.26.0,<2;python_version>='3.12'",
+    "numpy>=1.21.0,<2.0.0;python_version<'3.10'",
+    "numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'",
+    "numpy>=1.26.0,<2.0.0;python_version>='3.12'",
     "pandas>=1.4.0;python_version<'3.11'",
     "pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
     "pandas>=2.1.1;python_version>='3.12'",
@@ -133,7 +133,7 @@ namespaces = false
 version = {attr = 'sdgym.__version__'}
 [tool.bumpversion]
-current_version = "0.8.0.dev1"
+current_version = "0.9.0.dev0"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',
@@ -179,7 +179,8 @@ exclude = [
     ".tox",
     ".git",
     "__pycache__",
-    ".ipynb_checkpoints"
+    ".ipynb_checkpoints",
+    "tasks.py",
 ]
 [tool.ruff.lint]
@@ -189,14 +190,22 @@ select = [
     # Pycodestyle
     "E",
     "W",
-    "D200",
+    # pydocstyle
+    "D",
     # isort
     "I001",
+    # print statements
+    "T201",
+    # pandas-vet
+    "PD"
 ]
 ignore = [
     "E501",
+    # pydocstyle
     "D107",  # Missing docstring in __init__
     "D417",   # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
+    "PD901",
+    "PD101",
 ]
 [tool.ruff.format]
@@ -206,14 +215,18 @@ preview = true
 docstring-code-format = true
 docstring-code-line-length = "dynamic"
-[tool.ruff.lint.pep8-naming]
-extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"]
 [tool.ruff.lint.isort]
 known-first-party = ["sdgym"]
+lines-between-types = 0
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"]
+"errors.py" = ["D105"]
+"tests/**.py" = ["D"]
 [tool.ruff.lint.pydocstyle]
-convention = "google"
+convention = "google"
+[tool.ruff.lint.pycodestyle]
+max-doc-length = 100
+max-line-length = 100

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ __author__ = 'DataCebo, Inc.'
 __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
 __license__ = 'BSL-1.1'
-__version__ = '0.8.0.dev1'
+__version__ = '0.9.0.dev0'
 import logging

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/benchmark.py RENAMED Viewed

@@ -15,8 +15,18 @@ import compress_pickle
 import numpy as np
 import pandas as pd
 import tqdm
-from sdmetrics.reports.multi_table import QualityReport as MultiTableQualityReport
-from sdmetrics.reports.single_table import QualityReport as SingleTableQualityReport
+from sdmetrics.reports.multi_table import (
+    DiagnosticReport as MultiTableDiagnosticReport,
+)
+from sdmetrics.reports.multi_table import (
+    QualityReport as MultiTableQualityReport,
+)
+from sdmetrics.reports.single_table import (
+    DiagnosticReport as SingleTableDiagnosticReport,
+)
+from sdmetrics.reports.single_table import (
+    QualityReport as SingleTableQualityReport,
+)
 from sdgym.datasets import get_dataset_paths, load_dataset
 from sdgym.errors import SDGymError
@@ -88,6 +98,7 @@ def _generate_job_args_list(
     detailed_results_folder,
     timeout,
     compute_quality_score,
+    compute_diagnostic_score,
     synthesizers,
     custom_synthesizers,
 ):
@@ -124,6 +135,7 @@ def _generate_job_args_list(
             detailed_results_folder,
             timeout,
             compute_quality_score,
+            compute_diagnostic_score,
             dataset.name,
             'single_table',
         )
@@ -164,6 +176,7 @@ def _compute_scores(
     metadata,
     output,
     compute_quality_score,
+    compute_diagnostic_score,
     modality,
     dataset_name,
 ):
@@ -202,6 +215,17 @@ def _compute_scores(
             })
             output['scores'] = scores  # re-inject list to multiprocessing output
+    if compute_diagnostic_score:
+        start = datetime.utcnow()
+        if modality == 'single_table':
+            diagnostic_report = SingleTableDiagnosticReport()
+        else:
+            diagnostic_report = MultiTableDiagnosticReport()
+        diagnostic_report.generate(real_data, synthetic_data, metadata, verbose=False)
+        output['diagnostic_score_time'] = (datetime.utcnow() - start).total_seconds()
+        output['diagnostic_score'] = diagnostic_report.get_score()
     if compute_quality_score:
         start = datetime.utcnow()
         if modality == 'single_table':
@@ -221,6 +245,7 @@ def _score(
     metrics,
     output=None,
     compute_quality_score=False,
+    compute_diagnostic_score=False,
     modality=None,
     dataset_name=None,
 ):
@@ -266,6 +291,7 @@ def _score(
             metadata,
             output,
             compute_quality_score,
+            compute_diagnostic_score,
             modality,
             dataset_name,
         )
@@ -295,6 +321,7 @@ def _score_with_timeout(
     metadata,
     metrics,
     compute_quality_score=False,
+    compute_diagnostic_score=False,
     modality=None,
     dataset_name=None,
 ):
@@ -309,6 +336,7 @@ def _score_with_timeout(
                 metrics,
                 output,
                 compute_quality_score,
+                compute_diagnostic_score,
                 modality,
                 dataset_name,
             ),
@@ -325,15 +353,26 @@ def _score_with_timeout(
         return output
-def _format_output(output, name, dataset_name, compute_quality_score, cache_dir):
-    evaluate_time = None
-    if 'scores' in output or 'quality_score_time' in output:
-        evaluate_time = output.get('quality_score_time', 0)
+def _format_output(
+    output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir
+):
+    evaluate_time = 0
+    if 'quality_score_time' in output:
+        evaluate_time += output.get('quality_score_time', 0)
+    if 'diagnostic_score_time' in output:
+        evaluate_time += output.get('diagnostic_score_time', 0)
     for score in output.get('scores', []):
-        if score['metric'] == 'NewRowSynthesis':
+        if 'metric_time' in score and not np.isnan(score['metric_time']):
             evaluate_time += score['metric_time']
+    if (
+        'quality_score_time' not in output
+        and 'scores' not in output
+        and 'diagnostic_score_time' not in output
+    ):
+        evaluate_time = None
     scores = pd.DataFrame({
         'Synthesizer': [name],
         'Dataset': [dataset_name],
@@ -345,6 +384,9 @@ def _format_output(output, name, dataset_name, compute_quality_score, cache_dir)
         'Evaluate_Time': [evaluate_time],
     })
+    if compute_diagnostic_score:
+        scores.insert(len(scores.columns), 'Diagnostic_Score', output.get('diagnostic_score'))
     if compute_quality_score:
         scores.insert(len(scores.columns), 'Quality_Score', output.get('quality_score'))
@@ -381,6 +423,7 @@ def _run_job(args):
         cache_dir,
         timeout,
         compute_quality_score,
+        compute_diagnostic_score,
         dataset_name,
         modality,
     ) = args
@@ -404,6 +447,7 @@ def _run_job(args):
                 metadata=metadata,
                 metrics=metrics,
                 compute_quality_score=compute_quality_score,
+                compute_diagnostic_score=compute_diagnostic_score,
                 modality=modality,
                 dataset_name=dataset_name,
             )
@@ -414,13 +458,16 @@ def _run_job(args):
                 metadata=metadata,
                 metrics=metrics,
                 compute_quality_score=compute_quality_score,
+                compute_diagnostic_score=compute_diagnostic_score,
                 modality=modality,
                 dataset_name=dataset_name,
             )
     except Exception as error:
         output['exception'] = error
-    scores = _format_output(output, name, dataset_name, compute_quality_score, cache_dir)
+    scores = _format_output(
+        output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir
+    )
     return scores
@@ -482,7 +529,7 @@ def _run_jobs(multi_processing_config, job_args_list, show_progress):
     return scores
-def _get_empty_dataframe(compute_quality_score, sdmetrics):
+def _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics):
     warnings.warn('No datasets/synthesizers found.')
     scores = pd.DataFrame({
@@ -496,6 +543,8 @@ def _get_empty_dataframe(compute_quality_score, sdmetrics):
         'Evaluate_Time': [],
     })
+    if compute_diagnostic_score:
+        scores['Diagnostic_Score'] = []
     if compute_quality_score:
         scores['Quality_Score'] = []
     if sdmetrics:
@@ -564,7 +613,7 @@ from io import StringIO
 import sdgym
 from sdgym.synthesizers.sdv import (CopulaGANSynthesizer, CTGANSynthesizer,
     GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, SDVRelationalSynthesizer,
-    SDVTabularSynthesizer,TVAESynthesizer)
+    SDVTabularSynthesizer, TVAESynthesizer)
 results = sdgym.benchmark_single_table(
     {synthesizer_string}, custom_synthesizers={params['custom_synthesizers']},
@@ -572,6 +621,7 @@ results = sdgym.benchmark_single_table(
     additional_datasets_folder={params['additional_datasets_folder']},
     limit_dataset_size={params['limit_dataset_size']},
     compute_quality_score={params['compute_quality_score']},
+    compute_diagnostic_score={params['compute_diagnostic_score']},
     sdmetrics={params['sdmetrics']}, timeout={params['timeout']},
     detailed_results_folder={params['detailed_results_folder']},
     multi_processing_config={params['multi_processing_config']}
@@ -643,6 +693,7 @@ def benchmark_single_table(
     additional_datasets_folder=None,
     limit_dataset_size=False,
     compute_quality_score=True,
+    compute_diagnostic_score=True,
     sdmetrics=DEFAULT_METRICS,
     timeout=None,
     output_filepath=None,
@@ -680,6 +731,8 @@ def benchmark_single_table(
             columns.
         compute_quality_score (bool):
             Whether or not to evaluate an overall quality score.
+        compute_diagnostic_score (bool):
+            Whether or not to evaluate an overall diagnostic score.
         sdmetrics (list[str]):
             A list of the different SDMetrics to use. If you'd like to input specific parameters
             into the metric, provide a tuple with the metric name followed by a dictionary of
@@ -729,6 +782,7 @@ def benchmark_single_table(
         detailed_results_folder,
         timeout,
         compute_quality_score,
+        compute_diagnostic_score,
         synthesizers,
         custom_synthesizers,
     )
@@ -738,7 +792,7 @@ def benchmark_single_table(
     # If no synthesizers/datasets are passed, return an empty dataframe
     else:
-        scores = _get_empty_dataframe(compute_quality_score, sdmetrics)
+        scores = _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics)
     if output_filepath:
         write_csv(scores, output_filepath, None, None)

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/__main__.py RENAMED Viewed

@@ -41,13 +41,13 @@ def _print_table(data, sort=None, reverse=False, format=None):
     if 'error' in data:
         error = data['error']
-        if pd.isnull(error).all():
+        if pd.isna(error).all():
             del data['error']
         else:
             long_error = error.str.len() > 30
             data.loc[long_error, 'error'] = error[long_error].str[:30] + '...'
-    print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False))
+    print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False))  # noqa: T201
 def _run(args):
@@ -110,7 +110,7 @@ def _download_datasets(args):
 def _list_downloaded(args):
     datasets = sdgym.cli.utils.get_downloaded_datasets(args.datasets_path)
     _print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})
-    print(f'Found {len(datasets)} downloaded datasets')
+    print(f'Found {len(datasets)} downloaded datasets')  # noqa: T201
 def _list_available(args):
@@ -395,7 +395,7 @@ def main():
     try:
         args.action(args)
     except sdgym.errors.SDGymError as error:
-        print(f'ERROR: {error}')
+        print(f'ERROR: {error}')  # noqa: T201
 if __name__ == '__main__':

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/collect.py RENAMED Viewed

@@ -22,7 +22,7 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None)
             If an ``aws_secret`` is provided, the given secret access key will be used to read
             from and/or write to any s3 paths.
     """
-    print(f'Reading results from {input_path}')
+    print(f'Reading results from {input_path}')  # noqa: T201
     scores = read_csv_from_path(input_path, aws_key, aws_secret)
     scores = scores.drop_duplicates()
@@ -31,5 +31,5 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None)
     else:
         output = f'{input_path}/results.csv'
-    print(f'Storing results at {output}')
+    print(f'Storing results at {output}')  # noqa: T201
     write_csv(scores, output, aws_key, aws_secret)

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/summary.py RENAMED Viewed

@@ -15,7 +15,7 @@ KNOWN_ERRORS = (
 )
 MODALITY_BASELINES = {
-    'single-table': ['Uniform', 'Independent', 'CLBN', 'PrivBN'],
+    'single-table': ['Uniform', 'Column', 'CLBN', 'PrivBN'],
     'multi-table': ['Uniform', 'Independent'],
     'timeseries': [],
 }
@@ -46,7 +46,7 @@ def preprocess(data):
 def _coverage(data):
     total = len(data.Dataset.unique())
-    scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notnull().sum())
+    scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum())
     coverage_perc = scores / total
     coverage_str = scores.astype(str) + f' / {total}'
     return coverage_perc, coverage_str
@@ -102,7 +102,7 @@ def summarize(data, baselines=(), datasets=None):
     no_identity = data[data.Synthesizer != 'DataIdentity']
     coverage_perc, coverage_str = _coverage(data)
-    solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notnull().sum())
+    solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum())
     results = {
         'total': len(data.Dataset.unique()),
@@ -127,7 +127,7 @@ def summarize(data, baselines=(), datasets=None):
         for _, error_column in KNOWN_ERRORS:
             results[error_column] = grouped[error_column].sum()
-        results['errors'] = grouped.error.apply(lambda x: x.notnull().sum())
+        results['errors'] = grouped.error.apply(lambda x: x.notna().sum())
         total_errors = results['errors']
         results['metric_errors'] = results['total'] - results['solved'] - total_errors
@@ -160,7 +160,7 @@ def errors_summary(data):
     """
     if 'error' in data.columns:
         all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'error': 'all'})
-        synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).unstack(level=0)
+        synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).pivot_table(level=0)
         for synthesizer, errors in synthesizer_errors.items():
             all_errors[synthesizer] = errors.fillna(0).astype(int)
@@ -217,7 +217,7 @@ def _find_library(synthesizer):
 def _add_summary_libraries(summary_data):
     summary_data['library'] = summary_data.index.map(_find_library)
-    summary_data['library'].fillna('Other', inplace=True)
+    summary_data['library'] = summary_data['library'].fillna('Other')
     return summary_data
@@ -240,7 +240,7 @@ def _add_summary(data, modality, baselines, writer):
         },
         axis=1,
     )
-    summary.drop(index='Identity', inplace=True, errors='ignore')
+    summary = summary.drop(index='Identity', errors='ignore')
     summary = _add_summary_libraries(summary)
     beat_baseline_headers = ['beat_' + b.lower() for b in baselines]

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/cli/utils.py RENAMED Viewed

@@ -67,7 +67,7 @@ def read_csv_from_path(path, aws_key, aws_secret):
     All csv content within a path will be read and returned in a
     DataFrame. The path can be either local or an s3 directory.
-    args:
+    Args:
         path (str):
             The path to read from, which can be either local or an s3 path.
         aws_key (str):

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym/synthesizers/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ from sdgym.synthesizers.generate import (
     create_single_table_synthesizer,
 )
 from sdgym.synthesizers.identity import DataIdentity
-from sdgym.synthesizers.independent import IndependentSynthesizer
+from sdgym.synthesizers.column import ColumnSynthesizer
 from sdgym.synthesizers.sdv import (
     CopulaGANSynthesizer,
     CTGANSynthesizer,
@@ -23,7 +23,7 @@ from sdgym.synthesizers.uniform import UniformSynthesizer
 __all__ = (
     'DataIdentity',
-    'IndependentSynthesizer',
+    'ColumnSynthesizer',
     'CTGANSynthesizer',
     'TVAESynthesizer',
     'UniformSynthesizer',

sdgym-0.8.0.dev1/sdgym/synthesizers/independent.py → sdgym-0.9.0.dev0/sdgym/synthesizers/column.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""IndependentSynthesizer module."""
+"""ColumnSynthesizer module."""
 import pandas as pd
 from rdt.hyper_transformer import HyperTransformer
@@ -7,7 +7,7 @@ from sklearn.mixture import GaussianMixture
 from sdgym.synthesizers.base import BaselineSynthesizer
-class IndependentSynthesizer(BaselineSynthesizer):
+class ColumnSynthesizer(BaselineSynthesizer):
     """Synthesizer that learns each column independently.
     Categorical columns are sampled using empirical frequencies.

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sdgym
-Version: 0.8.0.dev1
+Version: 0.9.0.dev0
 Summary: Benchmark tabular synthetic data generators using a variety of datasets
 Author-email: "DataCebo, Inc." <info@sdv.dev>
 License: BSL-1.1
@@ -29,9 +29,9 @@ Requires-Dist: boto3<2,>=1.28
 Requires-Dist: botocore<2,>=1.31
 Requires-Dist: compress-pickle>=1.2.0
 Requires-Dist: humanfriendly>=8.2
-Requires-Dist: numpy>=1.21.0; python_version < "3.10"
-Requires-Dist: numpy<2,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
-Requires-Dist: numpy<2,>=1.26.0; python_version >= "3.12"
+Requires-Dist: numpy<2.0.0,>=1.21.0; python_version < "3.10"
+Requires-Dist: numpy<2.0.0,>=1.23.3; python_version >= "3.10" and python_version < "3.12"
+Requires-Dist: numpy<2.0.0,>=1.26.0; python_version >= "3.12"
 Requires-Dist: pandas>=1.4.0; python_version < "3.11"
 Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
 Requires-Dist: pandas>=2.1.1; python_version >= "3.12"

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/SOURCES.txt RENAMED Viewed

@@ -22,9 +22,9 @@ sdgym/cli/summary.py
 sdgym/cli/utils.py
 sdgym/synthesizers/__init__.py
 sdgym/synthesizers/base.py
+sdgym/synthesizers/column.py
 sdgym/synthesizers/generate.py
 sdgym/synthesizers/identity.py
-sdgym/synthesizers/independent.py
 sdgym/synthesizers/sdv.py
 sdgym/synthesizers/uniform.py
 tests/test_tasks.py

{sdgym-0.8.0.dev1 → sdgym-0.9.0.dev0}/sdgym.egg-info/requires.txt RENAMED Viewed

@@ -12,7 +12,7 @@ sdmetrics>=0.14.1
 sdv>=1.13.1
 [:python_version < "3.10"]
-numpy>=1.21.0
+numpy<2.0.0,>=1.21.0
 scikit-learn>=1.0.2
 scipy>=1.7.3
 torch>=1.9.0
@@ -24,7 +24,7 @@ pandas>=1.4.0
 scikit-learn>=1.1.0
 [:python_version >= "3.10" and python_version < "3.12"]
-numpy<2,>=1.23.3
+numpy<2.0.0,>=1.23.3
 scipy>=1.9.2
 torch>=2.0.0
@@ -33,7 +33,7 @@ pandas>=1.5.0
 scikit-learn>=1.1.3
 [:python_version >= "3.12"]
-numpy<2,>=1.26.0
+numpy<2.0.0,>=1.26.0
 pandas>=2.1.1
 scikit-learn>=1.3.1
 scipy>=1.12.0