PyPI - sdgym - Versions diffs - 0.10.0.dev0__tar.gz → 0.10.1.dev0__tar.gz - Mend

sdgym 0.10.0.dev0__tar.gz → 0.10.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

{sdgym-0.10.0.dev0/sdgym.egg-info → sdgym-0.10.1.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: sdgym
-Version: 0.10.0.dev0
+Version: 0.10.1.dev0
 Summary: Benchmark tabular synthetic data generators using a variety of datasets
 Author-email: "DataCebo, Inc." <info@sdv.dev>
 License: BSL-1.1
@@ -15,13 +15,13 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: Free for non-commercial use
 Classifier: Natural Language :: English
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: <3.13,>=3.8
+Requires-Python: <3.14,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: appdirs>=1.3
@@ -29,59 +29,66 @@ Requires-Dist: boto3<2,>=1.28
 Requires-Dist: botocore<2,>=1.31
 Requires-Dist: cloudpickle>=2.1.0
 Requires-Dist: compress-pickle>=1.2.0
-Requires-Dist: humanfriendly>=8.2
-Requires-Dist: numpy>=1.21.6; python_version < "3.10"
-Requires-Dist: numpy>=1.23.3; python_version >= "3.10" and python_version < "3.12"
-Requires-Dist: numpy>=1.26.0; python_version >= "3.12"
+Requires-Dist: humanfriendly>=10.0
+Requires-Dist: numpy>=1.22.2; python_version < "3.10"
+Requires-Dist: numpy>=1.24.0; python_version >= "3.10" and python_version < "3.12"
+Requires-Dist: numpy>=1.26.0; python_version >= "3.12" and python_version < "3.13"
+Requires-Dist: numpy>=2.1.0; python_version >= "3.13"
 Requires-Dist: pandas>=1.4.0; python_version < "3.11"
 Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
-Requires-Dist: pandas>=2.1.1; python_version >= "3.12"
+Requires-Dist: pandas>=2.1.1; python_version >= "3.12" and python_version < "3.13"
+Requires-Dist: pandas>=2.2.3; python_version >= "3.13"
 Requires-Dist: psutil>=5.7
 Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
 Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
 Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
-Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12"
+Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12" and python_version < "3.13"
+Requires-Dist: scikit-learn>=1.5.2; python_version >= "3.13"
 Requires-Dist: scipy>=1.7.3; python_version < "3.10"
 Requires-Dist: scipy>=1.9.2; python_version >= "3.10" and python_version < "3.12"
-Requires-Dist: scipy>=1.12.0; python_version >= "3.12"
+Requires-Dist: scipy>=1.12.0; python_version >= "3.12" and python_version < "3.13"
+Requires-Dist: scipy>=1.14.1; python_version >= "3.13"
 Requires-Dist: tabulate<0.9,>=0.8.3
-Requires-Dist: torch>=1.12.1; python_version < "3.10"
-Requires-Dist: torch>=2.0.0; python_version >= "3.10" and python_version < "3.12"
-Requires-Dist: torch>=2.2.0; python_version >= "3.12"
+Requires-Dist: torch>=2.6.0
 Requires-Dist: tqdm>=4.66.3
 Requires-Dist: XlsxWriter>=1.2.8
-Requires-Dist: rdt>=1.13.1
-Requires-Dist: sdmetrics>=0.17.0
-Requires-Dist: sdv>=1.17.2
+Requires-Dist: rdt>=1.17.0
+Requires-Dist: sdmetrics>=0.20.1
+Requires-Dist: sdv>=1.21.0
 Provides-Extra: dask
 Requires-Dist: dask; extra == "dask"
 Requires-Dist: distributed; extra == "dask"
 Provides-Extra: realtabformer
-Requires-Dist: realtabformer>=0.2.2; extra == "realtabformer"
-Requires-Dist: torch>=2.0.0; (python_version >= "3.8" and python_version < "3.12") and extra == "realtabformer"
-Requires-Dist: torch>=2.2.0; python_version >= "3.12" and extra == "realtabformer"
+Requires-Dist: realtabformer>=0.2.3; extra == "realtabformer"
+Requires-Dist: torch>=2.6.0; extra == "realtabformer"
+Requires-Dist: transformers<4.51; extra == "realtabformer"
 Provides-Extra: test
 Requires-Dist: sdgym[realtabformer]; extra == "test"
 Requires-Dist: pytest>=6.2.5; extra == "test"
 Requires-Dist: pytest-cov>=2.6.0; extra == "test"
 Requires-Dist: jupyter<2,>=1.0.0; extra == "test"
-Requires-Dist: rundoc<0.5,>=0.4.3; extra == "test"
 Requires-Dist: tomli<3,>=2.0.0; extra == "test"
+Requires-Dist: slack-sdk<4.0,>=3.23; extra == "test"
+Requires-Dist: openpyxl>=3.0.0; python_version < "3.9" and extra == "test"
+Requires-Dist: openpyxl>=3.1.2; python_version >= "3.9" and extra == "test"
+Requires-Dist: pydrive2<2.0.0,>=1.4.0; extra == "test"
 Provides-Extra: dev
 Requires-Dist: sdgym[dask,test]; extra == "dev"
 Requires-Dist: build<2,>=1.0.0; extra == "dev"
-Requires-Dist: bump-my-version<1,>=0.18.3; extra == "dev"
+Requires-Dist: bump-my-version>=0.18.3; extra == "dev"
 Requires-Dist: pip>=9.0.1; extra == "dev"
 Requires-Dist: watchdog<5,>=1.0.1; extra == "dev"
 Requires-Dist: ruff<1,>=0.4.5; extra == "dev"
-Requires-Dist: twine<6,>=1.10.0; extra == "dev"
+Requires-Dist: twine>=1.10.0; extra == "dev"
 Requires-Dist: wheel>=0.30.0; extra == "dev"
 Requires-Dist: coverage<8,>=4.5.12; extra == "dev"
-Requires-Dist: tox<5,>=2.9.1; extra == "dev"
 Requires-Dist: importlib-metadata>=3.6; extra == "dev"
 Requires-Dist: invoke; extra == "dev"
+Provides-Extra: readme
+Requires-Dist: rundoc<0.5,>=0.4.3; extra == "readme"
 Provides-Extra: all
 Requires-Dist: sdgym[dask,dev,test]; extra == "all"
+Dynamic: license-file
 <div align="center">
 <br/>

{sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/pyproject.toml RENAMED Viewed

@@ -8,17 +8,17 @@ classifiers = [
     'License :: Free for non-commercial use',
     'Natural Language :: English',
     'Programming Language :: Python :: 3',
-    'Programming Language :: Python :: 3.8',
     'Programming Language :: Python :: 3.9',
     'Programming Language :: Python :: 3.10',
     'Programming Language :: Python :: 3.11',
     'Programming Language :: Python :: 3.12',
+    'Programming Language :: Python :: 3.13',
     'Topic :: Scientific/Engineering :: Artificial Intelligence',
 ]
 keywords = ['machine learning', 'synthetic data generation', 'benchmark', 'generative models']
 dynamic = ['version']
 license = { text = 'BSL-1.1' }
-requires-python = '>=3.8,<3.13'
+requires-python = '>=3.9,<3.14'
 readme = 'README.md'
 dependencies = [
     'appdirs>=1.3',
@@ -26,30 +26,32 @@ dependencies = [
     'botocore>=1.31,<2',
     'cloudpickle>=2.1.0',
     'compress-pickle>=1.2.0',
-    'humanfriendly>=8.2',
-    "numpy>=1.21.6;python_version<'3.10'",
-    "numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'",
-    "numpy>=1.26.0;python_version>='3.12'",
+    'humanfriendly>=10.0',
+    "numpy>=1.22.2;python_version<'3.10'",
+    "numpy>=1.24.0;python_version>='3.10' and python_version<'3.12'",
+    "numpy>=1.26.0;python_version>='3.12' and python_version<'3.13'",
+    "numpy>=2.1.0;python_version>='3.13'",
     "pandas>=1.4.0;python_version<'3.11'",
     "pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
-    "pandas>=2.1.1;python_version>='3.12'",
+    "pandas>=2.1.1;python_version>='3.12' and python_version<'3.13'",
+    "pandas>=2.2.3;python_version>='3.13'",
     'psutil>=5.7',
     "scikit-learn>=1.0.2;python_version<'3.10'",
     "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
     "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
-    "scikit-learn>=1.3.1;python_version>='3.12'",
+    "scikit-learn>=1.3.1;python_version>='3.12' and python_version<'3.13'",
+    "scikit-learn>=1.5.2;python_version>='3.13'",
     "scipy>=1.7.3;python_version<'3.10'",
     "scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'",
-    "scipy>=1.12.0;python_version>='3.12'",
+    "scipy>=1.12.0;python_version>='3.12' and python_version<'3.13'",
+    "scipy>=1.14.1;python_version>='3.13'",
     'tabulate>=0.8.3,<0.9',
-    "torch>=1.12.1;python_version<'3.10'",
-    "torch>=2.0.0;python_version>='3.10' and python_version<'3.12'",
-    "torch>=2.2.0;python_version>='3.12'",
+    "torch>=2.6.0",
     'tqdm>=4.66.3',
     'XlsxWriter>=1.2.8',
-    'rdt>=1.13.1',
-    'sdmetrics>=0.17.0',
-    'sdv>=1.17.2',
+    'rdt>=1.17.0',
+    'sdmetrics>=0.20.1',
+    'sdv>=1.21.0',
 ]
 [project.urls]
@@ -65,24 +67,27 @@ sdgym = { main = 'sdgym.cli.__main__:main' }
 [project.optional-dependencies]
 dask = ['dask', 'distributed']
 realtabformer = [
-    'realtabformer>=0.2.2',
-    "torch>=2.0.0;python_version>='3.8' and python_version<'3.12'",
-    "torch>=2.2.0;python_version>='3.12'",
+    'realtabformer>=0.2.3',
+    "torch>=2.6.0",
+    'transformers<4.51',
 ]
 test = [
     'sdgym[realtabformer]',
     'pytest>=6.2.5',
     'pytest-cov>=2.6.0',
     'jupyter>=1.0.0,<2',
-    'rundoc>=0.4.3,<0.5',
     'tomli>=2.0.0,<3',
+    'slack-sdk>=3.23,<4.0',
+    "openpyxl>=3.0.0; python_version<'3.9'",
+    "openpyxl>=3.1.2; python_version>='3.9'",
+    'pydrive2>=1.4.0,<2.0.0'
 ]
 dev = [
     'sdgym[dask, test]',
     # general
     'build>=1.0.0,<2',
-    'bump-my-version>=0.18.3,<1',
+    'bump-my-version>=0.18.3',
     'pip>=9.0.1',
     'watchdog>=1.0.1,<5',
@@ -90,17 +95,17 @@ dev = [
     'ruff>=0.4.5,<1',
     # distribute on PyPI
-    'twine>=1.10.0,<6',
+    'twine>=1.10.0',
     'wheel>=0.30.0',
     # Advanced testing
     'coverage>=4.5.12,<8',
-    'tox>=2.9.1,<5',
     'importlib-metadata>=3.6',
     # Invoke
     'invoke',
 ]
+readme = ['rundoc>=0.4.3,<0.5',]
 all = [
     'sdgym[dask, test, dev]',
 ]
@@ -140,7 +145,7 @@ namespaces = false
 version = {attr = 'sdgym.__version__'}
 [tool.bumpversion]
-current_version = "0.10.0.dev0"
+current_version = "0.10.1.dev0"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',
@@ -188,6 +193,8 @@ exclude = [
     "__pycache__",
     ".ipynb_checkpoints",
     "tasks.py",
+    "static_code_analysis.txt",
+    "*.ipynb"
 ]
 [tool.ruff.lint]
@@ -212,7 +219,6 @@ ignore = [
     # pydocstyle
     "D107",  # Missing docstring in __init__
     "D417",   # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
-    "PD901",
     "PD101",
 ]
@@ -237,4 +243,4 @@ convention = "google"
 [tool.ruff.lint.pycodestyle]
 max-doc-length = 100
-max-line-length = 100
+max-line-length = 100

{sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/__init__.py RENAMED Viewed

@@ -8,26 +8,30 @@ __author__ = 'DataCebo, Inc.'
 __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
 __license__ = 'BSL-1.1'
-__version__ = '0.10.0.dev0'
+__version__ = '0.10.1.dev0'
 import logging
 from sdgym.benchmark import benchmark_single_table
 from sdgym.cli.collect import collect_results
 from sdgym.cli.summary import make_summary_spreadsheet
+from sdgym.dataset_explorer import DatasetExplorer
 from sdgym.datasets import get_available_datasets, load_dataset
 from sdgym.synthesizers import create_sdv_synthesizer_variant, create_single_table_synthesizer
+from sdgym.result_explorer import ResultsExplorer
 # Clear the logging wrongfully configured by tensorflow/absl
 list(map(logging.root.removeHandler, logging.root.handlers))
 list(map(logging.root.removeFilter, logging.root.filters))
 __all__ = [
-    'load_dataset',
-    'collect_results',
-    'make_summary_spreadsheet',
+    'DatasetExplorer',
+    'ResultsExplorer',
     'benchmark_single_table',
-    'get_available_datasets',
+    'collect_results',
     'create_sdv_synthesizer_variant',
     'create_single_table_synthesizer',
+    'get_available_datasets',
+    'load_dataset',
+    'make_summary_spreadsheet',
 ]

sdgym-0.10.1.dev0/sdgym/_dataset_utils.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""Utility functions for handling datasets."""
+import json
+import logging
+from pathlib import Path
+from zipfile import ZipFile
+import numpy as np
+import pandas as pd
+LOGGER = logging.getLogger(__name__)
+def _parse_numeric_value(value, dataset_name, field_name, target_type=float):
+    """Generic parser for numeric values with logging and NaN fallback."""
+    try:
+        return target_type(value)
+    except (ValueError, TypeError):
+        LOGGER.info(
+            f"Could not cast {field_name} '{value}' to {target_type.__name__} for dataset "
+            f"'{dataset_name}' defaulting to NaN."
+        )
+        return np.nan
+def _get_dataset_subset(data, metadata_dict, modality):
+    """Limit the size of a dataset for faster evaluation or testing.
+    This function reduces a dataset to a smaller subset by restricting the number
+    of rows and columns to 1000 rows and 10 columns. It ensures that essential
+    columns—such as sequence indices and keys in sequential datasets—are always retained.
+    Args:
+        data (pd.DataFrame):
+            The dataset to be reduced.
+        metadata_dict (dict):
+            A dictionary containing the dataset's metadata.
+        modality (str):
+            The dataset modality. Must be one of: ``'single_table'``, ``'sequential'``.
+    Returns:
+        tuple[pd.DataFrame, dict]:
+            A tuple containing:
+            - The reduced dataset as a DataFrame.
+            - The updated metadata dictionary reflecting any removed columns.
+    Raises:
+        ValueError:
+            If the provided modality is ``'multi_table'``.
+    """
+    if modality == 'multi_table':
+        raise ValueError('limit_dataset_size is not supported for multi-table datasets.')
+    max_rows, max_columns = (1000, 10)
+    tables = metadata_dict.get('tables', {})
+    mandatory_columns = []
+    table_name, table_info = next(iter(tables.items()))
+    columns = table_info.get('columns', {})
+    keep_columns = list(columns)
+    if modality == 'sequential':
+        seq_index = table_info.get('sequence_index')
+        seq_key = table_info.get('sequence_key')
+        mandatory_columns = [col for col in (seq_index, seq_key) if col]
+    optional_columns = [col for col in columns if col not in mandatory_columns]
+    # If we have too many columns, drop extras but never mandatory ones
+    if len(columns) > max_columns:
+        keep_count = max_columns - len(mandatory_columns)
+        keep_columns = mandatory_columns + optional_columns[:keep_count]
+        table_info['columns'] = {
+            column_name: column_definition
+            for column_name, column_definition in columns.items()
+            if column_name in keep_columns
+        }
+    data = data[list(keep_columns)]
+    data = data.sample(max_rows)
+    return data, metadata_dict
+def _read_zipped_data(zip_file_path, modality):
+    data = {}
+    with ZipFile(zip_file_path, 'r') as zf:
+        for file_name in zf.namelist():
+            if file_name.endswith('.csv'):
+                key = Path(file_name).stem
+                data[key] = _read_csv_from_zip(zf, csv_file_name=file_name)
+    if modality != 'multi_table':
+        data = next(iter(data.values()))
+    return data
+def _read_csv_from_zip(zip_file, csv_file_name):
+    """Read a single CSV file from an open ZipFile and return a DataFrame."""
+    with zip_file.open(csv_file_name) as csv_file:
+        return pd.read_csv(csv_file, low_memory=False)
+def _read_metadata_json(metadata_path):
+    with open(metadata_path) as metadata_file:
+        metadata_dict = json.load(metadata_file)
+    return metadata_dict

sdgym 0.10.0.dev0__tar.gz → 0.10.1.dev0__tar.gz

sdgym 0.10.0.dev0__tar.gz → 0.10.1.dev0__tar.gz