sdgym 0.10.0.dev0__tar.gz → 0.10.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {sdgym-0.10.0.dev0/sdgym.egg-info → sdgym-0.10.1.dev0}/PKG-INFO +31 -24
  2. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/pyproject.toml +31 -25
  3. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/__init__.py +9 -5
  4. sdgym-0.10.1.dev0/sdgym/_dataset_utils.py +107 -0
  5. sdgym-0.10.1.dev0/sdgym/benchmark.py +1606 -0
  6. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/__main__.py +19 -8
  7. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/collect.py +11 -9
  8. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/summary.py +7 -3
  9. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/utils.py +17 -13
  10. sdgym-0.10.1.dev0/sdgym/dataset_explorer.py +277 -0
  11. sdgym-0.10.1.dev0/sdgym/datasets.py +330 -0
  12. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/progress.py +1 -3
  13. sdgym-0.10.1.dev0/sdgym/result_explorer/__init__.py +5 -0
  14. sdgym-0.10.1.dev0/sdgym/result_explorer/result_explorer.py +121 -0
  15. sdgym-0.10.1.dev0/sdgym/result_explorer/result_handler.py +398 -0
  16. sdgym-0.10.1.dev0/sdgym/result_writer.py +147 -0
  17. sdgym-0.10.1.dev0/sdgym/s3.py +240 -0
  18. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/generate.py +4 -0
  19. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/realtabformer.py +2 -2
  20. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/uniform.py +9 -1
  21. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/utils.py +32 -0
  22. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0/sdgym.egg-info}/PKG-INFO +31 -24
  23. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/SOURCES.txt +7 -0
  24. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/requires.txt +30 -21
  25. sdgym-0.10.1.dev0/tests/test_scripts.py +37 -0
  26. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/tests/test_tasks.py +1 -1
  27. sdgym-0.10.0.dev0/sdgym/benchmark.py +0 -839
  28. sdgym-0.10.0.dev0/sdgym/datasets.py +0 -227
  29. sdgym-0.10.0.dev0/sdgym/s3.py +0 -146
  30. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/LICENSE +0 -0
  31. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/README.md +0 -0
  32. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/cli/__init__.py +0 -0
  33. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/errors.py +0 -0
  34. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/metrics.py +0 -0
  35. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/__init__.py +0 -0
  36. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/base.py +0 -0
  37. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/column.py +0 -0
  38. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/identity.py +0 -0
  39. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym/synthesizers/sdv.py +0 -0
  40. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/dependency_links.txt +0 -0
  41. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/entry_points.txt +0 -0
  42. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/sdgym.egg-info/top_level.txt +0 -0
  43. {sdgym-0.10.0.dev0 → sdgym-0.10.1.dev0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: sdgym
3
- Version: 0.10.0.dev0
3
+ Version: 0.10.1.dev0
4
4
  Summary: Benchmark tabular synthetic data generators using a variety of datasets
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License: BSL-1.1
@@ -15,13 +15,13 @@ Classifier: Intended Audience :: Developers
15
15
  Classifier: License :: Free for non-commercial use
16
16
  Classifier: Natural Language :: English
17
17
  Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.8
19
18
  Classifier: Programming Language :: Python :: 3.9
20
19
  Classifier: Programming Language :: Python :: 3.10
21
20
  Classifier: Programming Language :: Python :: 3.11
22
21
  Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
- Requires-Python: <3.13,>=3.8
24
+ Requires-Python: <3.14,>=3.9
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
27
  Requires-Dist: appdirs>=1.3
@@ -29,59 +29,66 @@ Requires-Dist: boto3<2,>=1.28
29
29
  Requires-Dist: botocore<2,>=1.31
30
30
  Requires-Dist: cloudpickle>=2.1.0
31
31
  Requires-Dist: compress-pickle>=1.2.0
32
- Requires-Dist: humanfriendly>=8.2
33
- Requires-Dist: numpy>=1.21.6; python_version < "3.10"
34
- Requires-Dist: numpy>=1.23.3; python_version >= "3.10" and python_version < "3.12"
35
- Requires-Dist: numpy>=1.26.0; python_version >= "3.12"
32
+ Requires-Dist: humanfriendly>=10.0
33
+ Requires-Dist: numpy>=1.22.2; python_version < "3.10"
34
+ Requires-Dist: numpy>=1.24.0; python_version >= "3.10" and python_version < "3.12"
35
+ Requires-Dist: numpy>=1.26.0; python_version >= "3.12" and python_version < "3.13"
36
+ Requires-Dist: numpy>=2.1.0; python_version >= "3.13"
36
37
  Requires-Dist: pandas>=1.4.0; python_version < "3.11"
37
38
  Requires-Dist: pandas>=1.5.0; python_version >= "3.11" and python_version < "3.12"
38
- Requires-Dist: pandas>=2.1.1; python_version >= "3.12"
39
+ Requires-Dist: pandas>=2.1.1; python_version >= "3.12" and python_version < "3.13"
40
+ Requires-Dist: pandas>=2.2.3; python_version >= "3.13"
39
41
  Requires-Dist: psutil>=5.7
40
42
  Requires-Dist: scikit-learn>=1.0.2; python_version < "3.10"
41
43
  Requires-Dist: scikit-learn>=1.1.0; python_version >= "3.10" and python_version < "3.11"
42
44
  Requires-Dist: scikit-learn>=1.1.3; python_version >= "3.11" and python_version < "3.12"
43
- Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12"
45
+ Requires-Dist: scikit-learn>=1.3.1; python_version >= "3.12" and python_version < "3.13"
46
+ Requires-Dist: scikit-learn>=1.5.2; python_version >= "3.13"
44
47
  Requires-Dist: scipy>=1.7.3; python_version < "3.10"
45
48
  Requires-Dist: scipy>=1.9.2; python_version >= "3.10" and python_version < "3.12"
46
- Requires-Dist: scipy>=1.12.0; python_version >= "3.12"
49
+ Requires-Dist: scipy>=1.12.0; python_version >= "3.12" and python_version < "3.13"
50
+ Requires-Dist: scipy>=1.14.1; python_version >= "3.13"
47
51
  Requires-Dist: tabulate<0.9,>=0.8.3
48
- Requires-Dist: torch>=1.12.1; python_version < "3.10"
49
- Requires-Dist: torch>=2.0.0; python_version >= "3.10" and python_version < "3.12"
50
- Requires-Dist: torch>=2.2.0; python_version >= "3.12"
52
+ Requires-Dist: torch>=2.6.0
51
53
  Requires-Dist: tqdm>=4.66.3
52
54
  Requires-Dist: XlsxWriter>=1.2.8
53
- Requires-Dist: rdt>=1.13.1
54
- Requires-Dist: sdmetrics>=0.17.0
55
- Requires-Dist: sdv>=1.17.2
55
+ Requires-Dist: rdt>=1.17.0
56
+ Requires-Dist: sdmetrics>=0.20.1
57
+ Requires-Dist: sdv>=1.21.0
56
58
  Provides-Extra: dask
57
59
  Requires-Dist: dask; extra == "dask"
58
60
  Requires-Dist: distributed; extra == "dask"
59
61
  Provides-Extra: realtabformer
60
- Requires-Dist: realtabformer>=0.2.2; extra == "realtabformer"
61
- Requires-Dist: torch>=2.0.0; (python_version >= "3.8" and python_version < "3.12") and extra == "realtabformer"
62
- Requires-Dist: torch>=2.2.0; python_version >= "3.12" and extra == "realtabformer"
62
+ Requires-Dist: realtabformer>=0.2.3; extra == "realtabformer"
63
+ Requires-Dist: torch>=2.6.0; extra == "realtabformer"
64
+ Requires-Dist: transformers<4.51; extra == "realtabformer"
63
65
  Provides-Extra: test
64
66
  Requires-Dist: sdgym[realtabformer]; extra == "test"
65
67
  Requires-Dist: pytest>=6.2.5; extra == "test"
66
68
  Requires-Dist: pytest-cov>=2.6.0; extra == "test"
67
69
  Requires-Dist: jupyter<2,>=1.0.0; extra == "test"
68
- Requires-Dist: rundoc<0.5,>=0.4.3; extra == "test"
69
70
  Requires-Dist: tomli<3,>=2.0.0; extra == "test"
71
+ Requires-Dist: slack-sdk<4.0,>=3.23; extra == "test"
72
+ Requires-Dist: openpyxl>=3.0.0; python_version < "3.9" and extra == "test"
73
+ Requires-Dist: openpyxl>=3.1.2; python_version >= "3.9" and extra == "test"
74
+ Requires-Dist: pydrive2<2.0.0,>=1.4.0; extra == "test"
70
75
  Provides-Extra: dev
71
76
  Requires-Dist: sdgym[dask,test]; extra == "dev"
72
77
  Requires-Dist: build<2,>=1.0.0; extra == "dev"
73
- Requires-Dist: bump-my-version<1,>=0.18.3; extra == "dev"
78
+ Requires-Dist: bump-my-version>=0.18.3; extra == "dev"
74
79
  Requires-Dist: pip>=9.0.1; extra == "dev"
75
80
  Requires-Dist: watchdog<5,>=1.0.1; extra == "dev"
76
81
  Requires-Dist: ruff<1,>=0.4.5; extra == "dev"
77
- Requires-Dist: twine<6,>=1.10.0; extra == "dev"
82
+ Requires-Dist: twine>=1.10.0; extra == "dev"
78
83
  Requires-Dist: wheel>=0.30.0; extra == "dev"
79
84
  Requires-Dist: coverage<8,>=4.5.12; extra == "dev"
80
- Requires-Dist: tox<5,>=2.9.1; extra == "dev"
81
85
  Requires-Dist: importlib-metadata>=3.6; extra == "dev"
82
86
  Requires-Dist: invoke; extra == "dev"
87
+ Provides-Extra: readme
88
+ Requires-Dist: rundoc<0.5,>=0.4.3; extra == "readme"
83
89
  Provides-Extra: all
84
90
  Requires-Dist: sdgym[dask,dev,test]; extra == "all"
91
+ Dynamic: license-file
85
92
 
86
93
  <div align="center">
87
94
  <br/>
@@ -8,17 +8,17 @@ classifiers = [
8
8
  'License :: Free for non-commercial use',
9
9
  'Natural Language :: English',
10
10
  'Programming Language :: Python :: 3',
11
- 'Programming Language :: Python :: 3.8',
12
11
  'Programming Language :: Python :: 3.9',
13
12
  'Programming Language :: Python :: 3.10',
14
13
  'Programming Language :: Python :: 3.11',
15
14
  'Programming Language :: Python :: 3.12',
15
+ 'Programming Language :: Python :: 3.13',
16
16
  'Topic :: Scientific/Engineering :: Artificial Intelligence',
17
17
  ]
18
18
  keywords = ['machine learning', 'synthetic data generation', 'benchmark', 'generative models']
19
19
  dynamic = ['version']
20
20
  license = { text = 'BSL-1.1' }
21
- requires-python = '>=3.8,<3.13'
21
+ requires-python = '>=3.9,<3.14'
22
22
  readme = 'README.md'
23
23
  dependencies = [
24
24
  'appdirs>=1.3',
@@ -26,30 +26,32 @@ dependencies = [
26
26
  'botocore>=1.31,<2',
27
27
  'cloudpickle>=2.1.0',
28
28
  'compress-pickle>=1.2.0',
29
- 'humanfriendly>=8.2',
30
- "numpy>=1.21.6;python_version<'3.10'",
31
- "numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'",
32
- "numpy>=1.26.0;python_version>='3.12'",
29
+ 'humanfriendly>=10.0',
30
+ "numpy>=1.22.2;python_version<'3.10'",
31
+ "numpy>=1.24.0;python_version>='3.10' and python_version<'3.12'",
32
+ "numpy>=1.26.0;python_version>='3.12' and python_version<'3.13'",
33
+ "numpy>=2.1.0;python_version>='3.13'",
33
34
  "pandas>=1.4.0;python_version<'3.11'",
34
35
  "pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
35
- "pandas>=2.1.1;python_version>='3.12'",
36
+ "pandas>=2.1.1;python_version>='3.12' and python_version<'3.13'",
37
+ "pandas>=2.2.3;python_version>='3.13'",
36
38
  'psutil>=5.7',
37
39
  "scikit-learn>=1.0.2;python_version<'3.10'",
38
40
  "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'",
39
41
  "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'",
40
- "scikit-learn>=1.3.1;python_version>='3.12'",
42
+ "scikit-learn>=1.3.1;python_version>='3.12' and python_version<'3.13'",
43
+ "scikit-learn>=1.5.2;python_version>='3.13'",
41
44
  "scipy>=1.7.3;python_version<'3.10'",
42
45
  "scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'",
43
- "scipy>=1.12.0;python_version>='3.12'",
46
+ "scipy>=1.12.0;python_version>='3.12' and python_version<'3.13'",
47
+ "scipy>=1.14.1;python_version>='3.13'",
44
48
  'tabulate>=0.8.3,<0.9',
45
- "torch>=1.12.1;python_version<'3.10'",
46
- "torch>=2.0.0;python_version>='3.10' and python_version<'3.12'",
47
- "torch>=2.2.0;python_version>='3.12'",
49
+ "torch>=2.6.0",
48
50
  'tqdm>=4.66.3',
49
51
  'XlsxWriter>=1.2.8',
50
- 'rdt>=1.13.1',
51
- 'sdmetrics>=0.17.0',
52
- 'sdv>=1.17.2',
52
+ 'rdt>=1.17.0',
53
+ 'sdmetrics>=0.20.1',
54
+ 'sdv>=1.21.0',
53
55
  ]
54
56
 
55
57
  [project.urls]
@@ -65,24 +67,27 @@ sdgym = { main = 'sdgym.cli.__main__:main' }
65
67
  [project.optional-dependencies]
66
68
  dask = ['dask', 'distributed']
67
69
  realtabformer = [
68
- 'realtabformer>=0.2.2',
69
- "torch>=2.0.0;python_version>='3.8' and python_version<'3.12'",
70
- "torch>=2.2.0;python_version>='3.12'",
70
+ 'realtabformer>=0.2.3',
71
+ "torch>=2.6.0",
72
+ 'transformers<4.51',
71
73
  ]
72
74
  test = [
73
75
  'sdgym[realtabformer]',
74
76
  'pytest>=6.2.5',
75
77
  'pytest-cov>=2.6.0',
76
78
  'jupyter>=1.0.0,<2',
77
- 'rundoc>=0.4.3,<0.5',
78
79
  'tomli>=2.0.0,<3',
80
+ 'slack-sdk>=3.23,<4.0',
81
+ "openpyxl>=3.0.0; python_version<'3.9'",
82
+ "openpyxl>=3.1.2; python_version>='3.9'",
83
+ 'pydrive2>=1.4.0,<2.0.0'
79
84
  ]
80
85
  dev = [
81
86
  'sdgym[dask, test]',
82
87
 
83
88
  # general
84
89
  'build>=1.0.0,<2',
85
- 'bump-my-version>=0.18.3,<1',
90
+ 'bump-my-version>=0.18.3',
86
91
  'pip>=9.0.1',
87
92
  'watchdog>=1.0.1,<5',
88
93
 
@@ -90,17 +95,17 @@ dev = [
90
95
  'ruff>=0.4.5,<1',
91
96
 
92
97
  # distribute on PyPI
93
- 'twine>=1.10.0,<6',
98
+ 'twine>=1.10.0',
94
99
  'wheel>=0.30.0',
95
100
 
96
101
  # Advanced testing
97
102
  'coverage>=4.5.12,<8',
98
- 'tox>=2.9.1,<5',
99
103
  'importlib-metadata>=3.6',
100
104
 
101
105
  # Invoke
102
106
  'invoke',
103
107
  ]
108
+ readme = ['rundoc>=0.4.3,<0.5',]
104
109
  all = [
105
110
  'sdgym[dask, test, dev]',
106
111
  ]
@@ -140,7 +145,7 @@ namespaces = false
140
145
  version = {attr = 'sdgym.__version__'}
141
146
 
142
147
  [tool.bumpversion]
143
- current_version = "0.10.0.dev0"
148
+ current_version = "0.10.1.dev0"
144
149
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
145
150
  serialize = [
146
151
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -188,6 +193,8 @@ exclude = [
188
193
  "__pycache__",
189
194
  ".ipynb_checkpoints",
190
195
  "tasks.py",
196
+ "static_code_analysis.txt",
197
+ "*.ipynb"
191
198
  ]
192
199
 
193
200
  [tool.ruff.lint]
@@ -212,7 +219,6 @@ ignore = [
212
219
  # pydocstyle
213
220
  "D107", # Missing docstring in __init__
214
221
  "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
215
- "PD901",
216
222
  "PD101",
217
223
  ]
218
224
 
@@ -237,4 +243,4 @@ convention = "google"
237
243
 
238
244
  [tool.ruff.lint.pycodestyle]
239
245
  max-doc-length = 100
240
- max-line-length = 100
246
+ max-line-length = 100
@@ -8,26 +8,30 @@ __author__ = 'DataCebo, Inc.'
8
8
  __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
9
9
  __email__ = 'info@sdv.dev'
10
10
  __license__ = 'BSL-1.1'
11
- __version__ = '0.10.0.dev0'
11
+ __version__ = '0.10.1.dev0'
12
12
 
13
13
  import logging
14
14
 
15
15
  from sdgym.benchmark import benchmark_single_table
16
16
  from sdgym.cli.collect import collect_results
17
17
  from sdgym.cli.summary import make_summary_spreadsheet
18
+ from sdgym.dataset_explorer import DatasetExplorer
18
19
  from sdgym.datasets import get_available_datasets, load_dataset
19
20
  from sdgym.synthesizers import create_sdv_synthesizer_variant, create_single_table_synthesizer
21
+ from sdgym.result_explorer import ResultsExplorer
20
22
 
21
23
  # Clear the logging wrongfully configured by tensorflow/absl
22
24
  list(map(logging.root.removeHandler, logging.root.handlers))
23
25
  list(map(logging.root.removeFilter, logging.root.filters))
24
26
 
25
27
  __all__ = [
26
- 'load_dataset',
27
- 'collect_results',
28
- 'make_summary_spreadsheet',
28
+ 'DatasetExplorer',
29
+ 'ResultsExplorer',
29
30
  'benchmark_single_table',
30
- 'get_available_datasets',
31
+ 'collect_results',
31
32
  'create_sdv_synthesizer_variant',
32
33
  'create_single_table_synthesizer',
34
+ 'get_available_datasets',
35
+ 'load_dataset',
36
+ 'make_summary_spreadsheet',
33
37
  ]
@@ -0,0 +1,107 @@
1
+ """Utility functions for handling datasets."""
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from zipfile import ZipFile
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ LOGGER = logging.getLogger(__name__)
12
+
13
+
14
def _parse_numeric_value(value, dataset_name, field_name, target_type=float):
    """Cast ``value`` to ``target_type``, falling back to NaN when the cast fails.

    Args:
        value (object):
            The raw value to convert.
        dataset_name (str):
            Name of the dataset. Only used in the log message.
        field_name (str):
            Name of the field being parsed. Only used in the log message.
        target_type (type):
            Callable used to perform the conversion. Defaults to ``float``.

    Returns:
        The result of ``target_type(value)``, or ``numpy.nan`` if the conversion
        raised ``ValueError``, ``TypeError`` or ``OverflowError``.
    """
    try:
        return target_type(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised by e.g. ``int(float('inf'))``; such a value is just as
        # unusable as a malformed string, so it gets the same NaN fallback.
        LOGGER.info(
            f"Could not cast {field_name} '{value}' to {target_type.__name__} for dataset "
            f"'{dataset_name}' defaulting to NaN."
        )
        return np.nan
24
+
25
+
26
def _get_dataset_subset(data, metadata_dict, modality):
    """Limit the size of a dataset for faster evaluation or testing.

    The dataset is reduced to at most 1000 rows and 10 columns. For sequential
    datasets, the sequence index and sequence key columns are always retained.
    Rows are drawn with ``DataFrame.sample``, so the result is a random subset;
    a dataset with fewer than 1000 rows keeps all of its rows.

    Args:
        data (pd.DataFrame):
            The dataset to be reduced.
        metadata_dict (dict):
            A dictionary containing the dataset's metadata. When columns are dropped,
            the ``columns`` entry of its first table is updated in place.
        modality (str):
            The dataset modality. Must be one of: ``'single_table'``, ``'sequential'``.

    Returns:
        tuple[pd.DataFrame, dict]:
            A tuple containing:
                - The reduced dataset as a DataFrame.
                - The updated metadata dictionary reflecting any removed columns.

    Raises:
        ValueError:
            If the provided modality is ``'multi_table'`` or if the metadata does
            not define any table.
    """
    if modality == 'multi_table':
        raise ValueError('limit_dataset_size is not supported for multi-table datasets.')

    max_rows, max_columns = (1000, 10)
    tables = metadata_dict.get('tables', {})
    if not tables:
        # Fail with a clear message instead of leaking StopIteration from ``next``.
        raise ValueError('The metadata does not contain any table.')

    # Only the first table listed in the metadata is inspected.
    table_info = next(iter(tables.values()))
    columns = table_info.get('columns', {})

    mandatory_columns = []
    if modality == 'sequential':
        seq_index = table_info.get('sequence_index')
        seq_key = table_info.get('sequence_key')
        mandatory_columns = [col for col in (seq_index, seq_key) if col]

    keep_columns = list(columns)

    # If we have too many columns, drop extras but never mandatory ones
    if len(columns) > max_columns:
        optional_columns = [col for col in columns if col not in mandatory_columns]
        keep_count = max_columns - len(mandatory_columns)
        keep_columns = mandatory_columns + optional_columns[:keep_count]
        kept = set(keep_columns)
        table_info['columns'] = {
            column_name: column_definition
            for column_name, column_definition in columns.items()
            if column_name in kept
        }

    data = data[keep_columns]
    # Sampling more rows than available raises ValueError, so cap at the dataset size.
    data = data.sample(min(max_rows, len(data)))
    return data, metadata_dict
81
+
82
+
83
def _read_zipped_data(zip_file_path, modality):
    """Read every CSV file stored in a zip archive.

    Args:
        zip_file_path (str or Path):
            Location of the zip archive.
        modality (str):
            The dataset modality. ``'multi_table'`` returns all tables; any other
            value returns only the first CSV found in the archive.

    Returns:
        dict[str, pd.DataFrame] or pd.DataFrame:
            For ``'multi_table'``, a dictionary mapping each CSV file stem to its
            contents. Otherwise, the contents of the first CSV in the archive.

    Raises:
        ValueError:
            If the modality is not ``'multi_table'`` and the archive has no CSV file.
    """
    with ZipFile(zip_file_path, 'r') as zf:
        data = {
            Path(file_name).stem: _read_csv_from_zip(zf, csv_file_name=file_name)
            for file_name in zf.namelist()
            if file_name.endswith('.csv')
        }

    if modality == 'multi_table':
        return data

    if not data:
        # Fail with a clear message instead of leaking StopIteration from ``next``.
        raise ValueError(f"No CSV file found in '{zip_file_path}'.")

    return next(iter(data.values()))
95
+
96
+
97
def _read_csv_from_zip(zip_file, csv_file_name):
    """Load one CSV member of an already-open ``ZipFile`` into a DataFrame.

    Args:
        zip_file (zipfile.ZipFile):
            The open archive to read from.
        csv_file_name (str):
            Name of the CSV member inside the archive.

    Returns:
        pd.DataFrame:
            The parsed contents of the CSV member.
    """
    with zip_file.open(csv_file_name) as member:
        frame = pd.read_csv(member, low_memory=False)

    return frame
101
+
102
+
103
def _read_metadata_json(metadata_path):
    """Load a metadata JSON file and return its parsed content.

    Args:
        metadata_path (str or Path):
            Location of the JSON file.

    Returns:
        The object decoded by ``json.load`` (a dict for metadata files).
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
    with open(metadata_path, encoding='utf-8') as metadata_file:
        metadata_dict = json.load(metadata_file)

    return metadata_dict