sdv 1.35.2.dev0__tar.gz → 1.36.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdv-1.35.2.dev0/sdv.egg-info → sdv-1.36.1}/PKG-INFO +1 -1
- {sdv-1.35.2.dev0 → sdv-1.36.1}/pyproject.toml +1 -1
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/__init__.py +1 -1
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/_utils.py +5 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metadata/metadata.py +49 -18
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metadata/multi_table.py +51 -8
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metadata/single_table.py +54 -4
- sdv-1.36.1/sdv/metadata/utils.py +82 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1/sdv.egg-info}/PKG-INFO +1 -1
- sdv-1.35.2.dev0/sdv/metadata/utils.py +0 -32
- {sdv-1.35.2.dev0 → sdv-1.36.1}/LICENSE +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/README.md +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/_errors.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/_utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/base.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/fixed_combinations.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/fixed_increments.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/inequality.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/one_hot_encoding.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/programmable_constraint.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/cag/range.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/constraints/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/constraints/base.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/constraints/errors.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/constraints/tabular.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/constraints/utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/data_processing/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/data_processing/data_processor.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/data_processing/datetime_formatter.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/data_processing/errors.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/data_processing/numerical_formatter.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/data_processing/utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/datasets/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/datasets/demo.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/datasets/local.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/errors.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/evaluation/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/evaluation/_utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/evaluation/multi_table.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/evaluation/single_table.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/io/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/io/local/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/io/local/local.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/lite/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/lite/single_table.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/logging/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/logging/logger.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/logging/sdv_logger_config.yml +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/logging/utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metadata/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metadata/errors.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metadata/metadata_upgrader.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metadata/visualization.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metrics/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metrics/demos.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metrics/relational.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metrics/tabular.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/metrics/timeseries.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/multi_table/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/multi_table/base.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/multi_table/dayz.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/multi_table/hma.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/multi_table/utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/sampling/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/sampling/hierarchical_sampler.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/sampling/independent_sampler.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/sampling/tabular.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/sequential/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/sequential/par.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/single_table/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/single_table/base.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/single_table/copulagan.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/single_table/copulas.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/single_table/ctgan.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/single_table/dayz.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/single_table/utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/utils/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/utils/mixins.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/utils/poc.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/utils/utils.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv/version/__init__.py +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv.egg-info/SOURCES.txt +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv.egg-info/dependency_links.txt +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv.egg-info/entry_points.txt +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv.egg-info/requires.txt +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/sdv.egg-info/top_level.txt +0 -0
- {sdv-1.35.2.dev0 → sdv-1.36.1}/setup.cfg +0 -0
|
@@ -149,7 +149,7 @@ namespaces = false
|
|
|
149
149
|
version = {attr = 'sdv.__version__'}
|
|
150
150
|
|
|
151
151
|
[tool.bumpversion]
|
|
152
|
-
current_version = "1.35.2.dev0"
|
|
152
|
+
current_version = "1.36.1"
|
|
153
153
|
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
|
|
154
154
|
serialize = [
|
|
155
155
|
'{major}.{minor}.{patch}.{release}{candidate}',
|
|
@@ -536,3 +536,8 @@ def _get_unreferenced_keys(parent_columns, child_columns):
|
|
|
536
536
|
merged = merged[merged[indicator] == 'left_only'][list(child_columns.columns)]
|
|
537
537
|
merged = merged.dropna(how='all')
|
|
538
538
|
return merged.dropna(how='all')
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def _validate_boolean_parameter(parameter, parameter_name):
|
|
542
|
+
if not isinstance(parameter, bool):
|
|
543
|
+
raise ValueError(f"'{parameter_name}' must be a boolean value.")
|
|
@@ -4,6 +4,7 @@ import warnings
|
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
7
|
+
from sdv._utils import _validate_boolean_parameter
|
|
7
8
|
from sdv.metadata.errors import InvalidMetadataError
|
|
8
9
|
from sdv.metadata.multi_table import MultiTableMetadata
|
|
9
10
|
from sdv.metadata.single_table import SingleTableMetadata
|
|
@@ -61,11 +62,6 @@ class Metadata(MultiTableMetadata):
|
|
|
61
62
|
instance._set_metadata_dict(metadata_dict, single_table_name)
|
|
62
63
|
return instance
|
|
63
64
|
|
|
64
|
-
@staticmethod
|
|
65
|
-
def _validate_infer_sdtypes(infer_sdtypes):
|
|
66
|
-
if not isinstance(infer_sdtypes, bool):
|
|
67
|
-
raise ValueError("'infer_sdtypes' must be a boolean value.")
|
|
68
|
-
|
|
69
65
|
@staticmethod
|
|
70
66
|
def _validate_foreign_key_inference_algorithm(foreign_key_inference_algorithm):
|
|
71
67
|
if foreign_key_inference_algorithm != 'column_name_match':
|
|
@@ -78,6 +74,7 @@ class Metadata(MultiTableMetadata):
|
|
|
78
74
|
infer_sdtypes=True,
|
|
79
75
|
infer_keys='primary_and_foreign',
|
|
80
76
|
foreign_key_inference_algorithm='column_name_match',
|
|
77
|
+
verbose=False,
|
|
81
78
|
):
|
|
82
79
|
if not data or not all(isinstance(df, pd.DataFrame) for df in data.values()):
|
|
83
80
|
raise ValueError('The provided dictionary must contain only pandas DataFrame objects.')
|
|
@@ -86,16 +83,20 @@ class Metadata(MultiTableMetadata):
|
|
|
86
83
|
"'infer_keys' must be one of: 'primary_and_foreign', 'primary_only', None."
|
|
87
84
|
)
|
|
88
85
|
cls._validate_foreign_key_inference_algorithm(foreign_key_inference_algorithm)
|
|
89
|
-
|
|
86
|
+
_validate_boolean_parameter(infer_sdtypes, 'infer_sdtypes')
|
|
90
87
|
|
|
91
88
|
metadata = Metadata()
|
|
92
89
|
for table_name, dataframe in data.items():
|
|
93
90
|
metadata.detect_table_from_dataframe(
|
|
94
|
-
table_name, dataframe, infer_sdtypes, None if infer_keys is None else 'primary_only'
|
|
91
|
+
table_name,
|
|
92
|
+
dataframe,
|
|
93
|
+
infer_sdtypes,
|
|
94
|
+
None if infer_keys is None else 'primary_only',
|
|
95
|
+
verbose,
|
|
95
96
|
)
|
|
96
97
|
|
|
97
98
|
if infer_keys == 'primary_and_foreign':
|
|
98
|
-
metadata._detect_relationships(data, foreign_key_inference_algorithm)
|
|
99
|
+
metadata._detect_relationships(data, foreign_key_inference_algorithm, verbose)
|
|
99
100
|
|
|
100
101
|
return metadata
|
|
101
102
|
|
|
@@ -106,6 +107,7 @@ class Metadata(MultiTableMetadata):
|
|
|
106
107
|
infer_sdtypes=True,
|
|
107
108
|
infer_keys='primary_and_foreign',
|
|
108
109
|
foreign_key_inference_algorithm='column_name_match',
|
|
110
|
+
verbose=False,
|
|
109
111
|
):
|
|
110
112
|
"""Detect the metadata for all tables in a dictionary of dataframes.
|
|
111
113
|
|
|
@@ -130,6 +132,11 @@ class Metadata(MultiTableMetadata):
|
|
|
130
132
|
foreign_key_inference_algorithm (str):
|
|
131
133
|
Which algorithm to use for detecting foreign keys. Currently only one option,
|
|
132
134
|
'column_name_match'. Defaults to 'column_name_match'.
|
|
135
|
+
verbose (bool):
|
|
136
|
+
A boolean that determines if information should be printed regarding detection.
|
|
137
|
+
If True, it prints out information about what is detected.
|
|
138
|
+
If False, it does not print out any information about what is detected.
|
|
139
|
+
Defaults to False.
|
|
133
140
|
|
|
134
141
|
Returns:
|
|
135
142
|
Metadata:
|
|
@@ -140,8 +147,25 @@ class Metadata(MultiTableMetadata):
|
|
|
140
147
|
infer_sdtypes=infer_sdtypes,
|
|
141
148
|
infer_keys=infer_keys,
|
|
142
149
|
foreign_key_inference_algorithm=foreign_key_inference_algorithm,
|
|
150
|
+
verbose=verbose,
|
|
143
151
|
)
|
|
144
152
|
|
|
153
|
+
@classmethod
|
|
154
|
+
def _detect_from_dataframe(
|
|
155
|
+
cls, data, table_name=None, infer_sdtypes=True, infer_keys='primary_only', verbose=False
|
|
156
|
+
):
|
|
157
|
+
"""Detect the metadata for a DataFrame."""
|
|
158
|
+
table_name = table_name or cls.DEFAULT_SINGLE_TABLE_NAME
|
|
159
|
+
if not isinstance(data, pd.DataFrame):
|
|
160
|
+
raise ValueError('The provided data must be a pandas DataFrame object.')
|
|
161
|
+
if infer_keys not in ['primary_only', None]:
|
|
162
|
+
raise ValueError("'infer_keys' must be one of: 'primary_only', None.")
|
|
163
|
+
|
|
164
|
+
_validate_boolean_parameter(infer_sdtypes, 'infer_sdtypes')
|
|
165
|
+
metadata = Metadata()
|
|
166
|
+
metadata.detect_table_from_dataframe(table_name, data, infer_sdtypes, infer_keys, verbose)
|
|
167
|
+
return metadata
|
|
168
|
+
|
|
145
169
|
@classmethod
|
|
146
170
|
def detect_from_dataframe(
|
|
147
171
|
cls,
|
|
@@ -149,6 +173,7 @@ class Metadata(MultiTableMetadata):
|
|
|
149
173
|
table_name=DEFAULT_SINGLE_TABLE_NAME,
|
|
150
174
|
infer_sdtypes=True,
|
|
151
175
|
infer_keys='primary_only',
|
|
176
|
+
verbose=False,
|
|
152
177
|
):
|
|
153
178
|
"""Detect the metadata for a DataFrame.
|
|
154
179
|
|
|
@@ -157,7 +182,10 @@ class Metadata(MultiTableMetadata):
|
|
|
157
182
|
|
|
158
183
|
Args:
|
|
159
184
|
data (pandas.DataFrame):
|
|
160
|
-
|
|
185
|
+
The data to detect metadata from.
|
|
186
|
+
table_name (str):
|
|
187
|
+
The name of the table to detect. If None, a default name will be used.
|
|
188
|
+
Defaults to None.
|
|
161
189
|
infer_sdtypes (bool):
|
|
162
190
|
A boolean describing whether to infer the sdtypes of each column.
|
|
163
191
|
If True it infers the sdtypes based on the data.
|
|
@@ -168,20 +196,23 @@ class Metadata(MultiTableMetadata):
|
|
|
168
196
|
- 'primary_only': Infer only the primary keys of each table
|
|
169
197
|
- None: Do not infer any keys
|
|
170
198
|
Defaults to 'primary_only'.
|
|
199
|
+
verbose (bool):
|
|
200
|
+
A boolean that determines if information should be printed regarding detection.
|
|
201
|
+
If True, it prints out information about what is detected.
|
|
202
|
+
If False, it does not print out any information about what is detected.
|
|
203
|
+
Defaults to False.
|
|
171
204
|
|
|
172
205
|
Returns:
|
|
173
206
|
Metadata:
|
|
174
207
|
A new metadata object with the sdtypes detected from the data.
|
|
175
208
|
"""
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
metadata.detect_table_from_dataframe(table_name, data, infer_sdtypes, infer_keys)
|
|
184
|
-
return metadata
|
|
209
|
+
return cls._detect_from_dataframe(
|
|
210
|
+
data=data,
|
|
211
|
+
table_name=table_name,
|
|
212
|
+
infer_sdtypes=infer_sdtypes,
|
|
213
|
+
infer_keys=infer_keys,
|
|
214
|
+
verbose=verbose,
|
|
215
|
+
)
|
|
185
216
|
|
|
186
217
|
def _set_metadata_dict(self, metadata, single_table_name=None):
|
|
187
218
|
"""Set a ``metadata`` dictionary to the current instance.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import datetime
|
|
4
4
|
import json
|
|
5
5
|
import logging
|
|
6
|
+
import sys
|
|
6
7
|
import warnings
|
|
7
8
|
from collections import defaultdict
|
|
8
9
|
from copy import deepcopy
|
|
@@ -547,7 +548,7 @@ class MultiTableMetadata:
|
|
|
547
548
|
f'The relationships in the dataset are disjointed. {table_msg}'
|
|
548
549
|
)
|
|
549
550
|
|
|
550
|
-
def _detect_foreign_keys_by_column_name(self, data):
|
|
551
|
+
def _detect_foreign_keys_by_column_name(self, data, verbose=False):
|
|
551
552
|
"""Detect the foreign keys based on if a column name matches a primary key.
|
|
552
553
|
|
|
553
554
|
If a column name (a child table) is a primary key, it will also be considered
|
|
@@ -557,7 +558,15 @@ class MultiTableMetadata:
|
|
|
557
558
|
data (dict):
|
|
558
559
|
Dictionary of table names to dataframes.
|
|
559
560
|
NOTE: this is only used in SDV-Enterprise.
|
|
561
|
+
verbose (bool):
|
|
562
|
+
A boolean that determines if information should be printed regarding detection.
|
|
563
|
+
If True, it prints out information about what is detected.
|
|
564
|
+
If False, it does not print out any information about what is detected.
|
|
565
|
+
Defaults to False.
|
|
560
566
|
"""
|
|
567
|
+
is_foreign_keys_found = False
|
|
568
|
+
if verbose:
|
|
569
|
+
sys.stdout.write('\nDetecting foreign keys:\n')
|
|
561
570
|
for parent_candidate in self.tables.keys():
|
|
562
571
|
primary_key = self.tables[parent_candidate].primary_key
|
|
563
572
|
if primary_key is None:
|
|
@@ -573,15 +582,25 @@ class MultiTableMetadata:
|
|
|
573
582
|
continue
|
|
574
583
|
|
|
575
584
|
try:
|
|
585
|
+
sdtype_updated = False
|
|
576
586
|
if pk_sdtype == 'id' and original_fk_sdtype != 'id':
|
|
577
587
|
self.update_column(
|
|
578
588
|
table_name=child_candidate,
|
|
579
589
|
column_name=primary_key,
|
|
580
590
|
sdtype='id',
|
|
581
591
|
)
|
|
592
|
+
sdtype_updated = True
|
|
582
593
|
self.add_relationship(
|
|
583
594
|
parent_candidate, child_candidate, primary_key, primary_key
|
|
584
595
|
)
|
|
596
|
+
is_foreign_keys_found = True
|
|
597
|
+
if verbose:
|
|
598
|
+
child_col = f"'{child_candidate}.{primary_key}'"
|
|
599
|
+
parent_col = f"'{parent_candidate}.{primary_key}'"
|
|
600
|
+
suffix = " (updating sdtype to 'id')" if sdtype_updated else ''
|
|
601
|
+
sys.stdout.write(
|
|
602
|
+
f'- Column {child_col} refers to column {parent_col}{suffix}\n'
|
|
603
|
+
)
|
|
585
604
|
|
|
586
605
|
except InvalidMetadataError:
|
|
587
606
|
# circular relationship
|
|
@@ -592,8 +611,12 @@ class MultiTableMetadata:
|
|
|
592
611
|
**original_fk_meta,
|
|
593
612
|
)
|
|
594
613
|
continue
|
|
614
|
+
if verbose and not is_foreign_keys_found:
|
|
615
|
+
sys.stdout.write('- No foreign keys found\n')
|
|
595
616
|
|
|
596
|
-
def _detect_relationships(self, data=None, foreign_key_inference_algorithm='column_name_match'):
|
|
617
|
+
def _detect_relationships(
|
|
618
|
+
self, data=None, foreign_key_inference_algorithm='column_name_match', verbose=False
|
|
619
|
+
):
|
|
597
620
|
"""Automatically detect relationships between tables.
|
|
598
621
|
|
|
599
622
|
Args:
|
|
@@ -603,12 +626,22 @@ class MultiTableMetadata:
|
|
|
603
626
|
foreign_key_inference_algorithm (str):
|
|
604
627
|
Which algorithm to use for detecting foreign keys. Currently only one option,
|
|
605
628
|
'column_name_match'.
|
|
629
|
+
verbose (bool):
|
|
630
|
+
A boolean that determines if information should be printed regarding detection.
|
|
631
|
+
If True, it prints out information about what is detected.
|
|
632
|
+
If False, it does not print out any information about what is detected.
|
|
633
|
+
Defaults to False.
|
|
606
634
|
"""
|
|
607
635
|
if foreign_key_inference_algorithm == 'column_name_match':
|
|
608
|
-
self._detect_foreign_keys_by_column_name(data)
|
|
636
|
+
self._detect_foreign_keys_by_column_name(data, verbose)
|
|
609
637
|
|
|
610
638
|
def detect_table_from_dataframe(
|
|
611
|
-
self, table_name, data, infer_sdtypes=True, infer_keys='primary_only'
|
|
639
|
+
self,
|
|
640
|
+
table_name,
|
|
641
|
+
data,
|
|
642
|
+
infer_sdtypes=True,
|
|
643
|
+
infer_keys='primary_only',
|
|
644
|
+
verbose=False,
|
|
612
645
|
):
|
|
613
646
|
"""Detect the metadata for a table from a dataframe.
|
|
614
647
|
|
|
@@ -630,14 +663,19 @@ class MultiTableMetadata:
|
|
|
630
663
|
- 'primary_only': Infer only the primary keys of each table
|
|
631
664
|
- None: Do not infer any keys
|
|
632
665
|
Defaults to 'primary_only'.
|
|
666
|
+
verbose (bool):
|
|
667
|
+
A boolean that determines if information should be printed regarding detection.
|
|
668
|
+
If True, it prints out information about what is detected.
|
|
669
|
+
If False, it does not print out any information about what is detected.
|
|
670
|
+
Defaults to False.
|
|
633
671
|
"""
|
|
634
672
|
self._validate_table_not_detected(table_name)
|
|
635
673
|
table = SingleTableMetadata()
|
|
636
|
-
table._detect_columns(data, table_name, infer_sdtypes, infer_keys)
|
|
674
|
+
table._detect_columns(data, table_name, infer_sdtypes, infer_keys, verbose)
|
|
637
675
|
self.tables[table_name] = table
|
|
638
676
|
self._log_detected_table(table)
|
|
639
677
|
|
|
640
|
-
def detect_from_dataframes(self, data):
|
|
678
|
+
def detect_from_dataframes(self, data, verbose=False):
|
|
641
679
|
"""Detect the metadata for all tables in a dictionary of dataframes.
|
|
642
680
|
|
|
643
681
|
This method automatically detects the ``sdtypes`` for the given ``pandas.DataFrame``.
|
|
@@ -646,14 +684,19 @@ class MultiTableMetadata:
|
|
|
646
684
|
Args:
|
|
647
685
|
data (dict):
|
|
648
686
|
Dictionary of table names to dataframes.
|
|
687
|
+
verbose (bool):
|
|
688
|
+
A boolean that determines if information should be printed regarding detection.
|
|
689
|
+
If True, it prints out information about what is detected.
|
|
690
|
+
If False, it does not print out any information about what is detected.
|
|
691
|
+
Defaults to False.
|
|
649
692
|
"""
|
|
650
693
|
if not data or not all(isinstance(df, pd.DataFrame) for df in data.values()):
|
|
651
694
|
raise ValueError('The provided dictionary must contain only pandas DataFrame objects.')
|
|
652
695
|
|
|
653
696
|
for table_name, dataframe in data.items():
|
|
654
|
-
self.detect_table_from_dataframe(table_name, dataframe)
|
|
697
|
+
self.detect_table_from_dataframe(table_name, dataframe, verbose=verbose)
|
|
655
698
|
|
|
656
|
-
self._detect_relationships(data)
|
|
699
|
+
self._detect_relationships(data, verbose=verbose)
|
|
657
700
|
|
|
658
701
|
def detect_from_csvs(self, folder_name, read_csv_parameters=None):
|
|
659
702
|
"""Detect the metadata for all tables in a folder of csv files.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
import re
|
|
6
|
+
import sys
|
|
6
7
|
import warnings
|
|
7
8
|
from collections import Counter, defaultdict
|
|
8
9
|
from copy import deepcopy
|
|
@@ -28,7 +29,13 @@ from sdv.errors import InvalidDataError
|
|
|
28
29
|
from sdv.logging import get_sdv_logger
|
|
29
30
|
from sdv.metadata.errors import InvalidMetadataError
|
|
30
31
|
from sdv.metadata.metadata_upgrader import convert_metadata
|
|
31
|
-
from sdv.metadata.utils import _validate_file_mode, read_json, validate_file_does_not_exist
|
|
32
|
+
from sdv.metadata.utils import (
|
|
33
|
+
_format_column_metadata,
|
|
34
|
+
_print_primary_key_detection,
|
|
35
|
+
_validate_file_mode,
|
|
36
|
+
read_json,
|
|
37
|
+
validate_file_does_not_exist,
|
|
38
|
+
)
|
|
32
39
|
from sdv.metadata.visualization import (
|
|
33
40
|
create_columns_node,
|
|
34
41
|
create_summarized_columns_node,
|
|
@@ -651,7 +658,9 @@ class SingleTableMetadata:
|
|
|
651
658
|
except Exception as e:
|
|
652
659
|
self._handle_detection_error(e, column_name, table_name)
|
|
653
660
|
|
|
654
|
-
def _select_primary_key(self, infer_sdtypes, pk_candidates, pii_pk_candidates):
|
|
661
|
+
def _select_primary_key(
|
|
662
|
+
self, infer_sdtypes, pk_candidates, pii_pk_candidates, table_name=None, verbose=False
|
|
663
|
+
):
|
|
655
664
|
"""Select the primary key from a list of candidates.
|
|
656
665
|
|
|
657
666
|
If there are any non-pii candidates, we select the first one. Otherwise, we select the
|
|
@@ -666,22 +675,48 @@ class SingleTableMetadata:
|
|
|
666
675
|
A list of primary key candidates that aren't pii.
|
|
667
676
|
pii_pk_candidates (list):
|
|
668
677
|
A list of primary key candidates that are pii.
|
|
678
|
+
table_name (str):
|
|
679
|
+
The name of the table to be analyzed. Defaults to ``None``.
|
|
680
|
+
verbose (bool):
|
|
681
|
+
A boolean that determines if information should be printed regarding detection.
|
|
682
|
+
If True, it prints out information about what is detected.
|
|
683
|
+
If False, it does not print out any information about what is detected.
|
|
684
|
+
Defaults to False.
|
|
669
685
|
"""
|
|
686
|
+
if verbose:
|
|
687
|
+
table_str = f" for table '{table_name}'" if table_name else ''
|
|
688
|
+
sys.stdout.write(f'\nDetecting primary key{table_str}:\n')
|
|
689
|
+
chosen_pk = None
|
|
690
|
+
sdtype_updated = False
|
|
691
|
+
pii_removed = False
|
|
692
|
+
|
|
670
693
|
if pk_candidates:
|
|
671
694
|
selected_pk = pk_candidates[0]
|
|
672
695
|
self.primary_key = selected_pk
|
|
696
|
+
original_sdtype = self.columns.get(self.primary_key, {}).get('sdtype')
|
|
673
697
|
self.columns[self.primary_key]['sdtype'] = 'id'
|
|
698
|
+
chosen_pk = self.primary_key
|
|
699
|
+
sdtype_updated = original_sdtype != 'id'
|
|
674
700
|
|
|
675
701
|
elif pii_pk_candidates:
|
|
676
702
|
self.primary_key = pii_pk_candidates[0]
|
|
703
|
+
chosen_pk = self.primary_key
|
|
677
704
|
if not infer_sdtypes:
|
|
705
|
+
original_sdtype = self.columns.get(self.primary_key, {}).get('sdtype')
|
|
678
706
|
self.columns[self.primary_key]['sdtype'] = 'id'
|
|
707
|
+
sdtype_updated = original_sdtype != 'id'
|
|
679
708
|
|
|
680
709
|
if self.primary_key and self.columns[self.primary_key].get('sdtype') == 'id':
|
|
681
710
|
if self.columns[self.primary_key].get('pii') is not None:
|
|
682
711
|
del self.columns[self.primary_key]['pii']
|
|
712
|
+
pii_removed = True
|
|
713
|
+
|
|
714
|
+
if verbose:
|
|
715
|
+
_print_primary_key_detection(chosen_pk, sdtype_updated, pii_removed)
|
|
683
716
|
|
|
684
|
-
def _detect_columns(self, data, table_name=None, infer_sdtypes=True, infer_keys='primary_only'):
|
|
717
|
+
def _detect_columns(
|
|
718
|
+
self, data, table_name=None, infer_sdtypes=True, infer_keys='primary_only', verbose=False
|
|
719
|
+
):
|
|
685
720
|
"""Detect metadata information for each column in the data.
|
|
686
721
|
|
|
687
722
|
Args:
|
|
@@ -699,7 +734,16 @@ class SingleTableMetadata:
|
|
|
699
734
|
- 'primary_only': Infer the primary keys.
|
|
700
735
|
- None: Do not infer any keys.
|
|
701
736
|
Defaults to 'primary_only'.
|
|
737
|
+
verbose (bool):
|
|
738
|
+
A boolean that determines if information should be printed regarding detection.
|
|
739
|
+
If True, it prints out information about what is detected.
|
|
740
|
+
If False, it does not print out any information about what is detected.
|
|
741
|
+
Defaults to False.
|
|
702
742
|
"""
|
|
743
|
+
if verbose and infer_sdtypes:
|
|
744
|
+
table_str = f"table '{table_name}'" if table_name else 'table'
|
|
745
|
+
sys.stdout.write(f'\nDetecting {table_str}:\n')
|
|
746
|
+
|
|
703
747
|
old_columns = data.columns
|
|
704
748
|
data.columns = data.columns.astype(str)
|
|
705
749
|
pk_candidates = []
|
|
@@ -728,13 +772,19 @@ class SingleTableMetadata:
|
|
|
728
772
|
column_dict['pii'] = True
|
|
729
773
|
|
|
730
774
|
column_dict['sdtype'] = sdtype
|
|
731
|
-
self.columns[field] = deepcopy(column_dict)
|
|
732
775
|
|
|
776
|
+
if verbose and infer_sdtypes:
|
|
777
|
+
column_metadata = _format_column_metadata(column_dict)
|
|
778
|
+
sys.stdout.write(f"- Column '{field}': {column_metadata}\n")
|
|
779
|
+
|
|
780
|
+
self.columns[field] = deepcopy(column_dict)
|
|
733
781
|
if infer_keys == 'primary_only':
|
|
734
782
|
self._select_primary_key(
|
|
735
783
|
infer_sdtypes=infer_sdtypes,
|
|
736
784
|
pk_candidates=pk_candidates,
|
|
737
785
|
pii_pk_candidates=pii_pk_candidates,
|
|
786
|
+
table_name=table_name,
|
|
787
|
+
verbose=verbose,
|
|
738
788
|
)
|
|
739
789
|
|
|
740
790
|
self._updated = True
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Tools to generate strings from regular expressions."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def read_json(filepath):
|
|
9
|
+
"""Validate and open a file path."""
|
|
10
|
+
filepath = Path(filepath)
|
|
11
|
+
if not filepath.exists():
|
|
12
|
+
raise ValueError(
|
|
13
|
+
f"A file named '{filepath.name}' does not exist. Please specify a different filename."
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
with open(filepath, 'r', encoding='utf-8') as metadata_file:
|
|
17
|
+
return json.load(metadata_file)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def validate_file_does_not_exist(filepath):
|
|
21
|
+
"""Validate a file path doesn't exist."""
|
|
22
|
+
filepath = Path(filepath)
|
|
23
|
+
if filepath.exists():
|
|
24
|
+
raise ValueError(
|
|
25
|
+
f"A file named '{filepath.name}' already exists in this folder. Please specify "
|
|
26
|
+
'a different filename.'
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _validate_file_mode(mode):
|
|
31
|
+
possible_modes = ['write', 'overwrite']
|
|
32
|
+
if mode not in possible_modes:
|
|
33
|
+
raise ValueError(f"Mode '{mode}' must be in {possible_modes}.")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _format_metadata_value(value):
|
|
37
|
+
"""Format a value for display, quoting only strings.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
value:
|
|
41
|
+
The value to format. Boolean and None are returned as their
|
|
42
|
+
string representation; all other values are wrapped in single quotes.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
str:
|
|
46
|
+
The formatted value as a string.
|
|
47
|
+
"""
|
|
48
|
+
if isinstance(value, bool) or value is None:
|
|
49
|
+
return str(value)
|
|
50
|
+
return f"'{value}'"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _format_column_metadata(sdtype_info):
|
|
54
|
+
"""Format a column's metadata dictionary as a display string, with sdtype first.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
sdtype_info (dict):
|
|
58
|
+
A dictionary of column metadata (`{'sdtype': 'ssn', 'pii': False}`).
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
str:
|
|
62
|
+
A comma-separated `key=value` string with 'sdtype' first.
|
|
63
|
+
(`sdtype='numerical', computer_representation='Float'`)
|
|
64
|
+
"""
|
|
65
|
+
parts = [f'{k}={_format_metadata_value(v)}' for k, v in sdtype_info.items()]
|
|
66
|
+
parts.sort(key=lambda p: not p.startswith('sdtype='))
|
|
67
|
+
return ', '.join(parts)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _print_primary_key_detection(chosen_pk, sdtype_updated, pii_removed):
|
|
71
|
+
if not chosen_pk:
|
|
72
|
+
sys.stdout.write('- No primary key found\n')
|
|
73
|
+
return
|
|
74
|
+
|
|
75
|
+
notes = []
|
|
76
|
+
if sdtype_updated:
|
|
77
|
+
notes.append("updating sdtype to 'id'")
|
|
78
|
+
if pii_removed:
|
|
79
|
+
notes.append("removing 'pii' field")
|
|
80
|
+
|
|
81
|
+
suffix = f' ({", ".join(notes)})' if notes else ''
|
|
82
|
+
sys.stdout.write(f"- primary_key='{chosen_pk}'{suffix}\n")
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
"""Tools to generate strings from regular expressions."""
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def read_json(filepath):
|
|
8
|
-
"""Validate and open a file path."""
|
|
9
|
-
filepath = Path(filepath)
|
|
10
|
-
if not filepath.exists():
|
|
11
|
-
raise ValueError(
|
|
12
|
-
f"A file named '{filepath.name}' does not exist. Please specify a different filename."
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
with open(filepath, 'r', encoding='utf-8') as metadata_file:
|
|
16
|
-
return json.load(metadata_file)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def validate_file_does_not_exist(filepath):
|
|
20
|
-
"""Validate a file path doesn't exist."""
|
|
21
|
-
filepath = Path(filepath)
|
|
22
|
-
if filepath.exists():
|
|
23
|
-
raise ValueError(
|
|
24
|
-
f"A file named '{filepath.name}' already exists in this folder. Please specify "
|
|
25
|
-
'a different filename.'
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def _validate_file_mode(mode):
|
|
30
|
-
possible_modes = ['write', 'overwrite']
|
|
31
|
-
if mode not in possible_modes:
|
|
32
|
-
raise ValueError(f"Mode '{mode}' must be in {possible_modes}.")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|