sdv 1.35.2.dev0__tar.gz → 1.36.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. {sdv-1.35.2.dev0/sdv.egg-info → sdv-1.36.1.dev0}/PKG-INFO +1 -1
  2. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/pyproject.toml +1 -1
  3. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/__init__.py +1 -1
  4. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/_utils.py +5 -0
  5. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metadata/metadata.py +49 -18
  6. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metadata/multi_table.py +51 -8
  7. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metadata/single_table.py +54 -4
  8. sdv-1.36.1.dev0/sdv/metadata/utils.py +82 -0
  9. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0/sdv.egg-info}/PKG-INFO +1 -1
  10. sdv-1.35.2.dev0/sdv/metadata/utils.py +0 -32
  11. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/LICENSE +0 -0
  12. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/README.md +0 -0
  13. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/__init__.py +0 -0
  14. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/_errors.py +0 -0
  15. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/_utils.py +0 -0
  16. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/base.py +0 -0
  17. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/fixed_combinations.py +0 -0
  18. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/fixed_increments.py +0 -0
  19. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/inequality.py +0 -0
  20. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/one_hot_encoding.py +0 -0
  21. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/programmable_constraint.py +0 -0
  22. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/cag/range.py +0 -0
  23. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/constraints/__init__.py +0 -0
  24. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/constraints/base.py +0 -0
  25. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/constraints/errors.py +0 -0
  26. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/constraints/tabular.py +0 -0
  27. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/constraints/utils.py +0 -0
  28. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/data_processing/__init__.py +0 -0
  29. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/data_processing/data_processor.py +0 -0
  30. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/data_processing/datetime_formatter.py +0 -0
  31. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/data_processing/errors.py +0 -0
  32. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/data_processing/numerical_formatter.py +0 -0
  33. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/data_processing/utils.py +0 -0
  34. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/datasets/__init__.py +0 -0
  35. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/datasets/demo.py +0 -0
  36. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/datasets/local.py +0 -0
  37. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/errors.py +0 -0
  38. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/evaluation/__init__.py +0 -0
  39. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/evaluation/_utils.py +0 -0
  40. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/evaluation/multi_table.py +0 -0
  41. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/evaluation/single_table.py +0 -0
  42. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/io/__init__.py +0 -0
  43. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/io/local/__init__.py +0 -0
  44. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/io/local/local.py +0 -0
  45. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/lite/__init__.py +0 -0
  46. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/lite/single_table.py +0 -0
  47. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/logging/__init__.py +0 -0
  48. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/logging/logger.py +0 -0
  49. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/logging/sdv_logger_config.yml +0 -0
  50. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/logging/utils.py +0 -0
  51. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metadata/__init__.py +0 -0
  52. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metadata/errors.py +0 -0
  53. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metadata/metadata_upgrader.py +0 -0
  54. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metadata/visualization.py +0 -0
  55. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metrics/__init__.py +0 -0
  56. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metrics/demos.py +0 -0
  57. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metrics/relational.py +0 -0
  58. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metrics/tabular.py +0 -0
  59. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/metrics/timeseries.py +0 -0
  60. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/multi_table/__init__.py +0 -0
  61. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/multi_table/base.py +0 -0
  62. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/multi_table/dayz.py +0 -0
  63. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/multi_table/hma.py +0 -0
  64. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/multi_table/utils.py +0 -0
  65. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/sampling/__init__.py +0 -0
  66. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/sampling/hierarchical_sampler.py +0 -0
  67. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/sampling/independent_sampler.py +0 -0
  68. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/sampling/tabular.py +0 -0
  69. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/sequential/__init__.py +0 -0
  70. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/sequential/par.py +0 -0
  71. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/single_table/__init__.py +0 -0
  72. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/single_table/base.py +0 -0
  73. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/single_table/copulagan.py +0 -0
  74. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/single_table/copulas.py +0 -0
  75. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/single_table/ctgan.py +0 -0
  76. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/single_table/dayz.py +0 -0
  77. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/single_table/utils.py +0 -0
  78. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/utils/__init__.py +0 -0
  79. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/utils/mixins.py +0 -0
  80. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/utils/poc.py +0 -0
  81. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/utils/utils.py +0 -0
  82. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv/version/__init__.py +0 -0
  83. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv.egg-info/SOURCES.txt +0 -0
  84. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv.egg-info/dependency_links.txt +0 -0
  85. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv.egg-info/entry_points.txt +0 -0
  86. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv.egg-info/requires.txt +0 -0
  87. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/sdv.egg-info/top_level.txt +0 -0
  88. {sdv-1.35.2.dev0 → sdv-1.36.1.dev0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdv
3
- Version: 1.35.2.dev0
3
+ Version: 1.36.1.dev0
4
4
  Summary: Generate synthetic data for single table, multi table and sequential data
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
@@ -149,7 +149,7 @@ namespaces = false
149
149
  version = {attr = 'sdv.__version__'}
150
150
 
151
151
  [tool.bumpversion]
152
- current_version = "1.35.2.dev0"
152
+ current_version = "1.36.1.dev0"
153
153
  parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
154
154
  serialize = [
155
155
  '{major}.{minor}.{patch}.{release}{candidate}',
@@ -6,7 +6,7 @@
6
6
 
7
7
  __author__ = 'DataCebo, Inc.'
8
8
  __email__ = 'info@sdv.dev'
9
- __version__ = '1.35.2.dev0'
9
+ __version__ = '1.36.1.dev0'
10
10
 
11
11
 
12
12
  import sys
@@ -536,3 +536,8 @@ def _get_unreferenced_keys(parent_columns, child_columns):
536
536
  merged = merged[merged[indicator] == 'left_only'][list(child_columns.columns)]
537
537
  merged = merged.dropna(how='all')
538
538
  return merged.dropna(how='all')
539
+
540
+
541
+ def _validate_boolean_parameter(parameter, parameter_name):
542
+ if not isinstance(parameter, bool):
543
+ raise ValueError(f"'{parameter_name}' must be a boolean value.")
@@ -4,6 +4,7 @@ import warnings
4
4
 
5
5
  import pandas as pd
6
6
 
7
+ from sdv._utils import _validate_boolean_parameter
7
8
  from sdv.metadata.errors import InvalidMetadataError
8
9
  from sdv.metadata.multi_table import MultiTableMetadata
9
10
  from sdv.metadata.single_table import SingleTableMetadata
@@ -61,11 +62,6 @@ class Metadata(MultiTableMetadata):
61
62
  instance._set_metadata_dict(metadata_dict, single_table_name)
62
63
  return instance
63
64
 
64
- @staticmethod
65
- def _validate_infer_sdtypes(infer_sdtypes):
66
- if not isinstance(infer_sdtypes, bool):
67
- raise ValueError("'infer_sdtypes' must be a boolean value.")
68
-
69
65
  @staticmethod
70
66
  def _validate_foreign_key_inference_algorithm(foreign_key_inference_algorithm):
71
67
  if foreign_key_inference_algorithm != 'column_name_match':
@@ -78,6 +74,7 @@ class Metadata(MultiTableMetadata):
78
74
  infer_sdtypes=True,
79
75
  infer_keys='primary_and_foreign',
80
76
  foreign_key_inference_algorithm='column_name_match',
77
+ verbose=False,
81
78
  ):
82
79
  if not data or not all(isinstance(df, pd.DataFrame) for df in data.values()):
83
80
  raise ValueError('The provided dictionary must contain only pandas DataFrame objects.')
@@ -86,16 +83,20 @@ class Metadata(MultiTableMetadata):
86
83
  "'infer_keys' must be one of: 'primary_and_foreign', 'primary_only', None."
87
84
  )
88
85
  cls._validate_foreign_key_inference_algorithm(foreign_key_inference_algorithm)
89
- cls._validate_infer_sdtypes(infer_sdtypes)
86
+ _validate_boolean_parameter(infer_sdtypes, 'infer_sdtypes')
90
87
 
91
88
  metadata = Metadata()
92
89
  for table_name, dataframe in data.items():
93
90
  metadata.detect_table_from_dataframe(
94
- table_name, dataframe, infer_sdtypes, None if infer_keys is None else 'primary_only'
91
+ table_name,
92
+ dataframe,
93
+ infer_sdtypes,
94
+ None if infer_keys is None else 'primary_only',
95
+ verbose,
95
96
  )
96
97
 
97
98
  if infer_keys == 'primary_and_foreign':
98
- metadata._detect_relationships(data, foreign_key_inference_algorithm)
99
+ metadata._detect_relationships(data, foreign_key_inference_algorithm, verbose)
99
100
 
100
101
  return metadata
101
102
 
@@ -106,6 +107,7 @@ class Metadata(MultiTableMetadata):
106
107
  infer_sdtypes=True,
107
108
  infer_keys='primary_and_foreign',
108
109
  foreign_key_inference_algorithm='column_name_match',
110
+ verbose=False,
109
111
  ):
110
112
  """Detect the metadata for all tables in a dictionary of dataframes.
111
113
 
@@ -130,6 +132,11 @@ class Metadata(MultiTableMetadata):
130
132
  foreign_key_inference_algorithm (str):
131
133
  Which algorithm to use for detecting foreign keys. Currently only one option,
132
134
  'column_name_match'. Defaults to 'column_name_match'.
135
+ verbose (bool):
136
+ A boolean that determines if information should be printed regarding detection.
137
+ If True, it prints out information about what is detected.
138
+ If False, it does not print out any information about what is detected.
139
+ Defaults to False.
133
140
 
134
141
  Returns:
135
142
  Metadata:
@@ -140,8 +147,25 @@ class Metadata(MultiTableMetadata):
140
147
  infer_sdtypes=infer_sdtypes,
141
148
  infer_keys=infer_keys,
142
149
  foreign_key_inference_algorithm=foreign_key_inference_algorithm,
150
+ verbose=verbose,
143
151
  )
144
152
 
153
+ @classmethod
154
+ def _detect_from_dataframe(
155
+ cls, data, table_name=None, infer_sdtypes=True, infer_keys='primary_only', verbose=False
156
+ ):
157
+ """Detect the metadata for a DataFrame."""
158
+ table_name = table_name or cls.DEFAULT_SINGLE_TABLE_NAME
159
+ if not isinstance(data, pd.DataFrame):
160
+ raise ValueError('The provided data must be a pandas DataFrame object.')
161
+ if infer_keys not in ['primary_only', None]:
162
+ raise ValueError("'infer_keys' must be one of: 'primary_only', None.")
163
+
164
+ _validate_boolean_parameter(infer_sdtypes, 'infer_sdtypes')
165
+ metadata = Metadata()
166
+ metadata.detect_table_from_dataframe(table_name, data, infer_sdtypes, infer_keys, verbose)
167
+ return metadata
168
+
145
169
  @classmethod
146
170
  def detect_from_dataframe(
147
171
  cls,
@@ -149,6 +173,7 @@ class Metadata(MultiTableMetadata):
149
173
  table_name=DEFAULT_SINGLE_TABLE_NAME,
150
174
  infer_sdtypes=True,
151
175
  infer_keys='primary_only',
176
+ verbose=False,
152
177
  ):
153
178
  """Detect the metadata for a DataFrame.
154
179
 
@@ -157,7 +182,10 @@ class Metadata(MultiTableMetadata):
157
182
 
158
183
  Args:
159
184
  data (pandas.DataFrame):
160
- Dictionary of table names to dataframes.
185
+ The data to detect metadata from.
186
+ table_name (str):
187
+ The name of the table to detect. If None, a default name will be used.
188
+ Defaults to None.
161
189
  infer_sdtypes (bool):
162
190
  A boolean describing whether to infer the sdtypes of each column.
163
191
  If True it infers the sdtypes based on the data.
@@ -168,20 +196,23 @@ class Metadata(MultiTableMetadata):
168
196
  - 'primary_only': Infer only the primary keys of each table
169
197
  - None: Do not infer any keys
170
198
  Defaults to 'primary_only'.
199
+ verbose (bool):
200
+ A boolean that determines if information should be printed regarding detection.
201
+ If True, it prints out information about what is detected.
202
+ If False, it does not print out any information about what is detected.
203
+ Defaults to False.
171
204
 
172
205
  Returns:
173
206
  Metadata:
174
207
  A new metadata object with the sdtypes detected from the data.
175
208
  """
176
- if not isinstance(data, pd.DataFrame):
177
- raise ValueError('The provided data must be a pandas DataFrame object.')
178
- if infer_keys not in ['primary_only', None]:
179
- raise ValueError("'infer_keys' must be one of: 'primary_only', None.")
180
- cls._validate_infer_sdtypes(infer_sdtypes)
181
-
182
- metadata = Metadata()
183
- metadata.detect_table_from_dataframe(table_name, data, infer_sdtypes, infer_keys)
184
- return metadata
209
+ return cls._detect_from_dataframe(
210
+ data=data,
211
+ table_name=table_name,
212
+ infer_sdtypes=infer_sdtypes,
213
+ infer_keys=infer_keys,
214
+ verbose=verbose,
215
+ )
185
216
 
186
217
  def _set_metadata_dict(self, metadata, single_table_name=None):
187
218
  """Set a ``metadata`` dictionary to the current instance.
@@ -3,6 +3,7 @@
3
3
  import datetime
4
4
  import json
5
5
  import logging
6
+ import sys
6
7
  import warnings
7
8
  from collections import defaultdict
8
9
  from copy import deepcopy
@@ -547,7 +548,7 @@ class MultiTableMetadata:
547
548
  f'The relationships in the dataset are disjointed. {table_msg}'
548
549
  )
549
550
 
550
- def _detect_foreign_keys_by_column_name(self, data):
551
+ def _detect_foreign_keys_by_column_name(self, data, verbose=False):
551
552
  """Detect the foreign keys based on if a column name matches a primary key.
552
553
 
553
554
  If a column name (a child table) is a primary key, it will also be considered
@@ -557,7 +558,15 @@ class MultiTableMetadata:
557
558
  data (dict):
558
559
  Dictionary of table names to dataframes.
559
560
  NOTE: this is only used in SDV-Enterprise.
561
+ verbose (bool):
562
+ A boolean that determines if information should be printed regarding detection.
563
+ If True, it prints out information about what is detected.
564
+ If False, it does not print out any information about what is detected.
565
+ Defaults to False.
560
566
  """
567
+ is_foreign_keys_found = False
568
+ if verbose:
569
+ sys.stdout.write('\nDetecting foreign keys:\n')
561
570
  for parent_candidate in self.tables.keys():
562
571
  primary_key = self.tables[parent_candidate].primary_key
563
572
  if primary_key is None:
@@ -573,15 +582,25 @@ class MultiTableMetadata:
573
582
  continue
574
583
 
575
584
  try:
585
+ sdtype_updated = False
576
586
  if pk_sdtype == 'id' and original_fk_sdtype != 'id':
577
587
  self.update_column(
578
588
  table_name=child_candidate,
579
589
  column_name=primary_key,
580
590
  sdtype='id',
581
591
  )
592
+ sdtype_updated = True
582
593
  self.add_relationship(
583
594
  parent_candidate, child_candidate, primary_key, primary_key
584
595
  )
596
+ is_foreign_keys_found = True
597
+ if verbose:
598
+ child_col = f"'{child_candidate}.{primary_key}'"
599
+ parent_col = f"'{parent_candidate}.{primary_key}'"
600
+ suffix = " (updating sdtype to 'id')" if sdtype_updated else ''
601
+ sys.stdout.write(
602
+ f'- Column {child_col} refers to column {parent_col}{suffix}\n'
603
+ )
585
604
 
586
605
  except InvalidMetadataError:
587
606
  # circular relationship
@@ -592,8 +611,12 @@ class MultiTableMetadata:
592
611
  **original_fk_meta,
593
612
  )
594
613
  continue
614
+ if verbose and not is_foreign_keys_found:
615
+ sys.stdout.write('- No foreign keys found\n')
595
616
 
596
- def _detect_relationships(self, data=None, foreign_key_inference_algorithm='column_name_match'):
617
+ def _detect_relationships(
618
+ self, data=None, foreign_key_inference_algorithm='column_name_match', verbose=False
619
+ ):
597
620
  """Automatically detect relationships between tables.
598
621
 
599
622
  Args:
@@ -603,12 +626,22 @@ class MultiTableMetadata:
603
626
  foreign_key_inference_algorithm (str):
604
627
  Which algorithm to use for detecting foreign keys. Currently only one option,
605
628
  'column_name_match'.
629
+ verbose (bool):
630
+ A boolean that determines if information should be printed regarding detection.
631
+ If True, it prints out information about what is detected.
632
+ If False, it does not print out any information about what is detected.
633
+ Defaults to False.
606
634
  """
607
635
  if foreign_key_inference_algorithm == 'column_name_match':
608
- self._detect_foreign_keys_by_column_name(data)
636
+ self._detect_foreign_keys_by_column_name(data, verbose)
609
637
 
610
638
  def detect_table_from_dataframe(
611
- self, table_name, data, infer_sdtypes=True, infer_keys='primary_only'
639
+ self,
640
+ table_name,
641
+ data,
642
+ infer_sdtypes=True,
643
+ infer_keys='primary_only',
644
+ verbose=False,
612
645
  ):
613
646
  """Detect the metadata for a table from a dataframe.
614
647
 
@@ -630,14 +663,19 @@ class MultiTableMetadata:
630
663
  - 'primary_only': Infer only the primary keys of each table
631
664
  - None: Do not infer any keys
632
665
  Defaults to 'primary_only'.
666
+ verbose (bool):
667
+ A boolean that determines if information should be printed regarding detection.
668
+ If True, it prints out information about what is detected.
669
+ If False, it does not print out any information about what is detected.
670
+ Defaults to False.
633
671
  """
634
672
  self._validate_table_not_detected(table_name)
635
673
  table = SingleTableMetadata()
636
- table._detect_columns(data, table_name, infer_sdtypes, infer_keys)
674
+ table._detect_columns(data, table_name, infer_sdtypes, infer_keys, verbose)
637
675
  self.tables[table_name] = table
638
676
  self._log_detected_table(table)
639
677
 
640
- def detect_from_dataframes(self, data):
678
+ def detect_from_dataframes(self, data, verbose=False):
641
679
  """Detect the metadata for all tables in a dictionary of dataframes.
642
680
 
643
681
  This method automatically detects the ``sdtypes`` for the given ``pandas.DataFrame``.
@@ -646,14 +684,19 @@ class MultiTableMetadata:
646
684
  Args:
647
685
  data (dict):
648
686
  Dictionary of table names to dataframes.
687
+ verbose (bool):
688
+ A boolean that determines if information should be printed regarding detection.
689
+ If True, it prints out information about what is detected.
690
+ If False, it does not print out any information about what is detected.
691
+ Defaults to False.
649
692
  """
650
693
  if not data or not all(isinstance(df, pd.DataFrame) for df in data.values()):
651
694
  raise ValueError('The provided dictionary must contain only pandas DataFrame objects.')
652
695
 
653
696
  for table_name, dataframe in data.items():
654
- self.detect_table_from_dataframe(table_name, dataframe)
697
+ self.detect_table_from_dataframe(table_name, dataframe, verbose=verbose)
655
698
 
656
- self._detect_relationships(data)
699
+ self._detect_relationships(data, verbose=verbose)
657
700
 
658
701
  def detect_from_csvs(self, folder_name, read_csv_parameters=None):
659
702
  """Detect the metadata for all tables in a folder of csv files.
@@ -3,6 +3,7 @@
3
3
  import json
4
4
  import logging
5
5
  import re
6
+ import sys
6
7
  import warnings
7
8
  from collections import Counter, defaultdict
8
9
  from copy import deepcopy
@@ -28,7 +29,13 @@ from sdv.errors import InvalidDataError
28
29
  from sdv.logging import get_sdv_logger
29
30
  from sdv.metadata.errors import InvalidMetadataError
30
31
  from sdv.metadata.metadata_upgrader import convert_metadata
31
- from sdv.metadata.utils import _validate_file_mode, read_json, validate_file_does_not_exist
32
+ from sdv.metadata.utils import (
33
+ _format_column_metadata,
34
+ _print_primary_key_detection,
35
+ _validate_file_mode,
36
+ read_json,
37
+ validate_file_does_not_exist,
38
+ )
32
39
  from sdv.metadata.visualization import (
33
40
  create_columns_node,
34
41
  create_summarized_columns_node,
@@ -651,7 +658,9 @@ class SingleTableMetadata:
651
658
  except Exception as e:
652
659
  self._handle_detection_error(e, column_name, table_name)
653
660
 
654
- def _select_primary_key(self, infer_sdtypes, pk_candidates, pii_pk_candidates):
661
+ def _select_primary_key(
662
+ self, infer_sdtypes, pk_candidates, pii_pk_candidates, table_name=None, verbose=False
663
+ ):
655
664
  """Select the primary key from a list of candidates.
656
665
 
657
666
  If there are any non-pii candidates, we select the first one. Otherwise, we select the
@@ -666,22 +675,48 @@ class SingleTableMetadata:
666
675
  A list of primary key candidates that aren't pii.
667
676
  pii_pk_candidates (list):
668
677
  A list of primary key candidates that are pii.
678
+ table_name (str):
679
+ The name of the table to be analyzed. Defaults to ``None``.
680
+ verbose (bool):
681
+ A boolean that determines if information should be printed regarding detection.
682
+ If True, it prints out information about what is detected.
683
+ If False, it does not print out any information about what is detected.
684
+ Defaults to False.
669
685
  """
686
+ if verbose:
687
+ table_str = f" for table '{table_name}'" if table_name else ''
688
+ sys.stdout.write(f'\nDetecting primary key{table_str}:\n')
689
+ chosen_pk = None
690
+ sdtype_updated = False
691
+ pii_removed = False
692
+
670
693
  if pk_candidates:
671
694
  selected_pk = pk_candidates[0]
672
695
  self.primary_key = selected_pk
696
+ original_sdtype = self.columns.get(self.primary_key, {}).get('sdtype')
673
697
  self.columns[self.primary_key]['sdtype'] = 'id'
698
+ chosen_pk = self.primary_key
699
+ sdtype_updated = original_sdtype != 'id'
674
700
 
675
701
  elif pii_pk_candidates:
676
702
  self.primary_key = pii_pk_candidates[0]
703
+ chosen_pk = self.primary_key
677
704
  if not infer_sdtypes:
705
+ original_sdtype = self.columns.get(self.primary_key, {}).get('sdtype')
678
706
  self.columns[self.primary_key]['sdtype'] = 'id'
707
+ sdtype_updated = original_sdtype != 'id'
679
708
 
680
709
  if self.primary_key and self.columns[self.primary_key].get('sdtype') == 'id':
681
710
  if self.columns[self.primary_key].get('pii') is not None:
682
711
  del self.columns[self.primary_key]['pii']
712
+ pii_removed = True
713
+
714
+ if verbose:
715
+ _print_primary_key_detection(chosen_pk, sdtype_updated, pii_removed)
683
716
 
684
- def _detect_columns(self, data, table_name=None, infer_sdtypes=True, infer_keys='primary_only'):
717
+ def _detect_columns(
718
+ self, data, table_name=None, infer_sdtypes=True, infer_keys='primary_only', verbose=False
719
+ ):
685
720
  """Detect metadata information for each column in the data.
686
721
 
687
722
  Args:
@@ -699,7 +734,16 @@ class SingleTableMetadata:
699
734
  - 'primary_only': Infer the primary keys.
700
735
  - None: Do not infer any keys.
701
736
  Defaults to 'primary_only'.
737
+ verbose (bool):
738
+ A boolean that determines if information should be printed regarding detection.
739
+ If True, it prints out information about what is detected.
740
+ If False, it does not print out any information about what is detected.
741
+ Defaults to False.
702
742
  """
743
+ if verbose and infer_sdtypes:
744
+ table_str = f"table '{table_name}'" if table_name else 'table'
745
+ sys.stdout.write(f'\nDetecting {table_str}:\n')
746
+
703
747
  old_columns = data.columns
704
748
  data.columns = data.columns.astype(str)
705
749
  pk_candidates = []
@@ -728,13 +772,19 @@ class SingleTableMetadata:
728
772
  column_dict['pii'] = True
729
773
 
730
774
  column_dict['sdtype'] = sdtype
731
- self.columns[field] = deepcopy(column_dict)
732
775
 
776
+ if verbose and infer_sdtypes:
777
+ column_metadata = _format_column_metadata(column_dict)
778
+ sys.stdout.write(f"- Column '{field}': {column_metadata}\n")
779
+
780
+ self.columns[field] = deepcopy(column_dict)
733
781
  if infer_keys == 'primary_only':
734
782
  self._select_primary_key(
735
783
  infer_sdtypes=infer_sdtypes,
736
784
  pk_candidates=pk_candidates,
737
785
  pii_pk_candidates=pii_pk_candidates,
786
+ table_name=table_name,
787
+ verbose=verbose,
738
788
  )
739
789
 
740
790
  self._updated = True
@@ -0,0 +1,82 @@
1
+ """Tools to generate strings from regular expressions."""
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+
8
+ def read_json(filepath):
9
+ """Validate and open a file path."""
10
+ filepath = Path(filepath)
11
+ if not filepath.exists():
12
+ raise ValueError(
13
+ f"A file named '{filepath.name}' does not exist. Please specify a different filename."
14
+ )
15
+
16
+ with open(filepath, 'r', encoding='utf-8') as metadata_file:
17
+ return json.load(metadata_file)
18
+
19
+
20
+ def validate_file_does_not_exist(filepath):
21
+ """Validate a file path doesn't exist."""
22
+ filepath = Path(filepath)
23
+ if filepath.exists():
24
+ raise ValueError(
25
+ f"A file named '{filepath.name}' already exists in this folder. Please specify "
26
+ 'a different filename.'
27
+ )
28
+
29
+
30
+ def _validate_file_mode(mode):
31
+ possible_modes = ['write', 'overwrite']
32
+ if mode not in possible_modes:
33
+ raise ValueError(f"Mode '{mode}' must be in {possible_modes}.")
34
+
35
+
36
+ def _format_metadata_value(value):
37
+ """Format a value for display, quoting only strings.
38
+
39
+ Args:
40
+ value:
41
+ The value to format. Boolean and None are returned as their
42
+ string representation; all other values are wrapped in single quotes.
43
+
44
+ Returns:
45
+ str:
46
+ The formatted value as a string.
47
+ """
48
+ if isinstance(value, bool) or value is None:
49
+ return str(value)
50
+ return f"'{value}'"
51
+
52
+
53
+ def _format_column_metadata(sdtype_info):
54
+ """Format a column's metadata dictionary as a display string, with sdtype first.
55
+
56
+ Args:
57
+ sdtype_info (dict):
58
+ A dictionary of column metadata (`{'sdtype': 'ssn', 'pii': False}`).
59
+
60
+ Returns:
61
+ str:
62
+ A comma-separated `key=value` string with 'sdtype' first.
63
+ (`sdtype='numerical', computer_representation='Float'`)
64
+ """
65
+ parts = [f'{k}={_format_metadata_value(v)}' for k, v in sdtype_info.items()]
66
+ parts.sort(key=lambda p: not p.startswith('sdtype='))
67
+ return ', '.join(parts)
68
+
69
+
70
+ def _print_primary_key_detection(chosen_pk, sdtype_updated, pii_removed):
71
+ if not chosen_pk:
72
+ sys.stdout.write('- No primary key found\n')
73
+ return
74
+
75
+ notes = []
76
+ if sdtype_updated:
77
+ notes.append("updating sdtype to 'id'")
78
+ if pii_removed:
79
+ notes.append("removing 'pii' field")
80
+
81
+ suffix = f' ({", ".join(notes)})' if notes else ''
82
+ sys.stdout.write(f"- primary_key='{chosen_pk}'{suffix}\n")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sdv
3
- Version: 1.35.2.dev0
3
+ Version: 1.36.1.dev0
4
4
  Summary: Generate synthetic data for single table, multi table and sequential data
5
5
  Author-email: "DataCebo, Inc." <info@sdv.dev>
6
6
  License-Expression: BUSL-1.1
@@ -1,32 +0,0 @@
1
- """Tools to generate strings from regular expressions."""
2
-
3
- import json
4
- from pathlib import Path
5
-
6
-
7
- def read_json(filepath):
8
- """Validate and open a file path."""
9
- filepath = Path(filepath)
10
- if not filepath.exists():
11
- raise ValueError(
12
- f"A file named '{filepath.name}' does not exist. Please specify a different filename."
13
- )
14
-
15
- with open(filepath, 'r', encoding='utf-8') as metadata_file:
16
- return json.load(metadata_file)
17
-
18
-
19
- def validate_file_does_not_exist(filepath):
20
- """Validate a file path doesn't exist."""
21
- filepath = Path(filepath)
22
- if filepath.exists():
23
- raise ValueError(
24
- f"A file named '{filepath.name}' already exists in this folder. Please specify "
25
- 'a different filename.'
26
- )
27
-
28
-
29
- def _validate_file_mode(mode):
30
- possible_modes = ['write', 'overwrite']
31
- if mode not in possible_modes:
32
- raise ValueError(f"Mode '{mode}' must be in {possible_modes}.")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes