tdfs4ds 0.2.4.2__py3-none-any.whl → 0.2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = '0.2.4.2'
+__version__ = '0.2.4.3'
 import logging
 # Setup the logger
 logging.basicConfig(
@@ -11,6 +11,7 @@ logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
 from tdfs4ds.process_store.process_followup import follow_up_report
+from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
 
 DATA_DOMAIN = None
 SCHEMA = None
@@ -19,6 +20,7 @@ FEATURE_CATALOG_NAME_VIEW = 'FS_V_FEATURE_CATALOG'
 PROCESS_CATALOG_NAME = 'FS_PROCESS_CATALOG'
 PROCESS_CATALOG_NAME_VIEW = 'FS_V_PROCESS_CATALOG'
 PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT = 'FS_V_PROCESS_CATALOG_FEATURE_SPLIT'
+DATASET_CATALOG_NAME = 'FS_DATASET'
 
 DATA_DISTRIBUTION_NAME = 'FS_DATA_DISTRIBUTION'
 FOLLOW_UP_NAME = 'FS_FOLLOW_UP'
@@ -125,6 +127,8 @@ def setup(database, if_exists='fail'):
             tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
         except Exception as e:
             print(str(e).split('\n')[0])
+
+        DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
         print('feature catalog table: ', tdfs4ds.FEATURE_CATALOG_NAME, ' in database ', database)
@@ -146,18 +150,22 @@ def setup(database, if_exists='fail'):
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+    dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
+    if not dataset_catalog._exists():
+        dataset_catalog.create_catalog()
 
     return
 
 def connect(
-        database = tdfs4ds.SCHEMA,
-        feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
-        process_catalog_name = tdfs4ds.PROCESS_CATALOG_NAME,
-        data_distribution_name = tdfs4ds.DATA_DISTRIBUTION_NAME,
-        filter_manager_name = tdfs4ds.FILTER_MANAGER_NAME,
-        followup_name = tdfs4ds.FOLLOW_UP_NAME,
+        database = tdfs4ds.SCHEMA,
+        feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
+        process_catalog_name = tdfs4ds.PROCESS_CATALOG_NAME,
+        data_distribution_name = tdfs4ds.DATA_DISTRIBUTION_NAME,
+        filter_manager_name = tdfs4ds.FILTER_MANAGER_NAME,
+        followup_name = tdfs4ds.FOLLOW_UP_NAME,
         feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
-        process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW
+        process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
+        dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME
 ):
     """
     Configures the database environment by setting schema names and checking the existence of specified catalog tables.
@@ -197,7 +205,8 @@ def connect(
     distrib_exists = data_distribution_name.lower() in tables
     filter_manager_exists = filter_manager_name.lower() in tables
     followup_name_exists = followup_name.lower() in tables
-
+
+
     if followup_name_exists:
         tdfs4ds.FOLLOW_UP_NAME = followup_name
     else:
@@ -211,6 +220,7 @@ def connect(
     tdfs4ds.FILTER_MANAGER_NAME = filter_manager_name
     tdfs4ds.PROCESS_CATALOG_NAME_VIEW = process_catalog_name_view
     tdfs4ds.FEATURE_CATALOG_NAME_VIEW = feature_catalog_name_view
+
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
@@ -237,6 +247,11 @@ def connect(
     def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
                                                          schema_name=tdfs4ds.SCHEMA, object_type='table')
+
+    tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
+    dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
+    if not dataset_catalog._exists():
+        dataset_catalog.create_catalog()
 
     if is_data_distribution_temporal():
         tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = True
@@ -279,7 +294,21 @@ def process_catalog():
     """
     return tdfs4ds.process_store.process_query_administration.list_processes()
 
+def dataset_catalog():
+    """
+    Retrieve a list of all datasets registered in the dataset store.
+
+    This function performs a query against the dataset store to gather a list of all
+    datasets that have been registered and are administrable.
+
+    """
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).catalog
+
+def get_dataset_entity(dataset_id = None):
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_entity(dataset_id)
 
+def get_dataset_features(dataset_id = None):
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
 def run(process_id, return_dataset = False, force_compute = False):
     """
@@ -1064,6 +1093,12 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
         tdfs4ds.logger.info(f"Adding a comment to the view {view_name} in the {schema_name} database.")
         tdml.execute_sql(f"COMMENT ON VIEW {schema_name}.{view_name} IS '{comment}'")
 
+    # build the dataset object
+    tdfs4ds.logger.info(f"Creation of the dataset object.")
+    dataset = Dataset(view_name=view_name, schema_name=schema_name)
+    tdfs4ds.logger.info(f"Registering of the dataset in the dataset catalog.")
+    DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).add_dataset(dataset=dataset)
+
     # Return the query or the DataFrame based on the `return_query` flag
     if return_query:
         tdfs4ds.logger.info("Returning the generated dataset query.")
tdfs4ds/utils/info.py CHANGED
@@ -47,7 +47,7 @@ def get_column_types(df, columns):
 
 
 
-def get_column_types_simple(df, columns):
+def get_column_types_simple(df, columns = None):
     """
     Retrieve simplified column types for specified columns from a DataFrame.
 
@@ -71,6 +71,9 @@ def get_column_types_simple(df, columns):
     """
 
     # Ensure that the columns parameter is in list format
+    if columns is None:
+        columns = df.columns
+
     if type(columns) != list:
         columns = [columns]
 
@@ -193,7 +196,7 @@ def generate_partitioning_clause(partitioning):
     {partitioning}
     )"""
 
-def get_feature_types_sql_format(tddf, columns):
+def get_feature_types_sql_format(tddf, columns = None):
     """
     Retrieve the SQL data types of specified columns from a Teradata dataframe.
 
@@ -228,6 +231,10 @@ def get_feature_types_sql_format(tddf, columns):
         'programming': 'VARCHAR(30)',
         'admitted': 'INTEGER'}
     """
+
+    if columns is None:
+        columns = tddf.columns
+
     # Validate inputs
     if not isinstance(tddf, tdml.DataFrame):
         raise TypeError("tddf must be an instance of tdml.DataFrame")
{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tdfs4ds
-Version: 0.2.4.2
+Version: 0.2.4.3
 Summary: A python package to simplify the usage of feature store using Teradata Vantage ...
 Author: Denis Molin
 Requires-Python: >=3.6
{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/RECORD RENAMED
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
 tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
 tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
 tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
-tdfs4ds/__init__.py,sha256=FJ9hllt1QYfh_5dOmt2BslPL-YoKSAb7MRaprF8Z_vU,62202
+tdfs4ds/__init__.py,sha256=OOakI_WdX1fjXTheqqLMUQY99apaGFXdEYg_SQpWQng,63986
 tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
 tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
 tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -21,12 +21,12 @@ tdfs4ds/process_store/process_registration_management.py,sha256=F8VlBoL-de98KnkM
 tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIqPVbHpuIyyvsaNrek6b1iPk8avJMI,16088
 tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
 tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
-tdfs4ds/utils/info.py,sha256=N036s8h2AqJ7HPd6OBgLb1V3qUS6V1jtalPNW4Dld6c,10414
+tdfs4ds/utils/info.py,sha256=lc9-rQDfM4NWnZGkSUkY_G0qYx7qnoErNKKcYMuLIRs,10554
 tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
 tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
 tdfs4ds/utils/time_management.py,sha256=_jbwdyZH4Yr3VzbUrq6X93FpXDCDEdH0iv56vX7j8mA,8446
 tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
-tdfs4ds-0.2.4.2.dist-info/METADATA,sha256=sFZ7UgcZ_2xj9XFQsMPOAhh1O8MrbWYzmijfs4f5Shk,11944
-tdfs4ds-0.2.4.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-tdfs4ds-0.2.4.2.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
-tdfs4ds-0.2.4.2.dist-info/RECORD,,
+tdfs4ds-0.2.4.3.dist-info/METADATA,sha256=dUqe-90oXLdYx2U6F-WmeQDHhAFN_vvZrFfVuYGmTn8,11944
+tdfs4ds-0.2.4.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+tdfs4ds-0.2.4.3.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+tdfs4ds-0.2.4.3.dist-info/RECORD,,