tdfs4ds-0.2.4.3-py3-none-any.whl → tdfs4ds-0.2.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = '0.2.4.3'
+ __version__ = '0.2.4.5'
  import logging
  # Setup the logger
  logging.basicConfig(
@@ -310,7 +310,7 @@ def get_dataset_entity(dataset_id = None):
  def get_dataset_features(dataset_id = None):
  return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)

- def run(process_id, return_dataset = False, force_compute = False):
+ def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
  """
  Executes a specific process from the feature store identified by the process ID.
  The function handles different process types and performs appropriate actions.
@@ -321,6 +321,10 @@ def run(process_id, return_dataset = False, force_compute = False):
  Default is False.
  - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
  Default is False.
+ - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the
+ VARCHAR type, casts the VARCHAR features to VARCHAR(k x force_varchar_length),
+ where k is the smallest integer such that the original length is smaller than or equal
+ to k x force_varchar_length. Default is None.

  Returns:
  DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
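For clarity, a small sketch of the rounding rule described above (not part of the package; the helper name is illustrative): with force_varchar_length = 1024, a VARCHAR(300) feature is cast to VARCHAR(1024) and a VARCHAR(1500) feature to VARCHAR(2048).

import math

def rounded_varchar_length(original_length, force_varchar_length):
    # k is the smallest integer with original_length <= k * force_varchar_length
    k = math.ceil(original_length / force_varchar_length)
    return k * force_varchar_length

print(rounded_varchar_length(300, 1024))   # 1024
print(rounded_varchar_length(1500, 1024))  # 2048
print(rounded_varchar_length(1024, 1024))  # 1024 (unchanged)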
@@ -423,7 +427,8 @@ def run(process_id, return_dataset = False, force_compute = False):
  filtermanager = filtermanager,
  entity_null_substitute = entity_null_substitute,
  process_id = process_id,
- force_compute= force_compute
+ force_compute= force_compute,
+ force_varchar_length = force_varchar_length
  )

  # Handling 'tdstone2 view' process type
@@ -437,7 +442,7 @@ def run(process_id, return_dataset = False, force_compute = False):
  else:
  return

- def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
+ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
  """
  Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
  process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
@@ -463,7 +468,10 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
  Default is an empty dictionary.
  - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
  Default is True.
-
+ - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the
+ VARCHAR type, casts the VARCHAR features to VARCHAR(k x force_varchar_length),
+ where k is the smallest integer such that the original length is smaller than or equal
+ to k x force_varchar_length. Default is 1024.
  Returns:
  DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
  or further processing.
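A minimal usage sketch of the new parameter, assuming a working teradataml connection; the table, entity and feature names below are hypothetical:

import teradataml as tdml
import tdfs4ds

df = tdml.DataFrame('my_source_table')             # hypothetical source table
dataset = tdfs4ds.upload_features(
    df,
    entity_id=['customer_id'],                     # hypothetical entity column
    feature_names=['segment_label', 'avg_spend'],  # hypothetical feature columns
    force_varchar_length=1024                      # VARCHAR features rounded up to multiples of 1024
)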
@@ -575,7 +583,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N

  try:

- dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute)
+ dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)

  except Exception as e:
  tdfs4ds.process_store.process_followup.followup_close(
@@ -591,7 +599,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
  else:

  try:
- run(process_id=process_id, return_dataset=False)
+ run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
  except Exception as e:
  tdfs4ds.process_store.process_followup.followup_close(
  run_id = tdfs4ds.RUN_ID,
@@ -605,7 +613,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
  tdfs4ds.PROCESS_TYPE = PROCESS_TYPE

  def _upload_features(df, entity_id, feature_names,
- feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False):
+ feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
  """
  Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
  feature registration, preparation for ingestion, and storage in the designated feature tables.
@@ -628,6 +636,11 @@ def _upload_features(df, entity_id, feature_names,
  - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
  - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
  Default is False.
+ - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the
+ VARCHAR type, casts the VARCHAR features to VARCHAR(k x force_varchar_length),
+ where k is the smallest integer such that the original length is smaller than or equal
+ to k x force_varchar_length. Default is None.
+

  Returns:
  DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
@@ -655,7 +668,7 @@ def _upload_features(df, entity_id, feature_names,
  from tdfs4ds.feature_store.feature_store_management import register_features
  from tdfs4ds.feature_store.feature_data_processing import prepare_feature_ingestion
  from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
- from tdfs4ds.utils.info import get_column_types
+ from tdfs4ds.utils.info import get_column_types, update_varchar_length

  # Convert entity_id to a dictionary if it's not already one
  if type(entity_id) == list:
@@ -685,6 +698,10 @@ def _upload_features(df, entity_id, feature_names,
  entity_id=entity_id
  )

+ if force_varchar_length is not None:
+ print(feature_names_types)
+ feature_names_types = update_varchar_length(feature_names_types,new_varchar_length = force_varchar_length)
+
  def validate_feature_types(feature_names_types):
  """
  Validates feature data types and raises an error if any value contains
tdfs4ds/dataset/dataset.py ADDED
@@ -0,0 +1,117 @@
+ import teradataml as tdml
+ from tdfs4ds.utils.info import get_feature_types_sql_format
+ from tdfs4ds import logger
+
+ class Dataset:
+
+ def __init__(self, view_name=None, schema_name=None, df=None):
+
+ if df is not None:
+ self.df = df
+ df._DataFrame__execute_node_and_set_table_name(df._nodeid, df._metaexpr)
+ view_name = df._table_name
+ if '.' in view_name:
+ self.view_name = view_name.split('.')[1]
+ self.schema_name = view_name.split('.')[0]
+ else:
+ self.view_name = view_name
+ self.schema_name = tdml.context.context._get_current_databasename()
+ elif view_name is not None and schema_name is not None:
+ self.view_name = view_name
+ self.schema_name = schema_name
+ if view_name.lower() in map(str.lower, tdml.db_list_tables(object_type='view', schema_name=self.schema_name).TableName.values):
+ self.df = tdml.DataFrame(tdml.in_schema(schema_name, view_name))
+ else:
+ print(f"{self.view_name} not found in {self.schema_name} database")
+ self.df = None
+ else:
+ raise ValueError("Either df or both view_name and schema_name must be provided.")
+
+ self.valid_time = self._get_validtime()
+ self.dataset_type = self._get_dataset_type()
+ self.entity, self.features = self._retrieve_entities_and_features()
+
+
+ def get_dataframe(self):
+ return self.df
+
+ def __repr__(self):
+ return f"Dataset(view_name={self.view_name}, schema_name={self.schema_name}, df={type(self.df)})"
+
+ def __getattr__(self, item):
+ if self.df is not None:
+ return getattr(self.df, item)
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'")
+
+ def _retrieve_entities_and_features(self):
+
+ if self._get_dataset_type() == 'snapshot':
+
+ blocks = [x.split(')')[0] for x in self._get_ddl().split('(')]
+ feature_names = [blocks[i].replace('\n','').split('AS ')[1].split('FROM')[0].strip() for i in range(1,len(blocks)) if i % 2 == 1]
+ feature_ids = [int(blocks[i].replace('\n','').split('=')[1].split('AND')[0].strip()) for i in range(1,len(blocks)) if i % 2 == 0]
+ feature_versions = [blocks[i].replace('\n','').split('=')[2].replace("'",'').strip() for i in range(1,len(blocks)) if i % 2 == 0]
+
+ feature_database = [blocks[i].replace('\n','').split('"')[1].strip() for i in range(1,len(blocks)) if i % 2 == 1]
+ feature_view = [blocks[i].replace('\n','').split('"')[3].strip() for i in range(1,len(blocks)) if i % 2 == 1]
+
+ columns_types = get_feature_types_sql_format(self.df)
+ feature_types = [columns_types[f] for f in feature_names]
+
+ features = {}
+ for n,i,v,t,d,vv in zip(feature_names, feature_ids, feature_versions, feature_types, feature_database, feature_view):
+ features[n.upper()] = {'id' : i, 'version': v, 'type': t.upper(), 'database' : d.upper(), 'view' : vv.upper()}
+
+
+ entity_names = [x.strip().split('.')[1] for x in blocks[0].split('SELECT')[1].split('FROM')[0].replace('\n','').split(',') if x.strip().startswith('A1') if x.strip().split('.')[1] not in feature_names]
+ entity_types = [columns_types[e] for e in entity_names]
+
+ entity = {}
+ for n,t in zip(entity_names, entity_types):
+ entity[n] = t
+
+ return entity, features
+ else:
+ logger.error(f"not implemented yet for dataset type: {self._get_dataset_type()}")
+ raise NotImplementedError(f"not implemented yet for dataset type: {self._get_dataset_type()}")
+
+ def _get_dataset_type(self):
+ return 'snapshot'
+
+ def _get_validtime(self):
+ if self._get_dataset_type() == 'snapshot':
+ return self._get_ddl().split('\n')[4].strip()
+ else:
+ logger.error(f"not implemented yet for dataset type: {self._get_dataset_type()}")
+ return ''
+
+ def _get_feature_store_database(self):
+
+ databases = [self.features[k]['database'] for k in self.features.keys()]
+ databases = list(set(databases))
+ if len(databases) == 1:
+ self.feature_store_database = databases[0]
+ elif len(databases) > 1:
+ logger.warning(f"features are stored in multiple databases: {databases}")
+ else:
+ logger.error("unable to identify the feature store database")
+ raise ValueError("unable to identify the feature store database")
+
+
+ def _get_ddl(self):
+ return tdml.execute_sql(f"SHOW VIEW {self.schema_name}.{self.view_name}").fetchall()[0][0].replace('\r','\n')
+
+ def show_query(self):
+ if self.df is not None:
+ print(self._get_ddl())
+
+ def info(self):
+ print("\nEntities:")
+ for key, value in self.entity.items():
+ print(f" - {key}: {value}")
+
+ print("\nFeatures:")
+ for feature, details in self.features.items():
+ print(f" - {feature}:")
+ for detail_key, detail_value in details.items():
+ print(f" {detail_key}: {detail_value}")
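A brief sketch of how this new class might be used, assuming a connected teradataml session and an existing snapshot dataset view; the view and schema names are hypothetical:

from tdfs4ds.dataset.dataset import Dataset

# build the object from an existing dataset view
ds = Dataset(view_name='MY_DATASET_VIEW', schema_name='MY_SCHEMA')

ds.info()                # prints the entities and the features (id, version, type, database, view)
ds.show_query()          # prints the DDL of the underlying view
df = ds.get_dataframe()  # teradataml DataFrame backing the dataset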
tdfs4ds/dataset/dataset_catalog.py ADDED
@@ -0,0 +1,373 @@
+ from tdfs4ds import logger
+ import uuid
+ from tdfs4ds.dataset.dataset import Dataset
+ import teradataml as tdml
+ import json
+
+ class DatasetCatalog:
+
+ def __init__(self, schema_name = None, name = 'DATASET'):
+ if schema_name is None:
+ self.schema_name = tdml.context.context._get_current_databasename()
+ else:
+ self.schema_name = schema_name
+ self.name = name
+
+ self.catalog_table_name = f"{self.schema_name}.FS_{self.name}_CATALOG"
+ self.catalog_view_name = f"{self.schema_name}.FS_V_{self.name}_CATALOG"
+ self.entity_table_name = f"{self.schema_name}.FS_{self.name}_ENTITY"
+ self.entity_view_name = f"{self.schema_name}.FS_V_{self.name}_ENTITY"
+ self.feature_table_name = f"{self.schema_name}.FS_{self.name}_FEATURES"
+ self.feature_view_name = f"{self.schema_name}.FS_V_{self.name}_FEATURES"
+
+ self.creation_queries = self._creation_query()
+ if not self._exists():
+ self.create_catalog()
+
+ self.catalog = tdml.DataFrame(tdml.in_schema(self.catalog_view_name.split('.')[0],self.catalog_view_name.split('.')[1]))
+ self.entity = tdml.DataFrame(tdml.in_schema(self.entity_view_name.split('.')[0],self.entity_view_name.split('.')[1]))
+ self.features = tdml.DataFrame(tdml.in_schema(self.feature_view_name.split('.')[0],self.feature_view_name.split('.')[1]))
+
+ def __repr__(self):
+ return f"DatasetCatalog(catalog_view={self.catalog_view_name}, entity_view={self.entity_view_name}, feature_view={self.feature_view_name})"
+
+ def __getattr__(self, item):
+ if self.catalog is not None:
+ return getattr(self.catalog, item)
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'")
+
+ def _creation_query(self):
+
+ if self.schema_name is not None and self.name is not None:
+
+ query_dataset_catalog = f"""
+ CREATE MULTISET TABLE {self.catalog_table_name},
+ FALLBACK,
+ NO BEFORE JOURNAL,
+ NO AFTER JOURNAL,
+ CHECKSUM = DEFAULT,
+ DEFAULT MERGEBLOCKRATIO,
+ MAP = TD_MAP1
+ (
+ DATASET_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ DATASET_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ DATASET_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ DATASET_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ DATASET_VALIDTIME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ METADATA JSON(32000) CHARACTER SET LATIN,
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
+ )
+ PRIMARY INDEX (DATASET_ID);
+ """
+
+ query_dataset_catalog_view = f"""
+ CREATE VIEW {self.catalog_view_name} AS
+ LOCK ROW FOR ACCESS
+ CURRENT VALIDTIME
+ SELECT *
+ FROM {self.catalog_table_name}
+ """
+
+ query_dataset_entity = f"""
+ CREATE MULTISET TABLE {self.entity_table_name},
+ FALLBACK,
+ NO BEFORE JOURNAL,
+ NO AFTER JOURNAL,
+ CHECKSUM = DEFAULT,
+ DEFAULT MERGEBLOCKRATIO,
+ MAP = TD_MAP1
+ (
+ DATASET_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ ENTITY VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ ENTITY_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
+ )
+ PRIMARY INDEX (DATASET_ID);
+ """
+
+ query_dataset_entity_view = f"""
+ CREATE VIEW {self.entity_view_name} AS
+ LOCK ROW FOR ACCESS
+ CURRENT VALIDTIME
+ SELECT *
+ FROM {self.entity_table_name}
+ """
+
+ query_dataset_features = f"""
+ CREATE MULTISET TABLE {self.feature_table_name},
+ FALLBACK,
+ NO BEFORE JOURNAL,
+ NO AFTER JOURNAL,
+ CHECKSUM = DEFAULT,
+ DEFAULT MERGEBLOCKRATIO,
+ MAP = TD_MAP1
+ (
+ DATASET_ID VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ FEATURE_ID VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ FEATURE_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
+ ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
+ PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
+ )
+ PRIMARY INDEX (DATASET_ID);
+ """
+
+ query_dataset_feature_view = f"""
+ CREATE VIEW {self.feature_view_name} AS
+ LOCK ROW FOR ACCESS
+ CURRENT VALIDTIME
+ SELECT *
+ FROM {self.feature_table_name}
+ """
+
+ queries = [
+ {'name' : f'{self.catalog_table_name}', 'type': 'table', 'query': query_dataset_catalog},
+ {'name' : f'{self.entity_table_name}', 'type': 'table', 'query': query_dataset_entity},
+ {'name' : f'{self.feature_table_name}', 'type': 'table', 'query': query_dataset_features},
+ {'name' : f'{self.catalog_view_name}', 'type': 'view', 'query': query_dataset_catalog_view},
+ {'name' : f'{self.entity_view_name}', 'type': 'view', 'query': query_dataset_entity_view},
+ {'name' : f'{self.feature_view_name}', 'type': 'view', 'query': query_dataset_feature_view}
+ ]
+
+ return queries
+ else:
+ logger.error('the schema name is not defined')
+ raise ValueError("the schema name is not defined")
+
+ def _get_list_objects(self):
+ return [self.catalog_table_name, self.entity_table_name, self.feature_table_name, self.catalog_view_name, self.entity_view_name, self.feature_view_name]
+
+ def create_catalog(self, schema_name = None):
+
+ if schema_name is not None:
+ self.schema_name = schema_name
+ self.catalog_table_name = f"{self.schema_name}.{self.name}_CATALOG"
+ self.catalog_view_name = f"{self.schema_name}.V_{self.name}_CATALOG"
+ self.entity_table_name = f"{self.schema_name}.{self.name}_ENTITY"
+ self.entity_view_name = f"{self.schema_name}.V_{self.name}_ENTITY"
+ self.feature_table_name = f"{self.schema_name}.{self.name}_FEATURES"
+ self.feature_view_name = f"{self.schema_name}.V_{self.name}_FEATURES"
+
+
+ self.creation_queries = self._creation_query()
+ already_exists = [v for v in self._get_list_objects() if v.lower().split('.')[1] in map(str.lower, tdml.db_list_tables(schema_name = self.schema_name).TableName.values)]
+
+ if len(already_exists) > 0:
+ msg = f"The dataset catalog cannot be created because these tables already exist : {already_exists}"
+ logger.error(msg)
+ raise ValueError(msg)
+ else:
+ for query in self.creation_queries:
+ logger.info(f"creation of {query['name']}")
+ tdml.execute_sql(query['query'])
+
+ def drop_catalog(self):
+
+ for query in self.creation_queries:
+ logger.info(f"drop {query['name']}")
+ if query['type'] == 'table':
+ tdml.execute_sql(f"DROP TABLE {query['name']}")
+ elif query['type'] == 'view':
+ tdml.execute_sql(f"DROP VIEW {query['name']}")
+
+ def _exists(self):
+ not_exists = [v for v in self._get_list_objects() if v.lower().split('.')[1] not in map(str.lower, tdml.db_list_tables(schema_name = self.schema_name).TableName.values)]
+ return not_exists == []
+
+ def add_dataset(self, dataset, metadata = {}):
+
+ # if dataset exists:
+ res = self.catalog[(self.catalog.DATASET_NAME == dataset.view_name.upper())&(self.catalog.DATASET_DATABASE == dataset.schema_name.upper())]
+ if res.shape[0] == 1:
+ logger.info('this dataset is already present and will be updated')
+ print(res[['DATASET_ID', 'DATASET_NAME', 'DATASET_DATABASE']])
+ dataset_id = res[['DATASET_ID']].to_pandas().DATASET_ID.values[0]
+
+ entity = tdml.DataFrame(tdml.in_schema(self.entity_view_name.split('.')[0],self.entity_view_name.split('.')[1]))
+ existing_entity = entity[entity.DATASET_ID == dataset_id].to_pandas()
+
+ features = tdml.DataFrame(tdml.in_schema(self.feature_view_name.split('.')[0],self.feature_view_name.split('.')[1]))
+ existing_features = features[features.DATASET_ID == dataset_id].to_pandas()
+
+ elif res.shape[0] == 0:
+ dataset_id = str(uuid.uuid4())
+ existing_entity = None
+ existing_features = None
+ logger.info('the dataset is new and will be registered')
+ else:
+ logger.error('there is more than one dataset with the same id')
+ raise ValueError('there is more than one dataset with the same id')
+ logger.info(f'dataset is : {dataset_id}')
+
+ query_insert_catalog = f"""
+ CURRENT VALIDTIME
+ MERGE INTO {self.catalog_table_name} EXISTING
+ USING (
+ SEL
+ '{dataset_id}' AS DATASET_ID
+ , '{dataset.view_name}' AS DATASET_NAME
+ , '{dataset.schema_name}' AS DATASET_DATABASE
+ , '{dataset.dataset_type}' AS DATASET_TYPE
+ , '{dataset.valid_time}' AS DATASET_VALIDTIME
+ , '{json.dumps(metadata).replace("'", '"')}' AS METADATA
+ ) UPDATED
+ ON EXISTING.DATASET_ID = UPDATED.DATASET_ID
+ WHEN MATCHED THEN
+ UPDATE
+ SET
+ DATASET_NAME = UPDATED.DATASET_NAME
+ , DATASET_DATABASE = UPDATED.DATASET_DATABASE
+ , DATASET_TYPE = UPDATED.DATASET_TYPE
+ , DATASET_VALIDTIME = UPDATED.DATASET_VALIDTIME
+ , METADATA = UPDATED.METADATA
+ WHEN NOT MATCHED THEN
+ INSERT (
+ UPDATED.DATASET_ID,
+ UPDATED.DATASET_NAME,
+ UPDATED.DATASET_DATABASE,
+ UPDATED.DATASET_TYPE,
+ UPDATED.DATASET_VALIDTIME,
+ UPDATED.METADATA
+ )
+ """
+
+ updated_entity = dataset.entity
+ if existing_entity is not None:
+ dropped_entity = [e for e in existing_entity.ENTITY.values if e.lower() not in map(str.lower, updated_entity.keys())]
+ else:
+ dropped_entity = []
+
+ logger.info(f"entity to update : {list(updated_entity.keys())}")
+ logger.info(f"entity to drop : {dropped_entity}")
+
+ query_insert_entity = []
+ for k,v in updated_entity.items():
+ query_insert_entity_ = f"""
+ CURRENT VALIDTIME
+ MERGE INTO {self.entity_table_name} EXISTING
+ USING (
+ SEL
+ '{dataset_id}' AS DATASET_ID
+ , '{k}' AS ENTITY
+ , '{v}' AS ENTITY_TYPE
+ ) UPDATED
+ ON EXISTING.DATASET_ID = UPDATED.DATASET_ID
+ AND EXISTING.ENTITY = UPDATED.ENTITY
+ WHEN MATCHED THEN
+ UPDATE
+ SET
+ ENTITY_TYPE = UPDATED.ENTITY_TYPE
+
+ WHEN NOT MATCHED THEN
+ INSERT (
+ UPDATED.DATASET_ID,
+ UPDATED.ENTITY,
+ UPDATED.ENTITY_TYPE
+ )
+ """
+ query_insert_entity.append(query_insert_entity_)
+
+ for k in dropped_entity:
+ query_insert_entity_ = f"""
+ CURRENT VALIDTIME
+ DELETE {self.entity_table_name} WHERE DATASET_ID = '{dataset_id}' AND ENTITY = '{k}'
+ """
+ query_insert_entity.append(query_insert_entity_)
+
+ updated_features = dataset.features
+
+ if existing_features is not None:
+ dropped_features = [f for f in existing_features.FEATURE_NAME.values if f.lower() not in map(str.lower, updated_features.keys())]
+ else:
+ dropped_features = []
+
+ logger.info(f"features to update : {list(updated_features.keys())}")
+ logger.info(f"features to drop : {dropped_features}")
+
+ query_insert_features = []
+ for k,v in updated_features.items():
+ query_insert_feature_ = f"""
+ CURRENT VALIDTIME
+ MERGE INTO {self.feature_table_name} EXISTING
+ USING (
+ SEL
+ '{dataset_id}' AS DATASET_ID
+ , {v['id']} AS FEATURE_ID
+ , '{k}' AS FEATURE_NAME
+ , '{v['type']}' AS FEATURE_TYPE
+ , '{v['database']}' AS FEATURE_DATABASE
+ , '{v['view']}' AS FEATURE_VIEW
+ ) UPDATED
+ ON EXISTING.DATASET_ID = UPDATED.DATASET_ID
+ AND EXISTING.FEATURE_NAME = UPDATED.FEATURE_NAME
+ WHEN MATCHED THEN
+ UPDATE
+ SET
+ FEATURE_ID = UPDATED.FEATURE_ID
+ , FEATURE_TYPE = UPDATED.FEATURE_TYPE
+ , FEATURE_DATABASE = UPDATED.FEATURE_DATABASE
+ , FEATURE_VIEW = UPDATED.FEATURE_VIEW
+ WHEN NOT MATCHED THEN
+ INSERT (
+ UPDATED.DATASET_ID,
+ UPDATED.FEATURE_ID,
+ UPDATED.FEATURE_NAME,
+ UPDATED.FEATURE_TYPE,
+ UPDATED.FEATURE_DATABASE,
+ UPDATED.FEATURE_VIEW
+ )
+ """
+ query_insert_features.append(query_insert_feature_)
+
+ for k in dropped_features:
+ query_insert_feature_ = f"""
+ CURRENT VALIDTIME
+ DELETE {self.feature_table_name} WHERE DATASET_ID = '{dataset_id}' AND FEATURE_NAME = '{k}'
+ """
+ query_insert_features.append(query_insert_feature_)
+
+ queries = [query_insert_catalog] + query_insert_entity + query_insert_features
+ for query in queries:
+ logger.info(query.split('\n')[2].strip())
+ tdml.execute_sql(query)
+
+ def drop_dataset(self, dataset_id):
+ if self.catalog[self.catalog.DATASET_ID == dataset_id].shape[0] == 1:
+ query_drop_feature = f"""
+ CURRENT VALIDTIME
+ DELETE {self.feature_table_name} WHERE DATASET_ID = '{dataset_id}'
+ """
+ query_drop_entity = f"""
+ CURRENT VALIDTIME
+ DELETE {self.entity_table_name} WHERE DATASET_ID = '{dataset_id}'
+ """
+ query_drop_catalog = f"""
+ CURRENT VALIDTIME
+ DELETE {self.catalog_table_name} WHERE DATASET_ID = '{dataset_id}'
+ """
+
+ for query in [query_drop_feature, query_drop_entity, query_drop_catalog]:
+ logger.info(query.split('\n')[2].strip())
+ tdml.execute_sql(query)
+
+ def get_dataset_entity(self, dataset_id = None):
+
+ if dataset_id is None:
+ return self.entity
+ else:
+ return self.entity[self.entity.DATASET_ID == dataset_id]
+
+
+ def get_dataset_features(self, dataset_id = None):
+
+ if dataset_id is None:
+ return self.features
+ else:
+ return self.features[self.features.DATASET_ID == dataset_id]
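A short sketch of how the catalog could be used together with the Dataset class above, again with hypothetical schema and view names:

from tdfs4ds.dataset.dataset import Dataset
from tdfs4ds.dataset.dataset_catalog import DatasetCatalog

catalog = DatasetCatalog(schema_name='MY_FS_SCHEMA')       # creates the FS_* tables/views if missing
ds = Dataset(view_name='MY_DATASET_VIEW', schema_name='MY_SCHEMA')

catalog.add_dataset(ds, metadata={'owner': 'data_team'})   # registers (or updates) the dataset
print(catalog.get_dataset_features())                      # features of all registered datasets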
tdfs4ds/feature_store/feature_data_processing.py CHANGED
@@ -124,6 +124,7 @@ def get_feature_id_and_conversion(list_entity_id, feature_names):
  conversion_name2id = {x[1]: x[0] for x in feature_id_names}

  return feature_id_names, conversion_name2id
+
  def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=None, primary_index=None, partitioning = '', entity_null_substitute={}, **kwargs):
  """
  Transforms and prepares a DataFrame for feature ingestion into a feature store by unpivoting it.
tdfs4ds/feature_store/feature_store_management.py CHANGED
@@ -73,6 +73,7 @@ def feature_store_catalog_creation(if_exists='replace', comment='this table is a

  FEATURE_ID BIGINT,
  FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+ FEATURE_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
  FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
  FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
  FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
@@ -410,12 +411,12 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  # Create a DataFrame from the feature_names_types dictionary
  if len(feature_names_types.keys()) > 1:
  df = pd.DataFrame(feature_names_types).transpose().reset_index()
- df.columns = ['FEATURE_NAME', 'TYPE', 'FEATURE_ID']
+ df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
  else:
  df = pd.DataFrame(columns=['FEATURE_NAME', 'TYPE', 'FEATURE_ID'])
  k = list(feature_names_types.keys())[0]
  df['FEATURE_NAME'] = [k]
- df['TYPE'] = [feature_names_types[k]['type']]
+ df['FEATURE_TYPE'] = [feature_names_types[k]['type']]
  df['FEATURE_ID'] = [feature_names_types[k]['id']]


@@ -458,6 +459,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  SELECT
  CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
  , A.FEATURE_NAME
+ , A.FEATURE_TYPE
  , A.FEATURE_TABLE
  , A.FEATURE_DATABASE
  , A.FEATURE_VIEW
@@ -476,6 +478,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  UPDATE
  SET
  FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+ FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
  FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
  FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
  --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
@@ -483,6 +486,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  INSERT
  ( UPDATED_FEATURES.FEATURE_ID
  , UPDATED_FEATURES.FEATURE_NAME
+ , UPDATED_FEATURES.FEATURE_TYPE
  , UPDATED_FEATURES.FEATURE_TABLE
  , UPDATED_FEATURES.FEATURE_DATABASE
  , UPDATED_FEATURES.FEATURE_VIEW
@@ -498,6 +502,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  SELECT
  CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
  , A.FEATURE_NAME
+ , A.FEATURE_TYPE
  , A.FEATURE_TABLE
  , A.FEATURE_DATABASE
  , A.FEATURE_VIEW
@@ -516,6 +521,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  UPDATE
  SET
  FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+ FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
  FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
  FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
  --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
@@ -523,6 +529,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  INSERT
  ( UPDATED_FEATURES.FEATURE_ID
  , UPDATED_FEATURES.FEATURE_NAME
+ , UPDATED_FEATURES.FEATURE_TYPE
  , UPDATED_FEATURES.FEATURE_TABLE
  , UPDATED_FEATURES.FEATURE_DATABASE
  , UPDATED_FEATURES.FEATURE_VIEW
tdfs4ds/utils/info.py CHANGED
@@ -2,6 +2,8 @@ import re

  import tdfs4ds
  import teradataml as tdml
+ from tdfs4ds import logger
+ import numpy as np

  def get_column_types(df, columns):
  """
@@ -264,4 +266,40 @@ def get_feature_types_sql_format(tddf, columns = None):
  res = tdml.DataFrame.from_query(query).to_pandas()

  # Return column names with their corresponding SQL data types in a dictionary
- return {c: res[c].values[0].strip() for c in columns}
+ return {c: res[c].values[0].strip() for c in columns}
+
+ def update_varchar_length(feature_types: dict, new_varchar_length: int) -> dict:
+ """
+ Updates the length of all VARCHAR fields in the feature_types dictionary based on an increment.
+ The new length is calculated as ceil(previous_length / new_varchar_length) * new_varchar_length,
+ ensuring that when new_varchar_length is equal to the current length, no change occurs.
+
+ Args:
+ feature_types (dict): A dictionary where keys are feature names and values are dictionaries with 'type' and 'id'.
+ new_varchar_length (int): The increment value for adjusting VARCHAR lengths.
+
+ Returns:
+ dict: A dictionary with updated VARCHAR lengths.
+
+ Issues a warning if the new length is smaller than the original length.
+ """
+ updated_feature_types = {}
+ varchar_pattern = re.compile(r'VARCHAR\((\d+)\)', re.IGNORECASE)
+
+ for key, value in feature_types.items():
+ type_value = value['type']
+ match = varchar_pattern.search(type_value)
+ if match:
+ original_length = int(match.group(1))
+ modified_length = int(np.ceil(original_length / new_varchar_length) * new_varchar_length)
+
+ if modified_length < original_length:
+ logger.warning(f"Reducing VARCHAR length for {key} from {original_length} to {modified_length}")
+
+ # Replace only the VARCHAR length
+ updated_value = varchar_pattern.sub(f'VARCHAR({modified_length})', type_value)
+ updated_feature_types[key] = {'type': updated_value, 'id': value['id']}
+ else:
+ updated_feature_types[key] = value
+
+ return updated_feature_types
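An illustrative call of the function above, with a made-up feature_types dictionary; only the VARCHAR entry is rewritten, other types pass through unchanged:

from tdfs4ds.utils.info import update_varchar_length

feature_types = {
    'SEGMENT_LABEL': {'type': 'VARCHAR(300) CHARACTER SET LATIN', 'id': 1},
    'AVG_SPEND':     {'type': 'FLOAT', 'id': 2},
}
print(update_varchar_length(feature_types, new_varchar_length=1024))
# {'SEGMENT_LABEL': {'type': 'VARCHAR(1024) CHARACTER SET LATIN', 'id': 1},
#  'AVG_SPEND': {'type': 'FLOAT', 'id': 2}}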
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tdfs4ds
- Version: 0.2.4.3
+ Version: 0.2.4.5
  Summary: A python package to simplify the usage of feature store using Teradata Vantage ...
  Author: Denis Molin
  Requires-Python: >=3.6
@@ -2,18 +2,21 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
  tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
  tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
  tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
- tdfs4ds/__init__.py,sha256=OOakI_WdX1fjXTheqqLMUQY99apaGFXdEYg_SQpWQng,63986
+ tdfs4ds/__init__.py,sha256=uyLZlPaGAVi41BEZke6OnknD0RDRWkcr-7nkCjFym34,65844
  tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
  tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
  tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
  tdfs4ds/process_store.py,sha256=W97pwqOwabo062ow_LfAXZmlSkcq8xTuwhwAX1EStlQ,16939
  tdfs4ds/utils.py,sha256=xF1VP0NCgosXcKymOo_ofMMnvLEF228IxaxIl-f65uA,23312
  tdfs4ds/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
+ tdfs4ds/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tdfs4ds/dataset/dataset.py,sha256=caiQwT-RtdPe5MDtsynWMm1n12OxftgMp7_BR9SCHKw,5360
+ tdfs4ds/dataset/dataset_catalog.py,sha256=qxS2thDW2MvsRouSFaX1M0sX2J7IzBAYD8Yf22Tsd5k,16638
  tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaUGCnI,209
  tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
- tdfs4ds/feature_store/feature_data_processing.py,sha256=SuJeCTJF51l9-VS9WRS0oBUnxaVqba4hqjOpsCtdVs8,42352
+ tdfs4ds/feature_store/feature_data_processing.py,sha256=vCviEJ0ARjaZ2KB8LUAdCyHdErYtWyng6iNiMJy9SQg,42354
  tdfs4ds/feature_store/feature_query_retrieval.py,sha256=zuHRZhL6-qyLpPS7mWgRy1WingSN5iibkbi53Q7jfAs,33834
- tdfs4ds/feature_store/feature_store_management.py,sha256=RIa3ZjKBULTovEmy3KEa0M2Rn5D6LMizDVnx4Q25S6o,55724
+ tdfs4ds/feature_store/feature_store_management.py,sha256=WcgawACgC_lI880wj_FO2wV_FIp0W5WZ3x7r2-0WKdI,56121
  tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
  tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
  tdfs4ds/process_store/process_query_administration.py,sha256=DsIt97cBoJ7NcpQzbQt55eUFNgXGdOMm5Hh2aX5v0PY,7762
@@ -21,12 +24,12 @@ tdfs4ds/process_store/process_registration_management.py,sha256=F8VlBoL-de98KnkM
  tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIqPVbHpuIyyvsaNrek6b1iPk8avJMI,16088
  tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
  tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
- tdfs4ds/utils/info.py,sha256=lc9-rQDfM4NWnZGkSUkY_G0qYx7qnoErNKKcYMuLIRs,10554
+ tdfs4ds/utils/info.py,sha256=SQR_ec4M9-5Z4erjb9_N0n8JPY1wpelgxkw3B12D1Q4,12322
  tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
  tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
  tdfs4ds/utils/time_management.py,sha256=_jbwdyZH4Yr3VzbUrq6X93FpXDCDEdH0iv56vX7j8mA,8446
  tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
- tdfs4ds-0.2.4.3.dist-info/METADATA,sha256=dUqe-90oXLdYx2U6F-WmeQDHhAFN_vvZrFfVuYGmTn8,11944
- tdfs4ds-0.2.4.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- tdfs4ds-0.2.4.3.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
- tdfs4ds-0.2.4.3.dist-info/RECORD,,
+ tdfs4ds-0.2.4.5.dist-info/METADATA,sha256=JwpkKDPO-5TgvnQFZJF8qllaVkUzcbv6dwHJPp2Sd1M,11944
+ tdfs4ds-0.2.4.5.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ tdfs4ds-0.2.4.5.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+ tdfs4ds-0.2.4.5.dist-info/RECORD,,