tdfs4ds-0.2.4.32-py3-none-any.whl → tdfs4ds-0.2.4.33-py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = '0.2.4.32'
1
+ __version__ = '0.2.4.33'
2
2
  import logging
3
3
  # Setup the logger
4
4
  logging.basicConfig(
@@ -7,6 +7,15 @@ logging.basicConfig(
7
7
  datefmt='%Y-%m-%d %H:%M:%S' # Set the date/time format
8
8
  )
9
9
 
10
+ # Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
11
+ def logger_safe(level, message, *args, **kwargs):
12
+ """
13
+ Wrapper around the global `logger` that only emits logs when
14
+ tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
15
+ """
16
+ if getattr(tdfs4ds, "DISPLAY_LOGS", True):
17
+ getattr(logger, level)(message, *args, **kwargs)
18
+
10
19
  logger = logging.getLogger(__name__)
11
20
 
12
21
  from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
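The new `logger_safe` helper is the central behavioural change in this release: `print` calls throughout `__init__.py` are replaced by log calls gated on `tdfs4ds.DISPLAY_LOGS`. Below is a standalone sketch of that gate, not the package code verbatim (the flag is kept as a plain module variable here, and the log format is an assumption; only the `datefmt` appears in the diff):

    import logging

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
    )
    logger = logging.getLogger("tdfs4ds")

    DISPLAY_LOGS = True  # stand-in for the module-level tdfs4ds.DISPLAY_LOGS flag

    def logger_safe(level, message, *args, **kwargs):
        # Emit only when the display flag is truthy; 'level' is "info", "warning", "error", ...
        if DISPLAY_LOGS:
            getattr(logger, level)(message, *args, **kwargs)

    logger_safe("info", "feature catalog created: %s", "FS_FEATURE_CATALOG")  # emitted
    DISPLAY_LOGS = False
    logger_safe("info", "this message is suppressed")                          # skipped

Note that the SCHEMA / DATA_DOMAIN detection block just below still calls `logger.warning` / `logger.info` / `logger.error` directly, so those import-time messages are not gated by `DISPLAY_LOGS`.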
@@ -70,92 +79,80 @@ PROCESS_TYPE = 'RUN PROCESS'
70
79
  try:
71
80
  SCHEMA = tdml.context.context._get_current_databasename()
72
81
  if SCHEMA is None:
73
- print('Please specify the database which is hosting the feature store.')
74
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
82
+ logger.warning("No default database detected for feature store.")
83
+ logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
75
84
  else:
76
- print('The default database is used for the feature store.')
77
- print(f"tdfs4ds.feature_store.schema = '{SCHEMA}'")
85
+ logger.info("Default database detected for feature store: %s", SCHEMA)
86
+ logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
87
+
78
88
  if DATA_DOMAIN is None:
79
89
  DATA_DOMAIN = SCHEMA
80
- print(f"the data domain for the current work is :{DATA_DOMAIN}")
81
- print("Please update it as you wish with tdfs4ds.DATA_DOMAIN=<your data domain>")
90
+ logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
91
+ logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
82
92
 
83
93
  except Exception as e:
84
- print('Please specify the database which is hosting the feature store.')
85
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
94
+ logger.error("Could not determine current database: %s", str(e).split('\n')[0])
95
+ logger.warning("Please specify the feature store database manually:")
96
+ logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
86
97
 
87
98
 
88
99
  def setup(database, if_exists='fail'):
89
100
  """
90
- Set up the database environment by configuring schema names and optionally dropping existing tables.
91
-
92
- This function sets the database schema for feature and process catalogs. If specified, it also handles
93
- the replacement of existing catalog tables. It reports the status of these operations, including any
94
- encountered exceptions.
95
-
96
- Parameters:
97
- database (str): The name of the database schema to be used.
98
- if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
99
- 'fail' (default) - Do nothing if the tables exist.
100
- 'replace' - Drop the tables if they exist before creating new ones.
101
-
102
- Steps performed:
103
- 1. Sets the schema to the provided database name.
104
- 2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
105
- 3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
106
- 4. Prints the names of the newly created tables along with the database name.
107
- 5. Captures and prints the first line of any exceptions that occur during these operations.
108
-
109
- Returns:
110
- None
101
+ Initialize the feature store environment by creating catalog tables and views.
111
102
  """
112
103
 
113
104
  from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
114
105
  from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
115
106
 
116
107
  tdfs4ds.SCHEMA = database
108
+ logger_safe("info", "Setting up feature store in database: %s", database)
109
+
117
110
  if if_exists == 'replace':
118
- try:
119
- tdml.db_drop_table(table_name = tdfs4ds.FEATURE_CATALOG_NAME, schema_name=database)
120
- except Exception as e:
121
- print(str(e).split('\n')[0])
122
- try:
123
- tdml.db_drop_table(table_name = tdfs4ds.PROCESS_CATALOG_NAME, schema_name=database)
124
- except Exception as e:
125
- print(str(e).split('\n')[0])
126
- try:
127
- tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
128
- except Exception as e:
129
- print(str(e).split('\n')[0])
111
+ logger_safe("info", "Replacing existing catalog tables if they exist.")
112
+ for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
113
+ try:
114
+ tdml.db_drop_table(table_name=table, schema_name=database)
115
+ logger_safe("info", "Dropped table %s.%s", database, table)
116
+ except Exception as e:
117
+ logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
130
118
 
131
119
  DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
120
+
132
121
  try:
133
122
  tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
134
- print('feature catalog table: ', tdfs4ds.FEATURE_CATALOG_NAME, ' in database ', database)
123
+ logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
135
124
  except Exception as e:
136
- print(str(e).split('\n')[0])
125
+ logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
137
126
 
138
127
  try:
139
- tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME, tdfs4ds.FILTER_MANAGER_NAME = process_store_catalog_creation()
140
- print('process catalog table: ', tdfs4ds.PROCESS_CATALOG_NAME, ' in database ', database)
141
- print('data distribution table: ', tdfs4ds.DATA_DISTRIBUTION_NAME, ' in database ', database)
142
- print('filter manager table: ', tdfs4ds.FILTER_MANAGER_NAME, ' in database ', database)
128
+ (tdfs4ds.PROCESS_CATALOG_NAME,
129
+ tdfs4ds.DATA_DISTRIBUTION_NAME,
130
+ tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
131
+
132
+ logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
133
+ logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
134
+ logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
143
135
  except Exception as e:
144
- print(str(e).split('\n')[0])
136
+ logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
145
137
 
146
138
  try:
147
139
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
140
+ logger_safe("info", "Follow-up table created successfully.")
148
141
  except Exception as e:
149
- print(str(e).split('\n')[0])
142
+ logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
150
143
 
151
144
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
152
145
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
146
+
153
147
  dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
154
148
  if not dataset_catalog._exists():
155
149
  dataset_catalog.create_catalog()
150
+ logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
156
151
 
152
+ logger_safe("info", "Setup complete.")
157
153
  return
158
154
 
155
+
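For reference, a minimal usage sketch of the rewritten `setup()` (not from the package docs): it assumes an active teradataml connection, and the host, credentials and database name are placeholders.

    import teradataml as tdml
    import tdfs4ds

    # Placeholder connection parameters.
    tdml.create_context(host="<host>", username="<user>", password="<password>")

    # 'replace' drops the feature, process and data-distribution catalog tables
    # (FS_FEATURE_CATALOG, FS_PROCESS_CATALOG, ...) if they already exist,
    # then recreates them and the catalog views.
    tdfs4ds.setup(database="MY_FEATURE_STORE_DB", if_exists="replace")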
159
156
  def connect(
160
157
  database = tdfs4ds.SCHEMA,
161
158
  feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
@@ -166,15 +163,15 @@ def connect(
166
163
  feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
167
164
  process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
168
165
  dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
169
- create_if_missing = False # New argument
166
+ create_if_missing = False
170
167
  ):
171
- if database is not None:
172
- tdfs4ds.SCHEMA = database
173
- else:
168
+ if database is None:
174
169
  raise ValueError("database parameter is None.")
170
+ tdfs4ds.SCHEMA = database
171
+ logger_safe("info", "Connecting to feature store in database: %s", database)
175
172
 
176
173
  tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
177
-
174
+
178
175
  feature_exists = feature_catalog_name.lower() in tables
179
176
  process_exists = process_catalog_name.lower() in tables
180
177
  distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +180,20 @@ def connect(
183
180
 
184
181
  if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
185
182
  if not create_if_missing:
186
- return False # Feature store does not exist
187
- else:
188
- # Create the missing components
189
- if not feature_exists:
190
- tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
191
- if not process_exists:
192
- tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
193
- if not distrib_exists:
194
- tdfs4ds.data_distribution.data_distribution_catalog_creation()
195
- if not filter_manager_exists:
196
- tdfs4ds.filter_manager.filter_manager_catalog_creation()
197
-
198
- # Follow-up table handling
183
+ logger_safe("warning", "Feature store components missing and create_if_missing=False")
184
+ return False
185
+ logger_safe("info", "Missing components detected; creating missing parts...")
186
+ if not feature_exists:
187
+ tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
188
+ if not process_exists:
189
+ tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
190
+ if not distrib_exists:
191
+ tdfs4ds.data_distribution.data_distribution_catalog_creation()
192
+ if not filter_manager_exists:
193
+ tdfs4ds.filter_manager.filter_manager_catalog_creation()
194
+
199
195
  if not followup_name_exists:
196
+ logger_safe("info", "Creating follow-up table: %s", followup_name)
200
197
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
201
198
  tdfs4ds.FOLLOW_UP_NAME = followup_name
202
199
 
@@ -210,30 +207,31 @@ def connect(
210
207
 
211
208
  process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
212
209
  if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
213
- print('ENTITY_NULL_SUBSTITUTE column does not exist in the existing process catalog')
214
- print('upgrade to the latest DDL')
210
+ logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
215
211
  tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
216
212
 
217
213
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
218
214
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
219
215
 
220
- # Dataset catalog setup
216
+ # Dataset Catalog
221
217
  tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
222
- dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
218
+ dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
223
219
  if not dataset_catalog._exists():
224
220
  dataset_catalog.create_catalog()
221
+ logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
225
222
 
226
- # Check if distribution is temporal
223
+ # Detect temporal distribution
227
224
  def is_data_distribution_temporal():
228
225
  return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
229
226
  view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
230
227
  schema_name=tdfs4ds.SCHEMA,
231
228
  object_type='table'
232
229
  )
233
-
230
+
234
231
  tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
235
-
236
- return True # Feature store exists or was created
232
+ logger_safe("info", "Connected to feature store successfully.")
233
+ return True
234
+
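A hedged usage sketch of the reworked `connect()` (the database name is a placeholder): per the new control flow above, it returns False when catalog objects are missing and `create_if_missing=False`, and otherwise creates the missing pieces before returning True.

    import tdfs4ds

    # First try a plain connection; nothing is created in this call.
    if not tdfs4ds.connect(database="MY_FEATURE_STORE_DB", create_if_missing=False):
        # Catalog tables are missing: either let connect() create them ...
        tdfs4ds.connect(database="MY_FEATURE_STORE_DB", create_if_missing=True)
        # ... or run the full setup explicitly:
        # tdfs4ds.setup(database="MY_FEATURE_STORE_DB", if_exists="fail")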
237
235
 
238
236
 
239
237
 
@@ -287,50 +285,22 @@ def get_dataset_entity(dataset_id = None):
287
285
  def get_dataset_features(dataset_id = None):
288
286
  return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
289
287
 
290
- def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
288
+ def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
291
289
  """
292
290
  Executes a specific process from the feature store identified by the process ID.
293
- The function handles different process types and performs appropriate actions.
294
-
295
- Parameters:
296
- - process_id (str): The unique identifier of the process to run.
297
- - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
298
- Default is False.
299
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
300
- Default is False.
301
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
302
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
303
- where k is the smallest integer so that the original lengths is smaller or equal
304
- to k x force_varchar_length. Default is None.
305
-
306
- Returns:
307
- DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
308
-
309
- This function performs the following steps:
310
- 1. Determines the process type and initializes necessary variables.
311
- 2. Constructs and executes a SQL query to retrieve process details by process ID.
312
- 3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
313
- 4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
314
- 5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
315
- 6. Optionally returns the dataset created during the process if return_dataset is True.
316
-
317
- Note:
318
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
319
- data retrieval to feature uploading.
320
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
321
- a Teradata database and the appropriate schema for feature storage.
291
+ Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
322
292
  """
323
293
 
324
294
  if tdfs4ds.PROCESS_TYPE is None:
325
295
  PROCESS_TYPE_ = 'RUN PROCESS'
326
- tdfs4ds.RUN_ID = str(uuid.uuid4())
296
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
327
297
  else:
328
298
  PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
329
299
 
330
- if tdfs4ds.DEBUG_MODE:
331
- print('def run','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
300
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
301
+ logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
332
302
 
333
- if tdfs4ds.FEATURE_STORE_TIME == None:
303
+ if tdfs4ds.FEATURE_STORE_TIME is None:
334
304
  validtime_statement = 'CURRENT VALIDTIME'
335
305
  else:
336
306
  validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
@@ -342,148 +312,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
342
312
  WHERE A.PROCESS_ID = '{process_id}'
343
313
  """
344
314
 
315
+ logger_safe(
316
+ "info",
317
+ "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
318
+ tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
319
+ )
320
+
345
321
  # Executing the query and converting the result to Pandas DataFrame
346
322
  df = tdml.DataFrame.from_query(query).to_pandas()
347
323
 
348
- # Check if exactly one record is returned, else print an error
324
+ # Check if exactly one record is returned, else log an error and return
349
325
  if df.shape[0] != 1:
350
- print('error - there is ', df.shape[0], f' records. Check table {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}')
351
- print('check ou this query:')
352
- print(query)
326
+ logger_safe(
327
+ "error",
328
+ "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
329
+ df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
330
+ )
353
331
  return
354
332
 
355
-
356
333
  # Fetching the filter manager
357
334
  filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
358
335
  if filter_schema_name is None:
359
336
  filtermanager = None
360
337
  else:
361
338
  filter_view_name = df['FILTER_VIEW_NAME'].values[0]
362
- filter_table_name = df['FILTER_TABLE_NAME'].values[0]
339
+ filter_table_name = df['FILTER_TABLE_NAME'].values[0] # kept for parity; not used directly here
363
340
  filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
364
341
 
365
- # Fetching the process type from the query result
366
- process_type = df['PROCESS_TYPE'].values[0]
367
-
368
- # Fetching the primary index from the query result
369
- primary_index = df['FOR_PRIMARY_INDEX'].values[0]
342
+ # Fetching process metadata
343
+ process_type = df['PROCESS_TYPE'].values[0]
344
+ primary_index = df['FOR_PRIMARY_INDEX'].values[0]
370
345
  if primary_index is not None:
371
- primary_index = primary_index.split(',')
372
-
373
- # Fetching the primary index from the query result
374
- partitioning = df['FOR_DATA_PARTITIONING'].values[0]
375
-
376
- # Fetching the data domain from the query result
377
- DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
346
+ primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
347
+ partitioning = df['FOR_DATA_PARTITIONING'].values[0]
348
+ DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
349
+
350
+ logger_safe(
351
+ "info",
352
+ "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
353
+ process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
354
+ )
378
355
 
379
356
  # Handling 'denormalized view' process type
380
357
  if process_type == 'denormalized view':
381
- # Extracting necessary details for this process type
382
- view_name = df['VIEW_NAME'].values[0]
383
- entity_id = df['ENTITY_ID'].values[0].split(',')
358
+ view_name = df['VIEW_NAME'].values[0]
359
+ entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
384
360
  entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
385
- feature_names = df['FEATURE_NAMES'].values[0].split(',')
361
+ feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
386
362
 
387
- # Fetching data and uploading features to the feature store
388
363
  df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
389
364
 
390
- if tdfs4ds.DEBUG_MODE:
391
- print('run','entity_id',entity_id)
392
- print('run', 'entity_null_substitute', entity_null_substitute)
393
- print('run','feature_names',feature_names)
394
- print('run','process_id',process_id)
395
- print('run','primary_index',primary_index)
396
- print('run','partitioning',partitioning)
365
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
366
+ logger_safe("debug", "run | entity_id=%s", entity_id)
367
+ logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
368
+ logger_safe("debug", "run | feature_names=%s", feature_names)
369
+ logger_safe("debug", "run | process_id=%s", process_id)
370
+ logger_safe("debug", "run | primary_index=%s", primary_index)
371
+ logger_safe("debug", "run | partitioning=%s", partitioning)
372
+
397
373
  dataset = _upload_features(
398
374
  df_data,
399
375
  entity_id,
400
376
  feature_names,
401
- feature_versions = process_id,
402
- primary_index = primary_index,
403
- partitioning = partitioning,
404
- filtermanager = filtermanager,
405
- entity_null_substitute = entity_null_substitute,
406
- process_id = process_id,
407
- force_compute= force_compute,
408
- force_varchar_length = force_varchar_length
377
+ feature_versions=process_id,
378
+ primary_index=primary_index,
379
+ partitioning=partitioning,
380
+ filtermanager=filtermanager,
381
+ entity_null_substitute=entity_null_substitute,
382
+ process_id=process_id,
383
+ force_compute=force_compute,
384
+ force_varchar_length=force_varchar_length
409
385
  )
410
386
 
411
387
  # Handling 'tdstone2 view' process type
412
388
  elif process_type == 'tdstone2 view':
413
- print('not implemented yet')
414
-
389
+ logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
390
+ dataset = None
415
391
 
392
+ else:
393
+ logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
394
+ dataset = None
416
395
 
417
396
  if return_dataset:
397
+ logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
418
398
  return dataset
419
399
  else:
400
+ logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
420
401
  return
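Since the long `run()` docstring was dropped in this version, a short usage reminder (the process id is a placeholder; real ids come from the process catalog view):

    import tdfs4ds

    # Re-execute a registered process and get back the dataset it builds.
    dataset = tdfs4ds.run(
        process_id="<process-id-from-the-process-catalog>",  # placeholder
        return_dataset=True,
        force_compute=False,          # skip work already recorded as COMPLETED at this valid time
        force_varchar_length=None,    # or an int to round VARCHAR lengths up
    )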
421
402
 
422
- def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
423
- """
424
- Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
425
- process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
426
- for further use or inspection.
427
-
428
- The function supports dynamic entity ID interpretation and flexible feature name handling, ensuring compatibility
429
- with various data schemas. It automatically registers the data upload process and applies additional metadata,
430
- if provided.
431
-
432
- Parameters:
433
- - df (DataFrame): The DataFrame containing the feature data to be uploaded.
434
- - entity_id (dict, list, or str): The identifier of the entity to which the features belong. This can be:
435
- - a dictionary mapping column names to their data types,
436
- - a list of column names, which will be automatically converted to a dictionary with types inferred from `df`,
437
- - a string representing a single column name, which will be converted into a list and then to a dictionary as above.
438
- - feature_names (list or str): The names of the features to be uploaded. If a string is provided, it will be
439
- split into a list based on commas or treated as a single feature name.
440
- - metadata (dict, optional): Additional metadata to associate with the upload process. Defaults to an empty dictionary.
441
- - primary_index (list, optional): Specifies the primary index columns for optimizing data storage and retrieval.
442
- - partitioning (str, optional): Defines how the data should be partitioned in the store for performance optimization.
443
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
444
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
445
- Default is an empty dictionary.
446
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
447
- Default is True.
448
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
449
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
450
- where k is the smallest integer so that the original lengths is smaller or equal
451
- to k x force_varchar_length. Default is 1024.
452
- Returns:
453
- DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
454
- or further processing.
455
403
 
456
- The process involves several steps, including entity ID type conversion if necessary, feature name normalization,
457
- process registration in the feature store, and the execution of SQL queries to insert the data. The function concludes
458
- by returning a dataset derived from the uploaded data, offering immediate access to the newly stored information.
459
-
460
- Example:
461
- >>> df = tdml.DataFrame(...)
462
- >>> entity_id = ['customer_id']
463
- >>> feature_names = ['age', 'income']
464
- >>> dataset = upload_features(df, entity_id, feature_names)
465
- >>> # Another example with list-based entity_id, custom primary_index, and partitioning
466
- >>> tddf = tdml.DataFrame(...) # Assuming tddf is predefined with appropriate columns
467
- >>> entity_id = ['tx_type', 'txn_id']
468
- >>> primary_index = ['txn_id']
469
- >>> partitioning = '''
470
- ... PARTITION BY CASE_N (
471
- ... tx_type LIKE 'DEBIT',
472
- ... tx_type LIKE 'PAYMENT',
473
- ... tx_type LIKE 'CASH_OUT',
474
- ... tx_type LIKE 'CASH_IN',
475
- ... tx_type LIKE 'TRANSFER',
476
- ... NO CASE,
477
- ... UNKNOWN)'''
478
- >>> features = [x for x in tddf.columns if x not in entity_id]
479
- >>> dataset = upload_features(
480
- ... df = tddf,
481
- ... entity_id = entity_id,
482
- ... feature_names = features,
483
- ... metadata = {'project': 'test'},
484
- ... primary_index = primary_index,
485
- ... partitioning = partitioning
486
- ... )
404
+ def upload_features(
405
+ df,
406
+ entity_id,
407
+ feature_names,
408
+ metadata={},
409
+ primary_index=None,
410
+ partitioning='',
411
+ filtermanager=None,
412
+ entity_null_substitute={},
413
+ force_compute=True,
414
+ force_varchar_length=1024
415
+ ):
416
+ """
417
+ Uploads feature data from a DataFrame to the feature store for a specified entity.
418
+ All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
487
419
  """
488
420
 
489
421
  from tdfs4ds.utils.info import get_column_types
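The 0.2.4.33 docstring drops the worked examples that 0.2.4.32 carried. They are restated below, adapted from the removed docstring (`my_transactions`, `tddf` and its columns are illustrative), together with a note on how `force_varchar_length` rounds VARCHAR sizes:

    import teradataml as tdml
    import tdfs4ds

    tddf = tdml.DataFrame("my_transactions")          # illustrative source table
    entity_id = ['tx_type', 'txn_id']
    features = [c for c in tddf.columns if c not in entity_id]

    partitioning = '''
    PARTITION BY CASE_N (
        tx_type LIKE 'DEBIT',
        tx_type LIKE 'PAYMENT',
        tx_type LIKE 'CASH_OUT',
        tx_type LIKE 'CASH_IN',
        tx_type LIKE 'TRANSFER',
        NO CASE,
        UNKNOWN)'''

    dataset = tdfs4ds.upload_features(
        df=tddf,
        entity_id=entity_id,
        feature_names=features,
        metadata={'project': 'test'},
        primary_index=['txn_id'],
        partitioning=partitioning,
        force_varchar_length=1024,   # e.g. a VARCHAR(1500) feature is stored as VARCHAR(2048):
                                     # k = ceil(1500 / 1024) = 2, so 2 * 1024 = 2048
    )

With the new control flow, `dataset` is only returned when `tdfs4ds.BUILD_DATASET_AT_UPLOAD` is enabled; otherwise `upload_features` runs the registered process and returns None.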
@@ -491,45 +423,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
491
423
  from tdfs4ds.process_store.process_registration_management import register_process_view
492
424
 
493
425
  # Convert entity_id to a dictionary if it's not already one
494
- if type(entity_id) == list:
426
+ if isinstance(entity_id, list):
495
427
  entity_id.sort()
496
428
  entity_id = get_column_types(df, entity_id)
497
- if tdfs4ds.DISPLAY_LOGS:
498
- print('entity_id has been converted to a proper dictionary : ', entity_id)
499
- elif type(entity_id) == str:
429
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
430
+
431
+ elif isinstance(entity_id, str):
500
432
  entity_id = [entity_id]
501
433
  entity_id = get_column_types(df, entity_id)
502
- if tdfs4ds.DISPLAY_LOGS:
503
- print('entity_id has been converted to a proper dictionary : ', entity_id)
504
-
505
- if type(feature_names) != list:
506
- if tdfs4ds.DISPLAY_LOGS:
507
- print('feature_names is not a list:', feature_names)
508
- if ',' in feature_names:
509
- feature_names = feature_names.split(',')
434
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
435
+
436
+ # Normalize feature_names
437
+ if not isinstance(feature_names, list):
438
+ logger_safe("debug", "feature_names is not a list: %s", feature_names)
439
+ if isinstance(feature_names, str) and ',' in feature_names:
440
+ feature_names = [x.strip() for x in feature_names.split(',')]
510
441
  else:
511
442
  feature_names = [feature_names]
512
- if tdfs4ds.DISPLAY_LOGS:
513
- print('it has been converted to : ', feature_names)
514
- print('check it is a expected.')
515
-
516
- if primary_index is not None and type(primary_index) != list:
517
- if tdfs4ds.DISPLAY_LOGS:
518
- print('primary_index is not a list:', primary_index)
519
- if ',' in primary_index:
520
- primary_index = primary_index.split(',')
443
+ logger_safe("debug", "feature_names converted to list: %s", feature_names)
444
+ logger_safe("debug", "Check the conversion is as expected.")
445
+
446
+ # Normalize primary_index
447
+ if primary_index is not None and not isinstance(primary_index, list):
448
+ logger_safe("debug", "primary_index is not a list: %s", primary_index)
449
+ if isinstance(primary_index, str) and ',' in primary_index:
450
+ primary_index = [x.strip() for x in primary_index.split(',')]
521
451
  else:
522
452
  primary_index = [primary_index]
523
- if tdfs4ds.DISPLAY_LOGS:
524
- print('it has been converted to : ', feature_names)
525
- print('check it is a expected.')
453
+ logger_safe("debug", "primary_index converted to list: %s", primary_index)
454
+ logger_safe("debug", "Check the conversion is as expected.")
526
455
 
456
+ # Partitioning
527
457
  partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
528
458
 
529
- if tdfs4ds.DISPLAY_LOGS:
530
- print("filtermanager", filtermanager)
459
+ logger_safe("debug", "filtermanager: %s", filtermanager)
531
460
 
532
- # Register the process and retrieve the SQL query to insert the features, and the process ID
461
+ # Register process -> get SQL(s) + process_id
533
462
  query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
534
463
  view_name = df,
535
464
  entity_id = entity_id,
@@ -542,104 +471,96 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
542
471
  entity_null_substitute = entity_null_substitute
543
472
  )
544
473
 
545
- # Execute the SQL query to insert the features into the database
546
- execute_query(query_insert)
547
- execute_query(query_insert_dist)
548
- if tdfs4ds.DEBUG_MODE:
549
- print("query_insert_filtermanager",query_insert_filtermanager)
550
- if query_insert_filtermanager is not None:
551
- execute_query(query_insert_filtermanager)
552
-
553
- # Run the registered process and return the resulting dataset
554
- PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
555
- tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
556
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD: tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
557
- tdfs4ds.RUN_ID = str(uuid.uuid4())
474
+ logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
558
475
 
559
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
476
+ # Execute queries
477
+ try:
478
+ execute_query(query_insert)
479
+ logger_safe("info", "Executed main insert query for process_id=%s", process_id)
480
+ except Exception as e:
481
+ logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
482
+ raise
560
483
 
561
- try:
484
+ try:
485
+ execute_query(query_insert_dist)
486
+ logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
487
+ except Exception as e:
488
+ logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
489
+ raise
562
490
 
563
- dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)
491
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
492
+ # Avoid dumping entire SQL in normal logs; keep it debug-only.
493
+ logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
564
494
 
495
+ if query_insert_filtermanager is not None:
496
+ try:
497
+ execute_query(query_insert_filtermanager)
498
+ logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
565
499
  except Exception as e:
566
- tdfs4ds.process_store.process_followup.followup_close(
567
- run_id = tdfs4ds.RUN_ID,
568
- process_type = tdfs4ds.PROCESS_TYPE,
569
- process_id = process_id,
570
- status = 'FAILED,' + str(e).split('\n')[0]
571
- )
500
+ logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
572
501
  raise
573
502
 
503
+ # Run the registered process (with/without dataset)
504
+ PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
505
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
506
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
507
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
508
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
574
509
 
575
- return dataset
576
- else:
510
+ logger_safe(
511
+ "info",
512
+ "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
513
+ tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
514
+ )
577
515
 
516
+ try:
517
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
518
+ dataset = run(
519
+ process_id=process_id,
520
+ return_dataset=True,
521
+ force_compute=force_compute,
522
+ force_varchar_length=force_varchar_length
523
+ )
524
+ logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
525
+ return dataset
526
+ else:
527
+ run(
528
+ process_id=process_id,
529
+ return_dataset=False,
530
+ force_compute=force_compute,
531
+ force_varchar_length=force_varchar_length
532
+ )
533
+ logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
534
+ return
535
+
536
+ except Exception as e:
537
+ # Keep your existing follow-up close behavior, but ensure the error is logged.
578
538
  try:
579
- run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
580
- except Exception as e:
581
539
  tdfs4ds.process_store.process_followup.followup_close(
582
540
  run_id = tdfs4ds.RUN_ID,
583
541
  process_type = tdfs4ds.PROCESS_TYPE,
584
542
  process_id = process_id,
585
543
  status = 'FAILED,' + str(e).split('\n')[0]
586
544
  )
587
- raise
588
- return
589
-
590
- tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
591
-
592
- def _upload_features(df, entity_id, feature_names,
593
- feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
594
- """
595
- Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
596
- feature registration, preparation for ingestion, and storage in the designated feature tables.
597
-
598
- Parameters:
599
- - df (DataFrame): The input DataFrame containing the feature data.
600
- - entity_id (str or dict): The identifier for the entity to which these features belong. This can be a single ID
601
- (str) or a dictionary of attribute names and values uniquely identifying the entity.
602
- - feature_names (list): A list of strings specifying the names of the features to be uploaded.
603
- - feature_versions (str or list, optional): Specifies the versions of the features to be uploaded. Can be a single
604
- string applied to all features or a list of strings specifying the version
605
- for each feature respectively. Default is 'dev.0.0'.
606
- - primary_index (list, optional): Specifies the columns to be used as the primary index in the feature store tables.
607
- This can significantly impact the performance of data retrieval operations.
608
- - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
609
- enhance query performance based on the access patterns.
610
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
611
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
612
- Default is an empty dictionary.
613
- - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
614
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
615
- Default is False.
616
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
617
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
618
- where k is the smallest integer so that the original lengths is smaller or equal
619
- to k x force_varchar_length. Default is None.
545
+ finally:
546
+ logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
547
+ tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
548
+ )
549
+ raise
550
+ finally:
551
+ # Restore previous process type just in case the caller relies on it.
552
+ tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
620
553
 
621
554
 
622
- Returns:
623
- DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
624
- metadata, including versions and storage locations.
625
-
626
- This function orchestrates several steps involved in feature storage:
627
- 1. Registers the entity in the feature store if not already present.
628
- 2. Determines the data types of the features based on the input DataFrame.
629
- 3. Registers the features, including their names, types, and versions, in the feature catalog.
630
- 4. Prepares the feature data for ingestion, including any necessary transformations.
631
- 5. Stores the prepared feature data in the feature store.
632
- 6. Optionally, cleans up temporary resources used during the process.
633
- 7. Builds and returns a view of the dataset representing the uploaded features for easy access.
634
555
 
635
- Note:
636
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
637
- entity and feature registration to data preparation and storage.
638
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
639
- a Teradata database and the appropriate schema for feature storage.
640
- - The function assumes that the feature_versions, if provided as a list, matches the length of feature_names.
641
- """
642
-
556
+ def _upload_features(
557
+ df, entity_id, feature_names,
558
+ feature_versions=FEATURE_VERSION_DEFAULT,
559
+ primary_index=None, partitioning='',
560
+ filtermanager=None, entity_null_substitute={},
561
+ process_id=None, force_compute=False,
562
+ force_varchar_length=None
563
+ ):
643
564
  from tdfs4ds.feature_store.entity_management import register_entity
644
565
  from tdfs4ds.feature_store.feature_store_management import Gettdtypes
645
566
  from tdfs4ds.feature_store.feature_store_management import register_features
@@ -647,235 +568,149 @@ def _upload_features(df, entity_id, feature_names,
647
568
  from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
648
569
  from tdfs4ds.utils.info import get_column_types, update_varchar_length
649
570
 
650
- # Convert entity_id to a dictionary if it's not already one
651
- if type(entity_id) == list:
571
+ # Convert entity_id to a dictionary if not already
572
+ if isinstance(entity_id, list):
652
573
  entity_id.sort()
653
574
  entity_id = get_column_types(df, entity_id)
654
- if tdfs4ds.DISPLAY_LOGS:
655
- print('entity_id has been converted to a proper dictionary : ', entity_id)
656
- elif type(entity_id) == str:
657
- entity_id = [entity_id]
658
- entity_id = get_column_types(df, entity_id)
659
- if tdfs4ds.DISPLAY_LOGS:
660
- print('entity_id has been converted to a proper dictionary : ', entity_id)
661
-
662
- #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
663
-
664
- # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
665
- # If feature_versions is a string, create a dictionary mapping each feature name to this string.
666
- if type(feature_versions) == list:
667
- selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
575
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
576
+ elif isinstance(entity_id, str):
577
+ entity_id = get_column_types(df, [entity_id])
578
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
579
+
580
+ # Map feature versions
581
+ if isinstance(feature_versions, list):
582
+ selected_features = dict(zip(feature_names, feature_versions))
668
583
  else:
669
584
  selected_features = {k: feature_versions for k in feature_names}
670
585
 
671
- # Get the Teradata types of the features in df.
672
- feature_names_types = Gettdtypes(
673
- df,
674
- features_columns=feature_names,
675
- entity_id=entity_id
676
- )
586
+ # Get Teradata types for features
587
+ feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
677
588
 
678
589
  if force_varchar_length is not None:
679
- print(feature_names_types)
680
- feature_names_types = update_varchar_length(feature_names_types,new_varchar_length = force_varchar_length)
590
+ logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
591
+ feature_names_types = update_varchar_length(
592
+ feature_names_types,
593
+ new_varchar_length=force_varchar_length
594
+ )
681
595
 
682
596
  def validate_feature_types(feature_names_types):
683
- """
684
- Validates feature data types and raises an error if any value contains
685
- the substrings 'clob', 'blob', or 'json' (case insensitive).
686
-
687
- Parameters:
688
- feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
689
-
690
- Raises:
691
- ValueError: If any feature type contains 'clob', 'blob', or 'json'.
692
- """
693
- invalid_types = {key: value['type'] for key, value in feature_names_types.items()
694
- if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
695
-
696
- if invalid_types:
597
+ invalid = {
598
+ k: v['type'] for k, v in feature_names_types.items()
599
+ if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
600
+ }
601
+ if invalid:
697
602
  raise ValueError(
698
- f"The following features have unsupported data types: {invalid_types}. "
699
- "The data types 'CLOB', 'BLOB', and 'JSON' are not yet managed by the feature store."
603
+ f"Unsupported data types found: {invalid}. "
604
+ "CLOB/BLOB/JSON are not supported."
700
605
  )
701
-
702
- validate_feature_types(feature_names_types)
703
-
606
+
607
+ validate_feature_types(feature_names_types)
608
+
609
+ logger_safe("info", "Registering entity %s in feature store", entity_id)
704
610
  register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
705
611
 
706
- if tdfs4ds.DEBUG_MODE:
707
- print('_upload_features', 'entity_id', entity_id)
708
- print('_upload_features', 'entity_null_substitute', entity_null_substitute)
709
- print('_upload_features', 'feature_names', feature_names)
710
- print('_upload_features', 'primary_index', primary_index)
711
- print('_upload_features', 'partitioning', partitioning)
712
- print('_upload_features', 'selected_features', selected_features)
713
- print('_upload_features', 'df.columns', df.columns)
714
-
715
- # Register the features in the feature catalog.
716
- register_features(
717
- entity_id,
718
- feature_names_types,
719
- primary_index,
720
- partitioning
721
- )
722
-
723
- if tdfs4ds.DEBUG_MODE:
724
- print("---------_upload_features")
725
- print("filtermanager : ", filtermanager)
726
- print("feature names : ", feature_names)
727
- print("selected features : ", selected_features)
728
-
729
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
612
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
613
+ logger_safe(
614
+ "debug",
615
+ "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
616
+ entity_id, entity_null_substitute, feature_names, primary_index, partitioning
617
+ )
618
+ logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
619
+
620
+ register_features(entity_id, feature_names_types, primary_index, partitioning)
621
+ logger_safe("info", "Features registered in catalog: %s", feature_names)
622
+
623
+ follow_up = None
624
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
730
625
  follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
731
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
732
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
733
- if filtermanager is None:
734
- do_compute = True
735
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
736
- if follow_up.shape[0] > 0:
737
- do_compute = False
626
+ follow_up = follow_up[
627
+ (follow_up.STATUS == 'COMPLETED') &
628
+ (follow_up.VALIDTIME_DATE.isna() == False) &
629
+ (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
630
+ (follow_up.PROCESS_ID == process_id)
631
+ ]
738
632
 
739
- # Prepare the features for ingestion.
633
+ if filtermanager is None:
634
+ do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
740
635
  if do_compute or force_compute:
741
-
636
+ logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
742
637
  tdfs4ds.process_store.process_followup.followup_open(
743
- run_id = tdfs4ds.RUN_ID,
744
- process_type = tdfs4ds.PROCESS_TYPE,
745
- process_id = process_id
638
+ run_id=tdfs4ds.RUN_ID,
639
+ process_type=tdfs4ds.PROCESS_TYPE,
640
+ process_id=process_id
746
641
  )
747
-
748
642
  try:
749
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
750
- df,
751
- entity_id,
752
- feature_names,
643
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
644
+ df, entity_id, feature_names,
753
645
  feature_versions=selected_features,
754
646
  primary_index=primary_index,
755
647
  entity_null_substitute=entity_null_substitute,
756
648
  partitioning=partitioning
757
649
  )
758
- # Store the prepared features in the feature store.
759
- store_feature(
760
- entity_id,
761
- volatile_table_name,
762
- entity_null_substitute=entity_null_substitute,
763
- primary_index=primary_index,
764
- partitioning=partitioning,
765
- features_infos = features_infos
766
- )
767
-
768
- # Collect statistics
769
- apply_collect_stats(
770
- entity_id,
771
- primary_index = primary_index,
772
- partitioning = partitioning,
773
- feature_infos = features_infos
774
- )
650
+ store_feature(entity_id, volatile_table, entity_null_substitute,
651
+ primary_index, partitioning, features_infos)
652
+ apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
775
653
 
776
654
  tdfs4ds.process_store.process_followup.followup_close(
777
- run_id = tdfs4ds.RUN_ID,
778
- process_type = tdfs4ds.PROCESS_TYPE,
779
- process_id = process_id
655
+ run_id=tdfs4ds.RUN_ID,
656
+ process_type=tdfs4ds.PROCESS_TYPE,
657
+ process_id=process_id
780
658
  )
659
+ logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
781
660
 
782
661
  except Exception as e:
662
+ logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
783
663
  tdfs4ds.process_store.process_followup.followup_close(
784
- run_id = tdfs4ds.RUN_ID,
785
- process_type = tdfs4ds.PROCESS_TYPE,
786
- process_id = process_id,
787
- status = 'FAILED,' + str(e).split('\n')[0]
664
+ run_id=tdfs4ds.RUN_ID,
665
+ process_type=tdfs4ds.PROCESS_TYPE,
666
+ process_id=process_id,
667
+ status='FAILED,' + str(e).split('\n')[0]
788
668
  )
789
669
  raise
790
- else:
791
- # get the total number of filter condition in the filter manager
792
- nb_filters = filtermanager.nb_filters
793
670
 
794
- # the flag that indicates that we computed something in the next loop
671
+ else:
672
+ logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
795
673
  something_computed = False
674
+ for i in range(filtermanager.nb_filters):
675
+ filtermanager.update(i + 1)
676
+ logger_safe("debug", "Applying filter %s/%s:\n%s",
677
+ i + 1, filtermanager.nb_filters, filtermanager.display())
796
678
 
797
- for i in range(nb_filters):
798
-
799
- # place the cursor on the next filter
800
- filtermanager.update(i+1)
801
-
802
- if filtermanager.time_filtering:
803
- # if the filter manager is hybrid, then synchronize the time with tdfs4ds
804
- tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
805
-
806
- # overwrite the follow up table to tilter on the VALIDTIME_DATE too
807
- follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
808
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
809
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
810
-
811
- # initialize do_compute, the flag that something has to be computed
812
679
  do_compute = True
813
-
814
- # if the process_id is defined and if we are working at a specific time:
815
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
816
- # we check if the filter condition has already been computed
817
- follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
818
- tdml.DataFrame.from_query(
819
- f"""
820
- SELECT
821
- CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
822
- FROM {filtermanager.schema_name}.{filtermanager.view_name}
823
- """
824
- ),
825
- on = 'APPLIED_FILTER',
826
- how = 'inner',
827
- lprefix = 'l',
828
- rprefix = 'r'
829
- )
830
- # if already computed and completed, then do_compute is set to False
831
- if follow_up_.shape[0] > 0:
680
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
681
+ # see if already computed
682
+ follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
683
+ if follow_up.shape[0] > 0:
832
684
  do_compute = False
833
685
 
834
- if tdfs4ds.DISPLAY_LOGS:
835
- print(filtermanager.display())
836
-
837
686
  if do_compute or force_compute:
838
687
  tdfs4ds.process_store.process_followup.followup_open(
839
- run_id = tdfs4ds.RUN_ID,
840
- process_type = tdfs4ds.PROCESS_TYPE,
841
- process_id = process_id,
842
- filtermanager = filtermanager
688
+ run_id=tdfs4ds.RUN_ID,
689
+ process_type=tdfs4ds.PROCESS_TYPE,
690
+ process_id=process_id,
691
+ filtermanager=filtermanager
843
692
  )
844
693
  try:
845
- # Prepare the features for ingestion.
846
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
847
- df,
848
- entity_id,
849
- feature_names,
850
- feature_versions = selected_features,
851
- primary_index = primary_index,
852
- entity_null_substitute = entity_null_substitute,
853
- partitioning = partitioning
854
- )
855
-
856
- # Store the prepared features in the feature store.
857
- store_feature(
858
- entity_id,
859
- volatile_table_name,
694
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
695
+ df, entity_id, feature_names,
696
+ feature_versions=selected_features,
697
+ primary_index=primary_index,
860
698
  entity_null_substitute=entity_null_substitute,
861
- primary_index = primary_index,
862
- partitioning = partitioning,
863
- features_infos=features_infos
864
-
699
+ partitioning=partitioning
865
700
  )
866
-
867
- # indicate that something has been processed:
701
+ store_feature(entity_id, volatile_table, entity_null_substitute,
702
+ primary_index, partitioning, features_infos)
868
703
  something_computed = True
869
704
 
870
705
  tdfs4ds.process_store.process_followup.followup_close(
871
706
  run_id=tdfs4ds.RUN_ID,
872
707
  process_type=tdfs4ds.PROCESS_TYPE,
873
708
  process_id=process_id,
874
- filtermanager = filtermanager
709
+ filtermanager=filtermanager
875
710
  )
876
711
 
877
712
  except Exception as e:
878
- print(e)
713
+ logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
879
714
  tdfs4ds.process_store.process_followup.followup_close(
880
715
  run_id=tdfs4ds.RUN_ID,
881
716
  process_type=tdfs4ds.PROCESS_TYPE,
@@ -884,41 +719,28 @@ def _upload_features(df, entity_id, feature_names,
884
719
  filtermanager=filtermanager
885
720
  )
886
721
  raise
887
- # Clean up by dropping the temporary volatile table.
888
- # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
889
722
 
890
- # Collect statistics only if something has been computed
891
723
  if something_computed:
892
- apply_collect_stats(
893
- entity_id,
894
- primary_index = primary_index,
895
- partitioning = partitioning,
896
- feature_infos = features_infos
897
- )
724
+ apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
898
725
 
899
- # Build a dataset view in the feature store.
900
726
  if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
901
- if tdfs4ds.DISPLAY_LOGS: print('build dataset for validation')
727
+ logger_safe("info", "Building dataset for validation...")
902
728
  try:
903
- dataset = build_dataset(
904
- entity_id,
905
- selected_features,
729
+ return build_dataset(
730
+ entity_id, selected_features,
906
731
  view_name=None,
907
- entity_null_substitute = entity_null_substitute
732
+ entity_null_substitute=entity_null_substitute
908
733
  )
909
734
  except Exception as e:
910
- print('ERROR at build_dataset in _upload_features:')
911
- print(str(e).split('\n')[0])
912
- print('entity :', entity_id)
913
- print('selected features :', selected_features)
914
-
915
- # Return the dataset view.
916
- return dataset
735
+ logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
736
+ logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
917
737
  else:
918
- if tdfs4ds.DISPLAY_LOGS: print('no dataset built for validation. Set tdfs4ds.BUILD_DATASET_AT_UPLOAD to True if you want it')
738
+ logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
919
739
  return
920
740
 
921
741
 
742
+
743
+
922
744
  def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
923
745
  feature_store_time=False, join_type='INNER'):
924
746
  """