tdfs4ds 0.2.4.32-py3-none-any.whl → 0.2.4.34-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
- __version__ = '0.2.4.32'
1
+ __version__ = '0.2.4.34'
2
2
  import logging
3
+
3
4
  # Setup the logger
4
5
  logging.basicConfig(
5
6
  level=logging.INFO,
@@ -7,6 +8,15 @@ logging.basicConfig(
7
8
  datefmt='%Y-%m-%d %H:%M:%S' # Set the date/time format
8
9
  )
9
10
 
11
+ # Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
12
+ def logger_safe(level, message, *args, **kwargs):
13
+ """
14
+ Wrapper around the global `logger` that only emits logs when
15
+ tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
16
+ """
17
+ if getattr(tdfs4ds, "DISPLAY_LOGS", True):
18
+ getattr(logger, level)(message, *args, **kwargs)
19
+
10
20
  logger = logging.getLogger(__name__)
11
21
 
12
22
  from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
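Reviewer note: the new logger_safe helper centralizes the gate that was previously expressed as scattered "if tdfs4ds.DISPLAY_LOGS: print(...)" checks. A minimal sketch of the intended behaviour, assuming only the module-level names introduced above (no database connection is needed just to log):

    import tdfs4ds

    tdfs4ds.DISPLAY_LOGS = True
    tdfs4ds.logger_safe("info", "feature store schema is %s", "MY_FEATURE_DB")  # emitted via the module logger

    tdfs4ds.DISPLAY_LOGS = False
    tdfs4ds.logger_safe("info", "this message is suppressed")  # silently skipped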
@@ -57,7 +67,7 @@ import tdfs4ds.datasets
57
67
  import time
58
68
 
59
69
  import inspect
60
- import tqdm
70
+ from tqdm.auto import tqdm # auto picks the right frontend (notebook/terminal)
61
71
 
62
72
  from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
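Reviewer note: switching from "import tqdm" to "from tqdm.auto import tqdm" lets tqdm pick the frontend automatically, the widget bar in notebooks (when ipywidgets is available) and the plain text bar in terminals. A short illustration of the call pattern now used throughout the module:

    from tqdm.auto import tqdm  # notebook/terminal frontend selected automatically

    for _ in tqdm(range(3), desc="example", unit="step"):
        pass  # real work goes here; tqdm only wraps the iterable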
63
73
 
@@ -70,92 +80,80 @@ PROCESS_TYPE = 'RUN PROCESS'
70
80
  try:
71
81
  SCHEMA = tdml.context.context._get_current_databasename()
72
82
  if SCHEMA is None:
73
- print('Please specify the database which is hosting the feature store.')
74
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
83
+ logger.warning("No default database detected for feature store.")
84
+ logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
75
85
  else:
76
- print('The default database is used for the feature store.')
77
- print(f"tdfs4ds.feature_store.schema = '{SCHEMA}'")
86
+ logger.info("Default database detected for feature store: %s", SCHEMA)
87
+ logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
88
+
78
89
  if DATA_DOMAIN is None:
79
90
  DATA_DOMAIN = SCHEMA
80
- print(f"the data domain for the current work is :{DATA_DOMAIN}")
81
- print("Please update it as you wish with tdfs4ds.DATA_DOMAIN=<your data domain>")
91
+ logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
92
+ logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
82
93
 
83
94
  except Exception as e:
84
- print('Please specify the database which is hosting the feature store.')
85
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
95
+ logger.error("Could not determine current database: %s", str(e).split('\n')[0])
96
+ logger.warning("Please specify the feature store database manually:")
97
+ logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
86
98
 
87
99
 
88
100
  def setup(database, if_exists='fail'):
89
101
  """
90
- Set up the database environment by configuring schema names and optionally dropping existing tables.
91
-
92
- This function sets the database schema for feature and process catalogs. If specified, it also handles
93
- the replacement of existing catalog tables. It reports the status of these operations, including any
94
- encountered exceptions.
95
-
96
- Parameters:
97
- database (str): The name of the database schema to be used.
98
- if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
99
- 'fail' (default) - Do nothing if the tables exist.
100
- 'replace' - Drop the tables if they exist before creating new ones.
101
-
102
- Steps performed:
103
- 1. Sets the schema to the provided database name.
104
- 2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
105
- 3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
106
- 4. Prints the names of the newly created tables along with the database name.
107
- 5. Captures and prints the first line of any exceptions that occur during these operations.
108
-
109
- Returns:
110
- None
102
+ Initialize the feature store environment by creating catalog tables and views.
111
103
  """
112
104
 
113
105
  from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
114
106
  from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
115
107
 
116
108
  tdfs4ds.SCHEMA = database
109
+ logger_safe("info", "Setting up feature store in database: %s", database)
110
+
117
111
  if if_exists == 'replace':
118
- try:
119
- tdml.db_drop_table(table_name = tdfs4ds.FEATURE_CATALOG_NAME, schema_name=database)
120
- except Exception as e:
121
- print(str(e).split('\n')[0])
122
- try:
123
- tdml.db_drop_table(table_name = tdfs4ds.PROCESS_CATALOG_NAME, schema_name=database)
124
- except Exception as e:
125
- print(str(e).split('\n')[0])
126
- try:
127
- tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
128
- except Exception as e:
129
- print(str(e).split('\n')[0])
112
+ logger_safe("info", "Replacing existing catalog tables if they exist.")
113
+ for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
114
+ try:
115
+ tdml.db_drop_table(table_name=table, schema_name=database)
116
+ logger_safe("info", "Dropped table %s.%s", database, table)
117
+ except Exception as e:
118
+ logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
130
119
 
131
120
  DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
121
+
132
122
  try:
133
123
  tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
134
- print('feature catalog table: ', tdfs4ds.FEATURE_CATALOG_NAME, ' in database ', database)
124
+ logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
135
125
  except Exception as e:
136
- print(str(e).split('\n')[0])
126
+ logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
137
127
 
138
128
  try:
139
- tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME, tdfs4ds.FILTER_MANAGER_NAME = process_store_catalog_creation()
140
- print('process catalog table: ', tdfs4ds.PROCESS_CATALOG_NAME, ' in database ', database)
141
- print('data distribution table: ', tdfs4ds.DATA_DISTRIBUTION_NAME, ' in database ', database)
142
- print('filter manager table: ', tdfs4ds.FILTER_MANAGER_NAME, ' in database ', database)
129
+ (tdfs4ds.PROCESS_CATALOG_NAME,
130
+ tdfs4ds.DATA_DISTRIBUTION_NAME,
131
+ tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
132
+
133
+ logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
134
+ logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
135
+ logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
143
136
  except Exception as e:
144
- print(str(e).split('\n')[0])
137
+ logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
145
138
 
146
139
  try:
147
140
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
141
+ logger_safe("info", "Follow-up table created successfully.")
148
142
  except Exception as e:
149
- print(str(e).split('\n')[0])
143
+ logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
150
144
 
151
145
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
152
146
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
147
+
153
148
  dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
154
149
  if not dataset_catalog._exists():
155
150
  dataset_catalog.create_catalog()
151
+ logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
156
152
 
153
+ logger_safe("info", "Setup complete.")
157
154
  return
158
155
 
156
+
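Reviewer note: with these changes setup() drops and recreates the catalog objects in one loop and reports every step through logger_safe instead of print. A hedged usage sketch, assuming an active teradataml connection and a placeholder database name:

    import tdfs4ds

    # an active teradataml context is assumed to exist already (e.g. via tdml.create_context)
    tdfs4ds.setup(database="MY_FEATURE_DB", if_exists="replace")  # drop and recreate the catalog tables and views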
159
157
  def connect(
160
158
  database = tdfs4ds.SCHEMA,
161
159
  feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
@@ -166,15 +164,15 @@ def connect(
166
164
  feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
167
165
  process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
168
166
  dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
169
- create_if_missing = False # New argument
167
+ create_if_missing = False
170
168
  ):
171
- if database is not None:
172
- tdfs4ds.SCHEMA = database
173
- else:
169
+ if database is None:
174
170
  raise ValueError("database parameter is None.")
171
+ tdfs4ds.SCHEMA = database
172
+ logger_safe("info", "Connecting to feature store in database: %s", database)
175
173
 
176
174
  tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
177
-
175
+
178
176
  feature_exists = feature_catalog_name.lower() in tables
179
177
  process_exists = process_catalog_name.lower() in tables
180
178
  distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +181,20 @@ def connect(
183
181
 
184
182
  if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
185
183
  if not create_if_missing:
186
- return False # Feature store does not exist
187
- else:
188
- # Create the missing components
189
- if not feature_exists:
190
- tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
191
- if not process_exists:
192
- tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
193
- if not distrib_exists:
194
- tdfs4ds.data_distribution.data_distribution_catalog_creation()
195
- if not filter_manager_exists:
196
- tdfs4ds.filter_manager.filter_manager_catalog_creation()
197
-
198
- # Follow-up table handling
184
+ logger_safe("warning", "Feature store components missing and create_if_missing=False")
185
+ return False
186
+ logger_safe("info", "Missing components detected; creating missing parts...")
187
+ if not feature_exists:
188
+ tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
189
+ if not process_exists:
190
+ tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
191
+ if not distrib_exists:
192
+ tdfs4ds.data_distribution.data_distribution_catalog_creation()
193
+ if not filter_manager_exists:
194
+ tdfs4ds.filter_manager.filter_manager_catalog_creation()
195
+
199
196
  if not followup_name_exists:
197
+ logger_safe("info", "Creating follow-up table: %s", followup_name)
200
198
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
201
199
  tdfs4ds.FOLLOW_UP_NAME = followup_name
202
200
 
@@ -210,30 +208,31 @@ def connect(
210
208
 
211
209
  process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
212
210
  if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
213
- print('ENTITY_NULL_SUBSTITUTE column does not exist in the existing process catalog')
214
- print('upgrade to the latest DDL')
211
+ logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
215
212
  tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
216
213
 
217
214
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
218
215
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
219
216
 
220
- # Dataset catalog setup
217
+ # Dataset Catalog
221
218
  tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
222
- dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
219
+ dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
223
220
  if not dataset_catalog._exists():
224
221
  dataset_catalog.create_catalog()
222
+ logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
225
223
 
226
- # Check if distribution is temporal
224
+ # Detect temporal distribution
227
225
  def is_data_distribution_temporal():
228
226
  return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
229
227
  view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
230
228
  schema_name=tdfs4ds.SCHEMA,
231
229
  object_type='table'
232
230
  )
233
-
231
+
234
232
  tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
235
-
236
- return True # Feature store exists or was created
233
+ logger_safe("info", "Connected to feature store successfully.")
234
+ return True
235
+
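Reviewer note: connect() now fails fast when database is None, optionally creates missing catalog components, and returns a boolean instead of printing. A minimal sketch with a placeholder database name:

    import tdfs4ds

    ok = tdfs4ds.connect(database="MY_FEATURE_DB", create_if_missing=False)
    if not ok:
        # catalog components are missing; retry and let connect() create them
        tdfs4ds.connect(database="MY_FEATURE_DB", create_if_missing=True)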
237
236
 
238
237
 
239
238
 
@@ -287,50 +286,22 @@ def get_dataset_entity(dataset_id = None):
287
286
  def get_dataset_features(dataset_id = None):
288
287
  return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
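Reviewer note: get_dataset_features() simply delegates to the DatasetCatalog bound to the configured schema. A one-line sketch with a placeholder dataset id:

    features = tdfs4ds.get_dataset_features(dataset_id="<your dataset id>")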
289
288
 
290
- def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
289
+ def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
291
290
  """
292
291
  Executes a specific process from the feature store identified by the process ID.
293
- The function handles different process types and performs appropriate actions.
294
-
295
- Parameters:
296
- - process_id (str): The unique identifier of the process to run.
297
- - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
298
- Default is False.
299
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
300
- Default is False.
301
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
302
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
303
- where k is the smallest integer so that the original lengths is smaller or equal
304
- to k x force_varchar_length. Default is None.
305
-
306
- Returns:
307
- DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
308
-
309
- This function performs the following steps:
310
- 1. Determines the process type and initializes necessary variables.
311
- 2. Constructs and executes a SQL query to retrieve process details by process ID.
312
- 3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
313
- 4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
314
- 5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
315
- 6. Optionally returns the dataset created during the process if return_dataset is True.
316
-
317
- Note:
318
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
319
- data retrieval to feature uploading.
320
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
321
- a Teradata database and the appropriate schema for feature storage.
292
+ Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
322
293
  """
323
294
 
324
295
  if tdfs4ds.PROCESS_TYPE is None:
325
296
  PROCESS_TYPE_ = 'RUN PROCESS'
326
- tdfs4ds.RUN_ID = str(uuid.uuid4())
297
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
327
298
  else:
328
299
  PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
329
300
 
330
- if tdfs4ds.DEBUG_MODE:
331
- print('def run','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
301
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
302
+ logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
332
303
 
333
- if tdfs4ds.FEATURE_STORE_TIME == None:
304
+ if tdfs4ds.FEATURE_STORE_TIME is None:
334
305
  validtime_statement = 'CURRENT VALIDTIME'
335
306
  else:
336
307
  validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
@@ -342,148 +313,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
342
313
  WHERE A.PROCESS_ID = '{process_id}'
343
314
  """
344
315
 
316
+ logger_safe(
317
+ "info",
318
+ "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
319
+ tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
320
+ )
321
+
345
322
  # Executing the query and converting the result to Pandas DataFrame
346
323
  df = tdml.DataFrame.from_query(query).to_pandas()
347
324
 
348
- # Check if exactly one record is returned, else print an error
325
+ # Check if exactly one record is returned, else log an error and return
349
326
  if df.shape[0] != 1:
350
- print('error - there is ', df.shape[0], f' records. Check table {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}')
351
- print('check ou this query:')
352
- print(query)
327
+ logger_safe(
328
+ "error",
329
+ "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
330
+ df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
331
+ )
353
332
  return
354
333
 
355
-
356
334
  # Fetching the filter manager
357
335
  filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
358
336
  if filter_schema_name is None:
359
337
  filtermanager = None
360
338
  else:
361
339
  filter_view_name = df['FILTER_VIEW_NAME'].values[0]
362
- filter_table_name = df['FILTER_TABLE_NAME'].values[0]
340
+ filter_table_name = df['FILTER_TABLE_NAME'].values[0] # kept for parity; not used directly here
363
341
  filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
364
342
 
365
- # Fetching the process type from the query result
366
- process_type = df['PROCESS_TYPE'].values[0]
367
-
368
- # Fetching the primary index from the query result
369
- primary_index = df['FOR_PRIMARY_INDEX'].values[0]
343
+ # Fetching process metadata
344
+ process_type = df['PROCESS_TYPE'].values[0]
345
+ primary_index = df['FOR_PRIMARY_INDEX'].values[0]
370
346
  if primary_index is not None:
371
- primary_index = primary_index.split(',')
372
-
373
- # Fetching the primary index from the query result
374
- partitioning = df['FOR_DATA_PARTITIONING'].values[0]
375
-
376
- # Fetching the data domain from the query result
377
- DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
347
+ primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
348
+ partitioning = df['FOR_DATA_PARTITIONING'].values[0]
349
+ DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
350
+
351
+ logger_safe(
352
+ "info",
353
+ "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
354
+ process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
355
+ )
378
356
 
379
357
  # Handling 'denormalized view' process type
380
358
  if process_type == 'denormalized view':
381
- # Extracting necessary details for this process type
382
- view_name = df['VIEW_NAME'].values[0]
383
- entity_id = df['ENTITY_ID'].values[0].split(',')
359
+ view_name = df['VIEW_NAME'].values[0]
360
+ entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
384
361
  entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
385
- feature_names = df['FEATURE_NAMES'].values[0].split(',')
362
+ feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
386
363
 
387
- # Fetching data and uploading features to the feature store
388
364
  df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
389
365
 
390
- if tdfs4ds.DEBUG_MODE:
391
- print('run','entity_id',entity_id)
392
- print('run', 'entity_null_substitute', entity_null_substitute)
393
- print('run','feature_names',feature_names)
394
- print('run','process_id',process_id)
395
- print('run','primary_index',primary_index)
396
- print('run','partitioning',partitioning)
366
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
367
+ logger_safe("debug", "run | entity_id=%s", entity_id)
368
+ logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
369
+ logger_safe("debug", "run | feature_names=%s", feature_names)
370
+ logger_safe("debug", "run | process_id=%s", process_id)
371
+ logger_safe("debug", "run | primary_index=%s", primary_index)
372
+ logger_safe("debug", "run | partitioning=%s", partitioning)
373
+
397
374
  dataset = _upload_features(
398
375
  df_data,
399
376
  entity_id,
400
377
  feature_names,
401
- feature_versions = process_id,
402
- primary_index = primary_index,
403
- partitioning = partitioning,
404
- filtermanager = filtermanager,
405
- entity_null_substitute = entity_null_substitute,
406
- process_id = process_id,
407
- force_compute= force_compute,
408
- force_varchar_length = force_varchar_length
378
+ feature_versions=process_id,
379
+ primary_index=primary_index,
380
+ partitioning=partitioning,
381
+ filtermanager=filtermanager,
382
+ entity_null_substitute=entity_null_substitute,
383
+ process_id=process_id,
384
+ force_compute=force_compute,
385
+ force_varchar_length=force_varchar_length
409
386
  )
410
387
 
411
388
  # Handling 'tdstone2 view' process type
412
389
  elif process_type == 'tdstone2 view':
413
- print('not implemented yet')
414
-
390
+ logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
391
+ dataset = None
415
392
 
393
+ else:
394
+ logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
395
+ dataset = None
416
396
 
417
397
  if return_dataset:
398
+ logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
418
399
  return dataset
419
400
  else:
401
+ logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
420
402
  return
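Reviewer note: run() resolves the process in the process catalog view, dispatches on its PROCESS_TYPE, and now logs unknown or unimplemented types instead of printing. A hedged call sketch with a placeholder process id:

    dataset = tdfs4ds.run(
        process_id="<registered process id>",  # as stored in the process catalog
        return_dataset=True,                   # return the dataset built by the process
        force_compute=False                    # skip work already completed for the current FEATURE_STORE_TIME
    )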
421
403
 
422
- def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
423
- """
424
- Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
425
- process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
426
- for further use or inspection.
427
-
428
- The function supports dynamic entity ID interpretation and flexible feature name handling, ensuring compatibility
429
- with various data schemas. It automatically registers the data upload process and applies additional metadata,
430
- if provided.
431
-
432
- Parameters:
433
- - df (DataFrame): The DataFrame containing the feature data to be uploaded.
434
- - entity_id (dict, list, or str): The identifier of the entity to which the features belong. This can be:
435
- - a dictionary mapping column names to their data types,
436
- - a list of column names, which will be automatically converted to a dictionary with types inferred from `df`,
437
- - a string representing a single column name, which will be converted into a list and then to a dictionary as above.
438
- - feature_names (list or str): The names of the features to be uploaded. If a string is provided, it will be
439
- split into a list based on commas or treated as a single feature name.
440
- - metadata (dict, optional): Additional metadata to associate with the upload process. Defaults to an empty dictionary.
441
- - primary_index (list, optional): Specifies the primary index columns for optimizing data storage and retrieval.
442
- - partitioning (str, optional): Defines how the data should be partitioned in the store for performance optimization.
443
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
444
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
445
- Default is an empty dictionary.
446
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
447
- Default is True.
448
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
449
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
450
- where k is the smallest integer so that the original lengths is smaller or equal
451
- to k x force_varchar_length. Default is 1024.
452
- Returns:
453
- DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
454
- or further processing.
455
404
 
456
- The process involves several steps, including entity ID type conversion if necessary, feature name normalization,
457
- process registration in the feature store, and the execution of SQL queries to insert the data. The function concludes
458
- by returning a dataset derived from the uploaded data, offering immediate access to the newly stored information.
459
-
460
- Example:
461
- >>> df = tdml.DataFrame(...)
462
- >>> entity_id = ['customer_id']
463
- >>> feature_names = ['age', 'income']
464
- >>> dataset = upload_features(df, entity_id, feature_names)
465
- >>> # Another example with list-based entity_id, custom primary_index, and partitioning
466
- >>> tddf = tdml.DataFrame(...) # Assuming tddf is predefined with appropriate columns
467
- >>> entity_id = ['tx_type', 'txn_id']
468
- >>> primary_index = ['txn_id']
469
- >>> partitioning = '''
470
- ... PARTITION BY CASE_N (
471
- ... tx_type LIKE 'DEBIT',
472
- ... tx_type LIKE 'PAYMENT',
473
- ... tx_type LIKE 'CASH_OUT',
474
- ... tx_type LIKE 'CASH_IN',
475
- ... tx_type LIKE 'TRANSFER',
476
- ... NO CASE,
477
- ... UNKNOWN)'''
478
- >>> features = [x for x in tddf.columns if x not in entity_id]
479
- >>> dataset = upload_features(
480
- ... df = tddf,
481
- ... entity_id = entity_id,
482
- ... feature_names = features,
483
- ... metadata = {'project': 'test'},
484
- ... primary_index = primary_index,
485
- ... partitioning = partitioning
486
- ... )
405
+ def upload_features(
406
+ df,
407
+ entity_id,
408
+ feature_names,
409
+ metadata={},
410
+ primary_index=None,
411
+ partitioning='',
412
+ filtermanager=None,
413
+ entity_null_substitute={},
414
+ force_compute=True,
415
+ force_varchar_length=1024
416
+ ):
417
+ """
418
+ Uploads feature data from a DataFrame to the feature store for a specified entity.
419
+ All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
487
420
  """
488
421
 
489
422
  from tdfs4ds.utils.info import get_column_types
@@ -491,45 +424,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
491
424
  from tdfs4ds.process_store.process_registration_management import register_process_view
492
425
 
493
426
  # Convert entity_id to a dictionary if it's not already one
494
- if type(entity_id) == list:
427
+ if isinstance(entity_id, list):
495
428
  entity_id.sort()
496
429
  entity_id = get_column_types(df, entity_id)
497
- if tdfs4ds.DISPLAY_LOGS:
498
- print('entity_id has been converted to a proper dictionary : ', entity_id)
499
- elif type(entity_id) == str:
430
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
431
+
432
+ elif isinstance(entity_id, str):
500
433
  entity_id = [entity_id]
501
434
  entity_id = get_column_types(df, entity_id)
502
- if tdfs4ds.DISPLAY_LOGS:
503
- print('entity_id has been converted to a proper dictionary : ', entity_id)
504
-
505
- if type(feature_names) != list:
506
- if tdfs4ds.DISPLAY_LOGS:
507
- print('feature_names is not a list:', feature_names)
508
- if ',' in feature_names:
509
- feature_names = feature_names.split(',')
435
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
436
+
437
+ # Normalize feature_names
438
+ if not isinstance(feature_names, list):
439
+ logger_safe("debug", "feature_names is not a list: %s", feature_names)
440
+ if isinstance(feature_names, str) and ',' in feature_names:
441
+ feature_names = [x.strip() for x in feature_names.split(',')]
510
442
  else:
511
443
  feature_names = [feature_names]
512
- if tdfs4ds.DISPLAY_LOGS:
513
- print('it has been converted to : ', feature_names)
514
- print('check it is a expected.')
515
-
516
- if primary_index is not None and type(primary_index) != list:
517
- if tdfs4ds.DISPLAY_LOGS:
518
- print('primary_index is not a list:', primary_index)
519
- if ',' in primary_index:
520
- primary_index = primary_index.split(',')
444
+ logger_safe("debug", "feature_names converted to list: %s", feature_names)
445
+ logger_safe("debug", "Check the conversion is as expected.")
446
+
447
+ # Normalize primary_index
448
+ if primary_index is not None and not isinstance(primary_index, list):
449
+ logger_safe("debug", "primary_index is not a list: %s", primary_index)
450
+ if isinstance(primary_index, str) and ',' in primary_index:
451
+ primary_index = [x.strip() for x in primary_index.split(',')]
521
452
  else:
522
453
  primary_index = [primary_index]
523
- if tdfs4ds.DISPLAY_LOGS:
524
- print('it has been converted to : ', feature_names)
525
- print('check it is a expected.')
454
+ logger_safe("debug", "primary_index converted to list: %s", primary_index)
455
+ logger_safe("debug", "Check the conversion is as expected.")
526
456
 
457
+ # Partitioning
527
458
  partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
528
459
 
529
- if tdfs4ds.DISPLAY_LOGS:
530
- print("filtermanager", filtermanager)
460
+ logger_safe("debug", "filtermanager: %s", filtermanager)
531
461
 
532
- # Register the process and retrieve the SQL query to insert the features, and the process ID
462
+ # Register process -> get SQL(s) + process_id
533
463
  query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
534
464
  view_name = df,
535
465
  entity_id = entity_id,
@@ -542,104 +472,96 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
542
472
  entity_null_substitute = entity_null_substitute
543
473
  )
544
474
 
545
- # Execute the SQL query to insert the features into the database
546
- execute_query(query_insert)
547
- execute_query(query_insert_dist)
548
- if tdfs4ds.DEBUG_MODE:
549
- print("query_insert_filtermanager",query_insert_filtermanager)
550
- if query_insert_filtermanager is not None:
551
- execute_query(query_insert_filtermanager)
475
+ logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
552
476
 
553
- # Run the registered process and return the resulting dataset
554
- PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
555
- tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
556
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD: tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
557
- tdfs4ds.RUN_ID = str(uuid.uuid4())
558
-
559
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
477
+ # Execute queries
478
+ try:
479
+ execute_query(query_insert)
480
+ logger_safe("info", "Executed main insert query for process_id=%s", process_id)
481
+ except Exception as e:
482
+ logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
483
+ raise
560
484
 
561
- try:
485
+ try:
486
+ execute_query(query_insert_dist)
487
+ logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
488
+ except Exception as e:
489
+ logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
490
+ raise
562
491
 
563
- dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)
492
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
493
+ # Avoid dumping entire SQL in normal logs; keep it debug-only.
494
+ logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
564
495
 
496
+ if query_insert_filtermanager is not None:
497
+ try:
498
+ execute_query(query_insert_filtermanager)
499
+ logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
565
500
  except Exception as e:
566
- tdfs4ds.process_store.process_followup.followup_close(
567
- run_id = tdfs4ds.RUN_ID,
568
- process_type = tdfs4ds.PROCESS_TYPE,
569
- process_id = process_id,
570
- status = 'FAILED,' + str(e).split('\n')[0]
571
- )
501
+ logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
572
502
  raise
573
503
 
504
+ # Run the registered process (with/without dataset)
505
+ PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
506
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
507
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
508
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
509
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
574
510
 
575
- return dataset
576
- else:
511
+ logger_safe(
512
+ "info",
513
+ "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
514
+ tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
515
+ )
516
+
517
+ try:
518
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
519
+ dataset = run(
520
+ process_id=process_id,
521
+ return_dataset=True,
522
+ force_compute=force_compute,
523
+ force_varchar_length=force_varchar_length
524
+ )
525
+ logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
526
+ return dataset
527
+ else:
528
+ run(
529
+ process_id=process_id,
530
+ return_dataset=False,
531
+ force_compute=force_compute,
532
+ force_varchar_length=force_varchar_length
533
+ )
534
+ logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
535
+ return
577
536
 
537
+ except Exception as e:
538
+ # Keep your existing follow-up close behavior, but ensure the error is logged.
578
539
  try:
579
- run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
580
- except Exception as e:
581
540
  tdfs4ds.process_store.process_followup.followup_close(
582
541
  run_id = tdfs4ds.RUN_ID,
583
542
  process_type = tdfs4ds.PROCESS_TYPE,
584
543
  process_id = process_id,
585
544
  status = 'FAILED,' + str(e).split('\n')[0]
586
545
  )
587
- raise
588
- return
589
-
590
- tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
591
-
592
- def _upload_features(df, entity_id, feature_names,
593
- feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
594
- """
595
- Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
596
- feature registration, preparation for ingestion, and storage in the designated feature tables.
546
+ finally:
547
+ logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
548
+ tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
549
+ )
550
+ raise
551
+ finally:
552
+ # Restore previous process type just in case the caller relies on it.
553
+ tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
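Reviewer note: the worked example that lived in the removed upload_features docstring still applies to the new signature; restated here as a sketch with illustrative table and column names:

    import teradataml as tdml
    import tdfs4ds

    tddf = tdml.DataFrame("my_transactions")  # placeholder source table
    entity_id = ["tx_type", "txn_id"]
    features = [c for c in tddf.columns if c not in entity_id]

    dataset = tdfs4ds.upload_features(
        df=tddf,
        entity_id=entity_id,
        feature_names=features,
        metadata={"project": "test"},
        primary_index=["txn_id"],
        partitioning=""  # or a PARTITION BY CASE_N (...) clause as in the removed docstring example
    )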
597
554
 
598
- Parameters:
599
- - df (DataFrame): The input DataFrame containing the feature data.
600
- - entity_id (str or dict): The identifier for the entity to which these features belong. This can be a single ID
601
- (str) or a dictionary of attribute names and values uniquely identifying the entity.
602
- - feature_names (list): A list of strings specifying the names of the features to be uploaded.
603
- - feature_versions (str or list, optional): Specifies the versions of the features to be uploaded. Can be a single
604
- string applied to all features or a list of strings specifying the version
605
- for each feature respectively. Default is 'dev.0.0'.
606
- - primary_index (list, optional): Specifies the columns to be used as the primary index in the feature store tables.
607
- This can significantly impact the performance of data retrieval operations.
608
- - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
609
- enhance query performance based on the access patterns.
610
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
611
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
612
- Default is an empty dictionary.
613
- - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
614
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
615
- Default is False.
616
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
617
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
618
- where k is the smallest integer so that the original lengths is smaller or equal
619
- to k x force_varchar_length. Default is None.
620
555
 
621
556
 
622
- Returns:
623
- DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
624
- metadata, including versions and storage locations.
625
-
626
- This function orchestrates several steps involved in feature storage:
627
- 1. Registers the entity in the feature store if not already present.
628
- 2. Determines the data types of the features based on the input DataFrame.
629
- 3. Registers the features, including their names, types, and versions, in the feature catalog.
630
- 4. Prepares the feature data for ingestion, including any necessary transformations.
631
- 5. Stores the prepared feature data in the feature store.
632
- 6. Optionally, cleans up temporary resources used during the process.
633
- 7. Builds and returns a view of the dataset representing the uploaded features for easy access.
634
-
635
- Note:
636
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
637
- entity and feature registration to data preparation and storage.
638
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
639
- a Teradata database and the appropriate schema for feature storage.
640
- - The function assumes that the feature_versions, if provided as a list, matches the length of feature_names.
641
- """
642
-
557
+ def _upload_features(
558
+ df, entity_id, feature_names,
559
+ feature_versions=FEATURE_VERSION_DEFAULT,
560
+ primary_index=None, partitioning='',
561
+ filtermanager=None, entity_null_substitute={},
562
+ process_id=None, force_compute=False,
563
+ force_varchar_length=None
564
+ ):
643
565
  from tdfs4ds.feature_store.entity_management import register_entity
644
566
  from tdfs4ds.feature_store.feature_store_management import Gettdtypes
645
567
  from tdfs4ds.feature_store.feature_store_management import register_features
@@ -647,193 +569,141 @@ def _upload_features(df, entity_id, feature_names,
647
569
  from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
648
570
  from tdfs4ds.utils.info import get_column_types, update_varchar_length
649
571
 
650
- # Convert entity_id to a dictionary if it's not already one
651
- if type(entity_id) == list:
572
+ # Convert entity_id to a dictionary if not already
573
+ if isinstance(entity_id, list):
652
574
  entity_id.sort()
653
575
  entity_id = get_column_types(df, entity_id)
654
- if tdfs4ds.DISPLAY_LOGS:
655
- print('entity_id has been converted to a proper dictionary : ', entity_id)
656
- elif type(entity_id) == str:
657
- entity_id = [entity_id]
658
- entity_id = get_column_types(df, entity_id)
659
- if tdfs4ds.DISPLAY_LOGS:
660
- print('entity_id has been converted to a proper dictionary : ', entity_id)
661
-
662
- #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
663
-
664
- # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
665
- # If feature_versions is a string, create a dictionary mapping each feature name to this string.
666
- if type(feature_versions) == list:
667
- selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
576
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
577
+ elif isinstance(entity_id, str):
578
+ entity_id = get_column_types(df, [entity_id])
579
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
580
+
581
+ # Map feature versions
582
+ if isinstance(feature_versions, list):
583
+ selected_features = dict(zip(feature_names, feature_versions))
668
584
  else:
669
585
  selected_features = {k: feature_versions for k in feature_names}
670
586
 
671
- # Get the Teradata types of the features in df.
672
- feature_names_types = Gettdtypes(
673
- df,
674
- features_columns=feature_names,
675
- entity_id=entity_id
676
- )
587
+ # Get Teradata types for features
588
+ feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
677
589
 
678
590
  if force_varchar_length is not None:
679
- print(feature_names_types)
680
- feature_names_types = update_varchar_length(feature_names_types,new_varchar_length = force_varchar_length)
591
+ logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
592
+ feature_names_types = update_varchar_length(
593
+ feature_names_types,
594
+ new_varchar_length=force_varchar_length
595
+ )
681
596
 
682
597
  def validate_feature_types(feature_names_types):
683
- """
684
- Validates feature data types and raises an error if any value contains
685
- the substrings 'clob', 'blob', or 'json' (case insensitive).
686
-
687
- Parameters:
688
- feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
689
-
690
- Raises:
691
- ValueError: If any feature type contains 'clob', 'blob', or 'json'.
692
- """
693
- invalid_types = {key: value['type'] for key, value in feature_names_types.items()
694
- if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
695
-
696
- if invalid_types:
598
+ invalid = {
599
+ k: v['type'] for k, v in feature_names_types.items()
600
+ if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
601
+ }
602
+ if invalid:
697
603
  raise ValueError(
698
- f"The following features have unsupported data types: {invalid_types}. "
699
- "The data types 'CLOB', 'BLOB', and 'JSON' are not yet managed by the feature store."
604
+ f"Unsupported data types found: {invalid}. "
605
+ "CLOB/BLOB/JSON are not supported."
700
606
  )
701
-
702
- validate_feature_types(feature_names_types)
703
-
607
+
608
+ validate_feature_types(feature_names_types)
609
+
610
+ logger_safe("info", "Registering entity %s in feature store", entity_id)
704
611
  register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
705
612
 
706
- if tdfs4ds.DEBUG_MODE:
707
- print('_upload_features', 'entity_id', entity_id)
708
- print('_upload_features', 'entity_null_substitute', entity_null_substitute)
709
- print('_upload_features', 'feature_names', feature_names)
710
- print('_upload_features', 'primary_index', primary_index)
711
- print('_upload_features', 'partitioning', partitioning)
712
- print('_upload_features', 'selected_features', selected_features)
713
- print('_upload_features', 'df.columns', df.columns)
714
-
715
- # Register the features in the feature catalog.
716
- register_features(
717
- entity_id,
718
- feature_names_types,
719
- primary_index,
720
- partitioning
721
- )
722
-
723
- if tdfs4ds.DEBUG_MODE:
724
- print("---------_upload_features")
725
- print("filtermanager : ", filtermanager)
726
- print("feature names : ", feature_names)
727
- print("selected features : ", selected_features)
728
-
729
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
613
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
614
+ logger_safe(
615
+ "debug",
616
+ "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
617
+ entity_id, entity_null_substitute, feature_names, primary_index, partitioning
618
+ )
619
+ logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
620
+
621
+ register_features(entity_id, feature_names_types, primary_index, partitioning)
622
+ logger_safe("info", "Features registered in catalog: %s", feature_names)
623
+
624
+ follow_up = None
625
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
730
626
  follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
731
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
732
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
733
- if filtermanager is None:
734
- do_compute = True
735
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
736
- if follow_up.shape[0] > 0:
737
- do_compute = False
627
+ follow_up = follow_up[
628
+ (follow_up.STATUS == 'COMPLETED') &
629
+ (follow_up.VALIDTIME_DATE.isna() == False) &
630
+ (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
631
+ (follow_up.PROCESS_ID == process_id)
632
+ ]
738
633
 
739
- # Prepare the features for ingestion.
634
+ if filtermanager is None:
635
+ do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
740
636
  if do_compute or force_compute:
741
-
637
+ logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
742
638
  tdfs4ds.process_store.process_followup.followup_open(
743
- run_id = tdfs4ds.RUN_ID,
744
- process_type = tdfs4ds.PROCESS_TYPE,
745
- process_id = process_id
639
+ run_id=tdfs4ds.RUN_ID,
640
+ process_type=tdfs4ds.PROCESS_TYPE,
641
+ process_id=process_id
746
642
  )
747
-
748
643
  try:
749
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
750
- df,
751
- entity_id,
752
- feature_names,
644
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
645
+ df, entity_id, feature_names,
753
646
  feature_versions=selected_features,
754
647
  primary_index=primary_index,
755
648
  entity_null_substitute=entity_null_substitute,
756
649
  partitioning=partitioning
757
650
  )
758
- # Store the prepared features in the feature store.
759
- store_feature(
760
- entity_id,
761
- volatile_table_name,
762
- entity_null_substitute=entity_null_substitute,
763
- primary_index=primary_index,
764
- partitioning=partitioning,
765
- features_infos = features_infos
766
- )
767
-
768
- # Collect statistics
769
- apply_collect_stats(
770
- entity_id,
771
- primary_index = primary_index,
772
- partitioning = partitioning,
773
- feature_infos = features_infos
774
- )
651
+ store_feature(entity_id, volatile_table, entity_null_substitute,
652
+ primary_index, partitioning, features_infos)
653
+ apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
775
654
 
776
655
  tdfs4ds.process_store.process_followup.followup_close(
777
- run_id = tdfs4ds.RUN_ID,
778
- process_type = tdfs4ds.PROCESS_TYPE,
779
- process_id = process_id
656
+ run_id=tdfs4ds.RUN_ID,
657
+ process_type=tdfs4ds.PROCESS_TYPE,
658
+ process_id=process_id
780
659
  )
660
+ logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
781
661
 
782
662
  except Exception as e:
663
+ logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
783
664
  tdfs4ds.process_store.process_followup.followup_close(
784
- run_id = tdfs4ds.RUN_ID,
785
- process_type = tdfs4ds.PROCESS_TYPE,
786
- process_id = process_id,
787
- status = 'FAILED,' + str(e).split('\n')[0]
665
+ run_id=tdfs4ds.RUN_ID,
666
+ process_type=tdfs4ds.PROCESS_TYPE,
667
+ process_id=process_id,
668
+ status='FAILED,' + str(e).split('\n')[0]
788
669
  )
789
670
  raise
790
- else:
791
- # get the total number of filter condition in the filter manager
792
- nb_filters = filtermanager.nb_filters
793
671
 
794
- # the flag that indicates that we computed something in the next loop
672
+ else:
673
+ logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
795
674
  something_computed = False
675
+ for i in tqdm(
676
+ range(filtermanager.nb_filters),
677
+ total=filtermanager.nb_filters,
678
+ desc="Applying filters",
679
+ unit="filter",
680
+ leave=False
681
+ ):
682
+ filter_id = i + 1
683
+ filtermanager.update(filter_id)
684
+
685
+ # show which filter is being applied in the bar
686
+ try:
687
+ tqdm.write(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
688
+ # If display() returns a long string, you can shorten it:
689
+ bar_info = str(filtermanager.display())
690
+ if len(bar_info) > 80:
691
+ bar_info = bar_info[:77] + "..."
692
+ tqdm.tqdm._instances and next(iter(tqdm.tqdm._instances)).set_postfix_str(bar_info)
693
+ except Exception:
694
+ # postfix is optional; ignore errors from display() here
695
+ pass
696
+
697
+ logger_safe("debug", "Applying filter %s/%s:\n%s",
698
+ i + 1, filtermanager.nb_filters, filtermanager.display())
796
699
 
797
- for i in range(nb_filters):
798
-
799
- # place the cursor on the next filter
800
- filtermanager.update(i+1)
801
-
802
- if filtermanager.time_filtering:
803
- # if the filter manager is hybrid, then synchronize the time with tdfs4ds
804
- tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
805
-
806
- # overwrite the follow up table to tilter on the VALIDTIME_DATE too
807
- follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
808
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
809
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
810
-
811
- # initialize do_compute, the flag that something has to be computed
812
700
  do_compute = True
813
-
814
- # if the process_id is defined and if we are working at a specific time:
815
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
816
- # we check if the filter condition has already been computed
817
- follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
818
- tdml.DataFrame.from_query(
819
- f"""
820
- SELECT
821
- CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
822
- FROM {filtermanager.schema_name}.{filtermanager.view_name}
823
- """
824
- ),
825
- on = 'APPLIED_FILTER',
826
- how = 'inner',
827
- lprefix = 'l',
828
- rprefix = 'r'
829
- )
830
- # if already computed and completed, then do_compute is set to False
831
- if follow_up_.shape[0] > 0:
701
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
702
+ # see if already computed
703
+ follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
704
+ if follow_up.shape[0] > 0:
832
705
  do_compute = False
833
706
 
834
- if tdfs4ds.DISPLAY_LOGS:
835
- print(filtermanager.display())
836
-
837
707
  if do_compute or force_compute:
838
708
  tdfs4ds.process_store.process_followup.followup_open(
839
709
  run_id = tdfs4ds.RUN_ID,
@@ -842,83 +712,58 @@ def _upload_features(df, entity_id, feature_names,
842
712
  filtermanager = filtermanager
843
713
  )
844
714
  try:
845
- # Prepare the features for ingestion.
846
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
847
- df,
848
- entity_id,
849
- feature_names,
715
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
716
+ df, entity_id, feature_names,
850
717
  feature_versions = selected_features,
851
718
  primary_index = primary_index,
852
719
  entity_null_substitute = entity_null_substitute,
853
720
  partitioning = partitioning
854
721
  )
855
722
 
856
- # Store the prepared features in the feature store.
857
- store_feature(
858
- entity_id,
859
- volatile_table_name,
860
- entity_null_substitute=entity_null_substitute,
861
- primary_index = primary_index,
862
- partitioning = partitioning,
863
- features_infos=features_infos
864
-
865
- )
866
-
867
- # indicate that something has been processed:
723
+ store_feature(entity_id, volatile_table, entity_null_substitute,
724
+ primary_index, partitioning, features_infos)
725
+
868
726
  something_computed = True
869
727
 
870
728
  tdfs4ds.process_store.process_followup.followup_close(
871
- run_id=tdfs4ds.RUN_ID,
872
- process_type=tdfs4ds.PROCESS_TYPE,
873
- process_id=process_id,
729
+ run_id = tdfs4ds.RUN_ID,
730
+ process_type = tdfs4ds.PROCESS_TYPE,
731
+ process_id = process_id,
874
732
  filtermanager = filtermanager
875
733
  )
876
734
 
877
735
  except Exception as e:
878
- print(e)
736
+ logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
879
737
  tdfs4ds.process_store.process_followup.followup_close(
880
- run_id=tdfs4ds.RUN_ID,
881
- process_type=tdfs4ds.PROCESS_TYPE,
882
- process_id=process_id,
883
- status='FAILED,' + str(e).split('\n')[0],
884
- filtermanager=filtermanager
738
+ run_id = tdfs4ds.RUN_ID,
739
+ process_type = tdfs4ds.PROCESS_TYPE,
740
+ process_id = process_id,
741
+ status = 'FAILED,' + str(e).split('\n')[0],
742
+ filtermanager = filtermanager
885
743
  )
886
744
  raise
887
- # Clean up by dropping the temporary volatile table.
888
- # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
889
745
 
890
- # Collect statistics only if something has been computed
891
746
  if something_computed:
892
- apply_collect_stats(
893
- entity_id,
894
- primary_index = primary_index,
895
- partitioning = partitioning,
896
- feature_infos = features_infos
897
- )
747
+ apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
898
748
 
899
- # Build a dataset view in the feature store.
900
749
  if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
901
- if tdfs4ds.DISPLAY_LOGS: print('build dataset for validation')
750
+ logger_safe("info", "Building dataset for validation...")
902
751
  try:
903
- dataset = build_dataset(
904
- entity_id,
905
- selected_features,
752
+ return build_dataset(
753
+ entity_id, selected_features,
906
754
  view_name=None,
907
- entity_null_substitute = entity_null_substitute
755
+ entity_null_substitute=entity_null_substitute
908
756
  )
909
757
  except Exception as e:
910
- print('ERROR at build_dataset in _upload_features:')
911
- print(str(e).split('\n')[0])
912
- print('entity :', entity_id)
913
- print('selected features :', selected_features)
914
-
915
- # Return the dataset view.
916
- return dataset
758
+ logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
759
+ logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
917
760
  else:
918
- if tdfs4ds.DISPLAY_LOGS: print('no dataset built for validation. Set tdfs4ds.BUILD_DATASET_AT_UPLOAD to True if you want it')
761
+ logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
919
762
  return
920
763
 
921
764
 
765
+
766
+
922
767
  def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
923
768
  feature_store_time=False, join_type='INNER'):
924
769
  """
@@ -1366,9 +1211,6 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
1366
1211
  >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
1367
1212
  """
1368
1213
 
1369
- #global DISPLAY_LOGS
1370
- #global FEATURE_STORE_TIME
1371
-
1372
1214
  # Disable display logs
1373
1215
  temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
1374
1216
  tdfs4ds.DISPLAY_LOGS = False
@@ -1376,40 +1218,43 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
1376
1218
  tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
1377
1219
  tdfs4ds.RUN_ID = str(uuid.uuid4())
1378
1220
 
1379
-
1380
-
1381
1221
  try:
1222
+ # Define range of time steps
1382
1223
  if time_id_end is None:
1383
- pbar = tqdm.tqdm(range(time_id_start, time_manager.nb_time_steps + 1), desc="Starting")
1224
+ time_range = range(time_id_start, time_manager.nb_time_steps + 1)
1384
1225
  else:
1385
- pbar = tqdm.tqdm(range(time_id_start, min([time_manager.nb_time_steps + 1,time_id_end+1]) ), desc="Starting")
1386
- # Iterate over each date in the provided list
1226
+ time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
1227
+
1228
+ # Progress bar
1229
+ pbar = tqdm(time_range, desc="Starting rollout", unit="step")
1230
+
1387
1231
  for i in pbar:
1388
- # Update the time manager with the new date
1389
- time_manager.update(time_id = i )
1232
+ # Update time manager
1233
+ time_manager.update(time_id=i)
1390
1234
  date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
1391
- pbar.set_description(f"Processing {date_}")
1392
- # Synchronize the time for the feature store with the current date
1235
+
1236
+ # Sync feature store time
1393
1237
  tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
1394
- pbar.set_description(f"Processing {tdfs4ds.FEATURE_STORE_TIME}")
1238
+
1239
+ # Display current progress in tqdm
1240
+ pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
1241
+
1395
1242
  if tdfs4ds.DEBUG_MODE:
1396
- print('def roll_out','date_', date_)
1397
- print('def roll_out','time_manager.get_date_in_the_past()', time_manager.get_date_in_the_past())
1398
- print('def roll_out','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
1399
- # Execute each process in the process list for the current date
1243
+ print("roll_out | date_:", date_)
1244
+ print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
1245
+
1246
+ # Execute all processes for this time step
1400
1247
  for proc_id in process_list:
1401
- pbar.set_description(f"Processing {date_} process {proc_id}")
1248
+ pbar.set_description(f"Processing {date_} | proc {proc_id}")
1402
1249
  run(process_id=proc_id, force_compute=False)
1403
1250
 
1251
+ # Restore settings
1404
1252
  tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
1253
+
1405
1254
  except Exception as e:
1406
1255
  tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
1407
- # If an exception occurs, print the date and the first line of the exception message
1408
- #print(date_)
1409
1256
  print(str(e).split('\n')[0])
1410
1257
  tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
1411
1258
  raise
1412
1259
 
1413
- tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
1414
-
1415
-
1260
+ tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
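Reviewer note: roll_out() now drives a single tqdm bar over the selected time steps and shows the business date and feature store time as a postfix. A sketch mirroring the docstring example above; the process ids and the time_manager object are placeholders assumed to exist:

    process_list = ["<process id 1>", "<process id 2>"]
    # time_manager is an existing tdfs4ds time manager instance exposing nb_time_steps, update() and get_date_in_the_past()
    tdfs4ds.roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)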