tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.41__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,5 +1,7 @@
1
- __version__ = '0.2.4.26'
1
+ __version__ = '0.2.4.41'
2
2
  import logging
3
+ import json
4
+
3
5
  # Setup the logger
4
6
  logging.basicConfig(
5
7
  level=logging.INFO,
@@ -7,6 +9,15 @@ logging.basicConfig(
7
9
  datefmt='%Y-%m-%d %H:%M:%S' # Set the date/time format
8
10
  )
9
11
 
12
+ # Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
13
+ def logger_safe(level, message, *args, **kwargs):
14
+ """
15
+ Wrapper around the global `logger` that only emits logs when
16
+ tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
17
+ """
18
+ if getattr(tdfs4ds, "DISPLAY_LOGS", True):
19
+ getattr(logger, level)(message, *args, **kwargs)
20
+
10
21
  logger = logging.getLogger(__name__)
11
22
 
12
23
  from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
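For orientation, a minimal usage sketch of the new `logger_safe` gate introduced above (it relies on the module-level `DISPLAY_LOGS` flag used throughout this file; the messages are illustrative):

    import tdfs4ds

    tdfs4ds.DISPLAY_LOGS = True
    tdfs4ds.logger_safe("info", "feature store schema set to %s", "MY_DB")   # emitted through logger.info

    tdfs4ds.DISPLAY_LOGS = False
    tdfs4ds.logger_safe("warning", "this message is suppressed")             # gate closed, nothing is logged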
@@ -57,7 +68,7 @@ import tdfs4ds.datasets
57
68
  import time
58
69
 
59
70
  import inspect
60
- import tqdm
71
+ from tqdm.auto import tqdm # auto picks the right frontend (notebook/terminal)
61
72
 
62
73
  from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
63
74
 
@@ -70,92 +81,80 @@ PROCESS_TYPE = 'RUN PROCESS'
70
81
  try:
71
82
  SCHEMA = tdml.context.context._get_current_databasename()
72
83
  if SCHEMA is None:
73
- print('Please specify the database which is hosting the feature store.')
74
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
84
+ logger.warning("No default database detected for feature store.")
85
+ logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
75
86
  else:
76
- print('The default database is used for the feature store.')
77
- print(f"tdfs4ds.feature_store.schema = '{SCHEMA}'")
87
+ logger.info("Default database detected for feature store: %s", SCHEMA)
88
+ logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
89
+
78
90
  if DATA_DOMAIN is None:
79
91
  DATA_DOMAIN = SCHEMA
80
- print(f"the data domain for the current work is :{DATA_DOMAIN}")
81
- print("Please update it as you wish with tdfs4ds.DATA_DOMAIN=<your data domain>")
92
+ logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
93
+ logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
82
94
 
83
95
  except Exception as e:
84
- print('Please specify the database which is hosting the feature store.')
85
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
96
+ logger.error("Could not determine current database: %s", str(e).split('\n')[0])
97
+ logger.warning("Please specify the feature store database manually:")
98
+ logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
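If auto-detection fails (this except branch), the warnings above point to a manual configuration along these lines (a sketch only; the attribute path is the one quoted in the warning, and the database and domain names are placeholders):

    import tdfs4ds

    tdfs4ds.feature_store.schema = "MY_FEATURE_STORE_DB"   # as suggested by the warning above
    tdfs4ds.DATA_DOMAIN = "MY_DATA_DOMAIN"                  # optional override of the data domain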
86
99
 
87
100
 
88
101
  def setup(database, if_exists='fail'):
89
102
  """
90
- Set up the database environment by configuring schema names and optionally dropping existing tables.
91
-
92
- This function sets the database schema for feature and process catalogs. If specified, it also handles
93
- the replacement of existing catalog tables. It reports the status of these operations, including any
94
- encountered exceptions.
95
-
96
- Parameters:
97
- database (str): The name of the database schema to be used.
98
- if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
99
- 'fail' (default) - Do nothing if the tables exist.
100
- 'replace' - Drop the tables if they exist before creating new ones.
101
-
102
- Steps performed:
103
- 1. Sets the schema to the provided database name.
104
- 2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
105
- 3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
106
- 4. Prints the names of the newly created tables along with the database name.
107
- 5. Captures and prints the first line of any exceptions that occur during these operations.
108
-
109
- Returns:
110
- None
103
+ Initialize the feature store environment by creating catalog tables and views.
111
104
  """
112
105
 
113
106
  from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
114
107
  from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
115
108
 
116
109
  tdfs4ds.SCHEMA = database
110
+ logger_safe("info", "Setting up feature store in database: %s", database)
111
+
117
112
  if if_exists == 'replace':
118
- try:
119
- tdml.db_drop_table(table_name = tdfs4ds.FEATURE_CATALOG_NAME, schema_name=database)
120
- except Exception as e:
121
- print(str(e).split('\n')[0])
122
- try:
123
- tdml.db_drop_table(table_name = tdfs4ds.PROCESS_CATALOG_NAME, schema_name=database)
124
- except Exception as e:
125
- print(str(e).split('\n')[0])
126
- try:
127
- tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
128
- except Exception as e:
129
- print(str(e).split('\n')[0])
113
+ logger_safe("info", "Replacing existing catalog tables if they exist.")
114
+ for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
115
+ try:
116
+ tdml.db_drop_table(table_name=table, schema_name=database)
117
+ logger_safe("info", "Dropped table %s.%s", database, table)
118
+ except Exception as e:
119
+ logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
130
120
 
131
121
  DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
122
+
132
123
  try:
133
124
  tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
134
- print('feature catalog table: ', tdfs4ds.FEATURE_CATALOG_NAME, ' in database ', database)
125
+ logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
135
126
  except Exception as e:
136
- print(str(e).split('\n')[0])
127
+ logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
137
128
 
138
129
  try:
139
- tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME, tdfs4ds.FILTER_MANAGER_NAME = process_store_catalog_creation()
140
- print('process catalog table: ', tdfs4ds.PROCESS_CATALOG_NAME, ' in database ', database)
141
- print('data distribution table: ', tdfs4ds.DATA_DISTRIBUTION_NAME, ' in database ', database)
142
- print('filter manager table: ', tdfs4ds.FILTER_MANAGER_NAME, ' in database ', database)
130
+ (tdfs4ds.PROCESS_CATALOG_NAME,
131
+ tdfs4ds.DATA_DISTRIBUTION_NAME,
132
+ tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
133
+
134
+ logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
135
+ logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
136
+ logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
143
137
  except Exception as e:
144
- print(str(e).split('\n')[0])
138
+ logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
145
139
 
146
140
  try:
147
141
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
142
+ logger_safe("info", "Follow-up table created successfully.")
148
143
  except Exception as e:
149
- print(str(e).split('\n')[0])
144
+ logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
150
145
 
151
146
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
152
147
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
148
+
153
149
  dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
154
150
  if not dataset_catalog._exists():
155
151
  dataset_catalog.create_catalog()
152
+ logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
156
153
 
154
+ logger_safe("info", "Setup complete.")
157
155
  return
158
156
 
157
+
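A hedged usage sketch for the reworked `setup()` (connection details and the database name are placeholders, not part of this diff):

    import teradataml as tdml
    import tdfs4ds

    tdml.create_context(host="<host>", username="<user>", password="<password>")
    tdfs4ds.setup(database="MY_FEATURE_STORE_DB", if_exists="replace")  # drops and recreates the catalog tables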
159
158
  def connect(
160
159
  database = tdfs4ds.SCHEMA,
161
160
  feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
@@ -166,15 +165,15 @@ def connect(
166
165
  feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
167
166
  process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
168
167
  dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
169
- create_if_missing = False # New argument
168
+ create_if_missing = False
170
169
  ):
171
- if database is not None:
172
- tdfs4ds.SCHEMA = database
173
- else:
170
+ if database is None:
174
171
  raise ValueError("database parameter is None.")
172
+ tdfs4ds.SCHEMA = database
173
+ logger_safe("info", "Connecting to feature store in database: %s", database)
175
174
 
176
175
  tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
177
-
176
+
178
177
  feature_exists = feature_catalog_name.lower() in tables
179
178
  process_exists = process_catalog_name.lower() in tables
180
179
  distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +182,20 @@ def connect(
183
182
 
184
183
  if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
185
184
  if not create_if_missing:
186
- return False # Feature store does not exist
187
- else:
188
- # Create the missing components
189
- if not feature_exists:
190
- tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
191
- if not process_exists:
192
- tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
193
- if not distrib_exists:
194
- tdfs4ds.data_distribution.data_distribution_catalog_creation()
195
- if not filter_manager_exists:
196
- tdfs4ds.filter_manager.filter_manager_catalog_creation()
197
-
198
- # Follow-up table handling
185
+ logger_safe("warning", "Feature store components missing and create_if_missing=False")
186
+ return False
187
+ logger_safe("info", "Missing components detected; creating missing parts...")
188
+ if not feature_exists:
189
+ tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
190
+ if not process_exists:
191
+ tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
192
+ if not distrib_exists:
193
+ tdfs4ds.data_distribution.data_distribution_catalog_creation()
194
+ if not filter_manager_exists:
195
+ tdfs4ds.filter_manager.filter_manager_catalog_creation()
196
+
199
197
  if not followup_name_exists:
198
+ logger_safe("info", "Creating follow-up table: %s", followup_name)
200
199
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
201
200
  tdfs4ds.FOLLOW_UP_NAME = followup_name
202
201
 
@@ -210,30 +209,31 @@ def connect(
210
209
 
211
210
  process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
212
211
  if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
213
- print('ENTITY_NULL_SUBSTITUTE column does not exist in the existing process catalog')
214
- print('upgrade to the latest DDL')
212
+ logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
215
213
  tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
216
214
 
217
215
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
218
216
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
219
217
 
220
- # Dataset catalog setup
218
+ # Dataset Catalog
221
219
  tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
222
- dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
220
+ dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
223
221
  if not dataset_catalog._exists():
224
222
  dataset_catalog.create_catalog()
223
+ logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
225
224
 
226
- # Check if distribution is temporal
225
+ # Detect temporal distribution
227
226
  def is_data_distribution_temporal():
228
227
  return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
229
228
  view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
230
229
  schema_name=tdfs4ds.SCHEMA,
231
230
  object_type='table'
232
231
  )
233
-
232
+
234
233
  tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
235
-
236
- return True # Feature store exists or was created
234
+ logger_safe("info", "Connected to feature store successfully.")
235
+ return True
236
+
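A sketch of how the revised `connect()` and its boolean return are meant to be used (database name illustrative):

    import tdfs4ds

    if not tdfs4ds.connect(database="MY_FEATURE_STORE_DB"):
        # Catalog objects are missing and create_if_missing=False: create them on a second call.
        tdfs4ds.connect(database="MY_FEATURE_STORE_DB", create_if_missing=True)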
237
237
 
238
238
 
239
239
 
@@ -287,50 +287,22 @@ def get_dataset_entity(dataset_id = None):
287
287
  def get_dataset_features(dataset_id = None):
288
288
  return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
289
289
 
290
- def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
290
+ def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
291
291
  """
292
292
  Executes a specific process from the feature store identified by the process ID.
293
- The function handles different process types and performs appropriate actions.
294
-
295
- Parameters:
296
- - process_id (str): The unique identifier of the process to run.
297
- - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
298
- Default is False.
299
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
300
- Default is False.
301
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
302
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
303
- where k is the smallest integer so that the original lengths is smaller or equal
304
- to k x force_varchar_length. Default is None.
305
-
306
- Returns:
307
- DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
308
-
309
- This function performs the following steps:
310
- 1. Determines the process type and initializes necessary variables.
311
- 2. Constructs and executes a SQL query to retrieve process details by process ID.
312
- 3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
313
- 4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
314
- 5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
315
- 6. Optionally returns the dataset created during the process if return_dataset is True.
316
-
317
- Note:
318
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
319
- data retrieval to feature uploading.
320
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
321
- a Teradata database and the appropriate schema for feature storage.
293
+ Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
322
294
  """
323
295
 
324
296
  if tdfs4ds.PROCESS_TYPE is None:
325
297
  PROCESS_TYPE_ = 'RUN PROCESS'
326
- tdfs4ds.RUN_ID = str(uuid.uuid4())
298
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
327
299
  else:
328
300
  PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
329
301
 
330
- if tdfs4ds.DEBUG_MODE:
331
- print('def run','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
302
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
303
+ logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
332
304
 
333
- if tdfs4ds.FEATURE_STORE_TIME == None:
305
+ if tdfs4ds.FEATURE_STORE_TIME is None:
334
306
  validtime_statement = 'CURRENT VALIDTIME'
335
307
  else:
336
308
  validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
@@ -342,148 +314,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
342
314
  WHERE A.PROCESS_ID = '{process_id}'
343
315
  """
344
316
 
317
+ logger_safe(
318
+ "info",
319
+ "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
320
+ tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
321
+ )
322
+
345
323
  # Executing the query and converting the result to Pandas DataFrame
346
324
  df = tdml.DataFrame.from_query(query).to_pandas()
347
325
 
348
- # Check if exactly one record is returned, else print an error
326
+ # Check if exactly one record is returned, else log an error and return
349
327
  if df.shape[0] != 1:
350
- print('error - there is ', df.shape[0], f' records. Check table {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}')
351
- print('check ou this query:')
352
- print(query)
328
+ logger_safe(
329
+ "error",
330
+ "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
331
+ df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
332
+ )
353
333
  return
354
334
 
355
-
356
335
  # Fetching the filter manager
357
336
  filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
358
337
  if filter_schema_name is None:
359
338
  filtermanager = None
360
339
  else:
361
340
  filter_view_name = df['FILTER_VIEW_NAME'].values[0]
362
- filter_table_name = df['FILTER_TABLE_NAME'].values[0]
341
+ filter_table_name = df['FILTER_TABLE_NAME'].values[0] # kept for parity; not used directly here
363
342
  filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
364
343
 
365
- # Fetching the process type from the query result
366
- process_type = df['PROCESS_TYPE'].values[0]
367
-
368
- # Fetching the primary index from the query result
369
- primary_index = df['FOR_PRIMARY_INDEX'].values[0]
344
+ # Fetching process metadata
345
+ process_type = df['PROCESS_TYPE'].values[0]
346
+ primary_index = df['FOR_PRIMARY_INDEX'].values[0]
370
347
  if primary_index is not None:
371
- primary_index = primary_index.split(',')
372
-
373
- # Fetching the primary index from the query result
374
- partitioning = df['FOR_DATA_PARTITIONING'].values[0]
375
-
376
- # Fetching the data domain from the query result
377
- DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
348
+ primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
349
+ partitioning = df['FOR_DATA_PARTITIONING'].values[0]
350
+ DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
351
+
352
+ logger_safe(
353
+ "info",
354
+ "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
355
+ process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
356
+ )
378
357
 
379
358
  # Handling 'denormalized view' process type
380
359
  if process_type == 'denormalized view':
381
- # Extracting necessary details for this process type
382
- view_name = df['VIEW_NAME'].values[0]
383
- entity_id = df['ENTITY_ID'].values[0].split(',')
360
+ view_name = df['VIEW_NAME'].values[0]
361
+ entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
384
362
  entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
385
- feature_names = df['FEATURE_NAMES'].values[0].split(',')
363
+ feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
386
364
 
387
- # Fetching data and uploading features to the feature store
388
365
  df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
389
366
 
390
- if tdfs4ds.DEBUG_MODE:
391
- print('run','entity_id',entity_id)
392
- print('run', 'entity_null_substitute', entity_null_substitute)
393
- print('run','feature_names',feature_names)
394
- print('run','process_id',process_id)
395
- print('run','primary_index',primary_index)
396
- print('run','partitioning',partitioning)
367
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
368
+ logger_safe("debug", "run | entity_id=%s", entity_id)
369
+ logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
370
+ logger_safe("debug", "run | feature_names=%s", feature_names)
371
+ logger_safe("debug", "run | process_id=%s", process_id)
372
+ logger_safe("debug", "run | primary_index=%s", primary_index)
373
+ logger_safe("debug", "run | partitioning=%s", partitioning)
374
+
397
375
  dataset = _upload_features(
398
376
  df_data,
399
377
  entity_id,
400
378
  feature_names,
401
- feature_versions = process_id,
402
- primary_index = primary_index,
403
- partitioning = partitioning,
404
- filtermanager = filtermanager,
405
- entity_null_substitute = entity_null_substitute,
406
- process_id = process_id,
407
- force_compute= force_compute,
408
- force_varchar_length = force_varchar_length
379
+ feature_versions=process_id,
380
+ primary_index=primary_index,
381
+ partitioning=partitioning,
382
+ filtermanager=filtermanager,
383
+ entity_null_substitute=entity_null_substitute,
384
+ process_id=process_id,
385
+ force_compute=force_compute,
386
+ force_varchar_length=force_varchar_length
409
387
  )
410
388
 
411
389
  # Handling 'tdstone2 view' process type
412
390
  elif process_type == 'tdstone2 view':
413
- print('not implemented yet')
414
-
391
+ logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
392
+ dataset = None
415
393
 
394
+ else:
395
+ logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
396
+ dataset = None
416
397
 
417
398
  if return_dataset:
399
+ logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
418
400
  return dataset
419
401
  else:
402
+ logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
420
403
  return
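An illustrative call to `run()` for an already-registered process (the process id is a placeholder to be looked up in the process catalog view):

    import tdfs4ds

    dataset = tdfs4ds.run(
        process_id="<process-uuid>",   # placeholder; see the process catalog view
        return_dataset=True,
        force_compute=True,
    )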
421
404
 
422
- def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
423
- """
424
- Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
425
- process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
426
- for further use or inspection.
427
405
 
428
- The function supports dynamic entity ID interpretation and flexible feature name handling, ensuring compatibility
429
- with various data schemas. It automatically registers the data upload process and applies additional metadata,
430
- if provided.
431
-
432
- Parameters:
433
- - df (DataFrame): The DataFrame containing the feature data to be uploaded.
434
- - entity_id (dict, list, or str): The identifier of the entity to which the features belong. This can be:
435
- - a dictionary mapping column names to their data types,
436
- - a list of column names, which will be automatically converted to a dictionary with types inferred from `df`,
437
- - a string representing a single column name, which will be converted into a list and then to a dictionary as above.
438
- - feature_names (list or str): The names of the features to be uploaded. If a string is provided, it will be
439
- split into a list based on commas or treated as a single feature name.
440
- - metadata (dict, optional): Additional metadata to associate with the upload process. Defaults to an empty dictionary.
441
- - primary_index (list, optional): Specifies the primary index columns for optimizing data storage and retrieval.
442
- - partitioning (str, optional): Defines how the data should be partitioned in the store for performance optimization.
443
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
444
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
445
- Default is an empty dictionary.
446
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
447
- Default is True.
448
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
449
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
450
- where k is the smallest integer so that the original lengths is smaller or equal
451
- to k x force_varchar_length. Default is 1024.
452
- Returns:
453
- DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
454
- or further processing.
455
-
456
- The process involves several steps, including entity ID type conversion if necessary, feature name normalization,
457
- process registration in the feature store, and the execution of SQL queries to insert the data. The function concludes
458
- by returning a dataset derived from the uploaded data, offering immediate access to the newly stored information.
459
-
460
- Example:
461
- >>> df = tdml.DataFrame(...)
462
- >>> entity_id = ['customer_id']
463
- >>> feature_names = ['age', 'income']
464
- >>> dataset = upload_features(df, entity_id, feature_names)
465
- >>> # Another example with list-based entity_id, custom primary_index, and partitioning
466
- >>> tddf = tdml.DataFrame(...) # Assuming tddf is predefined with appropriate columns
467
- >>> entity_id = ['tx_type', 'txn_id']
468
- >>> primary_index = ['txn_id']
469
- >>> partitioning = '''
470
- ... PARTITION BY CASE_N (
471
- ... tx_type LIKE 'DEBIT',
472
- ... tx_type LIKE 'PAYMENT',
473
- ... tx_type LIKE 'CASH_OUT',
474
- ... tx_type LIKE 'CASH_IN',
475
- ... tx_type LIKE 'TRANSFER',
476
- ... NO CASE,
477
- ... UNKNOWN)'''
478
- >>> features = [x for x in tddf.columns if x not in entity_id]
479
- >>> dataset = upload_features(
480
- ... df = tddf,
481
- ... entity_id = entity_id,
482
- ... feature_names = features,
483
- ... metadata = {'project': 'test'},
484
- ... primary_index = primary_index,
485
- ... partitioning = partitioning
486
- ... )
406
+ def upload_features(
407
+ df,
408
+ entity_id,
409
+ feature_names,
410
+ metadata={},
411
+ primary_index=None,
412
+ partitioning='',
413
+ filtermanager=None,
414
+ entity_null_substitute={},
415
+ force_compute=True,
416
+ force_varchar_length=1024
417
+ ):
418
+ """
419
+ Uploads feature data from a DataFrame to the feature store for a specified entity.
420
+ All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
487
421
  """
488
422
 
489
423
  from tdfs4ds.utils.info import get_column_types
@@ -491,45 +425,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
491
425
  from tdfs4ds.process_store.process_registration_management import register_process_view
492
426
 
493
427
  # Convert entity_id to a dictionary if it's not already one
494
- if type(entity_id) == list:
428
+ if isinstance(entity_id, list):
495
429
  entity_id.sort()
496
430
  entity_id = get_column_types(df, entity_id)
497
- if tdfs4ds.DISPLAY_LOGS:
498
- print('entity_id has been converted to a proper dictionary : ', entity_id)
499
- elif type(entity_id) == str:
431
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
432
+
433
+ elif isinstance(entity_id, str):
500
434
  entity_id = [entity_id]
501
435
  entity_id = get_column_types(df, entity_id)
502
- if tdfs4ds.DISPLAY_LOGS:
503
- print('entity_id has been converted to a proper dictionary : ', entity_id)
504
-
505
- if type(feature_names) != list:
506
- if tdfs4ds.DISPLAY_LOGS:
507
- print('feature_names is not a list:', feature_names)
508
- if ',' in feature_names:
509
- feature_names = feature_names.split(',')
436
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
437
+
438
+ # Normalize feature_names
439
+ if not isinstance(feature_names, list):
440
+ logger_safe("debug", "feature_names is not a list: %s", feature_names)
441
+ if isinstance(feature_names, str) and ',' in feature_names:
442
+ feature_names = [x.strip() for x in feature_names.split(',')]
510
443
  else:
511
444
  feature_names = [feature_names]
512
- if tdfs4ds.DISPLAY_LOGS:
513
- print('it has been converted to : ', feature_names)
514
- print('check it is a expected.')
515
-
516
- if primary_index is not None and type(primary_index) != list:
517
- if tdfs4ds.DISPLAY_LOGS:
518
- print('primary_index is not a list:', primary_index)
519
- if ',' in primary_index:
520
- primary_index = primary_index.split(',')
445
+ logger_safe("debug", "feature_names converted to list: %s", feature_names)
446
+ logger_safe("debug", "Check the conversion is as expected.")
447
+
448
+ # Normalize primary_index
449
+ if primary_index is not None and not isinstance(primary_index, list):
450
+ logger_safe("debug", "primary_index is not a list: %s", primary_index)
451
+ if isinstance(primary_index, str) and ',' in primary_index:
452
+ primary_index = [x.strip() for x in primary_index.split(',')]
521
453
  else:
522
454
  primary_index = [primary_index]
523
- if tdfs4ds.DISPLAY_LOGS:
524
- print('it has been converted to : ', feature_names)
525
- print('check it is a expected.')
455
+ logger_safe("debug", "primary_index converted to list: %s", primary_index)
456
+ logger_safe("debug", "Check the conversion is as expected.")
526
457
 
458
+ # Partitioning
527
459
  partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
528
460
 
529
- if tdfs4ds.DISPLAY_LOGS:
530
- print("filtermanager", filtermanager)
461
+ logger_safe("debug", "filtermanager: %s", filtermanager)
531
462
 
532
- # Register the process and retrieve the SQL query to insert the features, and the process ID
463
+ # Register process -> get SQL(s) + process_id
533
464
  query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
534
465
  view_name = df,
535
466
  entity_id = entity_id,
@@ -542,104 +473,171 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
542
473
  entity_null_substitute = entity_null_substitute
543
474
  )
544
475
 
545
- # Execute the SQL query to insert the features into the database
546
- execute_query(query_insert)
547
- execute_query(query_insert_dist)
548
- if tdfs4ds.DEBUG_MODE:
549
- print("query_insert_filtermanager",query_insert_filtermanager)
550
- if query_insert_filtermanager is not None:
551
- execute_query(query_insert_filtermanager)
476
+ logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
552
477
 
553
- # Run the registered process and return the resulting dataset
554
- PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
555
- tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
556
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD: tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
557
- tdfs4ds.RUN_ID = str(uuid.uuid4())
558
-
559
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
478
+ # Execute queries
479
+ try:
480
+ execute_query(query_insert)
481
+ logger_safe("info", "Executed main insert query for process_id=%s", process_id)
482
+ except Exception as e:
483
+ logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
484
+ raise
560
485
 
561
- try:
486
+ try:
487
+ execute_query(query_insert_dist)
488
+ logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
489
+ except Exception as e:
490
+ logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
491
+ raise
562
492
 
563
- dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)
493
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
494
+ # Avoid dumping entire SQL in normal logs; keep it debug-only.
495
+ logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
564
496
 
497
+ if query_insert_filtermanager is not None:
498
+ try:
499
+ execute_query(query_insert_filtermanager)
500
+ logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
565
501
  except Exception as e:
566
- tdfs4ds.process_store.process_followup.followup_close(
567
- run_id = tdfs4ds.RUN_ID,
568
- process_type = tdfs4ds.PROCESS_TYPE,
569
- process_id = process_id,
570
- status = 'FAILED,' + str(e).split('\n')[0]
571
- )
502
+ logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
572
503
  raise
573
504
 
505
+ # Run the registered process (with/without dataset)
506
+ PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
507
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
508
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
509
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
510
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
574
511
 
575
- return dataset
576
- else:
512
+ logger_safe(
513
+ "info",
514
+ "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
515
+ tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
516
+ )
517
+
518
+ try:
519
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
520
+ dataset = run(
521
+ process_id=process_id,
522
+ return_dataset=True,
523
+ force_compute=force_compute,
524
+ force_varchar_length=force_varchar_length
525
+ )
526
+ logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
527
+ return dataset
528
+ else:
529
+ run(
530
+ process_id=process_id,
531
+ return_dataset=False,
532
+ force_compute=force_compute,
533
+ force_varchar_length=force_varchar_length
534
+ )
535
+ logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
536
+ return
577
537
 
538
+ except Exception as e:
539
+ # Preserve the existing follow-up close behavior and make sure the error is logged.
578
540
  try:
579
- run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
580
- except Exception as e:
581
541
  tdfs4ds.process_store.process_followup.followup_close(
582
542
  run_id = tdfs4ds.RUN_ID,
583
543
  process_type = tdfs4ds.PROCESS_TYPE,
584
544
  process_id = process_id,
585
545
  status = 'FAILED,' + str(e).split('\n')[0]
586
546
  )
587
- raise
588
- return
547
+ finally:
548
+ logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
549
+ tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
550
+ )
551
+ raise
552
+ finally:
553
+ # Restore previous process type just in case the caller relies on it.
554
+ tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
589
555
 
590
- tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
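The example removed from the old `upload_features` docstring still matches the new signature; a condensed, hedged version, assuming a teradataml context is already created (the 'transactions' table and column names are illustrative):

    import teradataml as tdml
    import tdfs4ds

    tddf = tdml.DataFrame("transactions")
    entity_id = ["tx_type", "txn_id"]
    features = [c for c in tddf.columns if c not in entity_id]

    dataset = tdfs4ds.upload_features(
        df=tddf,
        entity_id=entity_id,
        feature_names=features,
        metadata={"project": "test"},
        primary_index=["txn_id"],
        partitioning="PARTITION BY CASE_N (tx_type LIKE 'DEBIT', NO CASE, UNKNOWN)",
    )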
591
556
 
592
- def _upload_features(df, entity_id, feature_names,
593
- feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
594
- """
595
- Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
596
- feature registration, preparation for ingestion, and storage in the designated feature tables.
597
557
 
598
- Parameters:
599
- - df (DataFrame): The input DataFrame containing the feature data.
600
- - entity_id (str or dict): The identifier for the entity to which these features belong. This can be a single ID
601
- (str) or a dictionary of attribute names and values uniquely identifying the entity.
602
- - feature_names (list): A list of strings specifying the names of the features to be uploaded.
603
- - feature_versions (str or list, optional): Specifies the versions of the features to be uploaded. Can be a single
604
- string applied to all features or a list of strings specifying the version
605
- for each feature respectively. Default is 'dev.0.0'.
606
- - primary_index (list, optional): Specifies the columns to be used as the primary index in the feature store tables.
607
- This can significantly impact the performance of data retrieval operations.
608
- - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
609
- enhance query performance based on the access patterns.
610
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
611
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
612
- Default is an empty dictionary.
613
- - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
614
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
615
- Default is False.
616
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
617
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
618
- where k is the smallest integer so that the original lengths is smaller or equal
619
- to k x force_varchar_length. Default is None.
558
+ def _upload_features(
559
+ df, entity_id, feature_names,
560
+ feature_versions=FEATURE_VERSION_DEFAULT,
561
+ primary_index=None, partitioning='',
562
+ filtermanager=None, entity_null_substitute={},
563
+ process_id=None, force_compute=False,
564
+ force_varchar_length=None
565
+ ):
566
+ """
567
+ Uploads a set of features into the Feature Store for a given entity.
620
568
 
569
+ This function registers an entity and its associated features in the feature catalog
570
+ if they are not already defined, prepares the data for ingestion, and stores it in the
571
+ feature store. It also supports incremental feature computation and conditional execution
572
+ depending on prior runs.
621
573
 
622
- Returns:
623
- DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
624
- metadata, including versions and storage locations.
625
-
626
- This function orchestrates several steps involved in feature storage:
627
- 1. Registers the entity in the feature store if not already present.
628
- 2. Determines the data types of the features based on the input DataFrame.
629
- 3. Registers the features, including their names, types, and versions, in the feature catalog.
630
- 4. Prepares the feature data for ingestion, including any necessary transformations.
631
- 5. Stores the prepared feature data in the feature store.
632
- 6. Optionally, cleans up temporary resources used during the process.
633
- 7. Builds and returns a view of the dataset representing the uploaded features for easy access.
574
+ Parameters
575
+ ----------
576
+ df : tdml.DataFrame
577
+ Input dataframe containing entity keys and feature columns to upload.
578
+ entity_id : str, list, or dict
579
+ Identifier(s) for the entity. Can be:
580
+ - A string (single entity key)
581
+ - A list of key column names
582
+ - A dict mapping column names to data types
583
+ If not a dict, entity metadata is inferred automatically.
584
+ feature_names : list of str
585
+ List of feature column names to upload from `df`.
586
+ feature_versions : str or list, optional
587
+ Feature version(s). A single version string is applied to all features; a list
588
+ gives the version of each feature in `feature_names`, in the same order.
589
+ Default is FEATURE_VERSION_DEFAULT.
590
+ primary_index : str or list, optional
591
+ Primary index to use when storing features in Teradata.
592
+ partitioning : str, optional
593
+ Partitioning clause for feature store tables. Default is ''.
594
+ filtermanager : FilterManager, optional
595
+ If provided, features are built iteratively per filter step.
596
+ entity_null_substitute : dict, optional
597
+ Replacement values for nulls in entity keys.
598
+ Example: {'customer_id': -1}
599
+ process_id : str, optional
600
+ Identifier for the process execution, used for follow-up logging.
601
+ force_compute : bool, optional
602
+ If True, forces recomputation even if the same process_id and timestamp were
603
+ already computed earlier. If False, the computation is skipped when existing
604
+ results are detected. Default is False.
605
+ force_varchar_length : int, optional
606
+ If provided, VARCHAR features are cast to VARCHAR(k x force_varchar_length), where k is the
607
+ smallest integer such that the original length fits, limiting the proliferation of feature tables.
608
+
609
+ Returns
610
+ -------
611
+ tdml.DataFrame or None
612
+ If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
613
+ ingested features for validation. Otherwise, returns None.
614
+
615
+ Notes
616
+ -----
617
+ - Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
618
+ - Logs ingestion status in process follow-up tables.
619
+ - Skips ingestion when existing completed results are found unless
620
+ `force_compute=True`.
621
+ - Applies Teradata-optimized storage and statistics collection.
622
+
623
+ Raises
624
+ ------
625
+ ValueError
626
+ If unsupported data types are found (CLOB/BLOB/JSON).
627
+ Exception
628
+ For ingestion failure or storage errors.
634
629
 
635
- Note:
636
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
637
- entity and feature registration to data preparation and storage.
638
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
639
- a Teradata database and the appropriate schema for feature storage.
640
- - The function assumes that the feature_versions, if provided as a list, matches the length of feature_names.
630
+ Example
631
+ -------
632
+ >>> _upload_features(
633
+ ... df=dataframe,
634
+ ... entity_id="customer_id",
635
+ ... feature_names=["age", "credit_score"],
636
+ ... process_id="customer_features_v1",
637
+ ... force_compute=False
638
+ ... )
641
639
  """
642
-
640
+
643
641
  from tdfs4ds.feature_store.entity_management import register_entity
644
642
  from tdfs4ds.feature_store.feature_store_management import Gettdtypes
645
643
  from tdfs4ds.feature_store.feature_store_management import register_features
@@ -647,194 +645,180 @@ def _upload_features(df, entity_id, feature_names,
647
645
  from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
648
646
  from tdfs4ds.utils.info import get_column_types, update_varchar_length
649
647
 
650
- # Convert entity_id to a dictionary if it's not already one
651
- if type(entity_id) == list:
648
+ # Convert entity_id to a dictionary if not already
649
+ if isinstance(entity_id, list):
652
650
  entity_id.sort()
653
651
  entity_id = get_column_types(df, entity_id)
654
- if tdfs4ds.DISPLAY_LOGS:
655
- print('entity_id has been converted to a proper dictionary : ', entity_id)
656
- elif type(entity_id) == str:
657
- entity_id = [entity_id]
658
- entity_id = get_column_types(df, entity_id)
659
- if tdfs4ds.DISPLAY_LOGS:
660
- print('entity_id has been converted to a proper dictionary : ', entity_id)
661
-
662
- #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
663
-
664
- # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
665
- # If feature_versions is a string, create a dictionary mapping each feature name to this string.
666
- if type(feature_versions) == list:
667
- selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
652
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
653
+ elif isinstance(entity_id, str):
654
+ entity_id = get_column_types(df, [entity_id])
655
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
656
+
657
+ # Map feature versions
658
+ if isinstance(feature_versions, list):
659
+ selected_features = dict(zip(feature_names, feature_versions))
668
660
  else:
669
661
  selected_features = {k: feature_versions for k in feature_names}
670
662
 
671
- # Get the Teradata types of the features in df.
672
- feature_names_types = Gettdtypes(
673
- df,
674
- features_columns=feature_names,
675
- entity_id=entity_id
676
- )
663
+ # Get Teradata types for features
664
+ feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
677
665
 
678
666
  if force_varchar_length is not None:
679
- print(feature_names_types)
680
- feature_names_types = update_varchar_length(feature_names_types,new_varchar_length = force_varchar_length)
667
+ logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
668
+ feature_names_types = update_varchar_length(
669
+ feature_names_types,
670
+ new_varchar_length=force_varchar_length
671
+ )
681
672
 
682
673
  def validate_feature_types(feature_names_types):
683
- """
684
- Validates feature data types and raises an error if any value contains
685
- the substrings 'clob', 'blob', or 'json' (case insensitive).
686
-
687
- Parameters:
688
- feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
689
-
690
- Raises:
691
- ValueError: If any feature type contains 'clob', 'blob', or 'json'.
692
- """
693
- invalid_types = {key: value['type'] for key, value in feature_names_types.items()
694
- if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
695
-
696
- if invalid_types:
674
+ invalid = {
675
+ k: v['type'] for k, v in feature_names_types.items()
676
+ if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
677
+ }
678
+ if invalid:
697
679
  raise ValueError(
698
- f"The following features have unsupported data types: {invalid_types}. "
699
- "The data types 'CLOB', 'BLOB', and 'JSON' are not yet managed by the feature store."
680
+ f"Unsupported data types found: {invalid}. "
681
+ "CLOB/BLOB/JSON are not supported."
700
682
  )
701
-
702
- validate_feature_types(feature_names_types)
703
-
683
+
684
+ validate_feature_types(feature_names_types)
685
+
686
+ logger_safe("info", "Registering entity %s in feature store", entity_id)
704
687
  register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
705
688
 
706
- if tdfs4ds.DEBUG_MODE:
707
- print('_upload_features', 'entity_id', entity_id)
708
- print('_upload_features', 'entity_null_substitute', entity_null_substitute)
709
- print('_upload_features', 'feature_names', feature_names)
710
- print('_upload_features', 'primary_index', primary_index)
711
- print('_upload_features', 'partitioning', partitioning)
712
- print('_upload_features', 'selected_features', selected_features)
713
- print('_upload_features', 'df.columns', df.columns)
714
-
715
- # Register the features in the feature catalog.
716
- register_features(
717
- entity_id,
718
- feature_names_types,
719
- primary_index,
720
- partitioning
721
- )
722
-
723
- if tdfs4ds.DEBUG_MODE:
724
- print("---------_upload_features")
725
- print("filtermanager : ", filtermanager)
726
- print("feature names : ", feature_names)
727
- print("selected features : ", selected_features)
728
-
729
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
689
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
690
+ logger_safe(
691
+ "debug",
692
+ "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
693
+ entity_id, entity_null_substitute, feature_names, primary_index, partitioning
694
+ )
695
+ logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
696
+
697
+ register_features(entity_id, feature_names_types, primary_index, partitioning)
698
+ logger_safe("info", "Features registered in catalog: %s", feature_names)
699
+
700
+ follow_up = None
701
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
730
702
  follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
731
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
732
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
733
- if filtermanager is None:
734
- do_compute = True
735
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
736
- if follow_up.shape[0] > 0:
737
- do_compute = False
703
+ follow_up = follow_up[
704
+ (follow_up.STATUS == 'COMPLETED') &
705
+ (follow_up.VALIDTIME_DATE.isna() == False) &
706
+ (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
707
+ (follow_up.PROCESS_ID == process_id)
708
+ ]
738
709
 
739
- # Prepare the features for ingestion.
710
+ if filtermanager is None:
711
+ do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
712
+ if not do_compute and not force_compute:
713
+ logger_safe(
714
+ "info",
715
+ "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
716
+ process_id, tdfs4ds.FEATURE_STORE_TIME
717
+ )
740
718
  if do_compute or force_compute:
741
-
719
+ logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
742
720
  tdfs4ds.process_store.process_followup.followup_open(
743
- run_id = tdfs4ds.RUN_ID,
744
- process_type = tdfs4ds.PROCESS_TYPE,
745
- process_id = process_id
721
+ run_id=tdfs4ds.RUN_ID,
722
+ process_type=tdfs4ds.PROCESS_TYPE,
723
+ process_id=process_id
746
724
  )
747
-
748
725
  try:
749
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
750
- df,
751
- entity_id,
752
- feature_names,
726
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
727
+ df, entity_id, feature_names,
753
728
  feature_versions=selected_features,
754
729
  primary_index=primary_index,
755
730
  entity_null_substitute=entity_null_substitute,
756
731
  partitioning=partitioning
757
732
  )
758
- # Store the prepared features in the feature store.
759
- store_feature(
760
- entity_id,
761
- volatile_table_name,
762
- entity_null_substitute=entity_null_substitute,
763
- primary_index=primary_index,
764
- partitioning=partitioning,
765
- features_infos = features_infos
766
- )
767
-
768
- # Collect statistics
769
- apply_collect_stats(
770
- entity_id,
771
- primary_index = primary_index,
772
- partitioning = partitioning,
773
- feature_infos = features_infos
774
- )
733
+ store_feature(entity_id, volatile_table, entity_null_substitute,
734
+ primary_index, partitioning, features_infos)
735
+ apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
775
736
 
776
737
  tdfs4ds.process_store.process_followup.followup_close(
777
- run_id = tdfs4ds.RUN_ID,
778
- process_type = tdfs4ds.PROCESS_TYPE,
779
- process_id = process_id
738
+ run_id=tdfs4ds.RUN_ID,
739
+ process_type=tdfs4ds.PROCESS_TYPE,
740
+ process_id=process_id
780
741
  )
742
+ logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
781
743
 
782
744
  except Exception as e:
745
+ logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
783
746
  tdfs4ds.process_store.process_followup.followup_close(
784
- run_id = tdfs4ds.RUN_ID,
785
- process_type = tdfs4ds.PROCESS_TYPE,
786
- process_id = process_id,
787
- status = 'FAILED,' + str(e).split('\n')[0]
747
+ run_id=tdfs4ds.RUN_ID,
748
+ process_type=tdfs4ds.PROCESS_TYPE,
749
+ process_id=process_id,
750
+ status='FAILED,' + str(e).split('\n')[0]
788
751
  )
789
752
  raise
753
+
790
754
  else:
791
- # get the total number of filter condition in the filter manager
792
- nb_filters = filtermanager.nb_filters
793
755
 
794
- # the flag that indicates that we computed something in the next loop
756
+ logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
795
757
  something_computed = False
758
+ pbar = tqdm(
759
+ range(filtermanager.nb_filters),
760
+ total=filtermanager.nb_filters,
761
+ desc="Applying filters",
762
+ unit="filter",
763
+ leave=False
764
+ )
796
765
 
797
- for i in range(nb_filters):
766
+ for i in pbar:
767
+ filter_id = i + 1
768
+ filtermanager.update(filter_id)
798
769
 
799
- # place the cursor on the next filter
800
- filtermanager.update(i+1)
770
+ try:
771
+ pbar.set_description(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
801
772
 
802
- if filtermanager.time_filtering:
803
- # if the filter manager is hybrid, then synchronize the time with tdfs4ds
804
- tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
773
+ # Convert datetime columns to string
774
+ df_bar = filtermanager.display().to_pandas().astype(object) # avoid conversion issues
775
+ for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
776
+ df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
805
777
 
806
- # overwrite the follow up table to tilter on the VALIDTIME_DATE too
807
- follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
808
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
809
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
778
+ # Convert the first filter row to a plain dict
779
+ bar_info = df_bar.iloc[0].to_dict()
810
780
 
811
- # initialize do_compute, the flag that something has to be computed
812
- do_compute = True
781
+ # ---- Handle python date objects ----
782
+ from datetime import date, datetime
783
+ for key, value in bar_info.items():
784
+ if isinstance(value, (date, datetime)): # convert date/datetime to string
785
+ bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
786
+ # ----------------------------------------------
813
787
 
814
- # if the process_id is defined and if we are working at a specific time:
815
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
816
- # we check if the filter condition has already been computed
817
- follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
818
- tdml.DataFrame.from_query(
819
- f"""
820
- SELECT
821
- CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
822
- FROM {filtermanager.schema_name}.{filtermanager.view_name}
823
- """
824
- ),
825
- on = 'APPLIED_FILTER',
826
- how = 'inner',
827
- lprefix = 'l',
828
- rprefix = 'r'
829
- )
830
- # if already computed and completed, then do_compute is set to False
831
- if follow_up_.shape[0] > 0:
832
- do_compute = False
788
+ bar_info = str(bar_info)
789
+ if len(bar_info) > 120:
790
+ bar_info = bar_info[:117] + "..."
791
+ pbar.set_postfix_str(bar_info)
792
+
793
+ except Exception:
794
+ # postfix is optional; ignore errors from display() here
795
+ pass
833
796
 
834
- if tdfs4ds.DISPLAY_LOGS:
835
- print(filtermanager.display())
797
+ logger_safe("debug", "Applying filter %s/%s:\n%s",
798
+ i + 1, filtermanager.nb_filters, filtermanager.display())
836
799
 
800
+ do_compute = True
801
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
802
+ # see if already computed
803
+ follow_up = tdfs4ds.process_store.process_followup.follow_up_report(process_id=process_id, filtermanager=filtermanager)
804
+ follow_up = follow_up[
805
+ (follow_up.STATUS == 'COMPLETED') &
806
+ (follow_up.VALIDTIME_DATE.isna() == False) &
807
+ (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME)
808
+ ]
809
+
810
+ if follow_up.shape[0] > 0:
811
+ do_compute = False
812
+
813
+ if not do_compute and not force_compute:
814
+ logger_safe(
815
+ "info",
816
+ "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
817
+ process_id, tdfs4ds.FEATURE_STORE_TIME
818
+ )
819
+ pbar.colour = "green"
837
820
  if do_compute or force_compute:
821
+ pbar.colour = "blue"
838
822
  tdfs4ds.process_store.process_followup.followup_open(
839
823
  run_id = tdfs4ds.RUN_ID,
840
824
  process_type = tdfs4ds.PROCESS_TYPE,
@@ -842,83 +826,58 @@ def _upload_features(df, entity_id, feature_names,
842
826
  filtermanager = filtermanager
843
827
  )
844
828
  try:
845
- # Prepare the features for ingestion.
846
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
847
- df,
848
- entity_id,
849
- feature_names,
829
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
830
+ df, entity_id, feature_names,
850
831
  feature_versions = selected_features,
851
832
  primary_index = primary_index,
852
833
  entity_null_substitute = entity_null_substitute,
853
834
  partitioning = partitioning
854
835
  )
855
836
 
856
- # Store the prepared features in the feature store.
857
- store_feature(
858
- entity_id,
859
- volatile_table_name,
860
- entity_null_substitute=entity_null_substitute,
861
- primary_index = primary_index,
862
- partitioning = partitioning,
863
- features_infos=features_infos
864
-
865
- )
866
-
867
- # indicate that something has been processed:
837
+ store_feature(entity_id, volatile_table, entity_null_substitute,
838
+ primary_index, partitioning, features_infos)
839
+
868
840
  something_computed = True
869
841
 
870
842
  tdfs4ds.process_store.process_followup.followup_close(
871
- run_id=tdfs4ds.RUN_ID,
872
- process_type=tdfs4ds.PROCESS_TYPE,
873
- process_id=process_id,
843
+ run_id = tdfs4ds.RUN_ID,
844
+ process_type = tdfs4ds.PROCESS_TYPE,
845
+ process_id = process_id,
874
846
  filtermanager = filtermanager
875
847
  )
876
848
 
877
849
  except Exception as e:
878
- print(e)
850
+ logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
879
851
  tdfs4ds.process_store.process_followup.followup_close(
880
- run_id=tdfs4ds.RUN_ID,
881
- process_type=tdfs4ds.PROCESS_TYPE,
882
- process_id=process_id,
883
- status='FAILED,' + str(e).split('\n')[0],
884
- filtermanager=filtermanager
852
+ run_id = tdfs4ds.RUN_ID,
853
+ process_type = tdfs4ds.PROCESS_TYPE,
854
+ process_id = process_id,
855
+ status = 'FAILED,' + str(e).split('\n')[0],
856
+ filtermanager = filtermanager
885
857
  )
886
858
  raise
887
- # Clean up by dropping the temporary volatile table.
888
- # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
889
859
 
890
- # Collect statistics only if something has been computed
891
860
  if something_computed:
892
- apply_collect_stats(
893
- entity_id,
894
- primary_index = primary_index,
895
- partitioning = partitioning,
896
- feature_infos = features_infos
897
- )
861
+ apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
898
862
 
899
- # Build a dataset view in the feature store.
900
863
  if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
901
- if tdfs4ds.DISPLAY_LOGS: print('build dataset for validation')
864
+ logger_safe("info", "Building dataset for validation...")
902
865
  try:
903
- dataset = build_dataset(
904
- entity_id,
905
- selected_features,
866
+ return build_dataset(
867
+ entity_id, selected_features,
906
868
  view_name=None,
907
- entity_null_substitute = entity_null_substitute
869
+ entity_null_substitute=entity_null_substitute
908
870
  )
909
871
  except Exception as e:
910
- print('ERROR at build_dataset in _upload_features:')
911
- print(str(e).split('\n')[0])
912
- print('entity :', entity_id)
913
- print('selected features :', selected_features)
914
-
915
- # Return the dataset view.
916
- return dataset
872
+ logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
873
+ logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
917
874
  else:
918
- if tdfs4ds.DISPLAY_LOGS: print('no dataset built for validation. Set tdfs4ds.BUILD_DATASET_AT_UPLOAD to True if you want it')
875
+ logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
919
876
  return
920
877
 
921
878
 
879
+
880
+
922
881
  def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
923
882
  feature_store_time=False, join_type='INNER'):
924
883
  """
@@ -935,6 +894,10 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
935
894
  selected_features : dict
936
895
  A dictionary where the keys are feature table names, and the values are lists of tuples
937
896
  (feature_id, feature_version, feature_name) specifying the features to retrieve.
897
+ NOTE: feature_version may be either:
898
+ - a single UUID string, or
899
+ - a list of dicts like:
900
+ {"process_id": <UUID>, "process_view_name": <str>}
938
901
 
939
902
  view_name : str
940
903
  The name of the view to be created in the database.
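For illustration only, a hypothetical selected_features value showing both feature_version forms described in the NOTE above; the table name, feature ids, UUIDs and process view names are invented.

    selected_features = {
        "MY_DB.FS_T_SALES_FEATURES": [          # hypothetical feature table
            # form 1: a single UUID string
            (101, "3f2c9a7e-0000-0000-0000-000000000001", "total_amount"),
            # form 2: a list of dicts, one output column per contributing process
            (102,
             [{"process_id": "3f2c9a7e-0000-0000-0000-000000000002",
               "process_view_name": "DAILY_AGG"},
              {"process_id": "3f2c9a7e-0000-0000-0000-000000000003",
               "process_view_name": "MONTHLY_AGG"}],
             "nb_transactions"),
        ]
    }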
@@ -1004,6 +967,24 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1004
967
  # Sort the entity ID list for consistent query generation
1005
968
  list_entity_id.sort()
1006
969
 
970
+ # Helpers
971
+ import re
972
+ def _sanitize_identifier(name: str) -> str:
973
+ # Keep letters, numbers, and underscores; replace others with '_'
974
+ return re.sub(r'[^0-9A-Za-z_]', '_', name)
975
+
976
+ used_alias_counts = {} # base_alias -> count
977
+
978
+ def _unique_alias(base: str) -> str:
979
+ """
980
+ Ensure alias uniqueness: if base already used, append _2, _3, ...
981
+ """
982
+ if base not in used_alias_counts:
983
+ used_alias_counts[base] = 1
984
+ return base
985
+ used_alias_counts[base] += 1
986
+ return f"{base}_{used_alias_counts[base]}"
987
+
1007
988
  # Initialize sub-query construction
1008
989
  tdfs4ds.logger.info("Generating the sub-queries for feature retrieval.")
1009
990
  sub_queries = []
@@ -1014,21 +995,52 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1014
995
  # Construct sub-queries for each feature
1015
996
  for k, v in list_features.items():
1016
997
  for feature_id, feature_version, feature_name in v:
1017
- txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
1018
- feature_str = ',B1.FEATURE_VALUE AS ' + feature_name
1019
- sub_queries.append(
1020
- {
1021
- 'feature_name': feature_name,
1022
- 'query': f"""
1023
- SEQUENCED VALIDTIME
1024
- SELECT
1025
- {txt_entity}
1026
- {feature_str}
1027
- FROM {k} B1
1028
- WHERE {txt_where}
1029
- """
1030
- }
1031
- )
998
+
999
+ # Multiple processes: list of dicts
1000
+ if isinstance(feature_version, list):
1001
+ for item in feature_version:
1002
+ process_id = item.get("process_id")
1003
+ process_view_name = item.get("process_view_name") or "PROCESS"
1004
+ base_alias = _sanitize_identifier(f"{feature_name}_{process_view_name}")
1005
+ alias = _unique_alias(base_alias)
1006
+
1007
+ txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
1008
+ feature_str = ',B1.FEATURE_VALUE AS ' + alias
1009
+
1010
+ sub_queries.append(
1011
+ {
1012
+ 'feature_name': alias,
1013
+ 'query': f"""
1014
+ SEQUENCED VALIDTIME
1015
+ SELECT
1016
+ {txt_entity}
1017
+ {feature_str}
1018
+ FROM {k} B1
1019
+ WHERE {txt_where}
1020
+ """
1021
+ }
1022
+ )
1023
+
1024
+ # Single UUID
1025
+ else:
1026
+ base_alias = _sanitize_identifier(feature_name)
1027
+ alias = _unique_alias(base_alias)
1028
+
1029
+ txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
1030
+ feature_str = ',B1.FEATURE_VALUE AS ' + alias
1031
+ sub_queries.append(
1032
+ {
1033
+ 'feature_name': alias,
1034
+ 'query': f"""
1035
+ SEQUENCED VALIDTIME
1036
+ SELECT
1037
+ {txt_entity}
1038
+ {feature_str}
1039
+ FROM {k} B1
1040
+ WHERE {txt_where}
1041
+ """
1042
+ }
1043
+ )
1032
1044
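Continuing the invented example, the list-of-dicts branch above would emit one sub-query per process, each exposing FEATURE_VALUE under its own alias. Roughly as follows, where the entity column (standing in for txt_entity), table name, feature id and UUID are all hypothetical:

    example_sub_query = {
        'feature_name': 'nb_transactions_DAILY_AGG',
        'query': """
        SEQUENCED VALIDTIME
        SELECT
        B1.SALES_ID
        ,B1.FEATURE_VALUE AS nb_transactions_DAILY_AGG
        FROM MY_DB.FS_T_SALES_FEATURES B1
        WHERE (FEATURE_ID = 102 AND FEATURE_VERSION='3f2c9a7e-0000-0000-0000-000000000002')
        """
    }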
 
1033
1045
  # Handle case where no features are available
1034
1046
  if len(sub_queries) == 0:
@@ -1102,6 +1114,7 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1102
1114
  return tdml.DataFrame.from_table(tdml.in_schema(schema_name, view_name))
1103
1115
 
1104
1116
 
1117
+
1105
1118
  def build_dataset_opt(entity_id, selected_features, view_name = None, schema_name=tdfs4ds.SCHEMA,
1106
1119
  comment='dataset', no_temporal=False, time_manager=None, query_only=False, entity_null_substitute={},
1107
1120
  other=None, time_column=None, filtermanager = None, filter_conditions = None
@@ -1280,82 +1293,91 @@ def upload_tdstone2_scores(model):
1280
1293
  return dataset
1281
1294
 
1282
1295
 
1283
- def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
1296
+ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
1284
1297
  """
1285
- Executes a series of processes for each date in a given list, managing the time and logging settings.
1298
+ Executes a series of processes for each date in a given list, managing time, computation settings, and logging.
1286
1299
 
1287
1300
  This function iterates over a range of time steps, updating a TimeManager object with each step, and then
1288
- executes a list of processes for that time step. It also manages the synchronization of time for a feature store
1289
- and disables display logs during its execution.
1301
+ executes a list of processes for that time step. It also manages synchronization of time for the feature store
1302
+ and optionally controls forced computation and log display behavior.
1290
1303
 
1291
1304
  Parameters:
1292
1305
  - process_list (list): A list of process IDs that need to be executed for each time step.
1293
- - time_manager (TimeManager object): An object that manages time-related operations, like updating or retrieving time.
1306
+ - time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
1294
1307
  - time_id_start (int, optional): The starting time step ID. Default is 1.
1295
- - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the time manager.
1308
+ - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
1309
+ time manager.
1310
+ - force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
1311
+ Default is False.
1312
+ - force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
1313
+ is disabled. Default is False.
1296
1314
 
1297
1315
  Side Effects:
1298
- - Sets global variables DISPLAY_LOGS and FEATURE_STORE_TIME.
1316
+ - Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
1317
+ - Restores DISPLAY_LOGS setting after execution.
1299
1318
  - Catches and prints exceptions along with the time step on which they occurred.
1300
1319
 
1301
- This function performs the following steps:
1302
- 1. Disables display logs and sets the process type to 'ROLL_OUT'.
1303
- 2. Iterates over the specified range of time steps.
1304
- 3. Updates the time manager with the current time step.
1305
- 4. Synchronizes the feature store time with the current time step.
1306
- 5. Executes each process in the process list for the current time step.
1307
- 6. Restores the original display log setting after execution.
1320
+ Steps performed:
1321
+ 1. Disables display logs by default unless `force_display_logs` is True.
1322
+ 2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
1323
+ 3. Iterates over the specified range of time steps.
1324
+ 4. Updates the time manager with the current time step.
1325
+ 5. Synchronizes the feature store time with the current time step.
1326
+ 6. Executes each process in the process list with optional forced computation.
1327
+ 7. Restores original display log settings after completion.
1308
1328
 
1309
1329
  Example:
1310
1330
  >>> process_list = ['process_1', 'process_2']
1311
1331
  >>> time_manager = TimeManager(...)
1312
- >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
1332
+ >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
1313
1333
  """
1314
1334
 
1315
- #global DISPLAY_LOGS
1316
- #global FEATURE_STORE_TIME
1317
-
1318
1335
  # Disable display logs
1319
1336
  temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
1320
1337
  tdfs4ds.DISPLAY_LOGS = False
1338
+ if force_display_logs:
1339
+ tdfs4ds.DISPLAY_LOGS = True
1321
1340
  PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
1322
1341
  tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
1323
1342
  tdfs4ds.RUN_ID = str(uuid.uuid4())
1324
1343
 
1325
-
1326
-
1327
1344
  try:
1345
+ # Define range of time steps
1328
1346
  if time_id_end is None:
1329
- pbar = tqdm.tqdm(range(time_id_start, time_manager.nb_time_steps + 1), desc="Starting")
1347
+ time_range = range(time_id_start, time_manager.nb_time_steps + 1)
1330
1348
  else:
1331
- pbar = tqdm.tqdm(range(time_id_start, min([time_manager.nb_time_steps + 1,time_id_end+1]) ), desc="Starting")
1332
- # Iterate over each date in the provided list
1349
+ time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
1350
+
1351
+ # Progress bar
1352
+ pbar = tqdm(time_range, desc="Starting rollout", unit="step")
1353
+
1333
1354
  for i in pbar:
1334
- # Update the time manager with the new date
1335
- time_manager.update(time_id = i )
1355
+ # Update time manager
1356
+ time_manager.update(time_id=i)
1336
1357
  date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
1337
- pbar.set_description(f"Processing {date_}")
1338
- # Synchronize the time for the feature store with the current date
1358
+
1359
+ # Sync feature store time
1339
1360
  tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
1340
- pbar.set_description(f"Processing {tdfs4ds.FEATURE_STORE_TIME}")
1361
+
1362
+ # Display current progress in tqdm
1363
+ pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
1364
+
1341
1365
  if tdfs4ds.DEBUG_MODE:
1342
- print('def roll_out','date_', date_)
1343
- print('def roll_out','time_manager.get_date_in_the_past()', time_manager.get_date_in_the_past())
1344
- print('def roll_out','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
1345
- # Execute each process in the process list for the current date
1366
+ print("roll_out | date_:", date_)
1367
+ print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
1368
+
1369
+ # Execute all processes for this time step
1346
1370
  for proc_id in process_list:
1347
- pbar.set_description(f"Processing {date_} process {proc_id}")
1348
- run(process_id=proc_id, force_compute=False)
1371
+ pbar.set_description(f"Processing {date_} | proc {proc_id}")
1372
+ run(process_id=proc_id, force_compute=force_compute)
1349
1373
 
1374
+ # Restore settings
1350
1375
  tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
1376
+
1351
1377
  except Exception as e:
1352
1378
  tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
1353
- # If an exception occurs, print the date and the first line of the exception message
1354
- #print(date_)
1355
1379
  print(str(e).split('\n')[0])
1356
1380
  tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
1357
1381
  raise
1358
1382
 
1359
- tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
1360
-
1361
-
1383
+ tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
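A design note rather than part of the package: DISPLAY_LOGS and PROCESS_TYPE are restored in several places above (inside the try, in the except, and after it), and a try/finally would keep the restore in one spot. A minimal sketch under that assumption, with the package's run() passed in as a callable and the progress bar omitted for brevity:

    import uuid
    import tdfs4ds

    def roll_out_sketch(process_list, time_manager, run,
                        time_id_start=1, time_id_end=None,
                        force_compute=False, force_display_logs=False):
        # Save the globals that are about to be overridden
        saved_logs = tdfs4ds.DISPLAY_LOGS
        saved_type = tdfs4ds.PROCESS_TYPE
        tdfs4ds.DISPLAY_LOGS = bool(force_display_logs)
        tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
        tdfs4ds.RUN_ID = str(uuid.uuid4())
        try:
            if time_id_end is None:
                end = time_manager.nb_time_steps
            else:
                end = min(time_manager.nb_time_steps, time_id_end)
            for i in range(time_id_start, end + 1):
                time_manager.update(time_id=i)
                tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
                for proc_id in process_list:
                    run(process_id=proc_id, force_compute=force_compute)
        finally:
            # Restore the globals exactly once, whether or not an exception was raised
            tdfs4ds.DISPLAY_LOGS = saved_logs
            tdfs4ds.PROCESS_TYPE = saved_type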