tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,5 +1,8 @@
1
- __version__ = '0.2.4.26'
1
+ __version__ = '0.2.5.1'
2
+ import difflib
2
3
  import logging
4
+ import json
5
+
3
6
  # Setup the logger
4
7
  logging.basicConfig(
5
8
  level=logging.INFO,
@@ -7,11 +10,21 @@ logging.basicConfig(
7
10
  datefmt='%Y-%m-%d %H:%M:%S' # Set the date/time format
8
11
  )
9
12
 
13
+ # Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
14
+ def logger_safe(level, message, *args, **kwargs):
15
+ """
16
+ Wrapper around the global `logger` that only emits logs when
17
+ tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
18
+ """
19
+ if getattr(tdfs4ds, "DISPLAY_LOGS", True):
20
+ getattr(logger, level)(message, *args, **kwargs)
21
+
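A minimal usage sketch of this gate (assuming the module is imported as `tdfs4ds` and the default logging config above is in effect): calls mirror the standard `logging.Logger` methods and are dropped entirely when `DISPLAY_LOGS` is False.

```python
import tdfs4ds

tdfs4ds.DISPLAY_LOGS = True
# Forwarded to logger.info(...) because the gate is open.
tdfs4ds.logger_safe("info", "Connected to %s", "my_database")

tdfs4ds.DISPLAY_LOGS = False
# Silently dropped: the gate checks tdfs4ds.DISPLAY_LOGS before forwarding.
tdfs4ds.logger_safe("error", "this message is suppressed")
```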
10
22
  logger = logging.getLogger(__name__)
11
23
 
12
24
  from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
13
25
  from tdfs4ds.process_store.process_followup import follow_up_report
14
26
  from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
27
+ from . import genai
15
28
 
16
29
  DATA_DOMAIN = None
17
30
  SCHEMA = None
@@ -44,6 +57,18 @@ FEATURE_PARTITION_EACH = 1
44
57
 
45
58
  VARCHAR_SIZE = 1024
46
59
 
60
+ INSTRUCT_MODEL_URL = None
61
+ INSTRUCT_MODEL_API_KEY = None
62
+ INSTRUCT_MODEL_MODEL = None
63
+ INSTRUCT_MODEL_PROVIDER = None
64
+
65
+ DOCUMENTATION_PROCESS_BUSINESS_LOGIC = 'FS_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
66
+ DOCUMENTATION_PROCESS_FEATURES = 'FS_PROCESS_DOCUMENTATION_FEATURES'
67
+ DOCUMENTATION_PROCESS_BUSINESS_LOGIC_VIEW = 'FS_V_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
68
+ DOCUMENTATION_PROCESS_FEATURES_VIEW = 'FS_V_PROCESS_DOCUMENTATION_FEATURES'
69
+ DOCUMENTATION_PROCESS_EXPLAIN = 'FS_PROCESS_DOCUMENTATION_EXPLAIN'
70
+ DOCUMENTATION_PROCESS_EXPLAIN_VIEW = 'FS_V_PROCESS_DOCUMENTATION_EXPLAIN'
71
+
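The constants above are plain module attributes; a hedged sketch of how a caller might point them at an instruct-model endpoint before using the new documentation features (provider, URL, key and model name are placeholders, and how `tdfs4ds.genai` consumes them is not shown in this diff):

```python
import tdfs4ds

# Placeholder values; substitute your own provider endpoint and credentials.
tdfs4ds.INSTRUCT_MODEL_PROVIDER = "my-provider"            # assumption: provider label used by tdfs4ds.genai
tdfs4ds.INSTRUCT_MODEL_URL = "https://llm.example.invalid/v1"
tdfs4ds.INSTRUCT_MODEL_API_KEY = "set-me-from-a-secret-store"
tdfs4ds.INSTRUCT_MODEL_MODEL = "my-instruct-model"

# The FS_PROCESS_DOCUMENTATION_* tables and FS_V_* views are created by
# tdfs4ds.genai during setup()/connect(); the names above are their defaults.
```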
47
72
  import warnings
48
73
  warnings.filterwarnings('ignore')
49
74
 
@@ -57,7 +82,7 @@ import tdfs4ds.datasets
57
82
  import time
58
83
 
59
84
  import inspect
60
- import tqdm
85
+ from tqdm.auto import tqdm # auto picks the right frontend (notebook/terminal)
61
86
 
62
87
  from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
63
88
 
@@ -70,92 +95,85 @@ PROCESS_TYPE = 'RUN PROCESS'
70
95
  try:
71
96
  SCHEMA = tdml.context.context._get_current_databasename()
72
97
  if SCHEMA is None:
73
- print('Please specify the database which is hosting the feature store.')
74
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
98
+ logger.warning("No default database detected for feature store.")
99
+ logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
75
100
  else:
76
- print('The default database is used for the feature store.')
77
- print(f"tdfs4ds.feature_store.schema = '{SCHEMA}'")
101
+ logger.info("Default database detected for feature store: %s", SCHEMA)
102
+ logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
103
+
78
104
  if DATA_DOMAIN is None:
79
105
  DATA_DOMAIN = SCHEMA
80
- print(f"the data domain for the current work is :{DATA_DOMAIN}")
81
- print("Please update it as you wish with tdfs4ds.DATA_DOMAIN=<your data domain>")
106
+ logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
107
+ logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
82
108
 
83
109
  except Exception as e:
84
- print('Please specify the database which is hosting the feature store.')
85
- print('tdfs4ds.feature_store.schema = "<feature store database>"')
110
+ logger.error("Could not determine current database: %s", str(e).split('\n')[0])
111
+ logger.warning("Please specify the feature store database manually:")
112
+ logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
86
113
 
87
114
 
88
115
  def setup(database, if_exists='fail'):
89
116
  """
90
- Set up the database environment by configuring schema names and optionally dropping existing tables.
91
-
92
- This function sets the database schema for feature and process catalogs. If specified, it also handles
93
- the replacement of existing catalog tables. It reports the status of these operations, including any
94
- encountered exceptions.
95
-
96
- Parameters:
97
- database (str): The name of the database schema to be used.
98
- if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
99
- 'fail' (default) - Do nothing if the tables exist.
100
- 'replace' - Drop the tables if they exist before creating new ones.
101
-
102
- Steps performed:
103
- 1. Sets the schema to the provided database name.
104
- 2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
105
- 3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
106
- 4. Prints the names of the newly created tables along with the database name.
107
- 5. Captures and prints the first line of any exceptions that occur during these operations.
108
-
109
- Returns:
110
- None
117
+ Initialize the feature store environment by creating catalog tables and views.
111
118
  """
112
119
 
113
120
  from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
114
121
  from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
115
122
 
116
123
  tdfs4ds.SCHEMA = database
124
+ logger_safe("info", "Setting up feature store in database: %s", database)
125
+
117
126
  if if_exists == 'replace':
118
- try:
119
- tdml.db_drop_table(table_name = tdfs4ds.FEATURE_CATALOG_NAME, schema_name=database)
120
- except Exception as e:
121
- print(str(e).split('\n')[0])
122
- try:
123
- tdml.db_drop_table(table_name = tdfs4ds.PROCESS_CATALOG_NAME, schema_name=database)
124
- except Exception as e:
125
- print(str(e).split('\n')[0])
126
- try:
127
- tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
128
- except Exception as e:
129
- print(str(e).split('\n')[0])
127
+ logger_safe("info", "Replacing existing catalog tables if they exist.")
128
+ for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
129
+ try:
130
+ tdml.db_drop_table(table_name=table, schema_name=database)
131
+ logger_safe("info", "Dropped table %s.%s", database, table)
132
+ except Exception as e:
133
+ logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
130
134
 
131
135
  DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
136
+
132
137
  try:
133
138
  tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
134
- print('feature catalog table: ', tdfs4ds.FEATURE_CATALOG_NAME, ' in database ', database)
139
+ logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
135
140
  except Exception as e:
136
- print(str(e).split('\n')[0])
141
+ logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
137
142
 
138
143
  try:
139
- tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME, tdfs4ds.FILTER_MANAGER_NAME = process_store_catalog_creation()
140
- print('process catalog table: ', tdfs4ds.PROCESS_CATALOG_NAME, ' in database ', database)
141
- print('data distribution table: ', tdfs4ds.DATA_DISTRIBUTION_NAME, ' in database ', database)
142
- print('filter manager table: ', tdfs4ds.FILTER_MANAGER_NAME, ' in database ', database)
144
+ (tdfs4ds.PROCESS_CATALOG_NAME,
145
+ tdfs4ds.DATA_DISTRIBUTION_NAME,
146
+ tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
147
+
148
+ logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
149
+ logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
150
+ logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
143
151
  except Exception as e:
144
- print(str(e).split('\n')[0])
152
+ logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
145
153
 
146
154
  try:
147
155
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
156
+ logger_safe("info", "Follow-up table created successfully.")
148
157
  except Exception as e:
149
- print(str(e).split('\n')[0])
158
+ logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
150
159
 
151
160
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
152
161
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
162
+
153
163
  dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
154
164
  if not dataset_catalog._exists():
155
165
  dataset_catalog.create_catalog()
166
+ logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
156
167
 
168
+ logger_safe("info", "Setup complete.")
169
+ try:
170
+ tdfs4ds.genai.documentations_tables_creation()
171
+ logger_safe("info", "Documentation tables created successfully.")
172
+ except Exception as e:
173
+ logger_safe("error", "Documentation tables creation failed: %s", str(e).split('\n')[0])
157
174
  return
158
175
 
176
+
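A hedged end-to-end sketch of `setup()`, assuming a teradataml connection is already available; host, credentials and database name are placeholders:

```python
import teradataml as tdml
import tdfs4ds

# Standard teradataml connection; replace the placeholders with real credentials.
tdml.create_context(host="td-host.example", username="dbc", password="***")

# Creates the feature/process/dataset catalogs, follow-up table, catalog views
# and (new in this version) the documentation tables.
tdfs4ds.setup(database="MY_FS_DB", if_exists="replace")
```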
159
177
  def connect(
160
178
  database = tdfs4ds.SCHEMA,
161
179
  feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
@@ -166,37 +184,51 @@ def connect(
166
184
  feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
167
185
  process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
168
186
  dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
169
- create_if_missing = False # New argument
187
+ documentation_process_business_logic = tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC,
188
+ documentation_process_features = tdfs4ds.DOCUMENTATION_PROCESS_FEATURES,
189
+ documentation_process_explain = tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN,
190
+ create_if_missing = False
170
191
  ):
171
- if database is not None:
172
- tdfs4ds.SCHEMA = database
173
- else:
192
+ if database is None:
174
193
  raise ValueError("database parameter is None.")
194
+ tdfs4ds.SCHEMA = database
195
+ logger_safe("info", "Connecting to feature store in database: %s", database)
175
196
 
176
197
  tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
177
-
198
+
178
199
  feature_exists = feature_catalog_name.lower() in tables
179
200
  process_exists = process_catalog_name.lower() in tables
180
201
  distrib_exists = data_distribution_name.lower() in tables
181
202
  filter_manager_exists = filter_manager_name.lower() in tables
182
203
  followup_name_exists = followup_name.lower() in tables
204
+ documentation_process_business_logic_exist = documentation_process_business_logic.lower() in tables
205
+ documentation_process_features_exist = documentation_process_features.lower() in tables
206
+ documentation_process_explain_exist = documentation_process_explain.lower() in tables
183
207
 
184
- if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
208
+
209
+ if not (feature_exists and process_exists and distrib_exists and filter_manager_exists and documentation_process_business_logic_exist and documentation_process_features_exist and documentation_process_explain_exist):
185
210
  if not create_if_missing:
186
- return False # Feature store does not exist
187
- else:
188
- # Create the missing components
189
- if not feature_exists:
190
- tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
191
- if not process_exists:
192
- tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
193
- if not distrib_exists:
194
- tdfs4ds.data_distribution.data_distribution_catalog_creation()
195
- if not filter_manager_exists:
196
- tdfs4ds.filter_manager.filter_manager_catalog_creation()
197
-
198
- # Follow-up table handling
211
+ logger_safe("warning", "Feature store components missing and create_if_missing=False")
212
+ return False
213
+ logger_safe("info", "Missing components detected; creating missing parts...")
214
+ if not feature_exists:
215
+ logger_safe("info", "Creating feature catalog: %s", feature_catalog_name)
216
+ tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
217
+ if not process_exists:
218
+ logger_safe("info", "Creating process catalog: %s", process_catalog_name)
219
+ tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
220
+ if not distrib_exists:
221
+ logger_safe("info", "Creating data distribution table: %s", data_distribution_name)
222
+ tdfs4ds.data_distribution.data_distribution_catalog_creation()
223
+ if not filter_manager_exists:
224
+ logger_safe("info", "Creating filter manager table: %s", filter_manager_name)
225
+ tdfs4ds.filter_manager.filter_manager_catalog_creation()
226
+ if not documentation_process_business_logic_exist or not documentation_process_features_exist or not documentation_process_explain_exist:
227
+ logger_safe("info", "Creating documentation tables.")
228
+ tdfs4ds.genai.documentation_tables_creation()
229
+
199
230
  if not followup_name_exists:
231
+ logger_safe("info", "Creating follow-up table: %s", followup_name)
200
232
  tdfs4ds.process_store.process_followup.follow_up_table_creation()
201
233
  tdfs4ds.FOLLOW_UP_NAME = followup_name
202
234
 
@@ -210,20 +242,20 @@ def connect(
210
242
 
211
243
  process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
212
244
  if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
213
- print('ENTITY_NULL_SUBSTITUTE column does not exist in the existing process catalog')
214
- print('upgrade to the latest DDL')
245
+ logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
215
246
  tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
216
247
 
217
248
  tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
218
249
  tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
219
250
 
220
- # Dataset catalog setup
251
+ # Dataset Catalog
221
252
  tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
222
- dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
253
+ dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
223
254
  if not dataset_catalog._exists():
224
255
  dataset_catalog.create_catalog()
256
+ logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
225
257
 
226
- # Check if distribution is temporal
258
+ # Detect temporal distribution
227
259
  def is_data_distribution_temporal():
228
260
  return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
229
261
  view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
@@ -231,10 +263,110 @@ def connect(
231
263
  object_type='table'
232
264
  )
233
265
 
266
+ query_data_domain = f"""
267
+ SELECT DISTINCT DATA_DOMAIN
268
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
269
+ UNION
270
+ SELECT DISTINCT DATA_DOMAIN
271
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
272
+ """
273
+ data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
274
+ logger_safe("info", "Data domains in feature store: %s", data_domains)
275
+
234
276
  tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
235
-
236
- return True # Feature store exists or was created
277
+ logger_safe("info", "Connected to feature store successfully.")
278
+ return True
279
+
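Connecting to an existing store is the usual counterpart; a short sketch showing the `create_if_missing` behaviour (database name is a placeholder):

```python
import tdfs4ds

# Returns False when catalog tables are missing and create_if_missing is False.
if not tdfs4ds.connect(database="MY_FS_DB"):
    # Second attempt creates the missing catalogs, documentation and follow-up tables.
    tdfs4ds.connect(database="MY_FS_DB", create_if_missing=True)
```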
280
+ def get_data_domains(verbose=True):
281
+ """
282
+ Retrieve and display all data domains available in the feature store.
283
+ This function queries the feature store to obtain a list of all distinct data domains
284
+ that have been defined within the system. It combines data domains from both the process
285
+ catalog and the feature catalog, ensuring a comprehensive overview. The current data
286
+ domain in use is highlighted for easy identification.
287
+ Parameters:
288
+ - verbose (bool): If True, prints the list of data domains with the current one marked.
289
+ Returns:
290
+ - str: The current data domain in use.
291
+ """
292
+
293
+ query_data_domain = f"""
294
+ SELECT DISTINCT DATA_DOMAIN
295
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
296
+ UNION
297
+ SELECT DISTINCT DATA_DOMAIN
298
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
299
+ """
300
+ data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
301
+
302
+ if verbose:
303
+ print("Data Domains in Feature Store:")
304
+ for d in data_domains:
305
+ if d != tdfs4ds.DATA_DOMAIN:
306
+ print('\t'+d)
307
+ else:
308
+ print('*\t'+d)
309
+ if tdfs4ds.DATA_DOMAIN not in data_domains and tdfs4ds.DATA_DOMAIN is not None:
310
+ print("\nCurrent data domain (%s) not available yet in feature store. It may be a new one" % tdfs4ds.DATA_DOMAIN)
311
+ return
312
+ return data_domains
313
+
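Listing domains is then a one-liner; a sketch with illustrative output (domain names are invented), where the current `tdfs4ds.DATA_DOMAIN` is marked with an asterisk:

```python
import tdfs4ds

domains = tdfs4ds.get_data_domains(verbose=True)
# Data Domains in Feature Store:
#     SALES
# *   MY_FS_DB
print(domains)   # e.g. ['SALES', 'MY_FS_DB']
```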
314
+ def select_data_domain(data_domain):
315
+ """
316
+ Set the active data domain for feature store operations.
237
317
 
318
+ This function allows users to specify which data domain should be considered
319
+ as the current context for subsequent feature store operations. By setting
320
+ the data domain, users can ensure that all feature queries, registrations,
321
+ and other interactions with the feature store are scoped appropriately.
322
+ This is particularly useful in environments where multiple data domains
323
+ exist, allowing for clear separation and organization of features.
324
+
325
+ Parameters:
326
+ - data_domain (str): The name of the data domain to set as active.
327
+
328
+ Returns:
329
+ - str: The data domain that has been set as active.
330
+ """
331
+ data_domains = get_data_domains(verbose=False)
332
+ if data_domain in data_domains:
+ tdfs4ds.DATA_DOMAIN = data_domain
+ logger_safe("info", "Data domain set to: %s", data_domain)
+ return data_domain
+ # suggest the closest existing data domain before failing
+ closest_domain = difflib.get_close_matches(data_domain, data_domains, n=1)
+ logger_safe("error", "Data domain '%s' not found in feature store.", data_domain)
+ if closest_domain:
+ logger_safe("info", "Did you mean '%s'?", closest_domain[0])
+ raise ValueError(f"Data domain '{data_domain}' not found in feature store.")
345
+
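A short sketch of the selection helper: a valid name switches the active domain, a near-miss logs a `difflib`-based suggestion and raises (domain names are placeholders):

```python
import tdfs4ds

tdfs4ds.select_data_domain("SALES")      # sets tdfs4ds.DATA_DOMAIN = "SALES"

try:
    tdfs4ds.select_data_domain("SALE")   # typo: logs "Did you mean 'SALES'?" and raises
except ValueError as exc:
    print(exc)
```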
346
+ def create_data_domain(data_domain):
347
+ """
348
+ Create a new data domain in the feature store.
349
+
350
+ This function facilitates the creation of a new data domain within the feature store.
351
+ A data domain serves as a logical grouping for features, allowing for better organization
352
+ and management. By creating a new data domain, users can segregate features based on
353
+ specific criteria, such as business units, projects, or data types. This helps in
354
+ maintaining clarity and structure within the feature store, especially in environments
355
+ with diverse datasets and use cases.
356
+
357
+ Parameters:
358
+ - data_domain (str): The name of the new data domain to be created.
359
+
360
+ Returns:
361
+ - str: The name of the newly created data domain.
362
+ """
363
+ existing_domains = get_data_domains(verbose=False)
364
+ if data_domain in existing_domains:
365
+ logger_safe("warning", "Data domain '%s' already exists in feature store.", data_domain)
366
+ return data_domain
367
+ tdfs4ds.DATA_DOMAIN = data_domain
368
+ logger_safe("info", "Data domain '%s' created in locally.", data_domain)
369
+ return data_domain
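`create_data_domain` only changes the in-memory `tdfs4ds.DATA_DOMAIN`; the domain shows up in `get_data_domains()` once features or processes are registered under it. A minimal sketch (the name is a placeholder):

```python
import tdfs4ds

tdfs4ds.create_data_domain("MARKETING")   # warns and returns early if it already exists
assert tdfs4ds.DATA_DOMAIN == "MARKETING"
```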
238
370
 
239
371
 
240
372
  def feature_catalog():
@@ -287,50 +419,22 @@ def get_dataset_entity(dataset_id = None):
287
419
  def get_dataset_features(dataset_id = None):
288
420
  return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
289
421
 
290
- def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
422
+ def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None, dataset_view_name=None):
291
423
  """
292
424
  Executes a specific process from the feature store identified by the process ID.
293
- The function handles different process types and performs appropriate actions.
294
-
295
- Parameters:
296
- - process_id (str): The unique identifier of the process to run.
297
- - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
298
- Default is False.
299
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
300
- Default is False.
301
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
302
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
303
- where k is the smallest integer so that the original lengths is smaller or equal
304
- to k x force_varchar_length. Default is None.
305
-
306
- Returns:
307
- DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
308
-
309
- This function performs the following steps:
310
- 1. Determines the process type and initializes necessary variables.
311
- 2. Constructs and executes a SQL query to retrieve process details by process ID.
312
- 3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
313
- 4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
314
- 5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
315
- 6. Optionally returns the dataset created during the process if return_dataset is True.
316
-
317
- Note:
318
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
319
- data retrieval to feature uploading.
320
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
321
- a Teradata database and the appropriate schema for feature storage.
425
+ Diagnostics go through logger_safe(), gated by tdfs4ds.DISPLAY_LOGS.
322
426
  """
323
427
 
324
428
  if tdfs4ds.PROCESS_TYPE is None:
325
429
  PROCESS_TYPE_ = 'RUN PROCESS'
326
- tdfs4ds.RUN_ID = str(uuid.uuid4())
430
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
327
431
  else:
328
432
  PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
329
433
 
330
- if tdfs4ds.DEBUG_MODE:
331
- print('def run','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
434
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
435
+ logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
332
436
 
333
- if tdfs4ds.FEATURE_STORE_TIME == None:
437
+ if tdfs4ds.FEATURE_STORE_TIME is None:
334
438
  validtime_statement = 'CURRENT VALIDTIME'
335
439
  else:
336
440
  validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
@@ -342,148 +446,112 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
342
446
  WHERE A.PROCESS_ID = '{process_id}'
343
447
  """
344
448
 
449
+ logger_safe(
450
+ "info",
451
+ "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
452
+ tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
453
+ )
454
+
345
455
  # Executing the query and converting the result to Pandas DataFrame
346
456
  df = tdml.DataFrame.from_query(query).to_pandas()
347
457
 
348
- # Check if exactly one record is returned, else print an error
458
+ # Check if exactly one record is returned, else log an error and return
349
459
  if df.shape[0] != 1:
350
- print('error - there is ', df.shape[0], f' records. Check table {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}')
351
- print('check ou this query:')
352
- print(query)
460
+ logger_safe(
461
+ "error",
462
+ "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
463
+ df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
464
+ )
353
465
  return
354
466
 
355
-
356
467
  # Fetching the filter manager
357
468
  filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
358
469
  if filter_schema_name is None:
359
470
  filtermanager = None
360
471
  else:
361
472
  filter_view_name = df['FILTER_VIEW_NAME'].values[0]
362
- filter_table_name = df['FILTER_TABLE_NAME'].values[0]
473
+ filter_table_name = df['FILTER_TABLE_NAME'].values[0] # kept for parity; not used directly here
363
474
  filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
364
475
 
365
- # Fetching the process type from the query result
366
- process_type = df['PROCESS_TYPE'].values[0]
367
-
368
- # Fetching the primary index from the query result
369
- primary_index = df['FOR_PRIMARY_INDEX'].values[0]
476
+ # Fetching process metadata
477
+ process_type = df['PROCESS_TYPE'].values[0]
478
+ primary_index = df['FOR_PRIMARY_INDEX'].values[0]
370
479
  if primary_index is not None:
371
- primary_index = primary_index.split(',')
372
-
373
- # Fetching the primary index from the query result
374
- partitioning = df['FOR_DATA_PARTITIONING'].values[0]
375
-
376
- # Fetching the data domain from the query result
377
- DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
480
+ primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
481
+ partitioning = df['FOR_DATA_PARTITIONING'].values[0]
482
+ DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
483
+
484
+ logger_safe(
485
+ "info",
486
+ "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
487
+ process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
488
+ )
378
489
 
379
490
  # Handling 'denormalized view' process type
380
491
  if process_type == 'denormalized view':
381
- # Extracting necessary details for this process type
382
- view_name = df['VIEW_NAME'].values[0]
383
- entity_id = df['ENTITY_ID'].values[0].split(',')
492
+ view_name = df['VIEW_NAME'].values[0]
493
+ entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
384
494
  entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
385
- feature_names = df['FEATURE_NAMES'].values[0].split(',')
495
+ feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
386
496
 
387
- # Fetching data and uploading features to the feature store
388
497
  df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
389
498
 
390
- if tdfs4ds.DEBUG_MODE:
391
- print('run','entity_id',entity_id)
392
- print('run', 'entity_null_substitute', entity_null_substitute)
393
- print('run','feature_names',feature_names)
394
- print('run','process_id',process_id)
395
- print('run','primary_index',primary_index)
396
- print('run','partitioning',partitioning)
499
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
500
+ logger_safe("debug", "run | entity_id=%s", entity_id)
501
+ logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
502
+ logger_safe("debug", "run | feature_names=%s", feature_names)
503
+ logger_safe("debug", "run | process_id=%s", process_id)
504
+ logger_safe("debug", "run | primary_index=%s", primary_index)
505
+ logger_safe("debug", "run | partitioning=%s", partitioning)
506
+
397
507
  dataset = _upload_features(
398
508
  df_data,
399
509
  entity_id,
400
510
  feature_names,
401
- feature_versions = process_id,
402
- primary_index = primary_index,
403
- partitioning = partitioning,
404
- filtermanager = filtermanager,
405
- entity_null_substitute = entity_null_substitute,
406
- process_id = process_id,
407
- force_compute= force_compute,
408
- force_varchar_length = force_varchar_length
511
+ feature_versions=process_id,
512
+ primary_index=primary_index,
513
+ partitioning=partitioning,
514
+ filtermanager=filtermanager,
515
+ entity_null_substitute=entity_null_substitute,
516
+ process_id=process_id,
517
+ force_compute=force_compute,
518
+ force_varchar_length=force_varchar_length,
519
+ dataset_view_name = dataset_view_name
409
520
  )
410
521
 
411
522
  # Handling 'tdstone2 view' process type
412
523
  elif process_type == 'tdstone2 view':
413
- print('not implemented yet')
414
-
524
+ logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
525
+ dataset = None
415
526
 
527
+ else:
528
+ logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
529
+ dataset = None
416
530
 
417
531
  if return_dataset:
532
+ logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
418
533
  return dataset
419
534
  else:
535
+ logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
420
536
  return
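A hedged sketch of replaying a registered process; the `process_id` is a placeholder taken from the process catalog view, and `dataset_view_name` is the new optional name for the validation view:

```python
import tdfs4ds

process_id = "00000000-0000-0000-0000-000000000000"   # placeholder; look it up in the process catalog

dataset = tdfs4ds.run(
    process_id=process_id,
    return_dataset=True,
    force_compute=True,                 # recompute even if this VALIDTIME was already ingested
    dataset_view_name="MY_VALIDATION_VIEW",
)
```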
421
537
 
422
- def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
423
- """
424
- Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
425
- process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
426
- for further use or inspection.
427
538
 
428
- The function supports dynamic entity ID interpretation and flexible feature name handling, ensuring compatibility
429
- with various data schemas. It automatically registers the data upload process and applies additional metadata,
430
- if provided.
431
-
432
- Parameters:
433
- - df (DataFrame): The DataFrame containing the feature data to be uploaded.
434
- - entity_id (dict, list, or str): The identifier of the entity to which the features belong. This can be:
435
- - a dictionary mapping column names to their data types,
436
- - a list of column names, which will be automatically converted to a dictionary with types inferred from `df`,
437
- - a string representing a single column name, which will be converted into a list and then to a dictionary as above.
438
- - feature_names (list or str): The names of the features to be uploaded. If a string is provided, it will be
439
- split into a list based on commas or treated as a single feature name.
440
- - metadata (dict, optional): Additional metadata to associate with the upload process. Defaults to an empty dictionary.
441
- - primary_index (list, optional): Specifies the primary index columns for optimizing data storage and retrieval.
442
- - partitioning (str, optional): Defines how the data should be partitioned in the store for performance optimization.
443
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
444
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
445
- Default is an empty dictionary.
446
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
447
- Default is True.
448
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
449
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
450
- where k is the smallest integer so that the original lengths is smaller or equal
451
- to k x force_varchar_length. Default is 1024.
452
- Returns:
453
- DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
454
- or further processing.
455
-
456
- The process involves several steps, including entity ID type conversion if necessary, feature name normalization,
457
- process registration in the feature store, and the execution of SQL queries to insert the data. The function concludes
458
- by returning a dataset derived from the uploaded data, offering immediate access to the newly stored information.
459
-
460
- Example:
461
- >>> df = tdml.DataFrame(...)
462
- >>> entity_id = ['customer_id']
463
- >>> feature_names = ['age', 'income']
464
- >>> dataset = upload_features(df, entity_id, feature_names)
465
- >>> # Another example with list-based entity_id, custom primary_index, and partitioning
466
- >>> tddf = tdml.DataFrame(...) # Assuming tddf is predefined with appropriate columns
467
- >>> entity_id = ['tx_type', 'txn_id']
468
- >>> primary_index = ['txn_id']
469
- >>> partitioning = '''
470
- ... PARTITION BY CASE_N (
471
- ... tx_type LIKE 'DEBIT',
472
- ... tx_type LIKE 'PAYMENT',
473
- ... tx_type LIKE 'CASH_OUT',
474
- ... tx_type LIKE 'CASH_IN',
475
- ... tx_type LIKE 'TRANSFER',
476
- ... NO CASE,
477
- ... UNKNOWN)'''
478
- >>> features = [x for x in tddf.columns if x not in entity_id]
479
- >>> dataset = upload_features(
480
- ... df = tddf,
481
- ... entity_id = entity_id,
482
- ... feature_names = features,
483
- ... metadata = {'project': 'test'},
484
- ... primary_index = primary_index,
485
- ... partitioning = partitioning
486
- ... )
539
+ def upload_features(
540
+ df,
541
+ entity_id,
542
+ feature_names,
543
+ metadata={},
544
+ primary_index=None,
545
+ partitioning='',
546
+ filtermanager=None,
547
+ entity_null_substitute={},
548
+ force_compute=True,
549
+ force_varchar_length=1024,
550
+ dataset_view_name = None
551
+ ):
552
+ """
553
+ Uploads feature data from a DataFrame to the feature store for a specified entity.
554
+ All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
487
555
  """
488
556
 
489
557
  from tdfs4ds.utils.info import get_column_types
@@ -491,45 +559,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
491
559
  from tdfs4ds.process_store.process_registration_management import register_process_view
492
560
 
493
561
  # Convert entity_id to a dictionary if it's not already one
494
- if type(entity_id) == list:
562
+ if isinstance(entity_id, list):
495
563
  entity_id.sort()
496
564
  entity_id = get_column_types(df, entity_id)
497
- if tdfs4ds.DISPLAY_LOGS:
498
- print('entity_id has been converted to a proper dictionary : ', entity_id)
499
- elif type(entity_id) == str:
565
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
566
+
567
+ elif isinstance(entity_id, str):
500
568
  entity_id = [entity_id]
501
569
  entity_id = get_column_types(df, entity_id)
502
- if tdfs4ds.DISPLAY_LOGS:
503
- print('entity_id has been converted to a proper dictionary : ', entity_id)
504
-
505
- if type(feature_names) != list:
506
- if tdfs4ds.DISPLAY_LOGS:
507
- print('feature_names is not a list:', feature_names)
508
- if ',' in feature_names:
509
- feature_names = feature_names.split(',')
570
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
571
+
572
+ # Normalize feature_names
573
+ if not isinstance(feature_names, list):
574
+ logger_safe("debug", "feature_names is not a list: %s", feature_names)
575
+ if isinstance(feature_names, str) and ',' in feature_names:
576
+ feature_names = [x.strip() for x in feature_names.split(',')]
510
577
  else:
511
578
  feature_names = [feature_names]
512
- if tdfs4ds.DISPLAY_LOGS:
513
- print('it has been converted to : ', feature_names)
514
- print('check it is a expected.')
515
-
516
- if primary_index is not None and type(primary_index) != list:
517
- if tdfs4ds.DISPLAY_LOGS:
518
- print('primary_index is not a list:', primary_index)
519
- if ',' in primary_index:
520
- primary_index = primary_index.split(',')
579
+ logger_safe("debug", "feature_names converted to list: %s", feature_names)
580
+ logger_safe("debug", "Check the conversion is as expected.")
581
+
582
+ # Normalize primary_index
583
+ if primary_index is not None and not isinstance(primary_index, list):
584
+ logger_safe("debug", "primary_index is not a list: %s", primary_index)
585
+ if isinstance(primary_index, str) and ',' in primary_index:
586
+ primary_index = [x.strip() for x in primary_index.split(',')]
521
587
  else:
522
588
  primary_index = [primary_index]
523
- if tdfs4ds.DISPLAY_LOGS:
524
- print('it has been converted to : ', feature_names)
525
- print('check it is a expected.')
589
+ logger_safe("debug", "primary_index converted to list: %s", primary_index)
590
+ logger_safe("debug", "Check the conversion is as expected.")
526
591
 
592
+ # Partitioning
527
593
  partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
528
594
 
529
- if tdfs4ds.DISPLAY_LOGS:
530
- print("filtermanager", filtermanager)
595
+ logger_safe("debug", "filtermanager: %s", filtermanager)
531
596
 
532
- # Register the process and retrieve the SQL query to insert the features, and the process ID
597
+ # Register process -> get SQL(s) + process_id
533
598
  query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
534
599
  view_name = df,
535
600
  entity_id = entity_id,
@@ -542,104 +607,174 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
542
607
  entity_null_substitute = entity_null_substitute
543
608
  )
544
609
 
545
- # Execute the SQL query to insert the features into the database
546
- execute_query(query_insert)
547
- execute_query(query_insert_dist)
548
- if tdfs4ds.DEBUG_MODE:
549
- print("query_insert_filtermanager",query_insert_filtermanager)
550
- if query_insert_filtermanager is not None:
551
- execute_query(query_insert_filtermanager)
610
+ logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
552
611
 
553
- # Run the registered process and return the resulting dataset
554
- PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
555
- tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
556
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD: tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
557
- tdfs4ds.RUN_ID = str(uuid.uuid4())
558
-
559
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
612
+ # Execute queries
613
+ try:
614
+ execute_query(query_insert)
615
+ logger_safe("info", "Executed main insert query for process_id=%s", process_id)
616
+ except Exception as e:
617
+ logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
618
+ raise
560
619
 
561
- try:
620
+ try:
621
+ execute_query(query_insert_dist)
622
+ logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
623
+ except Exception as e:
624
+ logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
625
+ raise
562
626
 
563
- dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)
627
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
628
+ # Avoid dumping entire SQL in normal logs; keep it debug-only.
629
+ logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
564
630
 
631
+ if query_insert_filtermanager is not None:
632
+ try:
633
+ execute_query(query_insert_filtermanager)
634
+ logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
565
635
  except Exception as e:
566
- tdfs4ds.process_store.process_followup.followup_close(
567
- run_id = tdfs4ds.RUN_ID,
568
- process_type = tdfs4ds.PROCESS_TYPE,
569
- process_id = process_id,
570
- status = 'FAILED,' + str(e).split('\n')[0]
571
- )
636
+ logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
572
637
  raise
573
638
 
639
+ # Run the registered process (with/without dataset)
640
+ PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
641
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
642
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
643
+ tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
644
+ tdfs4ds.RUN_ID = str(uuid.uuid4())
574
645
 
575
- return dataset
576
- else:
646
+ logger_safe(
647
+ "info",
648
+ "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
649
+ tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
650
+ )
651
+
652
+ try:
653
+ if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
654
+ dataset = run(
655
+ process_id = process_id,
656
+ return_dataset = True,
657
+ force_compute = force_compute,
658
+ force_varchar_length = force_varchar_length,
659
+ dataset_view_name = dataset_view_name
660
+ )
661
+ logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
662
+ return dataset
663
+ else:
664
+ run(
665
+ process_id = process_id,
666
+ return_dataset = False,
667
+ force_compute = force_compute,
668
+ force_varchar_length = force_varchar_length,
669
+ dataset_view_name = dataset_view_name
670
+ )
671
+ logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
672
+ return
577
673
 
674
+ except Exception as e:
675
+ # Keep the existing follow-up close behavior and make sure the error is logged.
578
676
  try:
579
- run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
580
- except Exception as e:
581
677
  tdfs4ds.process_store.process_followup.followup_close(
582
- run_id = tdfs4ds.RUN_ID,
583
- process_type = tdfs4ds.PROCESS_TYPE,
584
- process_id = process_id,
585
- status = 'FAILED,' + str(e).split('\n')[0]
678
+ run_id = tdfs4ds.RUN_ID,
679
+ process_type = tdfs4ds.PROCESS_TYPE,
680
+ process_id = process_id,
681
+ status = 'FAILED,' + str(e).split('\n')[0]
586
682
  )
587
- raise
588
- return
683
+ finally:
684
+ logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
685
+ tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
686
+ )
687
+ raise
688
+ finally:
689
+ # Restore previous process type just in case the caller relies on it.
690
+ tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
589
691
 
590
- tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
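The worked example that used to live in the docstring was dropped in this version; a hedged replacement sketch, assuming `tddf` is a teradataml DataFrame whose key column is `txn_id` (source database and table are placeholders):

```python
import teradataml as tdml
import tdfs4ds

tddf = tdml.DataFrame(tdml.in_schema("MY_SRC_DB", "TRANSACTIONS"))

entity_id = ["txn_id"]
features = [c for c in tddf.columns if c not in entity_id]

# Returns the validation dataset when BUILD_DATASET_AT_UPLOAD is True, otherwise None.
dataset = tdfs4ds.upload_features(
    df=tddf,
    entity_id=entity_id,
    feature_names=features,
    metadata={"project": "demo"},
    force_varchar_length=1024,
)
```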
591
692
 
592
- def _upload_features(df, entity_id, feature_names,
593
- feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
594
- """
595
- Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
596
- feature registration, preparation for ingestion, and storage in the designated feature tables.
597
693
 
598
- Parameters:
599
- - df (DataFrame): The input DataFrame containing the feature data.
600
- - entity_id (str or dict): The identifier for the entity to which these features belong. This can be a single ID
601
- (str) or a dictionary of attribute names and values uniquely identifying the entity.
602
- - feature_names (list): A list of strings specifying the names of the features to be uploaded.
603
- - feature_versions (str or list, optional): Specifies the versions of the features to be uploaded. Can be a single
604
- string applied to all features or a list of strings specifying the version
605
- for each feature respectively. Default is 'dev.0.0'.
606
- - primary_index (list, optional): Specifies the columns to be used as the primary index in the feature store tables.
607
- This can significantly impact the performance of data retrieval operations.
608
- - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
609
- enhance query performance based on the access patterns.
610
- - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
611
- - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
612
- Default is an empty dictionary.
613
- - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
614
- - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
615
- Default is False.
616
- - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
617
- VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
618
- where k is the smallest integer so that the original lengths is smaller or equal
619
- to k x force_varchar_length. Default is None.
694
+ def _upload_features(
695
+ df, entity_id, feature_names,
696
+ feature_versions = FEATURE_VERSION_DEFAULT,
697
+ primary_index = None, partitioning = '',
698
+ filtermanager = None, entity_null_substitute = {},
699
+ process_id = None, force_compute = False,
700
+ force_varchar_length = None,
701
+ dataset_view_name = None
702
+ ):
703
+ """
704
+ Uploads a set of features into the Feature Store for a given entity.
620
705
 
706
+ This function registers an entity and its associated features in the feature catalog
707
+ if they are not already defined, prepares the data for ingestion, and stores it in the
708
+ feature store. It also supports incremental feature computation and conditional execution
709
+ depending on prior runs.
621
710
 
622
- Returns:
623
- DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
624
- metadata, including versions and storage locations.
625
-
626
- This function orchestrates several steps involved in feature storage:
627
- 1. Registers the entity in the feature store if not already present.
628
- 2. Determines the data types of the features based on the input DataFrame.
629
- 3. Registers the features, including their names, types, and versions, in the feature catalog.
630
- 4. Prepares the feature data for ingestion, including any necessary transformations.
631
- 5. Stores the prepared feature data in the feature store.
632
- 6. Optionally, cleans up temporary resources used during the process.
633
- 7. Builds and returns a view of the dataset representing the uploaded features for easy access.
711
+ Parameters
712
+ ----------
713
+ df : teradataml DataFrame
714
+ Input dataframe containing entity keys and feature columns to upload.
715
+ entity_id : str, list, or dict
716
+ Identifier(s) for the entity. Can be:
717
+ - A string (single entity key)
718
+ - A list of key column names
719
+ - A dict mapping column names to data types
720
+ If not a dict, entity metadata is inferred automatically.
721
+ feature_names : list of str
722
+ List of feature column names to upload from `df`.
723
+ feature_versions : str or list of str, optional
724
+ Feature version(s). A single value is applied to all features; a list maps
725
+ positionally to `feature_names`, one version per feature.
726
+ Default is FEATURE_VERSION_DEFAULT.
727
+ primary_index : str or list, optional
728
+ Primary index to use when storing features in Teradata.
729
+ partitioning : str, optional
730
+ Partitioning clause for feature store tables. Default is ''.
731
+ filtermanager : FilterManager, optional
732
+ If provided, features are built iteratively per filter step.
733
+ entity_null_substitute : dict, optional
734
+ Replacement values for nulls in entity keys.
735
+ Example: {'customer_id': -1}
736
+ process_id : str, optional
737
+ Identifier for the process execution, used for follow-up logging.
738
+ force_compute : bool, optional
739
+ If True, forces recomputation even if the same process_id and timestamp were
740
+ already computed earlier. If False, the computation is skipped when existing
741
+ results are detected. Default is False.
742
+ force_varchar_length : int, optional
743
+ If provided, VARCHAR feature columns are cast to VARCHAR(k * force_varchar_length),
+ where k is the smallest integer such that the original length fits.
+ dataset_view_name : str, optional
+ Name of the view to create when building the validation dataset. Default is None.
745
+
746
+ Returns
747
+ -------
748
+ pandas.DataFrame or None
749
+ If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
750
+ ingested features for validation. Otherwise, returns None.
751
+
752
+ Notes
753
+ -----
754
+ - Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
755
+ - Logs ingestion status in process follow-up tables.
756
+ - Skips ingestion when existing completed results are found unless
757
+ `force_compute=True`.
758
+ - Applies Teradata-optimized storage and statistics collection.
759
+
760
+ Raises
761
+ ------
762
+ ValueError
763
+ If unsupported data types are found (CLOB/BLOB/JSON).
764
+ Exception
765
+ For ingestion failure or storage errors.
634
766
 
635
- Note:
636
- - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
637
- entity and feature registration to data preparation and storage.
638
- - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
639
- a Teradata database and the appropriate schema for feature storage.
640
- - The function assumes that the feature_versions, if provided as a list, matches the length of feature_names.
767
+ Example
768
+ -------
769
+ >>> _upload_features(
770
+ ... df=dataframe,
771
+ ... entity_id="customer_id",
772
+ ... feature_names=["age", "credit_score"],
773
+ ... process_id="customer_features_v1",
774
+ ... force_compute=False
775
+ ... )
641
776
  """
642
-
777
+
643
778
  from tdfs4ds.feature_store.entity_management import register_entity
644
779
  from tdfs4ds.feature_store.feature_store_management import Gettdtypes
645
780
  from tdfs4ds.feature_store.feature_store_management import register_features
@@ -647,194 +782,199 @@ def _upload_features(df, entity_id, feature_names,
647
782
  from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
648
783
  from tdfs4ds.utils.info import get_column_types, update_varchar_length
649
784
 
650
- # Convert entity_id to a dictionary if it's not already one
651
- if type(entity_id) == list:
785
+ # Convert entity_id to a dictionary if not already
786
+ if isinstance(entity_id, list):
652
787
  entity_id.sort()
653
788
  entity_id = get_column_types(df, entity_id)
654
- if tdfs4ds.DISPLAY_LOGS:
655
- print('entity_id has been converted to a proper dictionary : ', entity_id)
656
- elif type(entity_id) == str:
657
- entity_id = [entity_id]
658
- entity_id = get_column_types(df, entity_id)
659
- if tdfs4ds.DISPLAY_LOGS:
660
- print('entity_id has been converted to a proper dictionary : ', entity_id)
661
-
662
- #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
663
-
664
- # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
665
- # If feature_versions is a string, create a dictionary mapping each feature name to this string.
666
- if type(feature_versions) == list:
667
- selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
789
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
790
+ elif isinstance(entity_id, str):
791
+ entity_id = get_column_types(df, [entity_id])
792
+ logger_safe("debug", "entity_id converted to dict: %s", entity_id)
793
+
794
+ # Map feature versions
795
+ if isinstance(feature_versions, list):
796
+ selected_features = dict(zip(feature_names, feature_versions))
668
797
  else:
669
798
  selected_features = {k: feature_versions for k in feature_names}
670
799
 
671
- # Get the Teradata types of the features in df.
672
- feature_names_types = Gettdtypes(
673
- df,
674
- features_columns=feature_names,
675
- entity_id=entity_id
676
- )
800
+ # Get Teradata types for features
801
+ feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
677
802
 
678
803
  if force_varchar_length is not None:
679
- print(feature_names_types)
680
- feature_names_types = update_varchar_length(feature_names_types,new_varchar_length = force_varchar_length)
804
+ logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
805
+ feature_names_types = update_varchar_length(
806
+ feature_names_types,
807
+ new_varchar_length=force_varchar_length
808
+ )
681
809
 
682
810
  def validate_feature_types(feature_names_types):
683
- """
684
- Validates feature data types and raises an error if any value contains
685
- the substrings 'clob', 'blob', or 'json' (case insensitive).
686
-
687
- Parameters:
688
- feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
689
-
690
- Raises:
691
- ValueError: If any feature type contains 'clob', 'blob', or 'json'.
692
- """
693
- invalid_types = {key: value['type'] for key, value in feature_names_types.items()
694
- if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
695
-
696
- if invalid_types:
811
+ invalid = {
812
+ k: v['type'] for k, v in feature_names_types.items()
813
+ if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
814
+ }
815
+ if invalid:
697
816
  raise ValueError(
698
- f"The following features have unsupported data types: {invalid_types}. "
699
- "The data types 'CLOB', 'BLOB', and 'JSON' are not yet managed by the feature store."
817
+ f"Unsupported data types found: {invalid}. "
818
+ "CLOB/BLOB/JSON are not supported."
700
819
  )
701
-
702
- validate_feature_types(feature_names_types)
703
-
820
+
821
+ validate_feature_types(feature_names_types)
822
+
823
+ logger_safe("info", "Registering entity %s in feature store", entity_id)
704
824
  register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
705
825
 
706
- if tdfs4ds.DEBUG_MODE:
707
- print('_upload_features', 'entity_id', entity_id)
708
- print('_upload_features', 'entity_null_substitute', entity_null_substitute)
709
- print('_upload_features', 'feature_names', feature_names)
710
- print('_upload_features', 'primary_index', primary_index)
711
- print('_upload_features', 'partitioning', partitioning)
712
- print('_upload_features', 'selected_features', selected_features)
713
- print('_upload_features', 'df.columns', df.columns)
714
-
715
- # Register the features in the feature catalog.
716
- register_features(
717
- entity_id,
718
- feature_names_types,
719
- primary_index,
720
- partitioning
721
- )
722
-
723
- if tdfs4ds.DEBUG_MODE:
724
- print("---------_upload_features")
725
- print("filtermanager : ", filtermanager)
726
- print("feature names : ", feature_names)
727
- print("selected features : ", selected_features)
826
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
827
+ logger_safe(
828
+ "debug",
829
+ "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
830
+ entity_id, entity_null_substitute, feature_names, primary_index, partitioning
831
+ )
832
+ logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
728
833
 
729
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
834
+ register_features(entity_id, feature_names_types, primary_index, partitioning)
835
+ logger_safe("info", "Features registered in catalog: %s", feature_names)
836
+
837
+ follow_up = None
838
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
730
839
  follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
731
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
732
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
733
- if filtermanager is None:
734
- do_compute = True
735
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
736
- if follow_up.shape[0] > 0:
737
- do_compute = False
840
+ follow_up = follow_up[
841
+ (follow_up.STATUS == 'COMPLETED') &
842
+ (follow_up.VALIDTIME_DATE.isna() == False) &
843
+ (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
844
+ (follow_up.PROCESS_ID == process_id)
845
+ ]
738
846
 
739
- # Prepare the features for ingestion.
847
+ if filtermanager is None:
848
+ dataset_created = False
849
+ do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
850
+ if not do_compute and not force_compute:
851
+ logger_safe(
852
+ "info",
853
+ "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
854
+ process_id, tdfs4ds.FEATURE_STORE_TIME
855
+ )
740
856
  if do_compute or force_compute:
741
-
857
+ logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
742
858
  tdfs4ds.process_store.process_followup.followup_open(
743
- run_id = tdfs4ds.RUN_ID,
744
- process_type = tdfs4ds.PROCESS_TYPE,
745
- process_id = process_id
859
+ run_id=tdfs4ds.RUN_ID,
860
+ process_type=tdfs4ds.PROCESS_TYPE,
861
+ process_id=process_id
746
862
  )
747
-
748
863
  try:
749
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
750
- df,
751
- entity_id,
752
- feature_names,
864
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
865
+ df, entity_id, feature_names,
753
866
  feature_versions=selected_features,
754
867
  primary_index=primary_index,
755
868
  entity_null_substitute=entity_null_substitute,
756
869
  partitioning=partitioning
757
870
  )
758
- # Store the prepared features in the feature store.
759
- store_feature(
760
- entity_id,
761
- volatile_table_name,
762
- entity_null_substitute=entity_null_substitute,
763
- primary_index=primary_index,
764
- partitioning=partitioning,
765
- features_infos = features_infos
766
- )
767
871
 
768
- # Collect statistics
769
- apply_collect_stats(
770
- entity_id,
771
- primary_index = primary_index,
772
- partitioning = partitioning,
773
- feature_infos = features_infos
774
- )
872
+ count_rows = store_feature(entity_id, volatile_table, entity_null_substitute=entity_null_substitute,
873
+ primary_index=primary_index, partitioning=partitioning, features_infos=features_infos)
874
+
877
+ apply_collect_stats(entity_id, primary_index=primary_index, partitioning=partitioning, feature_infos=features_infos)
775
878
 
776
879
  tdfs4ds.process_store.process_followup.followup_close(
777
- run_id = tdfs4ds.RUN_ID,
778
- process_type = tdfs4ds.PROCESS_TYPE,
779
- process_id = process_id
880
+ run_id=tdfs4ds.RUN_ID,
881
+ process_type=tdfs4ds.PROCESS_TYPE,
882
+ process_id=process_id
780
883
  )
884
+ logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
885
+ # Build dataset for validation if enabled
886
+ if tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None:
887
+ logger_safe("info", "Building dataset for validation...")
888
+ try:
889
+ dataset = build_dataset(
890
+ entity_id, selected_features,
891
+ view_name = dataset_view_name
892
+ )
893
+ dataset_created = True
894
+ except Exception as e:
895
+ logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
896
+ logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
897
+ else:
898
+ logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
781
899
 
782
900
  except Exception as e:
901
+ logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
783
902
  tdfs4ds.process_store.process_followup.followup_close(
784
- run_id = tdfs4ds.RUN_ID,
785
- process_type = tdfs4ds.PROCESS_TYPE,
786
- process_id = process_id,
787
- status = 'FAILED,' + str(e).split('\n')[0]
903
+ run_id=tdfs4ds.RUN_ID,
904
+ process_type=tdfs4ds.PROCESS_TYPE,
905
+ process_id=process_id,
906
+ status='FAILED,' + str(e).split('\n')[0]
788
907
  )
789
908
  raise
909
+
790
910
  else:
791
- # get the total number of filter condition in the filter manager
792
- nb_filters = filtermanager.nb_filters
793
911
 
794
- # the flag that indicates that we computed something in the next loop
912
+ logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
795
913
  something_computed = False
914
+ pbar = tqdm(
915
+ range(filtermanager.nb_filters),
916
+ total=filtermanager.nb_filters,
917
+ desc="Applying filters",
918
+ unit="filter",
919
+ leave=False
920
+ )
921
+ dataset_created = False
922
+ for i in pbar:
923
+ filter_id = i + 1
924
+ filtermanager.update(filter_id)
796
925
 
797
- for i in range(nb_filters):
926
+ try:
927
+ pbar.set_description(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
798
928
 
799
- # place the cursor on the next filter
800
- filtermanager.update(i+1)
929
+ # Convert datetime columns to string first (select_dtypes() would miss them after astype(object))
930
+ df_bar = filtermanager.display().to_pandas()
931
+ for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
932
+ df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
+ df_bar = df_bar.astype(object) # avoid dtype conversion issues downstream
801
933
 
802
- if filtermanager.time_filtering:
803
- # if the filter manager is hybrid, then synchronize the time with tdfs4ds
804
- tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
934
+ # Convert to JSON object (dict)
935
+ bar_info = df_bar.iloc[0].to_dict()
805
936
 
806
- # overwrite the follow up table to tilter on the VALIDTIME_DATE too
807
- follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
808
- follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
809
- follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
937
+ # Handle remaining python date/datetime objects
938
+ from datetime import date, datetime
939
+ for key, value in bar_info.items():
940
+ if isinstance(value, (date, datetime)): # convert date/datetime to string
941
+ bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
942
810
943
 
811
- # initialize do_compute, the flag that something has to be computed
812
- do_compute = True
944
+ bar_info = str(bar_info)
945
+ if len(bar_info) > 120:
946
+ bar_info = bar_info[:117] + "..."
947
+ pbar.set_postfix_str(bar_info)
813
948
 
814
- # if the process_id is defined and if we are working at a specific time:
815
- if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
816
- # we check if the filter condition has already been computed
817
- follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
818
- tdml.DataFrame.from_query(
819
- f"""
820
- SELECT
821
- CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
822
- FROM {filtermanager.schema_name}.{filtermanager.view_name}
823
- """
824
- ),
825
- on = 'APPLIED_FILTER',
826
- how = 'inner',
827
- lprefix = 'l',
828
- rprefix = 'r'
829
- )
830
- # if already computed and completed, then do_compute is set to False
831
- if follow_up_.shape[0] > 0:
832
- do_compute = False
949
+ except Exception:
950
+ # postfix is optional; ignore errors from display() here
951
+ pass
833
952
 
834
- if tdfs4ds.DISPLAY_LOGS:
835
- print(filtermanager.display())
953
+ logger_safe("debug", "Applying filter %s/%s:\n%s",
954
+ i + 1, filtermanager.nb_filters, filtermanager.display())
955
+
956
+ do_compute = True
957
+ if process_id and tdfs4ds.FEATURE_STORE_TIME:
958
+ # see if already computed
959
+ follow_up = tdfs4ds.process_store.process_followup.follow_up_report(process_id=process_id, filtermanager=filtermanager)
960
+ follow_up = follow_up[
961
+ (follow_up.STATUS == 'COMPLETED') &
962
+ (follow_up.VALIDTIME_DATE.isna() == False) &
963
+ (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME)
964
+ ]
965
+
966
+ if follow_up.shape[0] > 0:
967
+ do_compute = False
836
968
 
969
+ if not do_compute and not force_compute:
970
+ logger_safe(
971
+ "info",
972
+ "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
973
+ process_id, tdfs4ds.FEATURE_STORE_TIME
974
+ )
975
+ pbar.colour = "green"
837
976
  if do_compute or force_compute:
977
+ pbar.colour = "blue"
838
978
  tdfs4ds.process_store.process_followup.followup_open(
839
979
  run_id = tdfs4ds.RUN_ID,
840
980
  process_type = tdfs4ds.PROCESS_TYPE,
@@ -842,83 +982,78 @@ def _upload_features(df, entity_id, feature_names,
842
982
  filtermanager = filtermanager
843
983
  )
844
984
  try:
845
- # Prepare the features for ingestion.
846
- prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
847
- df,
848
- entity_id,
849
- feature_names,
985
+ prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
986
+ df, entity_id, feature_names,
850
987
  feature_versions = selected_features,
851
988
  primary_index = primary_index,
852
989
  entity_null_substitute = entity_null_substitute,
853
990
  partitioning = partitioning
854
991
  )
855
992
 
856
- # Store the prepared features in the feature store.
857
- store_feature(
858
- entity_id,
859
- volatile_table_name,
860
- entity_null_substitute=entity_null_substitute,
861
- primary_index = primary_index,
862
- partitioning = partitioning,
863
- features_infos=features_infos
864
-
865
- )
866
-
867
- # indicate that something has been processed:
993
+ count_rows = store_feature(entity_id, volatile_table, entity_null_substitute=entity_null_substitute,
994
+ primary_index=primary_index, partitioning=partitioning, features_infos=features_infos)
995
+
868
996
  something_computed = True
869
997
 
870
998
  tdfs4ds.process_store.process_followup.followup_close(
871
- run_id=tdfs4ds.RUN_ID,
872
- process_type=tdfs4ds.PROCESS_TYPE,
873
- process_id=process_id,
999
+ run_id = tdfs4ds.RUN_ID,
1000
+ process_type = tdfs4ds.PROCESS_TYPE,
1001
+ process_id = process_id,
874
1002
  filtermanager = filtermanager
875
1003
  )
876
1004
 
1005
+ # Build dataset for validation if enabled
1006
+ if (tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None) and not dataset_created:
1007
+ logger_safe("info", "Building dataset for validation...")
1008
+ try:
1009
+ dataset = build_dataset(
1010
+ entity_id, selected_features,
1011
+ view_name = dataset_view_name
1012
+ )
1013
+ dataset_created = True
1014
+ except Exception as e:
1015
+ logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
1016
+ logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
1017
+ else:
1018
+ logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
1019
+
877
1020
  except Exception as e:
878
- print(e)
1021
+ logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
879
1022
  tdfs4ds.process_store.process_followup.followup_close(
880
- run_id=tdfs4ds.RUN_ID,
881
- process_type=tdfs4ds.PROCESS_TYPE,
882
- process_id=process_id,
883
- status='FAILED,' + str(e).split('\n')[0],
884
- filtermanager=filtermanager
1023
+ run_id = tdfs4ds.RUN_ID,
1024
+ process_type = tdfs4ds.PROCESS_TYPE,
1025
+ process_id = process_id,
1026
+ status = 'FAILED,' + str(e).split('\n')[0],
1027
+ filtermanager = filtermanager
885
1028
  )
886
1029
  raise
887
- # Clean up by dropping the temporary volatile table.
888
- # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
889
1030
 
890
- # Collect statistics only if something has been computed
891
1031
  if something_computed:
892
- apply_collect_stats(
893
- entity_id,
894
- primary_index = primary_index,
895
- partitioning = partitioning,
896
- feature_infos = features_infos
897
- )
1032
+ apply_collect_stats(entity_id, primary_index=primary_index, partitioning=partitioning, feature_infos=features_infos)
898
1033
 
899
- # Build a dataset view in the feature store.
900
- if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
901
- if tdfs4ds.DISPLAY_LOGS: print('build dataset for validation')
902
- try:
903
- dataset = build_dataset(
904
- entity_id,
905
- selected_features,
906
- view_name=None,
907
- entity_null_substitute = entity_null_substitute
908
- )
909
- except Exception as e:
910
- print('ERROR at build_dataset in _upload_features:')
911
- print(str(e).split('\n')[0])
912
- print('entity :', entity_id)
913
- print('selected features :', selected_features)
1034
+ if not dataset_created and tdfs4ds.BUILD_DATASET_AT_UPLOAD and dataset_view_name is None:
1035
+ logger_safe("info", "Building dataset for validation...")
1036
+ try:
1037
+ dataset = build_dataset(
1038
+ entity_id, selected_features,
1039
+ view_name = dataset_view_name
1040
+ )
1041
+ return dataset
1042
+ except Exception as e:
1043
+ logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
1044
+ logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
1045
+ else:
1046
+ if tdfs4ds.BUILD_DATASET_AT_UPLOAD == False:
1047
+ logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
1048
+ else:
1049
+ return
1050
+
914
1051
 
915
- # Return the dataset view.
916
- return dataset
917
- else:
918
- if tdfs4ds.DISPLAY_LOGS: print('no dataset built for validation. Set tdfs4ds.BUILD_DATASET_AT_UPLOAD to True if you want it')
919
1052
  return
920
1053
 
921
1054
 
1055
+
1056
+
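For orientation, a minimal sketch of the module-level flags the upload path above consults; the values shown are illustrative assumptions, not shipped defaults:

    import tdfs4ds

    tdfs4ds.BUILD_DATASET_AT_UPLOAD = True   # build a validation dataset right after ingestion
    tdfs4ds.DEBUG_MODE = False               # extra debug logging inside _upload_features
    tdfs4ds.FEATURE_STORE_TIME = None        # left unset here; roll_out() pins it to a past date via the time manager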
922
1057
  def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
923
1058
  feature_store_time=False, join_type='INNER'):
924
1059
  """
@@ -935,6 +1070,10 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
935
1070
  selected_features : dict
936
1071
  A dictionary where the keys are feature table names, and the values are lists of tuples
937
1072
  (feature_id, feature_version, feature_name) specifying the features to retrieve.
1073
+ NOTE: feature_version may be either:
1074
+ - a single UUID string, or
1075
+ - a list of dicts like:
1076
+ {"process_id": <UUID>, "process_view_name": <str>}
938
1077
 
939
1078
  view_name : str
940
1079
  The name of the view to be created in the database.
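To make the note on feature_version concrete, a hypothetical selected_features value (the table name, feature IDs and UUIDs are placeholders) mixing both accepted forms could look like:

    selected_features = {
        "MY_DB.FS_T_CUSTOMER_FEATURES": [
            # single UUID string as the feature version
            (101, "<uuid-of-process-A>", "tx_amount"),
            # list of dicts: one column per contributing process
            (102,
             [{"process_id": "<uuid-of-process-A>", "process_view_name": "PROCESS_A"},
              {"process_id": "<uuid-of-process-B>", "process_view_name": "PROCESS_B"}],
             "churn_score"),
        ],
    }

In the list form, one sub-query is generated per process and the column is aliased as <feature_name>_<process_view_name>, deduplicated with a numeric suffix when the same alias would occur twice (see the helpers further below).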
@@ -1004,6 +1143,24 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1004
1143
  # Sort the entity ID list for consistent query generation
1005
1144
  list_entity_id.sort()
1006
1145
 
1146
+ # Helpers
1147
+ import re
1148
+ def _sanitize_identifier(name: str) -> str:
1149
+ # Keep letters, numbers, and underscores; replace others with '_'
1150
+ return re.sub(r'[^0-9A-Za-z_]', '_', name)
1151
+
1152
+ used_alias_counts = {} # base_alias -> count
1153
+
1154
+ def _unique_alias(base: str) -> str:
1155
+ """
1156
+ Ensure alias uniqueness: if base already used, append _2, _3, ...
1157
+ """
1158
+ if base not in used_alias_counts:
1159
+ used_alias_counts[base] = 1
1160
+ return base
1161
+ used_alias_counts[base] += 1
1162
+ return f"{base}_{used_alias_counts[base]}"
1163
+
1007
1164
  # Initialize sub-query construction
1008
1165
  tdfs4ds.logger.info("Generating the sub-queries for feature retrieval.")
1009
1166
  sub_queries = []
@@ -1014,21 +1171,52 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1014
1171
  # Construct sub-queries for each feature
1015
1172
  for k, v in list_features.items():
1016
1173
  for feature_id, feature_version, feature_name in v:
1017
- txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
1018
- feature_str = ',B1.FEATURE_VALUE AS ' + feature_name
1019
- sub_queries.append(
1020
- {
1021
- 'feature_name': feature_name,
1022
- 'query': f"""
1023
- SEQUENCED VALIDTIME
1024
- SELECT
1025
- {txt_entity}
1026
- {feature_str}
1027
- FROM {k} B1
1028
- WHERE {txt_where}
1029
- """
1030
- }
1031
- )
1174
+
1175
+ # Multiple processes: list of dicts
1176
+ if isinstance(feature_version, list):
1177
+ for item in feature_version:
1178
+ process_id = item.get("process_id")
1179
+ process_view_name = item.get("process_view_name") or "PROCESS"
1180
+ base_alias = _sanitize_identifier(f"{feature_name}_{process_view_name}")
1181
+ alias = _unique_alias(base_alias)
1182
+
1183
+ txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
1184
+ feature_str = ',B1.FEATURE_VALUE AS ' + alias
1185
+
1186
+ sub_queries.append(
1187
+ {
1188
+ 'feature_name': alias,
1189
+ 'query': f"""
1190
+ SEQUENCED VALIDTIME
1191
+ SELECT
1192
+ {txt_entity}
1193
+ {feature_str}
1194
+ FROM {k} B1
1195
+ WHERE {txt_where}
1196
+ """
1197
+ }
1198
+ )
1199
+
1200
+ # Single UUID
1201
+ else:
1202
+ base_alias = _sanitize_identifier(feature_name)
1203
+ alias = _unique_alias(base_alias)
1204
+
1205
+ txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
1206
+ feature_str = ',B1.FEATURE_VALUE AS ' + alias
1207
+ sub_queries.append(
1208
+ {
1209
+ 'feature_name': alias,
1210
+ 'query': f"""
1211
+ SEQUENCED VALIDTIME
1212
+ SELECT
1213
+ {txt_entity}
1214
+ {feature_str}
1215
+ FROM {k} B1
1216
+ WHERE {txt_where}
1217
+ """
1218
+ }
1219
+ )
1032
1220
 
1033
1221
  # Handle case where no features are available
1034
1222
  if len(sub_queries) == 0:
@@ -1102,6 +1290,7 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1102
1290
  return tdml.DataFrame.from_table(tdml.in_schema(schema_name, view_name))
1103
1291
 
1104
1292
 
1293
+
1105
1294
  def build_dataset_opt(entity_id, selected_features, view_name = None, schema_name=tdfs4ds.SCHEMA,
1106
1295
  comment='dataset', no_temporal=False, time_manager=None, query_only=False, entity_null_substitute={},
1107
1296
  other=None, time_column=None, filtermanager = None, filter_conditions = None
@@ -1280,82 +1469,91 @@ def upload_tdstone2_scores(model):
1280
1469
  return dataset
1281
1470
 
1282
1471
 
1283
- def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
1472
+ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
1284
1473
  """
1285
- Executes a series of processes for each date in a given list, managing the time and logging settings.
1474
+ Executes a series of processes for each date in a given list, managing time, computation settings, and logging.
1286
1475
 
1287
1476
  This function iterates over a range of time steps, updating a TimeManager object with each step, and then
1288
- executes a list of processes for that time step. It also manages the synchronization of time for a feature store
1289
- and disables display logs during its execution.
1477
+ executes a list of processes for that time step. It also manages synchronization of time for the feature store
1478
+ and optionally controls forced computation and log display behavior.
1290
1479
 
1291
1480
  Parameters:
1292
1481
  - process_list (list): A list of process IDs that need to be executed for each time step.
1293
- - time_manager (TimeManager object): An object that manages time-related operations, like updating or retrieving time.
1482
+ - time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
1294
1483
  - time_id_start (int, optional): The starting time step ID. Default is 1.
1295
- - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the time manager.
1484
+ - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
1485
+ time manager.
1486
+ - force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
1487
+ Default is False.
1488
+ - force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
1489
+ is disabled. Default is False.
1296
1490
 
1297
1491
  Side Effects:
1298
- - Sets global variables DISPLAY_LOGS and FEATURE_STORE_TIME.
1492
+ - Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
1493
+ - Restores DISPLAY_LOGS setting after execution.
1299
1494
  - Catches exceptions, restores the previous settings, prints the first line of the error message, and re-raises.
1300
1495
 
1301
- This function performs the following steps:
1302
- 1. Disables display logs and sets the process type to 'ROLL_OUT'.
1303
- 2. Iterates over the specified range of time steps.
1304
- 3. Updates the time manager with the current time step.
1305
- 4. Synchronizes the feature store time with the current time step.
1306
- 5. Executes each process in the process list for the current time step.
1307
- 6. Restores the original display log setting after execution.
1496
+ Steps performed:
1497
+ 1. Disables display logs by default unless `force_display_logs` is True.
1498
+ 2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
1499
+ 3. Iterates over the specified range of time steps.
1500
+ 4. Updates the time manager with the current time step.
1501
+ 5. Synchronizes the feature store time with the current time step.
1502
+ 6. Executes each process in the process list with optional forced computation.
1503
+ 7. Restores original display log settings after completion.
1308
1504
 
1309
1505
  Example:
1310
1506
  >>> process_list = ['process_1', 'process_2']
1311
1507
  >>> time_manager = TimeManager(...)
1312
- >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
1508
+ >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
1313
1509
  """
1314
1510
 
1315
- #global DISPLAY_LOGS
1316
- #global FEATURE_STORE_TIME
1317
-
1318
1511
  # Disable display logs
1319
1512
  temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
1320
1513
  tdfs4ds.DISPLAY_LOGS = False
1514
+ if force_display_logs:
1515
+ tdfs4ds.DISPLAY_LOGS = True
1321
1516
  PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
1322
1517
  tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
1323
1518
  tdfs4ds.RUN_ID = str(uuid.uuid4())
1324
1519
 
1325
-
1326
-
1327
1520
  try:
1521
+ # Define range of time steps
1328
1522
  if time_id_end is None:
1329
- pbar = tqdm.tqdm(range(time_id_start, time_manager.nb_time_steps + 1), desc="Starting")
1523
+ time_range = range(time_id_start, time_manager.nb_time_steps + 1)
1330
1524
  else:
1331
- pbar = tqdm.tqdm(range(time_id_start, min([time_manager.nb_time_steps + 1,time_id_end+1]) ), desc="Starting")
1332
- # Iterate over each date in the provided list
1525
+ time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
1526
+
1527
+ # Progress bar
1528
+ pbar = tqdm(time_range, desc="Starting rollout", unit="step")
1529
+
1333
1530
  for i in pbar:
1334
- # Update the time manager with the new date
1335
- time_manager.update(time_id = i )
1531
+ # Update time manager
1532
+ time_manager.update(time_id=i)
1336
1533
  date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
1337
- pbar.set_description(f"Processing {date_}")
1338
- # Synchronize the time for the feature store with the current date
1534
+
1535
+ # Sync feature store time
1339
1536
  tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
1340
- pbar.set_description(f"Processing {tdfs4ds.FEATURE_STORE_TIME}")
1537
+
1538
+ # Display current progress in tqdm
1539
+ pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
1540
+
1341
1541
  if tdfs4ds.DEBUG_MODE:
1342
- print('def roll_out','date_', date_)
1343
- print('def roll_out','time_manager.get_date_in_the_past()', time_manager.get_date_in_the_past())
1344
- print('def roll_out','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
1345
- # Execute each process in the process list for the current date
1542
+ print("roll_out | date_:", date_)
1543
+ print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
1544
+
1545
+ # Execute all processes for this time step
1346
1546
  for proc_id in process_list:
1347
- pbar.set_description(f"Processing {date_} process {proc_id}")
1348
- run(process_id=proc_id, force_compute=False)
1547
+ pbar.set_description(f"Processing {date_} | proc {proc_id}")
1548
+ run(process_id=proc_id, force_compute=force_compute)
1349
1549
 
1550
+ # Restore settings
1350
1551
  tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
1552
+
1351
1553
  except Exception as e:
1352
1554
  tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
1353
- # If an exception occurs, print the date and the first line of the exception message
1354
- #print(date_)
1355
1555
  print(str(e).split('\n')[0])
1356
1556
  tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
1357
1557
  raise
1358
1558
 
1359
- tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
1360
-
1361
-
1559
+ tdfs4ds.PROCESS_TYPE = PROCESS_TYPE