tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +586 -564
- tdfs4ds/feature_store/feature_data_processing.py +367 -299
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +226 -231
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +2 -2
- tdfs4ds/utils/filter_management.py +521 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +547 -97
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/RECORD +15 -15
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,5 +1,7 @@
-__version__ = '0.2.4.26'
+__version__ = '0.2.4.41'
 import logging
+import json
+
 # Setup the logger
 logging.basicConfig(
     level=logging.INFO,
@@ -7,6 +9,15 @@ logging.basicConfig(
     datefmt='%Y-%m-%d %H:%M:%S'  # Set the date/time format
 )
 
+# Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
+def logger_safe(level, message, *args, **kwargs):
+    """
+    Wrapper around the global `logger` that only emits logs when
+    tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
+    """
+    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
+        getattr(logger, level)(message, *args, **kwargs)
+
 logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
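The new `logger_safe` helper gates every log call behind the module-level `DISPLAY_LOGS` flag. A minimal, self-contained sketch of the same pattern; the `SimpleNamespace` stand-in replaces the real `tdfs4ds` module and is hypothetical:

import logging
from types import SimpleNamespace

# Hypothetical stand-in for the tdfs4ds module globals.
tdfs4ds = SimpleNamespace(DISPLAY_LOGS=True)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

def logger_safe(level, message, *args, **kwargs):
    # Same gate as the helper in the diff: emit only when DISPLAY_LOGS is truthy.
    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
        getattr(logger, level)(message, *args, **kwargs)

logger_safe("info", "visible: %s", 1)   # emitted
tdfs4ds.DISPLAY_LOGS = False
logger_safe("info", "silenced: %s", 2)  # suppressed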
@@ -57,7 +68,7 @@ import tdfs4ds.datasets
 import time
 
 import inspect
-import tqdm
+from tqdm.auto import tqdm  # auto picks the right frontend (notebook/terminal)
 
 from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
 
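The import switch above moves from the bare `tqdm` package to `tqdm.auto`, which resolves to the notebook widget under Jupyter and the plain terminal bar otherwise. A small sketch of the call pattern used later in this diff:

import time
from tqdm.auto import tqdm  # picks notebook or terminal frontend automatically

for _ in tqdm(range(3), desc="demo", unit="step", leave=False):
    time.sleep(0.1)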
@@ -70,92 +81,80 @@ PROCESS_TYPE = 'RUN PROCESS'
 try:
     SCHEMA = tdml.context.context._get_current_databasename()
     if SCHEMA is None:
-
-
+        logger.warning("No default database detected for feature store.")
+        logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
     else:
-
-
+        logger.info("Default database detected for feature store: %s", SCHEMA)
+        logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
+
     if DATA_DOMAIN is None:
         DATA_DOMAIN = SCHEMA
-
-
+        logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
+        logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
 
 except Exception as e:
-
-
+    logger.error("Could not determine current database: %s", str(e).split('\n')[0])
+    logger.warning("Please specify the feature store database manually:")
+    logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
 
 
 def setup(database, if_exists='fail'):
     """
-
-
-    This function sets the database schema for feature and process catalogs. If specified, it also handles
-    the replacement of existing catalog tables. It reports the status of these operations, including any
-    encountered exceptions.
-
-    Parameters:
-    database (str): The name of the database schema to be used.
-    if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
-        'fail' (default) - Do nothing if the tables exist.
-        'replace' - Drop the tables if they exist before creating new ones.
-
-    Steps performed:
-    1. Sets the schema to the provided database name.
-    2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
-    3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
-    4. Prints the names of the newly created tables along with the database name.
-    5. Captures and prints the first line of any exceptions that occur during these operations.
-
-    Returns:
-    None
+    Initialize the feature store environment by creating catalog tables and views.
     """
 
     from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
     from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
 
     tdfs4ds.SCHEMA = database
+    logger_safe("info", "Setting up feature store in database: %s", database)
+
     if if_exists == 'replace':
-
-
-
-
-
-
-
-            print(str(e).split('\n')[0])
-        try:
-            tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
-        except Exception as e:
-            print(str(e).split('\n')[0])
+        logger_safe("info", "Replacing existing catalog tables if they exist.")
+        for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
+            try:
+                tdml.db_drop_table(table_name=table, schema_name=database)
+                logger_safe("info", "Dropped table %s.%s", database, table)
+            except Exception as e:
+                logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
 
         DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
+
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
-
+        logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
     except Exception as e:
-
+        logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
-        tdfs4ds.PROCESS_CATALOG_NAME,
-
-
-
+        (tdfs4ds.PROCESS_CATALOG_NAME,
+         tdfs4ds.DATA_DISTRIBUTION_NAME,
+         tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
+
+        logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
+        logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
+        logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
     except Exception as e:
-
+        logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
+        logger_safe("info", "Follow-up table created successfully.")
     except Exception as e:
-
+        logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+
     dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
+    logger_safe("info", "Setup complete.")
     return
 
+
 def connect(
     database = tdfs4ds.SCHEMA,
     feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
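For context, a hedged usage sketch of the reworked `setup()`; the host, credentials, and database name are placeholders, and a live teradataml connection is assumed:

import teradataml as tdml
import tdfs4ds

tdml.create_context(host="...", username="...", password="...")

# 'replace' drops and recreates the feature, process, and distribution catalogs.
tdfs4ds.setup(database="MY_FEATURE_DB", if_exists="replace")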
@@ -166,15 +165,15 @@ def connect(
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
     process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
     dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
-    create_if_missing = False
+    create_if_missing = False
 ):
-    if database is not None:
-        tdfs4ds.SCHEMA = database
-    else:
+    if database is None:
         raise ValueError("database parameter is None.")
+    tdfs4ds.SCHEMA = database
+    logger_safe("info", "Connecting to feature store in database: %s", database)
 
     tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
-
+
     feature_exists = feature_catalog_name.lower() in tables
     process_exists = process_catalog_name.lower() in tables
     distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +182,20 @@ def connect(
 
     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
         if not create_if_missing:
-
-
-
-
-
-
-
-
-
-
-
-
-            # Follow-up table handling
+            logger_safe("warning", "Feature store components missing and create_if_missing=False")
+            return False
+        logger_safe("info", "Missing components detected; creating missing parts...")
+        if not feature_exists:
+            tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
+        if not process_exists:
+            tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
+        if not distrib_exists:
+            tdfs4ds.data_distribution.data_distribution_catalog_creation()
+        if not filter_manager_exists:
+            tdfs4ds.filter_manager.filter_manager_catalog_creation()
+
     if not followup_name_exists:
+        logger_safe("info", "Creating follow-up table: %s", followup_name)
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
         tdfs4ds.FOLLOW_UP_NAME = followup_name
 
@@ -210,30 +209,31 @@ def connect(
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
-
-        print('upgrade to the latest DDL')
+        logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
         tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
 
-    # Dataset
+    # Dataset Catalog
     tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
-    dataset_catalog = DatasetCatalog(schema_name=database, name=
+    dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
 
-    #
+    # Detect temporal distribution
    def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
             view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
             schema_name=tdfs4ds.SCHEMA,
             object_type='table'
         )
-
+
     tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
-
-    return True
+    logger_safe("info", "Connected to feature store successfully.")
+    return True
+
 
 
 
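A corresponding sketch for the rewritten `connect()`; with `create_if_missing=True` the missing catalog objects are created instead of returning False (the database name is a placeholder):

import tdfs4ds

ok = tdfs4ds.connect(database="MY_FEATURE_DB", create_if_missing=True)
if not ok:
    raise RuntimeError("feature store components are missing")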
@@ -287,50 +287,22 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
-def run(process_id, return_dataset
+def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
     """
     Executes a specific process from the feature store identified by the process ID.
-
-
-    Parameters:
-    - process_id (str): The unique identifier of the process to run.
-    - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
-      Default is False.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-    Returns:
-    DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
-
-    This function performs the following steps:
-    1. Determines the process type and initializes necessary variables.
-    2. Constructs and executes a SQL query to retrieve process details by process ID.
-    3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
-    4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
-    5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
-    6. Optionally returns the dataset created during the process if return_dataset is True.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      data retrieval to feature uploading.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
+    Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
     """
 
     if tdfs4ds.PROCESS_TYPE is None:
         PROCESS_TYPE_ = 'RUN PROCESS'
-        tdfs4ds.RUN_ID
+        tdfs4ds.RUN_ID = str(uuid.uuid4())
     else:
         PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
 
-    if tdfs4ds
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
 
-    if tdfs4ds.FEATURE_STORE_TIME
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
     else:
         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
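The removed docstring above describes the `force_varchar_length` rule: a VARCHAR feature of original length L is cast to VARCHAR(k x force_varchar_length), where k is the smallest integer with L <= k x force_varchar_length. A worked sketch of that rounding; the helper name is hypothetical:

import math

def bucketed_varchar_length(original_length: int, bucket: int) -> int:
    # k = ceil(L / bucket) is the smallest k with L <= k * bucket.
    k = math.ceil(original_length / bucket)
    return k * bucket

assert bucketed_varchar_length(300, 1024) == 1024   # k = 1
assert bucketed_varchar_length(1500, 1024) == 2048  # k = 2
assert bucketed_varchar_length(1024, 1024) == 1024  # exact multiple stays put

Bucketing lengths this way keeps features with slightly different VARCHAR sizes in the same feature table instead of spawning one table per length.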
@@ -342,148 +314,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
     WHERE A.PROCESS_ID = '{process_id}'
     """
 
+    logger_safe(
+        "info",
+        "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
+        tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
+    )
+
     # Executing the query and converting the result to Pandas DataFrame
     df = tdml.DataFrame.from_query(query).to_pandas()
 
-    # Check if exactly one record is returned, else
+    # Check if exactly one record is returned, else log an error and return
     if df.shape[0] != 1:
-
-
-
+        logger_safe(
+            "error",
+            "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
+            df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
+        )
         return
 
-
     # Fetching the filter manager
     filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
     if filter_schema_name is None:
         filtermanager = None
     else:
         filter_view_name = df['FILTER_VIEW_NAME'].values[0]
-        filter_table_name = df['FILTER_TABLE_NAME'].values[0]
+        filter_table_name = df['FILTER_TABLE_NAME'].values[0]  # kept for parity; not used directly here
         filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
 
-    # Fetching
-    process_type
-
-    # Fetching the primary index from the query result
-    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
+    # Fetching process metadata
+    process_type = df['PROCESS_TYPE'].values[0]
+    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
     if primary_index is not None:
-        primary_index = primary_index.split(',')
-
-
-
-
-
-
+        primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
+    partitioning = df['FOR_DATA_PARTITIONING'].values[0]
+    DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
+
+    logger_safe(
+        "info",
+        "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
+        process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
+    )
 
     # Handling 'denormalized view' process type
     if process_type == 'denormalized view':
-
-
-        entity_id = df['ENTITY_ID'].values[0].split(',')
+        view_name = df['VIEW_NAME'].values[0]
+        entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
         entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
-        feature_names
+        feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
 
-        # Fetching data and uploading features to the feature store
         df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
 
-        if tdfs4ds
-
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "run | entity_id=%s", entity_id)
+            logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
+            logger_safe("debug", "run | feature_names=%s", feature_names)
+            logger_safe("debug", "run | process_id=%s", process_id)
+            logger_safe("debug", "run | primary_index=%s", primary_index)
+            logger_safe("debug", "run | partitioning=%s", partitioning)
+
         dataset = _upload_features(
             df_data,
             entity_id,
             feature_names,
-            feature_versions
-            primary_index
-            partitioning
-            filtermanager
-            entity_null_substitute
-            process_id
-            force_compute=
-            force_varchar_length
+            feature_versions=process_id,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            filtermanager=filtermanager,
+            entity_null_substitute=entity_null_substitute,
+            process_id=process_id,
+            force_compute=force_compute,
+            force_varchar_length=force_varchar_length
         )
 
     # Handling 'tdstone2 view' process type
     elif process_type == 'tdstone2 view':
-
-
+        logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
+        dataset = None
 
+    else:
+        logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
+        dataset = None
 
     if return_dataset:
+        logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return dataset
     else:
+        logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return
 
-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
-    """
-    Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
-    process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
-    for further use or inspection.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is True.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is 1024.
-    Returns:
-    DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
-      or further processing.
-
-    The process involves several steps, including entity ID type conversion if necessary, feature name normalization,
-    process registration in the feature store, and the execution of SQL queries to insert the data. The function concludes
-    by returning a dataset derived from the uploaded data, offering immediate access to the newly stored information.
-
-    Example:
-    >>> df = tdml.DataFrame(...)
-    >>> entity_id = ['customer_id']
-    >>> feature_names = ['age', 'income']
-    >>> dataset = upload_features(df, entity_id, feature_names)
-    >>> # Another example with list-based entity_id, custom primary_index, and partitioning
-    >>> tddf = tdml.DataFrame(...)  # Assuming tddf is predefined with appropriate columns
-    >>> entity_id = ['tx_type', 'txn_id']
-    >>> primary_index = ['txn_id']
-    >>> partitioning = '''
-    ... PARTITION BY CASE_N (
-    ...     tx_type LIKE 'DEBIT',
-    ...     tx_type LIKE 'PAYMENT',
-    ...     tx_type LIKE 'CASH_OUT',
-    ...     tx_type LIKE 'CASH_IN',
-    ...     tx_type LIKE 'TRANSFER',
-    ...     NO CASE,
-    ...     UNKNOWN)'''
-    >>> features = [x for x in tddf.columns if x not in entity_id]
-    >>> dataset = upload_features(
-    ...     df = tddf,
-    ...     entity_id = entity_id,
-    ...     feature_names = features,
-    ...     metadata = {'project': 'test'},
-    ...     primary_index = primary_index,
-    ...     partitioning = partitioning
-    ... )
+def upload_features(
+        df,
+        entity_id,
+        feature_names,
+        metadata={},
+        primary_index=None,
+        partitioning='',
+        filtermanager=None,
+        entity_null_substitute={},
+        force_compute=True,
+        force_varchar_length=1024
+):
+    """
+    Uploads feature data from a DataFrame to the feature store for a specified entity.
+    All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
     """
 
     from tdfs4ds.utils.info import get_column_types
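The removed docstring's usage example still illustrates the call shape of the rewritten `upload_features`; the source table name below is a placeholder, while the column names and partitioning clause come from that example:

import teradataml as tdml
import tdfs4ds

tddf = tdml.DataFrame("my_transactions")  # placeholder source table
entity_id = ['tx_type', 'txn_id']
features = [x for x in tddf.columns if x not in entity_id]

partitioning = '''
PARTITION BY CASE_N (
    tx_type LIKE 'DEBIT',
    tx_type LIKE 'PAYMENT',
    tx_type LIKE 'CASH_OUT',
    tx_type LIKE 'CASH_IN',
    tx_type LIKE 'TRANSFER',
    NO CASE,
    UNKNOWN)'''

dataset = tdfs4ds.upload_features(
    df=tddf,
    entity_id=entity_id,
    feature_names=features,
    metadata={'project': 'test'},
    primary_index=['txn_id'],
    partitioning=partitioning
)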
@@ -491,45 +425,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
     from tdfs4ds.process_store.process_registration_management import register_process_view
 
     # Convert entity_id to a dictionary if it's not already one
-    if
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-    elif
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    elif isinstance(entity_id, str):
         entity_id = [entity_id]
         entity_id = get_column_types(df, entity_id)
-
-
-
-    if
-
-
-
-        feature_names = feature_names.split(',')
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Normalize feature_names
+    if not isinstance(feature_names, list):
+        logger_safe("debug", "feature_names is not a list: %s", feature_names)
+        if isinstance(feature_names, str) and ',' in feature_names:
+            feature_names = [x.strip() for x in feature_names.split(',')]
         else:
             feature_names = [feature_names]
-
-
-
-
-    if primary_index is not None and
-
-
-
-        primary_index = primary_index.split(',')
+        logger_safe("debug", "feature_names converted to list: %s", feature_names)
+        logger_safe("debug", "Check the conversion is as expected.")
+
+    # Normalize primary_index
+    if primary_index is not None and not isinstance(primary_index, list):
+        logger_safe("debug", "primary_index is not a list: %s", primary_index)
+        if isinstance(primary_index, str) and ',' in primary_index:
+            primary_index = [x.strip() for x in primary_index.split(',')]
         else:
             primary_index = [primary_index]
-
-
-        print('check it is a expected.')
+        logger_safe("debug", "primary_index converted to list: %s", primary_index)
+        logger_safe("debug", "Check the conversion is as expected.")
 
+    # Partitioning
     partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
 
-
-    print("filtermanager", filtermanager)
+    logger_safe("debug", "filtermanager: %s", filtermanager)
 
-    # Register
+    # Register process -> get SQL(s) + process_id
     query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
         view_name = df,
         entity_id = entity_id,
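The normalization above accepts comma-separated strings as well as lists for `feature_names` and `primary_index`. The rule in isolation, as a hypothetical helper combining the checks:

def to_list(value):
    # Split comma-separated strings, strip blanks, wrap a plain scalar in a list.
    if isinstance(value, list):
        return value
    if isinstance(value, str) and ',' in value:
        return [x.strip() for x in value.split(',') if x.strip()]
    return [value]

assert to_list("age, income") == ["age", "income"]
assert to_list("age") == ["age"]
assert to_list(["age"]) == ["age"]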
@@ -542,104 +473,171 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
         entity_null_substitute = entity_null_substitute
     )
 
-
-    execute_query(query_insert)
-    execute_query(query_insert_dist)
-    if tdfs4ds.DEBUG_MODE:
-        print("query_insert_filtermanager",query_insert_filtermanager)
-    if query_insert_filtermanager is not None:
-        execute_query(query_insert_filtermanager)
+    logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
 
-    #
-
-
-
-
-
-
+    # Execute queries
+    try:
+        execute_query(query_insert)
+        logger_safe("info", "Executed main insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
+        raise
 
-
+    try:
+        execute_query(query_insert_dist)
+        logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
+        raise
 
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        # Avoid dumping entire SQL in normal logs; keep it debug-only.
+        logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
 
+    if query_insert_filtermanager is not None:
+        try:
+            execute_query(query_insert_filtermanager)
+            logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
         except Exception as e:
-
-                run_id = tdfs4ds.RUN_ID,
-                process_type = tdfs4ds.PROCESS_TYPE,
-                process_id = process_id,
-                status = 'FAILED,' + str(e).split('\n')[0]
-            )
+            logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
             raise
 
+    # Run the registered process (with/without dataset)
+    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
+    tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
+    if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+        tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
+    tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
+    logger_safe(
+        "info",
+        "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
+        tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
+    )
+
+    try:
+        if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+            dataset = run(
+                process_id=process_id,
+                return_dataset=True,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return dataset
+        else:
+            run(
+                process_id=process_id,
+                return_dataset=False,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return
 
+    except Exception as e:
+        # Keep your existing follow-up close behavior, but ensure the error is logged.
         try:
-            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
-        except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,
                 process_type = tdfs4ds.PROCESS_TYPE,
                 process_id = process_id,
                 status = 'FAILED,' + str(e).split('\n')[0]
             )
-
-
+        finally:
+            logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
+                        tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
+            )
+            raise
+    finally:
+        # Restore previous process type just in case the caller relies on it.
+        tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-def _upload_features(df, entity_id, feature_names,
-                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
-    """
-    Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
-    feature registration, preparation for ingestion, and storage in the designated feature tables.
+def _upload_features(
+        df, entity_id, feature_names,
+        feature_versions=FEATURE_VERSION_DEFAULT,
+        primary_index=None, partitioning='',
+        filtermanager=None, entity_null_substitute={},
+        process_id=None, force_compute=False,
+        force_varchar_length=None
+):
+    """
+    Uploads a set of features into the Feature Store for a given entity.
 
+    This function registers an entity and its associated features in the feature catalog
+    if they are not already defined, prepares the data for ingestion, and stores it in the
+    feature store. It also supports incremental feature computation and conditional execution
+    depending on prior runs.
 
-
-
-
-
-
-
-
-
-
-
-
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Input dataframe containing entity keys and feature columns to upload.
+    entity_id : str, list, or dict
+        Identifier(s) for the entity. Can be:
+        - A string (single entity key)
+        - A list of key column names
+        - A dict mapping column names to data types
+        If not a dict, entity metadata is inferred automatically.
+    feature_names : list of str
+        List of feature column names to upload from `df`.
+    feature_versions : dict or int, optional
+        Feature version(s). If a single integer is provided, it is applied to all features.
+        If a dict is provided, it maps each feature name to its version.
+        Default is FEATURE_VERSION_DEFAULT.
+    primary_index : str or list, optional
+        Primary index to use when storing features in Teradata.
+    partitioning : str, optional
+        Partitioning clause for feature store tables. Default is ''.
+    filtermanager : FilterManager, optional
+        If provided, features are built iteratively per filter step.
+    entity_null_substitute : dict, optional
+        Replacement values for nulls in entity keys.
+        Example: {'customer_id': -1}
+    process_id : str, optional
+        Identifier for the process execution, used for follow-up logging.
+    force_compute : bool, optional
+        If True, forces recomputation even if the same process_id and timestamp were
+        already computed earlier. If False, the computation is skipped when existing
+        results are detected. Default is False.
+    force_varchar_length : int, optional
+        If provided, all VARCHAR feature columns are resized to this length
+        before ingestion.
+
+    Returns
+    -------
+    pandas.DataFrame or None
+        If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
+        ingested features for validation. Otherwise, returns None.
+
+    Notes
+    -----
+    - Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
+    - Logs ingestion status in process follow-up tables.
+    - Skips ingestion when existing completed results are found unless
+      `force_compute=True`.
+    - Applies Teradata-optimized storage and statistics collection.
+
+    Raises
+    ------
+    ValueError
+        If unsupported data types are found (CLOB/BLOB/JSON).
+    Exception
+        For ingestion failure or storage errors.
 
-    - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
-      enhance query performance based on the access patterns.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-
-
-
-
-
-
+    Example
+    -------
+    >>> _upload_features(
+    ...     df=dataframe,
+    ...     entity_id="customer_id",
+    ...     feature_names=["age", "credit_score"],
+    ...     process_id="customer_features_v1",
+    ...     force_compute=False
+    ... )
     """
-
+
     from tdfs4ds.feature_store.entity_management import register_entity
     from tdfs4ds.feature_store.feature_store_management import Gettdtypes
     from tdfs4ds.feature_store.feature_store_management import register_features
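The rewritten control flow above temporarily overrides the global `PROCESS_TYPE` and restores it in a `finally` block so callers see their original value even on failure. The pattern reduced to its core; all names below are stand-ins:

from types import SimpleNamespace

ctx = SimpleNamespace(PROCESS_TYPE='RUN PROCESS')  # stand-in for tdfs4ds globals

def with_temporary_process_type(new_type, action):
    previous = ctx.PROCESS_TYPE
    ctx.PROCESS_TYPE = new_type
    try:
        return action()
    finally:
        # Restored on success *and* on exception, as in the diff.
        ctx.PROCESS_TYPE = previous

result = with_temporary_process_type('UPLOAD_FEATURES', lambda: ctx.PROCESS_TYPE)
assert result == 'UPLOAD_FEATURES'
assert ctx.PROCESS_TYPE == 'RUN PROCESS'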
@@ -647,194 +645,180 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
     from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
-    # Convert entity_id to a dictionary if
-    if
+    # Convert entity_id to a dictionary if not already
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-
-        entity_id
-
-
-
-
-    #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
-
-    # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
-    # If feature_versions is a string, create a dictionary mapping each feature name to this string.
-    if type(feature_versions) == list:
-        selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+    elif isinstance(entity_id, str):
+        entity_id = get_column_types(df, [entity_id])
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Map feature versions
+    if isinstance(feature_versions, list):
+        selected_features = dict(zip(feature_names, feature_versions))
     else:
         selected_features = {k: feature_versions for k in feature_names}
 
-    # Get
-    feature_names_types = Gettdtypes(
-        df,
-        features_columns=feature_names,
-        entity_id=entity_id
-    )
+    # Get Teradata types for features
+    feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
 
     if force_varchar_length is not None:
-
-        feature_names_types = update_varchar_length(
+        logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
+        feature_names_types = update_varchar_length(
+            feature_names_types,
+            new_varchar_length=force_varchar_length
+        )
 
     def validate_feature_types(feature_names_types):
-
-
-
-
-
-        feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
-
-        Raises:
-        ValueError: If any feature type contains 'clob', 'blob', or 'json'.
-        """
-        invalid_types = {key: value['type'] for key, value in feature_names_types.items()
-                         if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
-
-        if invalid_types:
+        invalid = {
+            k: v['type'] for k, v in feature_names_types.items()
+            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
+        }
+        if invalid:
             raise ValueError(
-                f"
-                "
+                f"Unsupported data types found: {invalid}. "
+                "CLOB/BLOB/JSON are not supported."
             )
-
-    validate_feature_types(feature_names_types)
-
+
+    validate_feature_types(feature_names_types)
+
+    logger_safe("info", "Registering entity %s in feature store", entity_id)
     register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
 
-    if tdfs4ds
-
-
-
-
-
-
-
-
-
-
-
-
-        primary_index,
-        partitioning
-    )
-
-    if tdfs4ds.DEBUG_MODE:
-        print("---------_upload_features")
-        print("filtermanager : ", filtermanager)
-        print("feature names : ", feature_names)
-        print("selected features : ", selected_features)
-
-    if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
+            entity_id, entity_null_substitute, feature_names, primary_index, partitioning
+        )
+        logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
+
+    register_features(entity_id, feature_names_types, primary_index, partitioning)
+    logger_safe("info", "Features registered in catalog: %s", feature_names)
+
+    follow_up = None
+    if process_id and tdfs4ds.FEATURE_STORE_TIME:
         follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-        follow_up = follow_up[
-
-
-
-
-
-        do_compute = False
+        follow_up = follow_up[
+            (follow_up.STATUS == 'COMPLETED') &
+            (follow_up.VALIDTIME_DATE.isna() == False) &
+            (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
+            (follow_up.PROCESS_ID == process_id)
+        ]
 
-
+    if filtermanager is None:
+        do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
+        if not do_compute and not force_compute:
+            logger_safe(
+                "info",
+                "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                process_id, tdfs4ds.FEATURE_STORE_TIME
+            )
         if do_compute or force_compute:
-
+            logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
             tdfs4ds.process_store.process_followup.followup_open(
-                run_id
-                process_type
-                process_id
+                run_id=tdfs4ds.RUN_ID,
+                process_type=tdfs4ds.PROCESS_TYPE,
+                process_id=process_id
            )
-
            try:
-                prepared_features,
-                    df,
-                    entity_id,
-                    feature_names,
+                prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                    df, entity_id, feature_names,
                     feature_versions=selected_features,
                     primary_index=primary_index,
                     entity_null_substitute=entity_null_substitute,
                     partitioning=partitioning
                 )
-
-
-
-                    volatile_table_name,
-                    entity_null_substitute=entity_null_substitute,
-                    primary_index=primary_index,
-                    partitioning=partitioning,
-                    features_infos = features_infos
-                )
-
-                # Collect statistics
-                apply_collect_stats(
-                    entity_id,
-                    primary_index = primary_index,
-                    partitioning = partitioning,
-                    feature_infos = features_infos
-                )
+                store_feature(entity_id, volatile_table, entity_null_substitute,
+                              primary_index, partitioning, features_infos)
+                apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id
                 )
+                logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
 
             except Exception as e:
+                logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
-                    status
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id,
+                    status='FAILED,' + str(e).split('\n')[0]
                 )
                 raise
+
     else:
-        # get the total number of filter condition in the filter manager
-        nb_filters = filtermanager.nb_filters
 
-
+        logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
         something_computed = False
+        pbar = tqdm(
+            range(filtermanager.nb_filters),
+            total=filtermanager.nb_filters,
+            desc="Applying filters",
+            unit="filter",
+            leave=False
+        )
 
-        for i in
+        for i in pbar:
+            filter_id = i + 1
+            filtermanager.update(filter_id)
 
-
-
+            try:
+                pbar.set_description(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
 
-
-
-
+                # Convert datetime columns to string
+                df_bar = filtermanager.display().to_pandas().astype(object)  # avoid conversion issues
+                for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
+                    df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
 
-            #
-
-            follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
-                follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
+                # Convert to JSON object (dict)
+                bar_info = df_bar.iloc[0].to_dict()
 
-
-
+                # ---- ADD THIS: handle python date objects ----
+                from datetime import date, datetime
+                for key, value in bar_info.items():
+                    if isinstance(value, (date, datetime)):  # convert date/datetime to string
+                        bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
+                # ----------------------------------------------
 
-
-
-
-
-
-
-
-
-                FROM {filtermanager.schema_name}.{filtermanager.view_name}
-                """
-                ),
-                on = 'APPLIED_FILTER',
-                how = 'inner',
-                lprefix = 'l',
-                rprefix = 'r'
-            )
-            # if already computed and completed, then do_compute is set to False
-            if follow_up_.shape[0] > 0:
-                do_compute = False
+                bar_info = str(bar_info)
+                if len(bar_info) > 120:
+                    bar_info = bar_info[:117] + "..."
+                pbar.set_postfix_str(bar_info)
+
+            except Exception:
+                # postfix is optional; ignore errors from display() here
+                pass
 
-
-
+            logger_safe("debug", "Applying filter %s/%s:\n%s",
+                        i + 1, filtermanager.nb_filters, filtermanager.display())
 
+            do_compute = True
+            if process_id and tdfs4ds.FEATURE_STORE_TIME:
+                # see if already computed
+                follow_up = tdfs4ds.process_store.process_followup.follow_up_report(process_id=process_id, filtermanager=filtermanager)
+                follow_up = follow_up[
+                    (follow_up.STATUS == 'COMPLETED') &
+                    (follow_up.VALIDTIME_DATE.isna() == False) &
+                    (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME)
+                ]
+
+                if follow_up.shape[0] > 0:
+                    do_compute = False
+
+            if not do_compute and not force_compute:
+                logger_safe(
+                    "info",
+                    "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                    process_id, tdfs4ds.FEATURE_STORE_TIME
+                )
+                pbar.colour = "green"
             if do_compute or force_compute:
+                pbar.colour = "blue"
                 tdfs4ds.process_store.process_followup.followup_open(
                     run_id = tdfs4ds.RUN_ID,
                     process_type = tdfs4ds.PROCESS_TYPE,
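The tightened `validate_feature_types` above rejects LOB-style columns up front. Standalone, the same check behaves like this (the type strings are illustrative):

def validate_feature_types(feature_names_types):
    # Reject any feature whose Teradata type mentions CLOB, BLOB, or JSON.
    invalid = {
        k: v['type'] for k, v in feature_names_types.items()
        if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
    }
    if invalid:
        raise ValueError(
            f"Unsupported data types found: {invalid}. "
            "CLOB/BLOB/JSON are not supported."
        )

validate_feature_types({'age': {'type': 'BIGINT'}})   # passes silently
try:
    validate_feature_types({'doc': {'type': 'CLOB'}})
except ValueError as e:
    print(e)  # Unsupported data types found: {'doc': 'CLOB'}. ...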
@@ -842,83 +826,58 @@ def _upload_features(df, entity_id, feature_names,
                     filtermanager = filtermanager
                 )
                 try:
-
-
-                        df,
-                        entity_id,
-                        feature_names,
+                    prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                        df, entity_id, feature_names,
                         feature_versions = selected_features,
                         primary_index = primary_index,
                         entity_null_substitute = entity_null_substitute,
                         partitioning = partitioning
                     )
 
-
-
-
-                        volatile_table_name,
-                        entity_null_substitute=entity_null_substitute,
-                        primary_index = primary_index,
-                        partitioning = partitioning,
-                        features_infos=features_infos
-
-                    )
-
-                    # indicate that something has been processed:
+                    store_feature(entity_id, volatile_table, entity_null_substitute,
+                                  primary_index, partitioning, features_infos)
+
                     something_computed = True
 
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
                         filtermanager = filtermanager
                     )
 
                 except Exception as e:
-
+                    logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
-                        status='FAILED,' + str(e).split('\n')[0],
-                        filtermanager=filtermanager
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
+                        status = 'FAILED,' + str(e).split('\n')[0],
+                        filtermanager = filtermanager
                    )
                     raise
-            # Clean up by dropping the temporary volatile table.
-            # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
 
-        # Collect statistics only if something has been computed
         if something_computed:
-            apply_collect_stats(
-                entity_id,
-                primary_index = primary_index,
-                partitioning = partitioning,
-                feature_infos = features_infos
-            )
+            apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-    # Build a dataset view in the feature store.
     if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
-
+        logger_safe("info", "Building dataset for validation...")
         try:
-
-                entity_id,
-                selected_features,
+            return build_dataset(
+                entity_id, selected_features,
                 view_name=None,
-                entity_null_substitute
+                entity_null_substitute=entity_null_substitute
             )
         except Exception as e:
-
-
-            print('entity :', entity_id)
-            print('selected features :', selected_features)
-
-            # Return the dataset view.
-            return dataset
+            logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+            logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
     else:
-
+        logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
     return
 
 
+
+
 def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
                   feature_store_time=False, join_type='INNER'):
     """
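The filter loop above decorates the progress bar with the current filter row, truncating it so the postfix stays readable. The same trick in isolation, on synthetic rows:

from tqdm.auto import tqdm

rows = [{"APPLIED_FILTER": i, "NOTE": "x" * 200} for i in range(3)]
pbar = tqdm(rows, desc="Applying filters", unit="filter", leave=False)
for row in pbar:
    info = str(row)
    if len(info) > 120:
        info = info[:117] + "..."  # same 120-character cap as the diff
    pbar.set_postfix_str(info)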
@@ -935,6 +894,10 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     selected_features : dict
         A dictionary where the keys are feature table names, and the values are lists of tuples
         (feature_id, feature_version, feature_name) specifying the features to retrieve.
+        NOTE: feature_version may be either:
+          - a single UUID string, or
+          - a list of dicts like:
+            {"process_id": <UUID>, "process_view_name": <str>}

     view_name : str
         The name of the view to be created in the database.
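
For illustration only, a `selected_features` value mixing both accepted forms might look like the sketch below; the table name, feature names, IDs, and UUID placeholders are hypothetical, not taken from the package:

    selected_features = {
        "FS_DB.FS_T_CUSTOMER_FEATURES": [
            # feature_version as a single UUID string
            (101, "<feature-version-uuid>", "tx_count"),
            # feature_version as a list of process dicts
            (102, [{"process_id": "<process-uuid>", "process_view_name": "MONTHLY_AGG"}], "avg_amount"),
        ],
    }
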
@@ -1004,6 +967,24 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     # Sort the entity ID list for consistent query generation
     list_entity_id.sort()

+    # Helpers
+    import re
+    def _sanitize_identifier(name: str) -> str:
+        # Keep letters, numbers, and underscores; replace others with '_'
+        return re.sub(r'[^0-9A-Za-z_]', '_', name)
+
+    used_alias_counts = {}  # base_alias -> count
+
+    def _unique_alias(base: str) -> str:
+        """
+        Ensure alias uniqueness: if base already used, append _2, _3, ...
+        """
+        if base not in used_alias_counts:
+            used_alias_counts[base] = 1
+            return base
+        used_alias_counts[base] += 1
+        return f"{base}_{used_alias_counts[base]}"
+
     # Initialize sub-query construction
     tdfs4ds.logger.info("Generating the sub-queries for feature retrieval.")
     sub_queries = []
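
As a standalone sketch of how these two alias helpers behave (runnable outside the package; the feature names below are made up):

    import re

    used_alias_counts = {}

    def _sanitize_identifier(name: str) -> str:
        # Replace anything that is not a letter, digit, or underscore
        return re.sub(r'[^0-9A-Za-z_]', '_', name)

    def _unique_alias(base: str) -> str:
        # First use returns the base; later collisions get _2, _3, ...
        if base not in used_alias_counts:
            used_alias_counts[base] = 1
            return base
        used_alias_counts[base] += 1
        return f"{base}_{used_alias_counts[base]}"

    print(_unique_alias(_sanitize_identifier("sales amount")))  # sales_amount
    print(_unique_alias(_sanitize_identifier("sales-amount")))  # sales_amount_2

This is why two features whose sanitized names collide still receive distinct column aliases in the dataset view.
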
@@ -1014,21 +995,52 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     # Construct sub-queries for each feature
     for k, v in list_features.items():
         for feature_id, feature_version, feature_name in v:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Multiple processes: list of dicts
+            if isinstance(feature_version, list):
+                for item in feature_version:
+                    process_id = item.get("process_id")
+                    process_view_name = item.get("process_view_name") or "PROCESS"
+                    base_alias = _sanitize_identifier(f"{feature_name}_{process_view_name}")
+                    alias = _unique_alias(base_alias)
+
+                    txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
+                    feature_str = ',B1.FEATURE_VALUE AS ' + alias
+
+                    sub_queries.append(
+                        {
+                            'feature_name': alias,
+                            'query': f"""
+                            SEQUENCED VALIDTIME
+                            SELECT
+                            {txt_entity}
+                            {feature_str}
+                            FROM {k} B1
+                            WHERE {txt_where}
+                            """
+                        }
+                    )
+
+            # Single UUID
+            else:
+                base_alias = _sanitize_identifier(feature_name)
+                alias = _unique_alias(base_alias)
+
+                txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
+                feature_str = ',B1.FEATURE_VALUE AS ' + alias
+                sub_queries.append(
+                    {
+                        'feature_name': alias,
+                        'query': f"""
+                        SEQUENCED VALIDTIME
+                        SELECT
+                        {txt_entity}
+                        {feature_str}
+                        FROM {k} B1
+                        WHERE {txt_where}
+                        """
+                    }
+                )

     # Handle case where no features are available
     if len(sub_queries) == 0:
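
To make the construction concrete: for a single-UUID feature_version, one such sub-query renders to roughly the text below, assuming `txt_entity` expands to the entity-ID column selection (here a hypothetical `B1.customer_id`) and using placeholder table name, feature ID, and version:

    SEQUENCED VALIDTIME
    SELECT
    B1.customer_id
    ,B1.FEATURE_VALUE AS tx_count
    FROM FS_DB.FS_T_CUSTOMER_FEATURES B1
    WHERE (FEATURE_ID = 101 AND FEATURE_VERSION='<feature-version-uuid>')

Each sub-query contributes one aliased feature column; the sub-queries are then joined on the entity ID columns (per `join_type`) to assemble the final dataset view.
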
@@ -1102,6 +1114,7 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     return tdml.DataFrame.from_table(tdml.in_schema(schema_name, view_name))


+
 def build_dataset_opt(entity_id, selected_features, view_name = None, schema_name=tdfs4ds.SCHEMA,
                       comment='dataset', no_temporal=False, time_manager=None, query_only=False, entity_null_substitute={},
                       other=None, time_column=None, filtermanager = None, filter_conditions = None
@@ -1280,82 +1293,91 @@ def upload_tdstone2_scores(model):
     return dataset


-def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
+def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
     """
-    Executes a series of processes for each date in a given list, managing
+    Executes a series of processes for each date in a given list, managing time, computation settings, and logging.

     This function iterates over a range of time steps, updating a TimeManager object with each step, and then
-    executes a list of processes for that time step. It also manages
-    and
+    executes a list of processes for that time step. It also manages synchronization of time for the feature store
+    and optionally controls forced computation and log display behavior.

     Parameters:
     - process_list (list): A list of process IDs that need to be executed for each time step.
-    - time_manager (TimeManager
+    - time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
     - time_id_start (int, optional): The starting time step ID. Default is 1.
-    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+      time manager.
+    - force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
+      Default is False.
+    - force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
+      is disabled. Default is False.

     Side Effects:
-    -
+    - Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
+    - Restores DISPLAY_LOGS setting after execution.
     - Catches and prints exceptions along with the time step on which they occurred.

-
-    1. Disables display logs
-    2.
-    3.
-    4.
-    5.
-    6.
+    Steps performed:
+    1. Disables display logs by default unless `force_display_logs` is True.
+    2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
+    3. Iterates over the specified range of time steps.
+    4. Updates the time manager with the current time step.
+    5. Synchronizes the feature store time with the current time step.
+    6. Executes each process in the process list with optional forced computation.
+    7. Restores original display log settings after completion.

     Example:
     >>> process_list = ['process_1', 'process_2']
     >>> time_manager = TimeManager(...)
-    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
+    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
     """

-    #global DISPLAY_LOGS
-    #global FEATURE_STORE_TIME
-
     # Disable display logs
     temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
     tdfs4ds.DISPLAY_LOGS = False
+    if force_display_logs:
+        tdfs4ds.DISPLAY_LOGS = True
     PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
     tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
     tdfs4ds.RUN_ID = str(uuid.uuid4())

-
-
     try:
+        # Define range of time steps
         if time_id_end is None:
-
+            time_range = range(time_id_start, time_manager.nb_time_steps + 1)
         else:
-
-
+            time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
+
+        # Progress bar
+        pbar = tqdm(time_range, desc="Starting rollout", unit="step")
+
         for i in pbar:
-            # Update
-            time_manager.update(time_id
+            # Update time manager
+            time_manager.update(time_id=i)
             date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
-
-            #
+
+            # Sync feature store time
             tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
-
+
+            # Display current progress in tqdm
+            pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
+
             if tdfs4ds.DEBUG_MODE:
-                print(
-                print(
-
-            # Execute
+                print("roll_out | date_:", date_)
+                print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
+
+            # Execute all processes for this time step
             for proc_id in process_list:
-                pbar.set_description(f"Processing {date_}
-                run(process_id=proc_id, force_compute=
+                pbar.set_description(f"Processing {date_} | proc {proc_id}")
+                run(process_id=proc_id, force_compute=force_compute)

+        # Restore settings
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
+
     except Exception as e:
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
-        # If an exception occurs, print the date and the first line of the exception message
-        #print(date_)
         print(str(e).split('\n')[0])
         tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
         raise

-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
-
-
+    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE