tdfs4ds 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +387 -542
- tdfs4ds/feature_store/feature_data_processing.py +367 -299
- tdfs4ds/feature_store/feature_store_management.py +189 -167
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/utils/filter_management.py +87 -53
- tdfs4ds/utils/time_management.py +67 -24
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/RECORD +11 -11
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,5 +1,6 @@
-__version__ = '0.2.4.32'
+__version__ = '0.2.4.34'
 import logging
+
 # Setup the logger
 logging.basicConfig(
     level=logging.INFO,
@@ -7,6 +8,15 @@ logging.basicConfig(
     datefmt='%Y-%m-%d %H:%M:%S' # Set the date/time format
 )
 
+# Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
+def logger_safe(level, message, *args, **kwargs):
+    """
+    Wrapper around the global `logger` that only emits logs when
+    tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
+    """
+    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
+        getattr(logger, level)(message, *args, **kwargs)
+
 logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
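For context, the new `logger_safe` helper is just a gate placed in front of the standard `logging` calls. The following is a minimal, self-contained sketch of that behaviour; the `_Tdfs4dsStub` class is a stand-in for the real `tdfs4ds` module and is purely illustrative.

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("tdfs4ds")

    class _Tdfs4dsStub:            # illustrative stand-in for the real module
        DISPLAY_LOGS = True

    tdfs4ds = _Tdfs4dsStub()

    def logger_safe(level, message, *args, **kwargs):
        # Emit only when the package-level flag is on; `level` names a logger method.
        if getattr(tdfs4ds, "DISPLAY_LOGS", True):
            getattr(logger, level)(message, *args, **kwargs)

    logger_safe("info", "schema set to %s", "MY_DB")   # emitted
    tdfs4ds.DISPLAY_LOGS = False
    logger_safe("info", "this message is suppressed")  # silent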
@@ -57,7 +67,7 @@ import tdfs4ds.datasets
 import time
 
 import inspect
-import tqdm
+from tqdm.auto import tqdm  # auto picks the right frontend (notebook/terminal)
 
 from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
 
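The switch to `tqdm.auto` only changes which progress-bar frontend is chosen (notebook widget vs. plain terminal bar); the call sites stay the same. A tiny standalone usage sketch, unrelated to the package's internals:

    from tqdm.auto import tqdm  # picks the notebook or terminal frontend automatically
    import time

    for _ in tqdm(range(3), desc="demo", unit="step"):
        time.sleep(0.1)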
@@ -70,92 +80,80 @@ PROCESS_TYPE = 'RUN PROCESS'
 try:
     SCHEMA = tdml.context.context._get_current_databasename()
     if SCHEMA is None:
-
-
+        logger.warning("No default database detected for feature store.")
+        logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
     else:
-
-
+        logger.info("Default database detected for feature store: %s", SCHEMA)
+        logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
+
     if DATA_DOMAIN is None:
         DATA_DOMAIN = SCHEMA
-
-
+        logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
+        logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
 
 except Exception as e:
-
-
+    logger.error("Could not determine current database: %s", str(e).split('\n')[0])
+    logger.warning("Please specify the feature store database manually:")
+    logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
 
 
 def setup(database, if_exists='fail'):
     """
-
-
-    This function sets the database schema for feature and process catalogs. If specified, it also handles
-    the replacement of existing catalog tables. It reports the status of these operations, including any
-    encountered exceptions.
-
-    Parameters:
-    database (str): The name of the database schema to be used.
-    if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
-        'fail' (default) - Do nothing if the tables exist.
-        'replace' - Drop the tables if they exist before creating new ones.
-
-    Steps performed:
-    1. Sets the schema to the provided database name.
-    2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
-    3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
-    4. Prints the names of the newly created tables along with the database name.
-    5. Captures and prints the first line of any exceptions that occur during these operations.
-
-    Returns:
-    None
+    Initialize the feature store environment by creating catalog tables and views.
     """
 
     from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
     from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
 
     tdfs4ds.SCHEMA = database
+    logger_safe("info", "Setting up feature store in database: %s", database)
+
     if if_exists == 'replace':
-
-
-
-
-
-
-
-            print(str(e).split('\n')[0])
-        try:
-            tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
-        except Exception as e:
-            print(str(e).split('\n')[0])
+        logger_safe("info", "Replacing existing catalog tables if they exist.")
+        for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
+            try:
+                tdml.db_drop_table(table_name=table, schema_name=database)
+                logger_safe("info", "Dropped table %s.%s", database, table)
+            except Exception as e:
+                logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
 
         DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
+
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
-
+        logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
     except Exception as e:
-
+        logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
-        tdfs4ds.PROCESS_CATALOG_NAME,
-
-
-
+        (tdfs4ds.PROCESS_CATALOG_NAME,
+         tdfs4ds.DATA_DISTRIBUTION_NAME,
+         tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
+
+        logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
+        logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
+        logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
     except Exception as e:
-
+        logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
+        logger_safe("info", "Follow-up table created successfully.")
     except Exception as e:
-
+        logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+
     dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
+    logger_safe("info", "Setup complete.")
     return
 
+
 def connect(
     database = tdfs4ds.SCHEMA,
     feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
@@ -166,15 +164,15 @@ def connect(
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
     process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
     dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
-    create_if_missing = False
+    create_if_missing = False
 ):
-    if database is not None:
-        tdfs4ds.SCHEMA = database
-    else:
+    if database is None:
         raise ValueError("database parameter is None.")
+    tdfs4ds.SCHEMA = database
+    logger_safe("info", "Connecting to feature store in database: %s", database)
 
     tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
-
+
     feature_exists = feature_catalog_name.lower() in tables
     process_exists = process_catalog_name.lower() in tables
     distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +181,20 @@ def connect(
 
     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
         if not create_if_missing:
-
-
-
-
-
-
-
-
-
-
-
-
-            # Follow-up table handling
+            logger_safe("warning", "Feature store components missing and create_if_missing=False")
+            return False
+        logger_safe("info", "Missing components detected; creating missing parts...")
+        if not feature_exists:
+            tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
+        if not process_exists:
+            tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
+        if not distrib_exists:
+            tdfs4ds.data_distribution.data_distribution_catalog_creation()
+        if not filter_manager_exists:
+            tdfs4ds.filter_manager.filter_manager_catalog_creation()
+
     if not followup_name_exists:
+        logger_safe("info", "Creating follow-up table: %s", followup_name)
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
     tdfs4ds.FOLLOW_UP_NAME = followup_name
 
@@ -210,30 +208,31 @@ def connect(
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
-
-        print('upgrade to the latest DDL')
+        logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
         tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
 
-    # Dataset
+    # Dataset Catalog
     tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
-    dataset_catalog = DatasetCatalog(schema_name=database, name=
+    dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
 
-    #
+    # Detect temporal distribution
    def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
             view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
             schema_name=tdfs4ds.SCHEMA,
             object_type='table'
         )
-
+
     tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
-
-    return True
+    logger_safe("info", "Connected to feature store successfully.")
+    return True
+
 
 
 
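Taken together, the `setup`/`connect` changes mean a missing catalog no longer ends in an opaque failure: `connect` now returns False when components are absent and `create_if_missing` is False, and creates the missing pieces otherwise. A hypothetical call sequence (the database name is illustrative, not from the package):

    import tdfs4ds

    # One-time initialisation; 'replace' drops and recreates the catalog tables.
    tdfs4ds.setup(database="MY_FEATURE_STORE_DB", if_exists="replace")

    # Later sessions: attach to the existing store, creating any missing pieces.
    ok = tdfs4ds.connect(database="MY_FEATURE_STORE_DB", create_if_missing=True)
    if not ok:
        raise RuntimeError("feature store components are missing")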
@@ -287,50 +286,22 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
-def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
+def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
     """
     Executes a specific process from the feature store identified by the process ID.
-
-
-    Parameters:
-    - process_id (str): The unique identifier of the process to run.
-    - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
-      Default is False.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-    Returns:
-    DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
-
-    This function performs the following steps:
-    1. Determines the process type and initializes necessary variables.
-    2. Constructs and executes a SQL query to retrieve process details by process ID.
-    3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
-    4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
-    5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
-    6. Optionally returns the dataset created during the process if return_dataset is True.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      data retrieval to feature uploading.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
+    Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
     """
 
     if tdfs4ds.PROCESS_TYPE is None:
         PROCESS_TYPE_ = 'RUN PROCESS'
-        tdfs4ds.RUN_ID
+        tdfs4ds.RUN_ID = str(uuid.uuid4())
     else:
         PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
 
-    if tdfs4ds.DEBUG_MODE:
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
 
-    if tdfs4ds.FEATURE_STORE_TIME
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
     else:
         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
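The `force_varchar_length` rule quoted in the removed docstring (cast VARCHAR features to VARCHAR(k x force_varchar_length), with k the smallest integer such that the original length is at most k x force_varchar_length) is simply rounding a length up to the next multiple of the bucket size. A small worked illustration, using `math.ceil` as a stand-in for whatever the package does internally:

    import math

    def bucketed_varchar_length(original_length: int, force_varchar_length: int) -> int:
        # k is the smallest integer with original_length <= k * force_varchar_length
        k = math.ceil(original_length / force_varchar_length)
        return k * force_varchar_length

    assert bucketed_varchar_length(37, 1024) == 1024    # k = 1
    assert bucketed_varchar_length(1500, 1024) == 2048  # k = 2
    assert bucketed_varchar_length(2048, 1024) == 2048  # exact multiple is unchanged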
@@ -342,148 +313,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
         WHERE A.PROCESS_ID = '{process_id}'
     """
 
+    logger_safe(
+        "info",
+        "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
+        tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
+    )
+
     # Executing the query and converting the result to Pandas DataFrame
     df = tdml.DataFrame.from_query(query).to_pandas()
 
-    # Check if exactly one record is returned, else
+    # Check if exactly one record is returned, else log an error and return
     if df.shape[0] != 1:
-
-
-
+        logger_safe(
+            "error",
+            "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
+            df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
+        )
         return
 
-
     # Fetching the filter manager
     filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
     if filter_schema_name is None:
         filtermanager = None
     else:
         filter_view_name = df['FILTER_VIEW_NAME'].values[0]
-        filter_table_name = df['FILTER_TABLE_NAME'].values[0]
+        filter_table_name = df['FILTER_TABLE_NAME'].values[0]  # kept for parity; not used directly here
         filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
 
-    # Fetching
-    process_type
-
-    # Fetching the primary index from the query result
-    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
+    # Fetching process metadata
+    process_type = df['PROCESS_TYPE'].values[0]
+    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
     if primary_index is not None:
-        primary_index = primary_index.split(',')
-
-
-
-
-
-
+        primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
+    partitioning = df['FOR_DATA_PARTITIONING'].values[0]
+    DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
+
+    logger_safe(
+        "info",
+        "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
+        process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
+    )
 
     # Handling 'denormalized view' process type
     if process_type == 'denormalized view':
-
-
-        entity_id = df['ENTITY_ID'].values[0].split(',')
+        view_name = df['VIEW_NAME'].values[0]
+        entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
         entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
-        feature_names
+        feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
 
-        # Fetching data and uploading features to the feature store
         df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
 
-        if tdfs4ds.DEBUG_MODE:
-
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "run | entity_id=%s", entity_id)
+            logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
+            logger_safe("debug", "run | feature_names=%s", feature_names)
+            logger_safe("debug", "run | process_id=%s", process_id)
+            logger_safe("debug", "run | primary_index=%s", primary_index)
+            logger_safe("debug", "run | partitioning=%s", partitioning)
+
         dataset = _upload_features(
             df_data,
             entity_id,
             feature_names,
-            feature_versions
-            primary_index
-            partitioning
-            filtermanager
-            entity_null_substitute
-            process_id
-            force_compute=
-            force_varchar_length
+            feature_versions=process_id,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            filtermanager=filtermanager,
+            entity_null_substitute=entity_null_substitute,
+            process_id=process_id,
+            force_compute=force_compute,
+            force_varchar_length=force_varchar_length
         )
 
     # Handling 'tdstone2 view' process type
     elif process_type == 'tdstone2 view':
-
-
+        logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
+        dataset = None
 
+    else:
+        logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
+        dataset = None
 
     if return_dataset:
+        logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return dataset
     else:
+        logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return
 
-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
-    """
-    Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
-    process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
-    for further use or inspection.
-
-    The function supports dynamic entity ID interpretation and flexible feature name handling, ensuring compatibility
-    with various data schemas. It automatically registers the data upload process and applies additional metadata,
-    if provided.
-
-    Parameters:
-    - df (DataFrame): The DataFrame containing the feature data to be uploaded.
-    - entity_id (dict, list, or str): The identifier of the entity to which the features belong. This can be:
-        - a dictionary mapping column names to their data types,
-        - a list of column names, which will be automatically converted to a dictionary with types inferred from `df`,
-        - a string representing a single column name, which will be converted into a list and then to a dictionary as above.
-    - feature_names (list or str): The names of the features to be uploaded. If a string is provided, it will be
-      split into a list based on commas or treated as a single feature name.
-    - metadata (dict, optional): Additional metadata to associate with the upload process. Defaults to an empty dictionary.
-    - primary_index (list, optional): Specifies the primary index columns for optimizing data storage and retrieval.
-    - partitioning (str, optional): Defines how the data should be partitioned in the store for performance optimization.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is True.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is 1024.
-    Returns:
-    DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
-      or further processing.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    ... tx_type LIKE 'DEBIT',
-    ... tx_type LIKE 'PAYMENT',
-    ... tx_type LIKE 'CASH_OUT',
-    ... tx_type LIKE 'CASH_IN',
-    ... tx_type LIKE 'TRANSFER',
-    ... NO CASE,
-    ... UNKNOWN)'''
-    >>> features = [x for x in tddf.columns if x not in entity_id]
-    >>> dataset = upload_features(
-    ...     df = tddf,
-    ...     entity_id = entity_id,
-    ...     feature_names = features,
-    ...     metadata = {'project': 'test'},
-    ...     primary_index = primary_index,
-    ...     partitioning = partitioning
-    ...     )
+def upload_features(
+    df,
+    entity_id,
+    feature_names,
+    metadata={},
+    primary_index=None,
+    partitioning='',
+    filtermanager=None,
+    entity_null_substitute={},
+    force_compute=True,
+    force_varchar_length=1024
+):
+    """
+    Uploads feature data from a DataFrame to the feature store for a specified entity.
+    All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
     """
 
     from tdfs4ds.utils.info import get_column_types
@@ -491,45 +424,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
     from tdfs4ds.process_store.process_registration_management import register_process_view
 
     # Convert entity_id to a dictionary if it's not already one
-    if
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-    elif
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    elif isinstance(entity_id, str):
         entity_id = [entity_id]
         entity_id = get_column_types(df, entity_id)
-
-
-
-    if
-
-
-
-        feature_names = feature_names.split(',')
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Normalize feature_names
+    if not isinstance(feature_names, list):
+        logger_safe("debug", "feature_names is not a list: %s", feature_names)
+        if isinstance(feature_names, str) and ',' in feature_names:
+            feature_names = [x.strip() for x in feature_names.split(',')]
         else:
             feature_names = [feature_names]
-
-
-
-
-    if primary_index is not None and
-
-
-
-        primary_index = primary_index.split(',')
+        logger_safe("debug", "feature_names converted to list: %s", feature_names)
+        logger_safe("debug", "Check the conversion is as expected.")
+
+    # Normalize primary_index
+    if primary_index is not None and not isinstance(primary_index, list):
+        logger_safe("debug", "primary_index is not a list: %s", primary_index)
+        if isinstance(primary_index, str) and ',' in primary_index:
+            primary_index = [x.strip() for x in primary_index.split(',')]
         else:
             primary_index = [primary_index]
-
-
-        print('check it is a expected.')
+        logger_safe("debug", "primary_index converted to list: %s", primary_index)
+        logger_safe("debug", "Check the conversion is as expected.")
 
+    # Partitioning
     partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
 
-
-    print("filtermanager", filtermanager)
+    logger_safe("debug", "filtermanager: %s", filtermanager)
 
-    # Register
+    # Register process -> get SQL(s) + process_id
     query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
         view_name = df,
         entity_id = entity_id,
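The normalisation block above replaces the old ad-hoc `split(',')` handling with `isinstance` checks plus whitespace stripping. An isolated sketch of the same idea (a hypothetical helper, not the package's own function):

    def normalize_names(names):
        """Accept a list, a single name, or a comma-separated string; return a clean list."""
        if isinstance(names, list):
            return names
        if isinstance(names, str) and ',' in names:
            return [x.strip() for x in names.split(',') if x.strip()]
        return [names]

    assert normalize_names("a, b ,c") == ["a", "b", "c"]
    assert normalize_names("amount") == ["amount"]
    assert normalize_names(["a", "b"]) == ["a", "b"]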
@@ -542,104 +472,96 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
         entity_null_substitute = entity_null_substitute
     )
 
-
-    execute_query(query_insert)
-    execute_query(query_insert_dist)
-    if tdfs4ds.DEBUG_MODE:
-        print("query_insert_filtermanager",query_insert_filtermanager)
-    if query_insert_filtermanager is not None:
-        execute_query(query_insert_filtermanager)
+    logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
 
-    #
-
-
-
-
-
-
+    # Execute queries
+    try:
+        execute_query(query_insert)
+        logger_safe("info", "Executed main insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
+        raise
 
-
+    try:
+        execute_query(query_insert_dist)
+        logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
+        raise
 
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        # Avoid dumping entire SQL in normal logs; keep it debug-only.
+        logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
 
+    if query_insert_filtermanager is not None:
+        try:
+            execute_query(query_insert_filtermanager)
+            logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
         except Exception as e:
-
-            run_id = tdfs4ds.RUN_ID,
-            process_type = tdfs4ds.PROCESS_TYPE,
-            process_id = process_id,
-            status = 'FAILED,' + str(e).split('\n')[0]
-        )
+            logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
             raise
 
+    # Run the registered process (with/without dataset)
+    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
+    tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
+    if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+        tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
+    tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
+    logger_safe(
+        "info",
+        "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
+        tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
+    )
+
+    try:
+        if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+            dataset = run(
+                process_id=process_id,
+                return_dataset=True,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return dataset
+        else:
+            run(
+                process_id=process_id,
+                return_dataset=False,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return
 
+    except Exception as e:
+        # Keep your existing follow-up close behavior, but ensure the error is logged.
         try:
-            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
-        except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,
                 process_type = tdfs4ds.PROCESS_TYPE,
                 process_id = process_id,
                 status = 'FAILED,' + str(e).split('\n')[0]
             )
-
-
-
-
-
-
-
-
-    Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
-    feature registration, preparation for ingestion, and storage in the designated feature tables.
+        finally:
+            logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
+                tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
+            )
+            raise
+    finally:
+        # Restore previous process type just in case the caller relies on it.
+        tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-    Parameters:
-    - df (DataFrame): The input DataFrame containing the feature data.
-    - entity_id (str or dict): The identifier for the entity to which these features belong. This can be a single ID
-      (str) or a dictionary of attribute names and values uniquely identifying the entity.
-    - feature_names (list): A list of strings specifying the names of the features to be uploaded.
-    - feature_versions (str or list, optional): Specifies the versions of the features to be uploaded. Can be a single
-      string applied to all features or a list of strings specifying the version
-      for each feature respectively. Default is 'dev.0.0'.
-    - primary_index (list, optional): Specifies the columns to be used as the primary index in the feature store tables.
-      This can significantly impact the performance of data retrieval operations.
-    - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
-      enhance query performance based on the access patterns.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
 
 
-
-
-
-
-
-
-
-
-    4. Prepares the feature data for ingestion, including any necessary transformations.
-    5. Stores the prepared feature data in the feature store.
-    6. Optionally, cleans up temporary resources used during the process.
-    7. Builds and returns a view of the dataset representing the uploaded features for easy access.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      entity and feature registration to data preparation and storage.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
-    - The function assumes that the feature_versions, if provided as a list, matches the length of feature_names.
-    """
-
+def _upload_features(
+    df, entity_id, feature_names,
+    feature_versions=FEATURE_VERSION_DEFAULT,
+    primary_index=None, partitioning='',
+    filtermanager=None, entity_null_substitute={},
+    process_id=None, force_compute=False,
+    force_varchar_length=None
+):
     from tdfs4ds.feature_store.entity_management import register_entity
     from tdfs4ds.feature_store.feature_store_management import Gettdtypes
     from tdfs4ds.feature_store.feature_store_management import register_features
@@ -647,193 +569,141 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
     from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
-    # Convert entity_id to a dictionary if
-    if
+    # Convert entity_id to a dictionary if not already
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-
-    entity_id
-
-
-
-
-    #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
-
-    # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
-    # If feature_versions is a string, create a dictionary mapping each feature name to this string.
-    if type(feature_versions) == list:
-        selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+    elif isinstance(entity_id, str):
+        entity_id = get_column_types(df, [entity_id])
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Map feature versions
+    if isinstance(feature_versions, list):
+        selected_features = dict(zip(feature_names, feature_versions))
     else:
         selected_features = {k: feature_versions for k in feature_names}
 
-    # Get
-    feature_names_types = Gettdtypes(
-        df,
-        features_columns=feature_names,
-        entity_id=entity_id
-    )
+    # Get Teradata types for features
+    feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
 
     if force_varchar_length is not None:
-
-        feature_names_types = update_varchar_length(
+        logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
+        feature_names_types = update_varchar_length(
+            feature_names_types,
+            new_varchar_length=force_varchar_length
+        )
 
     def validate_feature_types(feature_names_types):
-
-
-
-
-
-        feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
-
-        Raises:
-            ValueError: If any feature type contains 'clob', 'blob', or 'json'.
-        """
-        invalid_types = {key: value['type'] for key, value in feature_names_types.items()
-                         if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
-
-        if invalid_types:
+        invalid = {
+            k: v['type'] for k, v in feature_names_types.items()
+            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
+        }
+        if invalid:
             raise ValueError(
-                f"
-                "
+                f"Unsupported data types found: {invalid}. "
+                "CLOB/BLOB/JSON are not supported."
             )
-
-    validate_feature_types(feature_names_types)
-
+
+    validate_feature_types(feature_names_types)
+
+    logger_safe("info", "Registering entity %s in feature store", entity_id)
     register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
 
-    if tdfs4ds
-
-
-
-
-
-
-
-
-
-
-
-
-        primary_index,
-        partitioning
-    )
-
-    if tdfs4ds.DEBUG_MODE:
-        print("---------_upload_features")
-        print("filtermanager : ", filtermanager)
-        print("feature names : ", feature_names)
-        print("selected features : ", selected_features)
-
-    if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
+            entity_id, entity_null_substitute, feature_names, primary_index, partitioning
+        )
+        logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
+
+    register_features(entity_id, feature_names_types, primary_index, partitioning)
+    logger_safe("info", "Features registered in catalog: %s", feature_names)
+
+    follow_up = None
+    if process_id and tdfs4ds.FEATURE_STORE_TIME:
         follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-        follow_up = follow_up[
-
-
-
-
-
-    do_compute = False
+        follow_up = follow_up[
+            (follow_up.STATUS == 'COMPLETED') &
+            (follow_up.VALIDTIME_DATE.isna() == False) &
+            (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
+            (follow_up.PROCESS_ID == process_id)
+        ]
 
-
+    if filtermanager is None:
+        do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
         if do_compute or force_compute:
-
+            logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
             tdfs4ds.process_store.process_followup.followup_open(
-                run_id
-                process_type
-                process_id
+                run_id=tdfs4ds.RUN_ID,
+                process_type=tdfs4ds.PROCESS_TYPE,
+                process_id=process_id
             )
-
             try:
-                prepared_features,
-                    df,
-                    entity_id,
-                    feature_names,
+                prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                    df, entity_id, feature_names,
                     feature_versions=selected_features,
                     primary_index=primary_index,
                     entity_null_substitute=entity_null_substitute,
                     partitioning=partitioning
                 )
-
-
-
-                volatile_table_name,
-                entity_null_substitute=entity_null_substitute,
-                primary_index=primary_index,
-                partitioning=partitioning,
-                features_infos = features_infos
-                )
-
-                # Collect statistics
-                apply_collect_stats(
-                    entity_id,
-                    primary_index = primary_index,
-                    partitioning = partitioning,
-                    feature_infos = features_infos
-                )
+                store_feature(entity_id, volatile_table, entity_null_substitute,
+                              primary_index, partitioning, features_infos)
+                apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id
                 )
+                logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
 
             except Exception as e:
+                logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
-                    status
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id,
+                    status='FAILED,' + str(e).split('\n')[0]
                 )
                 raise
-    else:
-        # get the total number of filter condition in the filter manager
-        nb_filters = filtermanager.nb_filters
 
-
+    else:
+        logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
         something_computed = False
+        for i in tqdm(
+            range(filtermanager.nb_filters),
+            total=filtermanager.nb_filters,
+            desc="Applying filters",
+            unit="filter",
+            leave=False
+        ):
+            filter_id = i + 1
+            filtermanager.update(filter_id)
+
+            # show which filter is being applied in the bar
+            try:
+                tqdm.write(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
+                # If display() returns a long string, you can shorten it:
+                bar_info = str(filtermanager.display())
+                if len(bar_info) > 80:
+                    bar_info = bar_info[:77] + "..."
+                tqdm.tqdm._instances and next(iter(tqdm.tqdm._instances)).set_postfix_str(bar_info)
+            except Exception:
+                # postfix is optional; ignore errors from display() here
+                pass
+
+            logger_safe("debug", "Applying filter %s/%s:\n%s",
+                        i + 1, filtermanager.nb_filters, filtermanager.display())
 
-        for i in range(nb_filters):
-
-            # place the cursor on the next filter
-            filtermanager.update(i+1)
-
-            if filtermanager.time_filtering:
-                # if the filter manager is hybrid, then synchronize the time with tdfs4ds
-                tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
-
-            # overwrite the follow up table to tilter on the VALIDTIME_DATE too
-            follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-            follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
-                follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
-
-            # initialize do_compute, the flag that something has to be computed
             do_compute = True
-
-
-
-
-            follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
-                tdml.DataFrame.from_query(
-                    f"""
-                    SELECT
-                    CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
-                    FROM {filtermanager.schema_name}.{filtermanager.view_name}
-                    """
-                ),
-                on = 'APPLIED_FILTER',
-                how = 'inner',
-                lprefix = 'l',
-                rprefix = 'r'
-            )
-            # if already computed and completed, then do_compute is set to False
-            if follow_up_.shape[0] > 0:
+            if process_id and tdfs4ds.FEATURE_STORE_TIME:
+                # see if already computed
+                follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
+                if follow_up.shape[0] > 0:
                     do_compute = False
 
-            if tdfs4ds.DISPLAY_LOGS:
-                print(filtermanager.display())
-
             if do_compute or force_compute:
                 tdfs4ds.process_store.process_followup.followup_open(
                     run_id = tdfs4ds.RUN_ID,
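The rewritten `validate_feature_types` keeps the same contract as the old version (reject features whose Teradata type contains CLOB, BLOB, or JSON). The check can be exercised in isolation like this; the type mapping below is made up for illustration:

    def validate_feature_types(feature_names_types):
        # Collect offending features and raise once, listing them all.
        invalid = {
            k: v['type'] for k, v in feature_names_types.items()
            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
        }
        if invalid:
            raise ValueError(f"Unsupported data types found: {invalid}. CLOB/BLOB/JSON are not supported.")

    validate_feature_types({'amount': {'type': 'FLOAT'}, 'label': {'type': 'VARCHAR(100)'}})  # passes
    try:
        validate_feature_types({'payload': {'type': 'JSON'}})
    except ValueError as e:
        print(e)  # Unsupported data types found: {'payload': 'JSON'} ...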
@@ -842,83 +712,58 @@ def _upload_features(df, entity_id, feature_names,
                     filtermanager = filtermanager
                 )
                 try:
-
-
-                    df,
-                    entity_id,
-                    feature_names,
+                    prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                        df, entity_id, feature_names,
                         feature_versions = selected_features,
                         primary_index = primary_index,
                         entity_null_substitute = entity_null_substitute,
                         partitioning = partitioning
                     )
 
-
-
-
-                    volatile_table_name,
-                    entity_null_substitute=entity_null_substitute,
-                    primary_index = primary_index,
-                    partitioning = partitioning,
-                    features_infos=features_infos
-
-                    )
-
-                    # indicate that something has been processed:
+                    store_feature(entity_id, volatile_table, entity_null_substitute,
+                                  primary_index, partitioning, features_infos)
+
                     something_computed = True
 
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
                         filtermanager = filtermanager
                     )
 
                 except Exception as e:
-
+                    logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
-                        status='FAILED,' + str(e).split('\n')[0],
-                        filtermanager=filtermanager
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
+                        status = 'FAILED,' + str(e).split('\n')[0],
+                        filtermanager = filtermanager
                     )
                     raise
-        # Clean up by dropping the temporary volatile table.
-        # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
 
-        # Collect statistics only if something has been computed
         if something_computed:
-            apply_collect_stats(
-                entity_id,
-                primary_index = primary_index,
-                partitioning = partitioning,
-                feature_infos = features_infos
-            )
+            apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-    # Build a dataset view in the feature store.
     if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
-
+        logger_safe("info", "Building dataset for validation...")
         try:
-
-            entity_id,
-            selected_features,
+            return build_dataset(
+                entity_id, selected_features,
                 view_name=None,
-                entity_null_substitute
+                entity_null_substitute=entity_null_substitute
             )
         except Exception as e:
-
-
-            print('entity :', entity_id)
-            print('selected features :', selected_features)
-
-            # Return the dataset view.
-            return dataset
+            logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+            logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
     else:
-
+        logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
         return
 
 
+
+
 def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
                   feature_store_time=False, join_type='INNER'):
     """
@@ -1366,9 +1211,6 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
     >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
     """
 
-    #global DISPLAY_LOGS
-    #global FEATURE_STORE_TIME
-
     # Disable display logs
     temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
     tdfs4ds.DISPLAY_LOGS = False
@@ -1376,40 +1218,43 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
     tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
     tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
     try:
+        # Define range of time steps
         if time_id_end is None:
-
+            time_range = range(time_id_start, time_manager.nb_time_steps + 1)
         else:
-
-
+            time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
+
+        # Progress bar
+        pbar = tqdm(time_range, desc="Starting rollout", unit="step")
+
         for i in pbar:
-            # Update
-            time_manager.update(time_id
+            # Update time manager
+            time_manager.update(time_id=i)
             date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
-
-            #
+
+            # Sync feature store time
             tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
-
+
+            # Display current progress in tqdm
+            pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
+
             if tdfs4ds.DEBUG_MODE:
-                print(
-                print(
-
-            # Execute
+                print("roll_out | date_:", date_)
+                print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
+
+            # Execute all processes for this time step
             for proc_id in process_list:
-                pbar.set_description(f"Processing {date_}
+                pbar.set_description(f"Processing {date_} | proc {proc_id}")
                 run(process_id=proc_id, force_compute=False)
 
+        # Restore settings
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
+
     except Exception as e:
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
-        # If an exception occurs, print the date and the first line of the exception message
-        #print(date_)
         print(str(e).split('\n')[0])
         tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
         raise
 
-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
-
-
+    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE