tdfs4ds 0.2.4.32-py3-none-any.whl → 0.2.4.34-py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes exactly as they appear in the public registry.
@@ -7,6 +7,8 @@ from tdfs4ds.utils.info import seconds_to_dhms
7
7
  import time
8
8
  import re
9
9
  import pandas as pd
10
+ from tdfs4ds import logger_safe, logger
11
+
10
12
 
11
13
  def generate_on_clause(entity_id, entity_null_substitute, left_name, right_name):
12
14
  res = []
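The import added in this hunk, from tdfs4ds import logger_safe, logger, is what the rest of the diff uses to replace print-based debug output. Its definition is not shown here; the sketch below is only a plausible shape inferred from the call sites (logger_safe(level, message, *args)), not the package's actual implementation.

    # Hypothetical sketch of a logger_safe helper, inferred from its call sites in this diff.
    import logging

    logger = logging.getLogger("tdfs4ds")

    def logger_safe(level, msg, *args):
        """Log through the package logger without ever raising from the logging path."""
        try:
            # level is expected to be "debug", "info", "warning", "error" or "exception"
            getattr(logger, level, logger.info)(msg, *args)
        except Exception:
            try:
                print(msg % args if args else msg)
            except Exception:
                pass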
@@ -73,7 +75,7 @@ def generate_collect_stats(entity_id, primary_index='', partitioning=''):
73
75
 
74
76
  # Initialize the extended query with sampling and threshold settings for statistics collection
75
77
  query_extension_header = 'COLLECT STATISTICS USING SAMPLE 25 PERCENT AND THRESHOLD 15 PERCENT'
76
- query_extension = []
78
+ query_extension = []
77
79
 
78
80
  # Add primary index columns to the extended query
79
81
  if primary_index:
@@ -164,11 +166,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
164
166
  # Record the start time
165
167
  start_time = time.time()
166
168
 
167
-
168
-
169
- if type(entity_id) == list:
169
+ # Normalize entity_id into a list of keys
170
+ if isinstance(entity_id, list):
170
171
  list_entity_id = entity_id
171
- elif type(entity_id) == dict:
172
+ elif isinstance(entity_id, dict):
172
173
  list_entity_id = list(entity_id.keys())
173
174
  else:
174
175
  list_entity_id = [entity_id]
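The isinstance-based normalization above accepts a single key, a list of keys, or a dict mapping keys to types, and reduces all three to a flat list of column names. A standalone illustration with hypothetical values, using the same logic as the hunk above:

    def normalize_entity_id(entity_id):
        if isinstance(entity_id, list):
            return entity_id
        elif isinstance(entity_id, dict):
            return list(entity_id.keys())
        return [entity_id]

    assert normalize_entity_id("customer_id") == ["customer_id"]
    assert normalize_entity_id(["customer_id", "store_id"]) == ["customer_id", "store_id"]
    assert normalize_entity_id({"customer_id": "BIGINT"}) == ["customer_id"]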
@@ -176,260 +177,333 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
176
177
 
177
178
  feature_id_names, conversion_name2id = get_feature_id_and_conversion(list_entity_id, feature_names)
178
179
 
179
- features_infos = pd.DataFrame(feature_id_names, columns = ['FEATURE_ID','FEATURE_NAME','FEATURE_TABLE','FEATURE_DATABASE'])
180
+ features_infos = pd.DataFrame(feature_id_names, columns=['FEATURE_ID', 'FEATURE_NAME', 'FEATURE_TABLE', 'FEATURE_DATABASE'])
180
181
  features_infos['FEATURE_VERSION'] = [feature_versions[k] for k in features_infos.FEATURE_NAME.values]
181
- if tdfs4ds.DEBUG_MODE:
182
- print('--- prepare_feature_ingestion ---')
183
- print('conversion_name2id : ', conversion_name2id)
184
- print('feature_names : ', feature_names)
185
182
 
186
- # Create the UNPIVOT clause for the specified feature columns
187
- unpivot_columns = ", \n".join(["(" + x + ") as '" + str(conversion_name2id[x]) + "'" for x in feature_names])
183
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
184
+ logger_safe("debug", "--- prepare_feature_ingestion ---")
185
+ logger_safe("debug", "conversion_name2id=%s", conversion_name2id)
186
+ logger_safe("debug", "feature_names=%s", feature_names)
188
187
 
189
- if tdfs4ds.DEBUG_MODE:
190
- print('--- prepare_feature_ingestion ---')
191
- print('unpivot_columns : ', unpivot_columns)
192
- # Create the output column list including entity IDs, feature names, and feature values
188
+ # UNPIVOT mapping
189
+ unpivot_columns = ", \n".join([f"({x}) as '{conversion_name2id[x]}'" for x in feature_names])
193
190
 
191
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
192
+ logger_safe("debug", "unpivot_columns=%s", unpivot_columns)
193
+
194
+ # Output columns for volatile table
194
195
  output_columns = ', \n'.join(list_entity_id + ['CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID', 'FEATURE_VALUE'])
195
196
 
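For reference, the two string-building steps above produce SQL fragments of the following shape. The feature names and IDs are hypothetical; in the package the mapping comes from get_feature_id_and_conversion.

    feature_names = ["age", "income"]                    # hypothetical features
    conversion_name2id = {"age": 101, "income": 102}     # hypothetical feature IDs
    list_entity_id = ["customer_id"]                     # hypothetical entity key

    unpivot_columns = ", \n".join(f"({x}) as '{conversion_name2id[x]}'" for x in feature_names)
    # (age) as '101',
    # (income) as '102'

    output_columns = ", \n".join(list_entity_id + ["CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID", "FEATURE_VALUE"])
    # customer_id,
    # CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID,
    # FEATURE_VALUE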
197
+ # Primary index
196
198
  if primary_index is None:
197
199
  primary_index = ','.join(list_entity_id)
198
200
  else:
199
- if type(primary_index) == list:
200
- primary_index = primary_index
201
- else:
201
+ if not isinstance(primary_index, list):
202
202
  primary_index = [primary_index]
203
203
  primary_index = ','.join(primary_index)
204
204
 
205
- # Create a dictionary to store feature versions, using the default version if not specified
205
+ # Feature versions (defaults)
206
206
  versions = {f: tdfs4ds.FEATURE_VERSION_DEFAULT for f in feature_names}
207
207
  if feature_versions is not None:
208
208
  for k, v in feature_versions.items():
209
209
  versions[k] = v
210
210
 
211
- if tdfs4ds.DEBUG_MODE:
212
- print('--- prepare_feature_ingestion ---')
213
- print('versions : ', versions)
211
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
212
+ logger_safe("debug", "versions=%s", versions)
214
213
 
215
- # Create the CASE statement to assign feature versions based on feature names
216
- version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + [
217
- "END AS FEATURE_VERSION"]
214
+ # CASE statement for versions
215
+ version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + ["END AS FEATURE_VERSION"]
218
216
  version_query = '\n'.join(version_query)
219
217
 
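The CASE expression built just above renders, for the same hypothetical features, to a fragment like this:

    versions = {"age": "V1", "income": "V2"}             # hypothetical versions
    conversion_name2id = {"age": 101, "income": 102}

    version_query = "\n".join(
        ["CASE"]
        + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()]
        + ["END AS FEATURE_VERSION"]
    )
    # CASE
    # WHEN FEATURE_ID = '101' THEN 'V1'
    # WHEN FEATURE_ID = '102' THEN 'V2'
    # END AS FEATURE_VERSION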
220
- if tdfs4ds.DEBUG_MODE:
221
- print('--- prepare_feature_ingestion ---')
222
- print('version_query : ', version_query)
218
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
219
+ logger_safe("debug", "version_query=%s", version_query)
223
220
 
224
- # Create a volatile table name based on the original table's name, ensuring it is unique.
221
+ # Volatile table name
225
222
  volatile_table_name = df._table_name.split('.')[1].replace('"', '')
226
- volatile_table_name = f'temp_{volatile_table_name}'
223
+ volatile_table_name = f"temp_{volatile_table_name}"
227
224
 
228
- if type(entity_id) == list:
225
+ # Normalize entity_id again for var casting
226
+ if isinstance(entity_id, list):
229
227
  list_entity_id = entity_id
230
- elif type(entity_id) == dict:
228
+ elif isinstance(entity_id, dict):
231
229
  list_entity_id = list(entity_id.keys())
232
230
  else:
233
231
  list_entity_id = [entity_id]
234
232
 
235
-
236
- # get the character set of varchars
237
- res = {x.split()[0]:''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
233
+ # Character set handling / pass-through
234
+ res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
238
235
  var_temp2 = []
239
- for k,v in res.items():
236
+ for k, v in res.items():
240
237
  if 'UNICODE' in v:
241
- #var_temp2.append(f'TRANSLATE({k} USING UNICODE_TO_LATIN) AS {k}')
242
238
  var_temp2.append(f'{k}')
243
239
  elif 'LATIN' in v:
244
- #var_temp2.append(f'{k}')
245
240
  var_temp2.append(f'TRANSLATE({k} USING LATIN_TO_UNICODE) AS {k}')
246
241
  else:
247
242
  var_temp2.append(f'CAST({k} AS VARCHAR(2048) CHARACTER SET UNICODE) AS {k}')
248
243
  var_temp2 = ', \n'.join(var_temp2)
244
+
245
+ # NOTE: the original code overrides var_temp2 with just the raw column names.
246
+ # Preserve that behavior to avoid functional change.
249
247
  var_temp2 = ', \n'.join(list(res.keys()))
250
248
 
249
+ # Null substitution on entity keys
251
250
  var_temp3 = []
252
251
  for e in list_entity_id:
253
252
  if e in entity_null_substitute.keys():
254
- if type(entity_null_substitute[e]) == str:
253
+ if isinstance(entity_null_substitute[e], str):
255
254
  var_temp3.append(f"coalesce({e},'{entity_null_substitute[e]}') AS {e}")
256
255
  else:
257
256
  var_temp3.append(f"coalesce({e},{entity_null_substitute[e]}) AS {e}")
258
257
  else:
259
258
  var_temp3.append(e)
260
-
261
259
  var_temp3 = ', \n'.join(var_temp3)
262
260
 
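The character-set handling (var_temp2, which the NOTE above records is ultimately overridden with the raw column names) and the null substitution on entity keys (var_temp3) together form the column list of the inner SELECT. A small standalone illustration of the var_temp3 logic with made-up columns:

    list_entity_id = ["customer_id", "store_id"]                     # hypothetical keys
    entity_null_substitute = {"customer_id": -1, "store_id": "UNKNOWN"}

    var_temp3 = []
    for e in list_entity_id:
        if e in entity_null_substitute:
            sub = entity_null_substitute[e]
            if isinstance(sub, str):
                var_temp3.append(f"coalesce({e},'{sub}') AS {e}")
            else:
                var_temp3.append(f"coalesce({e},{sub}) AS {e}")
        else:
            var_temp3.append(e)
    var_temp3 = ", \n".join(var_temp3)
    # coalesce(customer_id,-1) AS customer_id,
    # coalesce(store_id,'UNKNOWN') AS store_id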
263
-
264
- nested_query = f"""
265
- CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
266
- (
267
- SELECT
268
- {output_columns},
269
- {version_query}
270
- FROM
271
- (SELECT
272
- {var_temp3},
273
- {var_temp2}
274
- FROM {df._table_name}
275
- ) A
276
- UNPIVOT INCLUDE NULLS ((FEATURE_VALUE ) FOR FEATURE_ID
277
- IN ({unpivot_columns})) Tmp
278
- ) WITH DATA
279
- PRIMARY INDEX ({primary_index})
280
- PARTITION BY RANGE_N(FEATURE_ID BETWEEN 0 AND 2000 EACH 1 )
281
- ON COMMIT PRESERVE ROWS
282
- """
283
-
284
- nested_query = f"""
285
- CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
286
- (
287
- SELECT
288
- {var_temp3},
289
- {var_temp2}
290
- FROM {df._table_name}
291
- ) WITH DATA
292
- PRIMARY INDEX ({primary_index})
293
- ON COMMIT PRESERVE ROWS
294
- """
295
-
261
+ # Final nested query used (the function reassigns to plain SELECT; preserve as-is)
296
262
  nested_query = f"""
297
263
  SELECT
298
264
  {var_temp3},
299
265
  {var_temp2}
300
266
  FROM {df._table_name}
301
-
302
267
  """
303
268
 
304
- # Test unicity of the process
269
+ # Duplicate check query
305
270
  output_columns_unicity = ', \n'.join(list_entity_id)
306
271
  query_test_unicity = f"""
307
272
  SELECT sum(CASE WHEN n>1 THEN 1 ELSE 0 END) AS nb_duplicates
308
273
  FROM (
309
- SELECT
310
- {output_columns_unicity}
311
- , count(*) as n
312
- FROM {_get_database_username()}.{volatile_table_name}
313
- GROUP BY {output_columns_unicity}
274
+ SELECT
275
+ {output_columns_unicity},
276
+ count(*) as n
277
+ FROM {_get_database_username()}.{volatile_table_name}
278
+ GROUP BY {output_columns_unicity}
314
279
  ) A
315
280
  """
316
281
 
317
- if tdfs4ds.DEBUG_MODE:
318
- print('--- prepare_feature_ingestion ---')
319
- print('var_temp2 : ', var_temp2)
320
- print('var_temp3 : ', var_temp3)
321
- print('nested_query :', nested_query)
322
-
323
-
282
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
283
+ logger_safe("debug", "var_temp2=%s", var_temp2)
284
+ logger_safe("debug", "var_temp3=%s", var_temp3)
285
+ logger_safe("debug", "nested_query=%s", nested_query)
324
286
 
325
- # Execute the SQL query to create the volatile table.
287
+ # Execute: create volatile table and test unicity
326
288
  try:
327
- #tdml.execute_sql(nested_query)
328
- tdml.DataFrame.from_query(nested_query).to_sql(table_name = volatile_table_name, temporary = True, primary_index = primary_index.split(','), if_exists='replace')
289
+ tdml.DataFrame.from_query(nested_query).to_sql(
290
+ table_name=volatile_table_name,
291
+ temporary=True,
292
+ primary_index=primary_index.split(','),
293
+ if_exists='replace'
294
+ )
329
295
  nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
330
296
  if nb_duplicates is not None and nb_duplicates > 0:
331
- tdfs4ds.logger.error(f"The process generates {nb_duplicates} duplicates")
332
- query_test_unicity = f"""
333
- SELECT TOP 3
334
- {output_columns_unicity}
335
- , count(*) as n
336
- FROM {_get_database_username()}.{volatile_table_name}
337
- GROUP BY {output_columns_unicity}
338
- HAVING n > 1
339
- """
297
+ logger_safe("error", "The process generates %s duplicates", nb_duplicates)
298
+ # Show sample duplicates in debug for troubleshooting
299
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
300
+ sample_dups_query = f"""
301
+ SELECT TOP 3
302
+ {output_columns_unicity},
303
+ count(*) as n
304
+ FROM {_get_database_username()}.{volatile_table_name}
305
+ GROUP BY {output_columns_unicity}
306
+ HAVING n > 1
307
+ """
308
+ logger_safe("debug", "Sample duplicates query:\n%s", sample_dups_query)
340
309
  raise ValueError("Invalid process: the process generates duplicates.")
341
- #tdfs4ds.logger.info(f"No duplicate found.")
310
+ # else: no duplicates
311
+ # logger_safe("info", "No duplicate found.") # optional
342
312
  except Exception as e:
343
- if tdfs4ds.DISPLAY_LOGS:
344
- print(str(e).split('\n')[0])
313
+ logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
345
314
  raise
346
315
 
316
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
317
+ logger_safe(
318
+ "debug",
319
+ "Result volatile table dtypes:\n%s",
320
+ tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes
321
+ )
347
322
 
348
- if tdfs4ds.DEBUG_MODE:
349
- print('--- prepare_feature_ingestion ---')
350
- print(tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes)
351
-
352
-
353
- # Record the end time
323
+ # Timing
354
324
  end_time = time.time()
355
-
356
-
357
- # Calculate the elapsed time in seconds
358
325
  elapsed_time = end_time - start_time
359
326
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
360
- if tdfs4ds.DISPLAY_LOGS:
361
- print(f'Feature preparation for ingestion : {formatted_elapsed_time} ({elapsed_time}s)')
327
+ logger_safe("info", "Feature preparation for ingestion: %s (%.3fs)", formatted_elapsed_time, elapsed_time)
328
+
329
+ # Return DataFrame and metadata
362
330
  try:
363
331
  df_out = tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name))
364
332
  return df_out, volatile_table_name, features_infos
365
333
  except Exception as e:
366
- print(str(e).split()[0])
367
- print(df[feature_names].tdtypes)
334
+ logger_safe("error", "Failed to materialize volatile DataFrame: %s", str(e).split()[0])
335
+ logger_safe("error", "Feature dtypes snapshot: %s", df[feature_names].tdtypes)
368
336
  if 'TD_Unpivot contract function' in str(e).split()[0]:
369
- raise('Error : you may have string with UNICODE encoding as feature, please convert them to latin first')
337
+ raise RuntimeError(
338
+ "Error: you may have strings with UNICODE encoding as features; please convert them to LATIN first."
339
+ )
340
+ raise
370
341
 
371
- return None, None, None
342
+ # Fallback (should not reach)
343
+ # return None, None, None
372
344
 
373
345
 
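For orientation, a hedged usage sketch of prepare_feature_ingestion as refactored above. The table, entity, and feature names are hypothetical, and the full signature is truncated in the hunk header; the return values (a teradataml DataFrame over the volatile staging table, its name, and the features_infos frame) are taken from the code above.

    import teradataml as tdml   # assumes a configured Teradata connection

    df = tdml.DataFrame("my_source_table")               # hypothetical wide source table
    df_prepared, volatile_table_name, features_infos = prepare_feature_ingestion(
        df,
        entity_id={"customer_id": "BIGINT"},             # dict keys define the entity columns
        feature_names=["age", "income"],
        feature_versions={"age": "V1", "income": "V1"},  # one entry per feature
    )
    # df_prepared    : teradataml DataFrame over the volatile staging table
    # features_infos : FEATURE_ID / FEATURE_NAME / FEATURE_TABLE / FEATURE_DATABASE / FEATURE_VERSION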
374
- def apply_collect_stats(entity_id, primary_index, partitioning, feature_infos):
375
- """
376
- Applies a collect statistics operation on target tables grouped by feature table and database.
377
-
378
- This function performs the following steps:
379
- 1. Sorts the `entity_id`.
380
- 2. Groups the feature information by feature table and database to count occurrences.
381
- 3. Generates collect statistics queries.
382
- 4. Executes the queries on the target tables while recording the execution time.
383
- 5. Logs the elapsed time if logging is enabled.
384
-
385
- Args:
386
- entity_id (list): A list of entity IDs to process.
387
- primary_index (str): The primary index to use in the collect statistics query.
388
- partitioning (str): Partitioning information for the query.
389
- feature_infos (pd.DataFrame): A DataFrame containing feature information,
390
- including columns 'FEATURE_TABLE', 'FEATURE_DATABASE', and 'FEATURE_ID'.
346
+ import time
347
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
391
348
 
392
- Returns:
393
- None
349
+ import pandas as pd
350
+
351
+ def apply_collect_stats(
352
+ entity_id: Mapping[str, Any] | Iterable[str],
353
+ primary_index: Optional[str],
354
+ partitioning: Optional[str],
355
+ feature_infos: pd.DataFrame,
356
+ ) -> Dict[str, Any]:
394
357
  """
395
- # Sort the entity IDs to ensure consistent ordering.
396
- sorted_entity_id = list(entity_id.keys())
397
- sorted_entity_id.sort()
358
+ Run COLLECT STATS on all target feature tables, with fallbacks and timing.
359
+
360
+ Steps:
361
+ 1) Determine a stable ordering of entity IDs (for deterministic query gen).
362
+ 2) Group `feature_infos` by FEATURE_DATABASE + FEATURE_TABLE to get unique targets.
363
+ 3) Generate COLLECT STATS statements via `generate_collect_stats(...)` for fallback use.
364
+ 4) For each target table:
365
+ - Try a simple `COLLECT STATS ON <db>.<table>`.
366
+ - On failure, retry with generated statements (and optional extension).
367
+ 5) Log a compact summary (counts + total duration) and return it as a dict.
368
+
369
+ Parameters
370
+ ----------
371
+ entity_id : Mapping[str, Any] | Iterable[str]
372
+ Entity identifiers used to parameterize collect-stat statements.
373
+ If a mapping (e.g., dict), its *keys* are used and sorted.
374
+ If an iterable (e.g., list/tuple), it’s sorted directly.
375
+ primary_index : Optional[str]
376
+ Primary index used by `generate_collect_stats` (may be None).
377
+ partitioning : Optional[str]
378
+ Partitioning clause used by `generate_collect_stats` (may be None).
379
+ feature_infos : pd.DataFrame
380
+ Must contain columns: 'FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID'.
381
+
382
+ Returns
383
+ -------
384
+ Dict[str, Any]
385
+ Summary with keys:
386
+ - total_tables: int
387
+ - ok: int
388
+ - retried: int
389
+ - failed: int
390
+ - duration_seconds: float
391
+ - duration_hms: str
392
+ - details: list[dict] # per-table status entries
393
+ """
394
+ # --- Validate inputs -----------------------------------------------------
395
+ required_cols = {"FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"}
396
+ missing = required_cols.difference(feature_infos.columns)
397
+ if missing:
398
+ raise ValueError(f"feature_infos is missing required columns: {sorted(missing)}")
399
+
400
+ # --- Normalize & sort entity IDs ----------------------------------------
401
+ if hasattr(entity_id, "keys"):
402
+ sorted_entity_ids = sorted(list(entity_id.keys()))
403
+ else:
404
+ sorted_entity_ids = sorted(list(entity_id))
405
+
406
+ # --- Group to unique targets --------------------------------------------
407
+ target_tables = (
408
+ feature_infos[["FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"]]
409
+ .groupby(["FEATURE_TABLE", "FEATURE_DATABASE"])
410
+ .count()
411
+ .reset_index()
412
+ )
398
413
 
399
- # Group the target tables by 'FEATURE_TABLE' and 'FEATURE_DATABASE' and count occurrences.
400
- target_tables = feature_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(
401
- ['FEATURE_TABLE', 'FEATURE_DATABASE']
402
- ).count().reset_index()
414
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
415
+ logger_safe(
416
+ "debug",
417
+ "collect_stats.targets | count=%s | tables=%s",
418
+ len(target_tables),
419
+ target_tables[["FEATURE_DATABASE", "FEATURE_TABLE"]].to_dict(orient="records"),
420
+ )
403
421
 
404
- # Generate the collect statistics query and its optional extension.
422
+ # --- Prepare statements --------------------------------------------------
405
423
  query_collect_stats, query_collect_stats_extension = generate_collect_stats(
406
- sorted_entity_id,
424
+ sorted_entity_ids,
407
425
  primary_index=primary_index,
408
- partitioning=partitioning
426
+ partitioning=partitioning,
409
427
  )
410
428
 
411
- # Record the start time for measuring query execution duration.
412
- start_time = time.time()
413
-
414
- # Loop through the grouped target tables and execute the queries.
415
- for i, row in target_tables.iterrows():
416
- # Execute the main collect statistics query.
417
- execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
429
+ # --- Execute -------------------------------------------------------------
430
+ started = time.perf_counter()
431
+ results: list[Dict[str, Any]] = []
432
+
433
+ ok = retried = failed = 0
434
+
435
+ for _, row in target_tables.iterrows():
436
+ db = row["FEATURE_DATABASE"]
437
+ tbl = row["FEATURE_TABLE"]
438
+ table_fqn = f"{db}.{tbl}"
439
+
440
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
441
+ logger_safe("debug", "collect_stats.run | table=%s", table_fqn)
442
+
443
+ t0 = time.perf_counter()
444
+ status = "ok"
445
+ error_short = None
446
+ retried_flag = False
447
+
448
+ try:
449
+ execute_query(f"COLLECT STATS ON {table_fqn}")
450
+ ok += 1
451
+ except Exception as e:
452
+ # First attempt failed; try generated statement(s)
453
+ error_short = str(e).split("\n")[0]
454
+ logger_safe("warning", "collect_stats.initial_fail | table=%s | err=%s", table_fqn, error_short)
455
+
456
+ try:
457
+ execute_query(query_collect_stats + f" ON {table_fqn}")
458
+ retried_flag = True
459
+ retried += 1
460
+
461
+ if query_collect_stats_extension is not None:
462
+ execute_query(query_collect_stats_extension + f" ON {table_fqn}")
463
+ except Exception as e2:
464
+ status = "failed"
465
+ error_short = str(e2).split("\n")[0]
466
+ failed += 1
467
+ logger_safe("error", "collect_stats.retry_fail | table=%s | err=%s", table_fqn, error_short)
468
+
469
+ dt = time.perf_counter() - t0
470
+ results.append(
471
+ {
472
+ "table": table_fqn,
473
+ "status": status,
474
+ "retried": retried_flag,
475
+ "elapsed_s": dt,
476
+ "error": error_short,
477
+ }
478
+ )
479
+
480
+ # --- Final summary -------------------------------------------------------
481
+ elapsed = time.perf_counter() - started
482
+ formatted = seconds_to_dhms(elapsed)
483
+
484
+ # Structured, parseable one-liner
485
+ logger_safe(
486
+ "info",
487
+ "collect_stats.summary | tables=%d | ok=%d | retried=%d | failed=%d | duration=%s (%.3fs)",
488
+ len(target_tables),
489
+ ok,
490
+ retried,
491
+ failed,
492
+ formatted,
493
+ elapsed,
494
+ )
418
495
 
419
- # If an extension query exists, execute it as well.
420
- if query_collect_stats_extension is not None:
421
- execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
496
+ return {
497
+ "total_tables": int(len(target_tables)),
498
+ "ok": int(ok),
499
+ "retried": int(retried),
500
+ "failed": int(failed),
501
+ "duration_seconds": float(elapsed),
502
+ "duration_hms": formatted,
503
+ "details": results,
504
+ }
422
505
 
423
- # Record the end time after query execution.
424
- end_time = time.time()
425
506
 
426
- # Calculate the elapsed time in seconds and format it into a human-readable format.
427
- elapsed_time = end_time - start_time
428
- formatted_elapsed_time = seconds_to_dhms(elapsed_time)
429
-
430
- # Log the execution time if logging is enabled.
431
- if tdfs4ds.DISPLAY_LOGS:
432
- print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
433
507
 
434
508
 
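A hedged usage sketch of the rewritten apply_collect_stats. It assumes a live Teradata connection and existing feature tables; the database and table names below are hypothetical, and the summary keys are those documented in the docstring above.

    import pandas as pd

    feature_infos = pd.DataFrame({
        "FEATURE_DATABASE": ["FS_DB", "FS_DB"],                        # hypothetical database
        "FEATURE_TABLE": ["FS_CUSTOMER_FEATURES", "FS_CUSTOMER_FEATURES"],
        "FEATURE_ID": [101, 102],
    })

    summary = apply_collect_stats(
        entity_id={"customer_id": "BIGINT"},       # dict keys are sorted and used for statement generation
        primary_index=None,
        partitioning=None,
        feature_infos=feature_infos,
    )
    print(summary["total_tables"], summary["ok"], summary["retried"], summary["failed"], summary["duration_hms"])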
435
509
  def _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute={},primary_index=None,
@@ -627,9 +701,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
627
701
  >>> store_feature(entity_id_dict, prepared_features)
628
702
  """
629
703
 
630
- #feature_catalog = tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FEATURE_CATALOG_NAME))
631
-
632
- if tdfs4ds.FEATURE_STORE_TIME == None:
704
+ # VALIDTIME handling
705
+ if tdfs4ds.FEATURE_STORE_TIME is None:
633
706
  validtime_statement = 'CURRENT VALIDTIME'
634
707
  validtime_statement2 = validtime_statement
635
708
  validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
@@ -638,180 +711,155 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
638
711
  validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
639
712
  validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"
640
713
 
641
- if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
642
- end_period_ = '9999-01-01 00:00:00'
643
- else:
644
- end_period_ = tdfs4ds.END_PERIOD
714
+ end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
645
715
 
646
- if tdfs4ds.DEBUG_MODE:
647
- print('tdfs4ds.FEATURE_STORE_TIME :' , tdfs4ds.FEATURE_STORE_TIME)
716
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
717
+ logger_safe("debug", "FEATURE_STORE_TIME=%s | END_PERIOD=%s", tdfs4ds.FEATURE_STORE_TIME, tdfs4ds.END_PERIOD)
718
+ logger_safe("debug", "entity_id=%s", entity_id)
648
719
 
720
+ # Entity id helpers
721
+ sorted_entity_id = sorted(list(entity_id.keys()))
722
+ ENTITY_ID = ','.join(sorted_entity_id)
649
723
 
650
- if tdfs4ds.DEBUG_MODE:
651
- print('entity_id :' , entity_id)
652
-
653
- sorted_entity_id = list(entity_id.keys())
654
- sorted_entity_id.sort()
655
- ENTITY_ID = ','.join([k for k in sorted_entity_id])
656
-
657
- count_features = pd.DataFrame(tdml.execute_sql(f"""
658
- SEL count(*) as NB_ROWS FROM
659
- {_get_database_username()}.
660
- {volatile_table_name}
661
- """).fetchall(), columns = ['NB_ROWS'])
662
-
663
- if tdfs4ds.DEBUG_MODE:
664
- print('count_features :' , count_features)
665
- print('features_infos :', features_infos)
724
+ # Count rows in volatile table
725
+ count_features = pd.DataFrame(
726
+ tdml.execute_sql(
727
+ f"""
728
+ SEL count(*) as NB_ROWS
729
+ FROM {_get_database_username()}.{volatile_table_name}
730
+ """
731
+ ).fetchall(),
732
+ columns=['NB_ROWS']
733
+ )
666
734
 
735
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
736
+ logger_safe("debug", "count_features=%s", count_features)
737
+ logger_safe("debug", "features_infos initial=%s", features_infos)
667
738
 
668
739
  if count_features.shape[0] > 0:
669
740
  features_infos['NB_ROWS'] = count_features['NB_ROWS'].values[0]
670
741
  else:
671
742
  features_infos['NB_ROWS'] = 0
672
743
 
673
- if tdfs4ds.DEBUG_MODE:
674
- print('features_infos :' , features_infos)
675
- # Group the target tables by feature table and feature database and count the number of occurrences
676
- target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
677
- ['FEATURE_TABLE', 'FEATURE_DATABASE']).sum().reset_index()
678
-
679
- if tdfs4ds.DEBUG_MODE:
680
- print('target_tables :' , target_tables)
681
- if tdfs4ds.DISPLAY_LOGS:
682
- display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
744
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
745
+ logger_safe("debug", "features_infos updated=%s", features_infos)
683
746
 
747
+ # Compute target tables
748
+ target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
749
+ ['FEATURE_TABLE', 'FEATURE_DATABASE']
750
+ ).sum().reset_index()
684
751
 
752
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
753
+ logger_safe("debug", "target_tables=%s", target_tables)
685
754
 
686
- sorted_entity_id = list(entity_id.keys())
687
- sorted_entity_id.sort()
755
+ # Optional display (keep existing UX semantics)
756
+ if getattr(tdfs4ds, "DISPLAY_LOGS", False):
757
+ try:
758
+ display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
759
+ except Exception as e:
760
+ logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])
688
761
 
689
762
  ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
690
-
691
763
  ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
692
- # Iterate over target tables and perform update and insert operations
693
-
694
-
695
- #query_collect_stats, query_collect_stats_extension = generate_collect_stats(sorted_entity_id,primary_index=primary_index, partitioning=partitioning)
696
-
697
764
 
698
765
  queries = []
699
- for i, row in features_infos.iterrows():
700
-
701
- features_infos_ = features_infos[(features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) & (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])]
702
- feature_id_list = ','.join([str(x) for x in list(set(features_infos_.FEATURE_ID.values))])
703
- feature_version_list = ','.join(["'"+x+"'" for x in list(set(features_infos_.FEATURE_VERSION.values))])
704
-
705
-
706
- nested_query = f"SEL * FROM {_get_database_username()}.{volatile_table_name} WHERE FEATURE_ID IN ({feature_id_list})"
766
+ for _, row in features_infos.iterrows():
767
+ features_infos_ = features_infos[
768
+ (features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) &
769
+ (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])
770
+ ]
771
+ feature_id_list = ','.join([str(x) for x in sorted(set(features_infos_.FEATURE_ID.values))])
772
+ feature_version_list = ','.join(["'" + x + "'" for x in sorted(set(features_infos_.FEATURE_VERSION.values))])
773
+
774
+ # Build nested query
707
775
  nested_query = f"""
708
776
  SEL
709
- {ENTITY_ID}
710
- , {row['FEATURE_ID']} AS FEATURE_ID
711
- , {row['FEATURE_NAME']} AS FEATURE_VALUE
712
- , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
713
- FROM {_get_database_username()}.{volatile_table_name}
777
+ {ENTITY_ID}
778
+ , {row['FEATURE_ID']} AS FEATURE_ID
779
+ , {row['FEATURE_NAME']} AS FEATURE_VALUE
780
+ , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
781
+ FROM {_get_database_username()}.{volatile_table_name}
714
782
  """
715
783
 
716
- if tdfs4ds.FEATURE_STORE_TIME == None:
784
+ if tdfs4ds.FEATURE_STORE_TIME is None:
717
785
  query_merge = f"""
718
786
  {validtime_statement}
719
- MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
720
-
787
+ MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
721
788
  USING ( {nested_query} ) NEW_FEATURES
722
- ON {ENTITY_ID_ON}
789
+ ON {ENTITY_ID_ON}
723
790
  AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
724
791
  AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
725
792
  AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
726
793
  AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
727
794
  AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
728
795
  WHEN MATCHED THEN
729
- UPDATE
730
- SET
731
- FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
796
+ UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
732
797
  WHEN NOT MATCHED THEN
733
798
  INSERT
734
- ({ENTITY_ID_SELECT},
735
- NEW_FEATURES.FEATURE_ID,
736
- NEW_FEATURES.FEATURE_VALUE,
737
- NEW_FEATURES.FEATURE_VERSION)
738
- --,
739
- --{validtime_start},
740
- --'{end_period_}')
799
+ (
800
+ {ENTITY_ID_SELECT},
801
+ NEW_FEATURES.FEATURE_ID,
802
+ NEW_FEATURES.FEATURE_VALUE,
803
+ NEW_FEATURES.FEATURE_VERSION
804
+ )
741
805
  """
742
806
  else:
743
807
  query_merge = f"""
744
808
  {validtime_statement}
745
- MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
809
+ MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
746
810
  USING ( {nested_query} ) NEW_FEATURES
747
- ON {ENTITY_ID_ON}
811
+ ON {ENTITY_ID_ON}
748
812
  AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
749
- AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
750
- AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
751
- AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
813
+ AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
814
+ AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
815
+ AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
752
816
  AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
753
817
  WHEN MATCHED THEN
754
- UPDATE
755
- SET
756
- FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
818
+ UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
757
819
  WHEN NOT MATCHED THEN
758
820
  INSERT
759
- ({ENTITY_ID_SELECT},
760
- NEW_FEATURES.FEATURE_ID,
761
- NEW_FEATURES.FEATURE_VALUE,
762
- NEW_FEATURES.FEATURE_VERSION,
763
- {validtime_start},
764
- '{end_period_}')
821
+ (
822
+ {ENTITY_ID_SELECT},
823
+ NEW_FEATURES.FEATURE_ID,
824
+ NEW_FEATURES.FEATURE_VALUE,
825
+ NEW_FEATURES.FEATURE_VERSION,
826
+ {validtime_start},
827
+ '{end_period_}'
828
+ )
765
829
  """
766
830
 
767
- entity_id_str = ', \n'.join([k for k in sorted_entity_id])
768
- if tdfs4ds.DEBUG_MODE: print(
769
- f'merge feature values of new {entity_id_str} combinations in {row.iloc[1]}.{row.iloc[0]}')
770
- if tdfs4ds.DEBUG_MODE:
771
- print(query_merge)
831
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
832
+ entity_id_str = ', '.join(sorted_entity_id)
833
+ logger_safe(
834
+ "debug",
835
+ "Merging feature values for entity keys (%s) into %s.%s",
836
+ entity_id_str, row['FEATURE_DATABASE'], row['FEATURE_TABLE']
837
+ )
838
+ logger_safe("debug", "Query (truncated): %s", "\n".join(query_merge.splitlines()[:12]) + "\n...")
772
839
 
773
840
  queries.append(query_merge)
774
841
 
775
- query_merge = '; \n'.join(queries)
776
842
  try:
777
- # Record the end time
778
843
  start_time = time.time()
779
844
 
780
845
  for q in queries:
781
- if tdfs4ds.DEBUG_MODE:
782
- print(q.split('\n')[0:3])
783
- # Execute the SQL query to create the volatile table.
846
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
847
+ logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
784
848
  execute_query(q)
785
- #execute_query(query_merge)
786
- # Record the end time
787
- end_time = time.time()
788
849
 
789
- # Calculate the elapsed time in seconds
790
- elapsed_time = end_time - start_time
850
+ elapsed_time = time.time() - start_time
791
851
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
792
- if tdfs4ds.DISPLAY_LOGS:
793
- print(f'Storage of the prepared features - merge only : {formatted_elapsed_time} ({elapsed_time}s)')
852
+ logger_safe(
853
+ "info",
854
+ "Storage of prepared features (merge-only) completed in %s (%.3fs)",
855
+ formatted_elapsed_time, elapsed_time
856
+ )
794
857
  except Exception as e:
795
- print(str(e))
858
+ logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
796
859
  raise
797
860
 
798
- # # Record the end time
799
- # start_time = time.time()
800
- # for i, row in features_infos.iterrows():
801
- # execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
802
- # #print(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
803
- # if query_collect_stats_extension is not None:
804
- # execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
805
- # #print(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
806
- # # Record the end time
807
- # end_time = time.time()
808
- #
809
- # # Calculate the elapsed time in seconds
810
- # elapsed_time = end_time - start_time
811
- # formatted_elapsed_time = seconds_to_dhms(elapsed_time)
812
- # if tdfs4ds.DISPLAY_LOGS:
813
- # print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
814
861
  return
862
+
815
863
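To make the merge path concrete, here is roughly what one generated statement looks like for a single entity key and a single feature under the CURRENT VALIDTIME branch above. Database, table, column, and version names are hypothetical, and the text is an abridged rendering of the template above rather than output captured from an actual run.

    # Hypothetical rendering of one query_merge for entity key customer_id,
    # feature column age (FEATURE_ID 101, version 'V1'), CURRENT VALIDTIME branch.
    example_query_merge = """
    CURRENT VALIDTIME
    MERGE INTO FS_DB.FS_CUSTOMER_FEATURES EXISTING_FEATURES
    USING (
        SEL customer_id
            , 101 AS FEATURE_ID
            , age AS FEATURE_VALUE
            , 'V1' AS FEATURE_VERSION
        FROM USER_DB.temp_staging_table   -- placeholder for <user database>.<volatile table>
    ) NEW_FEATURES
    ON NEW_FEATURES.customer_id = EXISTING_FEATURES.customer_id
       AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
       AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
       AND NEW_FEATURES.FEATURE_ID IN (101)
       AND EXISTING_FEATURES.FEATURE_ID IN (101)
       AND EXISTING_FEATURES.FEATURE_VERSION IN ('V1')
    WHEN MATCHED THEN
        UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
    WHEN NOT MATCHED THEN
        INSERT (NEW_FEATURES.customer_id,
                NEW_FEATURES.FEATURE_ID,
                NEW_FEATURES.FEATURE_VALUE,
                NEW_FEATURES.FEATURE_VERSION)
    """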
  def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
816
864
  partitioning='', features_infos = None, **kwargs):
817
865
  """
@@ -832,27 +880,47 @@ def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},pr
832
880
  >>> store_feature(entity_id_dict, prepared_features)
833
881
  """
834
882
 
835
- # Record the start time
836
883
  start_time = time.time()
837
884
 
885
+ # Choose storage strategy
838
886
  if tdfs4ds.STORE_FEATURE == 'UPDATE_INSERT':
839
- _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute=entity_null_substitute,primary_index=primary_index,
840
- partitioning=partitioning, features_infos=features_infos, **kwargs)
887
+ logger_safe("info", "Storing features using UPDATE/INSERT strategy.")
888
+ _store_feature_update_insert(
889
+ entity_id,
890
+ volatile_table_name,
891
+ entity_null_substitute=entity_null_substitute,
892
+ primary_index=primary_index,
893
+ partitioning=partitioning,
894
+ features_infos=features_infos,
895
+ **kwargs
896
+ )
841
897
  elif tdfs4ds.STORE_FEATURE == 'MERGE':
842
- _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=entity_null_substitute,primary_index=primary_index,
843
- partitioning=partitioning, features_infos=features_infos, **kwargs)
898
+ logger_safe("info", "Storing features using MERGE strategy.")
899
+ _store_feature_merge(
900
+ entity_id,
901
+ volatile_table_name,
902
+ entity_null_substitute=entity_null_substitute,
903
+ primary_index=primary_index,
904
+ partitioning=partitioning,
905
+ features_infos=features_infos,
906
+ **kwargs
907
+ )
844
908
  else:
845
- # Handle other conditions or operations as required
846
- pass
847
-
848
- # Record the end time
849
- end_time = time.time()
850
-
851
- # Calculate the elapsed time in seconds
852
- elapsed_time = end_time - start_time
909
+ logger_safe(
910
+ "warning",
911
+ "Unknown STORE_FEATURE strategy '%s'. No storage operation was performed.",
912
+ tdfs4ds.STORE_FEATURE
913
+ )
914
+
915
+ # Log duration
916
+ elapsed_time = time.time() - start_time
853
917
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
854
- if tdfs4ds.DISPLAY_LOGS:
855
- print(f'Storage of the prepared features : {formatted_elapsed_time} ({elapsed_time}s)')
918
+ logger_safe(
919
+ "info",
920
+ "Storage of prepared features completed in %s (%.3fs)",
921
+ formatted_elapsed_time,
922
+ elapsed_time
923
+ )
856
924
 
857
925
  def prepare_feature_ingestion_tdstone2(df, entity_id):
858
926
  """