tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +586 -564
- tdfs4ds/feature_store/feature_data_processing.py +367 -299
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +226 -231
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +2 -2
- tdfs4ds/utils/filter_management.py +521 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +547 -97
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/RECORD +15 -15
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/top_level.txt +0 -0
tdfs4ds/process_store/process_followup.py
@@ -1,6 +1,7 @@
 import tdfs4ds
 from tdfs4ds.utils.query_management import execute_query_wrapper
 import teradataml as tdml
+from tdfs4ds import logger_safe, logger

 @execute_query_wrapper
 def follow_up_table_creation():
@@ -194,5 +195,115 @@ def followup_close(run_id, process_type, process_id, status='COMPLETED', filterm
            raise
    return query

-
-
+from typing import Optional
+
+def follow_up_report(filtermanager: Optional[object] = None, process_id: Optional[str] = None):
+    """
+    Return a follow-up report as a `tdml.DataFrame`, optionally filtered by
+    `process_id` and/or a `filtermanager`'s applied filter.
+
+    Behavior by arguments:
+    - process_id is None and filtermanager is None:
+      Return all rows from SCHEMA.FOLLOW_UP_NAME, sorted by START_DATETIME desc.
+    - process_id is not None and filtermanager is None:
+      Return rows for the given PROCESS_ID.
+    - process_id is not None and filtermanager is not None:
+      Return rows for the given PROCESS_ID whose APPLIED_FILTER matches the
+      JSON_AGG of `filtermanager`'s columns coming from its schema/view.
+    - process_id is None and filtermanager is not None:
+      Return rows whose APPLIED_FILTER matches the JSON_AGG of `filtermanager`
+      (no PROCESS_ID constraint).
+
+    Args:
+        filtermanager: An object exposing `col_names`, `schema_name`, and `view_name`.
+            Its columns are aggregated via `JSON_AGG(col1, col2, ...)` to compare
+            against A.APPLIED_FILTER.
+        process_id: Optional process identifier used to filter by PROCESS_ID.
+
+    Returns:
+        tdml.DataFrame: The resulting dataframe sorted by START_DATETIME (descending).
+
+    Raises:
+        ValueError: If `filtermanager` is provided but is missing required attributes
+            or has an empty `col_names` list.
+        RuntimeError: If the query fails.
+    """
+    logger_safe("debug", "follow_up_report called with process_id=%s, filtermanager=%s",
+                process_id, type(filtermanager).__name__ if filtermanager else None)
+
+    table_fqn = f"{tdfs4ds.SCHEMA}.{tdfs4ds.FOLLOW_UP_NAME}"
+
+    # Case 1: No filters at all -> return full table
+    if process_id is None and filtermanager is None:
+        logger_safe("info", "Returning all follow-up rows (no filters).")
+        try:
+            return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FOLLOW_UP_NAME)) \
+                .sort('START_DATETIME', ascending=False)
+        except Exception as e:
+            logger_safe("error", "Failed to fetch all follow-up rows: %s", e)
+            raise RuntimeError("Database query failed while fetching follow-up report.") from e
+
+    # Helper to build the FILTER_MANAGER scalar subquery when filtermanager is provided
+    def _build_filter_manager_subquery(fm: object) -> str:
+        required_attrs = ("col_names", "schema_name", "view_name")
+        if not all(hasattr(fm, a) for a in required_attrs):
+            raise ValueError("filtermanager must have col_names, schema_name, and view_name.")
+        if not getattr(fm, "col_names", None):
+            raise ValueError("filtermanager.col_names must be a non-empty list.")
+
+        json_cols = ",".join(fm.col_names)
+        subq = f"""
+        (
+            SELECT JSON_AGG({json_cols}) AS APPLIED_FILTER
+            FROM {fm.schema_name}.{fm.view_name}
+        ) FILTER_MANAGER
+        """
+        logger_safe("debug", "Constructed FILTER_MANAGER subquery with columns: %s", json_cols)
+        return subq
+
+    # Defensive escaping for process_id if used in a literal (prefer bind params if available)
+    def _escape_literal(val: str) -> str:
+        return val.replace("'", "''")
+
+    try:
+        # Case 2: process_id only
+        if process_id is not None and filtermanager is None:
+            pid = _escape_literal(process_id)
+            query = f"""
+            SELECT *
+            FROM {table_fqn}
+            WHERE PROCESS_ID = '{pid}'
+            """
+            logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID only.")
+            return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+        # Case 3: filtermanager only
+        if process_id is None and filtermanager is not None:
+            subq = _build_filter_manager_subquery(filtermanager)
+            query = f"""
+            SELECT A.*
+            FROM {table_fqn} A,
+            {subq}
+            WHERE CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+                  CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+            """
+            logger_safe("info", "Fetching follow-up rows filtered by FILTER_MANAGER only.")
+            return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+        # Case 4: both process_id and filtermanager
+        pid = _escape_literal(process_id)  # type: ignore[arg-type]
+        subq = _build_filter_manager_subquery(filtermanager)  # type: ignore[arg-type]
+        query = f"""
+        SELECT A.*
+        FROM {table_fqn} A,
+        {subq}
+        WHERE A.PROCESS_ID = '{pid}'
+          AND CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+              CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+        """
+        logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID and FILTER_MANAGER.")
+        return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)

+    except Exception as e:
+        logger_safe("error", "Failed to fetch follow-up report: %s", e)
+        raise RuntimeError("Database query failed while fetching follow-up report.") from e
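For orientation, a minimal usage sketch of the new `follow_up_report` API, covering the four argument combinations its docstring describes. The `DemoFilterManager` class and all schema, view, and column names below are hypothetical stand-ins; any object exposing `col_names`, `schema_name`, and `view_name` satisfies the duck-typed contract checked by `_build_filter_manager_subquery`.

```python
# Hypothetical usage sketch; assumes an active teradataml connection and a
# configured tdfs4ds.SCHEMA / tdfs4ds.FOLLOW_UP_NAME.
from tdfs4ds.process_store.process_followup import follow_up_report

class DemoFilterManager:                      # hypothetical stand-in
    col_names = ["COUNTRY", "SEGMENT"]        # columns passed to JSON_AGG(...)
    schema_name = "MY_SCHEMA"                 # assumed schema holding the filter view
    view_name = "MY_FILTER_VIEW"              # assumed view of currently applied filters

df_all  = follow_up_report()                                    # all runs, newest first
df_pid  = follow_up_report(process_id="1234-abcd")              # one process only
df_flt  = follow_up_report(filtermanager=DemoFilterManager())   # filter match only
df_both = follow_up_report(process_id="1234-abcd",
                           filtermanager=DemoFilterManager())   # both constraints
```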
tdfs4ds/process_store/process_query_administration.py
@@ -28,7 +28,7 @@ def list_processes():
        return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW))
    except Exception as e:
        print(str(e))
-        print(
+        print(tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW)).show_query())

 def list_processes_feature_split():
    """
tdfs4ds/process_store/process_registration_management.py
@@ -3,6 +3,7 @@ import tdfs4ds
 from tdfs4ds.utils.query_management import execute_query_wrapper
 import uuid
 import json
+from tdfs4ds import logger,logger_safe

 @execute_query_wrapper
 def register_process_view(view_name, entity_id, feature_names, metadata={}, entity_null_substitute = {}, **kwargs):
@@ -74,80 +75,91 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
    - Requires 'tdml' module for DataFrame operations and 'uuid' for generating unique identifiers.
    """

-
-
-    if type(view_name) == tdml.dataframe.dataframe.DataFrame:
+    # Handle teradataml DataFrame input
+    if isinstance(view_name, tdml.dataframe.dataframe.DataFrame):
        try:
            view_name = view_name._table_name
-        except:
-
-
+        except Exception:
+            logger_safe(
+                "error",
+                "Invalid DataFrame for view registration. Use: tdml.DataFrame(<table/view>). Crystallize if needed."
+            )
            raise

+    # Prevent using temporary teradataml views
    if view_name.split('.')[1].startswith('ml__'):
-
-
-
-
+        logger_safe(
+            "error",
+            "Invalid view name '%s': starts with 'ml__'. Please crystallize your view first.",
+            view_name
+        )
+        raise ValueError("Invalid process view name: temporary teradataml views are not allowed.")
+
+    # Get optional arguments
    filtermanager = kwargs.get('filtermanager', None)
-
-
-
-    # Get data distribution related inputs:
-    primary_index = kwargs.get('primary_index', [e for e in entity_id.keys()])
+    query_upsert_filtermanager = None
+    primary_index = kwargs.get('primary_index', list(entity_id.keys()))
    partitioning = kwargs.get('partitioning', '').replace("'", '"')

    if primary_index is None:
-        primary_index =
+        primary_index = list(entity_id.keys())

+    feature_names = ','.join(feature_names)

+    # Validtime period
+    end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
+    validtime_statement = (
+        'CURRENT VALIDTIME'
+        if tdfs4ds.FEATURE_STORE_TIME is None
+        else f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+    )

-
-    feature_names = ','.join(feature_names)
+    logger_safe("info", "Registering process view: %s", view_name)

-    #
-
-
-
-
+    # Check if view already exists in catalog
+    query_process_id = f"""
+    SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+    WHERE view_name = '{view_name}'
+    """
+    process_id_result = tdml.execute_sql(query_process_id).fetchall()

-    if
-
-
-        validtime_statement = f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+    if process_id_result:
+        process_id = process_id_result[0][0]
+        logger_safe("info", "Updating existing process_id=%s", process_id)

+        query_feature_version = f"""
+        SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+        WHERE view_name = '{view_name}'
+        """
+        feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]

-
-
-
-
-
-
-
-
-        if len(query_primary_index_res)>0:
-            FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = tdml.execute_sql(query_primary_index).fetchall()[0]
+        query_primary_index = f"""
+        SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING
+        FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME}
+        WHERE process_id = '{process_id}'
+        """
+        dist_res = tdml.execute_sql(query_primary_index).fetchall()
+        if dist_res:
+            FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = dist_res[0]
        else:
-
-
-
-
-
-            ""
+            logger_safe(
+                "error",
+                "Missing data distribution info for existing process %s. Check distribution table.",
+                process_id
+            )
+            raise ValueError("Missing distribution info.")
    else:
-        # Generating a unique process identifier
        process_id = str(uuid.uuid4())
        feature_version = 1
        FOR_PRIMARY_INDEX = ",".join(primary_index)
        FOR_DATA_PARTITIONING = partitioning
+        logger_safe("info", "Generated new process_id=%s", process_id)

-    #
-
-
-
+    # Build entity_id string
+    ENTITY_ID__ = ','.join(sorted(entity_id.keys()))
+    logger_safe("debug", "Entity IDs: %s", ENTITY_ID__)
+    logger_safe("debug", "Feature names: %s", feature_names)

-    print('feature_version :',feature_version)
-    print('int(feature_version) :', int(feature_version))
    if tdfs4ds.FEATURE_STORE_TIME == None:

@@ -402,16 +414,16 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
    """


-
-
-
-    print(f"to update your dataset : dataset = run(process_id='{process_id}',return_dataset=True)")
+    logger_safe("info", "Process registered: process_id=%s", process_id)
+    logger_safe("info", "To rerun: run(process_id='%s')", process_id)
+    logger_safe("info", "To build dataset: dataset = run(process_id='%s', return_dataset=True)", process_id)

-    #
+    # Return queries
    if kwargs.get('with_process_id'):
        return query_upsert, process_id, query_upsert_dist, query_upsert_filtermanager
    else:
        return query_upsert, query_upsert_dist, query_upsert_filtermanager
+
 @execute_query_wrapper
 def _register_process_view_update_insert(view_name, entity_id, feature_names, metadata={}, entity_null_substitute={}, **kwargs):
    """
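As a usage illustration of the registration path touched above, a hedged sketch of calling the public `register_process_view` with the kwargs this diff handles (`primary_index`, `partitioning`, `with_process_id`, `filtermanager`). The entity and feature names are invented for the example, and the view must be a crystallized one (no `ml__` prefix):

```python
# Hedged sketch; table/column names are hypothetical and a configured
# feature store (tdfs4ds.SCHEMA, process catalog, distribution table) is assumed.
from tdfs4ds.process_store.process_registration_management import register_process_view

register_process_view(
    view_name="MY_SCHEMA.CUSTOMER_FEATURES_V",     # crystallized view, not an ml__ temp view
    entity_id={"CUSTOMER_ID": "BIGINT"},           # keys become the default primary index
    feature_names=["TX_COUNT_30D", "AVG_BASKET"],  # joined with ',' internally
    primary_index=["CUSTOMER_ID"],                 # optional kwarg, per the diff
    partitioning="",                               # stored as FOR_DATA_PARTITIONING
)
```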
tdfs4ds/process_store/process_store_catalog_management.py
@@ -208,7 +208,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
    (
        PROCESS_ID VARCHAR(36) NOT NULL,
        FOR_PRIMARY_INDEX VARCHAR(2048),
-        FOR_DATA_PARTITIONING VARCHAR(
+        FOR_DATA_PARTITIONING VARCHAR(32000),
        ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
        ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
        PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
@@ -227,7 +227,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
    (
        PROCESS_ID VARCHAR(36) NOT NULL,
        FOR_PRIMARY_INDEX VARCHAR(2048),
-        FOR_DATA_PARTITIONING VARCHAR(
+        FOR_DATA_PARTITIONING VARCHAR(32000)
    )
    PRIMARY INDEX (PROCESS_ID);
    """