tdfs4ds 0.2.4.41__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,5 @@
- __version__ = '0.2.4.41'
+ __version__ = '0.2.5.1'
+ import difflib
  import logging
  import json
 
@@ -23,6 +24,7 @@ logger = logging.getLogger(__name__)
  from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
  from tdfs4ds.process_store.process_followup import follow_up_report
  from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
+ from . import genai
 
  DATA_DOMAIN = None
  SCHEMA = None
@@ -55,6 +57,18 @@ FEATURE_PARTITION_EACH = 1
 
  VARCHAR_SIZE = 1024
 
+ INSTRUCT_MODEL_URL = None
+ INSTRUCT_MODEL_API_KEY = None
+ INSTRUCT_MODEL_MODEL = None
+ INSTRUCT_MODEL_PROVIDER = None
+
+ DOCUMENTATION_PROCESS_BUSINESS_LOGIC = 'FS_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+ DOCUMENTATION_PROCESS_FEATURES = 'FS_PROCESS_DOCUMENTATION_FEATURES'
+ DOCUMENTATION_PROCESS_BUSINESS_LOGIC_VIEW = 'FS_V_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+ DOCUMENTATION_PROCESS_FEATURES_VIEW = 'FS_V_PROCESS_DOCUMENTATION_FEATURES'
+ DOCUMENTATION_PROCESS_EXPLAIN = 'FS_PROCESS_DOCUMENTATION_EXPLAIN'
+ DOCUMENTATION_PROCESS_EXPLAIN_VIEW = 'FS_V_PROCESS_DOCUMENTATION_EXPLAIN'
+
  import warnings
  warnings.filterwarnings('ignore')
 
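The new INSTRUCT_MODEL_* globals hold the connection details of the instruct LLM used by the genai documentation helpers, and the DOCUMENTATION_PROCESS_* constants name the tables and views where generated documentation is stored. A minimal sketch of how a session might configure them; the endpoint, key and model values are placeholders, only the attribute names come from this diff:

    import tdfs4ds

    # Placeholder values: point the documentation helpers at an LLM endpoint.
    tdfs4ds.INSTRUCT_MODEL_PROVIDER = 'openai'
    tdfs4ds.INSTRUCT_MODEL_URL      = 'https://llm.example.com/v1'
    tdfs4ds.INSTRUCT_MODEL_API_KEY  = 'my-api-key'
    tdfs4ds.INSTRUCT_MODEL_MODEL    = 'my-instruct-model'

    # Default names of the documentation tables and views, as defined above.
    print(tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC, tdfs4ds.DOCUMENTATION_PROCESS_FEATURES_VIEW)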
@@ -152,6 +166,11 @@ def setup(database, if_exists='fail'):
      logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
      logger_safe("info", "Setup complete.")
+     try:
+         tdfs4ds.genai.documentations_tables_creation()
+         logger_safe("info", "Documentation tables created successfully.")
+     except Exception as e:
+         logger_safe("error", "Documentation tables creation failed: %s", str(e).split('\n')[0])
      return
 
 
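setup() now also tries to create the genai documentation tables and only logs a failure instead of raising. A minimal usage sketch, assuming an open teradataml connection and a database you own (the name is a placeholder):

    import tdfs4ds

    # Creates the catalogs, the dataset catalog and, with this release, the documentation tables.
    tdfs4ds.setup(database='MY_FS_DB')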
@@ -165,6 +184,9 @@ def connect(
      feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
      process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
      dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
+     documentation_process_business_logic = tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC,
+     documentation_process_features = tdfs4ds.DOCUMENTATION_PROCESS_FEATURES,
+     documentation_process_explain = tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN,
      create_if_missing = False
  ):
      if database is None:
@@ -179,20 +201,31 @@ def connect(
      distrib_exists = data_distribution_name.lower() in tables
      filter_manager_exists = filter_manager_name.lower() in tables
      followup_name_exists = followup_name.lower() in tables
+     documentation_process_business_logic_exist = documentation_process_business_logic.lower() in tables
+     documentation_process_features_exist = documentation_process_features.lower() in tables
+     documentation_process_explain_exist = documentation_process_explain.lower() in tables
 
-     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
+
+     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists and documentation_process_business_logic_exist and documentation_process_features_exist):
          if not create_if_missing:
              logger_safe("warning", "Feature store components missing and create_if_missing=False")
              return False
          logger_safe("info", "Missing components detected; creating missing parts...")
          if not feature_exists:
+             logger_safe("info", "Creating feature catalog: %s", feature_catalog_name)
              tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
          if not process_exists:
+             logger_safe("info", "Creating process catalog: %s", process_catalog_name)
              tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
          if not distrib_exists:
+             logger_safe("info", "Creating data distribution table: %s", data_distribution_name)
              tdfs4ds.data_distribution.data_distribution_catalog_creation()
          if not filter_manager_exists:
+             logger_safe("info", "Creating filter manager table: %s", filter_manager_name)
              tdfs4ds.filter_manager.filter_manager_catalog_creation()
+         if not documentation_process_business_logic_exist or not documentation_process_features_exist or not documentation_process_explain_exist:
+             logger_safe("info", "Creating documentation tables.")
+             tdfs4ds.genai.documentation_tables_creation()
 
      if not followup_name_exists:
          logger_safe("info", "Creating follow-up table: %s", followup_name)
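connect() now also checks for the documentation tables and, when create_if_missing=True, creates them together with the other missing components. A short sketch, with a placeholder database name:

    import tdfs4ds

    # Returns False when components are missing and create_if_missing is left at False.
    ok = tdfs4ds.connect(database='MY_FS_DB', create_if_missing=True)
    if not ok:
        raise RuntimeError('feature store components are missing')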
@@ -229,12 +262,111 @@ def connect(
          schema_name=tdfs4ds.SCHEMA,
          object_type='table'
      )
+
+     query_data_domain = f"""
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+     UNION
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+     """
+     data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+     logger_safe("info", "Data domains in feature store: %s", data_domains)
 
      tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
      logger_safe("info", "Connected to feature store successfully.")
      return True
 
+ def get_data_domains(verbose=True):
+     """
+     Retrieve and display all data domains available in the feature store.
+     This function queries the feature store to obtain a list of all distinct data domains
+     that have been defined within the system. It combines data domains from both the process
+     catalog and the feature catalog, ensuring a comprehensive overview. The current data
+     domain in use is highlighted for easy identification.
+     Parameters:
+     - verbose (bool): If True, prints the list of data domains with the current one marked.
+     Returns:
+     - str: The current data domain in use.
+     """
 
+     query_data_domain = f"""
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+     UNION
+     SELECT DISTINCT DATA_DOMAIN
+     FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+     """
+     data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+
+     if verbose:
+         print("Data Domains in Feature Store:")
+         for d in data_domains:
+             if d != tdfs4ds.DATA_DOMAIN:
+                 print('\t'+d)
+             else:
+                 print('*\t'+d)
+         if tdfs4ds.DATA_DOMAIN not in data_domains and tdfs4ds.DATA_DOMAIN is not None:
+             print("\nCurrent data domain (%s) not available yet in feature store. It may be a new one" % tdfs4ds.DATA_DOMAIN)
+         return
+     return data_domains
+
+ def select_data_domain(data_domain):
+     """
+     Set the active data domain for feature store operations.
+
+     This function allows users to specify which data domain should be considered
+     as the current context for subsequent feature store operations. By setting
+     the data domain, users can ensure that all feature queries, registrations,
+     and other interactions with the feature store are scoped appropriately.
+     This is particularly useful in environments where multiple data domains
+     exist, allowing for clear separation and organization of features.
+
+     Parameters:
+     - data_domain (str): The name of the data domain to set as active.
+
+     Returns:
+     - str: The data domain that has been set as active.
+     """
+     data_domains = get_data_domains(verbose=False)
+     if data_domain not in data_domains:
+         logger_safe("error", "Data domain '%s' not found in feature store.", data_domain)
+         raise ValueError(f"Data domain '{data_domain}' not found in feature store.")
+     #suggest a data domain closest to the requested one
+     closest_domain = difflib.get_close_matches(data_domain, data_domains, n=1)
+     if data_domain in data_domains:
+         tdfs4ds.DATA_DOMAIN = data_domain
+     elif closest_domain:
+         logger_safe("info", "Did you mean '%s'?", closest_domain[0])
+         return
+     tdfs4ds.DATA_DOMAIN = data_domain
+     logger_safe("info", "Data domain set to: %s", data_domain)
+     return
+
+ def create_data_domain(data_domain):
+     """
+     Create a new data domain in the feature store.
+
+     This function facilitates the creation of a new data domain within the feature store.
+     A data domain serves as a logical grouping for features, allowing for better organization
+     and management. By creating a new data domain, users can segregate features based on
+     specific criteria, such as business units, projects, or data types. This helps in
+     maintaining clarity and structure within the feature store, especially in environments
+     with diverse datasets and use cases.
+
+     Parameters:
+     - data_domain (str): The name of the new data domain to be created.
+
+     Returns:
+     - str: The name of the newly created data domain.
+     """
+     existing_domains = get_data_domains(verbose=False)
+     if data_domain in existing_domains:
+         logger_safe("warning", "Data domain '%s' already exists in feature store.", data_domain)
+         return data_domain
+     tdfs4ds.DATA_DOMAIN = data_domain
+     logger_safe("info", "Data domain '%s' created in locally.", data_domain)
+     return
 
 
  def feature_catalog():
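The three new module-level helpers make the data-domain context explicit. A usage sketch with placeholder domain names:

    import tdfs4ds

    tdfs4ds.get_data_domains()                          # prints all domains, the current one starred
    domains = tdfs4ds.get_data_domains(verbose=False)   # returns the list instead of printing

    tdfs4ds.select_data_domain('SALES')      # raises ValueError if 'SALES' is not in the catalogs
    tdfs4ds.create_data_domain('MARKETING')  # only sets tdfs4ds.DATA_DOMAIN locally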
@@ -287,7 +419,7 @@ def get_dataset_entity(dataset_id = None):
  def get_dataset_features(dataset_id = None):
      return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
- def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
+ def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None, dataset_view_name=None):
      """
      Executes a specific process from the feature store identified by the process ID.
      Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
@@ -383,7 +515,8 @@ def run(process_id, return_dataset=False, force_compute=False, force_varchar_len
              entity_null_substitute=entity_null_substitute,
              process_id=process_id,
              force_compute=force_compute,
-             force_varchar_length=force_varchar_length
+             force_varchar_length=force_varchar_length,
+             dataset_view_name = dataset_view_name
          )
 
      # Handling 'tdstone2 view' process type
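run() now accepts an optional dataset_view_name that is forwarded to the ingestion path, so the validation dataset can be published under a chosen view name. A sketch with placeholder identifiers:

    import tdfs4ds

    dataset = tdfs4ds.run(
        process_id='a1b2c3d4-...-process-id',      # placeholder: an existing process id
        return_dataset=True,
        dataset_view_name='MY_VALIDATION_DATASET'  # new in this release
    )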
@@ -413,7 +546,8 @@ def upload_features(
      filtermanager=None,
      entity_null_substitute={},
      force_compute=True,
-     force_varchar_length=1024
+     force_varchar_length=1024,
+     dataset_view_name = None
  ):
      """
      Uploads feature data from a DataFrame to the feature store for a specified entity.
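upload_features() gains the same dataset_view_name pass-through. A sketch assuming its leading parameters are df, entity_id and feature_names, as in _upload_features further down (all names are placeholders):

    import tdfs4ds

    dataset = tdfs4ds.upload_features(
        df,                                       # a teradataml DataFrame holding the features
        entity_id=['CUSTOMER_ID'],                # assumed entity-key specification
        feature_names=['TOTAL_SPEND', 'NB_ORDERS'],
        dataset_view_name='CUSTOMER_FEATURES_V1'  # new in this release
    )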
@@ -518,19 +652,21 @@ def upload_features(
      try:
          if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
              dataset = run(
-                 process_id=process_id,
-                 return_dataset=True,
-                 force_compute=force_compute,
-                 force_varchar_length=force_varchar_length
+                 process_id = process_id,
+                 return_dataset = True,
+                 force_compute = force_compute,
+                 force_varchar_length = force_varchar_length,
+                 dataset_view_name = dataset_view_name
              )
              logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
              return dataset
          else:
              run(
-                 process_id=process_id,
-                 return_dataset=False,
-                 force_compute=force_compute,
-                 force_varchar_length=force_varchar_length
+                 process_id = process_id,
+                 return_dataset = False,
+                 force_compute = force_compute,
+                 force_varchar_length = force_varchar_length,
+                 dataset_view_name = dataset_view_name
              )
              logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
              return
@@ -539,10 +675,10 @@ def upload_features(
          # Keep your existing follow-up close behavior, but ensure the error is logged.
          try:
              tdfs4ds.process_store.process_followup.followup_close(
-                 run_id = tdfs4ds.RUN_ID,
-                 process_type = tdfs4ds.PROCESS_TYPE,
-                 process_id = process_id,
-                 status = 'FAILED,' + str(e).split('\n')[0]
+                 run_id = tdfs4ds.RUN_ID,
+                 process_type = tdfs4ds.PROCESS_TYPE,
+                 process_id = process_id,
+                 status = 'FAILED,' + str(e).split('\n')[0]
              )
          finally:
              logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
@@ -557,11 +693,12 @@ def upload_features(
 
  def _upload_features(
      df, entity_id, feature_names,
-     feature_versions=FEATURE_VERSION_DEFAULT,
-     primary_index=None, partitioning='',
-     filtermanager=None, entity_null_substitute={},
-     process_id=None, force_compute=False,
-     force_varchar_length=None
+     feature_versions = FEATURE_VERSION_DEFAULT,
+     primary_index = None, partitioning = '',
+     filtermanager = None, entity_null_substitute = {},
+     process_id = None, force_compute = False,
+     force_varchar_length = None,
+     dataset_view_name = None
  ):
      """
      Uploads a set of features into the Feature Store for a given entity.
@@ -708,6 +845,7 @@ def _upload_features(
      ]
 
      if filtermanager is None:
+         dataset_created = False
          do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
          if not do_compute and not force_compute:
              logger_safe(
@@ -730,8 +868,12 @@ def _upload_features(
                  entity_null_substitute=entity_null_substitute,
                  partitioning=partitioning
              )
-             store_feature(entity_id, volatile_table, entity_null_substitute,
+
+             count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                  primary_index, partitioning, features_infos)
+
+
+
              apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
              tdfs4ds.process_store.process_followup.followup_close(
@@ -740,6 +882,20 @@ def _upload_features(
                  process_id=process_id
              )
              logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
+             # Build dataset for validation if enabled
+             if tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None:
+                 logger_safe("info", "Building dataset for validation...")
+                 try:
+                     dataset = build_dataset(
+                         entity_id, selected_features,
+                         view_name = dataset_view_name
+                     )
+                     dataset_created = True
+                 except Exception as e:
+                     logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                     logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+             else:
+                 logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
 
          except Exception as e:
              logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
@@ -762,7 +918,7 @@ def _upload_features(
              unit="filter",
              leave=False
          )
-
+         dataset_created = False
          for i in pbar:
              filter_id = i + 1
              filtermanager.update(filter_id)
@@ -834,7 +990,7 @@ def _upload_features(
                      partitioning = partitioning
                  )
 
-                 store_feature(entity_id, volatile_table, entity_null_substitute,
+                 count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                      primary_index, partitioning, features_infos)
 
                  something_computed = True
@@ -846,6 +1002,21 @@ def _upload_features(
                      filtermanager = filtermanager
                  )
 
+                 # Build dataset for validation if enabled
+                 if (tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None) and dataset_created==False:
+                     logger_safe("info", "Building dataset for validation...")
+                     try:
+                         dataset = build_dataset(
+                             entity_id, selected_features,
+                             view_name = dataset_view_name
+                         )
+                         dataset_created = True
+                     except Exception as e:
+                         logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                         logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+                 else:
+                     logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+
              except Exception as e:
                  logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                  tdfs4ds.process_store.process_followup.followup_close(
@@ -860,19 +1031,24 @@ def _upload_features(
          if something_computed:
              apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-         if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
-             logger_safe("info", "Building dataset for validation...")
-             try:
-                 return build_dataset(
-                     entity_id, selected_features,
-                     view_name=None,
-                     entity_null_substitute=entity_null_substitute
-                 )
-             except Exception as e:
-                 logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
-                 logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
-         else:
-             logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
+         if dataset_created == False and tdfs4ds.BUILD_DATASET_AT_UPLOAD and dataset_view_name == None:
+             logger_safe("info", "Building dataset for validation...")
+             try:
+                 dataset = build_dataset(
+                     entity_id, selected_features,
+                     view_name = dataset_view_name
+                 )
+                 return dataset
+             except Exception as e:
+                 logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                 logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+         else:
+             if tdfs4ds.BUILD_DATASET_AT_UPLOAD == False:
+                 logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+             else:
+                 return
+
+
      return
 
 
@@ -287,10 +287,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
      # Execute: create volatile table and test unicity
      try:
          tdml.DataFrame.from_query(nested_query).to_sql(
-             table_name=volatile_table_name,
-             temporary=True,
-             primary_index=primary_index.split(','),
-             if_exists='replace'
+             table_name = volatile_table_name,
+             temporary = True,
+             primary_index = primary_index.split(','),
+             if_exists = 'replace'
          )
          nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
          if nb_duplicates is not None and nb_duplicates > 0:
@@ -731,6 +731,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
          ).fetchall(),
          columns=['NB_ROWS']
      )
+     # log the number of rows obtained after transformations
+     logger_safe("info", f"{count_features.NB_ROWS.values[0]} rows of features")
 
      if getattr(tdfs4ds, "DEBUG_MODE", False):
          logger_safe("debug", "count_features=%s", count_features)
@@ -858,7 +860,7 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
          logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
          raise
 
-     return
+     return count_features.NB_ROWS.values[0]
 
  def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
                    partitioning='', features_infos = None, **kwargs):
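_store_feature_merge() now logs the number of merged feature rows and returns it, which is what the new count_rows assignments in _upload_features capture. A sketch of how a caller can use it, assuming store_feature propagates that return value:

    count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
                               primary_index, partitioning, features_infos)
    logger_safe("info", "Stored %s feature rows", count_rows)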
@@ -0,0 +1,27 @@
+ from .documentation import (
+     document_sql_query_columns,
+     document_process,
+     documentation_tables_creation,
+     document_sql_query_explain,
+     build_explain_documentation_chain,
+     run_explain_documentation,
+     build_sql_documentation_chain,
+     run_sql_documentation,
+     build_llm,
+     get_the_explain,
+     display_process_info
+ )
+
+ __all__ = [
+     "document_sql_query_columns",
+     "document_process",
+     "documentation_tables_creation",
+     "document_sql_query_explain",
+     "build_explain_documentation_chain",
+     "run_explain_documentation",
+     "build_sql_documentation_chain",
+     "run_sql_documentation",
+     "build_llm",
+     "get_the_explain",
+     "display_process_info"
+ ]
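Judging by its imports, this added file is the __init__ of the new genai subpackage; it re-exports the documentation helpers from tdfs4ds.genai.documentation. Only documentation_tables_creation is exercised elsewhere in this diff, so the other calls below are assumptions about the API:

    import tdfs4ds

    # Seen above: setup()/connect() call this to create the documentation tables.
    tdfs4ds.genai.documentation_tables_creation()

    # Assumed usage, signatures not shown in this diff:
    # llm = tdfs4ds.genai.build_llm()
    # tdfs4ds.genai.document_process(process_id='a1b2c3d4-...-process-id')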