tdfs4ds-0.2.4.3-py3-none-any.whl → tdfs4ds-0.2.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +26 -9
- tdfs4ds/feature_store/feature_store_management.py +9 -2
- tdfs4ds/utils/info.py +39 -1
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/RECORD +7 -7
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = '0.2.4.3'
+__version__ = '0.2.4.4'
 import logging
 # Setup the logger
 logging.basicConfig(

@@ -310,7 +310,7 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)

-def run(process_id, return_dataset = False, force_compute = False):
+def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
     """
     Executes a specific process from the feature store identified by the process ID.
     The function handles different process types and performs appropriate actions.

@@ -321,6 +321,10 @@ def run(process_id, return_dataset = False, force_compute = False):
                             Default is False.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                             Default is False.
+    - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the VARCHAR type,
+                            casts VARCHAR features to VARCHAR(k x force_varchar_length), where k is the smallest
+                            integer such that the original length is less than or equal to k x force_varchar_length.
+                            Default is None.

     Returns:
     DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.

@@ -423,7 +427,8 @@ def run(process_id, return_dataset = False, force_compute = False):
         filtermanager = filtermanager,
         entity_null_substitute = entity_null_substitute,
         process_id = process_id,
-        force_compute = force_compute
+        force_compute = force_compute,
+        force_varchar_length = force_varchar_length
     )

     # Handling 'tdstone2 view' process type

@@ -437,7 +442,7 @@ def run(process_id, return_dataset = False, force_compute = False):
     else:
         return

-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
+def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
     """
     Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
     process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset

@@ -463,7 +468,10 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
                             Default is an empty dictionary.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                             Default is True.
-
+    - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the VARCHAR type,
+                            casts VARCHAR features to VARCHAR(k x force_varchar_length), where k is the smallest
+                            integer such that the original length is less than or equal to k x force_varchar_length.
+                            Default is 1024.
     Returns:
     DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
     or further processing.

@@ -575,7 +583,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):

     try:

-        dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute)
+        dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)

     except Exception as e:
         tdfs4ds.process_store.process_followup.followup_close(

@@ -591,7 +599,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
     else:

         try:
-            run(process_id=process_id, return_dataset=False)
+            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
         except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,

@@ -605,7 +613,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
     tdfs4ds.PROCESS_TYPE = PROCESS_TYPE

 def _upload_features(df, entity_id, feature_names,
-                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False):
+                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False, force_varchar_length = None):
     """
     Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
     feature registration, preparation for ingestion, and storage in the designated feature tables.

@@ -628,6 +636,11 @@ def _upload_features(df, entity_id, feature_names,
     - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                             Default is False.
+    - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the VARCHAR type,
+                            casts VARCHAR features to VARCHAR(k x force_varchar_length), where k is the smallest
+                            integer such that the original length is less than or equal to k x force_varchar_length.
+                            Default is None.
+

     Returns:
     DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their

@@ -655,7 +668,7 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_store_management import register_features
     from tdfs4ds.feature_store.feature_data_processing import prepare_feature_ingestion
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
-    from tdfs4ds.utils.info import get_column_types
+    from tdfs4ds.utils.info import get_column_types, update_varchar_length

     # Convert entity_id to a dictionary if it's not already one
     if type(entity_id) == list:

@@ -685,6 +698,10 @@ def _upload_features(df, entity_id, feature_names,
         entity_id=entity_id
     )

+    if force_varchar_length is not None:
+        print(feature_names_types)
+        feature_names_types = update_varchar_length(feature_names_types, new_varchar_length = force_varchar_length)
+
     def validate_feature_types(feature_names_types):
         """
         Validates feature data types and raises an error if any value contains
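For orientation, a minimal sketch of how the new parameter is used from the public API. The table, entity, and feature names below are illustrative, and a configured tdfs4ds feature store with a connected teradataml session is assumed:

```python
import tdfs4ds
import teradataml as tdml

# Illustrative source table; assumes an established Teradata connection
# and a configured tdfs4ds feature store.
df = tdml.DataFrame('transactions')

dataset = tdfs4ds.upload_features(
    df,
    entity_id=['customer_id'],
    feature_names=['tx_amount', 'tx_comment'],
    force_varchar_length=1024,  # VARCHAR lengths rounded up to a multiple of 1024
)
```

With this setting, a VARCHAR(40) and a VARCHAR(900) feature both land in a VARCHAR(1024) feature table, so small differences in string length no longer spawn separate tables.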
tdfs4ds/feature_store/feature_store_management.py
CHANGED

@@ -73,6 +73,7 @@ def feature_store_catalog_creation(if_exists='replace', comment='this table is a

     FEATURE_ID BIGINT,
     FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+    FEATURE_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,

@@ -410,12 +411,12 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     # Create a DataFrame from the feature_names_types dictionary
     if len(feature_names_types.keys()) > 1:
         df = pd.DataFrame(feature_names_types).transpose().reset_index()
-        df.columns = ['FEATURE_NAME', 'TYPE', 'FEATURE_ID']
+        df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
     else:
         df = pd.DataFrame(columns=['FEATURE_NAME', 'TYPE', 'FEATURE_ID'])
         k = list(feature_names_types.keys())[0]
         df['FEATURE_NAME'] = [k]
-        df['TYPE'] = [feature_names_types[k]['type']]
+        df['FEATURE_TYPE'] = [feature_names_types[k]['type']]
         df['FEATURE_ID'] = [feature_names_types[k]['id']]

@@ -458,6 +459,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     SELECT
         CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
     , A.FEATURE_NAME
+    , A.FEATURE_TYPE
     , A.FEATURE_TABLE
     , A.FEATURE_DATABASE
     , A.FEATURE_VIEW

@@ -476,6 +478,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     UPDATE
     SET
         FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+        FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
         FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
         FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
     --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified

@@ -483,6 +486,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     INSERT
     ( UPDATED_FEATURES.FEATURE_ID
     , UPDATED_FEATURES.FEATURE_NAME
+    , UPDATED_FEATURES.FEATURE_TYPE
     , UPDATED_FEATURES.FEATURE_TABLE
     , UPDATED_FEATURES.FEATURE_DATABASE
     , UPDATED_FEATURES.FEATURE_VIEW

@@ -498,6 +502,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     SELECT
         CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
     , A.FEATURE_NAME
+    , A.FEATURE_TYPE
     , A.FEATURE_TABLE
     , A.FEATURE_DATABASE
     , A.FEATURE_VIEW

@@ -516,6 +521,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     UPDATE
     SET
         FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+        FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
         FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
         FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
     --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified

@@ -523,6 +529,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     INSERT
     ( UPDATED_FEATURES.FEATURE_ID
     , UPDATED_FEATURES.FEATURE_NAME
+    , UPDATED_FEATURES.FEATURE_TYPE
     , UPDATED_FEATURES.FEATURE_TABLE
     , UPDATED_FEATURES.FEATURE_DATABASE
     , UPDATED_FEATURES.FEATURE_VIEW
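The catalog now carries a FEATURE_TYPE column alongside each feature's name and ID. As a sketch of the DataFrame shape that _register_features_merge builds before the MERGE, with illustrative feature metadata:

```python
import pandas as pd

# Illustrative feature metadata in the shape consumed by _register_features_merge
feature_names_types = {
    'tx_amount':  {'type': 'FLOAT', 'id': 1},
    'tx_comment': {'type': 'VARCHAR(2048) CHARACTER SET LATIN', 'id': 2},
}

df = pd.DataFrame(feature_names_types).transpose().reset_index()
df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
print(df)
#   FEATURE_NAME                       FEATURE_TYPE FEATURE_ID
# 0    tx_amount                              FLOAT          1
# 1   tx_comment  VARCHAR(2048) CHARACTER SET LATIN          2
```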
tdfs4ds/utils/info.py
CHANGED
@@ -2,6 +2,8 @@ import re

 import tdfs4ds
 import teradataml as tdml
+from tdfs4ds import logger
+import numpy as np

 def get_column_types(df, columns):
     """

@@ -264,4 +266,40 @@ def get_feature_types_sql_format(tddf, columns = None):
     res = tdml.DataFrame.from_query(query).to_pandas()

     # Return column names with their corresponding SQL data types in a dictionary
-    return {c: res[c].values[0].strip() for c in columns}
+    return {c: res[c].values[0].strip() for c in columns}
+
+def update_varchar_length(feature_types: dict, new_varchar_length: int) -> dict:
+    """
+    Updates the length of all VARCHAR fields in the feature_types dictionary based on an increment.
+    The new length is calculated as ceil(previous_length / new_varchar_length) * new_varchar_length,
+    ensuring that when new_varchar_length is equal to the current length, no change occurs.
+
+    Args:
+        feature_types (dict): A dictionary where keys are feature names and values are dictionaries with 'type' and 'id'.
+        new_varchar_length (int): The increment value for adjusting VARCHAR lengths.
+
+    Returns:
+        dict: A dictionary with updated VARCHAR lengths.
+
+    Issues a warning if the new length is smaller than the original length.
+    """
+    updated_feature_types = {}
+    varchar_pattern = re.compile(r'VARCHAR\((\d+)\)', re.IGNORECASE)
+
+    for key, value in feature_types.items():
+        type_value = value['type']
+        match = varchar_pattern.search(type_value)
+        if match:
+            original_length = int(match.group(1))
+            modified_length = int(np.ceil(original_length / new_varchar_length) * new_varchar_length)
+
+            if modified_length < original_length:
+                logger.warning(f"Reducing VARCHAR length for {key} from {original_length} to {modified_length}")
+
+            # Replace only the VARCHAR length
+            updated_value = varchar_pattern.sub(f'VARCHAR({modified_length})', type_value)
+            updated_feature_types[key] = {'type': updated_value, 'id': value['id']}
+        else:
+            updated_feature_types[key] = value
+
+    return updated_feature_types
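To see the rounding rule in isolation, here is a standalone sketch that reimplements it outside the package (round_varchar is a hypothetical helper, not part of tdfs4ds):

```python
import re
import numpy as np

def round_varchar(type_value: str, increment: int) -> str:
    """Round a VARCHAR(n) length up to the next multiple of `increment`."""
    pattern = re.compile(r'VARCHAR\((\d+)\)', re.IGNORECASE)
    match = pattern.search(type_value)
    if not match:
        return type_value  # non-VARCHAR types pass through unchanged
    original = int(match.group(1))
    rounded = int(np.ceil(original / increment) * increment)
    return pattern.sub(f'VARCHAR({rounded})', type_value)

print(round_varchar('VARCHAR(40)', 1024))                        # VARCHAR(1024)
print(round_varchar('VARCHAR(1024)', 1024))                      # VARCHAR(1024), unchanged
print(round_varchar('VARCHAR(1500) CHARACTER SET LATIN', 1024))  # VARCHAR(2048) CHARACTER SET LATIN
print(round_varchar('BIGINT', 1024))                             # BIGINT
```

Lengths that are already an exact multiple of the increment are left unchanged, which is why a width that equals force_varchar_length passes through as-is.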
{tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/RECORD
CHANGED

@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
 tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
 tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
 tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
-tdfs4ds/__init__.py,sha256=
+tdfs4ds/__init__.py,sha256=BSE-ct7eaUwHsch7GGloYMSDWTb9nPLSeR-LSDZNY18,65844
 tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
 tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
 tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576

@@ -13,7 +13,7 @@ tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaU
 tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
 tdfs4ds/feature_store/feature_data_processing.py,sha256=SuJeCTJF51l9-VS9WRS0oBUnxaVqba4hqjOpsCtdVs8,42352
 tdfs4ds/feature_store/feature_query_retrieval.py,sha256=zuHRZhL6-qyLpPS7mWgRy1WingSN5iibkbi53Q7jfAs,33834
-tdfs4ds/feature_store/feature_store_management.py,sha256=
+tdfs4ds/feature_store/feature_store_management.py,sha256=WcgawACgC_lI880wj_FO2wV_FIp0W5WZ3x7r2-0WKdI,56121
 tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
 tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
 tdfs4ds/process_store/process_query_administration.py,sha256=DsIt97cBoJ7NcpQzbQt55eUFNgXGdOMm5Hh2aX5v0PY,7762

@@ -21,12 +21,12 @@ tdfs4ds/process_store/process_registration_management.py,sha256=F8VlBoL-de98KnkM
 tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIqPVbHpuIyyvsaNrek6b1iPk8avJMI,16088
 tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
 tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
-tdfs4ds/utils/info.py,sha256=
+tdfs4ds/utils/info.py,sha256=SQR_ec4M9-5Z4erjb9_N0n8JPY1wpelgxkw3B12D1Q4,12322
 tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
 tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
 tdfs4ds/utils/time_management.py,sha256=_jbwdyZH4Yr3VzbUrq6X93FpXDCDEdH0iv56vX7j8mA,8446
 tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
-tdfs4ds-0.2.4.3.dist-info/METADATA,sha256=
-tdfs4ds-0.2.4.3.dist-info/WHEEL,sha256=
-tdfs4ds-0.2.4.3.dist-info/top_level.txt,sha256=
-tdfs4ds-0.2.4.3.dist-info/RECORD,,
+tdfs4ds-0.2.4.4.dist-info/METADATA,sha256=OtxHXtfMZvmAZNaX0sontwt5luQ_0__lJhxoE3XHuho,11944
+tdfs4ds-0.2.4.4.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+tdfs4ds-0.2.4.4.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+tdfs4ds-0.2.4.4.dist-info/RECORD,,

{tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/WHEEL
File without changes

{tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/top_level.txt
File without changes