PyPI - tdfs4ds - Versions diffs - 0.2.4.25__py3-none-any.whl → 0.2.4.41__py3-none-any.whl - Mend

tdfs4ds 0.2.4.25__py3-none-any.whl → 0.2.4.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

tdfs4ds/__init__.py +586 -564
tdfs4ds/feature_store/feature_data_processing.py +367 -299
tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
tdfs4ds/feature_store/feature_store_management.py +268 -285
tdfs4ds/process_store/process_followup.py +113 -2
tdfs4ds/process_store/process_query_administration.py +1 -1
tdfs4ds/process_store/process_registration_management.py +67 -55
tdfs4ds/process_store/process_store_catalog_management.py +2 -2
tdfs4ds/utils/filter_management.py +521 -138
tdfs4ds/utils/query_management.py +18 -40
tdfs4ds/utils/time_management.py +547 -97
{tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/METADATA +1 -1
{tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/RECORD +15 -15
{tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/WHEEL +0 -0
{tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/top_level.txt +0 -0

tdfs4ds/feature_store/feature_query_retrieval.py CHANGED Viewed

@@ -249,84 +249,137 @@ def get_list_features(entity_name, domain=None):
     return tdml.DataFrame.from_query(query)
-def get_feature_versions(entity_name, features, domain=None, latest_version_only=True, version_lag=0):
+def get_feature_versions(entity_name, features, domain=None):
     """
-    Retrieve feature versions for specified features associated with certain entities
-    from a given data domain. This function allows fetching either all versions or
-    just the latest versions of the features.
+    Retrieve version identifiers for one or more features belonging to a given entity.
+    The function queries the underlying metadata tables to find the *process*
+    (i.e., feature‑version) records that match the supplied entity and feature
+    names.  It returns a mapping from each requested feature name to either:
+    * **None** – if no matching rows were found.
+    * A single UUID string – if exactly one matching row exists for the feature.
+    * A list of dictionaries – if more than one matching row is found; each
+      dictionary contains:
+        ``process_id``          – the UUID of the process that produced the
+                                  version,
+        ``process_view_name``   – the human‑readable view name associated with
+                                  that process.
+    Parameters
+    ----------
+    entity_name : str | list[str]
+        The name (or names) of the entity whose features we are querying.
+        If a single string is supplied it is treated as a singleton list.
+    features : str | list[str]
+        One or more feature names to look up.  Accepts a single string or
+        an iterable of strings; if a single string is provided it is wrapped in
+        a list internally.
+    domain : str, optional
+        The data‑domain partition to filter on.  If omitted the default
+        ``tdfs4ds.DATA_DOMAIN`` constant is used.
+    Returns
+    -------
+    dict[str, str | None | list[dict]]
+        A dictionary keyed by feature name.  Each value is either:
+            * ``None`` – no records were found for that feature.
+            * ``str`` – a single UUID string when exactly one row matched.
+            * ``list[dict]`` – multiple matches; each dict has keys
+              ``process_id`` and ``process_view_name``.
+    Notes
+    -----
+    * The query joins the feature catalog view with the process catalog
+      (specifically the “feature split” view) on data domain, entity ID,
+      and feature name.
+    * SQL string literals are escaped by doubling single quotes; this is a
+      lightweight escape that suffices for the current use‑case.
+    * The function preserves insertion order of features in the returned
+      dictionary (Python 3.7+ guarantees dict order).
+    * When ``tdfs4ds.DEBUG_MODE`` is true, the generated SQL statement is
+      printed to stdout – useful for troubleshooting.
+    Example
+    -------
+    >>> get_feature_versions('user', ['age', 'income'])
+    {'age': 'c1d2e3f4-...', 'income': None}
-    Parameters:
-    entity_name (str or list): The name of the entity or a list of entity names
-                               for which feature versions are to be fetched.
-    features (list): A list of features for which versions are required.
-    domain (str, optional): The data domain to filter the feature versions.
-                            Defaults to None, where a predefined domain is used.
-    latest_version_only (bool, optional): Flag to fetch only the latest version
-                                          of each feature. Defaults to True.
-    version_lag (int, optional): The number of versions to lag behind the latest.
-                                 Only effective if latest_version_only is True. Defaults to 0.
-    Returns:
-    dict: A dictionary with feature names as keys and their corresponding versions as values.
     """
-    # Default to a predefined data domain if none is provided
+    # Normalize inputs
+    if isinstance(features, str):
+        features = [features]
+    if isinstance(entity_name, str):
+        entity_name = [entity_name]
     if domain is None:
         domain = tdfs4ds.DATA_DOMAIN
+    # Basic escaping for single quotes in values used in SQL literals
+    def _esc(s: str) -> str:
+        return s.replace("'", "''")
+    features_lits = ",".join(f"'{_esc(f)}'" for f in features)
-    # Convert the entity_name to a string if it is a list
-    if type(entity_name) == list:
-        entity_name.sort()
-        entity_name = ','.join(entity_name)
-    # Preparing the feature names for inclusion in the SQL query
-    if type(features) == list:
-        features = ["'" + f + "'" for f in features]
-    else:
-        features = "'" + features + "'"
+    entity_name.sort()
+    entity_name_str = ','.join(entity_name)
     query = f"""
         SELECT
             A.FEATURE_NAME
         ,   B.PROCESS_ID AS FEATURE_VERSION
+        ,   B.VIEW_NAME AS PROCESS_VIEW_NAME
         FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} A
         INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT} B
         ON A.DATA_DOMAIN = B.DATA_DOMAIN
         AND A.ENTITY_NAME = B.ENTITY_ID
         AND A.FEATURE_NAME = B.FEATURE_NAME
         WHERE A.DATA_DOMAIN = '{domain}'
-        AND A.ENTITY_NAME = '{entity_name}'
-        AND A.FEATURE_NAME IN ({','.join(features)})
+        AND A.ENTITY_NAME = '{entity_name_str}'
+        AND A.FEATURE_NAME IN ({features_lits})
     """
-    # Executing the first query and converting the results to a pandas DataFrame
-    df = tdml.DataFrame.from_query(query).to_pandas()
-    # if df is empty
-    if df.shape[0] == 0:
-        print('the features you are requesting for this entity and data domain do not exist. Here is what you requested:')
-        print('feature store database :', tdfs4ds.SCHEMA)
-        print('feature catalog        :', tdfs4ds.FEATURE_CATALOG_NAME_VIEW)
-        print('entity name            :', entity_name)
-        print('data domain            :', domain)
-        print('features               :', ','.join(features))
-        print('')
+    if tdfs4ds.DEBUG_MODE:
         print(query)
-        return
-    if tdfs4ds.DEBUG_MODE == True:
-        print(query)
+    rows = tdml.execute_sql(query).fetchall()
+    # Initialize result for all requested features
+    result = {f: None for f in features}
+    # Collect (version, view) per feature, deduplicating while preserving order
+    tmp = {f: [] for f in features}
+    seen = {f: set() for f in features}
+    for feat, version, view_name in rows:
+        key = (version, view_name)
+        if key not in seen.setdefault(feat, set()):
+            seen[feat].add(key)
+            tmp.setdefault(feat, []).append(key)
+    # Shape:
+    # - if exactly one row: return UUID string
+    # - if multiple rows: list of {"process_id": <uuid>, "process_view_name": <str>}
+    for feat in result:
+        pairs = tmp.get(feat, [])
+        if len(pairs) == 0:
+            result[feat] = None
+        elif len(pairs) == 1:
+            result[feat] = pairs[0][0]  # UUID only
+        else:
+            result[feat] = [
+                {"process_id": ver, "process_view_name": view}
+                for (ver, view) in pairs
+            ]
+    return result
-    # results in dictionary:
-    results = {row['FEATURE_NAME']: row['FEATURE_VERSION'] for i, row in df.iterrows()}
-    if tdfs4ds.DEBUG_MODE == True:
-        print('---> RESULTS <---')
-        print(results)
-    # Returning the results as a dictionary with feature names as keys and their versions as values
-    return results
 def get_entity_tables(entity_id, data_domain=None):
     """
     Retrieves a list of table names associated with a given entity ID or IDs from a feature catalog within a specific data domain.

tdfs4ds 0.2.4.25__py3-none-any.whl → 0.2.4.41__py3-none-any.whl

tdfs4ds 0.2.4.25__py3-none-any.whl → 0.2.4.41__py3-none-any.whl