tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
wedata/feature_store/utils/feature_spec_utils.py
@@ -0,0 +1,286 @@
+ import logging
+ from dataclasses import dataclass
+ from functools import reduce
+ from typing import Dict, List, Tuple, Union
+
+ import yaml
+ from mlflow.utils.file_utils import YamlSafeDumper
+
+ from wedata.feature_store.entities.column_info import ColumnInfo
+ from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+ from wedata.feature_store.utils.topological_sort import topological_sort
+
+ DEFAULT_GRAPH_DEPTH_LIMIT = 5
+
+ COLUMN_INFO_TYPE_SOURCE = "SOURCE"
+ COLUMN_INFO_TYPE_ON_DEMAND = "ON_DEMAND"
+ COLUMN_INFO_TYPE_FEATURE = "FEATURE"
+
+ _logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class FeatureExecutionGroup:
+     type: str  # one of FEATURE, ON_DEMAND, SOURCE
+     features: Union[
+         List[FeatureColumnInfo], List[OnDemandColumnInfo], List[SourceDataColumnInfo]
+     ]
+
+
+ # Smaller numbers have higher priority. Besides SOURCE, FEATURE is preferred over ON_DEMAND in
+ # topological sorting so that ON_DEMAND columns come after FEATURE columns in simple cases,
+ # aligning with the assumption that held before TLT was implemented.
+ # NOTE: changing this priority may cause a performance regression; proceed with caution.
+ COLUMN_TYPE_PRIORITY = {
+     COLUMN_INFO_TYPE_SOURCE: 0,
+     COLUMN_INFO_TYPE_ON_DEMAND: 1,
+     COLUMN_INFO_TYPE_FEATURE: 2,
+ }
+
+
+ class _GraphNode:
+     def __init__(self, column_info: ColumnInfo):
+         info = column_info.info
+         self.column_info = column_info
+         self.output_name = info.output_name
+
+         if isinstance(column_info.info, SourceDataColumnInfo):
+             self.input_names = set()
+             self.type = COLUMN_INFO_TYPE_SOURCE
+         elif isinstance(column_info.info, FeatureColumnInfo):
+             self.input_names = set(info.lookup_key)
+             self.type = COLUMN_INFO_TYPE_FEATURE
+         elif isinstance(column_info.info, OnDemandColumnInfo):
+             self.input_names = set(info.input_bindings.values())
+             self.type = COLUMN_INFO_TYPE_ON_DEMAND
+         else:
+             raise ValueError("unknown column info type")
+
+     def __str__(self):
+         return "node<" + self.output_name + ">"
+
+     def __repr__(self):
+         return str(self)
+
+
+ def _column_info_sort_key(node: _GraphNode) -> Tuple[int, str]:
+     """
+     Returns a tuple of an int and a str as the sorting key for a _GraphNode. Priority is
+     determined by the first element; the second element breaks ties.
+     """
+     return COLUMN_TYPE_PRIORITY[node.type], node.output_name
+
+
+ def _should_be_grouped(node: _GraphNode) -> bool:
+     """
+     Returns True if the given node is of a type that should be grouped together as much as possible.
+     """
+     return node.type == COLUMN_INFO_TYPE_FEATURE
+
+
+ def _validate_graph_depth(nodes: List[_GraphNode], depth_limit: int):
+     name_to_node = {node.output_name: node for node in nodes}
+     visited_depth = {}
+
+     def dfs(node: _GraphNode, depth: int):
+         if depth > depth_limit:
+             raise ValueError(
+                 f"The given graph contains a dependency path longer than the limit {depth_limit}"
+             )
+         if (
+             node.output_name in visited_depth
+             and depth <= visited_depth[node.output_name]
+         ):
+             return
+         visited_depth[node.output_name] = depth
+         for column_name in node.input_names:
+             dependency = name_to_node[column_name]
+             dfs(dependency, depth + 1)
+
+     for node in nodes:
+         dfs(node, 1)
+
+
+ def get_encoded_graph_map(column_infos: List[ColumnInfo]) -> Dict[str, List[str]]:
+     """
+     Creates a dictionary of columns with their dependency columns for metric use. Columns are
+     encoded with a string representing their type and index. For example:
+     {
+         "f3": ["s1", "s2"],
+         "o4": ["f3"],
+         "o5": []
+     }
+     "s1" and "s2" are SourceDataColumnInfos, "f3" is a FeatureColumnInfo and "o4", "o5" are
+     OnDemandColumnInfos. "f3" depends on "s1" and "s2", "o5" doesn't depend on any column, etc.
+
+     :param column_infos: A list of ColumnInfos.
+     """
+     nodes = {info.output_name: _GraphNode(info) for info in column_infos}
+     next_node_index = 0
+     # A map from a column info's output_name to its label.
+     node_label = {}
+
+     def get_node_label(node):
+         nonlocal next_node_index
+         output_name = node.output_name
+         if output_name not in node_label:
+             if node.type == COLUMN_INFO_TYPE_SOURCE:
+                 type_simple_str = "s"
+             elif node.type == COLUMN_INFO_TYPE_FEATURE:
+                 type_simple_str = "f"
+             elif node.type == COLUMN_INFO_TYPE_ON_DEMAND:
+                 type_simple_str = "o"
+             new_label = type_simple_str + str(next_node_index)
+             next_node_index += 1
+             node_label[output_name] = new_label
+         return node_label[output_name]
+
+     graph_map = {}
+     for node in nodes.values():
+         label = get_node_label(node)
+         dependencies = []
+         for dep_name in sorted(node.input_names):
+             if dep_name not in nodes:
+                 # Skip the column if it's not in the feature spec.
+                 continue
+             dep = get_node_label(nodes[dep_name])
+             dependencies.append(dep)
+         graph_map[label] = dependencies
+     return graph_map
+
+
+ def assign_topological_ordering(
+     column_infos: List[ColumnInfo],
+     allow_missing_source_columns=False,
+     graph_depth_limit=DEFAULT_GRAPH_DEPTH_LIMIT,
+ ) -> List[ColumnInfo]:
+     """
+     Assigns the topological ordering for each ColumnInfo of the input. Returns a list of new
+     ColumnInfo objects with topological_ordering set to an integer.
+
+     :param column_infos: a list of ColumnInfos.
+     :param allow_missing_source_columns: ONLY USED BY FSE TEMPORARILY. Allows lookup keys or
+         function inputs to be missing from the source columns. If true, this method assigns
+         topological_ordering to columns as if the missing sources were present in column_infos.
+     :param graph_depth_limit: raises if the given graph exceeds the limit.
+     :raises ValueError: if there is a cycle in the graph.
+     """
+     nodes = list(map(lambda c: _GraphNode(c), column_infos))
+     # allow_missing_source_columns is used when feature_serving_endpoint_client creates training
+     # sets. It doesn't include source columns in the dataframe.
+     # TODO[ML-33809]: clean up allow_missing_source_columns.
+     all_output_names = set([n.output_name for n in nodes])
+     all_input_names = reduce(lambda a, b: a | b, [n.input_names for n in nodes])
+     missing_inputs = all_input_names - all_output_names
+     if allow_missing_source_columns:
+         for input_name in missing_inputs:
+             if input_name not in all_output_names:
+                 nodes.append(
+                     _GraphNode(ColumnInfo(SourceDataColumnInfo(input_name), False))
+                 )
+     elif len(missing_inputs) > 0:
+         missing_input_names_str = ", ".join(
+             [f"'{name}'" for name in sorted(missing_inputs)]
+         )
+         raise ValueError(
+             f"Input columns {missing_input_names_str} required by FeatureLookups or "
+             "FeatureFunctions are not provided by the input DataFrame or by other "
+             "FeatureFunctions and FeatureLookups"
+         )
+     output_name_to_node = {node.output_name: node for node in nodes}
+     graph = {
+         node: [output_name_to_node[input_name] for input_name in node.input_names]
+         for node in nodes
+     }
+     sorted_nodes = topological_sort(graph, _column_info_sort_key, _should_be_grouped)
+     # Validate depth after sorting the graph, because cycles are detected during sorting.
+     _validate_graph_depth(nodes, graph_depth_limit)
+     name_to_ordering = {node.output_name: i for i, node in enumerate(sorted_nodes)}
+     return [
+         column.with_topological_ordering(name_to_ordering[column.output_name])
+         for column in column_infos
+     ]
+
+
+ def get_feature_execution_groups(
+     feature_spec: FeatureSpec, df_columns: List[str] = []
+ ) -> List[FeatureExecutionGroup]:
+     """
+     Splits the list of column_infos in feature_spec into groups based on the topological_ordering
+     of the column_infos, such that each group contains only one type of feature column and the
+     columns in a group don't depend on other columns in the same group. The type of a feature
+     column is the class type of the column_info.info field.
+     Example:
+         Given a FeatureSpec with some columns, after sorting the columns by topological_ordering,
+         assume the sorted list is:
+             [source_1, feature_2, feature_3, on_demand_4, on_demand_5]
+         where feature_3 depends on feature_2. The resulting groups will be:
+             [
+                 group(SOURCE, [source_1]),
+                 group(FEATURE, [feature_2]),
+                 group(FEATURE, [feature_3]),
+                 group(ON_DEMAND, [on_demand_4, on_demand_5]),
+             ]
+
+     :param feature_spec: A FeatureSpec with topologically sorted column_infos.
+     :param df_columns: the columns of the DataFrame used to create_training_set or score_batch.
+     """
+     # Convert column infos into _GraphNodes.
+     nodes = list(map(lambda c: _GraphNode(c), feature_spec.column_infos))
+     if any(info.topological_ordering is None for info in feature_spec.column_infos):
+         # Old versions of feature_spec may not have topological_ordering; we can safely assume
+         # the columns are already sorted because of validations during feature_spec creation.
+         _logger.warning(
+             "Processing a feature spec in which at least one of the column_infos has no "
+             "topological_ordering"
+         )
+     else:
+         # Sort nodes by topological_ordering.
+         nodes = sorted(nodes, key=lambda n: n.column_info.topological_ordering)
+     # A buffer holding the columns of the current group.
+     buffer = []
+     # Output names of columns in the current buffer.
+     buffered_output_names = set()
+     # Used to validate the topological sorting.
+     # df_columns is used for backward compatibility: in old FeatureSpecs, source columns might not
+     # exist, so the df columns are treated as the initially resolved columns.
+     resolved_columns = set(df_columns)
+     result_list = []
+     last_type = None
+     for node in nodes:
+         if not node.input_names.issubset(resolved_columns):
+             raise ValueError(
+                 "The column_infos in the FeatureSpec are not topologically sorted"
+             )
+         if node.type != last_type or buffered_output_names.intersection(
+             node.input_names
+         ):
+             # Split the group if the current node has a different type from the previous node OR
+             # any of its inputs come from nodes in the current group.
+             if buffer:
+                 result_list.append(FeatureExecutionGroup(last_type, buffer))
+                 buffer = []
+                 buffered_output_names.clear()
+             last_type = node.type
+         buffer.append(node.column_info.info)
+         resolved_columns.add(node.output_name)
+         buffered_output_names.add(node.output_name)
+     if buffer:
+         result_list.append(FeatureExecutionGroup(last_type, buffer))
+     return result_list
+
+
+ def convert_to_yaml_string(feature_spec: FeatureSpec) -> str:
+     """
+     Converts the given FeatureSpec to a YAML string.
+     """
+     feature_spec_dict = feature_spec._to_dict()
+     return yaml.dump(
+         feature_spec_dict,
+         default_flow_style=False,
+         allow_unicode=True,
+         sort_keys=False,
+         Dumper=YamlSafeDumper,
+     )
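
The grouping behaviour of get_feature_execution_groups above is easiest to see on plain data. The stand-alone sketch below (not part of the wheel; the (name, type, inputs) tuples are a hypothetical stand-in for ColumnInfo objects) reproduces the same walk: start a new group whenever the column type changes or an input comes from the group currently being built, and reject input that is not topologically sorted.

from typing import List, Set, Tuple

ColumnLike = Tuple[str, str, Set[str]]  # (output_name, column type, input column names)

def group_columns(columns: List[ColumnLike]):
    groups, buffer, buffered_names, last_type = [], [], set(), None
    resolved = set()
    for name, col_type, inputs in columns:
        if not inputs <= resolved:
            raise ValueError("columns are not topologically sorted")
        # Split when the type changes or an input comes from the current group.
        if col_type != last_type or buffered_names & inputs:
            if buffer:
                groups.append((last_type, buffer))
                buffer, buffered_names = [], set()
            last_type = col_type
        buffer.append(name)
        resolved.add(name)
        buffered_names.add(name)
    if buffer:
        groups.append((last_type, buffer))
    return groups

columns = [
    ("s1", "SOURCE", set()),
    ("f2", "FEATURE", {"s1"}),
    ("f3", "FEATURE", {"f2"}),      # depends on f2, so it starts a new FEATURE group
    ("o4", "ON_DEMAND", {"f2"}),
    ("o5", "ON_DEMAND", {"s1"}),
]
print(group_columns(columns))
# [('SOURCE', ['s1']), ('FEATURE', ['f2']), ('FEATURE', ['f3']), ('ON_DEMAND', ['o4', 'o5'])]
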
wedata/feature_store/utils/feature_utils.py
@@ -0,0 +1,73 @@
+ import copy
+ from typing import List, Union
+
+ from wedata.feature_store.entities.feature_function import FeatureFunction
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
+ from wedata.feature_store.spark_client.spark_client import SparkClient
+ from wedata.feature_store.utils import uc_utils
+ from wedata.feature_store.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names
+
+
+ def format_feature_lookups_and_functions(
+     _spark_client: SparkClient, features: List[Union[FeatureLookup, FeatureFunction]]
+ ):
+     fl_idx = []
+     ff_idx = []
+     feature_lookups = []
+     feature_functions = []
+     for idx, feature in enumerate(features):
+         if isinstance(feature, FeatureLookup):
+             fl_idx.append(idx)
+             feature_lookups.append(feature)
+         elif isinstance(feature, FeatureFunction):
+             ff_idx.append(idx)
+             feature_functions.append(feature)
+         else:
+             raise ValueError(
+                 f"Expected a list of FeatureLookups and FeatureFunctions, but received type '{type(feature)}'."
+             )
+
+     # FeatureLookups and FeatureFunctions must have fully qualified table and UDF names.
+     feature_lookups = get_feature_lookups_with_full_table_names(
+         feature_lookups,
+         _spark_client.get_current_catalog(),
+         _spark_client.get_current_database(),
+     )
+     feature_functions = get_feature_functions_with_full_udf_names(
+         feature_functions,
+         _spark_client.get_current_catalog(),
+         _spark_client.get_current_database(),
+     )
+
+     # Restore the original order of FeatureLookups and FeatureFunctions. Copy to avoid mutating
+     # the original list.
+     features = features.copy()
+     for idx, feature in zip(fl_idx + ff_idx, feature_lookups + feature_functions):
+         features[idx] = feature
+
+     return features
+
+
+ def get_feature_functions_with_full_udf_names(
+     feature_functions: List[FeatureFunction], current_catalog: str, current_schema: str
+ ):
+     """
+     Takes in a list of FeatureFunctions and returns copies with:
+     1. Fully qualified UDF names.
+     2. If output_name is empty, the fully qualified UDF name as output_name.
+     """
+     udf_names = {ff.udf_name for ff in feature_functions}
+     uc_utils._check_qualified_udf_names(udf_names)
+     uc_utils._verify_all_udfs_in_uc(udf_names, current_catalog, current_schema)
+
+     standardized_feature_functions = []
+     for ff in feature_functions:
+         ff_copy = copy.deepcopy(ff)
+         del ff
+
+         ff_copy._udf_name = uc_utils.get_full_udf_name(
+             ff_copy.udf_name, current_catalog, current_schema
+         )
+         if not ff_copy.output_name:
+             ff_copy._output_name = ff_copy.udf_name
+         standardized_feature_functions.append(ff_copy)
+     return standardized_feature_functions
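
format_feature_lookups_and_functions above splits a mixed features list by type, rewrites each sublist to fully qualified names, and then writes the results back into their original positions. A minimal sketch of that split/transform/restore pattern, using plain strings instead of FeatureLookup/FeatureFunction objects (the "table:"/"udf:" prefixes and the qualify helper are invented for illustration, not part of the package):

def qualify(names, catalog, schema):
    # Stand-in for get_feature_lookups_with_full_table_names / get_feature_functions_with_full_udf_names.
    return [n if n.count(".") == 2 else f"{catalog}.{schema}.{n}" for n in names]

def format_mixed(features, catalog="prod", schema="ml"):
    lookup_idx, function_idx, lookups, functions = [], [], [], []
    for idx, feature in enumerate(features):
        if feature.startswith("table:"):
            lookup_idx.append(idx)
            lookups.append(feature[len("table:"):])
        else:
            function_idx.append(idx)
            functions.append(feature[len("udf:"):])

    lookups = qualify(lookups, catalog, schema)
    functions = qualify(functions, catalog, schema)

    # Restore the original ordering without mutating the caller's list.
    features = features.copy()
    for idx, feature in zip(lookup_idx + function_idx, lookups + functions):
        features[idx] = feature
    return features

print(format_mixed(["table:customers", "udf:age_bucket", "table:shop.core.orders"]))
# ['prod.ml.customers', 'prod.ml.age_bucket', 'shop.core.orders']
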
wedata/feature_store/utils/on_demand_utils.py
@@ -0,0 +1,107 @@
+ import copy
+ from typing import Dict, List
+
+ from pyspark.sql import Column, DataFrame
+ from pyspark.sql.functions import expr
+
+ from wedata.feature_store.entities.feature_function import FeatureFunction
+ from wedata.feature_store.entities.function_info import FunctionInfo
+ from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from wedata.feature_store.utils import common_utils, uc_utils
+
+
+ def _udf_expr(udf_name: str, arguments: List[str]) -> Column:
+     """
+     Generate a Spark SQL expression, e.g. expr("udf_name(col1, col2)").
+     """
+     arguments_str = ", ".join(common_utils.sanitize_identifiers(arguments))
+     return expr(f"{udf_name}({arguments_str})")
+
+
+ def _validate_apply_functions_df(
+     df: DataFrame,
+     functions_to_apply: List[OnDemandColumnInfo],
+     uc_function_infos: Dict[str, FunctionInfo],
+ ):
+     """
+     Validate the following:
+     1. On-demand input columns specified by functions_to_apply exist in the DataFrame.
+     2. On-demand input columns have data types that match those of the UDF parameters.
+     """
+     for odci in functions_to_apply:
+         function_info = uc_function_infos[odci.udf_name]
+         types_dict = dict(df.dtypes)
+
+         for p in function_info.input_params:
+             arg_column = odci.input_bindings[p.name]
+             if arg_column not in df.columns:
+                 raise ValueError(
+                     f"FeatureFunction argument column '{arg_column}' for UDF '{odci.udf_name}' parameter '{p.name}' "
+                     f"does not exist in the provided DataFrame with schema '{df.schema}'."
+                 )
+             if types_dict[arg_column] != p.type_text:
+                 raise ValueError(
+                     f"FeatureFunction argument column '{arg_column}' for UDF '{odci.udf_name}' parameter '{p.name}' "
+                     f"does not have the expected type. Argument column '{arg_column}' has type "
+                     f"'{types_dict[arg_column]}' and parameter '{p.name}' has type '{p.type_text}'."
+                 )
+
+
+ def apply_functions_if_not_overridden(
+     df: DataFrame,
+     functions_to_apply: List[OnDemandColumnInfo],
+     uc_function_infos: Dict[str, FunctionInfo],
+ ) -> DataFrame:
+     """
+     For all on-demand features, in the order defined by the FeatureSpec:
+     if the feature does not already exist, append the evaluated UDF expression.
+     Existing column values and column positions are not modified.
+
+     `_validate_apply_functions_df` validates that the UDFs can be applied to the `df` schema.
+
+     The caller should validate that:
+     1. FeatureFunction bound argument columns for UDF parameters exist in FeatureSpec-defined features.
+     2. FeatureFunction output feature names are unique.
+     """
+     _validate_apply_functions_df(
+         df=df,
+         functions_to_apply=functions_to_apply,
+         uc_function_infos=uc_function_infos,
+     )
+
+     columns = {}
+     for odci in functions_to_apply:
+         if odci.output_name not in df.columns:
+             function_info = uc_function_infos[odci.udf_name]
+             # Resolve the bound arguments in the UDF parameter order.
+             udf_arguments = [
+                 odci.input_bindings[p.name] for p in function_info.input_params
+             ]
+             columns[odci.output_name] = _udf_expr(odci.udf_name, udf_arguments)
+     return df.withColumns(columns)
+
+
+ def get_feature_functions_with_full_udf_names(
+     feature_functions: List[FeatureFunction], current_catalog: str, current_schema: str
+ ):
+     """
+     Takes in a list of FeatureFunctions and returns copies with:
+     1. Fully qualified UDF names.
+     2. If output_name is empty, the fully qualified UDF name as output_name.
+     """
+     udf_names = {ff.udf_name for ff in feature_functions}
+     uc_utils._check_qualified_udf_names(udf_names)
+     uc_utils._verify_all_udfs_in_uc(udf_names, current_catalog, current_schema)
+
+     standardized_feature_functions = []
+     for ff in feature_functions:
+         ff_copy = copy.deepcopy(ff)
+         del ff
+
+         ff_copy._udf_name = uc_utils.get_full_udf_name(
+             ff_copy.udf_name, current_catalog, current_schema
+         )
+         if not ff_copy.output_name:
+             ff_copy._output_name = ff_copy.udf_name
+         standardized_feature_functions.append(ff_copy)
+     return standardized_feature_functions
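
A small PySpark sketch of the pattern used by _udf_expr and apply_functions_if_not_overridden: build an expr("udf(arg1, arg2)") per on-demand feature and append only the outputs that the DataFrame does not already contain, via DataFrame.withColumns (pyspark >= 3.3). The UDF, column names, and bindings below are hypothetical, and the identifier sanitization done by common_utils.sanitize_identifiers is omitted.

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Stand-in for a catalog-registered feature function.
spark.udf.register("total_price", lambda qty, price: float(qty) * price, "double")

df = spark.createDataFrame([(2, 9.5), (1, 3.0)], ["quantity", "unit_price"])

# output_name -> (udf_name, bound argument columns), mirroring OnDemandColumnInfo.
functions_to_apply = {"total_price_out": ("total_price", ["quantity", "unit_price"])}

new_columns = {
    output_name: expr(f"{udf_name}({', '.join(args)})")
    for output_name, (udf_name, args) in functions_to_apply.items()
    if output_name not in df.columns  # existing columns are never overridden
}
df.withColumns(new_columns).show()
spark.stop()
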
wedata/feature_store/utils/schema_utils.py
@@ -0,0 +1,117 @@
+ import logging
+
+ from wedata.feature_store.constants.constants import _ERROR, _WARN
+
+ _logger = logging.getLogger(__name__)
+
+
+ def catalog_matches_delta_schema(catalog_features, df_schema, column_filter=None):
+     """
+     Confirm that the column names and column types are the same.
+
+     Returns True if identical, False if there is a mismatch.
+
+     If column_filter is not None, only columns in column_filter must match.
+     """
+     if column_filter is not None:
+         catalog_features = [c for c in catalog_features if c.name in column_filter]
+         df_schema = [c for c in df_schema if c.name in column_filter]
+
+     catalog_schema = {
+         feature.name: feature.data_type for feature in catalog_features
+     }
+     delta_schema = {
+         feature.name: feature.dataType for feature in df_schema
+     }
+
+     complex_catalog_schema = get_complex_catalog_schema(
+         catalog_features, catalog_schema
+     )
+     complex_delta_schema = get_complex_delta_schema(df_schema, delta_schema)
+
+     return (
+         catalog_schema == delta_schema
+         and complex_catalog_schema == complex_delta_schema
+     )
+
+
+ def get_complex_delta_schema(delta_features, delta_feature_names_to_fs_types):
+     """
+     1. Filter the Delta features to those that have complex datatypes.
+     2. Take the existing Spark DataType stored on the Delta features. This is later used for
+        comparison against the Catalog schema's complex Spark DataTypes.
+     3. Return a mapping of feature names to their respective complex Spark DataTypes.
+
+     :param delta_features: List[Feature]. List of features stored in Delta.
+     :param delta_feature_names_to_fs_types: Map[str, feature_store.DataType]. A mapping of feature
+         names to their respective Feature Store DataTypes.
+     :return: Map[str, spark.sql.types.DataType]. A mapping of feature names to their respective
+         Spark DataTypes.
+     """
+     complex_delta_features = [
+         feature
+         for feature in delta_features
+         if delta_feature_names_to_fs_types[feature.name] in DATA_TYPES_REQUIRES_DETAILS
+     ]
+     complex_delta_feature_names_to_spark_types = {
+         feature.name: feature.dataType for feature in complex_delta_features
+     }
+     return complex_delta_feature_names_to_spark_types
+
+
+ def get_complex_catalog_schema(catalog_features, catalog_feature_names_to_fs_types):
+     """
+     1. Filter the catalog features to those that have complex datatypes.
+     2. Convert the JSON string stored in each feature's data_type_details to the corresponding
+        Spark DataType. This is later used for comparison against the Delta schema's complex Spark
+        DataTypes.
+     3. Return a mapping of feature names to their respective complex Spark DataTypes.
+
+     :param catalog_features: List[Feature]. List of features stored in the Catalog.
+     :param catalog_feature_names_to_fs_types: Map[str, feature_store.DataType]. A mapping of feature
+         names to their respective Feature Store DataTypes.
+     :return: Map[str, spark.sql.types.DataType]. A mapping of feature names to their respective
+         Spark DataTypes.
+     """
+     complex_catalog_features = [
+         feature
+         for feature in catalog_features
+         if catalog_feature_names_to_fs_types[feature.name] in DATA_TYPES_REQUIRES_DETAILS
+     ]
+     complex_catalog_feature_names_to_spark_types = {
+         feature.name: feature.data_type_details for feature in complex_catalog_features
+     }
+     return complex_catalog_feature_names_to_spark_types
+
+
+ def log_catalog_schema_not_match_delta_schema(catalog_features, df_schema, level):
+     """
+     Log that the catalog schema does not match the Delta table schema.
+
+     Example warning:
+     Expected recorded schema from Feature Catalog to be identical with
+     schema in Delta table. Feature Catalog's schema is
+     '{'id': 'INTEGER', 'feat1': 'INTEGER'}' while Delta table's
+     schema is '{'id': 'INTEGER', 'feat1': 'FLOAT'}'
+     """
+     catalog_schema = {feature.name: feature.data_type for feature in catalog_features}
+     delta_schema = {
+         feature.name: feature.dataType for feature in df_schema
+     }
+     msg = (
+         f"Expected recorded schema from Feature Catalog to be identical with schema "
+         f"in Delta table. "
+         f"Feature Catalog's schema is '{catalog_schema}' while Delta table's schema "
+         f"is '{delta_schema}'"
+     )
+     if level == _WARN:
+         _logger.warning(msg)
+     elif level == _ERROR:
+         raise RuntimeError(msg)
+     else:
+         _logger.info(msg)
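
schema_utils.py compares the Feature Catalog schema with the Delta/DataFrame schema by column name and type (note that DATA_TYPES_REQUIRES_DETAILS is referenced above but not imported in the file as shown; it presumably comes from the package's constants). A stand-alone sketch of the basic name/type comparison and the column_filter behaviour, with namedtuples standing in for catalog Feature entities and Spark StructFields and the complex-type comparison left out:

from collections import namedtuple

CatalogFeature = namedtuple("CatalogFeature", ["name", "data_type"])
DeltaField = namedtuple("DeltaField", ["name", "dataType"])

def schemas_match(catalog_features, df_schema, column_filter=None):
    if column_filter is not None:
        catalog_features = [c for c in catalog_features if c.name in column_filter]
        df_schema = [c for c in df_schema if c.name in column_filter]
    catalog_schema = {f.name: f.data_type for f in catalog_features}
    delta_schema = {f.name: f.dataType for f in df_schema}
    return catalog_schema == delta_schema

catalog = [CatalogFeature("id", "INTEGER"), CatalogFeature("feat1", "INTEGER")]
delta = [DeltaField("id", "INTEGER"), DeltaField("feat1", "FLOAT")]

print(schemas_match(catalog, delta))                        # False: feat1 types differ
print(schemas_match(catalog, delta, column_filter={"id"}))  # True: only 'id' is compared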