tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
wedata/feature_store/utils/feature_spec_utils.py
@@ -0,0 +1,286 @@
+ import logging
+ from dataclasses import dataclass
+ from functools import reduce
+ from typing import Dict, List, Tuple, Union
+
+ import yaml
+ from mlflow.utils.file_utils import YamlSafeDumper
+
+ from wedata.feature_store.entities.column_info import ColumnInfo
+ from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+ from wedata.feature_store.utils.topological_sort import topological_sort
+
+ DEFAULT_GRAPH_DEPTH_LIMIT = 5
+
+ COLUMN_INFO_TYPE_SOURCE = "SOURCE"
+ COLUMN_INFO_TYPE_ON_DEMAND = "ON_DEMAND"
+ COLUMN_INFO_TYPE_FEATURE = "FEATURE"
+
+ _logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class FeatureExecutionGroup:
+     type: str  # one of FEATURE, ON_DEMAND, SOURCE
+     features: Union[
+         List[FeatureColumnInfo], List[OnDemandColumnInfo], List[SourceDataColumnInfo]
+     ]
+
+
+ # Smaller numbers have higher priority. Besides SOURCE, FEATURE is preferred over ON_DEMAND in
+ # topological sorting so that ON_DEMAND columns come after FEATURE columns in simple cases,
+ # aligning with the assumption that held before TLT was implemented.
+ # NOTE: changing this priority may cause a performance regression; proceed with caution.
+ COLUMN_TYPE_PRIORITY = {
+     COLUMN_INFO_TYPE_SOURCE: 0,
+     COLUMN_INFO_TYPE_ON_DEMAND: 1,
+     COLUMN_INFO_TYPE_FEATURE: 2,
+ }
+
+
+ class _GraphNode:
+     def __init__(self, column_info: ColumnInfo):
+         info = column_info.info
+         self.column_info = column_info
+         self.output_name = info.output_name
+
+         if isinstance(column_info.info, SourceDataColumnInfo):
+             self.input_names = set()
+             self.type = COLUMN_INFO_TYPE_SOURCE
+         elif isinstance(column_info.info, FeatureColumnInfo):
+             self.input_names = set(info.lookup_key)
+             self.type = COLUMN_INFO_TYPE_FEATURE
+         elif isinstance(column_info.info, OnDemandColumnInfo):
+             self.input_names = set(info.input_bindings.values())
+             self.type = COLUMN_INFO_TYPE_ON_DEMAND
+         else:
+             raise ValueError("unknown column info type")
+
+     def __str__(self):
+         return "node<" + self.output_name + ">"
+
+     def __repr__(self):
+         return str(self)
+
+
+ def _column_info_sort_key(node: _GraphNode) -> Tuple[int, str]:
+     """
+     Returns a tuple of an int and a str as the sorting key for a _GraphNode. Priority is
+     determined by the first element; the second element breaks ties.
+     """
+     return COLUMN_TYPE_PRIORITY[node.type], node.output_name
+
+
+ def _should_be_grouped(node: _GraphNode) -> bool:
+     """
+     Returns True if the given node is of a type that should be grouped together as much as possible.
+     """
+     return node.type == COLUMN_INFO_TYPE_FEATURE
+
+
+ def _validate_graph_depth(nodes: List[_GraphNode], depth_limit: int):
+     name_to_node = {node.output_name: node for node in nodes}
+     visited_depth = {}
+
+     def dfs(node: _GraphNode, depth: int):
+         if depth > depth_limit:
+             raise ValueError(
+                 f"The given graph contains a dependency path longer than the limit {depth_limit}"
+             )
+         if (
+             node.output_name in visited_depth
+             and depth <= visited_depth[node.output_name]
+         ):
+             return
+         visited_depth[node.output_name] = depth
+         for column_name in node.input_names:
+             dependency = name_to_node[column_name]
+             dfs(dependency, depth + 1)
+
+     for node in nodes:
+         dfs(node, 1)
+
+
+ def get_encoded_graph_map(column_infos: List[ColumnInfo]) -> Dict[str, List[str]]:
+     """
+     Creates a dictionary of columns with their dependency columns for metric use. Columns are
+     encoded with a string representing their type and index. For example:
+     {
+         "f3": ["s1", "s2"],
+         "o4": ["f3"],
+         "o5": []
+     }
+     "s1" and "s2" are SourceDataColumnInfos, "f3" is a FeatureColumnInfo and "o4", "o5" are
+     OnDemandColumnInfos. "f3" depends on "s1" and "s2", "o5" doesn't depend on any column, etc.
+
+     :param column_infos: A list of ColumnInfos.
+     """
+     nodes = {info.output_name: _GraphNode(info) for info in column_infos}
+     next_node_index = 0
+     # A map from a column info's output_name to its label.
+     node_label = {}
+
+     def get_node_label(node):
+         nonlocal next_node_index
+         output_name = node.output_name
+         if output_name not in node_label:
+             if node.type == COLUMN_INFO_TYPE_SOURCE:
+                 type_simple_str = "s"
+             elif node.type == COLUMN_INFO_TYPE_FEATURE:
+                 type_simple_str = "f"
+             elif node.type == COLUMN_INFO_TYPE_ON_DEMAND:
+                 type_simple_str = "o"
+             new_label = type_simple_str + str(next_node_index)
+             next_node_index += 1
+             node_label[output_name] = new_label
+         return node_label[output_name]
+
+     graph_map = {}
+     for node in nodes.values():
+         label = get_node_label(node)
+         dependencies = []
+         for dep_name in sorted(node.input_names):
+             if dep_name not in nodes:
+                 # Skip the column if it's not in the feature spec.
+                 continue
+             dep = get_node_label(nodes[dep_name])
+             dependencies.append(dep)
+         graph_map[label] = dependencies
+     return graph_map
+
+
+ def assign_topological_ordering(
+     column_infos: List[ColumnInfo],
+     allow_missing_source_columns=False,
+     graph_depth_limit=DEFAULT_GRAPH_DEPTH_LIMIT,
+ ) -> List[ColumnInfo]:
+     """
+     Assigns the topological ordering for each ColumnInfo of the input. Returns a list of new
+     ColumnInfo objects with topological_ordering set to an integer.
+
+     :param column_infos: a list of ColumnInfos.
+     :param allow_missing_source_columns: ONLY USED BY FSE TEMPORARILY. Allows lookup keys or
+         function inputs to be missing from the source columns. If true, this method assigns
+         topological_ordering to columns as if the missing sources were present in column_infos.
+     :param graph_depth_limit: raises if the given graph exceeds the limit.
+     :raises ValueError: if there is a cycle in the graph.
+     """
+     nodes = list(map(lambda c: _GraphNode(c), column_infos))
+     # allow_missing_source_columns is used when feature_serving_endpoint_client creates training
+     # sets. It doesn't include source columns in the dataframe.
+     # TODO[ML-33809]: clean up allow_missing_source_columns.
+     all_output_names = set([n.output_name for n in nodes])
+     all_input_names = reduce(lambda a, b: a | b, [n.input_names for n in nodes])
+     missing_inputs = all_input_names - all_output_names
+     if allow_missing_source_columns:
+         for input_name in missing_inputs:
+             if input_name not in all_output_names:
+                 nodes.append(
+                     _GraphNode(ColumnInfo(SourceDataColumnInfo(input_name), False))
+                 )
+     elif len(missing_inputs) > 0:
+         missing_input_names_str = ", ".join(
+             [f"'{name}'" for name in sorted(missing_inputs)]
+         )
+         raise ValueError(
+             f"Input columns {missing_input_names_str} required by FeatureLookups or "
+             "FeatureFunctions are not provided by the input DataFrame or by other "
+             "FeatureFunctions and FeatureLookups"
+         )
+     output_name_to_node = {node.output_name: node for node in nodes}
+     graph = {
+         node: [output_name_to_node[input_name] for input_name in node.input_names]
+         for node in nodes
+     }
+     sorted_nodes = topological_sort(graph, _column_info_sort_key, _should_be_grouped)
+     # Validate depth after sorting the graph, because cycles are detected during sorting.
+     _validate_graph_depth(nodes, graph_depth_limit)
+     name_to_ordering = {node.output_name: i for i, node in enumerate(sorted_nodes)}
+     return [
+         column.with_topological_ordering(name_to_ordering[column.output_name])
+         for column in column_infos
+     ]
+
+
+ def get_feature_execution_groups(
+     feature_spec: FeatureSpec, df_columns: List[str] = []
+ ) -> List[FeatureExecutionGroup]:
+     """
+     Splits the list of column_infos in feature_spec into groups based on the topological_ordering
+     of the column_infos, such that each group contains only one type of feature column and the
+     columns in a group don't depend on other columns in the same group. The type of a feature
+     column is the class type of the column_info.info field.
+     Example:
+         Given a FeatureSpec with some columns, after sorting the columns by topological_ordering,
+         assume the sorted list is:
+             [source_1, feature_2, feature_3, on_demand_4, on_demand_5]
+         where feature_3 depends on feature_2. The resulting groups will be:
+             [
+                 group(SOURCE, [source_1]),
+                 group(FEATURE, [feature_2]),
+                 group(FEATURE, [feature_3]),
+                 group(ON_DEMAND, [on_demand_4, on_demand_5]),
+             ]
+
+     :param feature_spec: A FeatureSpec with topologically sorted column_infos.
+     :param df_columns: the columns of the DataFrame used to create_training_set or score_batch.
+     """
+     # Convert column infos into _GraphNodes.
+     nodes = list(map(lambda c: _GraphNode(c), feature_spec.column_infos))
+     if any(info.topological_ordering is None for info in feature_spec.column_infos):
+         # Old versions of feature_spec may not have topological_ordering; we can safely assume
+         # the columns are already sorted because of validations during feature_spec creation.
+         _logger.warning(
+             "Processing a feature spec in which at least one of the column_infos has no "
+             "topological_ordering"
+         )
+     else:
+         # Sort nodes by topological_ordering.
+         nodes = sorted(nodes, key=lambda n: n.column_info.topological_ordering)
+     # A buffer holding the columns of the current group.
+     buffer = []
+     # Output names of columns in the current buffer.
+     buffered_output_names = set()
+     # Used to validate the topological sorting.
+     # df_columns is used for backward compatibility: in old FeatureSpecs, source columns might not
+     # exist, so the df columns are treated as the initially resolved columns.
+     resolved_columns = set(df_columns)
+     result_list = []
+     last_type = None
+     for node in nodes:
+         if not node.input_names.issubset(resolved_columns):
+             raise ValueError(
+                 "The column_infos in the FeatureSpec are not topologically sorted"
+             )
+         if node.type != last_type or buffered_output_names.intersection(
+             node.input_names
+         ):
+             # Split the group if the current node has a different type from the previous node OR
+             # any of its inputs come from nodes in the current group.
+             if buffer:
+                 result_list.append(FeatureExecutionGroup(last_type, buffer))
+                 buffer = []
+                 buffered_output_names.clear()
+             last_type = node.type
+         buffer.append(node.column_info.info)
+         resolved_columns.add(node.output_name)
+         buffered_output_names.add(node.output_name)
+     if buffer:
+         result_list.append(FeatureExecutionGroup(last_type, buffer))
+     return result_list
+
+
+ def convert_to_yaml_string(feature_spec: FeatureSpec) -> str:
+     """
+     Converts the given FeatureSpec to a YAML string.
+     """
+     feature_spec_dict = feature_spec._to_dict()
+     return yaml.dump(
+         feature_spec_dict,
+         default_flow_style=False,
+         allow_unicode=True,
+         sort_keys=False,
+         Dumper=YamlSafeDumper,
+     )
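
The grouping behaviour of get_feature_execution_groups above is easiest to see on plain data. The stand-alone sketch below (not part of the wheel; the (name, type, inputs) tuples are a hypothetical stand-in for ColumnInfo objects) reproduces the same walk: start a new group whenever the column type changes or an input comes from the group currently being built, and reject input that is not topologically sorted.

from typing import List, Set, Tuple

ColumnLike = Tuple[str, str, Set[str]]  # (output_name, column type, input column names)

def group_columns(columns: List[ColumnLike]):
    groups, buffer, buffered_names, last_type = [], [], set(), None
    resolved = set()
    for name, col_type, inputs in columns:
        if not inputs <= resolved:
            raise ValueError("columns are not topologically sorted")
        # Split when the type changes or an input comes from the current group.
        if col_type != last_type or buffered_names & inputs:
            if buffer:
                groups.append((last_type, buffer))
                buffer, buffered_names = [], set()
            last_type = col_type
        buffer.append(name)
        resolved.add(name)
        buffered_names.add(name)
    if buffer:
        groups.append((last_type, buffer))
    return groups

columns = [
    ("s1", "SOURCE", set()),
    ("f2", "FEATURE", {"s1"}),
    ("f3", "FEATURE", {"f2"}),      # depends on f2, so it starts a new FEATURE group
    ("o4", "ON_DEMAND", {"f2"}),
    ("o5", "ON_DEMAND", {"s1"}),
]
print(group_columns(columns))
# [('SOURCE', ['s1']), ('FEATURE', ['f2']), ('FEATURE', ['f3']), ('ON_DEMAND', ['o4', 'o5'])]
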
wedata/feature_store/utils/feature_utils.py
@@ -0,0 +1,73 @@
+ import copy
+ from typing import List, Union
+
+ from wedata.feature_store.entities.feature_function import FeatureFunction
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
+ from wedata.feature_store.spark_client.spark_client import SparkClient
+ from wedata.feature_store.utils import uc_utils
+ from wedata.feature_store.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names
+
+
+ def format_feature_lookups_and_functions(
+     _spark_client: SparkClient, features: List[Union[FeatureLookup, FeatureFunction]]
+ ):
+     fl_idx = []
+     ff_idx = []
+     feature_lookups = []
+     feature_functions = []
+     for idx, feature in enumerate(features):
+         if isinstance(feature, FeatureLookup):
+             fl_idx.append(idx)
+             feature_lookups.append(feature)
+         elif isinstance(feature, FeatureFunction):
+             ff_idx.append(idx)
+             feature_functions.append(feature)
+         else:
+             raise ValueError(
+                 f"Expected a list of FeatureLookups and FeatureFunctions, but received type '{type(feature)}'."
+             )
+
+     # FeatureLookups and FeatureFunctions must have fully qualified table and UDF names.
+     feature_lookups = get_feature_lookups_with_full_table_names(
+         feature_lookups,
+         _spark_client.get_current_catalog(),
+         _spark_client.get_current_database(),
+     )
+     feature_functions = get_feature_functions_with_full_udf_names(
+         feature_functions,
+         _spark_client.get_current_catalog(),
+         _spark_client.get_current_database(),
+     )
+
+     # Restore the original order of FeatureLookups and FeatureFunctions. Copy to avoid mutating
+     # the original list.
+     features = features.copy()
+     for idx, feature in zip(fl_idx + ff_idx, feature_lookups + feature_functions):
+         features[idx] = feature
+
+     return features
+
+
+ def get_feature_functions_with_full_udf_names(
+     feature_functions: List[FeatureFunction], current_catalog: str, current_schema: str
+ ):
+     """
+     Takes in a list of FeatureFunctions and returns copies with:
+     1. Fully qualified UDF names.
+     2. If output_name is empty, the fully qualified UDF name as output_name.
+     """
+     udf_names = {ff.udf_name for ff in feature_functions}
+     uc_utils._check_qualified_udf_names(udf_names)
+     uc_utils._verify_all_udfs_in_uc(udf_names, current_catalog, current_schema)
+
+     standardized_feature_functions = []
+     for ff in feature_functions:
+         ff_copy = copy.deepcopy(ff)
+         del ff
+
+         ff_copy._udf_name = uc_utils.get_full_udf_name(
+             ff_copy.udf_name, current_catalog, current_schema
+         )
+         if not ff_copy.output_name:
+             ff_copy._output_name = ff_copy.udf_name
+         standardized_feature_functions.append(ff_copy)
+     return standardized_feature_functions
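
format_feature_lookups_and_functions above splits a mixed features list by type, rewrites each sublist to fully qualified names, and then writes the results back into their original positions. A minimal sketch of that split/transform/restore pattern, using plain strings instead of FeatureLookup/FeatureFunction objects (the "table:"/"udf:" prefixes and the qualify helper are invented for illustration, not part of the package):

def qualify(names, catalog, schema):
    # Stand-in for get_feature_lookups_with_full_table_names / get_feature_functions_with_full_udf_names.
    return [n if n.count(".") == 2 else f"{catalog}.{schema}.{n}" for n in names]

def format_mixed(features, catalog="prod", schema="ml"):
    lookup_idx, function_idx, lookups, functions = [], [], [], []
    for idx, feature in enumerate(features):
        if feature.startswith("table:"):
            lookup_idx.append(idx)
            lookups.append(feature[len("table:"):])
        else:
            function_idx.append(idx)
            functions.append(feature[len("udf:"):])

    lookups = qualify(lookups, catalog, schema)
    functions = qualify(functions, catalog, schema)

    # Restore the original ordering without mutating the caller's list.
    features = features.copy()
    for idx, feature in zip(lookup_idx + function_idx, lookups + functions):
        features[idx] = feature
    return features

print(format_mixed(["table:customers", "udf:age_bucket", "table:shop.core.orders"]))
# ['prod.ml.customers', 'prod.ml.age_bucket', 'shop.core.orders']
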
wedata/feature_store/utils/on_demand_utils.py
@@ -0,0 +1,107 @@
+ import copy
+ from typing import Dict, List
+
+ from pyspark.sql import Column, DataFrame
+ from pyspark.sql.functions import expr
+
+ from wedata.feature_store.entities.feature_function import FeatureFunction
+ from wedata.feature_store.entities.function_info import FunctionInfo
+ from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from wedata.feature_store.utils import common_utils, uc_utils
+
+
+ def _udf_expr(udf_name: str, arguments: List[str]) -> Column:
+     """
+     Generate a Spark SQL expression, e.g. expr("udf_name(col1, col2)").
+     """
+     arguments_str = ", ".join(common_utils.sanitize_identifiers(arguments))
+     return expr(f"{udf_name}({arguments_str})")
+
+
+ def _validate_apply_functions_df(
+     df: DataFrame,
+     functions_to_apply: List[OnDemandColumnInfo],
+     uc_function_infos: Dict[str, FunctionInfo],
+ ):
+     """
+     Validate the following:
+     1. On-demand input columns specified by functions_to_apply exist in the DataFrame.
+     2. On-demand input columns have data types that match those of the UDF parameters.
+     """
+     for odci in functions_to_apply:
+         function_info = uc_function_infos[odci.udf_name]
+         types_dict = dict(df.dtypes)
+
+         for p in function_info.input_params:
+             arg_column = odci.input_bindings[p.name]
+             if arg_column not in df.columns:
+                 raise ValueError(
+                     f"FeatureFunction argument column '{arg_column}' for UDF '{odci.udf_name}' parameter '{p.name}' "
+                     f"does not exist in the provided DataFrame with schema '{df.schema}'."
+                 )
+             if types_dict[arg_column] != p.type_text:
+                 raise ValueError(
+                     f"FeatureFunction argument column '{arg_column}' for UDF '{odci.udf_name}' parameter '{p.name}' "
+                     f"does not have the expected type. Argument column '{arg_column}' has type "
+                     f"'{types_dict[arg_column]}' and parameter '{p.name}' has type '{p.type_text}'."
+                 )
+
+
+ def apply_functions_if_not_overridden(
+     df: DataFrame,
+     functions_to_apply: List[OnDemandColumnInfo],
+     uc_function_infos: Dict[str, FunctionInfo],
+ ) -> DataFrame:
+     """
+     For all on-demand features, in the order defined by the FeatureSpec:
+     if the feature does not already exist, append the evaluated UDF expression.
+     Existing column values and column positions are not modified.
+
+     `_validate_apply_functions_df` validates that the UDFs can be applied to the `df` schema.
+
+     The caller should validate that:
+     1. FeatureFunction bound argument columns for UDF parameters exist in FeatureSpec-defined features.
+     2. FeatureFunction output feature names are unique.
+     """
+     _validate_apply_functions_df(
+         df=df,
+         functions_to_apply=functions_to_apply,
+         uc_function_infos=uc_function_infos,
+     )
+
+     columns = {}
+     for odci in functions_to_apply:
+         if odci.output_name not in df.columns:
+             function_info = uc_function_infos[odci.udf_name]
+             # Resolve the bound arguments in the UDF parameter order.
+             udf_arguments = [
+                 odci.input_bindings[p.name] for p in function_info.input_params
+             ]
+             columns[odci.output_name] = _udf_expr(odci.udf_name, udf_arguments)
+     return df.withColumns(columns)
+
+
+ def get_feature_functions_with_full_udf_names(
+     feature_functions: List[FeatureFunction], current_catalog: str, current_schema: str
+ ):
+     """
+     Takes in a list of FeatureFunctions and returns copies with:
+     1. Fully qualified UDF names.
+     2. If output_name is empty, the fully qualified UDF name as output_name.
+     """
+     udf_names = {ff.udf_name for ff in feature_functions}
+     uc_utils._check_qualified_udf_names(udf_names)
+     uc_utils._verify_all_udfs_in_uc(udf_names, current_catalog, current_schema)
+
+     standardized_feature_functions = []
+     for ff in feature_functions:
+         ff_copy = copy.deepcopy(ff)
+         del ff
+
+         ff_copy._udf_name = uc_utils.get_full_udf_name(
+             ff_copy.udf_name, current_catalog, current_schema
+         )
+         if not ff_copy.output_name:
+             ff_copy._output_name = ff_copy.udf_name
+         standardized_feature_functions.append(ff_copy)
+     return standardized_feature_functions
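
A small PySpark sketch of the pattern used by _udf_expr and apply_functions_if_not_overridden: build an expr("udf(arg1, arg2)") per on-demand feature and append only the outputs that the DataFrame does not already contain, via DataFrame.withColumns (pyspark >= 3.3). The UDF, column names, and bindings below are hypothetical, and the identifier sanitization done by common_utils.sanitize_identifiers is omitted.

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Stand-in for a catalog-registered feature function.
spark.udf.register("total_price", lambda qty, price: float(qty) * price, "double")

df = spark.createDataFrame([(2, 9.5), (1, 3.0)], ["quantity", "unit_price"])

# output_name -> (udf_name, bound argument columns), mirroring OnDemandColumnInfo.
functions_to_apply = {"total_price_out": ("total_price", ["quantity", "unit_price"])}

new_columns = {
    output_name: expr(f"{udf_name}({', '.join(args)})")
    for output_name, (udf_name, args) in functions_to_apply.items()
    if output_name not in df.columns  # existing columns are never overridden
}
df.withColumns(new_columns).show()
spark.stop()
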
wedata/feature_store/utils/schema_utils.py
@@ -0,0 +1,117 @@
+ import logging
+
+ from wedata.feature_store.constants.constants import _ERROR, _WARN
+
+ _logger = logging.getLogger(__name__)
+
+
+ def catalog_matches_delta_schema(catalog_features, df_schema, column_filter=None):
+     """
+     Confirm that the column names and column types are the same.
+
+     Returns True if identical, False if there is a mismatch.
+
+     If column_filter is not None, only columns in column_filter must match.
+     """
+     if column_filter is not None:
+         catalog_features = [c for c in catalog_features if c.name in column_filter]
+         df_schema = [c for c in df_schema if c.name in column_filter]
+
+     catalog_schema = {
+         feature.name: feature.data_type for feature in catalog_features
+     }
+     delta_schema = {
+         feature.name: feature.dataType for feature in df_schema
+     }
+
+     complex_catalog_schema = get_complex_catalog_schema(
+         catalog_features, catalog_schema
+     )
+     complex_delta_schema = get_complex_delta_schema(df_schema, delta_schema)
+
+     return (
+         catalog_schema == delta_schema
+         and complex_catalog_schema == complex_delta_schema
+     )
+
+
+ def get_complex_delta_schema(delta_features, delta_feature_names_to_fs_types):
+     """
+     1. Filter the Delta features to those that have complex datatypes.
+     2. Take the existing Spark DataType stored on the Delta features. This is later used for
+        comparison against the Catalog schema's complex Spark DataTypes.
+     3. Return a mapping of feature names to their respective complex Spark DataTypes.
+
+     :param delta_features: List[Feature]. List of features stored in Delta.
+     :param delta_feature_names_to_fs_types: Map[str, feature_store.DataType]. A mapping of feature
+         names to their respective Feature Store DataTypes.
+     :return: Map[str, spark.sql.types.DataType]. A mapping of feature names to their respective
+         Spark DataTypes.
+     """
+     complex_delta_features = [
+         feature
+         for feature in delta_features
+         if delta_feature_names_to_fs_types[feature.name] in DATA_TYPES_REQUIRES_DETAILS
+     ]
+     complex_delta_feature_names_to_spark_types = {
+         feature.name: feature.dataType for feature in complex_delta_features
+     }
+     return complex_delta_feature_names_to_spark_types
+
+
+ def get_complex_catalog_schema(catalog_features, catalog_feature_names_to_fs_types):
+     """
+     1. Filter the catalog features to those that have complex datatypes.
+     2. Convert the JSON string stored in each feature's data_type_details to the corresponding
+        Spark DataType. This is later used for comparison against the Delta schema's complex Spark
+        DataTypes.
+     3. Return a mapping of feature names to their respective complex Spark DataTypes.
+
+     :param catalog_features: List[Feature]. List of features stored in the Catalog.
+     :param catalog_feature_names_to_fs_types: Map[str, feature_store.DataType]. A mapping of feature
+         names to their respective Feature Store DataTypes.
+     :return: Map[str, spark.sql.types.DataType]. A mapping of feature names to their respective
+         Spark DataTypes.
+     """
+     complex_catalog_features = [
+         feature
+         for feature in catalog_features
+         if catalog_feature_names_to_fs_types[feature.name] in DATA_TYPES_REQUIRES_DETAILS
+     ]
+     complex_catalog_feature_names_to_spark_types = {
+         feature.name: feature.data_type_details for feature in complex_catalog_features
+     }
+     return complex_catalog_feature_names_to_spark_types
+
+
+ def log_catalog_schema_not_match_delta_schema(catalog_features, df_schema, level):
+     """
+     Log that the catalog schema does not match the Delta table schema.
+
+     Example warning:
+     Expected recorded schema from Feature Catalog to be identical with
+     schema in Delta table. Feature Catalog's schema is
+     '{'id': 'INTEGER', 'feat1': 'INTEGER'}' while Delta table's
+     schema is '{'id': 'INTEGER', 'feat1': 'FLOAT'}'
+     """
+     catalog_schema = {feature.name: feature.data_type for feature in catalog_features}
+     delta_schema = {
+         feature.name: feature.dataType for feature in df_schema
+     }
+     msg = (
+         f"Expected recorded schema from Feature Catalog to be identical with schema "
+         f"in Delta table. "
+         f"Feature Catalog's schema is '{catalog_schema}' while Delta table's schema "
+         f"is '{delta_schema}'"
+     )
+     if level == _WARN:
+         _logger.warning(msg)
+     elif level == _ERROR:
+         raise RuntimeError(msg)
+     else:
+         _logger.info(msg)
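
schema_utils.py compares the Feature Catalog schema with the Delta/DataFrame schema by column name and type (note that DATA_TYPES_REQUIRES_DETAILS is referenced above but not imported in the file as shown; it presumably comes from the package's constants). A stand-alone sketch of the basic name/type comparison and the column_filter behaviour, with namedtuples standing in for catalog Feature entities and Spark StructFields and the complex-type comparison left out:

from collections import namedtuple

CatalogFeature = namedtuple("CatalogFeature", ["name", "data_type"])
DeltaField = namedtuple("DeltaField", ["name", "dataType"])

def schemas_match(catalog_features, df_schema, column_filter=None):
    if column_filter is not None:
        catalog_features = [c for c in catalog_features if c.name in column_filter]
        df_schema = [c for c in df_schema if c.name in column_filter]
    catalog_schema = {f.name: f.data_type for f in catalog_features}
    delta_schema = {f.name: f.dataType for f in df_schema}
    return catalog_schema == delta_schema

catalog = [CatalogFeature("id", "INTEGER"), CatalogFeature("feat1", "INTEGER")]
delta = [DeltaField("id", "INTEGER"), DeltaField("feat1", "FLOAT")]

print(schemas_match(catalog, delta))                        # False: feat1 types differ
print(schemas_match(catalog, delta, column_filter={"id"}))  # True: only 'id' is compared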