tencent-wedata-feature-engineering-dev 0.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
wedata/feature_store/entities/feature_lookup.py
@@ -0,0 +1,200 @@
+ import copy
+ import datetime
+ import logging
+ from typing import Dict, List, Optional, Union
+
+ from wedata.feature_store.utils import common_utils
+ from wedata.feature_store.common.store_config.redis import RedisStoreConfig
+
+ _logger = logging.getLogger(__name__)
+
+
+ class FeatureLookup:
+     """
+     Feature lookup.
+
+     A FeatureLookup specifies features in a feature table and how to join them to the features in a training set.
+
+     A FeatureLookup has the following attributes:
+
+     - table_name: The name of the feature table.
+     - lookup_key: The key used to join the feature table to the training set. lookup_key must be a column in the training set. The types and order of lookup_key must match the primary keys of the feature table.
+     - is_online: If True, the online feature table is used; if False, the offline feature table is used. Defaults to False.
+     - online_config: If is_online is True, this configuration is used for the online feature table. Defaults to None.
+     - feature_names: The names of the features to look up from the feature table. If your model requires primary keys as features, declare them as independent FeatureLookups.
+     - rename_outputs: If provided, renames the features in the :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+     - timestamp_lookup_key: The timestamp key used to join the feature table to the training set. timestamp_lookup_key must be a column in the training set, and its type must match the type of the feature table's timestamp key.
+     - lookback_window: The lookback window to use when performing point-in-time lookups on the feature table against the dataframe passed to :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`. The feature store retrieves the latest feature value prior to the timestamp specified in the dataframe's ``timestamp_lookup_key`` and within the ``lookback_window``, or null if no such feature value exists. When set to 0, only exact matches from the feature table are returned.
+     - feature_name: The feature name. **Deprecated**. Use `feature_names`.
+     - output_name: If provided, renames this feature in the :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`. **Deprecated**. Use `rename_outputs`.
+
+     Example:
+
+         from databricks.feature_store import FeatureLookup
+
+         lookup = FeatureLookup(
+             table_name="my_feature_table",
+             lookup_key="my_lookup_key",
+             feature_names=["my_feature_1", "my_feature_2"],
+             rename_outputs={"my_feature_1": "my_feature_1_renamed"},
+             timestamp_lookup_key="my_timestamp_lookup_key",
+             lookback_window=datetime.timedelta(days=1)
+         )
+     """
+
+     def __init__(
+         self,
+         table_name: str,
+         lookup_key: Union[str, List[str]],
+         *,
+         is_online: bool = False,
+         online_config: Optional[RedisStoreConfig] = None,
+         feature_names: Union[str, List[str], None] = None,
+         rename_outputs: Optional[Dict[str, str]] = None,
+         timestamp_lookup_key: Optional[str] = None,
+         lookback_window: Optional[datetime.timedelta] = None,
+         **kwargs,
+     ):
+         """Initialize a FeatureLookup object. See class documentation."""
+
+         self._feature_name_deprecated = kwargs.pop("feature_name", None)
+         self._output_name_deprecated = kwargs.pop("output_name", None)
+
+         if kwargs:
+             raise TypeError(
+                 f"FeatureLookup got unexpected keyword argument(s): {list(kwargs.keys())}"
+             )
+
+         self._table_name = table_name
+
+         if isinstance(timestamp_lookup_key, list):
+             if len(timestamp_lookup_key) == 0:
+                 timestamp_lookup_key = None
+             elif len(timestamp_lookup_key) == 1:
+                 timestamp_lookup_key = timestamp_lookup_key[0]
+             else:
+                 raise ValueError(
+                     "Setting multiple timestamp lookup keys is not supported."
+                 )
+
+         if rename_outputs is not None and not isinstance(rename_outputs, dict):
+             raise ValueError(
+                 f"Unexpected type for rename_outputs: {type(rename_outputs)}"
+             )
+
+         self._feature_names = common_utils.as_list(feature_names, default=[])
+
+         # Make sure the user didn't accidentally pass in any nested lists/dicts in feature_names
+         for fn in self._feature_names:
+             if not isinstance(fn, str):
+                 raise ValueError(
+                     f"Unexpected type for element in feature_names: {type(fn)}, only strings allowed in list"
+                 )
+
+         if lookback_window is not None:
+             if not timestamp_lookup_key:
+                 raise ValueError(
+                     f"Unexpected lookback_window value: {lookback_window}, lookback windows can only be applied on time series "
+                     f"feature tables. Use timestamp_lookup_key to perform point-in-time lookups with lookback window."
+                 )
+             if not isinstance(
+                 lookback_window, datetime.timedelta
+             ) or lookback_window < datetime.timedelta(0):
+                 raise ValueError(
+                     f"Unexpected value for lookback_window: {lookback_window}, only non-negative datetime.timedelta allowed."
+                 )
+
+         self._lookup_key = copy.copy(lookup_key)
+         self._timestamp_lookup_key = copy.copy(timestamp_lookup_key)
+         self._lookback_window = copy.copy(lookback_window)
+         self._is_online = is_online
+         self._online_config = online_config
+
+         self._rename_outputs = {}
+         if rename_outputs is not None:
+             self._rename_outputs = rename_outputs.copy()
+
+         self._inject_deprecated_feature_name()
+         self._inject_deprecated_output_name()
+
+     @property
+     def table_name(self):
+         """The table name to use in this FeatureLookup."""
+         return self._table_name
+
+     @table_name.setter
+     def table_name(self, value):
+         self._table_name = value
+
+     @property
+     def lookup_key(self):
+         """The lookup key(s) to use in this FeatureLookup."""
+         return self._lookup_key
+
+     @property
+     def feature_name(self):
+         """The feature name to use in this FeatureLookup. **Deprecated**. Use `feature_names`."""
+         return self._feature_name_deprecated
+
+     @property
+     def feature_names(self):
+         """The feature names to use in this FeatureLookup."""
+         return self._feature_names
+
+     @property
+     def output_name(self):
+         """The output name to use in this FeatureLookup. **Deprecated**. Use `rename_outputs`."""
+         if self._output_name_deprecated:
+             return self._output_name_deprecated
+         else:
+             return self._feature_name_deprecated
+
+     @property
+     def timestamp_lookup_key(self):
+         """The timestamp lookup key to use in this FeatureLookup."""
+         return self._timestamp_lookup_key
+
+     @property
+     def lookback_window(self):
+         """A lookback window applied only for point-in-time lookups."""
+         return self._lookback_window
+
+     @property
+     def is_online(self):
+         """Whether to use online feature tables."""
+         return self._is_online
+
+     @property
+     def online_config(self):
+         """The online feature table configuration."""
+         return self._online_config
+
+     def _get_feature_names(self):
+         return self._feature_names
+
+     def _get_output_name(self, feature_name):
+         """Look up the renamed output, or fall back to the feature name itself if no mapping is present."""
+         return self._rename_outputs.get(feature_name, feature_name)
+
+     def _inject_deprecated_feature_name(self):
+         if self._feature_name_deprecated:
+             if len(self._feature_names) > 0:
+                 raise ValueError(
+                     "Use either feature_names or feature_name parameter, but not both."
+                 )
+             _logger.warning(
+                 'The feature_name parameter is deprecated. Use "feature_names".'
+             )
+             self._feature_names = [self._feature_name_deprecated]
+
+     def _inject_deprecated_output_name(self):
+         if len(self._feature_names) == 1 and self._output_name_deprecated:
+             if len(self._rename_outputs) > 0:
+                 raise ValueError(
+                     "Use either output_name or rename_outputs parameter, but not both."
+                 )
+             _logger.warning(
+                 'The output_name parameter is deprecated. Use "rename_outputs".'
+             )
+             self._rename_outputs[self._feature_names[0]] = self._output_name_deprecated
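
For orientation, here is a minimal usage sketch of the FeatureLookup constructor above. The table, column, and key names are hypothetical, and the RedisStoreConfig keyword arguments are assumptions (the real fields are defined in wedata/feature_store/common/store_config/redis.py):

    import datetime

    from wedata.feature_store.common.store_config.redis import RedisStoreConfig
    from wedata.feature_store.entities.feature_lookup import FeatureLookup

    # Offline, point-in-time lookup with a one-day lookback window.
    lookup = FeatureLookup(
        table_name="user_features",                     # hypothetical feature table
        lookup_key="user_id",                           # must be a column in the training set
        feature_names=["purchase_count", "avg_spend"],  # hypothetical feature columns
        rename_outputs={"avg_spend": "avg_spend_30d"},
        timestamp_lookup_key="event_ts",
        lookback_window=datetime.timedelta(days=1),
    )

    # Online variant; the RedisStoreConfig arguments here are illustrative only.
    online_lookup = FeatureLookup(
        table_name="user_features",
        lookup_key="user_id",
        is_online=True,
        online_config=RedisStoreConfig(host="127.0.0.1", port=6379),
    )

Passing the deprecated feature_name or output_name keywords still works, but logs a warning and is internally rewritten into feature_names and rename_outputs.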
wedata/feature_store/entities/feature_spec.py
@@ -0,0 +1,489 @@
+ import importlib.metadata
+ import os
+ from typing import Any, Dict, List, Optional, Type, Union
+
+ import mlflow
+
+ from google.protobuf.json_format import MessageToDict, ParseDict
+ from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
+
+ from wedata.feature_store.common.protos import feature_store_pb2
+ from wedata.feature_store.entities.column_info import ColumnInfo
+ from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+ from wedata.feature_store.entities.function_info import FunctionInfo
+ from wedata.feature_store.entities.feature_spec_constants import (
+     BOUND_TO,
+     DATA_TYPE,
+     FEATURE_COLUMN_INFO,
+     FEATURE_STORE,
+     INCLUDE,
+     INPUT_BINDINGS,
+     INPUT_COLUMNS,
+     INPUT_FUNCTIONS,
+     INPUT_TABLES,
+     NAME,
+     ON_DEMAND_COLUMN_INFO,
+     ON_DEMAND_FEATURE,
+     OUTPUT_NAME,
+     PARAMETER,
+     SERIALIZATION_VERSION,
+     SOURCE,
+     SOURCE_DATA_COLUMN_INFO,
+     TABLE_NAME,
+     TOPOLOGICAL_ORDERING,
+     TRAINING_DATA,
+     UDF_NAME,
+ )
+ from wedata.feature_store.entities.feature_table_info import FeatureTableInfo
+ from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+ from wedata.feature_store.utils import common_utils
+
+ # Change log for serialization version. Please update for each serialization version.
+ # 1. Initial.
+ # 2. (2021/06/16): Record feature_store_client_version to help us make backward-compatible changes in the future.
+ # 3. (2021/08/25): Record table_id to keep feature table lineage stable if tables are deleted.
+ # 4. (2021/09/25): Record timestamp_lookup_key to handle point-in-time lookups.
+ # 5. (2021/02/15): Record the include flag for column info if False.
+ #    Record input functions as FunctionInfo and function computation as OnDemandColumnInfo.
+ #    Remove redundant fields: table_name from table_infos, output_name from column_infos.
+ # 6. (2023/04/21): Record lookback_window in table info for point-in-time lookups.
+ # 7. (2023/05/05): Record the Spark data type for all columns to track model signatures.
+ # 8. (2023/08/14): Record the topological_ordering for all columns to support chained transforms and lookups.
+ # 9. (2023/09/11): Change the type of lookback_window from int to double to support sub-second values.
+
+
+ class FeatureSpec:
+
+     FEATURE_ARTIFACT_FILE = "feature_spec.yaml"
+     SERIALIZATION_VERSION_NUMBER = 9
+
+     def __init__(
+         self,
+         column_infos: List[ColumnInfo],
+         table_infos: List[FeatureTableInfo],
+         function_infos: List[FunctionInfo],
+         workspace_id: Optional[int] = None,
+         feature_store_client_version: Optional[str] = None,
+         serialization_version: Optional[int] = None,
+     ):
+         self._column_infos = column_infos
+         self._table_infos = table_infos
+         self._function_infos = function_infos
+         self._workspace_id = workspace_id
+         if self._workspace_id is None:
+             self._workspace_id = 0
+         # The Feature Store Python client version which wrote this FeatureSpec.
+         # If empty, the client version is <=0.3.1.
+         self._feature_store_client_version = feature_store_client_version
+         if self._feature_store_client_version is None:
+             try:
+                 self._feature_store_client_version = importlib.metadata.version("tencent_wedata_feature_engineering")
+             except importlib.metadata.PackageNotFoundError:
+                 self._feature_store_client_version = "unknown"  # or another default value
+         self._serialization_version = serialization_version
+
+         # Perform validations
+         self._validate_column_infos()
+         self._validate_table_infos()
+         self._validate_function_infos()
+
+     def _validate_column_infos(self):
+         if not self.column_infos:
+             raise ValueError("column_infos must be non-empty.")
+
+         for column_info in self.column_infos:
+             if not isinstance(column_info, ColumnInfo):
+                 raise ValueError(
+                     f"Expected all elements of column_infos to be instances of ColumnInfo. "
+                     f"'{column_info}' is of the wrong type."
+                 )
+             if (
+                 self._serialization_version is not None
+                 and self._serialization_version >= 8
+                 and column_info.topological_ordering is not None
+             ):
+                 ordering = column_info.topological_ordering
+                 if not isinstance(ordering, int) or ordering < 0:
+                     raise ValueError(
+                         "The topological_ordering of column_info must be a non-negative integer."
+                     )
+
+     def _validate_table_infos(self):
+         if self.table_infos is None:
+             raise ValueError("Internal Error: table_infos must be provided.")
+
+         # table_infos should not be duplicated
+         common_utils.validate_strings_unique(
+             [table_info.table_name for table_info in self.table_infos],
+             "Internal Error: Expect all table_names in table_infos to be unique. Found duplicates {}",
+         )
+
+         # Starting with FeatureSpec v3, unique table names in table_infos must match those in column_infos.
+         if self.serialization_version is not None and self.serialization_version >= 3:
+             unique_table_names = set(
+                 [table_info.table_name for table_info in self.table_infos]
+             )
+             unique_column_table_names = set(
+                 [fci.table_name for fci in self.feature_column_infos]
+             )
+             if unique_table_names != unique_column_table_names:
+                 raise Exception(
+                     f"Internal Error: table_names from table_infos {sorted(unique_table_names)} "
+                     f"must match those from column_infos {sorted(unique_column_table_names)}"
+                 )
+
+     def _validate_function_infos(self):
+         if self.function_infos is None:
+             raise ValueError("Internal Error: function_infos must be provided.")
+
+         # function_infos should not be duplicated
+         common_utils.validate_strings_unique(
+             [function_info.full_name for function_info in self.function_infos],
+             "Internal Error: Expect all udf_names in function_infos to be unique. Found duplicates {}",
+         )
+
+         # Unique UDF names in function_infos must match those in column_infos.
+         # No version check is required as both fields were added simultaneously in FeatureSpec v5.
+         unique_udf_names = set(
+             [function_info.full_name for function_info in self.function_infos]
+         )
+         unique_column_udf_names = set(
+             [odci.udf_name for odci in self.on_demand_column_infos]
+         )
+         if unique_udf_names != unique_column_udf_names:
+             raise Exception(
+                 f"Internal Error: udf_names from function_infos {sorted(unique_udf_names)} "
+                 f"must match those from column_infos {sorted(unique_column_udf_names)}"
+             )
+
+     @property
+     def column_infos(self):
+         return self._column_infos
+
+     @property
+     def table_infos(self):
+         return self._table_infos
+
+     @property
+     def function_infos(self):
+         return self._function_infos
+
+     @property
+     def workspace_id(self):
+         return self._workspace_id
+
+     @property
+     def source_data_column_infos(self) -> List[SourceDataColumnInfo]:
+         return self._get_infos_of_type(SourceDataColumnInfo)
+
+     @property
+     def feature_column_infos(self) -> List[FeatureColumnInfo]:
+         return self._get_infos_of_type(FeatureColumnInfo)
+
+     @property
+     def on_demand_column_infos(self) -> List[OnDemandColumnInfo]:
+         return self._get_infos_of_type(OnDemandColumnInfo)
+
+     @property
+     def serialization_version(self) -> int:
+         return self._serialization_version
+
+     def _get_infos_of_type(
+         self,
+         info_type: Union[
+             Type[SourceDataColumnInfo],
+             Type[FeatureColumnInfo],
+             Type[OnDemandColumnInfo],
+         ],
+     ):
+         """
+         Helper method to return the ColumnInfo.info subinfo field based on its type.
+         """
+         return [
+             column_info.info
+             for column_info in self.column_infos
+             if isinstance(column_info.info, info_type)
+         ]
+
+     @classmethod
+     def from_proto(cls, feature_spec_proto):
+         column_infos = [
+             ColumnInfo.from_proto(column_info_proto)
+             for column_info_proto in feature_spec_proto.input_columns
+         ]
+         table_infos = [
+             FeatureTableInfo.from_proto(table_info_proto)
+             for table_info_proto in feature_spec_proto.input_tables
+         ]
+         # UDF functions are not supported in this release.
+         function_infos = [
+             FunctionInfo.from_proto(function_info_proto)
+             for function_info_proto in feature_spec_proto.input_functions
+         ]
+         return cls(
+             column_infos=column_infos,
+             table_infos=table_infos,
+             function_infos=function_infos,
+             workspace_id=feature_spec_proto.workspace_id,
+             feature_store_client_version=feature_spec_proto.feature_store_client_version,
+             serialization_version=feature_spec_proto.serialization_version,
+         )
+
+     @staticmethod
+     def _input_columns_proto_to_yaml_dict(column_info: Dict[str, Any]):
+         """
+         Converts a single ColumnInfo's proto dict to the expected element in FeatureSpec YAML's input_columns.
+         To keep the YAML clean, redundant fields are removed (e.g. the SourceDataColumnInfo.name field).
+
+         Example of a column_info transformation. Note that the "name" attribute becomes the YAML key.
+         {"source_data_column_info": {"name": "source_column"}, "include": True} -> {"source_column": {"source": "training_data", "include": True}}
+
+         Order of elements in the YAML dict should be:
+         1. Attributes present in ColumnInfo.info, using the proto field order
+         2. Remaining attributes of ColumnInfo, using the proto field order
+         3. Feature Store source type
+         """
+         # Parse oneof field ColumnInfo.info level attributes as column_info_attributes; record column_name, source
+         if SOURCE_DATA_COLUMN_INFO in column_info:
+             column_info_attributes = column_info[SOURCE_DATA_COLUMN_INFO]
+             # pop the NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+             column_name, source = column_info_attributes.pop(NAME), TRAINING_DATA
+         elif FEATURE_COLUMN_INFO in column_info:
+             column_info_attributes = column_info[FEATURE_COLUMN_INFO]
+             # pop the OUTPUT_NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+             column_name, source = column_info_attributes.pop(OUTPUT_NAME), FEATURE_STORE
+         elif ON_DEMAND_COLUMN_INFO in column_info:
+             column_info_attributes = column_info[ON_DEMAND_COLUMN_INFO]
+             # Map the InputBindings message dictionary to a {parameter: bound_to} KV dictionary if defined
+             if INPUT_BINDINGS in column_info_attributes:
+                 column_info_attributes[INPUT_BINDINGS] = {
+                     ib[PARAMETER]: ib[BOUND_TO]
+                     for ib in column_info_attributes[INPUT_BINDINGS]
+                 }
+             # pop the OUTPUT_NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+             column_name, source = (
+                 column_info_attributes.pop(OUTPUT_NAME),
+                 ON_DEMAND_FEATURE,
+             )
+         else:
+             raise ValueError(
+                 f"Expected column_info to be keyed by a valid ColumnInfo.info type. "
+                 f"'{column_info}' has key '{list(column_info)[0]}'."
+             )
+
+         # Parse and insert ColumnInfo level attributes
+         # Note: the ordering of fields in the resulting YAML file is undefined but in practice they appear
+         # in the same order as they are added to the column_info_attributes dict.
+
+         # DATA_TYPE is supported starting FeatureSpec v7 and is not guaranteed to exist.
+         if DATA_TYPE in column_info:
+             column_info_attributes[DATA_TYPE] = column_info[DATA_TYPE]
+         if not column_info.get(INCLUDE, False):
+             column_info_attributes[INCLUDE] = False
+         else:
+             column_info_attributes[INCLUDE] = True
+         # TOPOLOGICAL_ORDERING is supported starting FeatureSpec v8.
+         if TOPOLOGICAL_ORDERING in column_info:
+             column_info_attributes[TOPOLOGICAL_ORDERING] = column_info[
+                 TOPOLOGICAL_ORDERING
+             ]
+
+         # Insert source; return YAML keyed by column_name
+         column_info_attributes[SOURCE] = source
+         return {column_name: column_info_attributes}
+
+     def _to_dict(self):
+         """
+         Convert FeatureSpec to a writeable YAML artifact. Uses MessageToDict to convert the FeatureSpec proto to a dict.
+         Sanitizes and modifies the dict as follows:
+         1. Removes redundant or unnecessary information for cleanliness in the YAML
+         2. Modifies the dict to be of the format {column_name: column_attributes_dict}
+
+         :return: Sanitized FeatureSpec dictionary of {column_name: column_attributes}
+         """
+         yaml_dict = MessageToDict(self.to_proto(), preserving_proto_field_name=True)
+         yaml_dict[INPUT_COLUMNS] = [
+             self._input_columns_proto_to_yaml_dict(column_info)
+             for column_info in yaml_dict[INPUT_COLUMNS]
+         ]
+
+         if INPUT_TABLES in yaml_dict:
+             # pop the TABLE_NAME attribute and use it as the YAML key for each table_info to avoid redundancy in the YAML
+             yaml_dict[INPUT_TABLES] = [
+                 {table_info.pop(TABLE_NAME): table_info}
+                 for table_info in yaml_dict[INPUT_TABLES]
+             ]
+         if INPUT_FUNCTIONS in yaml_dict:
+             # pop the UDF_NAME attribute and use it as the YAML key for each function_info to avoid redundancy in the YAML
+             yaml_dict[INPUT_FUNCTIONS] = [
+                 {function_info.pop(UDF_NAME): function_info}
+                 for function_info in yaml_dict[INPUT_FUNCTIONS]
+             ]
+
+         # For readability, place SERIALIZATION_VERSION last in the dictionary.
+         yaml_dict[SERIALIZATION_VERSION] = yaml_dict.pop(SERIALIZATION_VERSION)
+         return yaml_dict
+
+     def save(self, path: str):
+         """
+         Convert the spec to a YAML artifact and store it at the given `path` location.
+
+         :param path: Root path to where the YAML artifact is expected to be stored.
+         :return: None
+         """
+         write_yaml(
+             root=path,
+             file_name=self.FEATURE_ARTIFACT_FILE,
+             data=self._to_dict(),
+             sort_keys=False,
+         )
+
+     @staticmethod
+     def _input_columns_yaml_to_proto_dict(column_info: Dict[str, Any]):
+         """
+         Convert the FeatureSpec YAML dictionary to the expected ColumnInfo proto dictionary.
+
+         Example of a column_info transformation.
+         {"source_column": {"source": "training_data"}} -> {"source_data_column_info": {"name": "source_column"}}
+         """
+         if len(column_info) != 1:
+             raise ValueError(
+                 f"Expected column_info dictionary to only have one key, value pair. "
+                 f"'{column_info}' has length {len(column_info)}."
+             )
+         column_name, column_data = list(column_info.items())[0]
+         if not column_data:
+             raise ValueError(
+                 f"Expected values of '{column_name}' dictionary to be non-empty."
+             )
+         if SOURCE not in column_data:
+             raise ValueError(
+                 f"Expected values of column_info dictionary to include the source. No source found "
+                 f"for '{column_name}'."
+             )
+
+         # Parse oneof field ColumnInfo.info level attributes
+         source = column_data.pop(SOURCE)
+         if source == TRAINING_DATA:
+             column_data[NAME] = column_name
+             column_info_dict = {SOURCE_DATA_COLUMN_INFO: column_data}
+         elif source == FEATURE_STORE:
+             column_data[OUTPUT_NAME] = column_name
+             column_info_dict = {FEATURE_COLUMN_INFO: column_data}
+         elif source == ON_DEMAND_FEATURE:
+             column_data[OUTPUT_NAME] = column_name
+             # Map the {parameter_val: bound_to_val} dictionary to InputBindings(parameter, bound_to) message dictionaries.
+             column_data[INPUT_BINDINGS] = [
+                 {PARAMETER: parameter, BOUND_TO: bound_to}
+                 for parameter, bound_to in column_data.get(INPUT_BINDINGS, {}).items()
+             ]
+             column_info_dict = {ON_DEMAND_COLUMN_INFO: column_data}
+         else:
+             raise ValueError(
+                 f"Internal Error: Expected column_info to have source matching oneof ColumnInfo.info. "
+                 f"'{column_info}' has source of '{source}'."
+             )
+
+         # Parse ColumnInfo level attributes
+         # TOPOLOGICAL_ORDERING is supported starting FeatureSpec v8.
+         if TOPOLOGICAL_ORDERING in column_data:
+             column_info_dict[TOPOLOGICAL_ORDERING] = column_data.pop(
+                 TOPOLOGICAL_ORDERING
+             )
+         # DATA_TYPE is supported starting FeatureSpec v7 and is not guaranteed to exist.
+         if DATA_TYPE in column_data:
+             column_info_dict[DATA_TYPE] = column_data.pop(DATA_TYPE)
+         # INCLUDE is supported starting FeatureSpec v5.
+         if INCLUDE in column_data:
+             column_info_dict[INCLUDE] = column_data.pop(INCLUDE)
+         return column_info_dict
+
+     @classmethod
+     def _from_dict(cls, spec_dict):
+         """
+         Convert the YAML artifact to a FeatureSpec. Transforms the YAML artifact to a dict keyed by
+         source_data_column_info or feature_column_info, such that ParseDict can convert the dict to
+         a proto message, and from_proto can convert the proto message to a FeatureSpec object.
+
+         :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+         """
+         if INPUT_COLUMNS not in spec_dict:
+             raise ValueError(
+                 f"{INPUT_COLUMNS} must be a key in {cls.FEATURE_ARTIFACT_FILE}."
+             )
+         if not spec_dict[INPUT_COLUMNS]:
+             raise ValueError(
+                 f"{INPUT_COLUMNS} in {cls.FEATURE_ARTIFACT_FILE} must be non-empty."
+             )
+         spec_dict[INPUT_COLUMNS] = [
+             cls._input_columns_yaml_to_proto_dict(column_info)
+             for column_info in spec_dict[INPUT_COLUMNS]
+         ]
+
+         # feature_spec.yaml does not include input_tables or input_functions if either of the following is true:
+         # 1. The YAML was written by an older client that does not support the functionality.
+         # 2. The FeatureSpec does not contain FeatureLookups (input_tables) or FeatureFunctions (input_functions).
+         input_tables = []
+         for input_table in spec_dict.get(INPUT_TABLES, []):
+             table_name, attributes = list(input_table.items())[0]
+             input_tables.append({TABLE_NAME: table_name, **attributes})
+         spec_dict[INPUT_TABLES] = input_tables
+
+         input_functions = []
+         for input_function in spec_dict.get(INPUT_FUNCTIONS, []):
+             udf_name, attributes = list(input_function.items())[0]
+             input_functions.append({UDF_NAME: udf_name, **attributes})
+         spec_dict[INPUT_FUNCTIONS] = input_functions
+
+         return cls.from_proto(
+             ParseDict(spec_dict, feature_store_pb2.FeatureSpec(), ignore_unknown_fields=True)
+         )
+
+     @classmethod
+     def _read_file(cls, path: str):
+         """
+         Read the YAML artifact from a file path.
+         """
+         parent_dir, file = os.path.split(path)
+         spec_dict = read_yaml(parent_dir, file)
+         return cls._from_dict(spec_dict)
+
+     @classmethod
+     def load(cls, path: str):
+         """
+         Load the FeatureSpec YAML artifact from the provided root directory (at path/feature_spec.yaml).
+
+         :param path: Root path to the YAML artifact. This can be an MLflow artifact path or a file path.
+         :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+         """
+         # Create the full file path to the FeatureSpec.
+         path = os.path.join(path, cls.FEATURE_ARTIFACT_FILE)
+
+         if common_utils.is_artifact_uri(path):
+             with TempDir() as tmp_location:
+                 # Returns a file and not a directory, since the artifact_uri is a single file.
+                 local_path = mlflow.artifacts.download_artifacts(
+                     artifact_uri=path, dst_path=tmp_location.path()
+                 )
+                 return FeatureSpec._read_file(local_path)
+         else:
+             return FeatureSpec._read_file(path)
+
+     def to_proto(self):
+         proto_feature_spec = feature_store_pb2.FeatureSpec()
+         for column_info in self.column_infos:
+             proto_feature_spec.input_columns.append(column_info.to_proto())
+         for table_info in self.table_infos:
+             proto_feature_spec.input_tables.append(table_info.to_proto())
+         for function_info in self.function_infos:
+             proto_feature_spec.input_functions.append(function_info.to_proto())
+         proto_feature_spec.serialization_version = self.serialization_version
+         proto_feature_spec.workspace_id = self.workspace_id
+         proto_feature_spec.feature_store_client_version = (
+             self._feature_store_client_version
+         )
+         return proto_feature_spec
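
To make the save/load round trip above concrete, here is a short sketch. It assumes a FeatureSpec instance `spec` built elsewhere (e.g. by the training-set utilities); the directory path is hypothetical:

    from wedata.feature_store.entities.feature_spec import FeatureSpec

    artifact_dir = "/tmp/feature_spec_demo"  # hypothetical location

    # save() serializes the spec through to_proto()/_to_dict() and writes
    # <artifact_dir>/feature_spec.yaml, with serialization_version placed last.
    spec.save(artifact_dir)

    # load() accepts a local directory or an MLflow artifact URI and reverses
    # the transformation via _from_dict()/from_proto().
    reloaded = FeatureSpec.load(artifact_dir)

    for fci in reloaded.feature_column_infos:
        print(fci.table_name)

In the YAML itself, each input column is keyed by its name, as in the docstring example: {"source_column": {"source": "training_data"}}.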