wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wedata/__init__.py +6 -0
- wedata/feature_store/__init__.py +0 -0
- wedata/feature_store/client.py +169 -0
- wedata/feature_store/constants/__init__.py +0 -0
- wedata/feature_store/constants/constants.py +28 -0
- wedata/feature_store/entities/__init__.py +0 -0
- wedata/feature_store/entities/column_info.py +117 -0
- wedata/feature_store/entities/data_type.py +92 -0
- wedata/feature_store/entities/environment_variables.py +55 -0
- wedata/feature_store/entities/feature.py +53 -0
- wedata/feature_store/entities/feature_column_info.py +64 -0
- wedata/feature_store/entities/feature_function.py +55 -0
- wedata/feature_store/entities/feature_lookup.py +179 -0
- wedata/feature_store/entities/feature_spec.py +454 -0
- wedata/feature_store/entities/feature_spec_constants.py +25 -0
- wedata/feature_store/entities/feature_table.py +164 -0
- wedata/feature_store/entities/feature_table_info.py +40 -0
- wedata/feature_store/entities/function_info.py +184 -0
- wedata/feature_store/entities/on_demand_column_info.py +44 -0
- wedata/feature_store/entities/source_data_column_info.py +21 -0
- wedata/feature_store/entities/training_set.py +134 -0
- wedata/feature_store/feature_table_client/__init__.py +0 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +313 -0
- wedata/feature_store/spark_client/__init__.py +0 -0
- wedata/feature_store/spark_client/spark_client.py +286 -0
- wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata/feature_store/training_set_client/training_set_client.py +196 -0
- wedata/feature_store/utils/__init__.py +0 -0
- wedata/feature_store/utils/common_utils.py +96 -0
- wedata/feature_store/utils/feature_lookup_utils.py +570 -0
- wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata/feature_store/utils/feature_utils.py +73 -0
- wedata/feature_store/utils/schema_utils.py +117 -0
- wedata/feature_store/utils/topological_sort.py +158 -0
- wedata/feature_store/utils/training_set_utils.py +580 -0
- wedata/feature_store/utils/uc_utils.py +281 -0
- wedata/feature_store/utils/utils.py +252 -0
- wedata/feature_store/utils/validation_utils.py +55 -0
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.5.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.5.dist-info/RECORD +79 -0
- wedata_feature_engineering-0.1.5.dist-info/top_level.txt +1 -0
- wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
- wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.5.dist-info}/WHEEL +0 -0
wedata/feature_store/entities/feature_lookup.py
@@ -0,0 +1,179 @@
+import copy
+import datetime
+import logging
+from typing import Dict, List, Optional, Union
+
+from feature_store.utils import common_utils
+
+_logger = logging.getLogger(__name__)
+
+
+class FeatureLookup:
+
+    """
+    A feature lookup.
+
+    A FeatureLookup specifies features in a feature table and associates them
+    with the features of a training set.
+
+    A FeatureLookup has the following attributes:
+
+    - table_name: The name of the feature table.
+    - lookup_key: The key used to join the feature table with the training set.
+      lookup_key must be a column in the training set. The type and order of
+      lookup_key must match the primary keys of the feature table.
+    - feature_names: The names of the features to look up in the feature table. If your
+      model requires primary keys as features, declare them as independent FeatureLookups.
+    - rename_outputs: If provided, renames features in the
+      :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by
+      :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+    - timestamp_lookup_key: The timestamp key used to join the feature table with the
+      training set. timestamp_lookup_key must be a column in the training set. The type of
+      timestamp_lookup_key must match the type of the feature table's timestamp key.
+    - lookback_window: The lookback window to use when performing point-in-time lookups
+      against the feature table, relative to the DataFrame passed to
+      :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+      The feature store retrieves the latest feature value prior to the timestamp
+      specified in the DataFrame's ``timestamp_lookup_key`` and within the
+      ``lookback_window``, or null if no such feature value exists. When set to 0, only
+      exact matches from the feature table are returned.
+    - feature_name: Feature name. **Deprecated**. Use `feature_names`.
+    - output_name: If provided, renames this feature in the
+      :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by
+      :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+      **Deprecated**. Use `rename_outputs`.
+
+    Example:
+
+        from databricks.feature_store import FeatureLookup
+
+        lookup = FeatureLookup(
+            table_name="my_feature_table",
+            lookup_key="my_lookup_key",
+            feature_names=["my_feature_1", "my_feature_2"],
+            rename_outputs={"my_feature_1": "my_feature_1_renamed"},
+            timestamp_lookup_key="my_timestamp_lookup_key",
+            lookback_window=datetime.timedelta(days=1)
+        )
+    """
+
+    def __init__(
+        self,
+        table_name: str,
+        lookup_key: Union[str, List[str]],
+        *,
+        feature_names: Union[str, List[str], None] = None,
+        rename_outputs: Optional[Dict[str, str]] = None,
+        timestamp_lookup_key: Optional[str] = None,
+        lookback_window: Optional[datetime.timedelta] = None,
+        **kwargs,
+    ):
+        """Initialize a FeatureLookup object. See class documentation."""
+
+        self._feature_name_deprecated = kwargs.pop("feature_name", None)
+        self._output_name_deprecated = kwargs.pop("output_name", None)
+
+        if kwargs:
+            raise TypeError(
+                f"FeatureLookup got unexpected keyword argument(s): {list(kwargs.keys())}"
+            )
+
+        self._table_name = table_name
+
+        if type(timestamp_lookup_key) is list:
+            if len(timestamp_lookup_key) == 0:
+                timestamp_lookup_key = None
+            elif len(timestamp_lookup_key) == 1:
+                timestamp_lookup_key = timestamp_lookup_key[0]
+            else:
+                raise ValueError(
+                    "Setting multiple timestamp lookup keys is not supported."
+                )
+
+        if rename_outputs is not None and not isinstance(rename_outputs, dict):
+            raise ValueError(
+                f"Unexpected type for rename_outputs: {type(rename_outputs)}"
+            )
+
+        self._feature_names = common_utils.as_list(feature_names, default=[])
+
+        # Make sure the user didn't accidentally pass in any nested lists/dicts in feature_names
+        for fn in self._feature_names:
+            if not isinstance(fn, str):
+                raise ValueError(
+                    f"Unexpected type for element in feature_names: {type(fn)}, "
+                    f"only strings are allowed in the list"
+                )
+
+        if lookback_window is not None:
+            if not timestamp_lookup_key:
+                raise ValueError(
+                    f"Unexpected lookback_window value: {lookback_window}, lookback windows can only be "
+                    f"applied on time series feature tables. Use timestamp_lookup_key to perform "
+                    f"point-in-time lookups with a lookback window."
+                )
+            if not isinstance(
+                lookback_window, datetime.timedelta
+            ) or lookback_window < datetime.timedelta(0):
+                raise ValueError(
+                    f"Unexpected value for lookback_window: {lookback_window}, "
+                    f"only non-negative datetime.timedelta allowed."
+                )
+
+        self._lookup_key = copy.copy(lookup_key)
+        self._timestamp_lookup_key = copy.copy(timestamp_lookup_key)
+        self._lookback_window = copy.copy(lookback_window)
+
+        self._rename_outputs = {}
+        if rename_outputs is not None:
+            self._rename_outputs = rename_outputs.copy()
+
+        self._inject_deprecated_feature_name()
+        self._inject_deprecated_output_name()
+
+    @property
+    def table_name(self):
+        """The table name to use in this FeatureLookup."""
+        return self._table_name
+
+    @property
+    def lookup_key(self):
+        """The lookup key(s) to use in this FeatureLookup."""
+        return self._lookup_key
+
+    @property
+    def feature_name(self):
+        """The feature name to use in this FeatureLookup. **Deprecated**. Use `feature_names`."""
+        return self._feature_name_deprecated
+
+    @property
+    def feature_names(self):
+        """The feature names to use in this FeatureLookup."""
+        return self._feature_names
+
+    @property
+    def output_name(self):
+        """The output name to use in this FeatureLookup. **Deprecated**. Use `rename_outputs`."""
+        if self._output_name_deprecated:
+            return self._output_name_deprecated
+        else:
+            return self._feature_name_deprecated
+
+    @property
+    def timestamp_lookup_key(self):
+        """The timestamp lookup key to use in this FeatureLookup."""
+        return self._timestamp_lookup_key
+
+    @property
+    def lookback_window(self):
+        """A lookback window applied only for point-in-time lookups."""
+        return self._lookback_window
+
+    def _get_feature_names(self):
+        return self._feature_names
+
+    def _get_output_name(self, feature_name):
+        """Look up the renamed output, or fall back to the feature name itself if no mapping is present."""
+        return self._rename_outputs.get(feature_name, feature_name)
+
+    def _inject_deprecated_feature_name(self):
+        if self._feature_name_deprecated:
+            if len(self._feature_names) > 0:
+                raise ValueError(
+                    "Use either the feature_names or the feature_name parameter, but not both."
+                )
+            _logger.warning(
+                'The feature_name parameter is deprecated. Use "feature_names".'
+            )
+            self._feature_names = [self._feature_name_deprecated]
+
+    def _inject_deprecated_output_name(self):
+        if len(self._feature_names) == 1 and self._output_name_deprecated:
+            if len(self._rename_outputs) > 0:
+                raise ValueError(
+                    "Use either the output_name or the rename_outputs parameter, but not both."
+                )
+            _logger.warning(
+                'The output_name parameter is deprecated. Use "rename_outputs".'
+            )
+            self._rename_outputs[self._feature_names[0]] = self._output_name_deprecated
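For reference, a minimal sketch of how this constructor behaves, assuming the wheel's `wedata.feature_store.entities.feature_lookup` module is importable and that `common_utils.as_list` wraps a scalar (or None) into a list as its usage above suggests; the table and column names are hypothetical:

    import datetime

    from wedata.feature_store.entities.feature_lookup import FeatureLookup

    # Point-in-time lookup: lookback_window is only valid together with
    # timestamp_lookup_key; otherwise __init__ raises ValueError.
    lookup = FeatureLookup(
        table_name="fs.user_features",  # hypothetical table
        lookup_key=["user_id"],
        feature_names=["age", "ltv"],
        rename_outputs={"ltv": "lifetime_value"},
        timestamp_lookup_key="event_ts",
        lookback_window=datetime.timedelta(hours=6),
    )
    assert lookup._get_output_name("ltv") == "lifetime_value"  # renamed output
    assert lookup._get_output_name("age") == "age"             # falls back to itself

    # Deprecated spellings are accepted via **kwargs and injected with a warning:
    # feature_name becomes feature_names, output_name becomes rename_outputs.
    legacy = FeatureLookup(
        table_name="fs.user_features",
        lookup_key="user_id",
        feature_name="age",      # deprecated; becomes feature_names=["age"]
        output_name="user_age",  # deprecated; becomes rename_outputs={"age": "user_age"}
    )
    assert legacy.feature_names == ["age"]
    assert legacy._get_output_name("age") == "user_age"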
wedata/feature_store/entities/feature_spec.py
@@ -0,0 +1,454 @@
+import os
+from typing import Any, Dict, List, Optional, Type, Union
+
+import mlflow
+from databricks.sdk.service.catalog import FunctionInfo
+from google.protobuf.json_format import MessageToDict, ParseDict
+from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
+
+from feature_store.entities.column_info import ColumnInfo
+from feature_store.entities.feature_column_info import FeatureColumnInfo
+from feature_store.entities.feature_spec_constants import (
+    BOUND_TO,
+    DATA_TYPE,
+    FEATURE_COLUMN_INFO,
+    FEATURE_STORE,
+    INCLUDE,
+    INPUT_BINDINGS,
+    INPUT_COLUMNS,
+    INPUT_FUNCTIONS,
+    INPUT_TABLES,
+    NAME,
+    ON_DEMAND_COLUMN_INFO,
+    ON_DEMAND_FEATURE,
+    OUTPUT_NAME,
+    PARAMETER,
+    SERIALIZATION_VERSION,
+    SOURCE,
+    SOURCE_DATA_COLUMN_INFO,
+    TABLE_NAME,
+    TOPOLOGICAL_ORDERING,
+    TRAINING_DATA,
+    UDF_NAME,
+)
+from feature_store.entities.feature_table_info import FeatureTableInfo
+from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+from feature_store.utils import common_utils
+
+# Change log for the serialization version. Please update for each new serialization version.
+# 1. Initial.
+# 2. (2021/06/16): Record feature_store_client_version to help us make backward-compatible changes in the future.
+# 3. (2021/08/25): Record table_id to keep feature table lineage stable if tables are deleted.
+# 4. (2021/09/25): Record timestamp_lookup_key to handle point-in-time lookups.
+# 5. (2022/02/15): Record the include flag for column info if False.
+#    Record input functions as FunctionInfo and function computation as OnDemandColumnInfo.
+#    Remove redundant fields: table_name from table_infos, output_name from column_infos.
+# 6. (2023/04/21): Record lookback_window in table info for point-in-time lookups.
+# 7. (2023/05/05): Record the Spark data type for all columns to track model signatures.
+# 8. (2023/08/14): Record the topological_ordering for all columns to support chained transform and lookup.
+# 9. (2023/09/11): Change the type of lookback_window from int to double for sub-second values.
+
+
+class FeatureSpec:
+
+    FEATURE_ARTIFACT_FILE = "feature_spec.yaml"
+    SERIALIZATION_VERSION_NUMBER = 9
+
+    def __init__(
+        self,
+        column_infos: List[ColumnInfo],
+        table_infos: List[FeatureTableInfo],
+        function_infos: List[FunctionInfo],
+        workspace_id: Optional[int] = None,
+        feature_store_client_version: Optional[str] = None,
+        serialization_version: Optional[int] = None,
+    ):
+        self._column_infos = column_infos
+        self._table_infos = table_infos
+        self._function_infos = function_infos
+        self._workspace_id = workspace_id
+        # The Feature Store Python client version which wrote this FeatureSpec.
+        # If empty, the client version is <= 0.3.1.
+        self._feature_store_client_version = feature_store_client_version
+        self._serialization_version = serialization_version
+
+        # Perform validations
+        self._validate_column_infos()
+        self._validate_table_infos()
+        self._validate_function_infos()
+
+    def _validate_column_infos(self):
+        if not self.column_infos:
+            raise ValueError("column_infos must be non-empty.")
+
+        for column_info in self.column_infos:
+            if not isinstance(column_info, ColumnInfo):
+                raise ValueError(
+                    f"Expected all elements of column_infos to be instances of ColumnInfo. "
+                    f"'{column_info}' is of the wrong type."
+                )
+            if (
+                self._serialization_version >= 8
+                and column_info.topological_ordering is not None
+            ):
+                ordering = column_info.topological_ordering
+                if not isinstance(ordering, int) or ordering < 0:
+                    raise ValueError(
+                        "The topological_ordering of column_info must be a non-negative integer."
+                    )
+
+    def _validate_table_infos(self):
+        if self.table_infos is None:
+            raise ValueError("Internal Error: table_infos must be provided.")
+
+        # table_infos should not contain duplicates
+        common_utils.validate_strings_unique(
+            [table_info.table_name for table_info in self.table_infos],
+            "Internal Error: Expect all table_names in table_infos to be unique. Found duplicates {}",
+        )
+
+        # Starting with FeatureSpec v3, unique table names in table_infos must match those in column_infos.
+        if self.serialization_version >= 3:
+            unique_table_names = set(
+                [table_info.table_name for table_info in self.table_infos]
+            )
+            unique_column_table_names = set(
+                [fci.table_name for fci in self.feature_column_infos]
+            )
+            if unique_table_names != unique_column_table_names:
+                raise Exception(
+                    f"Internal Error: table_names from table_infos {sorted(unique_table_names)} "
+                    f"must match those from column_infos {sorted(unique_column_table_names)}"
+                )
+
+    def _validate_function_infos(self):
+        if self.function_infos is None:
+            raise ValueError("Internal Error: function_infos must be provided.")
+
+        # function_infos should not contain duplicates
+        common_utils.validate_strings_unique(
+            [function_info.udf_name for function_info in self.function_infos],
+            "Internal Error: Expect all udf_names in function_infos to be unique. Found duplicates {}",
+        )
+
+        # Unique UDF names in function_infos must match those in column_infos.
+        # No version check is required as both fields were added simultaneously in FeatureSpec v5.
+        unique_udf_names = set(
+            [function_info.udf_name for function_info in self.function_infos]
+        )
+        unique_column_udf_names = set(
+            [odci.udf_name for odci in self.on_demand_column_infos]
+        )
+        if unique_udf_names != unique_column_udf_names:
+            raise Exception(
+                f"Internal Error: udf_names from function_infos {sorted(unique_udf_names)} "
+                f"must match those from column_infos {sorted(unique_column_udf_names)}"
+            )
+
+    @property
+    def column_infos(self):
+        return self._column_infos
+
+    @property
+    def table_infos(self):
+        return self._table_infos
+
+    @property
+    def function_infos(self):
+        return self._function_infos
+
+    @property
+    def workspace_id(self):
+        return self._workspace_id
+
+    @property
+    def feature_column_infos(self) -> List[FeatureColumnInfo]:
+        return self._get_infos_of_type(FeatureColumnInfo)
+
+    @property
+    def on_demand_column_infos(self) -> List[OnDemandColumnInfo]:
+        return self._get_infos_of_type(OnDemandColumnInfo)
+
+    @property
+    def serialization_version(self) -> int:
+        return self._serialization_version
+
+    def _get_infos_of_type(
+        self,
+        info_type: Union[
+            Type[SourceDataColumnInfo],
+            Type[FeatureColumnInfo],
+            Type[OnDemandColumnInfo],
+        ],
+    ):
+        """
+        Helper method to return the ColumnInfo.info subinfo field based on its type.
+        """
+        return [
+            column_info.info
+            for column_info in self.column_infos
+            if isinstance(column_info.info, info_type)
+        ]
+
+    @classmethod
+    def from_proto(cls, feature_spec_proto):
+        # Serialization version is not deserialized from the proto as there is currently only one
+        # possible version.
+        column_infos = [
+            ColumnInfo.from_proto(column_info_proto)
+            for column_info_proto in feature_spec_proto.input_columns
+        ]
+        table_infos = [
+            FeatureTableInfo.from_proto(table_info_proto)
+            for table_info_proto in feature_spec_proto.input_tables
+        ]
+        function_infos = [
+            FunctionInfo.from_proto(function_info_proto)
+            for function_info_proto in feature_spec_proto.input_functions
+        ]
+        return cls(
+            column_infos=column_infos,
+            table_infos=table_infos,
+            function_infos=function_infos,
+            workspace_id=feature_spec_proto.workspace_id,
+            feature_store_client_version=feature_spec_proto.feature_store_client_version,
+            serialization_version=feature_spec_proto.serialization_version,
+        )
+
+    @staticmethod
+    def _input_columns_proto_to_yaml_dict(column_info: Dict[str, Any]):
+        """
+        Converts a single ColumnInfo's proto dict to the expected element in the FeatureSpec YAML's input_columns.
+        To keep the YAML clean, unnecessary fields are removed (e.g. the SourceDataColumnInfo.name field, and
+        ColumnInfo.include when True).
+
+        Example of a column_info transformation. Note that the "name" and "include" attributes are excluded.
+        {"source_data_column_info": {"name": "source_column"}, "include": True} -> {"source_column": {"source": "training_data"}}
+
+        Order of elements in the YAML dict should be:
+        1. Attributes present in ColumnInfo.info, using the proto field order
+        2. Remaining attributes of ColumnInfo, using the proto field order
+        3. Feature Store source type
+        """
+        # Parse oneof field ColumnInfo.info level attributes as column_info_attributes; record column_name, source
+        if SOURCE_DATA_COLUMN_INFO in column_info:
+            column_info_attributes = column_info[SOURCE_DATA_COLUMN_INFO]
+            # pop the NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+            column_name, source = column_info_attributes.pop(NAME), TRAINING_DATA
+        elif FEATURE_COLUMN_INFO in column_info:
+            column_info_attributes = column_info[FEATURE_COLUMN_INFO]
+            # pop the OUTPUT_NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+            column_name, source = column_info_attributes.pop(OUTPUT_NAME), FEATURE_STORE
+        elif ON_DEMAND_COLUMN_INFO in column_info:
+            column_info_attributes = column_info[ON_DEMAND_COLUMN_INFO]
+            # Map the InputBindings message dictionary to a {parameter: bound_to} KV dictionary if defined
+            if INPUT_BINDINGS in column_info_attributes:
+                column_info_attributes[INPUT_BINDINGS] = {
+                    ib[PARAMETER]: ib[BOUND_TO]
+                    for ib in column_info_attributes[INPUT_BINDINGS]
+                }
+            # pop the OUTPUT_NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+            column_name, source = (
+                column_info_attributes.pop(OUTPUT_NAME),
+                ON_DEMAND_FEATURE,
+            )
+        else:
+            raise ValueError(
+                f"Expected column_info to be keyed by a valid ColumnInfo.info type. "
+                f"'{column_info}' has key '{list(column_info)[0]}'."
+            )
+
+        # Parse and insert ColumnInfo level attributes.
+        # Note: the ordering of fields in the resulting YAML file is undefined, but in practice
+        # they appear in the same order as they are added to the column_info_attributes dict.
+
+        # DATA_TYPE is supported starting FeatureSpec v7 and is not guaranteed to exist.
+        if DATA_TYPE in column_info:
+            column_info_attributes[DATA_TYPE] = column_info[DATA_TYPE]
+        if not column_info[INCLUDE]:
+            column_info_attributes[INCLUDE] = False
+        # TOPOLOGICAL_ORDERING is supported starting FeatureSpec v8.
+        if TOPOLOGICAL_ORDERING in column_info:
+            column_info_attributes[TOPOLOGICAL_ORDERING] = column_info[
+                TOPOLOGICAL_ORDERING
+            ]
+
+        # Insert source; return YAML keyed by column_name
+        column_info_attributes[SOURCE] = source
+        return {column_name: column_info_attributes}
+
+    def _to_dict(self):
+        """
+        Convert the FeatureSpec to a writeable YAML artifact. Uses MessageToDict to convert the
+        FeatureSpec proto to a dict. Sanitizes and modifies the dict as follows:
+        1. Removes redundant or unnecessary information for cleanliness in the YAML
+        2. Modifies the dict to be of the format {column_name: column_attributes_dict}
+
+        :return: Sanitized FeatureSpec dictionary of {column_name: column_attributes}
+        """
+        yaml_dict = MessageToDict(self.to_proto(), preserving_proto_field_name=True)
+        yaml_dict[INPUT_COLUMNS] = [
+            self._input_columns_proto_to_yaml_dict(column_info)
+            for column_info in yaml_dict[INPUT_COLUMNS]
+        ]
+
+        if INPUT_TABLES in yaml_dict:
+            # pop the TABLE_NAME attribute and use it as the YAML key for each table_info to avoid redundancy in the YAML
+            yaml_dict[INPUT_TABLES] = [
+                {table_info.pop(TABLE_NAME): table_info}
+                for table_info in yaml_dict[INPUT_TABLES]
+            ]
+        if INPUT_FUNCTIONS in yaml_dict:
+            # pop the UDF_NAME attribute and use it as the YAML key for each function_info to avoid redundancy in the YAML
+            yaml_dict[INPUT_FUNCTIONS] = [
+                {function_info.pop(UDF_NAME): function_info}
+                for function_info in yaml_dict[INPUT_FUNCTIONS]
+            ]
+
+        # For readability, place SERIALIZATION_VERSION last in the dictionary.
+        yaml_dict[SERIALIZATION_VERSION] = yaml_dict.pop(SERIALIZATION_VERSION)
+        return yaml_dict
+
+    def save(self, path: str):
+        """
+        Convert the spec to a YAML artifact and store it at the given `path` location.
+        :param path: Root path to where the YAML artifact is expected to be stored.
+        :return: None
+        """
+        write_yaml(
+            root=path,
+            file_name=self.FEATURE_ARTIFACT_FILE,
+            data=self._to_dict(),
+            sort_keys=False,
+        )
+
+    @staticmethod
+    def _input_columns_yaml_to_proto_dict(column_info: Dict[str, Any]):
+        """
+        Convert the FeatureSpec YAML dictionary to the expected ColumnInfo proto dictionary.
+
+        Example of a column_info transformation.
+        {"source_column": {"source": "training_data"}} -> {"source_data_column_info": {"name": "source_column"}}
+        """
+        if len(column_info) != 1:
+            raise ValueError(
+                f"Expected column_info dictionary to have only one key-value pair. "
+                f"'{column_info}' has length {len(column_info)}."
+            )
+        column_name, column_data = list(column_info.items())[0]
+        if not column_data:
+            raise ValueError(
+                f"Expected values of '{column_name}' dictionary to be non-empty."
+            )
+        if SOURCE not in column_data:
+            raise ValueError(
+                f"Expected values of column_info dictionary to include the source. No source found "
+                f"for '{column_name}'."
+            )
+
+        # Parse oneof field ColumnInfo.info level attributes
+        source = column_data.pop(SOURCE)
+        if source == TRAINING_DATA:
+            column_data[NAME] = column_name
+            column_info_dict = {SOURCE_DATA_COLUMN_INFO: column_data}
+        elif source == FEATURE_STORE:
+            column_data[OUTPUT_NAME] = column_name
+            column_info_dict = {FEATURE_COLUMN_INFO: column_data}
+        elif source == ON_DEMAND_FEATURE:
+            column_data[OUTPUT_NAME] = column_name
+            # Map the {parameter_val: bound_to_val} dictionary to InputBindings(parameter, bound_to) message dictionaries.
+            column_data[INPUT_BINDINGS] = [
+                {PARAMETER: parameter, BOUND_TO: bound_to}
+                for parameter, bound_to in column_data.get(INPUT_BINDINGS, {}).items()
+            ]
+            column_info_dict = {ON_DEMAND_COLUMN_INFO: column_data}
+        else:
+            raise ValueError(
+                f"Internal Error: Expected column_info to have source matching oneof ColumnInfo.info. "
+                f"'{column_info}' has source of '{source}'."
+            )
+
+        # Parse ColumnInfo level attributes
+        # TOPOLOGICAL_ORDERING is supported starting FeatureSpec v8.
+        if TOPOLOGICAL_ORDERING in column_data:
+            column_info_dict[TOPOLOGICAL_ORDERING] = column_data.pop(
+                TOPOLOGICAL_ORDERING
+            )
+        # DATA_TYPE is supported starting FeatureSpec v7 and is not guaranteed to exist.
+        if DATA_TYPE in column_data:
+            column_info_dict[DATA_TYPE] = column_data.pop(DATA_TYPE)
+        # INCLUDE is supported starting FeatureSpec v5 and is only present in the YAML when INCLUDE = False
+        if INCLUDE in column_data:
+            column_info_dict[INCLUDE] = column_data.pop(INCLUDE)
+        return column_info_dict
+
+    # @classmethod
+    # def _from_dict(cls, spec_dict):
+    #     """
+    #     Convert the YAML artifact to a FeatureSpec. Transforms the YAML artifact to a dict keyed by
+    #     source_data_column_info or feature_column_info, such that ParseDict can convert the dict to
+    #     a proto message, and from_proto can convert the proto message to a FeatureSpec object.
+    #     :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+    #     """
+    #     if INPUT_COLUMNS not in spec_dict:
+    #         raise ValueError(
+    #             f"{INPUT_COLUMNS} must be a key in {cls.FEATURE_ARTIFACT_FILE}."
+    #         )
+    #     if not spec_dict[INPUT_COLUMNS]:
+    #         raise ValueError(
+    #             f"{INPUT_COLUMNS} in {cls.FEATURE_ARTIFACT_FILE} must be non-empty."
+    #         )
+    #     spec_dict[INPUT_COLUMNS] = [
+    #         cls._input_columns_yaml_to_proto_dict(column_info)
+    #         for column_info in spec_dict[INPUT_COLUMNS]
+    #     ]
+    #
+    #     # feature_spec.yaml doesn't include input_tables or input_functions if any of the following are true:
+    #     # 1. The YAML was written by an older client that does not support the functionality.
+    #     # 2. The FeatureSpec does not contain FeatureLookups (input_tables) or FeatureFunctions (input_functions).
+    #     input_tables = []
+    #     for input_table in spec_dict.get(INPUT_TABLES, []):
+    #         table_name, attributes = list(input_table.items())[0]
+    #         input_tables.append({TABLE_NAME: table_name, **attributes})
+    #     spec_dict[INPUT_TABLES] = input_tables
+    #
+    #     input_functions = []
+    #     for input_function in spec_dict.get(INPUT_FUNCTIONS, []):
+    #         udf_name, attributes = list(input_function.items())[0]
+    #         input_functions.append({UDF_NAME: udf_name, **attributes})
+    #     spec_dict[INPUT_FUNCTIONS] = input_functions
+    #
+    #     return cls.from_proto(
+    #         ParseDict(spec_dict, ProtoFeatureSpec(), ignore_unknown_fields=True)
+    #     )
+
+    @classmethod
+    def _read_file(cls, path: str):
+        """
+        Read the YAML artifact from a file path.
+        """
+        parent_dir, file = os.path.split(path)
+        spec_dict = read_yaml(parent_dir, file)
+        return cls._from_dict(spec_dict)
+
+    @classmethod
+    def load(cls, path: str):
+        """
+        Load the FeatureSpec YAML artifact in the provided root directory (at path/feature_spec.yaml).
+
+        :param path: Root path to the YAML artifact. This can be an MLflow artifact path or a file path.
+        :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+        """
+        # Create the full file path to the FeatureSpec.
+        path = os.path.join(path, cls.FEATURE_ARTIFACT_FILE)
+
+        if common_utils.is_artifact_uri(path):
+            with TempDir() as tmp_location:
+                # Returns a file and not a directory since the artifact_uri is a single file.
+                local_path = mlflow.artifacts.download_artifacts(
+                    artifact_uri=path, dst_path=tmp_location.path()
+                )
+                return FeatureSpec._read_file(local_path)
+        else:
+            return FeatureSpec._read_file(path)
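Since `_input_columns_proto_to_yaml_dict` is a staticmethod over plain dicts, the documented proto-dict-to-YAML transformation can be exercised in isolation. A sketch, assuming the installed module resolves its imports; the UDF and column names are hypothetical:

    from wedata.feature_store.entities.feature_spec import FeatureSpec

    # A source column: the "name" value becomes the YAML key, "source" is set to
    # "training_data", and include=True is dropped for cleanliness.
    proto_dict = {"source_data_column_info": {"name": "source_column"}, "include": True}
    print(FeatureSpec._input_columns_proto_to_yaml_dict(proto_dict))
    # {'source_column': {'source': 'training_data'}}

    # An on-demand column: input_bindings collapse to a {parameter: bound_to} map,
    # and include=False survives into the YAML.
    proto_dict = {
        "on_demand_column_info": {
            "udf_name": "catalog.schema.fn",
            "input_bindings": [{"parameter": "x", "bound_to": "col_x"}],
            "output_name": "fn_output",
        },
        "include": False,
    }
    print(FeatureSpec._input_columns_proto_to_yaml_dict(proto_dict))
    # {'fn_output': {'udf_name': 'catalog.schema.fn',
    #                'input_bindings': {'x': 'col_x'},
    #                'include': False,
    #                'source': 'on_demand_feature'}}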
wedata/feature_store/entities/feature_spec_constants.py
@@ -0,0 +1,25 @@
+# Field names from feature_spec.proto.
+SOURCE_DATA_COLUMN_INFO = "source_data_column_info"
+FEATURE_COLUMN_INFO = "feature_column_info"
+ON_DEMAND_COLUMN_INFO = "on_demand_column_info"
+INPUT_COLUMNS = "input_columns"
+NAME = "name"
+OUTPUT_NAME = "output_name"
+INPUT_TABLES = "input_tables"
+TABLE_NAME = "table_name"
+TABLE_ID = "table_id"
+SERIALIZATION_VERSION = "serialization_version"
+INPUT_FUNCTIONS = "input_functions"
+INCLUDE = "include"
+DATA_TYPE = "data_type"
+TOPOLOGICAL_ORDERING = "topological_ordering"
+UDF_NAME = "udf_name"
+INPUT_BINDINGS = "input_bindings"
+PARAMETER = "parameter"
+BOUND_TO = "bound_to"
+
+# FeatureSpec YAML source field and allowed values
+SOURCE = "source"
+TRAINING_DATA = "training_data"
+FEATURE_STORE = "feature_store"
+ON_DEMAND_FEATURE = "on_demand_feature"
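These constants are the glue between the proto field names and the YAML keys used above. Going the other direction, `FeatureSpec._input_columns_yaml_to_proto_dict` inverts the mapping: the YAML key is restored as `name` or `output_name`, and `source` selects the oneof branch. A sketch under the same importability assumption, with hypothetical attribute values:

    from wedata.feature_store.entities.feature_spec import FeatureSpec

    yaml_entry = {"source_column": {"source": "training_data"}}
    print(FeatureSpec._input_columns_yaml_to_proto_dict(yaml_entry))
    # {'source_data_column_info': {'name': 'source_column'}}

    # A feature-store column: "source" selects the oneof key, and the YAML key
    # is restored as output_name. The inner attributes are illustrative only.
    yaml_entry = {
        "lifetime_value": {
            "table_name": "fs.user_features",
            "feature_name": "ltv",
            "source": "feature_store",
        }
    }
    print(FeatureSpec._input_columns_yaml_to_proto_dict(yaml_entry))
    # {'feature_column_info': {'table_name': 'fs.user_features',
    #                          'feature_name': 'ltv',
    #                          'output_name': 'lifetime_value'}}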
|