wedata-feature-engineering 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feature_store/constants/__init__.py +0 -0
- feature_store/constants/constants.py +28 -0
- feature_store/entities/__init__.py +0 -0
- feature_store/entities/column_info.py +117 -0
- feature_store/entities/data_type.py +92 -0
- feature_store/entities/environment_variables.py +55 -0
- feature_store/entities/feature.py +53 -0
- feature_store/entities/feature_column_info.py +64 -0
- feature_store/entities/feature_function.py +55 -0
- feature_store/entities/feature_lookup.py +179 -0
- feature_store/entities/feature_spec.py +454 -0
- feature_store/entities/feature_spec_constants.py +25 -0
- feature_store/entities/feature_table.py +164 -0
- feature_store/entities/feature_table_info.py +40 -0
- feature_store/entities/function_info.py +184 -0
- feature_store/entities/on_demand_column_info.py +44 -0
- feature_store/entities/source_data_column_info.py +21 -0
- feature_store/entities/training_set.py +134 -0
- feature_store/feature_table_client/__init__.py +0 -0
- feature_store/feature_table_client/feature_table_client.py +313 -0
- feature_store/spark_client/__init__.py +0 -0
- feature_store/spark_client/spark_client.py +286 -0
- feature_store/training_set_client/__init__.py +0 -0
- feature_store/training_set_client/training_set_client.py +196 -0
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.2.dist-info/RECORD +30 -0
- wedata_feature_engineering-0.1.0.dist-info/RECORD +0 -6
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/WHEEL +0 -0
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/top_level.txt +0 -0
feature_store/entities/feature_spec.py
@@ -0,0 +1,454 @@
+import os
+from typing import Any, Dict, List, Type, Union
+
+import mlflow
+from databricks.sdk.service.catalog import FunctionInfo
+from google.protobuf.json_format import MessageToDict, ParseDict
+from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
+
+from feature_store.entities.column_info import ColumnInfo
+from feature_store.entities.feature_column_info import FeatureColumnInfo
+from feature_store.entities.feature_spec_constants import (
+    BOUND_TO,
+    DATA_TYPE,
+    FEATURE_COLUMN_INFO,
+    FEATURE_STORE,
+    INCLUDE,
+    INPUT_BINDINGS,
+    INPUT_COLUMNS,
+    INPUT_FUNCTIONS,
+    INPUT_TABLES,
+    NAME,
+    ON_DEMAND_COLUMN_INFO,
+    ON_DEMAND_FEATURE,
+    OUTPUT_NAME,
+    PARAMETER,
+    SERIALIZATION_VERSION,
+    SOURCE,
+    SOURCE_DATA_COLUMN_INFO,
+    TABLE_NAME,
+    TOPOLOGICAL_ORDERING,
+    TRAINING_DATA,
+    UDF_NAME,
+)
+from feature_store.entities.feature_table_info import FeatureTableInfo
+from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+from feature_store.utils import common_utils
+
+# Change log for serialization version. Please update for each serialization version.
+# 1. Initial.
+# 2. (2021/06/16): Record feature_store_client_version to help us make backward compatible changes in the future.
+# 3. (2021/08/25): Record table_id to handle feature table lineage stability if tables are deleted.
+# 4. (2021/09/25): Record timestamp_lookup_key to handle point-in-time lookups.
+# 5. (2021/02/15): Record include flag for column info if False.
+#    Record input functions as FunctionInfo and function computation as OnDemandColumnInfo.
+#    Remove redundant fields: table_name from table_infos, output_name from column_infos.
+# 6. (2023/04/21): Record lookback_window in table info for point-in-time lookups.
+# 7. (2023/05/05): Record the Spark data type for all columns to track model signatures.
+# 8. (2023/08/14): Record the topological_ordering for all columns to support chained transform and lookup.
+# 9. (2023/09/11): Change the type of lookback_window from int to double for sub-second values.
+
+
+class FeatureSpec:
+
+    FEATURE_ARTIFACT_FILE = "feature_spec.yaml"
+    SERIALIZATION_VERSION_NUMBER = 9
+
+    def __init__(
+        self,
+        column_infos: List[ColumnInfo],
+        table_infos: List[FeatureTableInfo],
+        function_infos: List[FunctionInfo],
+        workspace_id: int = None,
+        feature_store_client_version: str = None,
+        serialization_version: int = None,
+    ):
+        self._column_infos = column_infos
+        self._table_infos = table_infos
+        self._function_infos = function_infos
+        self._workspace_id = workspace_id
+        # The Feature Store Python client version which wrote this FeatureSpec.
+        # If empty, the client version is <=0.3.1.
+        self._feature_store_client_version = feature_store_client_version
+        self._serialization_version = serialization_version
+
+        # Perform validations
+        self._validate_column_infos()
+        self._validate_table_infos()
+        self._validate_function_infos()
+
+    def _validate_column_infos(self):
+        if not self.column_infos:
+            raise ValueError("column_infos must be non-empty.")
+
+        for column_info in self.column_infos:
+            if not isinstance(column_info, ColumnInfo):
+                raise ValueError(
+                    f"Expected all elements of column_infos to be instances of ColumnInfo. "
+                    f"'{column_info}' is of the wrong type."
+                )
+            if (
+                self._serialization_version >= 8
+                and column_info.topological_ordering is not None
+            ):
+                ordering = column_info.topological_ordering
+                if not isinstance(ordering, int) or ordering < 0:
+                    raise ValueError(
+                        "The topological_ordering of column_info must be a non-negative integer."
+                    )
+
+    def _validate_table_infos(self):
+        if self.table_infos is None:
+            raise ValueError("Internal Error: table_infos must be provided.")
+
+        # table_infos should not be duplicated
+        common_utils.validate_strings_unique(
+            [table_info.table_name for table_info in self.table_infos],
+            "Internal Error: Expect all table_names in table_infos to be unique. Found duplicates {}",
+        )
+
+        # Starting FeatureSpec v3, unique table names in table_infos must match those in column_infos.
+        if self.serialization_version >= 3:
+            unique_table_names = set(
+                [table_info.table_name for table_info in self.table_infos]
+            )
+            unique_column_table_names = set(
+                [fci.table_name for fci in self.feature_column_infos]
+            )
+            if unique_table_names != unique_column_table_names:
+                raise Exception(
+                    f"Internal Error: table_names from table_infos {sorted(unique_table_names)} "
+                    f"must match those from column_infos {sorted(unique_column_table_names)}"
+                )
+
+    def _validate_function_infos(self):
+        if self.function_infos is None:
+            raise ValueError("Internal Error: function_infos must be provided.")
+
+        # function_infos should not be duplicated
+        common_utils.validate_strings_unique(
+            [function_info.udf_name for function_info in self.function_infos],
+            "Internal Error: Expect all udf_names in function_infos to be unique. Found duplicates {}",
+        )
+
+        # Unique UDF names in function_infos must match those in column_infos.
+        # No version check is required as both fields were added simultaneously in FeatureSpec v5.
+        unique_udf_names = set(
+            [function_info.udf_name for function_info in self.function_infos]
+        )
+        unique_column_udf_names = set(
+            [odci.udf_name for odci in self.on_demand_column_infos]
+        )
+        if unique_udf_names != unique_column_udf_names:
+            raise Exception(
+                f"Internal Error: udf_names from function_infos {sorted(unique_udf_names)} "
+                f"must match those from column_infos {sorted(unique_column_udf_names)}"
+            )
+
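Both validators above delegate duplicate detection to common_utils.validate_strings_unique, which ships in feature_store/utils and is not shown in this diff. A minimal sketch of the expected contract, assuming the helper formats the duplicates into the given error template and raises ValueError:

    from collections import Counter
    from typing import List

    def validate_strings_unique(strings: List[str], error_template: str) -> None:
        # No-op when all strings are unique; otherwise raise with the duplicates.
        duplicates = [s for s, n in Counter(strings).items() if n > 1]
        if duplicates:
            raise ValueError(error_template.format(sorted(duplicates)))
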
+    @property
+    def column_infos(self):
+        return self._column_infos
+
+    @property
+    def table_infos(self):
+        return self._table_infos
+
+    @property
+    def function_infos(self):
+        return self._function_infos
+
+    @property
+    def workspace_id(self):
+        return self._workspace_id
+
+    @property
+    def feature_column_infos(self) -> List[FeatureColumnInfo]:
+        return self._get_infos_of_type(FeatureColumnInfo)
+
+    @property
+    def on_demand_column_infos(self) -> List[OnDemandColumnInfo]:
+        return self._get_infos_of_type(OnDemandColumnInfo)
+
+    @property
+    def serialization_version(self) -> int:
+        return self._serialization_version
+
+    def _get_infos_of_type(
+        self,
+        info_type: Union[
+            Type[SourceDataColumnInfo],
+            Type[FeatureColumnInfo],
+            Type[OnDemandColumnInfo],
+        ],
+    ):
+        """
+        Helper method to return the ColumnInfo.info subinfo field based on its type.
+        """
+        return [
+            column_info.info
+            for column_info in self.column_infos
+            if isinstance(column_info.info, info_type)
+        ]
+
+    @classmethod
+    def from_proto(cls, feature_spec_proto):
+        # Serialization version is not deserialized from the proto as there is currently only one
+        # possible version.
+        column_infos = [
+            ColumnInfo.from_proto(column_info_proto)
+            for column_info_proto in feature_spec_proto.input_columns
+        ]
+        table_infos = [
+            FeatureTableInfo.from_proto(table_info_proto)
+            for table_info_proto in feature_spec_proto.input_tables
+        ]
+        function_infos = [
+            FunctionInfo.from_proto(function_info_proto)
+            for function_info_proto in feature_spec_proto.input_functions
+        ]
+        return cls(
+            column_infos=column_infos,
+            table_infos=table_infos,
+            function_infos=function_infos,
+            workspace_id=feature_spec_proto.workspace_id,
+            feature_store_client_version=feature_spec_proto.feature_store_client_version,
+            serialization_version=feature_spec_proto.serialization_version,
+        )
+
+    @staticmethod
+    def _input_columns_proto_to_yaml_dict(column_info: Dict[str, Any]):
+        """
+        Converts a single ColumnInfo's proto dict to the expected element in FeatureSpec YAML's input_columns.
+        To keep the YAML clean, unnecessary fields are removed (e.g. SourceDataColumnInfo.name field, ColumnInfo.include when True).
+
+        Example of a column_info transformation. Note that "name" and "include" attributes were excluded.
+        {"source_data_column_info": {"name": "source_column"}, "include": True} -> {"source_column": {"source": "training_data"}}
+
+        Order of elements in the YAML dict should be:
+        1. Attributes present in ColumnInfo.info, using the proto field order
+        2. Remaining attributes of ColumnInfo, using the proto field order
+        3. Feature Store source type
+        """
+        # Parse oneof field ColumnInfo.info level attributes as column_info_attributes; record column_name, source
+        if SOURCE_DATA_COLUMN_INFO in column_info:
+            column_info_attributes = column_info[SOURCE_DATA_COLUMN_INFO]
+            # pop NAME attribute and use as the YAML key for this column_info to avoid redundancy in YAML
+            column_name, source = column_info_attributes.pop(NAME), TRAINING_DATA
+        elif FEATURE_COLUMN_INFO in column_info:
+            column_info_attributes = column_info[FEATURE_COLUMN_INFO]
+            # pop OUTPUT_NAME attribute and use as the YAML key for this column_info to avoid redundancy in YAML
+            column_name, source = column_info_attributes.pop(OUTPUT_NAME), FEATURE_STORE
+        elif ON_DEMAND_COLUMN_INFO in column_info:
+            column_info_attributes = column_info[ON_DEMAND_COLUMN_INFO]
+            # Map InputBindings message dictionary to {parameter: bound_to} KV dictionary if defined
+            if INPUT_BINDINGS in column_info_attributes:
+                column_info_attributes[INPUT_BINDINGS] = {
+                    ib[PARAMETER]: ib[BOUND_TO]
+                    for ib in column_info_attributes[INPUT_BINDINGS]
+                }
+            # pop OUTPUT_NAME attribute and use as the YAML key for this column_info to avoid redundancy in YAML
+            column_name, source = (
+                column_info_attributes.pop(OUTPUT_NAME),
+                ON_DEMAND_FEATURE,
+            )
+        else:
+            raise ValueError(
+                f"Expected column_info to be keyed by a valid ColumnInfo.info type. "
+                f"'{column_info}' has key '{list(column_info)[0]}'."
+            )
+
+        # Parse and insert ColumnInfo level attributes
+        # Note: the ordering of fields in the result yaml file is undefined but in reality, they are
+        # in the same order as they are added in the column_info_attributes dict.
+
+        # DATA_TYPE is supported starting FeatureSpec v7 and is not guaranteed to exist.
+        if DATA_TYPE in column_info:
+            column_info_attributes[DATA_TYPE] = column_info[DATA_TYPE]
+        if not column_info[INCLUDE]:
+            column_info_attributes[INCLUDE] = False
+        # TOPOLOGICAL_ORDERING is supported starting FeatureSpec v8.
+        if TOPOLOGICAL_ORDERING in column_info:
+            column_info_attributes[TOPOLOGICAL_ORDERING] = column_info[
+                TOPOLOGICAL_ORDERING
+            ]
+
+        # Insert source; return YAML keyed by column_name
+        column_info_attributes[SOURCE] = source
+        return {column_name: column_info_attributes}
+
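To make the proto-to-YAML mapping concrete, here is a hypothetical on-demand column traced through _input_columns_proto_to_yaml_dict; the UDF name and bindings are invented for illustration:

    proto_dict = {
        "on_demand_column_info": {
            "udf_name": "main.default.add_fn",  # hypothetical UDF
            "input_bindings": [{"parameter": "x", "bound_to": "feature_a"}],
            "output_name": "sum_col",
        },
        "include": True,
        "topological_ordering": 1,
    }
    # FeatureSpec._input_columns_proto_to_yaml_dict(proto_dict) returns:
    # {"sum_col": {"udf_name": "main.default.add_fn",
    #              "input_bindings": {"x": "feature_a"},
    #              "topological_ordering": 1,
    #              "source": "on_demand_feature"}}

Note that "output_name" became the YAML key and "include" was dropped because it is True.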
+    def _to_dict(self):
+        """
+        Convert FeatureSpec to a writeable YAML artifact. Uses MessageToDict to convert FeatureSpec proto to dict.
+        Sanitizes and modifies the dict as follows:
+        1. Removes redundant or unnecessary information for cleanliness in the YAML
+        2. Modifies the dict to be of the format {column_name: column_attributes_dict}
+
+        :return: Sanitized FeatureSpec dictionary of {column_name: column_attributes}
+        """
+        yaml_dict = MessageToDict(self.to_proto(), preserving_proto_field_name=True)
+        yaml_dict[INPUT_COLUMNS] = [
+            self._input_columns_proto_to_yaml_dict(column_info)
+            for column_info in yaml_dict[INPUT_COLUMNS]
+        ]
+
+        if INPUT_TABLES in yaml_dict:
+            # pop TABLE_NAME attribute and use as the YAML key for each table_info to avoid redundancy in YAML
+            yaml_dict[INPUT_TABLES] = [
+                {table_info.pop(TABLE_NAME): table_info}
+                for table_info in yaml_dict[INPUT_TABLES]
+            ]
+        if INPUT_FUNCTIONS in yaml_dict:
+            # pop UDF_NAME attribute and use as the YAML key for each function_info to avoid redundancy in YAML
+            yaml_dict[INPUT_FUNCTIONS] = [
+                {function_info.pop(UDF_NAME): function_info}
+                for function_info in yaml_dict[INPUT_FUNCTIONS]
+            ]
+
+        # For readability, place SERIALIZATION_VERSION last in the dictionary.
+        yaml_dict[SERIALIZATION_VERSION] = yaml_dict.pop(SERIALIZATION_VERSION)
+        return yaml_dict
+
+    def save(self, path: str):
+        """
+        Convert spec to a YAML artifact and store at given `path` location.
+        :param path: Root path to where YAML artifact is expected to be stored.
+        :return: None
+        """
+        write_yaml(
+            root=path,
+            file_name=self.FEATURE_ARTIFACT_FILE,
+            data=self._to_dict(),
+            sort_keys=False,
+        )
+
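A short usage sketch; the path and column names are hypothetical, and `spec` is a FeatureSpec built elsewhere. The comment shows the general shape _to_dict produces: input_columns entries keyed by column name with source last in each entry, and serialization_version placed at the end of the document. Note that _to_dict calls self.to_proto(), which is not defined anywhere in this file, so the save path as released appears to depend on code outside this diff:

    spec.save("/tmp/spec_dir")  # writes /tmp/spec_dir/feature_spec.yaml
    # Roughly:
    #   input_columns:
    #   - customer_id:
    #       source: training_data
    #   input_tables:
    #   - main.default.customer_features:
    #       table_id: <table uuid>
    #   serialization_version: 9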
+    @staticmethod
+    def _input_columns_yaml_to_proto_dict(column_info: Dict[str, Any]):
+        """
+        Convert the FeatureSpec YAML dictionary to the expected ColumnInfo proto dictionary.
+
+        Example of a column_info transformation.
+        {"source_column": {"source": "training_data"}} -> {"source_data_column_info": {"name": "source_column"}}
+        """
+        if len(column_info) != 1:
+            raise ValueError(
+                f"Expected column_info dictionary to only have one key, value pair. "
+                f"'{column_info}' has length {len(column_info)}."
+            )
+        column_name, column_data = list(column_info.items())[0]
+        if not column_data:
+            raise ValueError(
+                f"Expected values of '{column_name}' dictionary to be non-empty."
+            )
+        if SOURCE not in column_data:
+            raise ValueError(
+                f"Expected values of column_info dictionary to include the source. No source found "
+                f"for '{column_name}'."
+            )
+
+        # Parse oneof field ColumnInfo.info level attributes
+        source = column_data.pop(SOURCE)
+        if source == TRAINING_DATA:
+            column_data[NAME] = column_name
+            column_info_dict = {SOURCE_DATA_COLUMN_INFO: column_data}
+        elif source == FEATURE_STORE:
+            column_data[OUTPUT_NAME] = column_name
+            column_info_dict = {FEATURE_COLUMN_INFO: column_data}
+        elif source == ON_DEMAND_FEATURE:
+            column_data[OUTPUT_NAME] = column_name
+            # Map {parameter_val: bound_to_val} dictionary to InputBindings(parameter, bound_to) message dictionary.
+            column_data[INPUT_BINDINGS] = [
+                {PARAMETER: parameter, BOUND_TO: bound_to}
+                for parameter, bound_to in column_data.get(INPUT_BINDINGS, {}).items()
+            ]
+            column_info_dict = {ON_DEMAND_COLUMN_INFO: column_data}
+        else:
+            raise ValueError(
+                f"Internal Error: Expected column_info to have source matching oneof ColumnInfo.info. "
+                f"'{column_info}' has source of '{source}'."
+            )
+
+        # Parse ColumnInfo level attributes
+        # TOPOLOGICAL_ORDERING is supported starting FeatureSpec v8.
+        if TOPOLOGICAL_ORDERING in column_data:
+            column_info_dict[TOPOLOGICAL_ORDERING] = column_data.pop(
+                TOPOLOGICAL_ORDERING
+            )
+        # DATA_TYPE is supported starting FeatureSpec v7 and is not guaranteed to exist.
+        if DATA_TYPE in column_data:
+            column_info_dict[DATA_TYPE] = column_data.pop(DATA_TYPE)
+        # INCLUDE is supported starting FeatureSpec v5 and only present in the YAML when INCLUDE = False
+        if INCLUDE in column_data:
+            column_info_dict[INCLUDE] = column_data.pop(INCLUDE)
+        return column_info_dict
+
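The inverse direction, again with hypothetical values. input_bindings flips back from a {parameter: bound_to} mapping to a list of InputBindings-style dicts, and topological_ordering is hoisted to the ColumnInfo level:

    yaml_entry = {
        "sum_col": {
            "udf_name": "main.default.add_fn",  # hypothetical UDF
            "input_bindings": {"x": "feature_a"},
            "topological_ordering": 1,
            "source": "on_demand_feature",
        }
    }
    # FeatureSpec._input_columns_yaml_to_proto_dict(yaml_entry) returns:
    # {"on_demand_column_info": {"udf_name": "main.default.add_fn",
    #                            "input_bindings": [{"parameter": "x", "bound_to": "feature_a"}],
    #                            "output_name": "sum_col"},
    #  "topological_ordering": 1}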
+    # @classmethod
+    # def _from_dict(cls, spec_dict):
+    #     """
+    #     Convert YAML artifact to FeatureSpec. Transforms YAML artifact to dict keyed by
+    #     source_data_column_info or feature_column_info, such that ParseDict can convert the dict to
+    #     a proto message, and from_proto can convert the proto message to a FeatureSpec object
+    #     :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+    #     """
+    #     if INPUT_COLUMNS not in spec_dict:
+    #         raise ValueError(
+    #             f"{INPUT_COLUMNS} must be a key in {cls.FEATURE_ARTIFACT_FILE}."
+    #         )
+    #     if not spec_dict[INPUT_COLUMNS]:
+    #         raise ValueError(
+    #             f"{INPUT_COLUMNS} in {cls.FEATURE_ARTIFACT_FILE} must be non-empty."
+    #         )
+    #     spec_dict[INPUT_COLUMNS] = [
+    #         cls._input_columns_yaml_to_proto_dict(column_info)
+    #         for column_info in spec_dict[INPUT_COLUMNS]
+    #     ]
+    #
+    #     # feature_spec.yaml doesn't include input_tables, input_functions if any are true:
+    #     # 1. The YAML is written by an older client that does not support the functionality.
+    #     # 2. The FeatureSpec does not contain FeatureLookups (input_tables), FeatureFunctions (input_functions).
+    #     input_tables = []
+    #     for input_table in spec_dict.get(INPUT_TABLES, []):
+    #         table_name, attributes = list(input_table.items())[0]
+    #         input_tables.append({TABLE_NAME: table_name, **attributes})
+    #     spec_dict[INPUT_TABLES] = input_tables
+    #
+    #     input_functions = []
+    #     for input_function in spec_dict.get(INPUT_FUNCTIONS, []):
+    #         udf_name, attributes = list(input_function.items())[0]
+    #         input_functions.append({UDF_NAME: udf_name, **attributes})
+    #     spec_dict[INPUT_FUNCTIONS] = input_functions
+    #
+    #     return cls.from_proto(
+    #         ParseDict(spec_dict, ProtoFeatureSpec(), ignore_unknown_fields=True)
+    #     )
+
+    @classmethod
+    def _read_file(cls, path: str):
+        """
+        Read the YAML artifact from a file path.
+        """
+        parent_dir, file = os.path.split(path)
+        spec_dict = read_yaml(parent_dir, file)
+        return cls._from_dict(spec_dict)
+
+    @classmethod
+    def load(cls, path: str):
+        """
+        Load the FeatureSpec YAML artifact in the provided root directory (at path/feature_spec.yaml).
+
+        :param path: Root path to the YAML artifact. This can be a MLflow artifact path or file path.
+        :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+        """
+        # Create the full file path to the FeatureSpec.
+        path = os.path.join(path, cls.FEATURE_ARTIFACT_FILE)
+
+        if common_utils.is_artifact_uri(path):
+            with TempDir() as tmp_location:
+                # Returns a file and not directory since the artifact_uri is a single file.
+                local_path = mlflow.artifacts.download_artifacts(
+                    artifact_uri=path, dst_path=tmp_location.path()
+                )
+                return FeatureSpec._read_file(local_path)
+        else:
+            return FeatureSpec._read_file(path)
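load resolves path/feature_spec.yaml and transparently downloads the artifact when given an MLflow artifact URI; a hypothetical call is shown below. Note that _read_file still calls cls._from_dict, which is commented out in this version, so loading cannot actually complete until that method is restored.

    # Works with a local directory or an MLflow artifact URI (run id is hypothetical):
    spec = FeatureSpec.load("runs:/<run_id>/feature_spec_dir")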
feature_store/entities/feature_spec_constants.py
@@ -0,0 +1,25 @@
+# Field names from feature_spec.proto.
+SOURCE_DATA_COLUMN_INFO = "source_data_column_info"
+FEATURE_COLUMN_INFO = "feature_column_info"
+ON_DEMAND_COLUMN_INFO = "on_demand_column_info"
+INPUT_COLUMNS = "input_columns"
+NAME = "name"
+OUTPUT_NAME = "output_name"
+INPUT_TABLES = "input_tables"
+TABLE_NAME = "table_name"
+TABLE_ID = "table_id"
+SERIALIZATION_VERSION = "serialization_version"
+INPUT_FUNCTIONS = "input_functions"
+INCLUDE = "include"
+DATA_TYPE = "data_type"
+TOPOLOGICAL_ORDERING = "topological_ordering"
+UDF_NAME = "udf_name"
+INPUT_BINDINGS = "input_bindings"
+PARAMETER = "parameter"
+BOUND_TO = "bound_to"
+
+# FeatureSpec YAML source field and allowed values
+SOURCE = "source"
+TRAINING_DATA = "training_data"
+FEATURE_STORE = "feature_store"
+ON_DEMAND_FEATURE = "on_demand_feature"
feature_store/entities/feature_table.py
@@ -0,0 +1,164 @@
+from typing import Dict
+
+
+class FeatureTable:
+    """
+    .. note::
+
+       Aliases: `!databricks.feature_engineering.entities.feature_table.FeatureTable`, `!databricks.feature_store.entities.feature_table.FeatureTable`
+
+    Value class describing one feature table.
+
+    This will typically not be instantiated directly; instead,
+    :meth:`create_table() <databricks.feature_engineering.client.FeatureEngineeringClient.create_table>`
+    will create :class:`.FeatureTable` objects.
+    """
+
+    def __init__(
+        self,
+        name,
+        table_id,
+        description,
+        primary_keys,
+        partition_columns,
+        features,
+        creation_timestamp=None,
+        online_stores=None,
+        notebook_producers=None,
+        job_producers=None,
+        table_data_sources=None,
+        path_data_sources=None,
+        custom_data_sources=None,
+        timestamp_keys=None,
+        tags=None,
+    ):
+        """Initialize a FeatureTable object."""
+        self.name = name
+        self.table_id = table_id
+        self.description = description
+        self.primary_keys = primary_keys
+        self.partition_columns = partition_columns
+        self.features = features
+        self.creation_timestamp = creation_timestamp
+        self.online_stores = online_stores if online_stores is not None else []
+        self.notebook_producers = (
+            notebook_producers if notebook_producers is not None else []
+        )
+        self.job_producers = job_producers if job_producers is not None else []
+        self.table_data_sources = (
+            table_data_sources if table_data_sources is not None else []
+        )
+        self.path_data_sources = (
+            path_data_sources if path_data_sources is not None else []
+        )
+        self.custom_data_sources = (
+            custom_data_sources if custom_data_sources is not None else []
+        )
+        self.timestamp_keys = timestamp_keys if timestamp_keys is not None else []
+        self._tags = tags
+
+    # @property
+    # @deprecated("FeatureTable.primary_keys", since="v0.3.6")
+    # def keys(self):
+    #     return self.primary_keys
+
+    @property
+    def tags(self) -> Dict[str, str]:
+        """
+        Get the tags associated with the feature table.
+
+        :return: a Dictionary of all tags associated with the feature table as key/value pairs
+        """
+        if self._tags is None:
+            # If no tags are set, self._tags is expected to be an empty dictionary.
+            raise ValueError(
+                "Internal error: tags have not been fetched for this FeatureTable instance"
+            )
+        return self._tags
+
+    @classmethod
+    def from_uc_get_table_response(cls, uc_get_table_response: Dict[str, object]):
+        """Return a FeatureTable object from a UC get_table response. Note: UC does not return online_stores or tags.
+
+        :param dict uc_get_table_response: A dictionary representing a UC get_table response.
+        :return FeatureTable: a FeatureTable object built from the UC response.
+        """
+        table_name = uc_get_table_response["full_name"]
+
+        if uc_get_table_response["securable_kind"] == "TABLE_ONLINE_VIEW":
+            source_table = uc_get_table_response["properties_pairs"]["properties"][
+                "source_table"
+            ]
+            raise ValueError(
+                f"Table '{table_name}' is an online view. Online Views are not feature tables. Please use the source table '{source_table}' instead."
+            )
+
+        if (
+            "table_type" in uc_get_table_response
+            and uc_get_table_response["table_type"] == "VIEW"
+        ):
+            return cls(
+                name=table_name,
+                table_id=uc_get_table_response["table_id"],
+                description=uc_get_table_response["comment"]
+                if "comment" in uc_get_table_response
+                else "",
+                primary_keys=[],
+                partition_columns=[],
+                features=[],
+                creation_timestamp=uc_get_table_response["created_at"],
+                timestamp_keys=[],
+            )
+
+        table_constraints = (
+            uc_get_table_response["table_constraints"]
+            if "table_constraints" in uc_get_table_response
+            else []
+        )
+        primary_key_constraints = [
+            c for c in table_constraints if "primary_key_constraint" in c
+        ]
+        if len(primary_key_constraints) == 0:
+            raise ValueError(
+                "Table can't be used as a feature table because it has no primary key constraint defined."
+                + " Use 'ALTER TABLE table_name ADD CONSTRAINT table_name_pk PRIMARY KEY( key_column [,...] )'"
+                + " to add a primary key constraint on the table."
+            )
+        primary_key_constraint = primary_key_constraints[0]["primary_key_constraint"]
+        timestamp_keys = (
+            primary_key_constraint["timeseries_columns"]
+            if "timeseries_columns" in primary_key_constraint
+            else []
+        )
+        primary_keys = [
+            c
+            for c in primary_key_constraint["child_columns"]
+            if c not in timestamp_keys
+        ]
+
+        columns = uc_get_table_response["columns"]
+        features = [c["name"] for c in columns]
+        partition_columns_unordered = [c for c in columns if "partition_index" in c]
+        partition_columns = [
+            c["name"]
+            for c in sorted(
+                partition_columns_unordered, key=lambda x: x["partition_index"]
+            )
+        ]
+
+        return cls(
+            name=table_name,
+            table_id=uc_get_table_response["table_id"],
+            description=uc_get_table_response["comment"]
+            if "comment" in uc_get_table_response
+            else "",
+            primary_keys=primary_keys,
+            partition_columns=partition_columns,
+            features=features,
+            creation_timestamp=uc_get_table_response["created_at"],
+            timestamp_keys=timestamp_keys,
+        )
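A minimal mock of a UC get_table response that satisfies the parsing above; the keys mirror those referenced in from_uc_get_table_response, and all values are illustrative only:

    mock_response = {
        "full_name": "main.default.customer_features",
        "securable_kind": "TABLE_DELTA",  # anything other than TABLE_ONLINE_VIEW
        "table_id": "0000-1111",
        "comment": "Customer features",
        "created_at": 1700000000000,
        "table_constraints": [
            {"primary_key_constraint": {
                "child_columns": ["customer_id", "ts"],
                "timeseries_columns": ["ts"],
            }}
        ],
        "columns": [
            {"name": "customer_id"},
            {"name": "ts"},
            {"name": "region", "partition_index": 0},
        ],
    }
    table = FeatureTable.from_uc_get_table_response(mock_response)
    # table.primary_keys == ["customer_id"]; table.timestamp_keys == ["ts"]
    # table.partition_columns == ["region"]; table.features lists all three columns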
feature_store/entities/feature_table_info.py
@@ -0,0 +1,40 @@
+from typing import Optional
+
+
+class FeatureTableInfo:
+    def __init__(
+        self, table_name: str, table_id: str, lookback_window: Optional[float] = None
+    ):
+        if not table_name:
+            raise ValueError("table_name must be non-empty.")
+        if not table_id:
+            raise ValueError("table_id must be non-empty.")
+        self._table_name = table_name
+        self._table_id = table_id
+        self._lookback_window = lookback_window
+
+    @property
+    def table_name(self):
+        return self._table_name
+
+    @property
+    def table_id(self):
+        return self._table_id
+
+    @property
+    def lookback_window(self):
+        return self._lookback_window
+
+    @classmethod
+    def from_proto(cls, feature_table_info_proto):
+        lookback_window = (
+            feature_table_info_proto.lookback_window
+            if feature_table_info_proto.HasField("lookback_window")
+            else None
+        )
+        return cls(
+            table_name=feature_table_info_proto.table_name,
+            table_id=feature_table_info_proto.table_id,
+            lookback_window=lookback_window,
+        )
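Constructing a FeatureTableInfo directly is straightforward (values hypothetical); per changelog entry 9 above, lookback_window became a float so that sub-second windows can be expressed:

    info = FeatureTableInfo(
        table_name="main.default.customer_features",  # hypothetical table
        table_id="0000-1111",
        lookback_window=0.5,  # sub-second window, the reason the field became a float
    )
    assert info.lookback_window == 0.5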