wedata-feature-engineering 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. feature_store/constants/__init__.py +0 -0
  2. feature_store/constants/constants.py +28 -0
  3. feature_store/entities/__init__.py +0 -0
  4. feature_store/entities/column_info.py +117 -0
  5. feature_store/entities/data_type.py +92 -0
  6. feature_store/entities/environment_variables.py +55 -0
  7. feature_store/entities/feature.py +53 -0
  8. feature_store/entities/feature_column_info.py +64 -0
  9. feature_store/entities/feature_function.py +55 -0
  10. feature_store/entities/feature_lookup.py +179 -0
  11. feature_store/entities/feature_spec.py +454 -0
  12. feature_store/entities/feature_spec_constants.py +25 -0
  13. feature_store/entities/feature_table.py +164 -0
  14. feature_store/entities/feature_table_info.py +40 -0
  15. feature_store/entities/function_info.py +184 -0
  16. feature_store/entities/on_demand_column_info.py +44 -0
  17. feature_store/entities/source_data_column_info.py +21 -0
  18. feature_store/entities/training_set.py +134 -0
  19. feature_store/feature_table_client/__init__.py +0 -0
  20. feature_store/feature_table_client/feature_table_client.py +313 -0
  21. feature_store/spark_client/__init__.py +0 -0
  22. feature_store/spark_client/spark_client.py +286 -0
  23. feature_store/training_set_client/__init__.py +0 -0
  24. feature_store/training_set_client/training_set_client.py +196 -0
  25. {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/METADATA +1 -1
  26. wedata_feature_engineering-0.1.2.dist-info/RECORD +30 -0
  27. wedata_feature_engineering-0.1.0.dist-info/RECORD +0 -6
  28. {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/WHEEL +0 -0
  29. {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/top_level.txt +0 -0
feature_store/entities/feature_spec.py
@@ -0,0 +1,454 @@
+ import os
+ from typing import Any, Dict, List, Optional, Type, Union
+
+ import mlflow
+ from databricks.sdk.service.catalog import FunctionInfo
+ from google.protobuf.json_format import MessageToDict, ParseDict
+ from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
+
+ from feature_store.entities.column_info import ColumnInfo
+ from feature_store.entities.feature_column_info import FeatureColumnInfo
+ from feature_store.entities.feature_spec_constants import (
+     BOUND_TO,
+     DATA_TYPE,
+     FEATURE_COLUMN_INFO,
+     FEATURE_STORE,
+     INCLUDE,
+     INPUT_BINDINGS,
+     INPUT_COLUMNS,
+     INPUT_FUNCTIONS,
+     INPUT_TABLES,
+     NAME,
+     ON_DEMAND_COLUMN_INFO,
+     ON_DEMAND_FEATURE,
+     OUTPUT_NAME,
+     PARAMETER,
+     SERIALIZATION_VERSION,
+     SOURCE,
+     SOURCE_DATA_COLUMN_INFO,
+     TABLE_NAME,
+     TOPOLOGICAL_ORDERING,
+     TRAINING_DATA,
+     UDF_NAME,
+ )
+ from feature_store.entities.feature_table_info import FeatureTableInfo
+ from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+ from feature_store.utils import common_utils
+
+ # Change log for the serialization version. Please update this for each new serialization version.
+ # 1. Initial.
+ # 2. (2021/06/16): Record feature_store_client_version to help us make backward-compatible changes in the future.
+ # 3. (2021/08/25): Record table_id to keep feature table lineage stable when tables are deleted.
+ # 4. (2021/09/25): Record timestamp_lookup_key to handle point-in-time lookups.
+ # 5. (2022/02/15): Record the include flag for column info when False.
+ #    Record input functions as FunctionInfo and function computation as OnDemandColumnInfo.
+ #    Remove redundant fields: table_name from table_infos, output_name from column_infos.
+ # 6. (2023/04/21): Record lookback_window in table info for point-in-time lookups.
+ # 7. (2023/05/05): Record the Spark data type for all columns to track model signatures.
+ # 8. (2023/08/14): Record the topological_ordering for all columns to support chained transform and lookup.
+ # 9. (2023/09/11): Change the type of lookback_window from int to double to allow sub-second values.
+
+
+ class FeatureSpec:
+
+     FEATURE_ARTIFACT_FILE = "feature_spec.yaml"
+     SERIALIZATION_VERSION_NUMBER = 9
+
+     def __init__(
+         self,
+         column_infos: List[ColumnInfo],
+         table_infos: List[FeatureTableInfo],
+         function_infos: List[FunctionInfo],
+         workspace_id: Optional[int] = None,
+         feature_store_client_version: Optional[str] = None,
+         serialization_version: Optional[int] = None,
+     ):
+         self._column_infos = column_infos
+         self._table_infos = table_infos
+         self._function_infos = function_infos
+         self._workspace_id = workspace_id
+         # The Feature Store Python client version that wrote this FeatureSpec.
+         # If empty, the client version is <=0.3.1.
+         self._feature_store_client_version = feature_store_client_version
+         self._serialization_version = serialization_version
+
+         # Perform validations
+         self._validate_column_infos()
+         self._validate_table_infos()
+         self._validate_function_infos()
+
+     def _validate_column_infos(self):
+         if not self.column_infos:
+             raise ValueError("column_infos must be non-empty.")
+
+         for column_info in self.column_infos:
+             if not isinstance(column_info, ColumnInfo):
+                 raise ValueError(
+                     f"Expected all elements of column_infos to be instances of ColumnInfo. "
+                     f"'{column_info}' is of the wrong type."
+                 )
+             if (
+                 self._serialization_version >= 8
+                 and column_info.topological_ordering is not None
+             ):
+                 ordering = column_info.topological_ordering
+                 if not isinstance(ordering, int) or ordering < 0:
+                     raise ValueError(
+                         "The topological_ordering of column_info must be a non-negative integer."
+                     )
+
+     def _validate_table_infos(self):
+         if self.table_infos is None:
+             raise ValueError("Internal Error: table_infos must be provided.")
+
+         # table_infos should not be duplicated
+         common_utils.validate_strings_unique(
+             [table_info.table_name for table_info in self.table_infos],
+             "Internal Error: Expect all table_names in table_infos to be unique. Found duplicates {}",
+         )
+
+         # Starting FeatureSpec v3, unique table names in table_infos must match those in column_infos.
+         if self.serialization_version >= 3:
+             unique_table_names = set(
+                 [table_info.table_name for table_info in self.table_infos]
+             )
+             unique_column_table_names = set(
+                 [fci.table_name for fci in self.feature_column_infos]
+             )
+             if unique_table_names != unique_column_table_names:
+                 raise Exception(
+                     f"Internal Error: table_names from table_infos {sorted(unique_table_names)} "
+                     f"must match those from column_infos {sorted(unique_column_table_names)}"
+                 )
+
+     def _validate_function_infos(self):
+         if self.function_infos is None:
+             raise ValueError("Internal Error: function_infos must be provided.")
+
+         # function_infos should not be duplicated
+         common_utils.validate_strings_unique(
+             [function_info.udf_name for function_info in self.function_infos],
+             "Internal Error: Expect all udf_names in function_infos to be unique. Found duplicates {}",
+         )
+
+         # Unique UDF names in function_infos must match those in column_infos.
+         # No version check is required as both fields were added simultaneously in FeatureSpec v5.
+         unique_udf_names = set(
+             [function_info.udf_name for function_info in self.function_infos]
+         )
+         unique_column_udf_names = set(
+             [odci.udf_name for odci in self.on_demand_column_infos]
+         )
+         if unique_udf_names != unique_column_udf_names:
+             raise Exception(
+                 f"Internal Error: udf_names from function_infos {sorted(unique_udf_names)} "
+                 f"must match those from column_infos {sorted(unique_column_udf_names)}"
+             )
+
+     @property
+     def column_infos(self):
+         return self._column_infos
+
+     @property
+     def table_infos(self):
+         return self._table_infos
+
+     @property
+     def function_infos(self):
+         return self._function_infos
+
+     @property
+     def workspace_id(self):
+         return self._workspace_id
+
+     @property
+     def feature_column_infos(self) -> List[FeatureColumnInfo]:
+         return self._get_infos_of_type(FeatureColumnInfo)
+
+     @property
+     def on_demand_column_infos(self) -> List[OnDemandColumnInfo]:
+         return self._get_infos_of_type(OnDemandColumnInfo)
+
+     @property
+     def serialization_version(self) -> int:
+         return self._serialization_version
+
+     def _get_infos_of_type(
+         self,
+         info_type: Union[
+             Type[SourceDataColumnInfo],
+             Type[FeatureColumnInfo],
+             Type[OnDemandColumnInfo],
+         ],
+     ):
+         """
+         Helper method to return the ColumnInfo.info subinfo field based on its type.
+         """
+         return [
+             column_info.info
+             for column_info in self.column_infos
+             if isinstance(column_info.info, info_type)
+         ]
+
+     @classmethod
+     def from_proto(cls, feature_spec_proto):
+         # Serialization version is not deserialized from the proto as there is currently only one
+         # possible version.
+         column_infos = [
+             ColumnInfo.from_proto(column_info_proto)
+             for column_info_proto in feature_spec_proto.input_columns
+         ]
+         table_infos = [
+             FeatureTableInfo.from_proto(table_info_proto)
+             for table_info_proto in feature_spec_proto.input_tables
+         ]
+         function_infos = [
+             FunctionInfo.from_proto(function_info_proto)
+             for function_info_proto in feature_spec_proto.input_functions
+         ]
+         return cls(
+             column_infos=column_infos,
+             table_infos=table_infos,
+             function_infos=function_infos,
+             workspace_id=feature_spec_proto.workspace_id,
+             feature_store_client_version=feature_spec_proto.feature_store_client_version,
+             serialization_version=feature_spec_proto.serialization_version,
+         )
+
+     @staticmethod
+     def _input_columns_proto_to_yaml_dict(column_info: Dict[str, Any]):
+         """
+         Converts a single ColumnInfo's proto dict to the expected element in the FeatureSpec YAML's input_columns.
+         To keep the YAML clean, unnecessary fields are removed (e.g. the SourceDataColumnInfo.name field, and ColumnInfo.include when True).
+
+         Example of a column_info transformation. Note that the "name" and "include" attributes are excluded:
+         {"source_data_column_info": {"name": "source_column"}, "include": True} -> {"source_column": {"source": "training_data"}}
+
+         The order of elements in the YAML dict should be:
+         1. Attributes present in ColumnInfo.info, using the proto field order
+         2. Remaining attributes of ColumnInfo, using the proto field order
+         3. Feature Store source type
+         """
+         # Parse the oneof field ColumnInfo.info's attributes as column_info_attributes; record column_name, source
+         if SOURCE_DATA_COLUMN_INFO in column_info:
+             column_info_attributes = column_info[SOURCE_DATA_COLUMN_INFO]
+             # pop the NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+             column_name, source = column_info_attributes.pop(NAME), TRAINING_DATA
+         elif FEATURE_COLUMN_INFO in column_info:
+             column_info_attributes = column_info[FEATURE_COLUMN_INFO]
+             # pop the OUTPUT_NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+             column_name, source = column_info_attributes.pop(OUTPUT_NAME), FEATURE_STORE
+         elif ON_DEMAND_COLUMN_INFO in column_info:
+             column_info_attributes = column_info[ON_DEMAND_COLUMN_INFO]
+             # Map the InputBindings message dictionary to a {parameter: bound_to} KV dictionary, if defined
+             if INPUT_BINDINGS in column_info_attributes:
+                 column_info_attributes[INPUT_BINDINGS] = {
+                     ib[PARAMETER]: ib[BOUND_TO]
+                     for ib in column_info_attributes[INPUT_BINDINGS]
+                 }
+             # pop the OUTPUT_NAME attribute and use it as the YAML key for this column_info to avoid redundancy in the YAML
+             column_name, source = (
+                 column_info_attributes.pop(OUTPUT_NAME),
+                 ON_DEMAND_FEATURE,
+             )
+         else:
+             raise ValueError(
+                 f"Expected column_info to be keyed by a valid ColumnInfo.info type. "
+                 f"'{column_info}' has key '{list(column_info)[0]}'."
+             )
+
+         # Parse and insert ColumnInfo level attributes.
+         # Note: the ordering of fields in the resulting YAML file is undefined, but in practice they
+         # appear in the same order as they are added to the column_info_attributes dict.
+
+         # DATA_TYPE is supported starting with FeatureSpec v7 and is not guaranteed to exist.
+         if DATA_TYPE in column_info:
+             column_info_attributes[DATA_TYPE] = column_info[DATA_TYPE]
+         if not column_info[INCLUDE]:
+             column_info_attributes[INCLUDE] = False
+         # TOPOLOGICAL_ORDERING is supported starting with FeatureSpec v8.
+         if TOPOLOGICAL_ORDERING in column_info:
+             column_info_attributes[TOPOLOGICAL_ORDERING] = column_info[
+                 TOPOLOGICAL_ORDERING
+             ]
+
+         # Insert source; return the YAML entry keyed by column_name
+         column_info_attributes[SOURCE] = source
+         return {column_name: column_info_attributes}
+
+     def _to_dict(self):
+         """
+         Convert this FeatureSpec to a writeable YAML artifact. Uses MessageToDict to convert the FeatureSpec proto to a dict.
+         Sanitizes and modifies the dict as follows:
+         1. Removes redundant or unnecessary information, for cleanliness in the YAML
+         2. Reshapes the dict to the format {column_name: column_attributes_dict}
+
+         :return: Sanitized FeatureSpec dictionary of {column_name: column_attributes}
+         """
+         yaml_dict = MessageToDict(self.to_proto(), preserving_proto_field_name=True)
+         yaml_dict[INPUT_COLUMNS] = [
+             self._input_columns_proto_to_yaml_dict(column_info)
+             for column_info in yaml_dict[INPUT_COLUMNS]
+         ]
+
+         if INPUT_TABLES in yaml_dict:
+             # pop the TABLE_NAME attribute and use it as the YAML key for each table_info to avoid redundancy in the YAML
+             yaml_dict[INPUT_TABLES] = [
+                 {table_info.pop(TABLE_NAME): table_info}
+                 for table_info in yaml_dict[INPUT_TABLES]
+             ]
+         if INPUT_FUNCTIONS in yaml_dict:
+             # pop the UDF_NAME attribute and use it as the YAML key for each function_info to avoid redundancy in the YAML
+             yaml_dict[INPUT_FUNCTIONS] = [
+                 {function_info.pop(UDF_NAME): function_info}
+                 for function_info in yaml_dict[INPUT_FUNCTIONS]
+             ]
+
+         # For readability, place SERIALIZATION_VERSION last in the dictionary.
+         yaml_dict[SERIALIZATION_VERSION] = yaml_dict.pop(SERIALIZATION_VERSION)
+         return yaml_dict
+
+     def save(self, path: str):
+         """
+         Convert the spec to a YAML artifact and store it at the given `path` location.
+
+         :param path: Root path where the YAML artifact is to be stored.
+         :return: None
+         """
+         write_yaml(
+             root=path,
+             file_name=self.FEATURE_ARTIFACT_FILE,
+             data=self._to_dict(),
+             sort_keys=False,
+         )
+
+     @staticmethod
+     def _input_columns_yaml_to_proto_dict(column_info: Dict[str, Any]):
+         """
+         Convert a FeatureSpec YAML column dictionary to the expected ColumnInfo proto dictionary.
+
+         Example of a column_info transformation:
+         {"source_column": {"source": "training_data"}} -> {"source_data_column_info": {"name": "source_column"}}
+         """
+         if len(column_info) != 1:
+             raise ValueError(
+                 f"Expected column_info dictionary to only have one key-value pair. "
+                 f"'{column_info}' has length {len(column_info)}."
+             )
+         column_name, column_data = list(column_info.items())[0]
+         if not column_data:
+             raise ValueError(
+                 f"Expected the values of the '{column_name}' dictionary to be non-empty."
+             )
+         if SOURCE not in column_data:
+             raise ValueError(
+                 f"Expected the values of the column_info dictionary to include the source. No source found "
+                 f"for '{column_name}'."
+             )
+
+         # Parse the oneof field ColumnInfo.info's attributes
+         source = column_data.pop(SOURCE)
+         if source == TRAINING_DATA:
+             column_data[NAME] = column_name
+             column_info_dict = {SOURCE_DATA_COLUMN_INFO: column_data}
+         elif source == FEATURE_STORE:
+             column_data[OUTPUT_NAME] = column_name
+             column_info_dict = {FEATURE_COLUMN_INFO: column_data}
+         elif source == ON_DEMAND_FEATURE:
+             column_data[OUTPUT_NAME] = column_name
+             # Map the {parameter_val: bound_to_val} dictionary to a list of InputBindings(parameter, bound_to) message dictionaries.
+             column_data[INPUT_BINDINGS] = [
+                 {PARAMETER: parameter, BOUND_TO: bound_to}
+                 for parameter, bound_to in column_data.get(INPUT_BINDINGS, {}).items()
+             ]
+             column_info_dict = {ON_DEMAND_COLUMN_INFO: column_data}
+         else:
+             raise ValueError(
+                 f"Internal Error: Expected column_info to have a source matching oneof ColumnInfo.info. "
+                 f"'{column_info}' has source of '{source}'."
+             )
+
+         # Parse ColumnInfo level attributes.
+         # TOPOLOGICAL_ORDERING is supported starting with FeatureSpec v8.
+         if TOPOLOGICAL_ORDERING in column_data:
+             column_info_dict[TOPOLOGICAL_ORDERING] = column_data.pop(
+                 TOPOLOGICAL_ORDERING
+             )
+         # DATA_TYPE is supported starting with FeatureSpec v7 and is not guaranteed to exist.
+         if DATA_TYPE in column_data:
+             column_info_dict[DATA_TYPE] = column_data.pop(DATA_TYPE)
+         # INCLUDE is supported starting with FeatureSpec v5 and is only present in the YAML when INCLUDE = False
+         if INCLUDE in column_data:
+             column_info_dict[INCLUDE] = column_data.pop(INCLUDE)
+         return column_info_dict
+
+     # @classmethod
+     # def _from_dict(cls, spec_dict):
+     #     """
+     #     Convert a YAML artifact to a FeatureSpec. Transforms the YAML artifact to a dict keyed by
+     #     source_data_column_info or feature_column_info, such that ParseDict can convert the dict to
+     #     a proto message, and from_proto can convert the proto message to a FeatureSpec object.
+     #     :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+     #     """
+     #     if INPUT_COLUMNS not in spec_dict:
+     #         raise ValueError(
+     #             f"{INPUT_COLUMNS} must be a key in {cls.FEATURE_ARTIFACT_FILE}."
+     #         )
+     #     if not spec_dict[INPUT_COLUMNS]:
+     #         raise ValueError(
+     #             f"{INPUT_COLUMNS} in {cls.FEATURE_ARTIFACT_FILE} must be non-empty."
+     #         )
+     #     spec_dict[INPUT_COLUMNS] = [
+     #         cls._input_columns_yaml_to_proto_dict(column_info)
+     #         for column_info in spec_dict[INPUT_COLUMNS]
+     #     ]
+     #
+     #     # feature_spec.yaml doesn't include input_tables, input_functions if either of these is true:
+     #     # 1. The YAML was written by an older client that does not support the functionality.
+     #     # 2. The FeatureSpec does not contain FeatureLookups (input_tables) or FeatureFunctions (input_functions).
+     #     input_tables = []
+     #     for input_table in spec_dict.get(INPUT_TABLES, []):
+     #         table_name, attributes = list(input_table.items())[0]
+     #         input_tables.append({TABLE_NAME: table_name, **attributes})
+     #     spec_dict[INPUT_TABLES] = input_tables
+     #
+     #     input_functions = []
+     #     for input_function in spec_dict.get(INPUT_FUNCTIONS, []):
+     #         udf_name, attributes = list(input_function.items())[0]
+     #         input_functions.append({UDF_NAME: udf_name, **attributes})
+     #     spec_dict[INPUT_FUNCTIONS] = input_functions
+     #
+     #     return cls.from_proto(
+     #         ParseDict(spec_dict, ProtoFeatureSpec(), ignore_unknown_fields=True)
+     #     )
+
+     @classmethod
+     def _read_file(cls, path: str):
+         """
+         Read the YAML artifact from a file path.
+         """
+         parent_dir, file = os.path.split(path)
+         spec_dict = read_yaml(parent_dir, file)
+         # Note: _from_dict is commented out above, so this call fails until it is restored.
+         return cls._from_dict(spec_dict)
+
+     @classmethod
+     def load(cls, path: str):
+         """
+         Load the FeatureSpec YAML artifact from the provided root directory (at path/feature_spec.yaml).
+
+         :param path: Root path to the YAML artifact. This can be an MLflow artifact path or a file path.
+         :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+         """
+         # Create the full file path to the FeatureSpec.
+         path = os.path.join(path, cls.FEATURE_ARTIFACT_FILE)
+
+         if common_utils.is_artifact_uri(path):
+             with TempDir() as tmp_location:
+                 # Returns a file and not a directory, since the artifact_uri is a single file.
+                 local_path = mlflow.artifacts.download_artifacts(
+                     artifact_uri=path, dst_path=tmp_location.path()
+                 )
+                 return FeatureSpec._read_file(local_path)
+         else:
+             return FeatureSpec._read_file(path)
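
For context, here is a minimal usage sketch of the round-trip this class implements (not part of the diff; paths and names are hypothetical). Note that because `_from_dict` is commented out in this release, `load` will fail until it is restored:

from feature_store.entities.feature_spec import FeatureSpec

# Read <root>/feature_spec.yaml; the root may be a file path or an MLflow artifact URI.
spec = FeatureSpec.load("/dbfs/models/my_model")
print(spec.serialization_version)            # 9 for specs written by this client version
for column_info in spec.column_infos:
    print(type(column_info.info).__name__)   # SourceDataColumnInfo, FeatureColumnInfo, ...

# Write <root>/feature_spec.yaml, with columns keyed by name, e.g. (mirroring the
# docstring example above):
#   input_columns:
#   - source_column:
#       source: training_data
#   serialization_version: 9
spec.save("/tmp/exported_model")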
feature_store/entities/feature_spec_constants.py
@@ -0,0 +1,25 @@
+ # Field names from feature_spec.proto.
+ SOURCE_DATA_COLUMN_INFO = "source_data_column_info"
+ FEATURE_COLUMN_INFO = "feature_column_info"
+ ON_DEMAND_COLUMN_INFO = "on_demand_column_info"
+ INPUT_COLUMNS = "input_columns"
+ NAME = "name"
+ OUTPUT_NAME = "output_name"
+ INPUT_TABLES = "input_tables"
+ TABLE_NAME = "table_name"
+ TABLE_ID = "table_id"
+ SERIALIZATION_VERSION = "serialization_version"
+ INPUT_FUNCTIONS = "input_functions"
+ INCLUDE = "include"
+ DATA_TYPE = "data_type"
+ TOPOLOGICAL_ORDERING = "topological_ordering"
+ UDF_NAME = "udf_name"
+ INPUT_BINDINGS = "input_bindings"
+ PARAMETER = "parameter"
+ BOUND_TO = "bound_to"
+
+ # FeatureSpec YAML source field and its allowed values
+ SOURCE = "source"
+ TRAINING_DATA = "training_data"
+ FEATURE_STORE = "feature_store"
+ ON_DEMAND_FEATURE = "on_demand_feature"
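
These field names drive the YAML-to-proto-dict conversion in feature_spec.py. The following self-contained sketch mirrors the docstring example from that module (the column name is hypothetical):

from feature_store.entities.feature_spec_constants import (
    NAME,
    SOURCE,
    SOURCE_DATA_COLUMN_INFO,
    TRAINING_DATA,
)

# Proto-style dict, keyed by the oneof field name from feature_spec.proto.
proto_dict = {SOURCE_DATA_COLUMN_INFO: {NAME: "source_column"}}

# YAML-style entry: keyed by the column name, with an explicit source field.
attributes = dict(proto_dict[SOURCE_DATA_COLUMN_INFO])
column_name = attributes.pop(NAME)
attributes[SOURCE] = TRAINING_DATA
assert {column_name: attributes} == {"source_column": {"source": "training_data"}}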
feature_store/entities/feature_table.py
@@ -0,0 +1,164 @@
+ from typing import Dict
+
+
+ class FeatureTable:
+     """
+     .. note::
+
+        Aliases: `!databricks.feature_engineering.entities.feature_table.FeatureTable`, `!databricks.feature_store.entities.feature_table.FeatureTable`
+
+     Value class describing one feature table.
+
+     This will typically not be instantiated directly; instead,
+     :meth:`create_table() <databricks.feature_engineering.client.FeatureEngineeringClient.create_table>`
+     will create :class:`.FeatureTable` objects.
+     """
+
+     def __init__(
+         self,
+         name,
+         table_id,
+         description,
+         primary_keys,
+         partition_columns,
+         features,
+         creation_timestamp=None,
+         online_stores=None,
+         notebook_producers=None,
+         job_producers=None,
+         table_data_sources=None,
+         path_data_sources=None,
+         custom_data_sources=None,
+         timestamp_keys=None,
+         tags=None,
+     ):
+         """Initialize a FeatureTable object."""
+         self.name = name
+         self.table_id = table_id
+         self.description = description
+         self.primary_keys = primary_keys
+         self.partition_columns = partition_columns
+         self.features = features
+         self.creation_timestamp = creation_timestamp
+         self.online_stores = online_stores if online_stores is not None else []
+         self.notebook_producers = (
+             notebook_producers if notebook_producers is not None else []
+         )
+         self.job_producers = job_producers if job_producers is not None else []
+         self.table_data_sources = (
+             table_data_sources if table_data_sources is not None else []
+         )
+         self.path_data_sources = (
+             path_data_sources if path_data_sources is not None else []
+         )
+         self.custom_data_sources = (
+             custom_data_sources if custom_data_sources is not None else []
+         )
+         self.timestamp_keys = timestamp_keys if timestamp_keys is not None else []
+         self._tags = tags
+
+     # @property
+     # @deprecated("FeatureTable.primary_keys", since="v0.3.6")
+     # def keys(self):
+     #     return self.primary_keys
+
+     @property
+     def tags(self) -> Dict[str, str]:
+         """
+         Get the tags associated with the feature table.
+
+         :return: A dictionary of all tags associated with the feature table, as key/value pairs
+         """
+         if self._tags is None:
+             # If no tags are set, self._tags is expected to be an empty dictionary.
+             raise ValueError(
+                 "Internal error: tags have not been fetched for this FeatureTable instance"
+             )
+         return self._tags
+
+     @classmethod
+     def from_uc_get_table_response(cls, uc_get_table_response: Dict[str, object]):
+         """Return a FeatureTable object from a UC get_table response. Note: UC does not return online_stores or tags.
+
+         :param dict uc_get_table_response: A dictionary representing a UC get_table response.
+         :return FeatureTable: a FeatureTable object built from the UC response.
+         """
+         table_name = uc_get_table_response["full_name"]
+
+         if uc_get_table_response["securable_kind"] == "TABLE_ONLINE_VIEW":
+             source_table = uc_get_table_response["properties_pairs"]["properties"][
+                 "source_table"
+             ]
+             raise ValueError(
+                 f"Table '{table_name}' is an online view. Online views are not feature tables. Please use the source table '{source_table}' instead."
+             )
+
+         if (
+             "table_type" in uc_get_table_response
+             and uc_get_table_response["table_type"] == "VIEW"
+         ):
+             return cls(
+                 name=table_name,
+                 table_id=uc_get_table_response["table_id"],
+                 description=uc_get_table_response.get("comment", ""),
+                 primary_keys=[],
+                 partition_columns=[],
+                 features=[],
+                 creation_timestamp=uc_get_table_response["created_at"],
+                 timestamp_keys=[],
+             )
+
+         table_constraints = uc_get_table_response.get("table_constraints", [])
+         primary_key_constraints = [
+             c for c in table_constraints if "primary_key_constraint" in c
+         ]
+         if len(primary_key_constraints) == 0:
+             raise ValueError(
+                 "Table can't be used as a feature table because it has no primary key constraint defined."
+                 + " Use 'ALTER TABLE table_name ADD CONSTRAINT table_name_pk PRIMARY KEY( key_column [,...] )'"
+                 + " to add a primary key constraint on the table."
+             )
+         primary_key_constraint = primary_key_constraints[0]["primary_key_constraint"]
+         timestamp_keys = primary_key_constraint.get("timeseries_columns", [])
+         primary_keys = [
+             c
+             for c in primary_key_constraint["child_columns"]
+             if c not in timestamp_keys
+         ]
+
+         columns = uc_get_table_response["columns"]
+         features = [c["name"] for c in columns]
+         partition_columns_unordered = [c for c in columns if "partition_index" in c]
+         partition_columns = [
+             c["name"]
+             for c in sorted(
+                 partition_columns_unordered, key=lambda x: x["partition_index"]
+             )
+         ]
+
+         return cls(
+             name=table_name,
+             table_id=uc_get_table_response["table_id"],
+             description=uc_get_table_response.get("comment", ""),
+             primary_keys=primary_keys,
+             partition_columns=partition_columns,
+             features=features,
+             creation_timestamp=uc_get_table_response["created_at"],
+             timestamp_keys=timestamp_keys,
+         )
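
A minimal sketch of exercising from_uc_get_table_response with a hand-built UC get_table response (all field values are illustrative, not from a real catalog):

from feature_store.entities.feature_table import FeatureTable

uc_response = {
    "full_name": "prod.ml.user_features",   # hypothetical table
    "securable_kind": "TABLE_DELTA",        # anything other than TABLE_ONLINE_VIEW
    "table_id": "abc-123",
    "created_at": 1718000000000,
    "comment": "Demo feature table",
    "table_constraints": [
        {
            "primary_key_constraint": {
                "child_columns": ["user_id", "ts"],
                "timeseries_columns": ["ts"],
            }
        }
    ],
    "columns": [
        {"name": "user_id"},
        {"name": "ts"},
        {"name": "region", "partition_index": 0},
    ],
}

table = FeatureTable.from_uc_get_table_response(uc_response)
assert table.primary_keys == ["user_id"]      # timestamp keys are excluded
assert table.timestamp_keys == ["ts"]
assert table.partition_columns == ["region"]  # ordered by partition_index
assert table.features == ["user_id", "ts", "region"]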
feature_store/entities/feature_table_info.py
@@ -0,0 +1,40 @@
+ from typing import Optional
+
+
+ class FeatureTableInfo:
+     def __init__(
+         self, table_name: str, table_id: str, lookback_window: Optional[float] = None
+     ):
+         if not table_name:
+             raise ValueError("table_name must be non-empty.")
+         if not table_id:
+             raise ValueError("table_id must be non-empty.")
+         self._table_name = table_name
+         self._table_id = table_id
+         self._lookback_window = lookback_window
+
+     @property
+     def table_name(self):
+         return self._table_name
+
+     @property
+     def table_id(self):
+         return self._table_id
+
+     @property
+     def lookback_window(self):
+         return self._lookback_window
+
+     @classmethod
+     def from_proto(cls, feature_table_info_proto):
+         lookback_window = (
+             feature_table_info_proto.lookback_window
+             if feature_table_info_proto.HasField("lookback_window")
+             else None
+         )
+         return cls(
+             table_name=feature_table_info_proto.table_name,
+             table_id=feature_table_info_proto.table_id,
+             lookback_window=lookback_window,
+         )
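
A short sketch of constructing this value class directly (names are illustrative; the unit of lookback_window is assumed to be seconds, per the sub-second note in the feature_spec.py change log):

from feature_store.entities.feature_table_info import FeatureTableInfo

info = FeatureTableInfo(
    table_name="prod.ml.user_features",  # hypothetical table name
    table_id="abc-123",
    lookback_window=0.5,                 # float since serialization v9 (was int)
)
assert info.lookback_window == 0.5

# Empty names are rejected at construction time.
try:
    FeatureTableInfo(table_name="", table_id="abc-123")
except ValueError as err:
    print(err)  # table_name must be non-empty.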