wedata-feature-engineering 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. feature_store/utils/__init__.py +0 -0
  2. feature_store/utils/common_utils.py +96 -0
  3. feature_store/utils/feature_lookup_utils.py +570 -0
  4. feature_store/utils/feature_spec_utils.py +286 -0
  5. feature_store/utils/feature_utils.py +73 -0
  6. feature_store/utils/schema_utils.py +117 -0
  7. feature_store/utils/topological_sort.py +158 -0
  8. feature_store/utils/training_set_utils.py +580 -0
  9. feature_store/utils/uc_utils.py +281 -0
  10. feature_store/utils/utils.py +252 -0
  11. feature_store/utils/validation_utils.py +55 -0
  12. wedata/__init__.py +6 -0
  13. wedata/feature_store/__init__.py +0 -0
  14. wedata/feature_store/client.py +169 -0
  15. wedata/feature_store/constants/__init__.py +0 -0
  16. wedata/feature_store/constants/constants.py +28 -0
  17. wedata/feature_store/entities/__init__.py +0 -0
  18. wedata/feature_store/entities/column_info.py +117 -0
  19. wedata/feature_store/entities/data_type.py +92 -0
  20. wedata/feature_store/entities/environment_variables.py +55 -0
  21. wedata/feature_store/entities/feature.py +53 -0
  22. wedata/feature_store/entities/feature_column_info.py +64 -0
  23. wedata/feature_store/entities/feature_function.py +55 -0
  24. wedata/feature_store/entities/feature_lookup.py +179 -0
  25. wedata/feature_store/entities/feature_spec.py +454 -0
  26. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  27. wedata/feature_store/entities/feature_table.py +164 -0
  28. wedata/feature_store/entities/feature_table_info.py +40 -0
  29. wedata/feature_store/entities/function_info.py +184 -0
  30. wedata/feature_store/entities/on_demand_column_info.py +44 -0
  31. wedata/feature_store/entities/source_data_column_info.py +21 -0
  32. wedata/feature_store/entities/training_set.py +134 -0
  33. wedata/feature_store/feature_table_client/__init__.py +0 -0
  34. wedata/feature_store/feature_table_client/feature_table_client.py +313 -0
  35. wedata/feature_store/spark_client/__init__.py +0 -0
  36. wedata/feature_store/spark_client/spark_client.py +286 -0
  37. wedata/feature_store/training_set_client/__init__.py +0 -0
  38. wedata/feature_store/training_set_client/training_set_client.py +196 -0
  39. wedata/feature_store/utils/__init__.py +0 -0
  40. wedata/feature_store/utils/common_utils.py +96 -0
  41. wedata/feature_store/utils/feature_lookup_utils.py +570 -0
  42. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  43. wedata/feature_store/utils/feature_utils.py +73 -0
  44. wedata/feature_store/utils/schema_utils.py +117 -0
  45. wedata/feature_store/utils/topological_sort.py +158 -0
  46. wedata/feature_store/utils/training_set_utils.py +580 -0
  47. wedata/feature_store/utils/uc_utils.py +281 -0
  48. wedata/feature_store/utils/utils.py +252 -0
  49. wedata/feature_store/utils/validation_utils.py +55 -0
  50. {wedata_feature_engineering-0.1.3.dist-info → wedata_feature_engineering-0.1.5.dist-info}/METADATA +1 -1
  51. wedata_feature_engineering-0.1.5.dist-info/RECORD +79 -0
  52. wedata_feature_engineering-0.1.5.dist-info/top_level.txt +1 -0
  53. wedata_feature_engineering-0.1.3.dist-info/RECORD +0 -30
  54. wedata_feature_engineering-0.1.3.dist-info/top_level.txt +0 -1
  55. {wedata_feature_engineering-0.1.3.dist-info → wedata_feature_engineering-0.1.5.dist-info}/WHEEL +0 -0
wedata/feature_store/entities/feature_table.py
@@ -0,0 +1,164 @@
+ from typing import Dict
+
+
+
+ class FeatureTable:
+     """
+     .. note::
+
+         Aliases: `!databricks.feature_engineering.entities.feature_table.FeatureTable`, `!databricks.feature_store.entities.feature_table.FeatureTable`
+
+     Value class describing one feature table.
+
+     This will typically not be instantiated directly; instead, the
+     :meth:`create_table() <databricks.feature_engineering.client.FeatureEngineeringClient.create_table>`
+     method will create :class:`.FeatureTable` objects.
+     """
+
+     def __init__(
+         self,
+         name,
+         table_id,
+         description,
+         primary_keys,
+         partition_columns,
+         features,
+         creation_timestamp=None,
+         online_stores=None,
+         notebook_producers=None,
+         job_producers=None,
+         table_data_sources=None,
+         path_data_sources=None,
+         custom_data_sources=None,
+         timestamp_keys=None,
+         tags=None,
+     ):
+         """Initialize a FeatureTable object."""
+         self.name = name
+         self.table_id = table_id
+         self.description = description
+         self.primary_keys = primary_keys
+         self.partition_columns = partition_columns
+         self.features = features
+         self.creation_timestamp = creation_timestamp
+         self.online_stores = online_stores if online_stores is not None else []
+         self.notebook_producers = (
+             notebook_producers if notebook_producers is not None else []
+         )
+         self.job_producers = job_producers if job_producers is not None else []
+         self.table_data_sources = (
+             table_data_sources if table_data_sources is not None else []
+         )
+         self.path_data_sources = (
+             path_data_sources if path_data_sources is not None else []
+         )
+         self.custom_data_sources = (
+             custom_data_sources if custom_data_sources is not None else []
+         )
+         self.timestamp_keys = timestamp_keys if timestamp_keys is not None else []
+         self._tags = tags
+
+     # @property
+     # @deprecated("FeatureTable.primary_keys", since="v0.3.6")
+     # def keys(self):
+     #     return self.primary_keys
+
+     @property
+     def tags(self) -> Dict[str, str]:
+         """
+         Get the tags associated with the feature table.
+
+         :return: a dictionary of all tags associated with the feature table as key/value pairs
+         """
+         if self._tags is None:
+             # If no tags are set, self._tags is expected to be an empty dictionary.
+             raise ValueError(
+                 "Internal error: tags have not been fetched for this FeatureTable instance"
+             )
+         return self._tags
+
+
+     @classmethod
+     def from_uc_get_table_response(cls, uc_get_table_response: Dict[str, object]):
+         """Return a FeatureTable object from a UC get_table response. Note: UC does not return online_stores or tags.
+
+         :param dict uc_get_table_response: A dictionary representing a UC get_table response.
+         :return FeatureTable: a FeatureTable object built from the UC response.
+         """
+         table_name = uc_get_table_response["full_name"]
+
+         if uc_get_table_response["securable_kind"] == "TABLE_ONLINE_VIEW":
+             source_table = uc_get_table_response["properties_pairs"]["properties"][
+                 "source_table"
+             ]
+             raise ValueError(
+                 f"Table '{table_name}' is an online view. Online views are not feature tables. Please use the source table '{source_table}' instead."
+             )
+
+         if (
+             "table_type" in uc_get_table_response
+             and uc_get_table_response["table_type"] == "VIEW"
+         ):
+             return cls(
+                 name=table_name,
+                 table_id=uc_get_table_response["table_id"],
+                 description=uc_get_table_response["comment"]
+                 if "comment" in uc_get_table_response
+                 else "",
+                 primary_keys=[],
+                 partition_columns=[],
+                 features=[],
+                 creation_timestamp=uc_get_table_response["created_at"],
+                 timestamp_keys=[],
+             )
+
+         table_constraints = (
+             uc_get_table_response["table_constraints"]
+             if "table_constraints" in uc_get_table_response
+             else []
+         )
+         primary_key_constraints = [
+             c for c in table_constraints if "primary_key_constraint" in c
+         ]
+         if len(primary_key_constraints) == 0:
+             raise ValueError(
+                 "Table can't be used as a feature table because it has no primary key constraint defined."
+                 + " Use 'ALTER TABLE table_name ADD CONSTRAINT table_name_pk PRIMARY KEY( key_column [,...] )'"
+                 + " to add a primary key constraint on the table."
+             )
+         primary_key_constraint = primary_key_constraints[0][
+             "primary_key_constraint"
+         ]
+         timestamp_keys = (
+             primary_key_constraint["timeseries_columns"]
+             if "timeseries_columns" in primary_key_constraint
+             else []
+         )
+         primary_keys = [
+             c
+             for c in primary_key_constraint["child_columns"]
+             if c not in timestamp_keys
+         ]
+
+         columns = uc_get_table_response["columns"]
+         features = [c["name"] for c in columns]
+         partition_columns_unordered = [c for c in columns if "partition_index" in c]
+         partition_columns = [
+             c["name"]
+             for c in sorted(
+                 partition_columns_unordered, key=lambda x: x["partition_index"]
+             )
+         ]
+
+         return cls(
+             name=table_name,
+             table_id=uc_get_table_response["table_id"],
+             description=uc_get_table_response["comment"]
+             if "comment" in uc_get_table_response
+             else "",
+             primary_keys=primary_keys,
+             partition_columns=partition_columns,
+             features=features,
+             creation_timestamp=uc_get_table_response["created_at"],
+             timestamp_keys=timestamp_keys,
+         )
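
For orientation, a minimal sketch of driving `from_uc_get_table_response` by hand. The response dict below is a hypothetical example shaped like the fields the method reads, not actual UC output:

    # Hypothetical UC get_table payload; only fields read by from_uc_get_table_response.
    response = {
        "full_name": "prod.ml.user_features",
        "table_id": "abc-123",
        "securable_kind": "TABLE",
        "comment": "Per-user features",
        "created_at": 1700000000000,
        "table_constraints": [
            {"primary_key_constraint": {"child_columns": ["user_id", "ts"], "timeseries_columns": ["ts"]}}
        ],
        "columns": [{"name": "user_id"}, {"name": "ts"}, {"name": "age", "partition_index": 0}],
    }
    ft = FeatureTable.from_uc_get_table_response(response)
    # Timeseries columns are split out of the PK; partition columns are ordered by partition_index.
    assert ft.primary_keys == ["user_id"] and ft.timestamp_keys == ["ts"] and ft.partition_columns == ["age"]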
wedata/feature_store/entities/feature_table_info.py
@@ -0,0 +1,40 @@
+ from typing import Optional
+
+
+
+ class FeatureTableInfo:
+     def __init__(
+         self, table_name: str, table_id: str, lookback_window: Optional[float] = None
+     ):
+         if not table_name:
+             raise ValueError("table_name must be non-empty.")
+         if not table_id:
+             raise ValueError("table_id must be non-empty.")
+         self._table_name = table_name
+         self._table_id = table_id
+         self._lookback_window = lookback_window
+
+     @property
+     def table_name(self):
+         return self._table_name
+
+     @property
+     def table_id(self):
+         return self._table_id
+
+     @property
+     def lookback_window(self):
+         return self._lookback_window
+
+     @classmethod
+     def from_proto(cls, feature_table_info_proto):
+         lookback_window = (
+             feature_table_info_proto.lookback_window
+             if feature_table_info_proto.HasField("lookback_window")
+             else None
+         )
+         return cls(
+             table_name=feature_table_info_proto.table_name,
+             table_id=feature_table_info_proto.table_id,
+             lookback_window=lookback_window,
+         )
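
A quick constructor check (hypothetical values), showing the non-empty validation and the optional lookback window defaulting to None:

    info = FeatureTableInfo(table_name="prod.ml.user_features", table_id="abc-123")
    print(info.table_name, info.lookback_window)  # prod.ml.user_features None

    try:
        FeatureTableInfo(table_name="", table_id="abc-123")
    except ValueError as err:
        print(err)  # table_name must be non-empty.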
wedata/feature_store/entities/function_info.py
@@ -0,0 +1,184 @@
+ from collections import defaultdict
+ from typing import List, Optional
+
+ from pyspark.sql import Column, DataFrame
+ from pyspark.sql.functions import isnull, when
+ from pyspark.sql.types import StringType, StructField, StructType
+
+ class FunctionParameterInfo:
+     def __init__(self, name: str, type_text: str):
+         self._name = name
+         self._type_text = type_text
+
+     @property
+     def name(self) -> str:
+         return self._name
+
+     @property
+     def type_text(self) -> str:
+         return self._type_text
+
+     @classmethod
+     def from_dict(cls, function_parameter_info_json):
+         return FunctionParameterInfo(
+             function_parameter_info_json["name"],
+             function_parameter_info_json["type_text"],
+         )
+
+
+ class FunctionInfo:
+     """
+     Helper entity class that exposes properties in GetFunction's response JSON as attributes.
+     https://docs.databricks.com/api-explorer/workspace/functions/get
+
+     Note: empty fields (e.g. when there are 0 input parameters) are not included in the response JSON.
+     """
+
+     # Python UDFs have external_language = "Python"
+     PYTHON = "Python"
+
+     def __init__(
+         self,
+         full_name: str,
+         input_params: List[FunctionParameterInfo],
+         routine_definition: Optional[str],
+         external_language: Optional[str],
+     ):
+         self._full_name = full_name
+         self._input_params = input_params
+         self._routine_definition = routine_definition
+         self._external_language = external_language
+
+     @property
+     def full_name(self) -> str:
+         return self._full_name
+
+     @property
+     def input_params(self) -> List[FunctionParameterInfo]:
+         return self._input_params
+
+     @property
+     def routine_definition(self) -> Optional[str]:
+         return self._routine_definition
+
+     @property
+     def external_language(self) -> Optional[str]:
+         """
+         Field is None if the language is SQL (i.e. not an external language).
+         """
+         return self._external_language
+
+     @classmethod
+     def from_dict(cls, function_info_json):
+         input_params = function_info_json.get("input_params", {}).get("parameters", [])
+         return FunctionInfo(
+             full_name=function_info_json["full_name"],
+             input_params=[FunctionParameterInfo.from_dict(p) for p in input_params],
+             routine_definition=function_info_json.get("routine_definition", None),
+             external_language=function_info_json.get("external_language", None),
+         )
+
+
+ class InformationSchemaSparkClient:
+     """
+     Internal client to retrieve Unity Catalog metadata from system.information_schema.
+     https://docs.databricks.com/sql/language-manual/sql-ref-information-schema.html
+     """
+
+     def _get_routines_with_parameters(self, full_routine_names: List[str]) -> DataFrame:
+         """
+         Retrieve routines with their parameters from information_schema.routines and information_schema.parameters.
+         The returned DataFrame contains only routines that (1) exist and (2) the caller has GetFunction permission on.
+
+         Note: the returned DataFrame contains the cartesian product of routines and parameters.
+         For efficiency, routines table columns are only present in the first row for each routine.
+         """
+         routine_name_schema = StructType(
+             [
+                 StructField("specific_catalog", StringType(), False),
+                 StructField("specific_schema", StringType(), False),
+                 StructField("specific_name", StringType(), False),
+             ]
+         )
+         routine_names_df = self._spark_client.createDataFrame(
+             [full_routine_name.split(".") for full_routine_name in full_routine_names],
+             routine_name_schema,
+         )
+         routines_table = self._spark_client.read_table(
+             "system.information_schema.routines"
+         )
+         parameters_table = self._spark_client.read_table(
+             "system.information_schema.parameters"
+         )
+
+         # Inner join the routines table to filter out non-existent routines.
+         # Left join parameters, as routines may have no parameters.
+         full_routines_with_parameters_df = routine_names_df.join(
+             routines_table, on=routine_names_df.columns, how="inner"
+         ).join(parameters_table, on=routine_names_df.columns, how="left")
+
+         # Return only relevant metadata from information_schema, sorted by routine name + parameter order.
+         # For efficiency, only preserve routine column values in the first of each routine's result rows.
+         # The first row has an ordinal_position that is either None (no parameters) or 0 (first parameter).
+         def select_if_first_row(col: Column) -> Column:
+             return when(
+                 isnull(parameters_table.ordinal_position)
+                 | (parameters_table.ordinal_position == 0),
+                 col,
+             ).otherwise(None)
+
+         return full_routines_with_parameters_df.select(
+             routine_names_df.columns
+             + [
+                 select_if_first_row(routines_table.routine_definition).alias(
+                     "routine_definition"
+                 ),
+                 select_if_first_row(routines_table.external_language).alias(
+                     "external_language"
+                 ),
+                 parameters_table.ordinal_position,
+                 parameters_table.parameter_name,
+                 parameters_table.full_data_type,
+             ]
+         ).sort(routine_names_df.columns + [parameters_table.ordinal_position])
+
+     def get_functions(self, full_function_names: List[str]) -> List[FunctionInfo]:
+         """
+         Retrieves and maps Unity Catalog functions' metadata to FunctionInfos.
+         """
+         # Avoid unnecessary Spark calls; return early if empty.
+         if not full_function_names:
+             return []
+
+         # Collect a dict of routine name -> DataFrame rows describing the routine.
+         routines_with_parameters_df = self._get_routines_with_parameters(
+             full_routine_names=full_function_names
+         )
+         routine_infos = defaultdict(list)
+         for r in routines_with_parameters_df.collect():
+             routine_name = f"{r.specific_catalog}.{r.specific_schema}.{r.specific_name}"
+             routine_infos[routine_name].append(r)
+
+         # Mimic GetFunction's does-not-exist error, since information_schema does not throw.
+         for function_name in full_function_names:
+             if function_name not in routine_infos:
+                 raise ValueError(f"Function '{function_name}' does not exist.")
+
+         # Map routine_infos into FunctionInfos.
+         function_infos = []
+         for function_name in full_function_names:
+             routine_info = routine_infos[function_name][0]
+             input_params = [
+                 FunctionParameterInfo(name=p.parameter_name, type_text=p.full_data_type)
+                 for p in routine_infos[function_name]
+                 if p.ordinal_position is not None
+             ]
+             function_infos.append(
+                 FunctionInfo(
+                     full_name=function_name,
+                     input_params=input_params,
+                     routine_definition=routine_info.routine_definition,
+                     external_language=routine_info.external_language,
+                 )
+             )
+         return function_infos
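
Two notes on this file. First, `InformationSchemaSparkClient` references `self._spark_client` but no constructor appears in this hunk, so a SparkClient-like object is presumably injected elsewhere (see spark_client.py in the file list). Second, `FunctionInfo.from_dict` can be exercised without Spark; the GetFunction-style payload below is a hypothetical example:

    payload = {
        "full_name": "prod.ml.add_features",
        "input_params": {"parameters": [{"name": "x", "type_text": "double"}, {"name": "y", "type_text": "double"}]},
        "routine_definition": "x + y",
        # "external_language" omitted: a SQL routine, so external_language stays None
    }
    fn = FunctionInfo.from_dict(payload)
    print(fn.full_name, [p.name for p in fn.input_params], fn.external_language)
    # prod.ml.add_features ['x', 'y'] None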
wedata/feature_store/entities/on_demand_column_info.py
@@ -0,0 +1,44 @@
+ from typing import Dict
+
+ class OnDemandColumnInfo:
+     def __init__(
+         self,
+         udf_name: str,
+         input_bindings: Dict[str, str],
+         output_name: str,
+     ):
+         if not udf_name:
+             raise ValueError("udf_name must be non-empty.")
+         if not output_name:
+             raise ValueError("output_name must be non-empty.")
+
+         self._udf_name = udf_name
+         self._input_bindings = input_bindings
+         self._output_name = output_name
+
+     @property
+     def udf_name(self) -> str:
+         return self._udf_name
+
+     @property
+     def input_bindings(self) -> Dict[str, str]:
+         """
+         input_bindings is serialized as the InputBindings proto message.
+         """
+         return self._input_bindings
+
+     @property
+     def output_name(self) -> str:
+         return self._output_name
+
+     @classmethod
+     def from_proto(cls, on_demand_column_info_proto):
+         input_bindings_dict = {
+             input_binding.parameter: input_binding.bound_to
+             for input_binding in on_demand_column_info_proto.input_bindings
+         }
+         return OnDemandColumnInfo(
+             udf_name=on_demand_column_info_proto.udf_name,
+             input_bindings=input_bindings_dict,
+             output_name=on_demand_column_info_proto.output_name,
+         )
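
Since `from_proto` needs a generated proto message, a direct-construction sketch with hypothetical values illustrates the binding semantics (UDF parameter name mapped to source column name):

    odci = OnDemandColumnInfo(
        udf_name="prod.ml.add_features",
        input_bindings={"x": "age", "y": "tenure"},  # parameter -> bound column
        output_name="age_plus_tenure",
    )
    print(odci.udf_name, odci.output_name)  # prod.ml.add_features age_plus_tenure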
wedata/feature_store/entities/source_data_column_info.py
@@ -0,0 +1,21 @@
+
+ class SourceDataColumnInfo:
+     def __init__(self, name: str):
+         if not name:
+             raise ValueError("name must be non-empty.")
+         self._name = name
+
+     @property
+     def name(self):
+         return self._name
+
+     @property
+     def output_name(self) -> str:
+         """
+         This field does not exist in the proto and is provided for convenience.
+         """
+         return self._name
+
+     @classmethod
+     def from_proto(cls, source_data_column_info_proto):
+         return cls(name=source_data_column_info_proto.name)
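
For completeness, the pass-through behavior of this value class (hypothetical column name): output_name is simply an alias for name.

    col = SourceDataColumnInfo("user_id")
    print(col.name, col.output_name)  # user_id user_id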
wedata/feature_store/entities/training_set.py
@@ -0,0 +1,134 @@
+ from typing import Dict, List, Optional
+
+ from pyspark.sql import DataFrame
+
+ from feature_store.entities.feature_table import FeatureTable
+ from feature_store.entities.function_info import FunctionInfo
+ from feature_store.utils.feature_lookup_utils import (
+     join_feature_data_if_not_overridden,
+ )
+
+ from feature_store.entities.feature_spec import FeatureSpec
+ from feature_store.utils.feature_spec_utils import (
+     COLUMN_INFO_TYPE_FEATURE,
+     COLUMN_INFO_TYPE_ON_DEMAND,
+     COLUMN_INFO_TYPE_SOURCE,
+     get_feature_execution_groups,
+ )
+
+
+ class TrainingSet:
+     """
+     .. note::
+
+         Aliases: `!databricks.feature_engineering.training_set.TrainingSet`, `!databricks.feature_store.training_set.TrainingSet`
+
+     Class that defines :obj:`TrainingSet` objects.
+
+     .. note::
+
+         The :class:`TrainingSet` constructor should not be called directly. Instead,
+         call :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+     """
+
+     def __init__(
+         self,
+         feature_spec: FeatureSpec,
+         df: DataFrame,
+         labels: List[str],
+         feature_table_metadata_map: Dict[str, FeatureTable],
+         feature_table_data_map: Dict[str, DataFrame],
+         uc_function_infos: Dict[str, FunctionInfo],
+         use_spark_native_join: Optional[bool] = False,
+     ):
+         """Initialize a :obj:`TrainingSet` object."""
+         assert isinstance(
+             labels, list
+         ), f"Expected type `list` for argument `labels`. Got '{labels}' with type '{type(labels)}'."
+
+         self._feature_spec = feature_spec
+         self._df = df
+         self._labels = labels
+         self._feature_table_metadata_map = feature_table_metadata_map
+         self._feature_table_data_map = feature_table_data_map
+         self._uc_function_infos = uc_function_infos
+         self._use_spark_native_join = use_spark_native_join
+         # Perform basic validations and resolve FeatureSpec and label column data types.
+         self._validate_and_inject_dtypes()
+         self._label_data_types = {
+             name: data_type for name, data_type in df.dtypes if name in labels
+         }
+
+     @property
+     def feature_spec(self) -> FeatureSpec:
+         """Return the feature spec of this training set."""
+         return self._feature_spec
+
+     def _augment_df(self) -> DataFrame:
+         """
+         Internal helper to augment the DataFrame with the feature lookups and on-demand features specified in the FeatureSpec.
+         Does not drop excluded columns, and does not overwrite columns that already exist.
+         The returned column order is df.columns + feature lookups + on-demand features.
+         """
+         execution_groups = get_feature_execution_groups(
+             self.feature_spec, self._df.columns
+         )
+
+         result_df = self._df
+         # Iterate over all levels and types of DAG nodes in the FeatureSpec and execute them.
+         for execution_group in execution_groups:
+             if execution_group.type == COLUMN_INFO_TYPE_SOURCE:
+                 continue
+             if execution_group.type == COLUMN_INFO_TYPE_FEATURE:
+                 # Apply FeatureLookups
+                 result_df = join_feature_data_if_not_overridden(
+                     feature_spec=self.feature_spec,
+                     df=result_df,
+                     features_to_join=execution_group.features,
+                     feature_table_metadata_map=self._feature_table_metadata_map,
+                     feature_table_data_map=self._feature_table_data_map,
+                     use_spark_native_join=self._use_spark_native_join,
+                 )
+             # elif execution_group.type == COLUMN_INFO_TYPE_ON_DEMAND:
+             #     # Apply all on-demand UDFs
+             #     result_df = apply_functions_if_not_overridden(
+             #         df=result_df,
+             #         functions_to_apply=execution_group.features,
+             #         uc_function_infos=self._uc_function_infos,
+             #     )
+             else:
+                 # This should never be reached.
+                 raise Exception(f"Unknown feature execution type: {execution_group.type}")
+         return result_df
+
+     def _validate_and_inject_dtypes(self):
+         """
+         Performs validations through _augment_df (e.g. the Delta table exists, Delta and feature table dtypes match),
+         then injects the result DataFrame dtypes into the FeatureSpec.
+         """
+         augmented_df = self._augment_df()
+         augmented_df_dtypes = {column: dtype for column, dtype in augmented_df.dtypes}
+
+         # Inject the result DataFrame column types into the respective ColumnInfo
+         for ci in self.feature_spec.column_infos:
+             ci._data_type = augmented_df_dtypes[ci.output_name]
+
+     def load_df(self) -> DataFrame:
+         """
+         Load a :class:`DataFrame <pyspark.sql.DataFrame>`.
+
+         Return a :class:`DataFrame <pyspark.sql.DataFrame>` for training.
+
+         The returned :class:`DataFrame <pyspark.sql.DataFrame>` has columns specified
+         in the ``feature_spec`` and ``labels`` parameters provided
+         in :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+
+         :return:
+             A :class:`DataFrame <pyspark.sql.DataFrame>` for training
+         """
+         augmented_df = self._augment_df()
+         # Return only the included columns, in the order defined by the FeatureSpec + labels.
+         included_columns = [
+             ci.output_name for ci in self.feature_spec.column_infos if ci.include
+         ] + self._labels
+         return augmented_df.select(included_columns)
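
In normal use a `TrainingSet` is produced by the client rather than constructed by hand. A minimal end-to-end sketch, assuming the wedata client mirrors the Databricks-style `create_training_set` API; the client constructor signature and all table, column, and label names here are hypothetical:

    from wedata.feature_store.client import FeatureStoreClient
    from wedata.feature_store.entities.feature_lookup import FeatureLookup

    fs = FeatureStoreClient()  # assumed constructor; an active SparkSession is expected
    training_set = fs.create_training_set(
        df=label_df,  # label_df: a DataFrame holding lookup keys plus the label column (assumed to exist)
        feature_lookups=[
            FeatureLookup(
                table_name="prod.ml.user_features",
                lookup_key="user_id",
                feature_names=["age", "tenure"],
            )
        ],
        label="churned",
    )
    train_df = training_set.load_df()  # joins features, then keeps included columns + label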