wedata-feature-engineering 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- wedata/__init__.py +1 -1
- wedata/feature_store/client.py +113 -41
- wedata/feature_store/constants/constants.py +19 -0
- wedata/feature_store/entities/column_info.py +4 -4
- wedata/feature_store/entities/feature_lookup.py +5 -1
- wedata/feature_store/entities/feature_spec.py +46 -46
- wedata/feature_store/entities/feature_table.py +42 -99
- wedata/feature_store/entities/training_set.py +13 -12
- wedata/feature_store/feature_table_client/feature_table_client.py +86 -31
- wedata/feature_store/spark_client/spark_client.py +30 -56
- wedata/feature_store/training_set_client/training_set_client.py +209 -38
- wedata/feature_store/utils/common_utils.py +213 -3
- wedata/feature_store/utils/feature_lookup_utils.py +6 -6
- wedata/feature_store/utils/feature_spec_utils.py +6 -6
- wedata/feature_store/utils/feature_utils.py +5 -5
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- wedata/feature_store/utils/schema_utils.py +1 -1
- wedata/feature_store/utils/signature_utils.py +205 -0
- wedata/feature_store/utils/training_set_utils.py +18 -19
- wedata/feature_store/utils/uc_utils.py +1 -1
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.7.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.7.dist-info/RECORD +43 -0
- feature_store/__init__.py +0 -6
- feature_store/client.py +0 -169
- feature_store/constants/__init__.py +0 -0
- feature_store/constants/constants.py +0 -28
- feature_store/entities/__init__.py +0 -0
- feature_store/entities/column_info.py +0 -117
- feature_store/entities/data_type.py +0 -92
- feature_store/entities/environment_variables.py +0 -55
- feature_store/entities/feature.py +0 -53
- feature_store/entities/feature_column_info.py +0 -64
- feature_store/entities/feature_function.py +0 -55
- feature_store/entities/feature_lookup.py +0 -179
- feature_store/entities/feature_spec.py +0 -454
- feature_store/entities/feature_spec_constants.py +0 -25
- feature_store/entities/feature_table.py +0 -164
- feature_store/entities/feature_table_info.py +0 -40
- feature_store/entities/function_info.py +0 -184
- feature_store/entities/on_demand_column_info.py +0 -44
- feature_store/entities/source_data_column_info.py +0 -21
- feature_store/entities/training_set.py +0 -134
- feature_store/feature_table_client/__init__.py +0 -0
- feature_store/feature_table_client/feature_table_client.py +0 -313
- feature_store/spark_client/__init__.py +0 -0
- feature_store/spark_client/spark_client.py +0 -286
- feature_store/training_set_client/__init__.py +0 -0
- feature_store/training_set_client/training_set_client.py +0 -196
- feature_store/utils/__init__.py +0 -0
- feature_store/utils/common_utils.py +0 -96
- feature_store/utils/feature_lookup_utils.py +0 -570
- feature_store/utils/feature_spec_utils.py +0 -286
- feature_store/utils/feature_utils.py +0 -73
- feature_store/utils/schema_utils.py +0 -117
- feature_store/utils/topological_sort.py +0 -158
- feature_store/utils/training_set_utils.py +0 -580
- feature_store/utils/uc_utils.py +0 -281
- feature_store/utils/utils.py +0 -252
- feature_store/utils/validation_utils.py +0 -55
- wedata/feature_store/utils/utils.py +0 -252
- wedata_feature_engineering-0.1.5.dist-info/RECORD +0 -79
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.7.dist-info}/WHEEL +0 -0
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.7.dist-info}/top_level.txt +0 -0
feature_store/entities/function_info.py
@@ -1,184 +0,0 @@
-from collections import defaultdict
-from typing import List, Optional
-
-from pyspark.sql import Column, DataFrame
-from pyspark.sql.functions import isnull, when
-from pyspark.sql.types import StringType, StructField, StructType
-
-class FunctionParameterInfo():
-    def __init__(self, name: str, type_text: str):
-        self._name = name
-        self._type_text = type_text
-
-    @property
-    def name(self) -> str:
-        return self._name
-
-    @property
-    def type_text(self) -> str:
-        return self._type_text
-
-    @classmethod
-    def from_dict(cls, function_parameter_info_json):
-        return FunctionParameterInfo(
-            function_parameter_info_json["name"],
-            function_parameter_info_json["type_text"],
-        )
-
-
-class FunctionInfo():
-    """
-    Helper entity class that exposes properties in GetFunction's response JSON as attributes.
-    https://docs.databricks.com/api-explorer/workspace/functions/get
-
-    Note: empty fields (e.g. when 0 input parameters) are not included in the response JSON.
-    """
-
-    # Python UDFs have external_language = "Python"
-    PYTHON = "Python"
-
-    def __init__(
-        self,
-        full_name: str,
-        input_params: List[FunctionParameterInfo],
-        routine_definition: Optional[str],
-        external_language: Optional[str],
-    ):
-        self._full_name = full_name
-        self._input_params = input_params
-        self._routine_definition = routine_definition
-        self._external_language = external_language
-
-    @property
-    def full_name(self) -> str:
-        return self._full_name
-
-    @property
-    def input_params(self) -> List[FunctionParameterInfo]:
-        return self._input_params
-
-    @property
-    def routine_definition(self) -> Optional[str]:
-        return self._routine_definition
-
-    @property
-    def external_language(self) -> Optional[str]:
-        """
-        Field is None if language is SQL (not an external language).
-        """
-        return self._external_language
-
-    @classmethod
-    def from_dict(cls, function_info_json):
-        input_params = function_info_json.get("input_params", {}).get("parameters", [])
-        return FunctionInfo(
-            full_name=function_info_json["full_name"],
-            input_params=[FunctionParameterInfo.from_dict(p) for p in input_params],
-            routine_definition=function_info_json.get("routine_definition", None),
-            external_language=function_info_json.get("external_language", None),
-        )
-
-
-class InformationSchemaSparkClient:
-    """
-    Internal client to retrieve Unity Catalog metadata from system.information_schema.
-    https://docs.databricks.com/sql/language-manual/sql-ref-information-schema.html
-    """
-
-    def _get_routines_with_parameters(self, full_routine_names: List[str]) -> DataFrame:
-        """
-        Retrieve the routines with their parameters from information_schema.routines, information_schema.parameters.
-        Return DataFrame only contains routines that 1. exist and 2. the caller has GetFunction permission on.
-
-        Note: The returned DataFrame contains the cartesian product of routines and parameters.
-        For efficiency, routines table columns are only present in the first row for each routine.
-        """
-        routine_name_schema = StructType(
-            [
-                StructField("specific_catalog", StringType(), False),
-                StructField("specific_schema", StringType(), False),
-                StructField("specific_name", StringType(), False),
-            ]
-        )
-        routine_names_df = self._spark_client.createDataFrame(
-            [full_routine_name.split(".") for full_routine_name in full_routine_names],
-            routine_name_schema,
-        )
-        routines_table = self._spark_client.read_table(
-            "system.information_schema.routines"
-        )
-        parameters_table = self._spark_client.read_table(
-            "system.information_schema.parameters"
-        )
-
-        # Inner join routines table to filter out non-existent routines.
-        # Left join parameters as routines may have no parameters.
-        full_routines_with_parameters_df = routine_names_df.join(
-            routines_table, on=routine_names_df.columns, how="inner"
-        ).join(parameters_table, on=routine_names_df.columns, how="left")
-
-        # Return only relevant metadata from information_schema, sorted by routine name + parameter order.
-        # For efficiency, only preserve routine column values in the first of each routine's result rows.
-        # The first row will have parameter.ordinal_value is None (no parameters) or equals 0 (first parameter).
-        def select_if_first_row(col: Column) -> Column:
-            return when(
-                isnull(parameters_table.ordinal_position)
-                | (parameters_table.ordinal_position == 0),
-                col,
-            ).otherwise(None)
-
-        return full_routines_with_parameters_df.select(
-            routine_names_df.columns
-            + [
-                select_if_first_row(routines_table.routine_definition).alias(
-                    "routine_definition"
-                ),
-                select_if_first_row(routines_table.external_language).alias(
-                    "external_language"
-                ),
-                parameters_table.ordinal_position,
-                parameters_table.parameter_name,
-                parameters_table.full_data_type,
-            ]
-        ).sort(routine_names_df.columns + [parameters_table.ordinal_position])
-
-    def get_functions(self, full_function_names: List[str]) -> List[FunctionInfo]:
-        """
-        Retrieves and maps Unity Catalog functions' metadata as FunctionInfos.
-        """
-        # Avoid unnecessary Spark calls and return if empty.
-        if not full_function_names:
-            return []
-
-        # Collect dict of routine name -> DataFrame rows describing the routine.
-        routines_with_parameters_df = self._get_routines_with_parameters(
-            full_routine_names=full_function_names
-        )
-        routine_infos = defaultdict(list)
-        for r in routines_with_parameters_df.collect():
-            routine_name = f"{r.specific_catalog}.{r.specific_schema}.{r.specific_name}"
-            routine_infos[routine_name].append(r)
-
-        # Mock GetFunction DNE error, since information_schema does not throw.
-        for function_name in full_function_names:
-            if not function_name in routine_infos:
-                raise ValueError(f"Function '{function_name}' does not exist.")
-
-        # Map routine_infos into FunctionInfos.
-        function_infos = []
-        for function_name in full_function_names:
-            routine_info = routine_infos[function_name][0]
-            input_params = [
-                FunctionParameterInfo(name=p.parameter_name, type_text=p.full_data_type)
-                for p in routine_infos[function_name]
-                if p.ordinal_position is not None
-            ]
-            function_infos.append(
-                FunctionInfo(
-                    full_name=function_name,
-                    input_params=input_params,
-                    routine_definition=routine_info.routine_definition,
-                    external_language=routine_info.external_language,
-                )
-            )
-        return function_infos
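
For orientation, the removed FunctionInfo entity wraps the JSON shape returned by Unity Catalog's GetFunction API. Below is a minimal sketch of how its from_dict constructor consumes such a payload; the payload values are hypothetical and purely illustrative.

    # Hypothetical GetFunction-style payload; field values are illustrative only.
    payload = {
        "full_name": "prod_catalog.ml.add_one",
        "input_params": {"parameters": [{"name": "x", "type_text": "int"}]},
        "routine_definition": "x + 1",
        "external_language": None,  # None indicates a SQL routine, not an external language
    }

    info = FunctionInfo.from_dict(payload)
    print(info.full_name)                        # prod_catalog.ml.add_one
    print([p.name for p in info.input_params])   # ['x']
    print(info.external_language is None)        # True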
feature_store/entities/on_demand_column_info.py
@@ -1,44 +0,0 @@
-from typing import Dict
-
-class OnDemandColumnInfo:
-    def __init__(
-        self,
-        udf_name: str,
-        input_bindings: Dict[str, str],
-        output_name: str,
-    ):
-        if not udf_name:
-            raise ValueError("udf_name must be non-empty.")
-        if not output_name:
-            raise ValueError("output_name must be non-empty.")
-
-        self._udf_name = udf_name
-        self._input_bindings = input_bindings
-        self._output_name = output_name
-
-    @property
-    def udf_name(self) -> str:
-        return self._udf_name
-
-    @property
-    def input_bindings(self) -> Dict[str, str]:
-        """
-        input_bindings is serialized as the InputBindings proto message.
-        """
-        return self._input_bindings
-
-    @property
-    def output_name(self) -> str:
-        return self._output_name
-
-    @classmethod
-    def from_proto(cls, on_demand_column_info_proto):
-        input_bindings_dict = {
-            input_binding.parameter: input_binding.bound_to
-            for input_binding in on_demand_column_info_proto.input_bindings
-        }
-        return OnDemandColumnInfo(
-            udf_name=on_demand_column_info_proto.udf_name,
-            input_bindings=input_bindings_dict,
-            output_name=on_demand_column_info_proto.output_name,
-        )
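
OnDemandColumnInfo simply records which input columns feed a UDF's parameters and what the computed feature column is called. A small hedged sketch with placeholder names, assuming the class definition above:

    # Placeholder names: binds the UDF parameter "amount" to the input column
    # "txn_amount" and exposes the computed feature as "txn_amount_usd".
    odci = OnDemandColumnInfo(
        udf_name="prod_catalog.ml.to_usd",
        input_bindings={"amount": "txn_amount"},
        output_name="txn_amount_usd",
    )
    print(odci.udf_name, odci.input_bindings, odci.output_name)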
feature_store/entities/source_data_column_info.py
@@ -1,21 +0,0 @@
-
-class SourceDataColumnInfo:
-    def __init__(self, name: str):
-        if not name:
-            raise ValueError("name must be non-empty.")
-        self._name = name
-
-    @property
-    def name(self):
-        return self._name
-
-    @property
-    def output_name(self) -> str:
-        """
-        This field does not exist in the proto, and is provided for convenience.
-        """
-        return self._name
-
-    @classmethod
-    def from_proto(cls, source_data_column_info_proto):
-        return cls(name=source_data_column_info_proto.name)
feature_store/entities/training_set.py
@@ -1,134 +0,0 @@
-from typing import Dict, List, Optional
-
-from pyspark.sql import DataFrame
-
-from feature_store.entities.feature_table import FeatureTable
-from feature_store.entities.function_info import FunctionInfo
-from feature_store.utils.feature_lookup_utils import (
-    join_feature_data_if_not_overridden,
-)
-
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.utils.feature_spec_utils import (
-    COLUMN_INFO_TYPE_FEATURE,
-    COLUMN_INFO_TYPE_ON_DEMAND,
-    COLUMN_INFO_TYPE_SOURCE,
-    get_feature_execution_groups,
-)
-
-
-class TrainingSet:
-    """
-    .. note::
-
-        Aliases: `!databricks.feature_engineering.training_set.TrainingSet`, `!databricks.feature_store.training_set.TrainingSet`
-
-    Class that defines :obj:`TrainingSet` objects.
-
-    .. note::
-
-        The :class:`TrainingSet` constructor should not be called directly. Instead,
-        call :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
-    """
-
-    def __init__(
-        self,
-        feature_spec: FeatureSpec,
-        df: DataFrame,
-        labels: List[str],
-        feature_table_metadata_map: Dict[str, FeatureTable],
-        feature_table_data_map: Dict[str, DataFrame],
-        uc_function_infos: Dict[str, FunctionInfo],
-        use_spark_native_join: Optional[bool] = False,
-    ):
-        """Initialize a :obj:`TrainingSet` object."""
-        assert isinstance(
-            labels, list
-        ), f"Expected type `list` for argument `labels`. Got '{labels}' with type '{type(labels)}'."
-
-        self._feature_spec = feature_spec
-        self._df = df
-        self._labels = labels
-        self._feature_table_metadata_map = feature_table_metadata_map
-        self._feature_table_data_map = feature_table_data_map
-        self._uc_function_infos = uc_function_infos
-        self._use_spark_native_join = use_spark_native_join
-        # Perform basic validations and resolve FeatureSpec and label column data types.
-        self._validate_and_inject_dtypes()
-        self._label_data_types = {
-            name: data_type for name, data_type in df.dtypes if name in labels
-        }
-
-    @property
-    def feature_spec(self) -> FeatureSpec:
-        """Define a feature spec."""
-        return self._feature_spec
-
-    def _augment_df(self) -> DataFrame:
-        """
-        Internal helper to augment DataFrame with feature lookups and on-demand features specified in the FeatureSpec.
-        Does not drop excluded columns, and does not overwrite columns that already exist.
-        Return column order is df.columns + feature lookups + on-demand features.
-        """
-        execution_groups = get_feature_execution_groups(
-            self.feature_spec, self._df.columns
-        )
-
-        result_df = self._df
-        # Iterate over all levels and type of DAG nodes in FeatureSpec and execute them.
-        for execution_group in execution_groups:
-            if execution_group.type == COLUMN_INFO_TYPE_SOURCE:
-                continue
-            if execution_group.type == COLUMN_INFO_TYPE_FEATURE:
-                # Apply FeatureLookups
-                result_df = join_feature_data_if_not_overridden(
-                    feature_spec=self.feature_spec,
-                    df=result_df,
-                    features_to_join=execution_group.features,
-                    feature_table_metadata_map=self._feature_table_metadata_map,
-                    feature_table_data_map=self._feature_table_data_map,
-                    use_spark_native_join=self._use_spark_native_join,
-                )
-            # elif execution_group.type == COLUMN_INFO_TYPE_ON_DEMAND:
-            #     # Apply all on-demand UDFs
-            #     result_df = apply_functions_if_not_overridden(
-            #         df=result_df,
-            #         functions_to_apply=execution_group.features,
-            #         uc_function_infos=self._uc_function_infos,
-            #     )
-            else:
-                # This should never be reached.
-                raise Exception("Unknown feature execution type:", execution_group.type)
-        return result_df
-
-    def _validate_and_inject_dtypes(self):
-        """
-        Performs validations through _augment_df (e.g. Delta table exists, Delta and feature table dtypes match),
-        then inject the result DataFrame dtypes into the FeatureSpec.
-        """
-        augmented_df = self._augment_df()
-        augmented_df_dtypes = {column: dtype for column, dtype in augmented_df.dtypes}
-
-        # Inject the result DataFrame column types into the respective ColumnInfo
-        for ci in self.feature_spec.column_infos:
-            ci._data_type = augmented_df_dtypes[ci.output_name]
-
-    def load_df(self) -> DataFrame:
-        """
-        Load a :class:`DataFrame <pyspark.sql.DataFrame>`.
-
-        Return a :class:`DataFrame <pyspark.sql.DataFrame>` for training.
-
-        The returned :class:`DataFrame <pyspark.sql.DataFrame>` has columns specified
-        in the ``feature_spec`` and ``labels`` parameters provided
-        in :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
-
-        :return:
-            A :class:`DataFrame <pyspark.sql.DataFrame>` for training
-        """
-        augmented_df = self._augment_df()
-        # Return only included columns in order defined by FeatureSpec + labels
-        included_columns = [
-            ci.output_name for ci in self.feature_spec.column_infos if ci.include
-        ] + self._labels
-        return augmented_df.select(included_columns)
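
The removed TrainingSet docstrings point to the upstream Databricks create_training_set() workflow. A hedged sketch of that calling pattern follows; the catalog, table, and column names are placeholders, and the wedata 0.1.7 client may expose a slightly different entry point.

    from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    # Placeholder training DataFrame: a lookup key plus the label column.
    df = spark.createDataFrame([("u1", 0), ("u2", 1)], ["user_id", "churned"])

    fe = FeatureEngineeringClient()
    training_set = fe.create_training_set(
        df=df,
        feature_lookups=[
            FeatureLookup(
                table_name="prod_catalog.ml.user_features",  # placeholder feature table
                lookup_key="user_id",
                feature_names=["age", "plan_tier"],
            )
        ],
        label="churned",
    )
    # load_df() returns the included feature columns (in FeatureSpec order) plus the label.
    training_df = training_set.load_df()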
File without changes