wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {feature_store → wedata}/__init__.py +1 -1
- {feature_store → wedata/feature_store}/client.py +113 -41
- {feature_store → wedata/feature_store}/constants/constants.py +19 -0
- {feature_store → wedata/feature_store}/entities/column_info.py +4 -4
- {feature_store → wedata/feature_store}/entities/feature_lookup.py +5 -1
- {feature_store → wedata/feature_store}/entities/feature_spec.py +46 -46
- wedata/feature_store/entities/feature_table.py +107 -0
- {feature_store → wedata/feature_store}/entities/training_set.py +13 -12
- {feature_store → wedata/feature_store}/feature_table_client/feature_table_client.py +85 -30
- {feature_store → wedata/feature_store}/spark_client/spark_client.py +30 -56
- wedata/feature_store/training_set_client/training_set_client.py +367 -0
- wedata/feature_store/utils/__init__.py +0 -0
- feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py +108 -54
- {feature_store → wedata/feature_store}/utils/feature_lookup_utils.py +6 -6
- {feature_store → wedata/feature_store}/utils/feature_spec_utils.py +6 -6
- {feature_store → wedata/feature_store}/utils/feature_utils.py +5 -5
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- {feature_store → wedata/feature_store}/utils/schema_utils.py +1 -1
- wedata/feature_store/utils/signature_utils.py +205 -0
- {feature_store → wedata/feature_store}/utils/training_set_utils.py +18 -19
- {feature_store → wedata/feature_store}/utils/uc_utils.py +1 -1
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
- wedata_feature_engineering-0.1.6.dist-info/top_level.txt +1 -0
- feature_store/entities/feature_table.py +0 -164
- feature_store/training_set_client/training_set_client.py +0 -196
- feature_store/utils/common_utils.py +0 -96
- wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
- wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
- {feature_store/constants → wedata/feature_store}/__init__.py +0 -0
- {feature_store/entities → wedata/feature_store/constants}/__init__.py +0 -0
- {feature_store/feature_table_client → wedata/feature_store/entities}/__init__.py +0 -0
- {feature_store → wedata/feature_store}/entities/data_type.py +0 -0
- {feature_store → wedata/feature_store}/entities/environment_variables.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_column_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_function.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_spec_constants.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_table_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/function_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/on_demand_column_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/source_data_column_info.py +0 -0
- {feature_store/spark_client → wedata/feature_store/feature_table_client}/__init__.py +0 -0
- {feature_store/training_set_client → wedata/feature_store/spark_client}/__init__.py +0 -0
- {feature_store/utils → wedata/feature_store/training_set_client}/__init__.py +0 -0
- {feature_store → wedata/feature_store}/utils/topological_sort.py +0 -0
- {feature_store → wedata/feature_store}/utils/validation_utils.py +0 -0
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
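Taken together, the listing shows one dominant change: the top-level package moved from `feature_store` to `wedata.feature_store` (with `utils/utils.py` renamed to `utils/common_utils.py`), so every import path changes. A hedged sketch of the migration a downstream consumer would make, using `FeatureLookup` as a representative symbol:

```python
# 0.1.4 layout (old top-level package `feature_store`):
# from feature_store.entities.feature_lookup import FeatureLookup
# from feature_store.utils import utils

# 0.1.6 layout (everything under `wedata`; utils.utils became common_utils):
from wedata.feature_store.entities.feature_lookup import FeatureLookup
from wedata.feature_store.utils import common_utils
```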
wedata/feature_store/utils/on_demand_utils.py (new file):

@@ -0,0 +1,107 @@
+import copy
+from typing import Dict, List
+
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import expr
+
+from wedata.feature_store.entities.feature_function import FeatureFunction
+from wedata.feature_store.entities.function_info import FunctionInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.utils import common_utils, uc_utils
+
+
+def _udf_expr(udf_name: str, arguments: List[str]) -> expr:
+    """
+    Generate a Spark SQL expression, e.g. expr("udf_name(col1, col2)")
+    """
+    arguments_str = ", ".join(common_utils.sanitize_identifiers(arguments))
+    return expr(f"{udf_name}({arguments_str})")
+
+
+def _validate_apply_functions_df(
+    df: DataFrame,
+    functions_to_apply: List[OnDemandColumnInfo],
+    uc_function_infos: Dict[str, FunctionInfo],
+):
+    """
+    Validate the following:
+    1. On-demand input columns specified by functions_to_apply exist in the DataFrame.
+    2. On-demand input columns have data types that match those of UDF parameters.
+    """
+    for odci in functions_to_apply:
+        function_info = uc_function_infos[odci.udf_name]
+        types_dict = dict(df.dtypes)
+
+        for p in function_info.input_params:
+            arg_column = odci.input_bindings[p.name]
+            if arg_column not in df.columns:
+                raise ValueError(
+                    f"FeatureFunction argument column '{arg_column}' for UDF '{odci.udf_name}' parameter '{p.name}' "
+                    f"does not exist in provided DataFrame with schema '{df.schema}'."
+                )
+            if types_dict[arg_column] != p.type_text:
+                raise ValueError(
+                    f"FeatureFunction argument column '{arg_column}' for UDF '{odci.udf_name}' parameter '{p.name}' "
+                    f"does not have the expected type. Argument column '{arg_column}' has type "
+                    f"'{types_dict[arg_column]}' and parameter '{p.name}' has type '{p.type_text}'."
+                )
+
+
+def apply_functions_if_not_overridden(
+    df: DataFrame,
+    functions_to_apply: List[OnDemandColumnInfo],
+    uc_function_infos: Dict[str, FunctionInfo],
+) -> DataFrame:
+    """
+    For all on-demand features, in the order defined by the FeatureSpec:
+    If the feature does not already exist, append the evaluated UDF expression.
+    Existing column values or column positions are not modified.
+
+    `_validate_apply_functions_df` validates UDFs can be applied on `df` schema.
+
+    The caller should validate:
+    1. FeatureFunction bound argument columns for UDF parameters exist in FeatureSpec defined features.
+    2. FeatureFunction output feature names are unique.
+    """
+    _validate_apply_functions_df(
+        df=df,
+        functions_to_apply=functions_to_apply,
+        uc_function_infos=uc_function_infos,
+    )
+
+    columns = {}
+    for odci in functions_to_apply:
+        if odci.output_name not in df.columns:
+            function_info = uc_function_infos[odci.udf_name]
+            # Resolve the bound arguments in the UDF parameter order
+            udf_arguments = [
+                odci.input_bindings[p.name] for p in function_info.input_params
+            ]
+            columns[odci.output_name] = _udf_expr(odci.udf_name, udf_arguments)
+    return df.withColumns(columns)
+
+
+def get_feature_functions_with_full_udf_names(
+    feature_functions: List[FeatureFunction], current_catalog: str, current_schema: str
+):
+    """
+    Takes in a list of FeatureFunctions, and returns copies with:
+    1. Fully qualified UDF names.
+    2. If output_name is empty, fully qualified UDF names as output_name.
+    """
+    udf_names = {ff.udf_name for ff in feature_functions}
+    uc_utils._check_qualified_udf_names(udf_names)
+    uc_utils._verify_all_udfs_in_uc(udf_names, current_catalog, current_schema)
+
+    standardized_feature_functions = []
+    for ff in feature_functions:
+        ff_copy = copy.deepcopy(ff)
+        del ff
+
+        ff_copy._udf_name = uc_utils.get_full_udf_name(
+            ff_copy.udf_name, current_catalog, current_schema
+        )
+        if not ff_copy.output_name:
+            ff_copy._output_name = ff_copy.udf_name
+        standardized_feature_functions.append(ff_copy)
+    return standardized_feature_functions
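For orientation (not part of the diff): `_udf_expr` just renders a SQL call string, so applying an on-demand function is a single `expr` column per output. A minimal sketch, assuming `sanitize_identifiers` backtick-quotes each column name and using an invented UDF name:

```python
# Sketch only: what _udf_expr("main.default.compute_fare", ["trip_distance", "base_fare"])
# would evaluate to, assuming sanitize_identifiers backtick-quotes identifiers.
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2.5, 3.0)], ["trip_distance", "base_fare"])

fare_col = expr("main.default.compute_fare(`trip_distance`, `base_fare`)")
# apply_functions_if_not_overridden then does the equivalent of:
# df = df.withColumns({"fare": fare_col})
```

Note that `DataFrame.withColumns` (plural) requires Spark 3.3+, and applying all on-demand columns in one call keeps them in a single projection.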
wedata/feature_store/utils/signature_utils.py (new file):

@@ -0,0 +1,205 @@
+import logging
+from typing import Any, Dict, Optional
+
+import mlflow
+from mlflow.models import ModelSignature
+from mlflow.types import ColSpec
+from mlflow.types import DataType as MlflowDataType
+from mlflow.types import ParamSchema, Schema
+
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+_logger = logging.getLogger(__name__)
+
+# Some types (array, map, decimal, timestamp_ntz) are unsupported due to MLflow signatures
+# lacking any equivalent types. We thus cannot construct a ColSpec for any column
+# that uses these types.
+SUPPORTED_TYPE_MAP = {
+    "smallint": MlflowDataType.integer,  # Upcast to integer
+    "int": MlflowDataType.integer,
+    "bigint": MlflowDataType.long,
+    "float": MlflowDataType.float,
+    "double": MlflowDataType.double,
+    "boolean": MlflowDataType.boolean,
+    "date": MlflowDataType.datetime,
+    "timestamp": MlflowDataType.datetime,
+    "string": MlflowDataType.string,
+    "binary": MlflowDataType.binary,
+}
+
+
+def is_unsupported_type(type_str: str):
+    return type_str not in SUPPORTED_TYPE_MAP
+
+
+def convert_spark_data_type_to_mlflow_signature_type(spark_type):
+    """
+    Maps Databricks SQL types to MLflow signature types.
+    docs.databricks.com/sql/language-manual/sql-ref-datatypes.html#language-mappings
+    """
+    return SUPPORTED_TYPE_MAP.get(spark_type)
+
+
+def get_input_schema_from_feature_spec(feature_spec: FeatureSpec) -> Schema:
+    """
+    Produces an MLflow signature schema from a feature spec.
+    Source data columns are marked as required inputs and feature columns
+    (both lookups and on-demand features) are marked as optional inputs.
+
+    :param feature_spec: FeatureSpec object with datatypes for each column.
+    """
+    # If we're missing any data types for any column, we are likely dealing with a
+    # malformed feature spec and should halt signature construction.
+    if any([ci.data_type is None for ci in feature_spec.column_infos]):
+        raise Exception("Training set does not contain column data types.")
+
+    source_data_cols = [
+        ci
+        for ci in feature_spec.column_infos
+        if isinstance(ci.info, SourceDataColumnInfo)
+    ]
+    # Don't create signature if any source data columns (required) are of complex types.
+    if any(
+        [
+            ci.data_type is None or is_unsupported_type(ci.data_type)
+            for ci in source_data_cols
+        ]
+    ):
+        raise Exception(
+            "Input DataFrame contains column data types not supported by "
+            "MLflow model signatures."
+        )
+    required_input_colspecs = [
+        ColSpec(
+            convert_spark_data_type_to_mlflow_signature_type(ci.data_type),
+            ci.info.output_name,
+            required=True,
+        )
+        for ci in source_data_cols
+    ]
+    feature_cols = [
+        ci
+        for ci in feature_spec.column_infos
+        if isinstance(ci.info, (FeatureColumnInfo, OnDemandColumnInfo))
+    ]
+    unsupported_feature_cols = [
+        ci for ci in feature_cols if is_unsupported_type(ci.data_type)
+    ]
+    optional_input_colspecs = [
+        ColSpec(
+            convert_spark_data_type_to_mlflow_signature_type(ci.data_type),
+            ci.output_name,
+            required=False,
+        )
+        for ci in feature_cols
+        if not is_unsupported_type(ci.data_type)
+    ]
+    if unsupported_feature_cols:
+        feat_string = ", ".join(
+            [f"{ci.output_name} ({ci.data_type})" for ci in unsupported_feature_cols]
+        )
+        _logger.warning(
+            f"The following features will not be included in the input schema because their"
+            f" data types are not supported by MLflow model signatures: {feat_string}. "
+            f"These features cannot be overridden during model serving."
+        )
+    return Schema(required_input_colspecs + optional_input_colspecs)
+
+
+def get_output_schema_from_labels(label_type_map: Optional[Dict[str, str]]) -> Schema:
+    """
+    Produces an MLflow signature schema from the provided label type map.
+    :param label_type_map: Map label column name -> data type
+    """
+    if not label_type_map:
+        raise Exception("Training set does not contain a label.")
+    if any([is_unsupported_type(dtype) for dtype in label_type_map.values()]):
+        raise Exception(
+            "Labels are of data types not supported by MLflow model signatures."
+        )
+    else:
+        output_colspecs = [
+            ColSpec(
+                convert_spark_data_type_to_mlflow_signature_type(spark_type),
+                col_name,
+                required=True,
+            )
+            for col_name, spark_type in label_type_map.items()
+        ]
+        return Schema(output_colspecs)
+
+
+def get_mlflow_signature_from_feature_spec(
+    feature_spec: FeatureSpec,
+    label_type_map: Optional[Dict[str, str]],
+    override_output_schema: Optional[Schema],
+    params: Optional[Dict[str, Any]] = None,
+) -> Optional[ModelSignature]:
+    """
+    Produce an MLflow signature from a feature spec and label type map.
+    Source data columns are marked as required inputs and feature columns
+    (both lookups and on-demand features) are marked as optional inputs.
+
+    Reads output types from the cached label -> datatype map in the training set.
+    If override_output_schema is provided, it will always be used as the output schema.
+
+    :param feature_spec: FeatureSpec object with datatypes for each column.
+    :param label_type_map: Map of label column name -> datatype
+    :param override_output_schema: User-provided output schema to use if provided.
+    """
+    kwargs = {}
+    kwargs["inputs"] = get_input_schema_from_feature_spec(feature_spec)
+    try:
+        output_schema = override_output_schema or get_output_schema_from_labels(
+            label_type_map
+        )
+        kwargs["outputs"] = output_schema
+    except Exception as e:
+        _logger.warning(f"Could not infer an output schema: {e}")
+
+    if params:
+        try:
+            from mlflow.types.utils import _infer_param_schema
+
+            kwargs["params"] = _infer_param_schema(params)
+        except Exception as e:
+            _logger.warning(f"Could not infer params schema: {e}")
+
+    return mlflow.models.ModelSignature(**kwargs)
+
+
+def drop_signature_inputs_and_invalid_params(signature):
+    """
+    Drop ModelSignature inputs field and invalid params from params field.
+    This is useful for feature store model's raw_model.
+    Feature store model's input schema does not apply to raw_model's input,
+    so we drop the inputs field of raw_model's signature.
+    Feature store model's result_type param enables setting and overriding
+    a default result_type for predictions, but this interferes with params
+    passed to MLflow's predict function, so we drop result_type from
+    the params field of raw_model's signature.
+
+    :param signature: ModelSignature object.
+    """
+    if signature:
+        outputs_schema = signature.outputs
+        params_schema = signature.params if hasattr(signature, "params") else None
+        try:
+            # Only for mlflow>=2.6.0 ModelSignature contains params attribute
+            if params_schema:
+                updated_params_schema = ParamSchema(
+                    [param for param in params_schema if param.name != "result_type"]
+                )
+                return ModelSignature(
+                    outputs=outputs_schema, params=updated_params_schema
+                )
+            if outputs_schema:
+                return ModelSignature(outputs=outputs_schema)
+        except TypeError:
+            _logger.warning(
+                "ModelSignature without inputs is not supported, please upgrade "
+                "mlflow >= 2.7.0 to use the feature."
+            )
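For orientation (not part of the diff), a hedged sketch of how the type map above feeds MLflow `ColSpec`s; the column names and the map excerpt are invented stand-ins for the full `SUPPORTED_TYPE_MAP` in signature_utils.py:

```python
# Hypothetical illustration of the Spark-type -> MLflow-type mapping above.
from mlflow.types import ColSpec, Schema
from mlflow.types import DataType as MlflowDataType

TYPE_MAP_EXCERPT = {
    "bigint": MlflowDataType.long,      # excerpt of SUPPORTED_TYPE_MAP
    "date": MlflowDataType.datetime,
    "double": MlflowDataType.double,
}

cols = {"user_id": "bigint", "signup_date": "date", "avg_spend": "double"}
schema = Schema([ColSpec(TYPE_MAP_EXCERPT[t], name) for name, t in cols.items()])
# Spark types outside the map (array, map, decimal, timestamp_ntz) get no ColSpec:
# required source columns raise, optional feature columns are dropped with a warning.
```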
wedata/feature_store/utils/training_set_utils.py (renamed from feature_store/utils/training_set_utils.py):

@@ -4,21 +4,19 @@ from typing import Dict, List, Optional, Set
 
 from pyspark.sql import DataFrame
 
-from feature_store.entities.column_info import ColumnInfo
-from feature_store.entities.feature import Feature
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.entities.feature_table import FeatureTable
-from feature_store.entities.feature_table_info import FeatureTableInfo
-from feature_store.entities.function_info import FunctionInfo
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-
-from feature_store.
-
-from feature_store.utils import common_utils, validation_utils, uc_utils, schema_utils, utils
-from feature_store.utils.feature_spec_utils import assign_topological_ordering
+from wedata.feature_store.entities.column_info import ColumnInfo
+from wedata.feature_store.entities.feature import Feature
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.feature_table_info import FeatureTableInfo
+from wedata.feature_store.entities.function_info import FunctionInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+from wedata.feature_store.utils import common_utils, validation_utils, uc_utils, schema_utils
+from wedata.feature_store.utils.feature_spec_utils import assign_topological_ordering
 
 _logger = logging.getLogger(__name__)
 
@@ -99,9 +97,9 @@ def _explode_feature_lookup(
         FeatureColumnInfo(
             table_name=feature_lookup.table_name,
             feature_name=feature_name,
-            lookup_key=
+            lookup_key=common_utils.as_list(feature_lookup.lookup_key),
             output_name=(feature_lookup._get_output_name(feature_name)),
-            timestamp_lookup_key=
+            timestamp_lookup_key=common_utils.as_list(
                 feature_lookup.timestamp_lookup_key, default=[]
             ),
         )
@@ -280,13 +278,14 @@ def get_table_metadata(
     feature_table_metadata_map = get_feature_table_metadata_for_tables(
         spark_client, table_names=table_names
     )
+
     feature_table_data_map = load_feature_data_for_tables(
         spark_client, table_names=table_names
    )
     return _FeatureTableMetadata(
         feature_table_features_map,
         feature_table_metadata_map,
-        feature_table_data_map
+        feature_table_data_map
     )
 
 
@@ -515,7 +514,7 @@ def build_feature_spec(
         for table_name in consumed_table_names
     ]
    function_infos = [
-        FunctionInfo(
+        FunctionInfo(full_name=udf_name) for udf_name in consumed_udf_names
    ]
 
     # Build FeatureSpec
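The `common_utils.as_list` helper referenced in the lookup-key hunk above replaces the old `utils.as_list`. A plausible sketch of its behavior (the actual implementation lives in wedata/feature_store/utils/common_utils.py and is not shown in this diff):

```python
# Assumed semantics of as_list: normalize a scalar-or-list value to a list.
def as_list(obj, default=None):
    if not obj:
        return default
    elif isinstance(obj, list):
        return obj
    else:
        return [obj]

assert as_list("user_id") == ["user_id"]                 # scalar key -> one-item list
assert as_list(["user_id", "ts"]) == ["user_id", "ts"]   # list passes through
assert as_list(None, default=[]) == []                   # missing timestamp key -> []
```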
wedata/feature_store/utils/uc_utils.py (renamed from feature_store/utils/uc_utils.py):

@@ -2,7 +2,7 @@ import copy
 import re
 from typing import Optional, Set
 
-from feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.feature_spec import FeatureSpec
 
 SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
 TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"
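As an illustrative sketch (not part of the diff), these namespace regexes accept dot-free single-level names and exactly two-level `schema.table` names, rejecting spaces, slashes, and control characters:

```python
# Demonstration of the namespace regexes from uc_utils.py above.
import re

SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"

assert re.match(SINGLE_LEVEL_NAMESPACE_REGEX, "features")
assert not re.match(SINGLE_LEVEL_NAMESPACE_REGEX, "db.features")   # has a dot
assert re.match(TWO_LEVEL_NAMESPACE_REGEX, "db.features")          # schema.table
assert not re.match(TWO_LEVEL_NAMESPACE_REGEX, "cat.db.features")  # three levels
```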
wedata_feature_engineering-0.1.6.dist-info/RECORD (new file):

@@ -0,0 +1,43 @@
+wedata/__init__.py,sha256=26GwucASB9KsmU109sN-VKotEKp1WZYQDGP0wgWZrzY,101
+wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/client.py,sha256=7a-9C8HIBHnQNQD6I4W3UtBQwkJE8G-Q7N24zydjpkY,8100
+wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/constants/constants.py,sha256=b4tgcSt66YIq0Fg7pMbqvbqPOI77Cz8znLVZ4ihUKss,1479
+wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/entities/column_info.py,sha256=sU_WD9U0qse0M2speowLY30qSE6j4_57PuvtdPkwiFY,4192
+wedata/feature_store/entities/data_type.py,sha256=VpHS6Fr3TphQQ8NbAcEnDJ-8eOZV6ivYuWxv3pAM2RM,3394
+wedata/feature_store/entities/environment_variables.py,sha256=ZEFml5H9MQuzBKM074mUrFYu-Sga4Knmxqiwpke2WGc,1679
+wedata/feature_store/entities/feature.py,sha256=wX8fTBlJq3GYdj9rrBDCY3kFgcVBBAiOOZdxEhnQkNQ,1241
+wedata/feature_store/entities/feature_column_info.py,sha256=-TGxRafYUaNKe0YzHus2XbfRaVrMv7pcffMdbtTT4nA,2031
+wedata/feature_store/entities/feature_function.py,sha256=R17INrCE-U_Uj9KLbFz69aYlOkTETTwQHMMo470F4lQ,1865
+wedata/feature_store/entities/feature_lookup.py,sha256=YjYz8kLq42doFbgPzpmm1r3GPhPYkLsIss4H71x-KAo,8009
+wedata/feature_store/entities/feature_spec.py,sha256=60RUOOe9y_Xsd1I3xqq4NZYnaox4_jjwSyGRTKXLiIw,20041
+wedata/feature_store/entities/feature_spec_constants.py,sha256=YWDBfRiNDe6fUJFUBo3V4WYg2xsljoPAE-ZejfFZCgM,785
+wedata/feature_store/entities/feature_table.py,sha256=dHZHSDPD4HJ2XanLVIrVTkaCYUeqZ6eWEpA0d3YO71g,4010
+wedata/feature_store/entities/feature_table_info.py,sha256=2vUaVdW_jw1dRAlmJWvBRueuMeuqWu_NYB9SlxLI7Uw,1126
+wedata/feature_store/entities/function_info.py,sha256=l0kmiq2R_QNfSMJ7y0xZohlMiemgYSr1dN5vzV8ijIs,7314
+wedata/feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipBZzH2ZyY0bwkLrDOkuZjgYr4gY,1297
+wedata/feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
+wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
+wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/feature_table_client/feature_table_client.py,sha256=nrnY3FLQnMhW1BzByDjjfU89hirgaKlg2l2tAfcjvyM,12138
+wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/spark_client/spark_client.py,sha256=DBCYjLsFrIVRvLErTNyfLIHRul3v0y9uZIY2JR1N92s,10323
+wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/training_set_client/training_set_client.py,sha256=gHeZU0rvvUcyNTfroXD3LAinFPdhDpnwTOIWj6z84Tc,15102
+wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/utils/common_utils.py,sha256=rmGXfBoEyDMlfqd7lxpEmKJaLoQ-d-ufWpAcE8nSHqA,10009
+wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
+wedata/feature_store/utils/feature_spec_utils.py,sha256=GFwc-WT6nN1tnal5F2c0bgVDRhH-YW58S0GaHBPZEsQ,11624
+wedata/feature_store/utils/feature_utils.py,sha256=KKq28bVB_lCuhnR9Hk6JegJBOVgcelWlvrRM-F9onkA,2796
+wedata/feature_store/utils/on_demand_utils.py,sha256=pazZRG5c0Se08MV_inBddIeX4Q9xlVN_H9SC_WK3xzs,4260
+wedata/feature_store/utils/schema_utils.py,sha256=y6EYY1pUxjVg6MP4C7avdW8ZEBBaDo1YTV2CmPF4i8o,4491
+wedata/feature_store/utils/signature_utils.py,sha256=_4_mo1Qlzklp-JrISMS3Jv89MPbaH6rz_cRDvJqFNXM,7957
+wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZsa-DjxJeTc4KUadtg,6475
+wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
+wedata/feature_store/utils/uc_utils.py,sha256=A-W8Cd8yvTmAMEWaHeWmGmcIDMvUtjAfx2G2x_di1QE,10774
+wedata/feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
+wedata_feature_engineering-0.1.6.dist-info/METADATA,sha256=orxNq_A9F8FcSWYn6wTY1pQ2KtqNVIREvGziUnNa1ys,493
+wedata_feature_engineering-0.1.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+wedata_feature_engineering-0.1.6.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
+wedata_feature_engineering-0.1.6.dist-info/RECORD,,
wedata_feature_engineering-0.1.6.dist-info/top_level.txt (new file):

@@ -0,0 +1 @@
+wedata
feature_store/entities/feature_table.py (deleted; replaced by wedata/feature_store/entities/feature_table.py):

@@ -1,164 +0,0 @@
-from typing import Dict
-
-
-
-class FeatureTable:
-    """
-    .. note::
-
-       Aliases:`!databricks.feature_engineering.entities.feature_table.FeatureTable`, `!databricks.feature_store.entities.feature_table.FeatureTable`
-
-    Value class describing one feature table.
-
-    This will typically not be instantiated directly, instead the
-    :meth:`create_table() <databricks.feature_engineering.client.FeatureEngineeringClient.create_table>`
-    will create :class:`.FeatureTable` objects.
-    """
-
-    def __init__(
-        self,
-        name,
-        table_id,
-        description,
-        primary_keys,
-        partition_columns,
-        features,
-        creation_timestamp=None,
-        online_stores=None,
-        notebook_producers=None,
-        job_producers=None,
-        table_data_sources=None,
-        path_data_sources=None,
-        custom_data_sources=None,
-        timestamp_keys=None,
-        tags=None,
-    ):
-        """Initialize a FeatureTable object."""
-        self.name = name
-        self.table_id = table_id
-        self.description = description
-        self.primary_keys = primary_keys
-        self.partition_columns = partition_columns
-        self.features = features
-        self.creation_timestamp = creation_timestamp
-        self.online_stores = online_stores if online_stores is not None else []
-        self.notebook_producers = (
-            notebook_producers if notebook_producers is not None else []
-        )
-        self.job_producers = job_producers if job_producers is not None else []
-        self.table_data_sources = (
-            table_data_sources if table_data_sources is not None else []
-        )
-        self.path_data_sources = (
-            path_data_sources if path_data_sources is not None else []
-        )
-        self.custom_data_sources = (
-            custom_data_sources if custom_data_sources is not None else []
-        )
-        self.timestamp_keys = timestamp_keys if timestamp_keys is not None else []
-        self._tags = tags
-
-    # @property
-    # @deprecated("FeatureTable.primary_keys", since="v0.3.6")
-    # def keys(self):
-    #     return self.primary_keys
-
-    @property
-    def tags(self) -> Dict[str, str]:
-        """
-        Get the tags associated with the feature table.
-
-        :return a Dictionary of all tags associated with the feature table as key/value pairs
-        """
-        if self._tags is None:
-            # If no tags are set, self._tags is expected an empty dictionary.
-            raise ValueError(
-                "Internal error: tags have not been fetched for this FeatureTable instance"
-            )
-        return self._tags
-
-
-    @classmethod
-    def from_uc_get_table_response(cls, uc_get_table_response: Dict[str, object]):
-        """Return a FeatureStore object from a UC get_table response. Note: UC does not return online_stores or tags.
-
-        :param dict uc_get_table_response: A dictionary representing a UC get_table response.
-        :return FeatureTable: a FeatureStore object from the UC response.
-        """
-        table_name = uc_get_table_response["full_name"]
-
-        if uc_get_table_response["securable_kind"] == "TABLE_ONLINE_VIEW":
-            source_table = uc_get_table_response["properties_pairs"]["properties"][
-                "source_table"
-            ]
-            raise ValueError(
-                f"Table '{table_name}' is an online view. Online Views are not feature tables. Please use the source table '{source_table}' instead."
-            )
-
-        if (
-            "table_type" in uc_get_table_response
-            and uc_get_table_response["table_type"] == "VIEW"
-        ):
-            return cls(
-                name=table_name,
-                table_id=uc_get_table_response["table_id"],
-                description=uc_get_table_response["comment"]
-                if "comment" in uc_get_table_response
-                else "",
-                primary_keys=[],
-                partition_columns=[],
-                features=[],
-                creation_timestamp=uc_get_table_response["created_at"],
-                timestamp_keys=[],
-            )
-
-        table_constraints = (
-            uc_get_table_response["table_constraints"]
-            if "table_constraints" in uc_get_table_response
-            else []
-        )
-        primary_key_constraints = [
-            c for c in table_constraints if "primary_key_constraint" in c
-        ]
-        if len(primary_key_constraints) == 0:
-            raise ValueError(
-                "Table can't be used as a feature table because it has no primary key constraint defined."
-                + " Use 'ALTER TABLE table_name ADD CONSTRAINT table_name_pk PRIMARY KEY( key_column [,...] )'"
-                + " to add a primary key constraint on the table."
-            )
-        primary_key_constraint = primary_key_constraint = primary_key_constraints[0][
-            "primary_key_constraint"
-        ]
-        timestamp_keys = (
-            primary_key_constraint["timeseries_columns"]
-            if "timeseries_columns" in primary_key_constraint
-            else []
-        )
-        primary_keys = [
-            c
-            for c in primary_key_constraint["child_columns"]
-            if c not in timestamp_keys
-        ]
-
-        columns = uc_get_table_response["columns"]
-        features = [c["name"] for c in columns]
-        partition_columns_unordered = [c for c in columns if "partition_index" in c]
-        partition_columns = [
-            c["name"]
-            for c in sorted(
-                partition_columns_unordered, key=lambda x: x["partition_index"]
-            )
-        ]
-
-        return cls(
-            name=table_name,
-            table_id=uc_get_table_response["table_id"],
-            description=uc_get_table_response["comment"]
-            if "comment" in uc_get_table_response
-            else "",
-            primary_keys=primary_keys,
-            partition_columns=partition_columns,
-            features=features,
-            creation_timestamp=uc_get_table_response["created_at"],
-            timestamp_keys=timestamp_keys,
-        )