wedata-feature-engineering 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feature_store/constants/__init__.py +0 -0
- feature_store/constants/constants.py +28 -0
- feature_store/entities/__init__.py +0 -0
- feature_store/entities/column_info.py +117 -0
- feature_store/entities/data_type.py +92 -0
- feature_store/entities/environment_variables.py +55 -0
- feature_store/entities/feature.py +53 -0
- feature_store/entities/feature_column_info.py +64 -0
- feature_store/entities/feature_function.py +55 -0
- feature_store/entities/feature_lookup.py +179 -0
- feature_store/entities/feature_spec.py +454 -0
- feature_store/entities/feature_spec_constants.py +25 -0
- feature_store/entities/feature_table.py +164 -0
- feature_store/entities/feature_table_info.py +40 -0
- feature_store/entities/function_info.py +184 -0
- feature_store/entities/on_demand_column_info.py +44 -0
- feature_store/entities/source_data_column_info.py +21 -0
- feature_store/entities/training_set.py +134 -0
- feature_store/feature_table_client/__init__.py +0 -0
- feature_store/feature_table_client/feature_table_client.py +313 -0
- feature_store/spark_client/__init__.py +0 -0
- feature_store/spark_client/spark_client.py +286 -0
- feature_store/training_set_client/__init__.py +0 -0
- feature_store/training_set_client/training_set_client.py +196 -0
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.2.dist-info/RECORD +30 -0
- wedata_feature_engineering-0.1.0.dist-info/RECORD +0 -6
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/WHEEL +0 -0
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/top_level.txt +0 -0
feature_store/constants/__init__.py (file without changes)

feature_store/constants/constants.py
@@ -0,0 +1,28 @@
+
+OVERWRITE = "overwrite"
+APPEND = "append"
+PATH = "path"
+TABLE = "table"
+CUSTOM = "custom"
+PREDICTION_COLUMN_NAME = "prediction"
+MODEL_DATA_PATH_ROOT = "feature_store"
+UTF8_BYTES_PER_CHAR = 4
+MAX_PRIMARY_KEY_STRING_LENGTH_CHARS = 100
+MAX_PRIMARY_KEY_STRING_LENGTH_BYTES = (
+    MAX_PRIMARY_KEY_STRING_LENGTH_CHARS * UTF8_BYTES_PER_CHAR
+)
+STREAMING_TRIGGER_CONTINUOUS = "continuous"
+STREAMING_TRIGGER_ONCE = "once"
+STREAMING_TRIGGER_PROCESSING_TIME = "processingTime"
+DEFAULT_WRITE_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 seconds"}
+_DEFAULT_PUBLISH_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 minutes"}
+
+
+_WARN = "WARN"
+_ERROR = "ERROR"
+_SOURCE_FORMAT_DELTA = "delta"
+
+_NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
+_USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
+_PREBUILT_ENV_URI = "prebuilt_env_uri"
+
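The trigger constants above are plain dictionaries shaped for Spark's DataStreamWriter keyword arguments. A minimal, illustrative sketch of how DEFAULT_WRITE_STREAM_TRIGGER could be consumed follows; the streaming source, checkpoint path, and table name are hypothetical and not taken from this package.

from pyspark.sql import SparkSession

from feature_store.constants.constants import DEFAULT_WRITE_STREAM_TRIGGER

spark = SparkSession.builder.getOrCreate()

# Any streaming DataFrame works; the rate source is used here only for illustration.
stream_df = spark.readStream.format("rate").load()

# DEFAULT_WRITE_STREAM_TRIGGER unpacks to trigger(processingTime="5 seconds"),
# matching the keyword arguments of DataStreamWriter.trigger.
query = (
    stream_df.writeStream
    .format("delta")  # _SOURCE_FORMAT_DELTA above suggests Delta is the expected sink format
    .option("checkpointLocation", "/tmp/checkpoints/example")  # hypothetical path
    .trigger(**DEFAULT_WRITE_STREAM_TRIGGER)
    .toTable("example_feature_table")  # hypothetical table name
)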
feature_store/entities/__init__.py (file without changes)

feature_store/entities/column_info.py
@@ -0,0 +1,117 @@
+import copy
+from typing import Optional, Union
+
+from feature_store.entities.feature_column_info import FeatureColumnInfo
+from feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
+    ON_DEMAND_COLUMN_INFO
+from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+
+class ColumnInfo:
+    """
+    ColumnInfo's structure and properties are mapped 1:1 to the ColumnInfo proto message, unless specified otherwise.
+    """
+
+    def __init__(
+        self,
+        info: Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo],
+        include: bool,
+        data_type: Optional[str] = None,
+        topological_ordering: Optional[int] = None,
+    ):
+        if not isinstance(
+            info, (SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo)
+        ):
+            raise ValueError(
+                "info must be one of SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo."
+            )
+        self._info = info
+        self._include = include
+        self._data_type = data_type
+        self._topological_ordering = topological_ordering
+
+    @property
+    def info(
+        self,
+    ) -> Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo]:
+        return self._info
+
+    @property
+    def include(self) -> bool:
+        return self._include
+
+    @property
+    def data_type(self) -> Optional[str]:
+        """
+        FeatureSpecs before v7 are not required to have data types.
+        """
+        return self._data_type
+
+    @property
+    def topological_ordering(self) -> Optional[int]:
+        """
+        FeatureSpecs before v8 are not required to have topological ordering.
+        """
+        return self._topological_ordering
+
+    @property
+    def output_name(self) -> str:
+        """
+        This field does not exist in the proto, and is provided for convenience.
+        """
+        return self.info.output_name
+
+    def with_topological_ordering(self, ordering: int):
+        new_column_info = copy.copy(self)
+        new_column_info._topological_ordering = ordering
+        return new_column_info
+
+    @classmethod
+    def from_proto(cls, column_info_proto):
+        if column_info_proto.HasField(SOURCE_DATA_COLUMN_INFO):
+            info = SourceDataColumnInfo.from_proto(
+                column_info_proto.source_data_column_info
+            )
+        elif column_info_proto.HasField(FEATURE_COLUMN_INFO):
+            info = FeatureColumnInfo.from_proto(column_info_proto.feature_column_info)
+        elif column_info_proto.HasField(ON_DEMAND_COLUMN_INFO):
+            info = OnDemandColumnInfo.from_proto(
+                column_info_proto.on_demand_column_info
+            )
+        else:
+            raise ValueError("Unsupported info type: " + str(column_info_proto))
+
+        data_type = (
+            column_info_proto.data_type
+            if column_info_proto.HasField("data_type")
+            else None
+        )
+        topological_ordering = (
+            column_info_proto.topological_ordering
+            if column_info_proto.HasField("topological_ordering")
+            else None
+        )
+        return ColumnInfo(
+            info=info,
+            include=column_info_proto.include,
+            data_type=data_type,
+            topological_ordering=topological_ordering,
+        )
+
+    # def to_proto(self):
+    #     column_info = ProtoColumnInfo(
+    #         include=self.include,
+    #         data_type=self.data_type,
+    #         topological_ordering=self.topological_ordering,
+    #     )
+    #     if isinstance(self.info, SourceDataColumnInfo):
+    #         column_info.source_data_column_info.CopyFrom(self.info.to_proto())
+    #     elif isinstance(self.info, FeatureColumnInfo):
+    #         column_info.feature_column_info.CopyFrom(self.info.to_proto())
+    #     elif isinstance(self.info, OnDemandColumnInfo):
+    #         column_info.on_demand_column_info.CopyFrom(self.info.to_proto())
+    #     else:
+    #         raise ValueError("Unsupported info type: " + str(self.info))
+    #
+    #     return column_info
feature_store/entities/data_type.py
@@ -0,0 +1,92 @@
+import json
+import re
+from typing import Any
+
+from pyspark.sql.types import ArrayType, DataType, DecimalType, MapType, StructType
+
+
+
+class DataType(_ProtoEnumEntity):
+    """Online store types."""
+
+    INTEGER = ProtoDataType.Value("INTEGER")
+    FLOAT = ProtoDataType.Value("FLOAT")
+    BOOLEAN = ProtoDataType.Value("BOOLEAN")
+    STRING = ProtoDataType.Value("STRING")
+    DOUBLE = ProtoDataType.Value("DOUBLE")
+    LONG = ProtoDataType.Value("LONG")
+    TIMESTAMP = ProtoDataType.Value("TIMESTAMP")
+    DATE = ProtoDataType.Value("DATE")
+    SHORT = ProtoDataType.Value("SHORT")
+    ARRAY = ProtoDataType.Value("ARRAY")
+    MAP = ProtoDataType.Value("MAP")
+    BINARY = ProtoDataType.Value("BINARY")
+    DECIMAL = ProtoDataType.Value("DECIMAL")
+    STRUCT = ProtoDataType.Value("STRUCT")
+
+    _FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
+
+    @classmethod
+    def _enum_type(cls) -> Any:
+        return ProtoDataType
+
+    @classmethod
+    def from_spark_type(cls, spark_type):
+        return cls.from_string(spark_type.typeName())
+
+    @classmethod
+    def spark_type_to_string(cls, spark_type):
+        return DataType.to_string(DataType.from_spark_type(spark_type))
+
+    @classmethod
+    def top_level_type_supported(cls, spark_type: DataType) -> bool:
+        """
+        Checks whether the provided Spark data type is supported by Feature Store, only considering
+        the top-level type for nested data types.
+
+        Details on nested types:
+          ArrayType: The elementType is not checked. Will return True.
+          MapType: The keyType and valueType are not checked. Will return True.
+          StructType: The struct fields are not checked. Will return True.
+        """
+        cls.init()
+        return spark_type.typeName().upper() in cls._STRING_TO_ENUM
+
+    @classmethod
+    def to_complex_spark_type(cls, json_value):
+        """
+        Constructs a complex Spark DataType from its compact JSON representation.
+
+        Examples:
+        - Input: '"decimal(1,2)"'
+          Output: DecimalType(1,2)
+        - Input: '{"containsNull":false,"elementType":"integer","type":"array"}'
+          Output: ArrayType(IntegerType,false)
+        - Input: '{"keyType":"integer","type":"map","valueContainsNull":True,"valueType":"integer"}'
+          Output: MapType(IntegerType,IntegerType,true)
+        """
+        if not json_value:
+            raise ValueError("Empty JSON value cannot be converted to Spark DataType")
+
+        json_data = json.loads(json_value)
+        if not isinstance(json_data, dict):
+            # DecimalType does not have fromJson() method
+            if json_value == "decimal":
+                return DecimalType()
+            if cls._FIXED_DECIMAL.match(json_data):
+                m = cls._FIXED_DECIMAL.match(json_data)
+                return DecimalType(int(m.group(1)), int(m.group(2)))
+
+        if json_data["type"].upper() == cls.to_string(cls.ARRAY):
+            return ArrayType.fromJson(json_data)
+
+        if json_data["type"].upper() == cls.to_string(cls.MAP):
+            return MapType.fromJson(json_data)
+
+        if json_data["type"].upper() == cls.to_string(cls.STRUCT):
+            return StructType.fromJson(json_data)
+
+        else:
+            raise ValueError(
+                f"Spark type {json_data['type']} cannot be converted to a complex Spark DataType"
+            )
feature_store/entities/environment_variables.py
@@ -0,0 +1,55 @@
+import os
+
+
+class _EnvironmentVariable:
+    """
+    Represents an environment variable for the feature store client for custom configurations as needed.
+    """
+
+    def __init__(self, name, type_, default):
+        self.name = name
+        self.type = type_
+        self.default = default
+
+    @property
+    def defined(self):
+        return self.name in os.environ
+
+    def get_raw(self):
+        return os.getenv(self.name)
+
+    def set(self, value):
+        os.environ[self.name] = str(value)
+
+    def unset(self):
+        os.environ.pop(self.name, None)
+
+    def get(self):
+        """
+        Reads the value of the environment variable if it exists and converts it to the desired
+        type. Otherwise, returns the default value.
+        """
+        if (val := self.get_raw()) is not None:
+            try:
+                return self.type(val)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to convert {val!r} to {self.type} for {self.name}: {e}"
+                )
+        return self.default
+
+    def __str__(self):
+        return f"{self.name} (default: {self.default}, type: {self.type.__name__})"
+
+    def __repr__(self):
+        return repr(self.name)
+
+    def __format__(self, format_spec: str) -> str:
+        return self.name.__format__(format_spec)
+
+
+# The threshold (in MB) where a broadcast join will be performed for the asof join for point in time feature join.
+# Default is 20MB as benchmarks show diminishing returns with broadcast past this value. The default Spark broadcast join threshold is 10MB.
+BROADCAST_JOIN_THRESHOLD = _EnvironmentVariable(
+    "BROADCAST_JOIN_THRESHOLD", int, 20 * 1024 * 1024
+)
feature_store/entities/feature.py
@@ -0,0 +1,53 @@
+
+
+class Feature:
+    def __init__(
+        self,
+        feature_table,
+        feature_id,
+        name,
+        data_type,
+        description,
+        data_type_details=None,
+    ):
+        self._feature_table = feature_table
+        self._name = name
+        self._data_type = data_type
+        self._description = description
+        self._data_type_details = data_type_details
+        self._feature_id = feature_id
+
+    @property
+    def feature_table(self):
+        return self._feature_table
+
+    @property
+    def feature_id(self):
+        return self._feature_id
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def data_type(self):
+        return self._data_type
+
+    @property
+    def data_type_details(self):
+        return self._data_type_details
+
+    @property
+    def description(self):
+        return self._description
+
+    @classmethod
+    def from_proto(cls, feature_proto):
+        return cls(
+            feature_table=feature_proto.table,
+            feature_id=feature_proto.id,
+            name=feature_proto.name,
+            data_type=feature_proto.data_type,
+            data_type_details=feature_proto.data_type_details,
+            description=feature_proto.description,
+        )
feature_store/entities/feature_column_info.py
@@ -0,0 +1,64 @@
+from typing import List, Optional
+
+
+
+class FeatureColumnInfo:
+    def __init__(
+        self,
+        table_name: str,
+        feature_name: str,
+        lookup_key: List[str],
+        output_name: str,
+        timestamp_lookup_key: Optional[List[str]] = None,
+    ):
+        if timestamp_lookup_key is None:
+            timestamp_lookup_key = []
+        if not table_name:
+            raise ValueError("table_name must be non-empty.")
+        if not feature_name:
+            raise ValueError("feature_name must be non-empty.")
+        if not isinstance(lookup_key, list):
+            raise ValueError("lookup_key must be a list.")
+        if not lookup_key or "" in lookup_key or None in lookup_key:
+            raise ValueError("lookup_key must be non-empty.")
+        if not output_name:
+            raise ValueError("output_name must be non-empty.")
+        if not isinstance(timestamp_lookup_key, list):
+            raise ValueError("timestamp_lookup_key must be a list.")
+
+        self._table_name = table_name
+        self._feature_name = feature_name
+        self._lookup_key = lookup_key
+        self._output_name = output_name
+        self._timestamp_lookup_key = timestamp_lookup_key
+
+    @property
+    def table_name(self):
+        return self._table_name
+
+    @property
+    def lookup_key(self):
+        return self._lookup_key
+
+    @property
+    def feature_name(self):
+        return self._feature_name
+
+    @property
+    def output_name(self):
+        return self._output_name
+
+    @property
+    def timestamp_lookup_key(self):
+        return self._timestamp_lookup_key
+
+    @classmethod
+    def from_proto(cls, feature_column_info_proto):
+        return cls(
+            table_name=feature_column_info_proto.table_name,
+            feature_name=feature_column_info_proto.feature_name,
+            lookup_key=list(feature_column_info_proto.lookup_key),
+            output_name=feature_column_info_proto.output_name,
+            timestamp_lookup_key=list(feature_column_info_proto.timestamp_lookup_key),
+        )
+
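A brief sketch of the constructor validation shown above; the table and column names are illustrative.

from feature_store.entities.feature_column_info import FeatureColumnInfo

# Valid: lookup_key and timestamp_lookup_key are always stored as lists.
info = FeatureColumnInfo(
    table_name="my_feature_table",
    feature_name="my_feature_1",
    lookup_key=["customer_id"],
    output_name="my_feature_1_renamed",
    timestamp_lookup_key=["event_ts"],
)
print(info.timestamp_lookup_key)  # ['event_ts']

# Invalid: an empty lookup_key fails the non-empty check in __init__.
try:
    FeatureColumnInfo(
        table_name="my_feature_table",
        feature_name="my_feature_1",
        lookup_key=[],
        output_name="out",
    )
except ValueError as e:
    print(e)  # lookup_key must be non-empty.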
feature_store/entities/feature_function.py
@@ -0,0 +1,55 @@
+from typing import Dict, Optional
+
+
+class FeatureFunction:
+
+    """
+    Feature function class.
+
+    A feature function is a user-defined function that combines features from feature tables into new features. A feature function can be any user-defined function, for example a Python UDF.
+
+    A FeatureFunction has the following attributes:
+    - udf_name: The name of the Python UDF to call.
+    - input_bindings: A dictionary that maps the inputs of the Python UDF to features in the training set.
+    - output_name: If provided, renames this feature in the :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+
+    """
+
+    def __init__(
+        self,
+        *,
+        udf_name: str,
+        input_bindings: Optional[Dict[str, str]] = None,
+        output_name: Optional[str] = None,
+    ):
+        """Initialize a FeatureFunction object. See class documentation."""
+        # UC function names are always lowercase.
+        self._udf_name = udf_name.lower()
+        self._input_bindings = input_bindings if input_bindings else {}
+        self._output_name = output_name
+
+    @property
+    def udf_name(self) -> str:
+        """
+        The name of the Python UDF called by this FeatureFunction.
+        """
+        return self._udf_name
+
+    @property
+    def input_bindings(self) -> Dict[str, str]:
+        """
+        The input to use for each argument of the Python UDF.
+
+        For example:
+
+        `{"x": "feature1", "y": "input1"}`
+        """
+        return self._input_bindings
+
+    @property
+    def output_name(self) -> Optional[str]:
+        """
+        The output name to use for the results of this FeatureFunction.
+        If empty, defaults to the fully qualified `udf_name` when evaluated.
+        """
+        return self._output_name
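A brief usage sketch of FeatureFunction as defined above; the UDF and column names are hypothetical.

from feature_store.entities.feature_function import FeatureFunction

# UDF names are lowercased on construction (UC function names are always lowercase).
fn = FeatureFunction(
    udf_name="Catalog.Schema.Compute_Ratio",  # hypothetical UDF name
    input_bindings={"numerator": "clicks", "denominator": "impressions"},
    output_name="click_through_rate",
)
print(fn.udf_name)        # catalog.schema.compute_ratio
print(fn.input_bindings)  # {'numerator': 'clicks', 'denominator': 'impressions'}
print(fn.output_name)     # click_through_rate

# input_bindings defaults to an empty dict when omitted.
print(FeatureFunction(udf_name="catalog.schema.no_arg_udf").input_bindings)  # {}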
feature_store/entities/feature_lookup.py
@@ -0,0 +1,179 @@
+import copy
+import datetime
+import logging
+from typing import Dict, List, Optional, Union
+
+from feature_store.utils import common_utils
+
+_logger = logging.getLogger(__name__)
+
+
+class FeatureLookup:
+
+    """
+    Feature lookup class.
+
+    A feature lookup specifies features in a feature table and how to join them to the features in a training set.
+
+    A FeatureLookup has the following attributes:
+
+    - table_name: The name of the feature table.
+    - lookup_key: The key(s) used to join the feature table with the training set. The lookup_key must be columns of the training set, and its types and order must match the primary keys of the feature table.
+    - feature_names: The names of the features to look up from the feature table. If your model requires primary keys as features, declare them as independent FeatureLookups.
+    - rename_outputs: If provided, renames features in the :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+    - timestamp_lookup_key: The timestamp key used to join the feature table with the training set. The timestamp_lookup_key must be a column of the training set, and its type must match the type of the feature table's timestamp key.
+    - lookback_window: The lookback window to use when performing point-in-time lookups on the feature table against the DataFrame passed to :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`. The feature store retrieves the latest feature value prior to the timestamp specified in the DataFrame's ``timestamp_lookup_key`` and within the ``lookback_window``, or null if no such feature value exists. When set to 0, only exact matches from the feature table are returned.
+    - feature_name: Feature name. **Deprecated**. Use `feature_names`.
+    - output_name: If provided, renames this feature in the :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`. **Deprecated**. Use `rename_outputs`.
+
+    Example:
+
+        from databricks.feature_store import FeatureLookup
+
+        lookup = FeatureLookup(
+            table_name="my_feature_table",
+            lookup_key="my_lookup_key",
+            feature_names=["my_feature_1", "my_feature_2"],
+            rename_outputs={"my_feature_1": "my_feature_1_renamed"},
+            timestamp_lookup_key="my_timestamp_lookup_key",
+            lookback_window=datetime.timedelta(days=1)
+        )
+
+    """
+
+    def __init__(
+        self,
+        table_name: str,
+        lookup_key: Union[str, List[str]],
+        *,
+        feature_names: Union[str, List[str], None] = None,
+        rename_outputs: Optional[Dict[str, str]] = None,
+        timestamp_lookup_key: Optional[str] = None,
+        lookback_window: Optional[datetime.timedelta] = None,
+        **kwargs,
+    ):
+        """Initialize a FeatureLookup object. See class documentation."""
+
+        self._feature_name_deprecated = kwargs.pop("feature_name", None)
+        self._output_name_deprecated = kwargs.pop("output_name", None)
+
+        if kwargs:
+            raise TypeError(
+                f"FeatureLookup got unexpected keyword argument(s): {list(kwargs.keys())}"
+            )
+
+        self._table_name = table_name
+
+        if type(timestamp_lookup_key) is list:
+            if len(timestamp_lookup_key) == 0:
+                timestamp_lookup_key = None
+            elif len(timestamp_lookup_key) == 1:
+                timestamp_lookup_key = timestamp_lookup_key[0]
+            else:
+                raise ValueError(
+                    f"Setting multiple timestamp lookup keys is not supported."
+                )
+
+        if rename_outputs is not None and not isinstance(rename_outputs, dict):
+            raise ValueError(
+                f"Unexpected type for rename_outputs: {type(rename_outputs)}"
+            )
+
+        self._feature_names = common_utils.as_list(feature_names, default=[])
+
+        # Make sure the user didn't accidentally pass in any nested lists/dicts in feature_names
+        for fn in self._feature_names:
+            if not isinstance(fn, str):
+                raise ValueError(
+                    f"Unexpected type for element in feature_names: {type(self._feature_names)}, only strings allowed in list"
+                )
+
+        if lookback_window is not None:
+            if not timestamp_lookup_key:
+                raise ValueError(
+                    f"Unexpected lookback_window value: {lookback_window}, lookback windows can only be applied on time series "
+                    f"feature tables. Use timestamp_lookup_key to perform point-in-time lookups with lookback window."
+                )
+            if not isinstance(
+                lookback_window, datetime.timedelta
+            ) or lookback_window < datetime.timedelta(0):
+                raise ValueError(
+                    f"Unexpected value for lookback_window: {lookback_window}, only non-negative datetime.timedelta allowed."
+                )
+
+        self._lookup_key = copy.copy(lookup_key)
+        self._timestamp_lookup_key = copy.copy(timestamp_lookup_key)
+        self._lookback_window = copy.copy(lookback_window)
+
+        self._rename_outputs = {}
+        if rename_outputs is not None:
+            self._rename_outputs = rename_outputs.copy()
+
+        self._inject_deprecated_feature_name()
+        self._inject_deprecated_output_name()
+
+    @property
+    def table_name(self):
+        """The table name to use in this FeatureLookup."""
+        return self._table_name
+
+    @property
+    def lookup_key(self):
+        """The lookup key(s) to use in this FeatureLookup."""
+        return self._lookup_key
+
+    @property
+    def feature_name(self):
+        """The feature name to use in this FeatureLookup. **Deprecated**. Use `feature_names`."""
+        return self._feature_name_deprecated
+
+    @property
+    def feature_names(self):
+        """The feature names to use in this FeatureLookup."""
+        return self._feature_names
+
+    @property
+    def output_name(self):
+        """The output name to use in this FeatureLookup. **Deprecated**. Use `feature_names`."""
+        if self._output_name_deprecated:
+            return self._output_name_deprecated
+        else:
+            return self._feature_name_deprecated
+
+    @property
+    def timestamp_lookup_key(self):
+        return self._timestamp_lookup_key
+
+    @property
+    def lookback_window(self):
+        """A lookback window applied only for point-in-time lookups."""
+        return self._lookback_window
+
+    def _get_feature_names(self):
+        return self._feature_names
+
+    def _get_output_name(self, feature_name):
+        """Lookup the renamed output, or fallback to the feature name itself if no mapping is present"""
+        return self._rename_outputs.get(feature_name, feature_name)
+
+    def _inject_deprecated_feature_name(self):
+        if self._feature_name_deprecated:
+            if len(self._feature_names) > 0:
+                raise ValueError(
+                    "Use either feature_names or feature_name parameter, but not both."
+                )
+            _logger.warning(
+                f'The feature_name parameter is deprecated. Use "feature_names".'
+            )
+            self._feature_names = [self._feature_name_deprecated]
+
+    def _inject_deprecated_output_name(self):
+        if len(self._feature_names) == 1 and self._output_name_deprecated:
+            if len(self._rename_outputs) > 0:
+                raise ValueError(
+                    "Use either output_name or rename_outputs parameter, but not both."
+                )
+            _logger.warning(
+                f'The output_name parameter is deprecated. Use "rename_outputs".'
+            )
+            self._rename_outputs[self._feature_names[0]] = self._output_name_deprecated
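A sketch of how the deprecated feature_name and output_name keyword arguments are folded into feature_names and rename_outputs by the _inject_* helpers above; the table and column names are illustrative, and _get_output_name is an internal helper used here only to show the resulting mapping.

from feature_store.entities.feature_lookup import FeatureLookup

# Constructing with the deprecated kwargs logs a deprecation warning for each,
# then rewrites them into the non-deprecated fields.
legacy = FeatureLookup(
    table_name="my_feature_table",
    lookup_key="customer_id",
    feature_name="my_feature_1",
    output_name="my_feature_1_renamed",
)
print(legacy.feature_names)                      # ['my_feature_1']
print(legacy._get_output_name("my_feature_1"))   # my_feature_1_renamed
print(legacy._get_output_name("other_feature"))  # other_feature (no mapping, falls back to the feature name)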