tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
- wedata/__init__.py +9 -0
- wedata/feature_store/__init__.py +0 -0
- wedata/feature_store/client.py +462 -0
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +86 -0
- wedata/feature_store/cloud_sdk_client/models.py +686 -0
- wedata/feature_store/cloud_sdk_client/utils.py +32 -0
- wedata/feature_store/common/__init__.py +0 -0
- wedata/feature_store/common/protos/__init__.py +0 -0
- wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +48 -0
- wedata/feature_store/constants/__init__.py +0 -0
- wedata/feature_store/constants/constants.py +59 -0
- wedata/feature_store/constants/engine_types.py +34 -0
- wedata/feature_store/entities/__init__.py +0 -0
- wedata/feature_store/entities/column_info.py +138 -0
- wedata/feature_store/entities/environment_variables.py +55 -0
- wedata/feature_store/entities/feature.py +53 -0
- wedata/feature_store/entities/feature_column_info.py +72 -0
- wedata/feature_store/entities/feature_function.py +55 -0
- wedata/feature_store/entities/feature_lookup.py +200 -0
- wedata/feature_store/entities/feature_spec.py +489 -0
- wedata/feature_store/entities/feature_spec_constants.py +25 -0
- wedata/feature_store/entities/feature_table.py +111 -0
- wedata/feature_store/entities/feature_table_info.py +49 -0
- wedata/feature_store/entities/function_info.py +90 -0
- wedata/feature_store/entities/on_demand_column_info.py +57 -0
- wedata/feature_store/entities/source_data_column_info.py +24 -0
- wedata/feature_store/entities/training_set.py +135 -0
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +482 -0
- wedata/feature_store/feature_table_client/__init__.py +0 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
- wedata/feature_store/mlflow_model.py +17 -0
- wedata/feature_store/spark_client/__init__.py +0 -0
- wedata/feature_store/spark_client/spark_client.py +289 -0
- wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata/feature_store/training_set_client/training_set_client.py +572 -0
- wedata/feature_store/utils/__init__.py +0 -0
- wedata/feature_store/utils/common_utils.py +352 -0
- wedata/feature_store/utils/env_utils.py +86 -0
- wedata/feature_store/utils/feature_lookup_utils.py +564 -0
- wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata/feature_store/utils/feature_utils.py +73 -0
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- wedata/feature_store/utils/schema_utils.py +117 -0
- wedata/feature_store/utils/signature_utils.py +202 -0
- wedata/feature_store/utils/topological_sort.py +158 -0
- wedata/feature_store/utils/training_set_utils.py +579 -0
- wedata/feature_store/utils/uc_utils.py +296 -0
- wedata/feature_store/utils/validation_utils.py +79 -0
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +448 -0
- wedata/tempo/intervals.py +1331 -0
- wedata/tempo/io.py +61 -0
- wedata/tempo/ml.py +129 -0
- wedata/tempo/resample.py +318 -0
- wedata/tempo/tsdf.py +1720 -0
- wedata/tempo/utils.py +254 -0
|
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: feature_store.proto
"""Generated protocol buffer code."""
from google.protobuf.internal import builder as _builder
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import reflection as _reflection
from google.protobuf import message as _message
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


# Serialized FileDescriptorProto for feature_store.proto. Registering it in the
# default pool defines the messages SourceDataColumnInfo, FeatureColumnInfo,
# OnDemandColumnInfo, InputBinding, ColumnInfo, FeatureTableInfo,
# FunctionParameterInfo, FunctionInfo and FeatureSpec under the
# feature_store.common.protos package.
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13\x66\x65\x61ture_store.proto\x12\x1b\x66\x65\x61ture_store.common.protos\"$\n\x14SourceDataColumnInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\"\x84\x01\n\x11\x46\x65\x61tureColumnInfo\x12\x12\n\ntable_name\x18\x01 \x01(\t\x12\x14\n\x0c\x66\x65\x61ture_name\x18\x02 \x01(\t\x12\x12\n\nlookup_key\x18\x03 \x03(\t\x12\x13\n\x0boutput_name\x18\x04 \x01(\t\x12\x1c\n\x14timestamp_lookup_key\x18\x05 \x03(\t\"~\n\x12OnDemandColumnInfo\x12\x10\n\x08udf_name\x18\x01 \x01(\t\x12\x41\n\x0einput_bindings\x18\x02 \x03(\x0b\x32).feature_store.common.protos.InputBinding\x12\x13\n\x0boutput_name\x18\x03 \x01(\t\"3\n\x0cInputBinding\x12\x11\n\tparameter\x18\x01 \x01(\t\x12\x10\n\x08\x62ound_to\x18\x02 \x01(\t\"\xfc\x02\n\nColumnInfo\x12T\n\x17source_data_column_info\x18\x01 \x01(\x0b\x32\x31.feature_store.common.protos.SourceDataColumnInfoH\x00\x12M\n\x13\x66\x65\x61ture_column_info\x18\x02 \x01(\x0b\x32..feature_store.common.protos.FeatureColumnInfoH\x00\x12P\n\x15on_demand_column_info\x18\x03 \x01(\x0b\x32/.feature_store.common.protos.OnDemandColumnInfoH\x00\x12\x14\n\x07include\x18\x04 \x01(\x08H\x01\x88\x01\x01\x12\x11\n\tdata_type\x18\x05 \x01(\t\x12!\n\x14topological_ordering\x18\x06 \x01(\x05H\x02\x88\x01\x01\x42\x06\n\x04infoB\n\n\x08_includeB\x17\n\x15_topological_ordering\"Q\n\x10\x46\x65\x61tureTableInfo\x12\x12\n\ntable_name\x18\x01 \x01(\t\x12\x10\n\x08table_id\x18\x02 \x01(\t\x12\x17\n\x0flookback_window\x18\x03 \x01(\x01\"3\n\x15\x46unctionParameterInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\"\xa2\x01\n\x0c\x46unctionInfo\x12\x11\n\tfull_name\x18\x01 \x01(\t\x12H\n\x0cinput_params\x18\x02 \x03(\x0b\x32\x32.feature_store.common.protos.FunctionParameterInfo\x12\x1a\n\x12routine_definition\x18\x03 \x01(\t\x12\x19\n\x11\x65xternal_language\x18\x04 \x01(\t\"\xb1\x02\n\x0b\x46\x65\x61tureSpec\x12>\n\rinput_columns\x18\x01 \x03(\x0b\x32\'.feature_store.common.protos.ColumnInfo\x12\x43\n\x0cinput_tables\x18\x02 \x03(\x0b\x32-.feature_store.common.protos.FeatureTableInfo\x12\x42\n\x0finput_functions\x18\x03 \x03(\x0b\x32).feature_store.common.protos.FunctionInfo\x12\x14\n\x0cworkspace_id\x18\x04 \x01(\x03\x12$\n\x1c\x66\x65\x61ture_store_client_version\x18\x05 \x01(\t\x12\x1d\n\x15serialization_version\x18\x06 \x01(\x05\x62\x06proto3')

_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'feature_store_pb2', globals())
# NOTE(review): BuildTopDescriptorsAndMessages above already creates a
# FeatureSpec class in globals(); the manual _reflection definition below
# re-creates it, and its '__module__' says 'feature_spec_pb2' rather than
# 'feature_store_pb2'. This looks hand-edited into a generated file — confirm
# it is intentional before regenerating from the .proto.
_FEATURESPEC = DESCRIPTOR.message_types_by_name['FeatureSpec']
FeatureSpec = _reflection.GeneratedProtocolMessageType('FeatureSpec', (_message.Message,), {
  'DESCRIPTOR' : _FEATURESPEC,
  '__module__' : 'feature_spec_pb2'
  # @@protoc_insertion_point(class_scope:featurestorecommon.FeatureSpec)
  })
# Byte offsets into the serialized file, only needed by the pure-Python
# descriptor implementation.
if _descriptor._USE_C_DESCRIPTORS == False:

  DESCRIPTOR._options = None
  _SOURCEDATACOLUMNINFO._serialized_start=52
  _SOURCEDATACOLUMNINFO._serialized_end=88
  _FEATURECOLUMNINFO._serialized_start=91
  _FEATURECOLUMNINFO._serialized_end=223
  _ONDEMANDCOLUMNINFO._serialized_start=225
  _ONDEMANDCOLUMNINFO._serialized_end=351
  _INPUTBINDING._serialized_start=353
  _INPUTBINDING._serialized_end=404
  _COLUMNINFO._serialized_start=407
  _COLUMNINFO._serialized_end=787
  _FEATURETABLEINFO._serialized_start=789
  _FEATURETABLEINFO._serialized_end=870
  _FUNCTIONPARAMETERINFO._serialized_start=872
  _FUNCTIONPARAMETERINFO._serialized_end=923
  _FUNCTIONINFO._serialized_start=926
  _FUNCTIONINFO._serialized_end=1088
  _FEATURESPEC._serialized_start=1091
  _FEATURESPEC._serialized_end=1396
# @@protoc_insertion_point(module_scope)
|
|
File without changes
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
__doc__ = """
|
|
4
|
+
Feature Redis存储配置
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RedisStoreConfig:
    """Connection settings for a Redis feature store.

    Holds host/port/db/password plus an optional cloud instance id and can
    render itself as a Redis connection string. The password is deliberately
    omitted from ``repr``/``str`` so it does not leak into logs.
    """

    def __init__(self, host='localhost', port=6379, db=0, password=None, instance_id=None):
        self._host = host
        self._port = port
        self._db = db
        self._password = password
        self._instance_id = instance_id

    @property
    def host(self):
        """Redis server hostname."""
        return self._host

    @property
    def port(self):
        """Redis server port."""
        return self._port

    @property
    def db(self):
        """Redis logical database index."""
        return self._db

    @property
    def password(self):
        """Password for AUTH, or None when no authentication is configured."""
        return self._password

    @property
    def instance_id(self):
        """Cloud Redis instance identifier, if any."""
        return self._instance_id

    @property
    def connection_string(self):
        """Connection string in ``host:port,db=N[,password=...]`` form."""
        base = f"{self.host}:{self.port},db={self.db}"
        return f"{base},password={self.password}" if self.password else base

    def __repr__(self):
        # Password intentionally excluded.
        return f"RedisStoreConfig(host={self.host}, port={self.port}, db={self.db}, instance_id={self.instance_id})"

    def __str__(self):
        return repr(self)
|
File without changes
|
|

# Write modes / source kinds for feature tables.
OVERWRITE = "overwrite"
APPEND = "append"
PATH = "path"
TABLE = "table"
CUSTOM = "custom"
PREDICTION_COLUMN_NAME = "prediction"
MODEL_DATA_PATH_ROOT = "feature_store"

# FeatureStoreClient.log_model logs models containing this folder, which holds
# the original model's MLmodel files used for inference.
# (Fix: this constant was previously defined twice in this module with the
# same value; the duplicate definition has been removed.)
RAW_MODEL_FOLDER = "raw_model"

# A UTF-8 character occupies at most 4 bytes.
UTF8_BYTES_PER_CHAR = 4
MAX_PRIMARY_KEY_STRING_LENGTH_CHARS = 100
MAX_PRIMARY_KEY_STRING_LENGTH_BYTES = (
    MAX_PRIMARY_KEY_STRING_LENGTH_CHARS * UTF8_BYTES_PER_CHAR
)

# Structured-streaming trigger modes and defaults.
STREAMING_TRIGGER_CONTINUOUS = "continuous"
STREAMING_TRIGGER_ONCE = "once"
STREAMING_TRIGGER_PROCESSING_TIME = "processingTime"
DEFAULT_WRITE_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 seconds"}
_DEFAULT_PUBLISH_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 minutes"}
FEATURE_STORE_CLIENT = "FeatureStoreClient"


_WARN = "WARN"
_ERROR = "ERROR"
_SOURCE_FORMAT_DELTA = "delta"

_NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
_USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
_PREBUILT_ENV_URI = "prebuilt_env_uri"

# MLflow-model related constants (formerly mlflow_model_constants.py).
# Module name of the original mlflow_model.
MLFLOW_MODEL_NAME = "wedata.feature_store.mlflow_model"

# MLmodel file name constant.
ML_MODEL = "MLmodel"

# PyPI package name of the feature-lookup client.
FEATURE_LOOKUP_CLIENT_PIP_PACKAGE = "tencent-wedata-feature-engineering"

# Feature-lookup client version.
FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = "0.1.0"

# Internal data directory of the feature store.
FEATURE_STORE_INTERNAL_DATA_DIR = "_wedata_internal/"
WEDATA_DEFAULT_FEATURE_STORE_DATABASE = "WEDATA_DEFAULT_FEATURE_STORE_DATABASE"

# Feature-table properties (table-level metadata keys).
FEATURE_TABLE_KEY = "wedata.feature_table"
FEATURE_TABLE_VALUE = "true"

FEATURE_TABLE_PROJECT = "wedata.feature_project_id"
FEATURE_TABLE_TIMESTAMP = "timestampKeys"
FEATURE_TABLE_BACKUP_PRIMARY_KEY = "primaryKeys"  # fallback marker for primary keys
FEATURE_DLC_TABLE_PRIMARY_KEY = "dlc.ao.data.govern.sorted.keys"
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EngineTypes(Enum):
    """Table storage engines supported by the feature store."""

    HIVE_ENGINE = "hive"
    ICEBERG_ENGINE = "iceberg"

    @classmethod
    def get_engine(cls, engine_name: str) -> 'EngineTypes':
        """Resolve a case-insensitive engine name to an enum member.

        Raises:
            ValueError: if the name matches no supported engine.
        """
        normalized = engine_name.lower()
        for member in cls:
            if member.value == normalized:
                return member
        raise ValueError(f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}")
|
17
|
+
class CalculateEngineTypes(Enum):
    """Compute engines on which feature-store workloads may run."""

    DLC = "dlc"
    EMR = "emr"

    @classmethod
    def get_calculate_engine(cls, engine_name: str) -> 'CalculateEngineTypes':
        """Resolve a case-insensitive engine name to an enum member.

        Raises:
            ValueError: if the name matches no supported engine.
        """
        normalized = engine_name.lower()
        for member in cls:
            if member.value == normalized:
                return member
        raise ValueError(f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}")


def judge_engine_type() -> 'CalculateEngineTypes':
    """Detect the current compute engine from the environment.

    A non-empty ``DLC_REGION`` environment variable indicates DLC; anything
    else is treated as EMR.
    """
    return (
        CalculateEngineTypes.DLC
        if os.environ.get("DLC_REGION", "")
        else CalculateEngineTypes.EMR
    )
|
File without changes
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from typing import Optional, Union
|
|
3
|
+
|
|
4
|
+
from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
|
|
5
|
+
from wedata.feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
|
|
6
|
+
ON_DEMAND_COLUMN_INFO
|
|
7
|
+
from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
|
|
8
|
+
from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
|
|
9
|
+
|
|
10
|
+
from wedata.feature_store.common.protos import feature_store_pb2
|
|
11
|
+
|
|
12
|
+
class ColumnInfo:
    """
    ColumnInfo's structure and properties are mapped 1:1 to the ColumnInfo proto message, unless specified otherwise.

    Wraps exactly one of SourceDataColumnInfo / FeatureColumnInfo /
    OnDemandColumnInfo (mirroring the proto's oneof) together with the
    column-level fields ``include``, ``data_type`` and ``topological_ordering``.
    """

    def __init__(
        self,
        info: Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo],
        include: bool,
        data_type: Optional[str] = None,
        topological_ordering: Optional[int] = None,
    ):
        # Only the three payload types mapped by the proto oneof are accepted.
        if not isinstance(
            info, (SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo)
        ):
            raise ValueError(
                "info must be one of SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo."
            )
        self._info = info
        self._include = include
        self._data_type = data_type
        self._topological_ordering = topological_ordering

    @property
    def info(
        self,
    ) -> Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo]:
        """The wrapped column payload (source / feature / on-demand)."""
        return self._info

    @property
    def include(self) -> bool:
        """Whether the column is included in training-set output."""
        return self._include

    @property
    def data_type(self) -> Optional[str]:
        """
        FeatureSpecs before v7 are not required to have data types.
        """
        return self._data_type

    @property
    def topological_ordering(self) -> Optional[int]:
        """
        FeatureSpecs before v8 are not required to have topological ordering.
        """
        return self._topological_ordering

    @property
    def output_name(self) -> str:
        """
        This field does not exist in the proto, and is provided for convenience.
        """
        # Delegates to the payload; all three payload types expose output_name.
        return self.info.output_name

    def with_topological_ordering(self, ordering: int) -> "ColumnInfo":
        """Return a shallow copy with its topological ordering replaced.

        The original instance is left unmodified.
        """
        new_column_info = copy.copy(self)
        new_column_info._topological_ordering = ordering
        return new_column_info

    @classmethod
    def from_proto(cls, column_info_proto) -> "ColumnInfo":
        """Deserialize a ColumnInfo from its proto message.

        Raises:
            ValueError: if the proto's oneof carries no recognized payload.
        """
        if column_info_proto.HasField(SOURCE_DATA_COLUMN_INFO):
            info = SourceDataColumnInfo.from_proto(
                column_info_proto.source_data_column_info
            )
        elif column_info_proto.HasField(FEATURE_COLUMN_INFO):
            info = FeatureColumnInfo.from_proto(column_info_proto.feature_column_info)
        elif column_info_proto.HasField(ON_DEMAND_COLUMN_INFO):
            info = OnDemandColumnInfo.from_proto(
                column_info_proto.on_demand_column_info
            )
        else:
            raise ValueError("Unsupported info type: " + str(column_info_proto))

        # NOTE(review): an empty-string data_type is coerced to None here via
        # `or`, unlike the HasField-based variant kept below for reference.
        data_type = column_info_proto.data_type or None
        # data_type = (
        #     column_info_proto.data_type
        #     if column_info_proto.HasField("data_type")
        #     else None
        # )
        # NOTE(review): an unset topological_ordering becomes 0 (not None) even
        # though the property is typed Optional[int] — confirm this is intended;
        # the commented HasField variant below would preserve None.
        topological_ordering = column_info_proto.topological_ordering or 0
        # topological_ordering = (
        #     column_info_proto.topological_ordering
        #     if column_info_proto.HasField("topological_ordering")
        #     else None
        # )

        return ColumnInfo(
            info=info,
            include=column_info_proto.include,
            data_type=data_type,
            topological_ordering=topological_ordering,
        )

    def to_proto(self):
        """Serialize to a feature_store_pb2.ColumnInfo message.

        Raises:
            ValueError: if the wrapped payload is of an unsupported type
                (unreachable when constructed through __init__).
        """
        column_info = feature_store_pb2.ColumnInfo(
            include=self.include,
            data_type=self.data_type,
            topological_ordering=self.topological_ordering,
        )

        # Copy the payload into the matching oneof field.
        if isinstance(self.info, SourceDataColumnInfo):
            column_info.source_data_column_info.CopyFrom(self.info.to_proto())
        elif isinstance(self.info, FeatureColumnInfo):
            column_info.feature_column_info.CopyFrom(self.info.to_proto())
        elif isinstance(self.info, OnDemandColumnInfo):
            column_info.on_demand_column_info.CopyFrom(self.info.to_proto())
        else:
            raise ValueError("Unsupported info type: " + str(self.info))

        return column_info
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class _EnvironmentVariable:
    """
    Represents an environment variable for the feature store client for custom configurations as needed.

    Wraps a variable name with a conversion type and a default value used when
    the variable is absent from the process environment.
    """

    def __init__(self, name, type_, default):
        self.name = name
        self.type = type_
        self.default = default

    @property
    def defined(self):
        """True when the variable is currently present in os.environ."""
        return self.name in os.environ

    def get_raw(self):
        """Return the raw string value, or None when unset."""
        return os.getenv(self.name)

    def set(self, value):
        """Set the variable to str(value) in this process's environment."""
        os.environ[self.name] = str(value)

    def unset(self):
        """Remove the variable from the environment; no-op when absent."""
        os.environ.pop(self.name, None)

    def get(self):
        """
        Reads the value of the environment variable if it exists and converts it to the desired
        type. Otherwise, returns the default value.
        """
        raw = self.get_raw()
        if raw is None:
            return self.default
        try:
            return self.type(raw)
        except Exception as e:
            raise ValueError(
                f"Failed to convert {raw!r} to {self.type} for {self.name}: {e}"
            )

    def __str__(self):
        return f"{self.name} (default: {self.default}, type: {self.type.__name__})"

    def __repr__(self):
        return repr(self.name)

    def __format__(self, format_spec: str) -> str:
        return self.name.__format__(format_spec)
|
|
50
|
+
|
|
51
|
+
# Size threshold under which the as-of (point-in-time) feature join broadcasts
# the smaller side. The value is in BYTES: 20 * 1024 * 1024 = 20 MB (the prior
# comment said "in MB", but the default below is a byte count). Benchmarks
# showed diminishing returns from broadcasting past ~20 MB; Spark's own default
# broadcast-join threshold is 10 MB.
BROADCAST_JOIN_THRESHOLD = _EnvironmentVariable(
    "BROADCAST_JOIN_THRESHOLD", int, 20 * 1024 * 1024
)
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
class Feature:
    """A single feature (column) belonging to a feature table.

    Read-only value holder; all fields are exposed through properties.
    """

    def __init__(
        self,
        feature_table,
        feature_id,
        name,
        data_type,
        description,
        data_type_details=None,
    ):
        self._feature_table = feature_table
        self._feature_id = feature_id
        self._name = name
        self._data_type = data_type
        self._data_type_details = data_type_details
        self._description = description

    @property
    def feature_table(self):
        """The table this feature belongs to."""
        return self._feature_table

    @property
    def feature_id(self):
        """Identifier of the feature."""
        return self._feature_id

    @property
    def name(self):
        """Column name of the feature."""
        return self._name

    @property
    def data_type(self):
        """Declared data type of the feature."""
        return self._data_type

    @property
    def data_type_details(self):
        """Additional type detail carried by the proto; may be None."""
        return self._data_type_details

    @property
    def description(self):
        """Human-readable description of the feature."""
        return self._description

    @classmethod
    def from_proto(cls, feature_proto):
        """Build a Feature from its proto representation."""
        return cls(
            feature_table=feature_proto.table,
            feature_id=feature_proto.id,
            name=feature_proto.name,
            data_type=feature_proto.data_type,
            description=feature_proto.description,
            data_type_details=feature_proto.data_type_details,
        )
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
from wedata.feature_store.common.protos import feature_store_pb2
|
|
4
|
+
|
|
5
|
+
class FeatureColumnInfo:
    """Describes a feature column resolved by looking up a feature table.

    Mirrors the FeatureColumnInfo proto message one-to-one.
    """

    def __init__(
        self,
        table_name: str,
        feature_name: str,
        lookup_key: List[str],
        output_name: str,
        timestamp_lookup_key: Optional[List[str]] = None,
    ):
        if timestamp_lookup_key is None:
            timestamp_lookup_key = []

        # Fail fast on malformed specs. Checks run in argument order so the
        # first bad argument is the one reported.
        if not table_name:
            raise ValueError("table_name must be non-empty.")
        if not feature_name:
            raise ValueError("feature_name must be non-empty.")
        if not isinstance(lookup_key, list):
            raise ValueError("lookup_key must be a list.")
        if not lookup_key or any(key in ("", None) for key in lookup_key):
            raise ValueError("lookup_key must be non-empty.")
        if not output_name:
            raise ValueError("output_name must be non-empty.")
        if not isinstance(timestamp_lookup_key, list):
            raise ValueError("timestamp_lookup_key must be a list.")

        self._table_name = table_name
        self._feature_name = feature_name
        self._lookup_key = lookup_key
        self._output_name = output_name
        self._timestamp_lookup_key = timestamp_lookup_key

    @property
    def table_name(self):
        """Fully qualified name of the feature table."""
        return self._table_name

    @property
    def lookup_key(self):
        """Primary-key columns used to join against the feature table."""
        return self._lookup_key

    @property
    def feature_name(self):
        """Name of the feature column in the feature table."""
        return self._feature_name

    @property
    def output_name(self):
        """Name of the column in the resulting training set."""
        return self._output_name

    @property
    def timestamp_lookup_key(self):
        """Timestamp columns for point-in-time lookup; empty when unused."""
        return self._timestamp_lookup_key

    @classmethod
    def from_proto(cls, feature_column_info_proto):
        """Build a FeatureColumnInfo from its proto representation."""
        return cls(
            table_name=feature_column_info_proto.table_name,
            feature_name=feature_column_info_proto.feature_name,
            lookup_key=list(feature_column_info_proto.lookup_key),
            output_name=feature_column_info_proto.output_name,
            timestamp_lookup_key=list(feature_column_info_proto.timestamp_lookup_key),
        )

    def to_proto(self):
        """Serialize to a feature_store_pb2.FeatureColumnInfo message."""
        return feature_store_pb2.FeatureColumnInfo(
            table_name=self._table_name,
            feature_name=self._feature_name,
            lookup_key=self._lookup_key,
            output_name=self._output_name,
            timestamp_lookup_key=self._timestamp_lookup_key,
        )
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class FeatureFunction:
    """A user-defined function (e.g. a Python UDF) that derives a new feature
    from existing features and inputs of a training set.

    Attributes:
        udf_name: name of the UDF to invoke (stored lowercased, since UC
            function names are always lowercase).
        input_bindings: maps each UDF parameter to the training-set column
            bound to it.
        output_name: optional rename for the resulting feature column in the
            returned TrainingSet.
    """

    def __init__(
        self,
        *,
        udf_name: str,
        input_bindings: Optional[Dict[str, str]] = None,
        output_name: Optional[str] = None,
    ):
        """Initialize a FeatureFunction object. See class documentation."""
        # UC function names are always lowercase.
        self._udf_name = udf_name.lower()
        # `or {}` keeps the caller's dict object when one was provided.
        self._input_bindings = input_bindings or {}
        self._output_name = output_name

    @property
    def udf_name(self) -> str:
        """
        The name of the Python UDF called by this FeatureFunction.
        """
        return self._udf_name

    @property
    def input_bindings(self) -> Dict[str, str]:
        """
        The input to use for each argument of the Python UDF.

        For example:

        `{"x": "feature1", "y": "input1"}`
        """
        return self._input_bindings

    @property
    def output_name(self) -> Optional[str]:
        """
        The output name to use for the results of this FeatureFunction.
        If empty, defaults to the fully qualified `udf_name` when evaluated.
        """
        return self._output_name