tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
@@ -0,0 +1,49 @@
+ # -*- coding: utf-8 -*-
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
+ # source: feature_store.proto
+ """Generated protocol buffer code."""
+ from google.protobuf.internal import builder as _builder
+ from google.protobuf import descriptor as _descriptor
+ from google.protobuf import descriptor_pool as _descriptor_pool
+ from google.protobuf import symbol_database as _symbol_database
+ from google.protobuf import reflection as _reflection
+ from google.protobuf import message as _message
+ # @@protoc_insertion_point(imports)
+
+ _sym_db = _symbol_database.Default()
+
+
+
+
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13\x66\x65\x61ture_store.proto\x12\x1b\x66\x65\x61ture_store.common.protos\"$\n\x14SourceDataColumnInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\"\x84\x01\n\x11\x46\x65\x61tureColumnInfo\x12\x12\n\ntable_name\x18\x01 \x01(\t\x12\x14\n\x0c\x66\x65\x61ture_name\x18\x02 \x01(\t\x12\x12\n\nlookup_key\x18\x03 \x03(\t\x12\x13\n\x0boutput_name\x18\x04 \x01(\t\x12\x1c\n\x14timestamp_lookup_key\x18\x05 \x03(\t\"~\n\x12OnDemandColumnInfo\x12\x10\n\x08udf_name\x18\x01 \x01(\t\x12\x41\n\x0einput_bindings\x18\x02 \x03(\x0b\x32).feature_store.common.protos.InputBinding\x12\x13\n\x0boutput_name\x18\x03 \x01(\t\"3\n\x0cInputBinding\x12\x11\n\tparameter\x18\x01 \x01(\t\x12\x10\n\x08\x62ound_to\x18\x02 \x01(\t\"\xfc\x02\n\nColumnInfo\x12T\n\x17source_data_column_info\x18\x01 \x01(\x0b\x32\x31.feature_store.common.protos.SourceDataColumnInfoH\x00\x12M\n\x13\x66\x65\x61ture_column_info\x18\x02 \x01(\x0b\x32..feature_store.common.protos.FeatureColumnInfoH\x00\x12P\n\x15on_demand_column_info\x18\x03 \x01(\x0b\x32/.feature_store.common.protos.OnDemandColumnInfoH\x00\x12\x14\n\x07include\x18\x04 \x01(\x08H\x01\x88\x01\x01\x12\x11\n\tdata_type\x18\x05 \x01(\t\x12!\n\x14topological_ordering\x18\x06 \x01(\x05H\x02\x88\x01\x01\x42\x06\n\x04infoB\n\n\x08_includeB\x17\n\x15_topological_ordering\"Q\n\x10\x46\x65\x61tureTableInfo\x12\x12\n\ntable_name\x18\x01 \x01(\t\x12\x10\n\x08table_id\x18\x02 \x01(\t\x12\x17\n\x0flookback_window\x18\x03 \x01(\x01\"3\n\x15\x46unctionParameterInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\"\xa2\x01\n\x0c\x46unctionInfo\x12\x11\n\tfull_name\x18\x01 \x01(\t\x12H\n\x0cinput_params\x18\x02 \x03(\x0b\x32\x32.feature_store.common.protos.FunctionParameterInfo\x12\x1a\n\x12routine_definition\x18\x03 \x01(\t\x12\x19\n\x11\x65xternal_language\x18\x04 \x01(\t\"\xb1\x02\n\x0b\x46\x65\x61tureSpec\x12>\n\rinput_columns\x18\x01 \x03(\x0b\x32\'.feature_store.common.protos.ColumnInfo\x12\x43\n\x0cinput_tables\x18\x02 \x03(\x0b\x32-.feature_store.common.protos.FeatureTableInfo\x12\x42\n\x0finput_functions\x18\x03 \x03(\x0b\x32).feature_store.common.protos.FunctionInfo\x12\x14\n\x0cworkspace_id\x18\x04 \x01(\x03\x12$\n\x1c\x66\x65\x61ture_store_client_version\x18\x05 \x01(\t\x12\x1d\n\x15serialization_version\x18\x06 \x01(\x05\x62\x06proto3')
+
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'feature_store_pb2', globals())
+ _FEATURESPEC = DESCRIPTOR.message_types_by_name['FeatureSpec']
+ FeatureSpec = _reflection.GeneratedProtocolMessageType('FeatureSpec', (_message.Message,), {
+   'DESCRIPTOR': _FEATURESPEC,
+   '__module__': 'feature_spec_pb2'
+   # @@protoc_insertion_point(class_scope:featurestorecommon.FeatureSpec)
+   })
+ if _descriptor._USE_C_DESCRIPTORS == False:
+
+   DESCRIPTOR._options = None
+   _SOURCEDATACOLUMNINFO._serialized_start=52
+   _SOURCEDATACOLUMNINFO._serialized_end=88
+   _FEATURECOLUMNINFO._serialized_start=91
+   _FEATURECOLUMNINFO._serialized_end=223
+   _ONDEMANDCOLUMNINFO._serialized_start=225
+   _ONDEMANDCOLUMNINFO._serialized_end=351
+   _INPUTBINDING._serialized_start=353
+   _INPUTBINDING._serialized_end=404
+   _COLUMNINFO._serialized_start=407
+   _COLUMNINFO._serialized_end=787
+   _FEATURETABLEINFO._serialized_start=789
+   _FEATURETABLEINFO._serialized_end=870
+   _FUNCTIONPARAMETERINFO._serialized_start=872
+   _FUNCTIONPARAMETERINFO._serialized_end=923
+   _FUNCTIONINFO._serialized_start=926
+   _FUNCTIONINFO._serialized_end=1088
+   _FEATURESPEC._serialized_start=1091
+   _FEATURESPEC._serialized_end=1396
+ # @@protoc_insertion_point(module_scope)
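For orientation, a minimal usage sketch of the generated messages, assuming the module imports cleanly under the installed protobuf runtime. The message and field names come from the serialized descriptor above; the values are placeholders.

# A minimal sketch, assuming wedata.feature_store.common.protos.feature_store_pb2 is importable.
from wedata.feature_store.common.protos import feature_store_pb2

# Build a FeatureColumnInfo message and round-trip it through the wire format.
col = feature_store_pb2.FeatureColumnInfo(
    table_name="db.customer_features",   # placeholder table name
    feature_name="total_purchases_30d",  # placeholder feature name
    lookup_key=["customer_id"],
    output_name="total_purchases_30d",
)
payload = col.SerializeToString()

parsed = feature_store_pb2.FeatureColumnInfo()
parsed.ParseFromString(payload)
assert parsed.table_name == "db.customer_features"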
File without changes
@@ -0,0 +1,48 @@
+ # -*- coding: utf-8 -*-
+
+ __doc__ = """
+ Redis store configuration for the feature store.
+ """
+
+
+ class RedisStoreConfig:
+     def __init__(self, host='localhost', port=6379, db=0, password=None, instance_id=None):
+         self._host = host
+         self._port = port
+         self._db = db
+         self._password = password
+         self._instance_id = instance_id
+
+     @property
+     def host(self):
+         return self._host
+
+     @property
+     def port(self):
+         return self._port
+
+     @property
+     def db(self):
+         return self._db
+
+     @property
+     def password(self):
+         return self._password
+
+     @property
+     def instance_id(self):
+         return self._instance_id
+
+     @property
+     def connection_string(self):
+         if self.password:
+             connection_string = f"{self.host}:{self.port},db={self.db},password={self.password}"
+         else:
+             connection_string = f"{self.host}:{self.port},db={self.db}"
+         return connection_string
+
+     def __repr__(self):
+         return f"RedisStoreConfig(host={self.host}, port={self.port}, db={self.db}, instance_id={self.instance_id})"
+
+     def __str__(self):
+         return self.__repr__()
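A brief usage sketch of RedisStoreConfig as defined above; the host, port, and password values are placeholders.

from wedata.feature_store.common.store_config.redis import RedisStoreConfig

# The password is folded into the connection string only when it is set.
config = RedisStoreConfig(host="10.0.0.5", port=6379, db=1, password="secret")
print(config.connection_string)  # -> "10.0.0.5:6379,db=1,password=secret"
print(config)                    # repr deliberately omits the password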
File without changes
@@ -0,0 +1,59 @@
+
+ OVERWRITE = "overwrite"
+ APPEND = "append"
+ PATH = "path"
+ TABLE = "table"
+ CUSTOM = "custom"
+ PREDICTION_COLUMN_NAME = "prediction"
+ MODEL_DATA_PATH_ROOT = "feature_store"
+ RAW_MODEL_FOLDER = "raw_model"
+ UTF8_BYTES_PER_CHAR = 4
+ MAX_PRIMARY_KEY_STRING_LENGTH_CHARS = 100
+ MAX_PRIMARY_KEY_STRING_LENGTH_BYTES = (
+     MAX_PRIMARY_KEY_STRING_LENGTH_CHARS * UTF8_BYTES_PER_CHAR
+ )
+ STREAMING_TRIGGER_CONTINUOUS = "continuous"
+ STREAMING_TRIGGER_ONCE = "once"
+ STREAMING_TRIGGER_PROCESSING_TIME = "processingTime"
+ DEFAULT_WRITE_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 seconds"}
+ _DEFAULT_PUBLISH_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 minutes"}
+ FEATURE_STORE_CLIENT = "FeatureStoreClient"
+
+
+ _WARN = "WARN"
+ _ERROR = "ERROR"
+ _SOURCE_FORMAT_DELTA = "delta"
+
+ _NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
+ _USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
+ _PREBUILT_ENV_URI = "prebuilt_env_uri"
+
+ # MLflow model constants (formerly mlflow_model_constants.py)
+ # Module name of the original mlflow_model
+ MLFLOW_MODEL_NAME = "wedata.feature_store.mlflow_model"
+
+ # FeatureStoreClient.log_model logs models that contain a 'raw_model' folder,
+ # which stores the original model's MLmodel file used for inference.
+ RAW_MODEL_FOLDER = "raw_model"
+
+ # MLmodel file name constant
+ ML_MODEL = "MLmodel"
+
+ # PyPI package name of the feature lookup client
+ FEATURE_LOOKUP_CLIENT_PIP_PACKAGE = "tencent-wedata-feature-engineering"
+
+ # Feature lookup client major version
+ FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = "0.1.0"
+
+ # Internal data directory of the feature store
+ FEATURE_STORE_INTERNAL_DATA_DIR = "_wedata_internal/"
+ WEDATA_DEFAULT_FEATURE_STORE_DATABASE = "WEDATA_DEFAULT_FEATURE_STORE_DATABASE"
+
+ # Feature table properties
+ FEATURE_TABLE_KEY = "wedata.feature_table"
+ FEATURE_TABLE_VALUE = "true"
+
+ FEATURE_TABLE_PROJECT = "wedata.feature_project_id"
+ FEATURE_TABLE_TIMESTAMP = "timestampKeys"
+ FEATURE_TABLE_BACKUP_PRIMARY_KEY = "primaryKeys"  # fallback identifier for the primary keys
+ FEATURE_DLC_TABLE_PRIMARY_KEY = "dlc.ao.data.govern.sorted.keys"
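A sketch of how a couple of these constants could be consumed; the actual call sites live in other modules of this package, so this only illustrates the values defined here.

from wedata.feature_store.constants.constants import (
    DEFAULT_WRITE_STREAM_TRIGGER,
    MAX_PRIMARY_KEY_STRING_LENGTH_BYTES,
)

# 100 characters * 4 bytes per UTF-8 character = 400 bytes
assert MAX_PRIMARY_KEY_STRING_LENGTH_BYTES == 400

# The trigger dict unpacks directly into Spark's DataStreamWriter.trigger(), e.g.:
# df.writeStream.trigger(**DEFAULT_WRITE_STREAM_TRIGGER)  # == trigger(processingTime="5 seconds")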
@@ -0,0 +1,34 @@
+ from enum import Enum
+ import os
+
+
+ class EngineTypes(Enum):
+     HIVE_ENGINE = "hive"
+     ICEBERG_ENGINE = "iceberg"
+
+     @classmethod
+     def get_engine(cls, engine_name: str) -> 'EngineTypes':
+         try:
+             return cls(engine_name.lower())
+         except ValueError:
+             raise ValueError(f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}")
+
+
+ class CalculateEngineTypes(Enum):
+     DLC = "dlc"
+     EMR = "emr"
+
+     @classmethod
+     def get_calculate_engine(cls, engine_name: str) -> 'CalculateEngineTypes':
+         try:
+             return cls(engine_name.lower())
+         except ValueError:
+             raise ValueError(f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}")
+
+
+ def judge_engine_type() -> 'CalculateEngineTypes':
+     if os.environ.get("DLC_REGION", ""):
+         return CalculateEngineTypes.DLC
+     else:
+         return CalculateEngineTypes.EMR
+
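A short sketch of the engine helpers above; the region value is a placeholder, and only the DLC_REGION check made by judge_engine_type is exercised.

import os

from wedata.feature_store.constants.engine_types import (
    CalculateEngineTypes,
    EngineTypes,
    judge_engine_type,
)

# Engine names are matched case-insensitively against the enum values.
assert EngineTypes.get_engine("ICEBERG") is EngineTypes.ICEBERG_ENGINE

# judge_engine_type() keys off the DLC_REGION environment variable.
os.environ["DLC_REGION"] = "ap-guangzhou"  # placeholder region value
assert judge_engine_type() is CalculateEngineTypes.DLC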
File without changes
@@ -0,0 +1,138 @@
+ import copy
+ from typing import Optional, Union
+
+ from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+ from wedata.feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
+     ON_DEMAND_COLUMN_INFO
+ from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+ from wedata.feature_store.common.protos import feature_store_pb2
+
+ class ColumnInfo:
+     """
+     ColumnInfo's structure and properties are mapped 1:1 to the ColumnInfo proto message, unless specified otherwise.
+     """
+
+     def __init__(
+         self,
+         info: Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo],
+         include: bool,
+         data_type: Optional[str] = None,
+         topological_ordering: Optional[int] = None,
+     ):
+         if not isinstance(
+             info, (SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo)
+         ):
+             raise ValueError(
+                 "info must be one of SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo."
+             )
+         self._info = info
+         self._include = include
+         self._data_type = data_type
+         self._topological_ordering = topological_ordering
+
+     @property
+     def info(
+         self,
+     ) -> Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo]:
+         return self._info
+
+     @property
+     def include(self) -> bool:
+         return self._include
+
+     @property
+     def data_type(self) -> Optional[str]:
+         """
+         FeatureSpecs before v7 are not required to have data types.
+         """
+         return self._data_type
+
+     @property
+     def topological_ordering(self) -> Optional[int]:
+         """
+         FeatureSpecs before v8 are not required to have topological ordering.
+         """
+         return self._topological_ordering
+
+     @property
+     def output_name(self) -> str:
+         """
+         This field does not exist in the proto, and is provided for convenience.
+         """
+         return self.info.output_name
+
+     def with_topological_ordering(self, ordering: int):
+         new_column_info = copy.copy(self)
+         new_column_info._topological_ordering = ordering
+         return new_column_info
+
+     @classmethod
+     def from_proto(cls, column_info_proto):
+         if column_info_proto.HasField(SOURCE_DATA_COLUMN_INFO):
+             info = SourceDataColumnInfo.from_proto(
+                 column_info_proto.source_data_column_info
+             )
+         elif column_info_proto.HasField(FEATURE_COLUMN_INFO):
+             info = FeatureColumnInfo.from_proto(column_info_proto.feature_column_info)
+         elif column_info_proto.HasField(ON_DEMAND_COLUMN_INFO):
+             info = OnDemandColumnInfo.from_proto(
+                 column_info_proto.on_demand_column_info
+             )
+         else:
+             raise ValueError("Unsupported info type: " + str(column_info_proto))
+
+         data_type = column_info_proto.data_type or None
+         # data_type = (
+         #     column_info_proto.data_type
+         #     if column_info_proto.HasField("data_type")
+         #     else None
+         # )
+         topological_ordering = column_info_proto.topological_ordering or 0
+         # topological_ordering = (
+         #     column_info_proto.topological_ordering
+         #     if column_info_proto.HasField("topological_ordering")
+         #     else None
+         # )
+
+         return ColumnInfo(
+             info=info,
+             include=column_info_proto.include,
+             data_type=data_type,
+             topological_ordering=topological_ordering,
+         )
+
+     # def to_proto(self):
+     #     column_info = ProtoColumnInfo(
+     #         include=self.include,
+     #         data_type=self.data_type,
+     #         topological_ordering=self.topological_ordering,
+     #     )
+     #     if isinstance(self.info, SourceDataColumnInfo):
+     #         column_info.source_data_column_info.CopyFrom(self.info.to_proto())
+     #     elif isinstance(self.info, FeatureColumnInfo):
+     #         column_info.feature_column_info.CopyFrom(self.info.to_proto())
+     #     elif isinstance(self.info, OnDemandColumnInfo):
+     #         column_info.on_demand_column_info.CopyFrom(self.info.to_proto())
+     #     else:
+     #         raise ValueError("Unsupported info type: " + str(self.info))
+     #
+     #     return column_info
+     def to_proto(self):
+         column_info = feature_store_pb2.ColumnInfo(
+             include=self.include,
+             data_type=self.data_type,
+             topological_ordering=self.topological_ordering,
+         )
+
+         if isinstance(self.info, SourceDataColumnInfo):
+             column_info.source_data_column_info.CopyFrom(self.info.to_proto())
+         elif isinstance(self.info, FeatureColumnInfo):
+             column_info.feature_column_info.CopyFrom(self.info.to_proto())
+         elif isinstance(self.info, OnDemandColumnInfo):
+             column_info.on_demand_column_info.CopyFrom(self.info.to_proto())
+         else:
+             raise ValueError("Unsupported info type: " + str(self.info))
+
+         return column_info
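A sketch of a ColumnInfo round trip through the proto message defined earlier in this diff, assuming the field-name constants in feature_spec_constants match the proto field names; table and feature names are placeholders.

from wedata.feature_store.entities.column_info import ColumnInfo
from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo

info = FeatureColumnInfo(
    table_name="db.customer_features",   # placeholder
    feature_name="total_purchases_30d",  # placeholder
    lookup_key=["customer_id"],
    output_name="total_purchases_30d",
)
col = ColumnInfo(info=info, include=True, data_type="double", topological_ordering=1)

# Serialize to the ColumnInfo proto and rebuild the entity from it.
proto = col.to_proto()
restored = ColumnInfo.from_proto(proto)
assert isinstance(restored.info, FeatureColumnInfo)
assert restored.output_name == "total_purchases_30d"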
@@ -0,0 +1,55 @@
+ import os
+
+
+ class _EnvironmentVariable:
+     """
+     Represents an environment variable for the feature store client for custom configurations as needed.
+     """
+
+     def __init__(self, name, type_, default):
+         self.name = name
+         self.type = type_
+         self.default = default
+
+     @property
+     def defined(self):
+         return self.name in os.environ
+
+     def get_raw(self):
+         return os.getenv(self.name)
+
+     def set(self, value):
+         os.environ[self.name] = str(value)
+
+     def unset(self):
+         os.environ.pop(self.name, None)
+
+     def get(self):
+         """
+         Reads the value of the environment variable if it exists and converts it to the desired
+         type. Otherwise, returns the default value.
+         """
+         if (val := self.get_raw()) is not None:
+             try:
+                 return self.type(val)
+             except Exception as e:
+                 raise ValueError(
+                     f"Failed to convert {val!r} to {self.type} for {self.name}: {e}"
+                 )
+         return self.default
+
+     def __str__(self):
+         return f"{self.name} (default: {self.default}, type: {self.type.__name__})"
+
+     def __repr__(self):
+         return repr(self.name)
+
+     def __format__(self, format_spec: str) -> str:
+         return self.name.__format__(format_spec)
+
+
+ # The threshold (in bytes) at which a broadcast join is used for the as-of join in point-in-time feature joins.
+ # The default is 20MB, as benchmarks show diminishing returns past this size; Spark's default broadcast join threshold is 10MB.
+ BROADCAST_JOIN_THRESHOLD = _EnvironmentVariable(
+     "BROADCAST_JOIN_THRESHOLD", int, 20 * 1024 * 1024
+ )
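A usage sketch of the BROADCAST_JOIN_THRESHOLD variable defined above; the override value is arbitrary.

import os

from wedata.feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD

# Falls back to the 20MB default when the variable is unset.
assert BROADCAST_JOIN_THRESHOLD.get() == 20 * 1024 * 1024

# Values read from the environment are coerced to the declared type (int here).
os.environ["BROADCAST_JOIN_THRESHOLD"] = str(50 * 1024 * 1024)
assert BROADCAST_JOIN_THRESHOLD.get() == 50 * 1024 * 1024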
@@ -0,0 +1,53 @@
+
+
+ class Feature:
+     def __init__(
+         self,
+         feature_table,
+         feature_id,
+         name,
+         data_type,
+         description,
+         data_type_details=None,
+     ):
+         self._feature_table = feature_table
+         self._name = name
+         self._data_type = data_type
+         self._description = description
+         self._data_type_details = data_type_details
+         self._feature_id = feature_id
+
+     @property
+     def feature_table(self):
+         return self._feature_table
+
+     @property
+     def feature_id(self):
+         return self._feature_id
+
+     @property
+     def name(self):
+         return self._name
+
+     @property
+     def data_type(self):
+         return self._data_type
+
+     @property
+     def data_type_details(self):
+         return self._data_type_details
+
+     @property
+     def description(self):
+         return self._description
+
+     @classmethod
+     def from_proto(cls, feature_proto):
+         return cls(
+             feature_table=feature_proto.table,
+             feature_id=feature_proto.id,
+             name=feature_proto.name,
+             data_type=feature_proto.data_type,
+             data_type_details=feature_proto.data_type_details,
+             description=feature_proto.description,
+         )
@@ -0,0 +1,72 @@
+ from typing import List, Optional
+
+ from wedata.feature_store.common.protos import feature_store_pb2
+
+ class FeatureColumnInfo:
+     def __init__(
+         self,
+         table_name: str,
+         feature_name: str,
+         lookup_key: List[str],
+         output_name: str,
+         timestamp_lookup_key: Optional[List[str]] = None,
+     ):
+         if timestamp_lookup_key is None:
+             timestamp_lookup_key = []
+         if not table_name:
+             raise ValueError("table_name must be non-empty.")
+         if not feature_name:
+             raise ValueError("feature_name must be non-empty.")
+         if not isinstance(lookup_key, list):
+             raise ValueError("lookup_key must be a list.")
+         if not lookup_key or "" in lookup_key or None in lookup_key:
+             raise ValueError("lookup_key must be non-empty.")
+         if not output_name:
+             raise ValueError("output_name must be non-empty.")
+         if not isinstance(timestamp_lookup_key, list):
+             raise ValueError("timestamp_lookup_key must be a list.")
+
+         self._table_name = table_name
+         self._feature_name = feature_name
+         self._lookup_key = lookup_key
+         self._output_name = output_name
+         self._timestamp_lookup_key = timestamp_lookup_key
+
+     @property
+     def table_name(self):
+         return self._table_name
+
+     @property
+     def lookup_key(self):
+         return self._lookup_key
+
+     @property
+     def feature_name(self):
+         return self._feature_name
+
+     @property
+     def output_name(self):
+         return self._output_name
+
+     @property
+     def timestamp_lookup_key(self):
+         return self._timestamp_lookup_key
+
+     @classmethod
+     def from_proto(cls, feature_column_info_proto):
+         return cls(
+             table_name=feature_column_info_proto.table_name,
+             feature_name=feature_column_info_proto.feature_name,
+             lookup_key=list(feature_column_info_proto.lookup_key),
+             output_name=feature_column_info_proto.output_name,
+             timestamp_lookup_key=list(feature_column_info_proto.timestamp_lookup_key),
+         )
+
+     def to_proto(self):
+         return feature_store_pb2.FeatureColumnInfo(
+             table_name=self.table_name,
+             feature_name=self.feature_name,
+             lookup_key=self.lookup_key,
+             output_name=self.output_name,
+             timestamp_lookup_key=self.timestamp_lookup_key,
+         )
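A small sketch of the eager validation performed by the constructor above; the names are placeholders.

from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo

# Empty lookup keys are rejected at construction time.
try:
    FeatureColumnInfo(
        table_name="db.customer_features",   # placeholder
        feature_name="total_purchases_30d",  # placeholder
        lookup_key=[],
        output_name="total_purchases_30d",
    )
except ValueError as e:
    print(e)  # lookup_key must be non-empty.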
@@ -0,0 +1,55 @@
+ from typing import Dict, Optional
+
+
+ class FeatureFunction:
+
+     """
+     Feature function.
+
+     A feature function is a user-defined function (for example, a Python UDF) that combines features from feature tables into new features.
+
+     A FeatureFunction has the following attributes:
+     - udf_name: the name of the Python UDF to call.
+     - input_bindings: a dictionary that maps the inputs of the Python UDF to features in the training set.
+     - output_name: if provided, the feature is renamed to this name in the :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+
+     """
+
+     def __init__(
+         self,
+         *,
+         udf_name: str,
+         input_bindings: Optional[Dict[str, str]] = None,
+         output_name: Optional[str] = None,
+     ):
+         """Initialize a FeatureFunction object. See class documentation."""
+         # UC function names are always lowercase.
+         self._udf_name = udf_name.lower()
+         self._input_bindings = input_bindings if input_bindings else {}
+         self._output_name = output_name
+
+     @property
+     def udf_name(self) -> str:
+         """
+         The name of the Python UDF called by this FeatureFunction.
+         """
+         return self._udf_name
+
+     @property
+     def input_bindings(self) -> Dict[str, str]:
+         """
+         The input to use for each argument of the Python UDF.
+
+         For example:
+
+         `{"x": "feature1", "y": "input1"}`
+         """
+         return self._input_bindings
+
+     @property
+     def output_name(self) -> Optional[str]:
+         """
+         The output name to use for the results of this FeatureFunction.
+         If empty, defaults to the fully qualified `udf_name` when evaluated.
+         """
+         return self._output_name
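A usage sketch of FeatureFunction based on the docstring above; the UDF and column names are hypothetical.

from wedata.feature_store.entities.feature_function import FeatureFunction

# Bind the UDF's parameters to a feature column and a request-time input column.
fn = FeatureFunction(
    udf_name="main.default.compute_ratio",       # hypothetical UDF name
    input_bindings={"x": "total_purchases_30d",  # feature looked up from a feature table
                    "y": "request_amount"},      # column supplied by the caller's DataFrame
    output_name="purchase_ratio",
)
assert fn.udf_name == "main.default.compute_ratio"  # stored lowercased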