wedata-feature-engineering 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wedata/__init__.py +1 -1
- wedata/feature_store/client.py +113 -41
- wedata/feature_store/constants/constants.py +19 -0
- wedata/feature_store/entities/column_info.py +4 -4
- wedata/feature_store/entities/feature_lookup.py +5 -1
- wedata/feature_store/entities/feature_spec.py +46 -46
- wedata/feature_store/entities/feature_table.py +42 -99
- wedata/feature_store/entities/training_set.py +13 -12
- wedata/feature_store/feature_table_client/feature_table_client.py +85 -30
- wedata/feature_store/spark_client/spark_client.py +30 -56
- wedata/feature_store/training_set_client/training_set_client.py +209 -38
- wedata/feature_store/utils/common_utils.py +213 -3
- wedata/feature_store/utils/feature_lookup_utils.py +6 -6
- wedata/feature_store/utils/feature_spec_utils.py +6 -6
- wedata/feature_store/utils/feature_utils.py +5 -5
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- wedata/feature_store/utils/schema_utils.py +1 -1
- wedata/feature_store/utils/signature_utils.py +205 -0
- wedata/feature_store/utils/training_set_utils.py +18 -19
- wedata/feature_store/utils/uc_utils.py +1 -1
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
- feature_store/__init__.py +0 -6
- feature_store/client.py +0 -169
- feature_store/constants/__init__.py +0 -0
- feature_store/constants/constants.py +0 -28
- feature_store/entities/__init__.py +0 -0
- feature_store/entities/column_info.py +0 -117
- feature_store/entities/data_type.py +0 -92
- feature_store/entities/environment_variables.py +0 -55
- feature_store/entities/feature.py +0 -53
- feature_store/entities/feature_column_info.py +0 -64
- feature_store/entities/feature_function.py +0 -55
- feature_store/entities/feature_lookup.py +0 -179
- feature_store/entities/feature_spec.py +0 -454
- feature_store/entities/feature_spec_constants.py +0 -25
- feature_store/entities/feature_table.py +0 -164
- feature_store/entities/feature_table_info.py +0 -40
- feature_store/entities/function_info.py +0 -184
- feature_store/entities/on_demand_column_info.py +0 -44
- feature_store/entities/source_data_column_info.py +0 -21
- feature_store/entities/training_set.py +0 -134
- feature_store/feature_table_client/__init__.py +0 -0
- feature_store/feature_table_client/feature_table_client.py +0 -313
- feature_store/spark_client/__init__.py +0 -0
- feature_store/spark_client/spark_client.py +0 -286
- feature_store/training_set_client/__init__.py +0 -0
- feature_store/training_set_client/training_set_client.py +0 -196
- feature_store/utils/__init__.py +0 -0
- feature_store/utils/common_utils.py +0 -96
- feature_store/utils/feature_lookup_utils.py +0 -570
- feature_store/utils/feature_spec_utils.py +0 -286
- feature_store/utils/feature_utils.py +0 -73
- feature_store/utils/schema_utils.py +0 -117
- feature_store/utils/topological_sort.py +0 -158
- feature_store/utils/training_set_utils.py +0 -580
- feature_store/utils/uc_utils.py +0 -281
- feature_store/utils/utils.py +0 -252
- feature_store/utils/validation_utils.py +0 -55
- wedata/feature_store/utils/utils.py +0 -252
- wedata_feature_engineering-0.1.5.dist-info/RECORD +0 -79
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/top_level.txt +0 -0
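Functionally, 0.1.6 is a namespace consolidation: the duplicate top-level `feature_store` package shipped in 0.1.5 is deleted, everything now lives under `wedata.feature_store`, and two modules (`on_demand_utils.py`, `signature_utils.py`) are new. For downstream code the visible change is one extra path segment on every import, e.g.:

```python
# 0.1.5 also shipped a duplicate top-level package (deleted in 0.1.6):
# from feature_store.entities.feature_lookup import FeatureLookup

# 0.1.6 keeps only the namespaced package:
from wedata.feature_store.entities.feature_lookup import FeatureLookup
```

The hunks below show this rename inside `wedata/feature_store/utils/training_set_utils.py` and `wedata/feature_store/utils/uc_utils.py`, followed by the new wheel RECORD and the deleted duplicate modules.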
wedata/feature_store/utils/training_set_utils.py

```diff
@@ -4,21 +4,19 @@ from typing import Dict, List, Optional, Set
 
 from pyspark.sql import DataFrame
 
-from feature_store.entities.column_info import ColumnInfo
-from feature_store.entities.feature import Feature
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.entities.feature_table import FeatureTable
-from feature_store.entities.feature_table_info import FeatureTableInfo
-from feature_store.entities.function_info import FunctionInfo
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-
-from feature_store.
-
-from feature_store.utils import common_utils, validation_utils, uc_utils, schema_utils, utils
-from feature_store.utils.feature_spec_utils import assign_topological_ordering
+from wedata.feature_store.entities.column_info import ColumnInfo
+from wedata.feature_store.entities.feature import Feature
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.feature_table_info import FeatureTableInfo
+from wedata.feature_store.entities.function_info import FunctionInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+from wedata.feature_store.utils import common_utils, validation_utils, uc_utils, schema_utils
+from wedata.feature_store.utils.feature_spec_utils import assign_topological_ordering
 
 _logger = logging.getLogger(__name__)
 
```
```diff
@@ -99,9 +97,9 @@ def _explode_feature_lookup(
         FeatureColumnInfo(
             table_name=feature_lookup.table_name,
             feature_name=feature_name,
-            lookup_key=
+            lookup_key=common_utils.as_list(feature_lookup.lookup_key),
             output_name=(feature_lookup._get_output_name(feature_name)),
-            timestamp_lookup_key=
+            timestamp_lookup_key=common_utils.as_list(
                 feature_lookup.timestamp_lookup_key, default=[]
             ),
         )
```
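Both replaced call sites now normalize scalar-or-list arguments through `common_utils.as_list`. Its body is not shown in this diff; a minimal sketch consistent with the call sites, assuming scalar-wrapping semantics with an optional default for `None`:

```python
from typing import Any, List, Optional

def as_list(obj: Any, default: Optional[List] = None) -> Optional[List]:
    """Hypothetical sketch: normalize a scalar-or-list argument to a list.

    Semantics inferred from the call sites above; the real
    wedata.feature_store.utils.common_utils.as_list may differ.
    """
    if obj is None:
        return default
    if isinstance(obj, list):
        return obj
    return [obj]

# Call-site behavior implied by the hunk:
assert as_list("user_id") == ["user_id"]
assert as_list(["user_id", "ts"]) == ["user_id", "ts"]
assert as_list(None, default=[]) == []
```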
```diff
@@ -280,13 +278,14 @@ def get_table_metadata(
     feature_table_metadata_map = get_feature_table_metadata_for_tables(
         spark_client, table_names=table_names
     )
+
     feature_table_data_map = load_feature_data_for_tables(
         spark_client, table_names=table_names
     )
     return _FeatureTableMetadata(
         feature_table_features_map,
         feature_table_metadata_map,
-        feature_table_data_map
+        feature_table_data_map
     )
 
 
```
```diff
@@ -515,7 +514,7 @@ def build_feature_spec(
         for table_name in consumed_table_names
     ]
     function_infos = [
-        FunctionInfo(
+        FunctionInfo(full_name=udf_name) for udf_name in consumed_udf_names
     ]
 
     # Build FeatureSpec
```
wedata/feature_store/utils/uc_utils.py

```diff
@@ -2,7 +2,7 @@ import copy
 import re
 from typing import Optional, Set
 
-from feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.feature_spec import FeatureSpec
 
 SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
 TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"
```
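The two regexes constrain table identifiers: a single name component containing no dots, spaces, slashes, or control characters, or exactly two such components joined by a dot. For instance:

```python
import re

# Patterns copied verbatim from the uc_utils.py hunk above.
SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"

assert re.match(SINGLE_LEVEL_NAMESPACE_REGEX, "my_table")
assert not re.match(SINGLE_LEVEL_NAMESPACE_REGEX, "db.my_table")
assert re.match(TWO_LEVEL_NAMESPACE_REGEX, "db.my_table")
assert not re.match(TWO_LEVEL_NAMESPACE_REGEX, "catalog.db.my_table")
```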
wedata_feature_engineering-0.1.6.dist-info/RECORD

```diff
@@ -0,0 +1,43 @@
+wedata/__init__.py,sha256=26GwucASB9KsmU109sN-VKotEKp1WZYQDGP0wgWZrzY,101
+wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/client.py,sha256=7a-9C8HIBHnQNQD6I4W3UtBQwkJE8G-Q7N24zydjpkY,8100
+wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/constants/constants.py,sha256=b4tgcSt66YIq0Fg7pMbqvbqPOI77Cz8znLVZ4ihUKss,1479
+wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/entities/column_info.py,sha256=sU_WD9U0qse0M2speowLY30qSE6j4_57PuvtdPkwiFY,4192
+wedata/feature_store/entities/data_type.py,sha256=VpHS6Fr3TphQQ8NbAcEnDJ-8eOZV6ivYuWxv3pAM2RM,3394
+wedata/feature_store/entities/environment_variables.py,sha256=ZEFml5H9MQuzBKM074mUrFYu-Sga4Knmxqiwpke2WGc,1679
+wedata/feature_store/entities/feature.py,sha256=wX8fTBlJq3GYdj9rrBDCY3kFgcVBBAiOOZdxEhnQkNQ,1241
+wedata/feature_store/entities/feature_column_info.py,sha256=-TGxRafYUaNKe0YzHus2XbfRaVrMv7pcffMdbtTT4nA,2031
+wedata/feature_store/entities/feature_function.py,sha256=R17INrCE-U_Uj9KLbFz69aYlOkTETTwQHMMo470F4lQ,1865
+wedata/feature_store/entities/feature_lookup.py,sha256=YjYz8kLq42doFbgPzpmm1r3GPhPYkLsIss4H71x-KAo,8009
+wedata/feature_store/entities/feature_spec.py,sha256=60RUOOe9y_Xsd1I3xqq4NZYnaox4_jjwSyGRTKXLiIw,20041
+wedata/feature_store/entities/feature_spec_constants.py,sha256=YWDBfRiNDe6fUJFUBo3V4WYg2xsljoPAE-ZejfFZCgM,785
+wedata/feature_store/entities/feature_table.py,sha256=dHZHSDPD4HJ2XanLVIrVTkaCYUeqZ6eWEpA0d3YO71g,4010
+wedata/feature_store/entities/feature_table_info.py,sha256=2vUaVdW_jw1dRAlmJWvBRueuMeuqWu_NYB9SlxLI7Uw,1126
+wedata/feature_store/entities/function_info.py,sha256=l0kmiq2R_QNfSMJ7y0xZohlMiemgYSr1dN5vzV8ijIs,7314
+wedata/feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipBZzH2ZyY0bwkLrDOkuZjgYr4gY,1297
+wedata/feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
+wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
+wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/feature_table_client/feature_table_client.py,sha256=nrnY3FLQnMhW1BzByDjjfU89hirgaKlg2l2tAfcjvyM,12138
+wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/spark_client/spark_client.py,sha256=DBCYjLsFrIVRvLErTNyfLIHRul3v0y9uZIY2JR1N92s,10323
+wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/training_set_client/training_set_client.py,sha256=gHeZU0rvvUcyNTfroXD3LAinFPdhDpnwTOIWj6z84Tc,15102
+wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/utils/common_utils.py,sha256=rmGXfBoEyDMlfqd7lxpEmKJaLoQ-d-ufWpAcE8nSHqA,10009
+wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
+wedata/feature_store/utils/feature_spec_utils.py,sha256=GFwc-WT6nN1tnal5F2c0bgVDRhH-YW58S0GaHBPZEsQ,11624
+wedata/feature_store/utils/feature_utils.py,sha256=KKq28bVB_lCuhnR9Hk6JegJBOVgcelWlvrRM-F9onkA,2796
+wedata/feature_store/utils/on_demand_utils.py,sha256=pazZRG5c0Se08MV_inBddIeX4Q9xlVN_H9SC_WK3xzs,4260
+wedata/feature_store/utils/schema_utils.py,sha256=y6EYY1pUxjVg6MP4C7avdW8ZEBBaDo1YTV2CmPF4i8o,4491
+wedata/feature_store/utils/signature_utils.py,sha256=_4_mo1Qlzklp-JrISMS3Jv89MPbaH6rz_cRDvJqFNXM,7957
+wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZsa-DjxJeTc4KUadtg,6475
+wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
+wedata/feature_store/utils/uc_utils.py,sha256=A-W8Cd8yvTmAMEWaHeWmGmcIDMvUtjAfx2G2x_di1QE,10774
+wedata/feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
+wedata_feature_engineering-0.1.6.dist-info/METADATA,sha256=orxNq_A9F8FcSWYn6wTY1pQ2KtqNVIREvGziUnNa1ys,493
+wedata_feature_engineering-0.1.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+wedata_feature_engineering-0.1.6.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
+wedata_feature_engineering-0.1.6.dist-info/RECORD,,
```
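`RECORD` is the standard wheel manifest: one CSV row per installed file with its path, urlsafe-base64 `sha256` digest, and size in bytes (`RECORD` itself is listed with empty digest and size, since it cannot hash itself). A minimal reader sketch over two rows from the listing above:

```python
import csv
import io

record = """wedata/__init__.py,sha256=26GwucASB9KsmU109sN-VKotEKp1WZYQDGP0wgWZrzY,101
wedata_feature_engineering-0.1.6.dist-info/RECORD,,"""

for path, digest, size in csv.reader(io.StringIO(record)):
    print(path, digest or "<none>", size or "<none>")
```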
feature_store/__init__.py DELETED
feature_store/client.py DELETED
```diff
@@ -1,169 +0,0 @@
-"""
-Python implementation of the Wedata FeatureStoreClient
-"""
-
-from __future__ import annotations
-from typing import Union, List, Dict, Optional, Any
-from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.streaming import StreamingQuery
-from pyspark.sql.types import StructType
-
-from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
-from feature_store.entities.feature_function import FeatureFunction
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.training_set import TrainingSet
-from feature_store.feature_table_client.feature_table_client import FeatureTableClient
-from feature_store.spark_client.spark_client import SparkClient
-from feature_store.training_set_client.training_set_client import TrainingSetClient
-from feature_store.utils.feature_utils import format_feature_lookups_and_functions
-
-
-class FeatureStoreClient:
-    """Unified feature store client providing full feature lifecycle management."""
-
-    def __init__(self, spark: SparkSession):
-        """
-        :param spark: an initialized SparkSession
-        """
-        self._spark = spark
-        self._spark_client = SparkClient(spark)
-        self._feature_table_client = FeatureTableClient(spark)
-
-    def create_table(
-        self,
-        name: str,
-        primary_keys: Union[str, List[str]],
-        df: Optional[DataFrame] = None,
-        *,
-        timestamp_keys: Union[str, List[str], None] = None,
-        partition_columns: Union[str, List[str], None] = None,
-        schema: Optional[StructType] = None,
-        description: Optional[str] = None,
-        tags: Optional[Dict[str, str]] = None
-    ):
-        """
-        Create a feature table (supports batch and streaming writes).
-
-        Args:
-            name: full feature table name (format: <table>)
-            primary_keys: primary key column(s); composite keys supported
-            df: initial data (optional, used to infer the schema)
-            timestamp_keys: timestamp key(s), for temporal features
-            partition_columns: partition column(s), to optimize storage and queries
-            description: business description
-            tags: business tags
-
-        Returns:
-            A FeatureTable instance.
-
-        Raises:
-            ValueError: if the schema does not match the data.
-        """
-
-        return self._feature_table_client.create_table(
-            name=name,
-            primary_keys=primary_keys,
-            df=df,
-            timestamp_keys=timestamp_keys,
-            partition_columns=partition_columns,
-            schema=schema,
-            description=description,
-            tags=tags
-        )
-
-
-    def read_table(self, name: str) -> DataFrame:
-        """
-        Read data from a feature table.
-
-        Args:
-            name: feature table name
-
-        Returns:
-            DataFrame: a DataFrame containing the feature table data.
-        """
-        return self._feature_table_client.read_table(name)
-
-
-    def drop_table(self, name: str) -> None:
-        """
-        Drop a feature table.
-
-        Args:
-            name: name of the feature table to drop
-
-        Returns:
-            None
-        """
-        return self._feature_table_client.drop_table(name)
-
-
-    def create_training_set(
-        self,
-        df: DataFrame,
-        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
-        label: Union[str, List[str], None],
-        exclude_columns: Optional[List[str]] = None,
-        **kwargs,
-    ) -> TrainingSet:
-
-        """
-        Create a training set.
-
-        Args:
-            df: base data
-            feature_lookups: list of feature lookups
-            label: label column name(s)
-            exclude_columns: columns to exclude
-
-        Returns:
-            A TrainingSet instance.
-        """
-
-        if exclude_columns is None:
-            exclude_columns = []
-
-        features = feature_lookups
-        del feature_lookups
-
-        features = format_feature_lookups_and_functions(self._spark_client, features)
-        # Create a TrainingSetClient instance
-        training_set_client = TrainingSetClient(self._spark_client)
-        return training_set_client.create_training_set_from_feature_lookups(
-            df=df,
-            feature_lookups=features,
-            label=label,
-            exclude_columns=exclude_columns,
-            **kwargs
-        )
-
-    def write_table(
-        self,
-        name: str,
-        df: DataFrame,
-        mode: str = APPEND,
-        checkpoint_location: Optional[str] = None,
-        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
-    ) -> Optional[StreamingQuery]:
-
-        """
-        Write data to a feature table (supports batch and streaming).
-
-        Args:
-            name: feature table name
-            df: the DataFrame to write
-            mode: write mode (defaults to append)
-            checkpoint_location: checkpoint location for streaming (optional)
-            trigger: streaming trigger configuration (defaults to the system preset)
-
-        Returns:
-            A StreamingQuery for streaming writes, otherwise None.
-        """
-
-        return self._feature_table_client.write_table(
-            name=name,
-            df=df,
-            mode=mode,
-            checkpoint_location=checkpoint_location,
-            trigger=trigger,
-        )
```
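With the duplicate package gone, any 0.1.5 code that imported the top-level client must move to the `wedata` namespace. A hedged migration sketch, using the method names of the deleted 0.1.5 client shown above; note the surviving `wedata/feature_store/client.py` changed by +113/-41 in this release, so signatures should be verified against the new source:

```python
from pyspark.sql import SparkSession

# 0.1.5 (deleted duplicate):
#   from feature_store.client import FeatureStoreClient
# 0.1.6 (path per the file list at the top of this diff):
from wedata.feature_store.client import FeatureStoreClient

spark = SparkSession.builder.getOrCreate()
client = FeatureStoreClient(spark)

# "user_features" and its columns are hypothetical.
client.create_table(
    name="user_features",
    primary_keys="user_id",
    df=spark.createDataFrame([(1, 0.5)], ["user_id", "score"]),
)
features_df = client.read_table("user_features")
```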
feature_store/constants/__init__.py DELETED (empty file, no content diff)
feature_store/constants/constants.py DELETED

```diff
@@ -1,28 +0,0 @@
-
-OVERWRITE = "overwrite"
-APPEND = "append"
-PATH = "path"
-TABLE = "table"
-CUSTOM = "custom"
-PREDICTION_COLUMN_NAME = "prediction"
-MODEL_DATA_PATH_ROOT = "feature_store"
-UTF8_BYTES_PER_CHAR = 4
-MAX_PRIMARY_KEY_STRING_LENGTH_CHARS = 100
-MAX_PRIMARY_KEY_STRING_LENGTH_BYTES = (
-    MAX_PRIMARY_KEY_STRING_LENGTH_CHARS * UTF8_BYTES_PER_CHAR
-)
-STREAMING_TRIGGER_CONTINUOUS = "continuous"
-STREAMING_TRIGGER_ONCE = "once"
-STREAMING_TRIGGER_PROCESSING_TIME = "processingTime"
-DEFAULT_WRITE_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 seconds"}
-_DEFAULT_PUBLISH_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 minutes"}
-
-
-_WARN = "WARN"
-_ERROR = "ERROR"
-_SOURCE_FORMAT_DELTA = "delta"
-
-_NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
-_USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
-_PREBUILT_ENV_URI = "prebuilt_env_uri"
-
```
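The trigger dictionaries above are keyword arguments for Spark's `DataStreamWriter.trigger`; `DEFAULT_WRITE_STREAM_TRIGGER` unpacks to `trigger(processingTime="5 seconds")`. A sketch of that wiring (these constants still ship in 0.1.6 under `wedata.feature_store.constants.constants`; the sink and checkpoint path here are hypothetical):

```python
from pyspark.sql import SparkSession

# Constants as defined in the deleted module above.
STREAMING_TRIGGER_PROCESSING_TIME = "processingTime"
DEFAULT_WRITE_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 seconds"}

spark = SparkSession.builder.getOrCreate()
stream_df = spark.readStream.format("rate").load()  # toy streaming source

query = (
    stream_df.writeStream
    .outputMode("append")                        # cf. APPEND above
    .option("checkpointLocation", "/tmp/ckpt/demo")
    .trigger(**DEFAULT_WRITE_STREAM_TRIGGER)     # -> trigger(processingTime="5 seconds")
    .format("console")
    .start()
)
```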
feature_store/entities/__init__.py DELETED (empty file, no content diff)
feature_store/entities/column_info.py DELETED

```diff
@@ -1,117 +0,0 @@
-import copy
-from typing import Optional, Union
-
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
-    ON_DEMAND_COLUMN_INFO
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-
-
-class ColumnInfo:
-    """
-    ColumnInfo's structure and properties are mapped 1:1 to the ColumnInfo proto message, unless specified otherwise.
-    """
-
-    def __init__(
-        self,
-        info: Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo],
-        include: bool,
-        data_type: Optional[str] = None,
-        topological_ordering: Optional[int] = None,
-    ):
-        if not isinstance(
-            info, (SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo)
-        ):
-            raise ValueError(
-                "info must be one of SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo."
-            )
-        self._info = info
-        self._include = include
-        self._data_type = data_type
-        self._topological_ordering = topological_ordering
-
-    @property
-    def info(
-        self,
-    ) -> Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo]:
-        return self._info
-
-    @property
-    def include(self) -> bool:
-        return self._include
-
-    @property
-    def data_type(self) -> Optional[str]:
-        """
-        FeatureSpecs before v7 are not required to have data types.
-        """
-        return self._data_type
-
-    @property
-    def topological_ordering(self) -> Optional[int]:
-        """
-        FeatureSpecs before v8 are not required to have topological ordering.
-        """
-        return self._topological_ordering
-
-    @property
-    def output_name(self) -> str:
-        """
-        This field does not exist in the proto, and is provided for convenience.
-        """
-        return self.info.output_name
-
-    def with_topological_ordering(self, ordering: int):
-        new_column_info = copy.copy(self)
-        new_column_info._topological_ordering = ordering
-        return new_column_info
-
-    @classmethod
-    def from_proto(cls, column_info_proto):
-        if column_info_proto.HasField(SOURCE_DATA_COLUMN_INFO):
-            info = SourceDataColumnInfo.from_proto(
-                column_info_proto.source_data_column_info
-            )
-        elif column_info_proto.HasField(FEATURE_COLUMN_INFO):
-            info = FeatureColumnInfo.from_proto(column_info_proto.feature_column_info)
-        elif column_info_proto.HasField(ON_DEMAND_COLUMN_INFO):
-            info = OnDemandColumnInfo.from_proto(
-                column_info_proto.on_demand_column_info
-            )
-        else:
-            raise ValueError("Unsupported info type: " + str(column_info_proto))
-
-        data_type = (
-            column_info_proto.data_type
-            if column_info_proto.HasField("data_type")
-            else None
-        )
-        topological_ordering = (
-            column_info_proto.topological_ordering
-            if column_info_proto.HasField("topological_ordering")
-            else None
-        )
-        return ColumnInfo(
-            info=info,
-            include=column_info_proto.include,
-            data_type=data_type,
-            topological_ordering=topological_ordering,
-        )
-
-    # def to_proto(self):
-    #     column_info = ProtoColumnInfo(
-    #         include=self.include,
-    #         data_type=self.data_type,
-    #         topological_ordering=self.topological_ordering,
-    #     )
-    #     if isinstance(self.info, SourceDataColumnInfo):
-    #         column_info.source_data_column_info.CopyFrom(self.info.to_proto())
-    #     elif isinstance(self.info, FeatureColumnInfo):
-    #         column_info.feature_column_info.CopyFrom(self.info.to_proto())
-    #     elif isinstance(self.info, OnDemandColumnInfo):
-    #         column_info.on_demand_column_info.CopyFrom(self.info.to_proto())
-    #     else:
-    #         raise ValueError("Unsupported info type: " + str(self.info))
-    #
-    #     return column_info
```
feature_store/entities/data_type.py DELETED

```diff
@@ -1,92 +0,0 @@
-import json
-import re
-from typing import Any
-
-from pyspark.sql.types import ArrayType, DataType, DecimalType, MapType, StructType
-
-
-
-class DataType(_ProtoEnumEntity):
-    """Online store types."""
-
-    INTEGER = ProtoDataType.Value("INTEGER")
-    FLOAT = ProtoDataType.Value("FLOAT")
-    BOOLEAN = ProtoDataType.Value("BOOLEAN")
-    STRING = ProtoDataType.Value("STRING")
-    DOUBLE = ProtoDataType.Value("DOUBLE")
-    LONG = ProtoDataType.Value("LONG")
-    TIMESTAMP = ProtoDataType.Value("TIMESTAMP")
-    DATE = ProtoDataType.Value("DATE")
-    SHORT = ProtoDataType.Value("SHORT")
-    ARRAY = ProtoDataType.Value("ARRAY")
-    MAP = ProtoDataType.Value("MAP")
-    BINARY = ProtoDataType.Value("BINARY")
-    DECIMAL = ProtoDataType.Value("DECIMAL")
-    STRUCT = ProtoDataType.Value("STRUCT")
-
-    _FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
-
-    @classmethod
-    def _enum_type(cls) -> Any:
-        return ProtoDataType
-
-    @classmethod
-    def from_spark_type(cls, spark_type):
-        return cls.from_string(spark_type.typeName())
-
-    @classmethod
-    def spark_type_to_string(cls, spark_type):
-        return DataType.to_string(DataType.from_spark_type(spark_type))
-
-    @classmethod
-    def top_level_type_supported(cls, spark_type: DataType) -> bool:
-        """
-        Checks whether the provided Spark data type is supported by Feature Store, only considering
-        the top-level type for nested data types.
-
-        Details on nested types:
-          ArrayType: The elementType is not checked. Will return True.
-          MapType: The keyType and valueType are not checked. Will return True.
-          StructType: The struct fields are not checked. Will return True.
-        """
-        cls.init()
-        return spark_type.typeName().upper() in cls._STRING_TO_ENUM
-
-    @classmethod
-    def to_complex_spark_type(cls, json_value):
-        """
-        Constructs a complex Spark DataType from its compact JSON representation.
-
-        Examples:
-            - Input: '"decimal(1,2)"'
-              Output: DecimalType(1,2)
-            - Input: '{"containsNull":false,"elementType":"integer","type":"array"}'
-              Output: ArrayType(IntegerType,false)
-            - Input: '{"keyType":"integer","type":"map","valueContainsNull":true,"valueType":"integer"}'
-              Output: MapType(IntegerType,IntegerType,true)
-        """
-        if not json_value:
-            raise ValueError("Empty JSON value cannot be converted to Spark DataType")
-
-        json_data = json.loads(json_value)
-        if not isinstance(json_data, dict):
-            # DecimalType does not have fromJson() method
-            if json_value == "decimal":
-                return DecimalType()
-            if cls._FIXED_DECIMAL.match(json_data):
-                m = cls._FIXED_DECIMAL.match(json_data)
-                return DecimalType(int(m.group(1)), int(m.group(2)))
-
-        if json_data["type"].upper() == cls.to_string(cls.ARRAY):
-            return ArrayType.fromJson(json_data)
-
-        if json_data["type"].upper() == cls.to_string(cls.MAP):
-            return MapType.fromJson(json_data)
-
-        if json_data["type"].upper() == cls.to_string(cls.STRUCT):
-            return StructType.fromJson(json_data)
-
-        else:
-            raise ValueError(
-                f"Spark type {json_data['type']} cannot be converted to a complex Spark DataType"
-            )
```
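The compact JSON forms in the `to_complex_spark_type` docstring are the same representation PySpark's own `fromJson` constructors accept, which is what the deleted method delegated to for array/map/struct types:

```python
from pyspark.sql.types import ArrayType, MapType

# Mirrors the docstring examples above using PySpark's fromJson helpers.
arr = ArrayType.fromJson(
    {"type": "array", "elementType": "integer", "containsNull": False}
)
mp = MapType.fromJson(
    {"type": "map", "keyType": "integer", "valueType": "integer",
     "valueContainsNull": True}
)
print(arr)  # e.g. ArrayType(IntegerType(), False); repr varies by PySpark version
print(mp)   # e.g. MapType(IntegerType(), IntegerType(), True)
```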
feature_store/entities/environment_variables.py DELETED

```diff
@@ -1,55 +0,0 @@
-import os
-
-
-class _EnvironmentVariable:
-    """
-    Represents an environment variable for the feature store client for custom configurations as needed.
-    """
-
-    def __init__(self, name, type_, default):
-        self.name = name
-        self.type = type_
-        self.default = default
-
-    @property
-    def defined(self):
-        return self.name in os.environ
-
-    def get_raw(self):
-        return os.getenv(self.name)
-
-    def set(self, value):
-        os.environ[self.name] = str(value)
-
-    def unset(self):
-        os.environ.pop(self.name, None)
-
-    def get(self):
-        """
-        Reads the value of the environment variable if it exists and converts it to the desired
-        type. Otherwise, returns the default value.
-        """
-        if (val := self.get_raw()) is not None:
-            try:
-                return self.type(val)
-            except Exception as e:
-                raise ValueError(
-                    f"Failed to convert {val!r} to {self.type} for {self.name}: {e}"
-                )
-        return self.default
-
-    def __str__(self):
-        return f"{self.name} (default: {self.default}, type: {self.type.__name__})"
-
-    def __repr__(self):
-        return repr(self.name)
-
-    def __format__(self, format_spec: str) -> str:
-        return self.name.__format__(format_spec)
-
-
-# The threshold (in bytes) at which a broadcast join is used for the as-of join in point-in-time feature joins.
-# Default is 20MB, as benchmarks show diminishing returns past this value; Spark's default broadcast join threshold is 10MB.
-BROADCAST_JOIN_THRESHOLD = _EnvironmentVariable(
-    "BROADCAST_JOIN_THRESHOLD", int, 20 * 1024 * 1024
-)
```
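A minimal sketch of how `BROADCAST_JOIN_THRESHOLD` resolved its value (an environment variable override wins, otherwise the 20 MB default applies), using only the stdlib semantics of the class above:

```python
import os

# Override to 50 MB, as the deleted class's set() would do.
os.environ["BROADCAST_JOIN_THRESHOLD"] = str(50 * 1024 * 1024)

# Equivalent of _EnvironmentVariable.get(): env value converted via int(),
# falling back to the 20 MB default when the variable is unset.
raw = os.getenv("BROADCAST_JOIN_THRESHOLD")
threshold = int(raw) if raw is not None else 20 * 1024 * 1024
print(threshold)  # 52428800
```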
feature_store/entities/feature.py DELETED

```diff
@@ -1,53 +0,0 @@
-
-
-class Feature:
-    def __init__(
-        self,
-        feature_table,
-        feature_id,
-        name,
-        data_type,
-        description,
-        data_type_details=None,
-    ):
-        self._feature_table = feature_table
-        self._name = name
-        self._data_type = data_type
-        self._description = description
-        self._data_type_details = data_type_details
-        self._feature_id = feature_id
-
-    @property
-    def feature_table(self):
-        return self._feature_table
-
-    @property
-    def feature_id(self):
-        return self._feature_id
-
-    @property
-    def name(self):
-        return self._name
-
-    @property
-    def data_type(self):
-        return self._data_type
-
-    @property
-    def data_type_details(self):
-        return self._data_type_details
-
-    @property
-    def description(self):
-        return self._description
-
-    @classmethod
-    def from_proto(cls, feature_proto):
-        return cls(
-            feature_table=feature_proto.table,
-            feature_id=feature_proto.id,
-            name=feature_proto.name,
-            data_type=feature_proto.data_type,
-            data_type_details=feature_proto.data_type_details,
-            description=feature_proto.description,
-        )
```