tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
- wedata/__init__.py +9 -0
- wedata/feature_store/__init__.py +0 -0
- wedata/feature_store/client.py +462 -0
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +86 -0
- wedata/feature_store/cloud_sdk_client/models.py +686 -0
- wedata/feature_store/cloud_sdk_client/utils.py +32 -0
- wedata/feature_store/common/__init__.py +0 -0
- wedata/feature_store/common/protos/__init__.py +0 -0
- wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +48 -0
- wedata/feature_store/constants/__init__.py +0 -0
- wedata/feature_store/constants/constants.py +59 -0
- wedata/feature_store/constants/engine_types.py +34 -0
- wedata/feature_store/entities/__init__.py +0 -0
- wedata/feature_store/entities/column_info.py +138 -0
- wedata/feature_store/entities/environment_variables.py +55 -0
- wedata/feature_store/entities/feature.py +53 -0
- wedata/feature_store/entities/feature_column_info.py +72 -0
- wedata/feature_store/entities/feature_function.py +55 -0
- wedata/feature_store/entities/feature_lookup.py +200 -0
- wedata/feature_store/entities/feature_spec.py +489 -0
- wedata/feature_store/entities/feature_spec_constants.py +25 -0
- wedata/feature_store/entities/feature_table.py +111 -0
- wedata/feature_store/entities/feature_table_info.py +49 -0
- wedata/feature_store/entities/function_info.py +90 -0
- wedata/feature_store/entities/on_demand_column_info.py +57 -0
- wedata/feature_store/entities/source_data_column_info.py +24 -0
- wedata/feature_store/entities/training_set.py +135 -0
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +482 -0
- wedata/feature_store/feature_table_client/__init__.py +0 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
- wedata/feature_store/mlflow_model.py +17 -0
- wedata/feature_store/spark_client/__init__.py +0 -0
- wedata/feature_store/spark_client/spark_client.py +289 -0
- wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata/feature_store/training_set_client/training_set_client.py +572 -0
- wedata/feature_store/utils/__init__.py +0 -0
- wedata/feature_store/utils/common_utils.py +352 -0
- wedata/feature_store/utils/env_utils.py +86 -0
- wedata/feature_store/utils/feature_lookup_utils.py +564 -0
- wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata/feature_store/utils/feature_utils.py +73 -0
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- wedata/feature_store/utils/schema_utils.py +117 -0
- wedata/feature_store/utils/signature_utils.py +202 -0
- wedata/feature_store/utils/topological_sort.py +158 -0
- wedata/feature_store/utils/training_set_utils.py +579 -0
- wedata/feature_store/utils/uc_utils.py +296 -0
- wedata/feature_store/utils/validation_utils.py +79 -0
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +448 -0
- wedata/tempo/intervals.py +1331 -0
- wedata/tempo/io.py +61 -0
- wedata/tempo/ml.py +129 -0
- wedata/tempo/resample.py +318 -0
- wedata/tempo/tsdf.py +1720 -0
- wedata/tempo/utils.py +254 -0
|
@@ -0,0 +1,482 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
__doc__ = """
|
|
4
|
+
Feast客户端,用于与Feast服务器交互
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
from typing import List, Dict, Optional, Any
|
|
11
|
+
try:
|
|
12
|
+
# pyspark 3.5.0 以后
|
|
13
|
+
from pyspark.errors import IllegalArgumentException
|
|
14
|
+
except ModuleNotFoundError:
|
|
15
|
+
from pyspark.sql.utils import IllegalArgumentException
|
|
16
|
+
|
|
17
|
+
import pandas
|
|
18
|
+
import pytz
|
|
19
|
+
from feast import FeatureStore, RepoConfig, FeatureView
|
|
20
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
21
|
+
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
22
|
+
from feast import Entity, FeatureService
|
|
23
|
+
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import SparkSource
|
|
24
|
+
from feast.infra.online_stores.redis import RedisOnlineStore
|
|
25
|
+
from feast.errors import FeatureServiceNotFoundException
|
|
26
|
+
from feast.types import ValueType
|
|
27
|
+
from pyspark.sql.types import (
|
|
28
|
+
TimestampType, DateType, StructType, NullType, ByteType, IntegerType, DecimalType, DoubleType, FloatType,
|
|
29
|
+
BooleanType,
|
|
30
|
+
StringType, ArrayType, LongType
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
TEMP_FILE_PATH = "/tmp/feast_data/"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FeastClient:
|
|
37
|
+
|
|
38
|
+
def __init__(self, offline_store: SparkSession, online_store_config: RedisStoreConfig = None):
|
|
39
|
+
project_id = os.getenv("WEDATA_PROJECT_ID", "")
|
|
40
|
+
remote_path = os.getenv("FEAST_REMOTE_ADDRESS", "")
|
|
41
|
+
if offline_store is None or not isinstance(offline_store, SparkSession):
|
|
42
|
+
raise ValueError("offline_store must be provided SparkSession instance")
|
|
43
|
+
|
|
44
|
+
# 应用Spark配置
|
|
45
|
+
spark_conf_dict = dict()
|
|
46
|
+
spark_conf = offline_store.sparkContext.getConf().getAll()
|
|
47
|
+
for item in spark_conf:
|
|
48
|
+
spark_conf_dict[item[0]] = item[1]
|
|
49
|
+
|
|
50
|
+
config = RepoConfig(
|
|
51
|
+
project=project_id,
|
|
52
|
+
registry={"registry_type": "remote", "path": remote_path},
|
|
53
|
+
provider="local",
|
|
54
|
+
online_store={"type": "redis",
|
|
55
|
+
"connection_string": online_store_config.connection_string} if online_store_config else None,
|
|
56
|
+
offline_store={"type": "spark", "spark_conf": spark_conf_dict},
|
|
57
|
+
batch_engine={"type": "spark.engine"},
|
|
58
|
+
entity_key_serialization_version=2
|
|
59
|
+
)
|
|
60
|
+
self._client = FeatureStore(config=config)
|
|
61
|
+
self._spark = offline_store
|
|
62
|
+
self._spark.builder.enableHiveSupport()
|
|
63
|
+
# 设置Spark时区为pytz时区,避免后续spark操作toPandas时出现时区问题
|
|
64
|
+
try:
|
|
65
|
+
spark_timezone = self._spark.conf.get("spark.sql.session.timeZone", "")
|
|
66
|
+
if spark_timezone:
|
|
67
|
+
pytz_timezone = _translate_spark_timezone(spark_timezone)
|
|
68
|
+
self._spark.conf.set("spark.sql.session.timeZone", pytz_timezone)
|
|
69
|
+
else:
|
|
70
|
+
self._spark.conf.set("spark.sql.session.timeZone", "Etc/GMT+8")
|
|
71
|
+
except IllegalArgumentException:
|
|
72
|
+
self._spark.conf.set("spark.sql.session.timeZone", "Etc/GMT+8")
|
|
73
|
+
|
|
74
|
+
@property
|
|
75
|
+
def client(self):
|
|
76
|
+
return self._client
|
|
77
|
+
|
|
78
|
+
def create_table(self,
|
|
79
|
+
table_name: str,
|
|
80
|
+
primary_keys: List[str],
|
|
81
|
+
timestamp_key: str,
|
|
82
|
+
df: Optional[DataFrame] = None,
|
|
83
|
+
schema: Optional[StructType] = None,
|
|
84
|
+
tags: Optional[Dict[str, str]] = None,
|
|
85
|
+
description: Optional[str] = None):
|
|
86
|
+
if schema is not None and df is None:
|
|
87
|
+
# 创建空的Spark DataFrame
|
|
88
|
+
df = self._spark.createDataFrame([], schema)
|
|
89
|
+
feast_table_name = translate_table_name_to_feast(table_name)
|
|
90
|
+
entities = _get_entity_from_schema(feast_table_name, df.schema, primary_keys)
|
|
91
|
+
feature_view = _create_table_to_feature_view(
|
|
92
|
+
table_name=table_name,
|
|
93
|
+
primary_keys=primary_keys,
|
|
94
|
+
entities=entities,
|
|
95
|
+
timestamp_key=timestamp_key,
|
|
96
|
+
df=df,
|
|
97
|
+
tags=tags,
|
|
98
|
+
description=description
|
|
99
|
+
)
|
|
100
|
+
self._apply_feature_view(table_name, entities, feature_view)
|
|
101
|
+
|
|
102
|
+
    def _apply_feature_view(self, table_name, entities, feature_view: FeatureView):
        """Apply a FeatureView, its entities and the per-database FeatureService.

        One FeatureService is maintained per database (the part of
        ``table_name`` before the dot); the view's projection is inserted or
        replaced inside it.
        """
        database_name, old_table_name = table_name.split(".")
        try:
            feature_service = self._client.get_feature_service(database_name)
        except FeatureServiceNotFoundException:
            # No service for this database yet — start one with this view.
            feature_service = FeatureService(name=database_name, features=[feature_view])
        else:
            # NOTE(review): an empty-name service is treated as absent — confirm
            # which registry state produces this.
            if feature_service.name == "":
                feature_service = FeatureService(name=database_name, features=[feature_view])
            else:
                # For an existing FeatureService, replace the matching
                # FeatureView projection in place, or append a new one.
                update_flag = False
                for index in range(0, len(feature_service.feature_view_projections)):
                    if feature_service.feature_view_projections[index].name == feature_view.name:
                        # Swap in the fresh projection for this view.
                        feature_service.feature_view_projections[index] = feature_view.projection
                        update_flag = True
                        break
                if not update_flag:
                    feature_service.feature_view_projections.append(feature_view.projection)
        self._client.apply(feature_view)
        self._client.apply(entities)
        self._client.apply(feature_service)
|
|
125
|
+
|
|
126
|
+
    def remove_offline_table(self, table_name: str):
        """Remove a table's data source, FeatureView and service projection.

        Best-effort: missing views/services are tolerated so the final
        ``delete_feature_view`` always runs.
        """
        feast_table_name = translate_table_name_to_feast(table_name)
        database_name, old_table_name = table_name.split(".")
        self._client.registry.delete_data_source(feast_table_name, self._client.project)
        try:
            feature_view = self.get_feature_view(table_name)
        except Exception as e:
            # View already gone — nothing to detach from the service.
            pass
        else:
            try:
                feature_service = self._client.get_feature_service(database_name)
            except Exception as e:
                print(f"feature_service:{database_name} not found")
            else:
                # Drop this view's projection from the database-level service.
                for index in range(0, len(feature_service.feature_view_projections)):
                    if feature_service.feature_view_projections[index].name == feature_view.name:
                        feature_service.feature_view_projections.pop(index)
                        break
                self._client.apply(feature_service)
        self._client.registry.delete_feature_view(feast_table_name, self._client.project)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_feature_view(self, table_name: str):
|
|
150
|
+
feast_table_name = translate_table_name_to_feast(table_name)
|
|
151
|
+
return self._client.get_feature_view(feast_table_name)
|
|
152
|
+
|
|
153
|
+
    def remove_online_table(self, table_name: str):
        """Delete a table's materialized online data and mark the view offline.

        Only the Redis online store is supported.

        Raises:
            ValueError: if no online store is configured, the view is missing,
                or the online store type is not redis.
        """
        if not self._client.config.online_store:
            raise ValueError("Online store is not configured")

        feast_table_name = translate_table_name_to_feast(table_name)
        table_view = self._client.get_feature_view(feast_table_name)
        if not table_view:
            raise ValueError(f"Table {table_name} not found in Feast")

        if self._client.config.online_store.type == "redis":
            print("redis table_view join_keys:", table_view.join_keys)
            redis_online_store = RedisOnlineStore()
            redis_online_store.delete_table(self._client.config, table_view)
            # Reset materialization state so a future materialize starts fresh.
            table_view.online = False
            table_view.materialization_intervals = []
            self._client.apply(table_view)
        else:
            raise ValueError(f"Unsupported online store type: {self._client.config.online_store.type}")

        self._client.refresh_registry()
|
|
173
|
+
|
|
174
|
+
    def alter_table(self, full_table_name: str, timestamp_key: str, primary_keys: List[str]):
        """
        Sync an already-registered Delta table into Feast as offline feature data.

        Args:
            full_table_name: table name (format: <table>)
            timestamp_key: timestamp column name
            primary_keys: primary key column names
        Raises:
            ValueError: when the table does not exist or arguments are invalid
            RuntimeError: when the sync operation fails
        """
        import logging
        try:

            # 1. Read the Delta table's data and schema.
            df = self._spark.table(full_table_name)

            feast_table_name = translate_table_name_to_feast(full_table_name)
            entities = _get_entity_from_schema(feast_table_name, df.schema, primary_keys)
            # 2. Read table properties (only "tags" is consumed below; the
            # key/timestamp columns come from the arguments).
            tbl_props = self._spark.sql(f"SHOW TBLPROPERTIES {full_table_name}").collect()
            props = {row['key']: row['value'] for row in tbl_props}

            if not primary_keys:
                raise ValueError("Primary keys not found in table properties")
            if not timestamp_key:
                raise ValueError("Timestamp keys not found in table properties")

            logging.info(f"Primary keys: {primary_keys}")
            logging.info(f"Timestamp keys: {timestamp_key}")

            # 3. Create or update the FeatureView.
            feature_view = _create_table_to_feature_view(
                table_name=full_table_name,
                entities=entities,
                primary_keys=primary_keys,
                timestamp_key=timestamp_key,
                df=df,
                tags={"source": "delta_table", **json.loads(props.get("tags", "{}"))},
            )

            self._apply_feature_view(full_table_name, entities, feature_view)
            # 4. Applied to Feast.
            logging.info(f"Successfully synced Delta table {full_table_name} to Feast")

        except Exception as e:
            # Broad catch: any failure (including the ValueErrors above) is
            # re-raised as RuntimeError with the original as __cause__.
            logging.error(f"Failed to sync Delta table to Feast: {str(e)}")
            raise RuntimeError(f"Failed to sync Delta table {full_table_name} to Feast: {str(e)}") from e
|
|
223
|
+
|
|
224
|
+
def modify_tags(
|
|
225
|
+
self,
|
|
226
|
+
table_name: str,
|
|
227
|
+
tags: Dict[str, str]
|
|
228
|
+
) -> None:
|
|
229
|
+
"""修改特征表的标签信息
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
table_name: 特征表名称(格式: <database>.<table>)
|
|
233
|
+
tags: 要更新的标签字典
|
|
234
|
+
|
|
235
|
+
Raises:
|
|
236
|
+
ValueError: 当参数无效时抛出
|
|
237
|
+
RuntimeError: 当修改操作失败时抛出
|
|
238
|
+
"""
|
|
239
|
+
if not table_name:
|
|
240
|
+
raise ValueError("table_name cannot be empty")
|
|
241
|
+
if not tags:
|
|
242
|
+
raise ValueError("tags cannot be empty")
|
|
243
|
+
|
|
244
|
+
feast_table_name = translate_table_name_to_feast(table_name)
|
|
245
|
+
try:
|
|
246
|
+
# 获取现有的FeatureView
|
|
247
|
+
feature_view = self._client.get_feature_view(feast_table_name)
|
|
248
|
+
if not feature_view:
|
|
249
|
+
raise ValueError(f"FeatureView '{table_name}' not found")
|
|
250
|
+
|
|
251
|
+
# 更新标签
|
|
252
|
+
current_tags = feature_view.tags or {}
|
|
253
|
+
current_tags.update(tags)
|
|
254
|
+
feature_view.tags = current_tags
|
|
255
|
+
|
|
256
|
+
# 应用更新
|
|
257
|
+
self._client.apply([feature_view])
|
|
258
|
+
print(f"Successfully updated tags for table '{table_name}'")
|
|
259
|
+
|
|
260
|
+
except Exception as e:
|
|
261
|
+
raise RuntimeError(f"Failed to modify tags for table '{table_name}': {str(e)}") from e
|
|
262
|
+
|
|
263
|
+
    def get_online_table_view(self, full_table_name: str, columns_name: List[str], entity_rows: List[Dict[str, Any]]) -> pandas.DataFrame:
        """
        Fetch online feature values for the given entity rows.

        Args:
            full_table_name: feature table name (format: <database>.<table>)
            columns_name: feature columns to fetch
            entity_rows: entity key/value rows (list of dicts, or a single dict)
        Returns:
            pandas.DataFrame of the online feature values
        Raises:
            TypeError: when entity_rows is neither a list nor a dict
            ValueError: when the table is not present in the registry
        """
        feast_table = translate_table_name_to_feast(full_table_name)
        feature_names = []
        for column_name in columns_name:
            # Feast feature references are "<view_name>:<column>".
            feature_names.append(f"{feast_table}:{column_name}")

        # Rewrite entity keys through _get_entity_name (currently an identity
        # mapping). NOTE(review): this passes the raw full_table_name while
        # read_offline_table passes the feast name — harmless today, but
        # confirm before changing _get_entity_name.
        if isinstance(entity_rows, list):
            new_entity_rows = []
            for entity_row in entity_rows:
                temp_entity_row = {}
                for key, value in entity_row.items():
                    temp_entity_row[_get_entity_name(full_table_name, key)] = value
                new_entity_rows.append(temp_entity_row)
        elif isinstance(entity_rows, dict):
            new_entity_rows = {}
            for key, value in entity_rows.items():
                new_entity_rows[_get_entity_name(full_table_name, key)] = value
        else:
            raise TypeError("entity_rows must be a list or dict")

        try:
            self._client.refresh_registry()
            online_stores = self._client.get_online_features(features=feature_names, entity_rows=new_entity_rows)
        except UnboundLocalError as e:
            # Feast raises UnboundLocalError internally when the view is
            # missing; surface it as a clearer ValueError.
            raise ValueError(f"{full_table_name} table not in feast registry. {str(e)}")

        return online_stores.to_df()
|
|
297
|
+
|
|
298
|
+
    def read_offline_table(self, table_name: str, database_name: str, columns_df: pandas.DataFrame,
                           full_feature_names=True) -> pandas.DataFrame:
        """
        Fetch offline (historical) feature data stored in Feast.

        Args:
            table_name: table name (without database prefix)
            database_name: database name; also selects the FeatureService
            columns_df: entity DataFrame passed to get_historical_features.
                NOTE(review): renamed in place, so the caller's DataFrame is
                mutated (currently a no-op rename since _get_entity_name is
                the identity).
            full_feature_names: forwarded to get_historical_features
        Returns:
            pandas.DataFrame with the historical feature values
        Raises:
            TypeError: when columns_df is not a pandas DataFrame
        """
        if not isinstance(columns_df, pandas.DataFrame):
            raise TypeError("columns_df must be a pandas.DataFrame instance")

        full_table_name = f"{database_name}.{table_name}"
        feast_table_name = translate_table_name_to_feast(full_table_name)
        # Bulk-rename DataFrame columns to their entity names.
        rename_dict = {}
        for column_name in columns_df.columns:
            rename_dict[column_name] = _get_entity_name(feast_table_name, column_name)

        columns_df.rename(columns=rename_dict, inplace=True)
        features = self._client.get_feature_service(database_name, allow_cache=False)
        result = self._client.get_historical_features(
            entity_df=columns_df, features=features, full_feature_names=full_feature_names)
        return result.to_df()
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _create_table_to_feature_view(
        table_name: str,
        entities: List[Entity],
        primary_keys: List[str],
        timestamp_key: str,
        df: Optional[DataFrame],
        tags: Optional[Dict[str, str]] = None,
        description: Optional[str] = None,
):
    """
    Build an offline FeatureView backed by a SparkSource for ``table_name``.

    Args:
        table_name: table name (format: <database>.<table>)
        entities: Feast entities for the view
        primary_keys: primary key columns (validated only; the entities above
            carry the actual key information)
        timestamp_key: event-timestamp column
        df: DataFrame whose contents are snapshotted to a local parquet file
        tags: optional tags for both the source and the view
        description: optional description (attached to the source only)

    Returns:
        FeatureView instance (with ``online`` forced to False)

    Raises:
        ValueError: when primary_keys or timestamp_key is empty
    """
    if primary_keys is None or len(primary_keys) == 0:
        raise ValueError("primary_keys must not be empty")
    if not timestamp_key:
        raise ValueError("timestamp_keys must not be empty")

    os.makedirs(TEMP_FILE_PATH, exist_ok=True)

    temp_file = os.path.join(TEMP_FILE_PATH, f"{table_name}.parquet")

    # NOTE(review): this parquet snapshot is written but its path is not used
    # by the SparkSource below (the commented-out ``path=`` hints at an earlier
    # file-based design) — confirm whether the write is still needed.
    df.write.parquet(f"file://{temp_file}", mode="overwrite")
    feast_table_name = translate_table_name_to_feast(table_name)
    resources = SparkSource(
        name=feast_table_name,
        table=table_name,
        # path=f"file://{temp_file}",
        timestamp_field=timestamp_key,
        # query=f"SELECT * FROM {table_name}",
        # file_format="parquet",
        tags=tags,
        description=description,
    )

    # Assemble the FeatureView around the Spark source.
    feature_view = FeatureView(
        name=feast_table_name,
        entities=entities,
        tags=tags,
        source=resources,
    )
    # Views start offline; materialization to the online store is separate.
    feature_view.online = False
    return feature_view
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _translate_spark_timezone(timezone: str) -> str:
|
|
368
|
+
"""
|
|
369
|
+
将Spark时区字符串转换为pytz时区字符串
|
|
370
|
+
Args:
|
|
371
|
+
timezone: Spark时区字符串
|
|
372
|
+
Returns:
|
|
373
|
+
Feast时区字符串
|
|
374
|
+
"""
|
|
375
|
+
try:
|
|
376
|
+
py_timezone = pytz.timezone(timezone)
|
|
377
|
+
except pytz.exceptions.UnknownTimeZoneError:
|
|
378
|
+
# GMT+08:00 转换为 'Etc/GMT+8'
|
|
379
|
+
result = re.compile(r"GMT([+-])(\d{2}):(\d{2})").match(timezone)
|
|
380
|
+
if result:
|
|
381
|
+
groups = result.groups()
|
|
382
|
+
if len(groups) == 3:
|
|
383
|
+
return f"Etc/GMT{groups[0]}{int(groups[1])}"
|
|
384
|
+
else:
|
|
385
|
+
raise ValueError(f"Invalid timezone string: {timezone}")
|
|
386
|
+
else:
|
|
387
|
+
return str(py_timezone)
|
|
388
|
+
|
|
389
|
+
return timezone
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _get_entity_name(table_name: str, field_name: str):
|
|
393
|
+
return field_name
|
|
394
|
+
# return f"{table_name}_{field_name}"
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _get_entity_from_schema(table_name: str, schema: StructType, primary_list: List[str] = None) -> List[Entity]:
    """
    Build Feast entities from a Spark DataFrame schema.

    Args:
        table_name: table name (used for entity naming)
        schema: Spark DataFrame schema
        primary_list: when given, only these columns become entities
    Returns:
        List[Entity] — one per supported, non-timestamp column
    """
    # Ordered (spark type, feast value type) pairs; isinstance checks run in
    # this order, mirroring the original if/elif chain.
    scalar_map = [
        (IntegerType, ValueType.INT32),
        (StringType, ValueType.STRING),
        ((DecimalType, FloatType), ValueType.FLOAT),
        (DoubleType, ValueType.DOUBLE),
        (BooleanType, ValueType.BOOL),
        (ByteType, ValueType.BYTES),
        (LongType, ValueType.INT64),
        (NullType, ValueType.NULL),
    ]
    element_map = [
        (ByteType, ValueType.BYTES_LIST),
        (StringType, ValueType.STRING_LIST),
        (IntegerType, ValueType.INT32_LIST),
        (LongType, ValueType.INT64_LIST),
        (DoubleType, ValueType.DOUBLE_LIST),
        ((DecimalType, FloatType), ValueType.FLOAT_LIST),
        (BooleanType, ValueType.BOOL_LIST),
    ]

    entities = []
    for field in schema.fields:
        if primary_list and field.name not in primary_list:
            continue

        entity_name = _get_entity_name(table_name, field.name)
        dtype = field.dataType

        # Timestamp-like columns never become entities.
        if isinstance(dtype, (TimestampType, DateType)):
            continue

        if isinstance(dtype, ArrayType):
            element = dtype.elementType
            if isinstance(element, (TimestampType, DateType)):
                continue
            for spark_type, value_type in element_map:
                if isinstance(element, spark_type):
                    entities.append(Entity(name=entity_name, value_type=value_type))
                    break
            else:
                print(f"Unsupported array element type: {element}")
            continue

        for spark_type, value_type in scalar_map:
            if isinstance(dtype, spark_type):
                entities.append(Entity(name=entity_name, value_type=value_type))
                break
        else:
            print(f"Unsupported field type: {dtype}")

    return entities
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def translate_table_name_to_feast(table_name: str):
    """Convert ``<database>.<table>`` into the Feast-safe ``<database>_<table>``.

    A bare table name passes through unchanged.

    Raises:
        ValueError: if the name contains more than one dot.
    """
    parts = table_name.split(".")
    if len(parts) > 2:
        raise ValueError(f"Invalid table name: {table_name}")
    return "_".join(parts)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
if __name__ == '__main__':
    # NOTE(review): ad-hoc scratch/demo code — NOT runnable as written:
    # FeastClient() is missing its required SparkSession argument, several
    # registry calls below omit required parameters, and table_name /
    # database_name are undefined. Kept verbatim; confirm whether this block
    # should be removed or replaced with a real usage example.
    import datetime
    FeastClient = FeastClient()  # rebinding shadows the class name
    FeastClient.client.registry.delete_data_source(name="xxxxx")
    FeastClient.client.registry.delete_entity("xxxxx", )
    FeastClient.client.registry.delete_feature_view()
    FeastClient.client.registry.get_feature_view()
    FeastClient.client.registry.delete_feature_service()
    FeastClient.client.get_historical_features()
    feature_view = FeastClient.client.get_feature_view(name="xxxxx")
    feature_view.source.get_table_query_string()
    feast_table_name = "xxx"
    from wedata.feature_store.utils.common_utils import build_full_table_name
    feast_table_name = translate_table_name_to_feast(build_full_table_name(table_name, database_name))
    FeastClient.client.materialize(start_date=datetime.datetime(2021,1,1), end_date=datetime.datetime.now(), feature_views=[feast_table_name])
|
|
File without changes
|