tencent-wedata-feature-engineering-dev 0.1.49__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev has been flagged as potentially problematic.
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
- tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
- wedata/feature_store/client.py +28 -92
- wedata/feature_store/constants/constants.py +2 -5
- wedata/feature_store/entities/feature_lookup.py +0 -17
- wedata/feature_store/entities/feature_spec.py +2 -2
- wedata/feature_store/entities/feature_table.py +1 -5
- wedata/feature_store/entities/function_info.py +4 -1
- wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
- wedata/feature_store/spark_client/spark_client.py +15 -41
- wedata/feature_store/training_set_client/training_set_client.py +10 -9
- wedata/feature_store/utils/common_utils.py +4 -48
- wedata/feature_store/utils/feature_lookup_utils.py +43 -37
- wedata/feature_store/utils/feature_spec_utils.py +1 -1
- wedata/feature_store/utils/uc_utils.py +1 -1
- tencent_wedata_feature_engineering_dev-0.1.49.dist-info/RECORD +0 -66
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +0 -108
- wedata/feature_store/cloud_sdk_client/models.py +0 -686
- wedata/feature_store/cloud_sdk_client/utils.py +0 -39
- wedata/feature_store/common/log/__init__.py +0 -0
- wedata/feature_store/common/log/logger.py +0 -40
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +0 -48
- wedata/feature_store/constants/engine_types.py +0 -34
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +0 -487
- wedata/feature_store/utils/env_utils.py +0 -108
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +0 -448
- wedata/tempo/intervals.py +0 -1331
- wedata/tempo/io.py +0 -61
- wedata/tempo/ml.py +0 -129
- wedata/tempo/resample.py +0 -318
- wedata/tempo/tsdf.py +0 -1720
- wedata/tempo/utils.py +0 -254
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/feature_store/feature_table_client/feature_table_client.py

@@ -3,79 +3,26 @@
 """
 import json
 from typing import Union, List, Dict, Optional, Sequence, Any
-
-import tencentcloud.common.exception
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.streaming import StreamingQuery
 from pyspark.sql.types import StructType
 import os
-
-from wedata.feature_store.constants.constants import
-
-    FEATURE_TABLE_VALUE, FEATURE_TABLE_PROJECT, FEATURE_TABLE_TIMESTAMP,
-    FEATURE_TABLE_BACKUP_PRIMARY_KEY, FEATURE_DLC_TABLE_PRIMARY_KEY)
-from wedata.feature_store.constants.engine_types import EngineTypes
-from wedata.feature_store.common.store_config.redis import RedisStoreConfig
+
+from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER, FEATURE_TABLE_KEY, \
+    FEATURE_TABLE_VALUE, FEATURE_TABLE_PROJECT
 from wedata.feature_store.entities.feature_table import FeatureTable
 from wedata.feature_store.spark_client.spark_client import SparkClient
-from wedata.feature_store.utils import common_utils
-from wedata.feature_store.feast_client.feast_client import FeastClient
-from wedata.feature_store.cloud_sdk_client.models import (
-    TaskSchedulerConfiguration, OnlineFeatureConfiguration, OfflineFeatureConfiguration,
-    CreateOnlineFeatureTableRequest, DescribeNormalSchedulerExecutorGroupsRequest, RefreshFeatureTableRequest)
-from wedata.feature_store.cloud_sdk_client.client import FeatureCloudSDK
+from wedata.feature_store.utils import common_utils
 
 
 class FeatureTableClient:
     """Feature table operations class"""
 
     def __init__(
-
-
-            cloud_secret_id: str = None,
-            cloud_secret_key: str = None,
+            self,
+            spark: SparkSession
     ):
         self._spark = spark
-        self._feast_client = FeastClient(spark)
-        if cloud_secret_id and cloud_secret_key:
-            self.__cloud_secret_id = cloud_secret_id
-            self.__cloud_secret_key = cloud_secret_key
-        else:
-            self.__cloud_secret_id, self.__cloud_secret_key = env_utils.get_cloud_secret()
-        self.__project = env_utils.get_project_id()
-        self.__region = env_utils.get_region()
-
-    @property
-    def cloud_secret_id(self) -> str:
-        if not self.__cloud_secret_id:
-            raise ValueError("cloud_secret_id is empty. please set it first.")
-        return self.__cloud_secret_id
-
-    @cloud_secret_id.setter
-    def cloud_secret_id(self, cloud_secret_id: str):
-        if not cloud_secret_id:
-            raise ValueError("cloud_secret_id cannot be None")
-        self.__cloud_secret_id = cloud_secret_id
-
-    @property
-    def cloud_secret_key(self) -> str:
-        if not self.__cloud_secret_key:
-            raise ValueError("cloud_secret_key is empty. please set it first.")
-        return self.__cloud_secret_key
-
-    @cloud_secret_key.setter
-    def cloud_secret_key(self, cloud_secret_key: str):
-        if not cloud_secret_key:
-            raise ValueError("cloud_secret_key cannot be None")
-        self.__cloud_secret_key = cloud_secret_key
-
-    @property
-    def project(self) -> str:
-        return self.__project
-
-    @property
-    def region(self) -> str:
-        return self.__region
 
     @staticmethod
     def _normalize_params(
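The constructor hunk above is the heart of this release: the Tencent Cloud SDK plumbing (secrets, project, region, Feast client) is gone, and the client now wraps only a SparkSession. A minimal sketch of how instantiation changes, assuming the package is installed and a Spark session is available (the surrounding script is illustrative, not part of the package):

    from pyspark.sql import SparkSession
    from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient

    spark = SparkSession.builder.appName("wedata-example").getOrCreate()

    # 0.1.49: cloud credentials were accepted (or resolved via env_utils)
    # client = FeatureTableClient(spark, cloud_secret_id="...", cloud_secret_key="...")

    # 0.2.0: only the Spark session is needed
    client = FeatureTableClient(spark)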
@@ -105,39 +52,26 @@ class FeatureTableClient:
         )
 
     @staticmethod
-    def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: str):
+    def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: List[str]):
         """Validate that primary keys and timestamp keys do not conflict"""
-
-
-
-    @staticmethod
-    def _validate_key_exists(primary_keys: List[str], timestamp_keys: str):
-        """Validate that primary keys and timestamp keys exist"""
-        if not primary_keys:
-            raise ValueError("Primary keys cannot be empty")
-        if not timestamp_keys:
-            raise ValueError("Timestamp keys cannot be empty")
+        conflict_keys = set(timestamp_keys) & set(primary_keys)
+        if conflict_keys:
+            raise ValueError(f"Timestamp keys conflict with primary keys: {conflict_keys}")
 
     @staticmethod
     def _escape_sql_value(value: str) -> str:
         """Escape special characters in a SQL value"""
         return value.replace("'", "''")
 
-    @staticmethod
-    def _check_sequence_element_type(sequence: Sequence[Any], element_type: type) -> bool:
-        """Check that all elements in a sequence are of the given type"""
-        return all(isinstance(element, element_type) for element in sequence)
-
     def create_table(
             self,
             name: str,
            primary_keys: Union[str, List[str]],
-            timestamp_key: str,
-            engine_type: EngineTypes,
-            data_source_name: str,
             database_name: Optional[str] = None,
+            location: Optional[str] = None,
             df: Optional[DataFrame] = None,
             *,
+            timestamp_keys: Union[str, List[str], None] = None,
             partition_columns: Union[str, List[str], None] = None,
             schema: Optional[StructType] = None,
             description: Optional[str] = None,
@@ -151,14 +85,14 @@
             name: Full feature table name (format: <table>)
             primary_keys: Primary key column names (composite keys supported)
             database_name: Optional[str] = None,
-
+            location: Optional[str] = None,
             df: Initial data (optional, used to infer the schema)
-
-            engine_type: Engine type  version:: 1.33
+            timestamp_keys: Timestamp keys (for temporal features)
             partition_columns: Partition columns (optimizes storage and queries)
             schema: Table schema definition (optional; required when df is not provided)
             description: Business description
             tags: Business tags
+
         Returns:
             FeatureTable instance
 
@@ -168,33 +102,24 @@
 
         # Normalize parameters
         primary_keys = self._normalize_params(primary_keys)
+        timestamp_keys = self._normalize_params(timestamp_keys)
         partition_columns = self._normalize_params(partition_columns)
 
-        assert self._check_sequence_element_type(primary_keys, str), "primary_keys must be a list of strings"
-        assert self._check_sequence_element_type(partition_columns, str), "partition_columns must be a list of strings"
-        assert isinstance(timestamp_key, str), "timestamp key must be string"
-
         # Metadata validation
         self._validate_schema(df, schema)
-        self.
-        self._validate_key_conflicts(primary_keys, timestamp_key)
+        self._validate_key_conflicts(primary_keys, timestamp_keys)
 
         # Table name validation
         common_utils.validate_table_name(name)
 
         common_utils.validate_database(database_name)
 
-        # Check primary keys for duplicates
-        dup_list = common_utils.get_duplicates(primary_keys)
-        if dup_list :
-            raise ValueError(f"Primary keys have duplicates: {dup_list}")
-
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
 
         # Check whether the table exists
         try:
-            if self.
+            if self._spark.catalog.tableExists(table_name):
                 raise ValueError(
                     f"Table '{name}' already exists\n"
                     "Solutions:\n"
@@ -204,20 +129,12 @@
         except Exception as e:
             raise ValueError(f"Error checking table existence: {str(e)}") from e
 
-        try:
-            self._sync_table_info(table_name=name, action_name="create",
-                                  database_name=env_utils.get_database_name(database_name),
-                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
-                                  is_try=True)
-        except tencentcloud.common.exception.TencentCloudSDKException as e:
-            raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
         # Infer the table schema
         table_schema = schema or df.schema
 
         # Build timestamp key properties
 
-
+        # Fetch extra tags from environment variables
         env_tags = {
             "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
             "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
@@ -227,42 +144,13 @@
         # Build table properties (via TBLPROPERTIES)
         tbl_properties = {
             "wedata.feature_table": "true",
-
+            "primaryKeys": ",".join(primary_keys),
             "wedata.feature_project_id": f"{json.dumps([projectId])}",
-
+            "timestampKeys": ",".join(timestamp_keys) if timestamp_keys else "",
             "comment": description or "",
             **{f"{k}": v for k, v in (tags or {}).items()},
             **{f"feature_{k}": v for k, v in (env_tags or {}).items()}
         }
-        if engine_type == EngineTypes.ICEBERG_ENGINE:
-            if partition_columns:
-                tbl_properties.update({
-                    'format-version': '2',
-                    'write.upsert.enabled': 'true',
-                    'write.update.mode': 'merge-on-read',
-                    'write.merge.mode': 'merge-on-read',
-                    'write.parquet.bloom-filter-enabled.column.id': 'true',
-                    'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
-                    'write.distribution-mode': 'hash',
-                    'write.metadata.delete-after-commit.enabled': 'true',
-                    'write.metadata.previous-versions-max': '100',
-                    'write.metadata.metrics.default': 'full',
-                    'smart-optimizer.inherit': 'default',
-                })
-            else:
-                tbl_properties.update({
-                    'format-version': '2',
-                    'write.upsert.enabled': 'true',
-                    'write.update.mode': 'merge-on-read',
-                    'write.merge.mode': 'merge-on-read',
-                    'write.parquet.bloom-filter-enabled.column.id': 'true',
-                    'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
-                    'write.distribution-mode': 'hash',
-                    'write.metadata.delete-after-commit.enabled': 'true',
-                    'write.metadata.previous-versions-max': '100',
-                    'write.metadata.metrics.default': 'full',
-                    'smart-optimizer.inherit': 'default',
-                })
 
         # Build column definitions
         columns_ddl = []
@@ -284,8 +172,7 @@
         )
         # Local debugging: iceberg --> PARQUET
         # Core CREATE TABLE statement
-
-            ddl = f"""
+        ddl = f"""
         CREATE TABLE {table_name} (
             {', '.join(columns_ddl)}
         )
@@ -294,20 +181,7 @@
         TBLPROPERTIES (
             {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
         )
-
-        elif engine_type == EngineTypes.HIVE_ENGINE:
-            ddl = f"""
-            CREATE TABLE {table_name} (
-                {', '.join(columns_ddl)}
-            )
-            {partition_expr}
-            -- STORED AS PARQUET
-            TBLPROPERTIES (
-                {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
-            )
-            """
-        else:
-            raise ValueError(f"Engine type {engine_type} is not supported")
+        """
 
         # Print the SQL
         print(f"create table ddl: {ddl}\n")
@@ -320,28 +194,8 @@
         except Exception as e:
             raise ValueError(f"Failed to create table: {str(e)}") from e
 
-        print("async table info to feast")
-
-        self._feast_client.create_table(
-            table_name=table_name,
-            primary_keys=primary_keys,
-            timestamp_key=timestamp_key,
-            df=df,
-            schema=table_schema,
-            tags=tags,
-            description=description
-        )
-
         print(f"create table {name} done")
 
-        try:
-            self._sync_table_info(table_name=name, action_name="create",
-                                  database_name=env_utils.get_database_name(database_name),
-                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
-                                  is_try=False)
-        except tencentcloud.common.exception.TencentCloudSDKException as e:
-            raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
         # Build and return the FeatureTable object
         return FeatureTable(
             name=name,
@@ -350,7 +204,7 @@
             primary_keys=primary_keys,
             partition_columns=partition_columns or [],
             features=[field.name for field in table_schema.fields],
-            timestamp_keys=
+            timestamp_keys=timestamp_keys or [],
             tags=dict(**tags or {}, **env_tags)
         )
 
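Putting the create_table hunks together: engine_type, data_source_name, and the positional timestamp_key are gone; timestamp_keys is now keyword-only and optional, and location was added. A hedged sketch of a 0.2.0 call (table and column names are placeholders, and df is assumed to be an existing Spark DataFrame):

    # from pyspark.sql.types import StructType  # only if passing schema= instead of df=

    table = client.create_table(
        name="user_features",
        primary_keys=["user_id"],
        database_name="feature_db",
        df=df,
        timestamp_keys="event_time",  # keyword-only in 0.2.0; replaces positional timestamp_key
        partition_columns=["dt"],
        description="user-level features",
        tags={"owner": "data-team"},
    )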
@@ -395,10 +249,6 @@
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
 
-        # Check whether the table exists
-        if not self._check_table_exists(table_name):
-            raise ValueError(f"table '{name}' not exists")
-
         # Determine whether the DataFrame is streaming
         is_streaming = df.isStreaming
 
@@ -411,8 +261,7 @@
             writer = df.writeStream \
                 .format("parquet") \
                 .outputMode(mode) \
-                .option("checkpointLocation", checkpoint_location)
-                # .foreachBatch(process_batch)
+                .option("checkpointLocation", checkpoint_location)
 
             if trigger:
                 writer = writer.trigger(**trigger)
@@ -423,23 +272,18 @@
                 df.write \
                     .mode(mode) \
                     .insertInto(table_name)
-                # self._feast_client.client.write_to_offline_store(feature_view_name=table_name, df=df.toPandas(), allow_registry_cache=False,)
                 return None
 
         except Exception as e:
-            raise
-
+            raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e
+
 
-    def register_table(self, name, database_name
-                       primary_keys: Union[str, List[str]]):
+    def register_table(self, name, database_name):
         """Register a table as a feature table
         Args:
             name: Table name (format: <table>)
             database_name: Feature database name
-
-            engine_type: Engine type
-            timestamp_key: Timestamp key
-            primary_keys: Primary keys
+
         Raises:
             ValueError: Raised when the table does not exist or parameters are invalid
             RuntimeError: Raised when the modification fails
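A hedged sketch of a 0.2.0 register_table call site (names are placeholders; in 0.1.49 the extra key/engine parameters shown as removed above were part of the signature):

    # 0.2.0: keys are taken from the table's TBLPROPERTIES instead of arguments
    client.register_table("user_features", "feature_db")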
@@ -453,49 +297,23 @@
         common_utils.validate_table_name(name)
         common_utils.validate_database(database_name)
 
-        if primary_keys:
-            assert self._check_sequence_element_type(primary_keys, str), "primary_keys must be a list of strings"
-            assert isinstance(timestamp_key, str), "timestamp key must be string"
-
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
 
         try:
             # Check whether the table exists
-            if not self.
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"table '{name}' not exists")
             tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}")
             props = {row['key']: row['value'] for row in tbl_pro.collect()}
-
-            # Check whether the primary key and timestamp key are empty
-            if engine_type == engine_type.ICEBERG_ENGINE and props.get("format-version", "") == "2":
-                if not primary_keys:
-                    if props.get('dlc.ao.data.govern.sorted.keys', "") == "":
-                        raise ValueError(
-                            "table dlc.ao.data.govern.sorted.keys is empty. you must set dlc.ao.data.govern.sorted.keys")
-                    else:
-                        primary_keys = props.get('dlc.ao.data.govern.sorted.keys').split(",")
-            elif engine_type == engine_type.HIVE_ENGINE:
-                if not primary_keys:
-                    raise ValueError("primary_keys cannot be None for HIVE_ENGINE")
-
-            if props.get("wedata.feature_table", "") == "true":
-                raise ValueError("table is already a feature table")
-
-            self._validate_key_conflicts(primary_keys, timestamp_key)
-            # Check primary keys for duplicates
-            dup_list = common_utils.get_duplicates(primary_keys)
-            if dup_list:
-                raise ValueError(f"primary_keys contains duplicates: {dup_list}")
-
             s = props.get(FEATURE_TABLE_PROJECT, "")
             if not s:  # if s is an empty string
                 projectIds = []
             else:
                 projectIds = json.loads(s)
             current_project_id = os.getenv("WEDATA_PROJECT_ID")
-            # Check whether it is already included
-            if current_project_id not in projectIds
+            # Check whether it is already included
+            if current_project_id not in projectIds:
                 register_table_project_ids = props.get(FEATURE_TABLE_PROJECT)
             else:
                 projectIds.append(current_project_id)
@@ -503,62 +321,31 @@
             tbl_properties = {
                 FEATURE_TABLE_KEY: FEATURE_TABLE_VALUE,
                 FEATURE_TABLE_PROJECT: register_table_project_ids,
-                FEATURE_TABLE_TIMESTAMP: timestamp_key,
-                FEATURE_TABLE_BACKUP_PRIMARY_KEY: ",".join(primary_keys),
-            }
-
-            env_tags = {
-                "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
-                "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
-                "user_uin": os.getenv("KERNEL_LOGIN_UIN", "")  # wedata user UIN
             }
-            for key, val in env_tags.items():
-                if not props.get(f"feature_{key}", ""):
-                    tbl_properties[f"feature_{key}"] = val
 
             # Build the property-setting statement
             props_str = ", ".join(
                 f"'{k}'='{self._escape_sql_value(v)}'"
-                for k, v in tbl_properties
+                for k, v in tbl_properties
             )
 
-
             alter_sql = f"ALTER TABLE {table_name} SET TBLPROPERTIES ({props_str})"
 
-            try:
-                self._sync_table_info(table_name=name, action_name="create",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=True)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
             # Execute the modification
-            print("alter table sql", alter_sql)
             self._spark.sql(alter_sql)
-            print("
-
-                timestamp_key=timestamp_key)
-            print(f"Successfully register table '{table_name}'")
-
-            try:
-                self._sync_table_info(table_name=name, action_name="create",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=False)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"sync table info failed. you need to sync table info manually. {str(e)}")
+            print(f"Successfully register table '{name}'")
+
         except ValueError as e:
             raise  # re-raise known ValueError directly
         except Exception as e:
-            raise RuntimeError(f"Failed to modify properties for table '{
+            raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e
+
 
     def read_table(
-
-
-
-
-            online_config: Optional[RedisStoreConfig] = None,
-            entity_row: Optional[List[Dict[str, Any]]] = None
-    ) -> DataFrame:
+            self,
+            name: str,
+            database_name: Optional[str] = None,
+    ) -> DataFrame:
 
         """
         Read data from a feature table
@@ -566,9 +353,6 @@
         Args:
             name: Feature table name (format: <table>)
             database_name: Feature database name
-            is_online: Whether to read the online table
-            online_config: Online table configuration
-            entity_row: Entity rows (used to filter online data; only effective when reading online)
         Returns:
             DataFrame containing the table data
 
@@ -581,23 +365,20 @@
 
         common_utils.validate_database(database_name)
 
+
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
 
         try:
             # Check whether the table exists
-            if not self.
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"Table '{name}' does not exist")
 
-            if is_online:
-                return self._read_online_table(
-                    table_name=name, database_name=database_name,
-                    online_config=online_config, entity_row=entity_row)
             # Read the table data
             return self._spark.read.table(table_name)
 
         except Exception as e:
-            raise
+            raise ValueError(f"Failed to read table '{name}': {str(e)}") from e
 
     def drop_table(self, name: str, database_name: Optional[str] = None) -> None:
 
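A hedged sketch of reading in 0.2.0 (placeholder names); the online path (is_online, online_config, entity_row) shown as removed above no longer exists, so read_table always goes through the Spark catalog:

    offline_df = client.read_table("user_features", "feature_db")
    offline_df.show(5)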
@@ -607,6 +388,7 @@
         Args:
             name: Feature table name (format: <table>)
             database_name: Feature database name
+
         Raises:
             ValueError: Raised when the table does not exist
             RuntimeError: Raised when the drop operation fails
@@ -621,118 +403,22 @@
 
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
+
         try:
             # Check whether the table exists
-            if not self.
+            if not self._spark.catalog.tableExists(table_name):
                 print(f"Table '{name}' does not exist")
                 return
 
-            try:
-                feature_view = self._feast_client.get_feature_view(table_name)
-            except Exception as e:
-                print(f"Table '{name}' is not a feature table, skip delete. {str(e)}")
-            else:
-                if feature_view.online:
-                    raise ValueError(f"Table '{name}' has a online table, please call drop_online_table first")
-            try:
-                self._sync_table_info(table_name=name, action_name="delete",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name="", engine_name=env_utils.get_engine_name(), is_try=True)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"Table '{name}' is can't delete. {str(e)}")
-
             # Execute the drop
             self._spark.sql(f"DROP TABLE {table_name}")
             print(f"Table '{name}' dropped")
-
-            self._feast_client.remove_offline_table(table_name=table_name)
-        except Exception as e:
-            raise
-            # raise ValueError(f"Failed to delete table '{name}' in feast: {str(e)}")
-        else:
-            print(f"Table '{name}' removed from feast")
-
-        try:
-            self._sync_table_info(table_name=name, action_name="delete",
-                                  database_name=env_utils.get_database_name(database_name),
-                                  data_source_name="", engine_name=env_utils.get_engine_name(), is_try=False)
-        except tencentcloud.common.exception.TencentCloudSDKException as e:
-            print(f"Failed to delete table information on the web interface. You need to delete it manually. Error: {str(e)}")
+
         except ValueError as e:
             raise  # re-raise known ValueError directly
         except Exception as e:
             raise RuntimeError(f"Failed to delete table '{name}': {str(e)}") from e
 
-    def _sync_table_info(self, table_name: str, action_name: str, database_name: str,
-                         data_source_name: str, engine_name: str, is_try: bool):
-        return _refresh_table(project_id=self.project, secret_id=self.cloud_secret_id, secret_key=self.cloud_secret_key,
-                              region=self.region, table_name=table_name,
-                              action=action_name, database_name=database_name, data_source_name=data_source_name,
-                              engine_name=engine_name, is_try=is_try, data_source_type=env_utils.get_engine_type())
-
-    def _read_online_table(self,
-                           table_name: str, database_name: str, online_config: RedisStoreConfig,
-                           entity_row: List[Dict[str, Any]] = None):
-        full_table_name = common_utils.build_full_table_name(table_name, database_name)
-        primary_keys, timestamp_key = self._get_table_primary_keys_and_timestamp_key(full_table_name)
-        entity_row_dict = {}
-        if isinstance(entity_row, list):
-            for row in entity_row:
-                if not isinstance(row, dict):
-                    raise ValueError("Entity_row row must be a dictionary")
-                for key in row.keys():
-                    if key not in primary_keys:
-                        raise ValueError(f"Entity_row row key '{key}' is not a primary key")
-                    entity_row_dict[key] = key
-        elif isinstance(entity_row, dict):
-            for key in entity_row.keys():
-                if key not in primary_keys:
-                    raise ValueError(f"Entity_row row key '{key}' is not a primary key")
-            entity_row_dict = entity_row
-        else:
-            raise ValueError(f"Entity_row must be a list of dictionaries or a single dictionary. {type(entity_row)}")
-
-        tmp_schema = self._spark.table(tableName=full_table_name).schema
-        columns_name_list = []
-        tmp_schema_list = []
-        for field in tmp_schema.fields:
-            if field.name in primary_keys or field.name == timestamp_key:
-                if entity_row_dict.get(field.name):
-                    tmp_schema_list.append(field)
-                continue
-            columns_name_list.append(field.name)
-            tmp_schema_list.append(field)
-
-        schema_name_list = [field.name for field in tmp_schema_list]
-        schema = StructType(tmp_schema_list)
-        for field in schema:
-            print(f"{field.name} => {field.dataType}")
-
-        feast_client = FeastClient(offline_store=self._spark, online_store_config=online_config)
-        # Build entity data filtering for the offline table
-        if not entity_row:
-            tbl_props = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}")
-            props = {row['key']: row['value'] for row in tbl_props.collect()}
-            primary_key = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY)
-            query_result = self._spark.sql(f"SELECT {primary_key} FROM {table_name} LIMIT 1")
-            result_row = query_result.first()
-            if result_row:
-                online_view = feast_client.get_online_table_view(
-                    full_table_name=full_table_name,
-                    columns_name=columns_name_list,
-                    entity_rows=[result_row.asDict()])
-                print("=====>read online dataframe:\n", online_view[schema_name_list])
-                return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)
-            else:
-                return self._spark.createDataFrame([])
-        else:
-            online_view = feast_client.get_online_table_view(
-                full_table_name=full_table_name,
-                columns_name=columns_name_list,
-                entity_rows=entity_row)
-            print("=====>read online dataframe:\n", online_view[schema_name_list])
-            return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)
-
     def get_table(
             self,
             name: str,
@@ -740,8 +426,7 @@
             database_name: Optional[str] = None,
     ) -> FeatureTable:
 
-        """
-        Get feature table metadata
+        """Get feature table metadata
 
         Parameters:
             name: Feature table name
@@ -760,13 +445,11 @@
 
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
-
-            raise ValueError(f"Table '{name}' does not exist")
+
         try:
             return spark_client.get_feature_table(table_name)
         except Exception as e:
-            raise
-            # raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e
+            raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e
 
     def alter_table_tag(
             self,
@@ -774,8 +457,7 @@
             properties: Dict[str, str],
             database_name: Optional[str] = None,
     ):
-        """
-        Modify the table's TBLPROPERTIES (update if present, add if absent)
+        """Modify the table's TBLPROPERTIES (update if present, add if absent)
 
         Args:
             name: Table name (format: <table>)
@@ -806,7 +488,7 @@
 
         try:
             # Check whether the table exists
-            if not self.
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"table '{name}' not exists")
 
             # Build the property-setting statement
@@ -819,7 +501,6 @@
 
             # Execute the modification
             self._spark.sql(alter_sql)
-            self._feast_client.modify_tags(table_name=table_name, tags=properties)
             print(f"Successfully updated properties for table '{name}': {list(properties.keys())}")
 
         except ValueError as e:
@@ -827,159 +508,3 @@
         except Exception as e:
             raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e
 
-    def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
-                      database_name: Optional[str] = None,
-                      is_cycle: bool = False, cycle_obj: TaskSchedulerConfiguration = None,
-                      is_use_default_online: bool = True, online_config: RedisStoreConfig = None):
-        """
-        Publish an offline feature table as an online feature table
-        Args:
-            table_name: Offline feature table name
-            data_source_name: Data source name
-            database_name: Database name
-            is_cycle: Whether to publish periodically
-            cycle_obj: Periodic task configuration
-            is_use_default_online: Whether to use the default online store configuration
-            online_config: Online store configuration (only effective when is_use_default_online is False)
-        """
-        # Build the full table name
-        full_table_name = common_utils.build_full_table_name(table_name, database_name)
-
-        # Check whether the table exists
-        if not self._check_table_exists(full_table_name):
-            raise ValueError(f"Table '{full_table_name}' does not exist")
-
-        # Check whether it has already been published by looking for values in Redis
-        try:
-            # Get the offline table's column names
-            online_data = self._read_online_table(
-                table_name=table_name,
-                database_name=database_name,
-                online_config=online_config)
-        except Exception as e:
-            print(f"Failed to get online table view for table '{full_table_name}': {str(e)}")
-        else:
-            if online_data:
-                raise ValueError(f"Table '{full_table_name}' has already been published")
-
-        # Configure the periodic parameters
-        if is_cycle:
-            if not isinstance(cycle_obj, TaskSchedulerConfiguration):
-                raise ValueError("cycle_obj must be a TaskSchedulerConfiguration object when is_cycle is True")
-
-            cycle_obj.CycleType = "CRONTAB_CYCLE"
-        else:
-            if isinstance(cycle_obj, TaskSchedulerConfiguration):
-                cycle_obj.CycleType = "ONEOFF_CYCLE"
-            else:
-                cycle_obj = TaskSchedulerConfiguration()
-                cycle_obj.CycleType = "ONEOFF_CYCLE"
-                # Default: delay the current time by 1 minute
-                cycle_obj.CrontabExpression = (datetime.datetime.now() + datetime.timedelta(minutes=3)).strftime(
-                    "%M %H %d %m %w ? %y")
-
-        if is_use_default_online:
-            online_feature_config = OnlineFeatureConfiguration()
-            online_feature_config.UserDefault = True
-        else:
-            if not isinstance(online_config, RedisStoreConfig):
-                raise ValueError("online_config must be a RedisStoreConfig object when is_use_default_online is False")
-
-            online_feature_config = OnlineFeatureConfiguration()
-            online_feature_config.UserDefault = False
-            online_feature_config.Host = online_config.host
-            online_feature_config.Port = online_config.port
-            online_feature_config.DB = online_config.db
-
-        offline_feature_config = OfflineFeatureConfiguration()
-        offline_feature_config.DatabaseName = env_utils.get_database_name(database_name)
-        offline_feature_config.TableName = table_name
-
-        offline_feature_config.PrimaryKeys, offline_feature_config.TimestampColumn = self._get_table_primary_keys_and_timestamp_key(
-            full_table_name)
-
-        offline_feature_config.DatasourceName = data_source_name
-        offline_feature_config.DatasourceType = env_utils.get_engine_type()
-        offline_feature_config.EngineName = env_utils.get_engine_name()
-
-        api_requests = CreateOnlineFeatureTableRequest()
-        api_requests.OfflineFeatureConfiguration = offline_feature_config
-        api_requests.OnlineFeatureConfiguration = online_feature_config
-        api_requests.TaskSchedulerConfiguration = cycle_obj
-        api_requests.ProjectId = env_utils.get_project_id()
-        region = env_utils.get_region()
-        if not os.environ.get("RESOURCE_GROUP_ID", ""):
-            res_group_item = _get_default_resource_group(
-                api_requests.ProjectId, cloud_secret_id, cloud_secret_key, region)
-            api_requests.ResourceGroupId = res_group_item.ExecutorGroupId
-        else:
-            api_requests.ResourceGroupId = os.environ.get("RESOURCE_GROUP_ID")
-        client = FeatureCloudSDK(secret_id=cloud_secret_id, secret_key=cloud_secret_key, region=region)
-        resp = client.CreateOnlineFeatureTable(api_requests)
-        if cycle_obj.CycleType == "ONEOFF_CYCLE":
-            print(f"publish online task create success. it will be execute after 3 min. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId} ")
-        else:
-            print(f"publish online task create success. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId} ")
-
-    def drop_online_table(self, table_name: str, online_config: RedisStoreConfig, database_name: Optional[str] = None):
-        # Build the full table name
-        full_table_name = common_utils.build_full_table_name(table_name, database_name)
-        feast_client = FeastClient(self._spark, online_config)
-        try:
-            self._sync_table_info(table_name=table_name, database_name=database_name, action_name="delete_online",
-                                  data_source_name="", engine_name=env_utils.get_engine_name(), is_try=True)
-        except Exception as e:
-            raise RuntimeError(f"drop online table failed. table_name: {full_table_name}. {str(e)}")
-
-        feast_client.remove_online_table(full_table_name)
-        try:
-            self._sync_table_info(table_name=table_name, database_name=database_name, action_name="delete_online",
-                                  data_source_name="", engine_name=env_utils.get_engine_name(), is_try=False)
-        except Exception as e:
-            raise RuntimeError(f"drop online table failed. table_name: {full_table_name}. {str(e)}")
-        print(f"drop online table success. table_name: {full_table_name}")
-
-    def _get_table_primary_keys_and_timestamp_key(self, full_table_name: str) -> 'str, str':
-
-        tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {full_table_name}")
-        props = {row['key']: row['value'] for row in tbl_pro.collect()}
-
-        if props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, ""):
-            primary_keys = props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, "")
-        else:
-            primary_keys = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY, "")
-        primary_keys = primary_keys.split(",")
-        timestamp_key = props.get(FEATURE_TABLE_TIMESTAMP, "")
-        return primary_keys, timestamp_key
-
-    def _check_table_exists(self, full_table_name: str) -> bool:
-        return common_utils.check_spark_table_exists(self._spark, full_table_name)
-
-
-def _get_default_resource_group(project_id: str, secret_id: str, secret_key: str, region: str):
-    client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
-    request = DescribeNormalSchedulerExecutorGroupsRequest()
-    request.ProjectId = project_id
-    resp = client.DescribeNormalSchedulerExecutorGroups(request)
-    # By default, take the first healthy and available resource group for execution
-    for item in resp.Data:
-        if item.Available:
-            return item
-    raise ValueError("No available resource group found")
-
-
-def _refresh_table(project_id: str, secret_id: str, secret_key: str, region: str, table_name: str,
-                   action: str, database_name: str, data_source_name: str, data_source_type: str,
-                   engine_name: str, is_try: bool):
-    client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
-    request = RefreshFeatureTableRequest()
-    request.ProjectId = project_id
-    request.TableName = table_name
-    request.DatabaseName = database_name
-    request.DatasourceName = data_source_name
-    request.DatasourceType = data_source_type
-    request.EngineName = engine_name
-    request.ActionName = action
-    request.IsTry = is_try
-    resp = client.RefreshFeatureTable(request)
-    return resp