tencent-wedata-feature-engineering-dev 0.1.50__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (38)
  1. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
  2. tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
  4. wedata/feature_store/client.py +28 -92
  5. wedata/feature_store/constants/constants.py +2 -5
  6. wedata/feature_store/entities/feature_lookup.py +0 -17
  7. wedata/feature_store/entities/feature_spec.py +2 -2
  8. wedata/feature_store/entities/feature_table.py +1 -5
  9. wedata/feature_store/entities/function_info.py +4 -1
  10. wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
  11. wedata/feature_store/spark_client/spark_client.py +15 -41
  12. wedata/feature_store/training_set_client/training_set_client.py +10 -9
  13. wedata/feature_store/utils/common_utils.py +4 -48
  14. wedata/feature_store/utils/feature_lookup_utils.py +43 -37
  15. wedata/feature_store/utils/feature_spec_utils.py +1 -1
  16. wedata/feature_store/utils/uc_utils.py +1 -1
  17. tencent_wedata_feature_engineering_dev-0.1.50.dist-info/RECORD +0 -66
  18. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  19. wedata/feature_store/cloud_sdk_client/client.py +0 -108
  20. wedata/feature_store/cloud_sdk_client/models.py +0 -686
  21. wedata/feature_store/cloud_sdk_client/utils.py +0 -39
  22. wedata/feature_store/common/log/__init__.py +0 -0
  23. wedata/feature_store/common/log/logger.py +0 -40
  24. wedata/feature_store/common/store_config/__init__.py +0 -0
  25. wedata/feature_store/common/store_config/redis.py +0 -48
  26. wedata/feature_store/constants/engine_types.py +0 -34
  27. wedata/feature_store/feast_client/__init__.py +0 -0
  28. wedata/feature_store/feast_client/feast_client.py +0 -487
  29. wedata/feature_store/utils/env_utils.py +0 -108
  30. wedata/tempo/__init__.py +0 -0
  31. wedata/tempo/interpol.py +0 -448
  32. wedata/tempo/intervals.py +0 -1331
  33. wedata/tempo/io.py +0 -61
  34. wedata/tempo/ml.py +0 -129
  35. wedata/tempo/resample.py +0 -318
  36. wedata/tempo/tsdf.py +0 -1720
  37. wedata/tempo/utils.py +0 -254
  38. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
@@ -3,79 +3,26 @@
 """
 import json
 from typing import Union, List, Dict, Optional, Sequence, Any
-
-import tencentcloud.common.exception
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.streaming import StreamingQuery
 from pyspark.sql.types import StructType
 import os
-import datetime
-from wedata.feature_store.constants.constants import (
-    APPEND, DEFAULT_WRITE_STREAM_TRIGGER, FEATURE_TABLE_KEY,
-    FEATURE_TABLE_VALUE, FEATURE_TABLE_PROJECT, FEATURE_TABLE_TIMESTAMP,
-    FEATURE_TABLE_BACKUP_PRIMARY_KEY, FEATURE_DLC_TABLE_PRIMARY_KEY)
-from wedata.feature_store.constants.engine_types import EngineTypes
-from wedata.feature_store.common.store_config.redis import RedisStoreConfig
+
+from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER, FEATURE_TABLE_KEY, \
+    FEATURE_TABLE_VALUE, FEATURE_TABLE_PROJECT
 from wedata.feature_store.entities.feature_table import FeatureTable
 from wedata.feature_store.spark_client.spark_client import SparkClient
-from wedata.feature_store.utils import common_utils, env_utils
-from wedata.feature_store.feast_client.feast_client import FeastClient
-from wedata.feature_store.cloud_sdk_client.models import (
-    TaskSchedulerConfiguration, OnlineFeatureConfiguration, OfflineFeatureConfiguration,
-    CreateOnlineFeatureTableRequest, DescribeNormalSchedulerExecutorGroupsRequest, RefreshFeatureTableRequest)
-from wedata.feature_store.cloud_sdk_client.client import FeatureCloudSDK
+from wedata.feature_store.utils import common_utils


 class FeatureTableClient:
     """Feature table operations class"""

     def __init__(
-            self,
-            spark: SparkSession,
-            cloud_secret_id: str = None,
-            cloud_secret_key: str = None,
+            self,
+            spark: SparkSession
     ):
         self._spark = spark
-        self._feast_client = FeastClient(spark)
-        if cloud_secret_id and cloud_secret_key:
-            self.__cloud_secret_id = cloud_secret_id
-            self.__cloud_secret_key = cloud_secret_key
-        else:
-            self.__cloud_secret_id, self.__cloud_secret_key = env_utils.get_cloud_secret()
-        self.__project = env_utils.get_project_id()
-        self.__region = env_utils.get_region()
-
-    @property
-    def cloud_secret_id(self) -> str:
-        if not self.__cloud_secret_id:
-            raise ValueError("cloud_secret_id is empty. please set it first.")
-        return self.__cloud_secret_id
-
-    @cloud_secret_id.setter
-    def cloud_secret_id(self, cloud_secret_id: str):
-        if not cloud_secret_id:
-            raise ValueError("cloud_secret_id cannot be None")
-        self.__cloud_secret_id = cloud_secret_id
-
-    @property
-    def cloud_secret_key(self) -> str:
-        if not self.__cloud_secret_key:
-            raise ValueError("cloud_secret_key is empty. please set it first.")
-        return self.__cloud_secret_key
-
-    @cloud_secret_key.setter
-    def cloud_secret_key(self, cloud_secret_key: str):
-        if not cloud_secret_key:
-            raise ValueError("cloud_secret_key cannot be None")
-        self.__cloud_secret_key = cloud_secret_key
-
-    @property
-    def project(self) -> str:
-        return self.__project
-
-    @property
-    def region(self) -> str:
-        return self.__region

     @staticmethod
     def _normalize_params(
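The 0.2.0 constructor drops the cloud-credential plumbing entirely, so constructing the client reduces to passing a SparkSession. A minimal sketch (the import path follows the file list above; the SparkSession setup is the only assumption):

    from pyspark.sql import SparkSession
    from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient

    spark = SparkSession.builder.getOrCreate()
    client = FeatureTableClient(spark=spark)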
@@ -105,39 +52,26 @@ class FeatureTableClient:
         )

     @staticmethod
-    def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: str):
+    def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: List[str]):
         """Validate that primary keys and timestamp keys do not conflict"""
-        if timestamp_keys in primary_keys:
-            raise ValueError(f"Timestamp keys conflict with primary keys: {timestamp_keys}")
-
-    @staticmethod
-    def _validate_key_exists(primary_keys: List[str], timestamp_keys: str):
-        """Validate that primary keys and timestamp keys are present"""
-        if not primary_keys:
-            raise ValueError("Primary keys cannot be empty")
-        if not timestamp_keys:
-            raise ValueError("Timestamp keys cannot be empty")
+        conflict_keys = set(timestamp_keys) & set(primary_keys)
+        if conflict_keys:
+            raise ValueError(f"Timestamp keys conflict with primary keys: {conflict_keys}")

     @staticmethod
     def _escape_sql_value(value: str) -> str:
         """Escape special characters in SQL values"""
         return value.replace("'", "''")

-    @staticmethod
-    def _check_sequence_element_type(sequence: Sequence[Any], element_type: type) -> bool:
-        """Check whether every element in a sequence is of the given type"""
-        return all(isinstance(element, element_type) for element in sequence)
-
     def create_table(
             self,
             name: str,
             primary_keys: Union[str, List[str]],
-            timestamp_key: str,
-            engine_type: EngineTypes,
-            data_source_name: str,
             database_name: Optional[str] = None,
+            location: Optional[str] = None,
             df: Optional[DataFrame] = None,
             *,
+            timestamp_keys: Union[str, List[str], None] = None,
             partition_columns: Union[str, List[str], None] = None,
             schema: Optional[StructType] = None,
             description: Optional[str] = None,
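The list-based rewrite of _validate_key_conflicts also catches overlaps when several timestamp keys are supplied. A standalone sketch of the same set-intersection logic, with hypothetical key names:

    primary_keys = ["user_id", "event_date"]
    timestamp_keys = ["event_date"]

    conflict_keys = set(timestamp_keys) & set(primary_keys)
    if conflict_keys:
        raise ValueError(f"Timestamp keys conflict with primary keys: {conflict_keys}")
    # ValueError: Timestamp keys conflict with primary keys: {'event_date'}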
@@ -151,14 +85,14 @@
             name: full feature table name (format: <table>)
             primary_keys: primary key column names (composite keys supported)
             database_name: Optional[str] = None,
-            data_source_name: data source name,
+            location: Optional[str] = None,
             df: initial data (optional, used to infer the schema)
-            timestamp_key: timestamp key (for temporal features)
-            engine_type: engine type  version:: 1.33
+            timestamp_keys: timestamp keys (for temporal features)
             partition_columns: partition columns (to optimize storage and queries)
             schema: table schema definition (optional; required when df is not provided)
             description: business description
             tags: business tags
+
         Returns:
             FeatureTable instance

@@ -168,33 +102,24 @@

         # Normalize parameters
         primary_keys = self._normalize_params(primary_keys)
+        timestamp_keys = self._normalize_params(timestamp_keys)
         partition_columns = self._normalize_params(partition_columns)

-        assert self._check_sequence_element_type(primary_keys, str), "primary_keys must be a list of strings"
-        assert self._check_sequence_element_type(partition_columns, str), "partition_columns must be a list of strings"
-        assert isinstance(timestamp_key, str), "timestamp key must be string"
-
         # Validate metadata
         self._validate_schema(df, schema)
-        self._validate_key_exists(primary_keys, timestamp_key)
-        self._validate_key_conflicts(primary_keys, timestamp_key)
+        self._validate_key_conflicts(primary_keys, timestamp_keys)

         # Validate the table name
         common_utils.validate_table_name(name)

         common_utils.validate_database(database_name)

-        # Check primary keys for duplicates
-        dup_list = common_utils.get_duplicates(primary_keys)
-        if dup_list:
-            raise ValueError(f"Primary keys have duplicates: {dup_list}")
-
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

         # Check whether the table exists
         try:
-            if self._check_table_exists(table_name):
+            if self._spark.catalog.tableExists(table_name):
                 raise ValueError(
                     f"Table '{name}' already exists\n"
                     "Solutions:\n"
@@ -204,20 +129,12 @@
         except Exception as e:
             raise ValueError(f"Error checking table existence: {str(e)}") from e

-        try:
-            self._sync_table_info(table_name=name, action_name="create",
-                                  database_name=env_utils.get_database_name(database_name),
-                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
-                                  is_try=True)
-        except tencentcloud.common.exception.TencentCloudSDKException as e:
-            raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
         # Infer the table schema
         table_schema = schema or df.schema

         # Build timestamp key properties

-        # Fetch extra tags from environment variables
+        #Fetch extra tags from environment variables
         env_tags = {
             "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
             "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
@@ -227,42 +144,13 @@
         # Build table properties (via TBLPROPERTIES)
         tbl_properties = {
             "wedata.feature_table": "true",
-            FEATURE_TABLE_BACKUP_PRIMARY_KEY: ",".join(primary_keys),
+            "primaryKeys": ",".join(primary_keys),
             "wedata.feature_project_id": f"{json.dumps([projectId])}",
-            FEATURE_TABLE_TIMESTAMP: timestamp_key,
+            "timestampKeys": ",".join(timestamp_keys) if timestamp_keys else "",
             "comment": description or "",
             **{f"{k}": v for k, v in (tags or {}).items()},
             **{f"feature_{k}": v for k, v in (env_tags or {}).items()}
         }
-        if engine_type == EngineTypes.ICEBERG_ENGINE:
-            if partition_columns:
-                tbl_properties.update({
-                    'format-version': '2',
-                    'write.upsert.enabled': 'true',
-                    'write.update.mode': 'merge-on-read',
-                    'write.merge.mode': 'merge-on-read',
-                    'write.parquet.bloom-filter-enabled.column.id': 'true',
-                    'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
-                    'write.distribution-mode': 'hash',
-                    'write.metadata.delete-after-commit.enabled': 'true',
-                    'write.metadata.previous-versions-max': '100',
-                    'write.metadata.metrics.default': 'full',
-                    'smart-optimizer.inherit': 'default',
-                })
-            else:
-                tbl_properties.update({
-                    'format-version': '2',
-                    'write.upsert.enabled': 'true',
-                    'write.update.mode': 'merge-on-read',
-                    'write.merge.mode': 'merge-on-read',
-                    'write.parquet.bloom-filter-enabled.column.id': 'true',
-                    'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
-                    'write.distribution-mode': 'hash',
-                    'write.metadata.delete-after-commit.enabled': 'true',
-                    'write.metadata.previous-versions-max': '100',
-                    'write.metadata.metrics.default': 'full',
-                    'smart-optimizer.inherit': 'default',
-                })

         # Build column definitions
         columns_ddl = []
@@ -284,8 +172,7 @@
         )
         # Local debugging: iceberg -> PARQUET
         # Core CREATE TABLE statement
-        if engine_type == EngineTypes.ICEBERG_ENGINE:
-            ddl = f"""
+        ddl = f"""
             CREATE TABLE {table_name} (
                 {', '.join(columns_ddl)}
             )
@@ -294,20 +181,7 @@
             TBLPROPERTIES (
                 {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
             )
-            """
-        elif engine_type == EngineTypes.HIVE_ENGINE:
-            ddl = f"""
-            CREATE TABLE {table_name} (
-                {', '.join(columns_ddl)}
-            )
-            {partition_expr}
-            -- STORED AS PARQUET
-            TBLPROPERTIES (
-                {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
-            )
-            """
-        else:
-            raise ValueError(f"Engine type {engine_type} is not supported")
+            """

         # Print the SQL
         print(f"create table ddl: {ddl}\n")
@@ -320,28 +194,8 @@
         except Exception as e:
             raise ValueError(f"Failed to create table: {str(e)}") from e

-        print("async table info to feast")
-
-        self._feast_client.create_table(
-            table_name=table_name,
-            primary_keys=primary_keys,
-            timestamp_key=timestamp_key,
-            df=df,
-            schema=table_schema,
-            tags=tags,
-            description=description
-        )
-
         print(f"create table {name} done")

-        try:
-            self._sync_table_info(table_name=name, action_name="create",
-                                  database_name=env_utils.get_database_name(database_name),
-                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
-                                  is_try=False)
-        except tencentcloud.common.exception.TencentCloudSDKException as e:
-            raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
         # Build and return the FeatureTable object
         return FeatureTable(
             name=name,
@@ -350,7 +204,7 @@
             primary_keys=primary_keys,
             partition_columns=partition_columns or [],
             features=[field.name for field in table_schema.fields],
-            timestamp_keys=timestamp_key or [],
+            timestamp_keys=timestamp_keys or [],
             tags=dict(**tags or {}, **env_tags)
         )

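Taken together, create_table in 0.2.0 loses engine_type and data_source_name and makes timestamp_keys keyword-only and optional. A hedged sketch of a call under the new signature (table, database, and column names are hypothetical):

    from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType

    schema = StructType([
        StructField("user_id", StringType(), False),
        StructField("event_time", TimestampType(), False),
        StructField("clicks_7d", DoubleType(), True),
    ])

    table = client.create_table(
        name="user_click_features",    # hypothetical table name
        primary_keys="user_id",
        database_name="feature_db",    # hypothetical database
        schema=schema,                 # or pass df= to infer the schema instead
        timestamp_keys="event_time",   # keyword-only in 0.2.0
        partition_columns="event_time",
        description="7-day click aggregates",
    )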
@@ -395,10 +249,6 @@
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

-        # Check whether the table exists
-        if not self._check_table_exists(table_name):
-            raise ValueError(f"table '{name}' not exists")
-
         # Determine whether this is a streaming DataFrame
         is_streaming = df.isStreaming

@@ -411,8 +261,7 @@
             writer = df.writeStream \
                 .format("parquet") \
                 .outputMode(mode) \
-                .option("checkpointLocation", checkpoint_location) \
-                # .foreachBatch(process_batch)
+                .option("checkpointLocation", checkpoint_location)

             if trigger:
                 writer = writer.trigger(**trigger)
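The surrounding write method's def line falls outside the hunks shown, so its exact name is an assumption here. Assuming it is the client's write path with the mode, checkpoint_location, and trigger parameters used above, a streaming call might look like:

    # Hypothetical call; the trigger dict is unpacked into writer.trigger(**trigger).
    query = client.write_table(
        name="user_click_features",
        df=streaming_df,  # a streaming DataFrame
        mode="append",
        checkpoint_location="/tmp/ckpt/user_click_features",  # hypothetical path
        trigger={"processingTime": "5 minutes"},
    )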
@@ -423,23 +272,18 @@
             df.write \
                 .mode(mode) \
                 .insertInto(table_name)
-            # self._feast_client.client.write_to_offline_store(feature_view_name=table_name, df=df.toPandas(), allow_registry_cache=False,)
             return None

         except Exception as e:
-            raise
-            # raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e
+            raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e
+

-    def register_table(self, name, database_name, timestamp_key: str, engine_type: EngineTypes, data_source_name: str,
-                       primary_keys: Union[str, List[str]]):
+    def register_table(self, name, database_name):
         """Register an existing table as a feature table
         Args:
             name: table name (format: <table>)
             database_name: feature database name
-            data_source_name: data source name
-            engine_type: engine type
-            timestamp_key: timestamp key
-            primary_keys: primary keys
+
         Raises:
             ValueError: raised when the table does not exist or parameters are invalid
             RuntimeError: raised when the modification fails
@@ -453,49 +297,23 @@
         common_utils.validate_table_name(name)
         common_utils.validate_database(database_name)

-        if primary_keys:
-            assert self._check_sequence_element_type(primary_keys, str), "primary_keys must be a list of strings"
-        assert isinstance(timestamp_key, str), "timestamp key must be string"
-
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"table '{name}' not exists")
             tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}")
             props = {row['key']: row['value'] for row in tbl_pro.collect()}
-
-            # Check that the primary key and timestamp key are not empty
-            if engine_type == engine_type.ICEBERG_ENGINE and props.get("format-version", "") == "2":
-                if not primary_keys:
-                    if props.get('dlc.ao.data.govern.sorted.keys', "") == "":
-                        raise ValueError(
-                            "table dlc.ao.data.govern.sorted.keys is empty. you must set dlc.ao.data.govern.sorted.keys")
-                    else:
-                        primary_keys = props.get('dlc.ao.data.govern.sorted.keys').split(",")
-            elif engine_type == engine_type.HIVE_ENGINE:
-                if not primary_keys:
-                    raise ValueError("primary_keys cannot be None for HIVE_ENGINE")
-
-            if props.get("wedata.feature_table", "") == "true":
-                raise ValueError("table is already a feature table")
-
-            self._validate_key_conflicts(primary_keys, timestamp_key)
-            # Check primary keys for duplicates
-            dup_list = common_utils.get_duplicates(primary_keys)
-            if dup_list:
-                raise ValueError(f"primary_keys contains duplicates: {dup_list}")
-
             s = props.get(FEATURE_TABLE_PROJECT, "")
             if not s:  # if s is an empty string
                 projectIds = []
             else:
                 projectIds = json.loads(s)
             current_project_id = os.getenv("WEDATA_PROJECT_ID")
-            # Check membership (only when projectIds is non-empty)
-            if current_project_id not in projectIds and len(projectIds):
+            # Check membership
+            if current_project_id not in projectIds:
                 register_table_project_ids = props.get(FEATURE_TABLE_PROJECT)
             else:
                 projectIds.append(current_project_id)
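register_table now takes only the table and database names; primary keys, timestamp key, engine type, and data source are no longer passed. A minimal sketch (names hypothetical):

    client.register_table(name="user_click_features", database_name="feature_db")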
@@ -503,62 +321,31 @@
             tbl_properties = {
                 FEATURE_TABLE_KEY: FEATURE_TABLE_VALUE,
                 FEATURE_TABLE_PROJECT: register_table_project_ids,
-                FEATURE_TABLE_TIMESTAMP: timestamp_key,
-                FEATURE_TABLE_BACKUP_PRIMARY_KEY: ",".join(primary_keys),
-            }
-
-            env_tags = {
-                "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
-                "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
-                "user_uin": os.getenv("KERNEL_LOGIN_UIN", "")  # wedata user UIN
             }
-            for key, val in env_tags.items():
-                if not props.get(f"feature_{key}", ""):
-                    tbl_properties[f"feature_{key}"] = val

             # Build the property-setting clause
             props_str = ", ".join(
                 f"'{k}'='{self._escape_sql_value(v)}'"
-                for k, v in tbl_properties.items()
+                for k, v in tbl_properties
             )

-
             alter_sql = f"ALTER TABLE {table_name} SET TBLPROPERTIES ({props_str})"

-            try:
-                self._sync_table_info(table_name=name, action_name="create",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=True)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
             # Apply the change
-            print("alter table sql", alter_sql)
             self._spark.sql(alter_sql)
-            print("Execute sql done, start sync table info to feast")
-            self._feast_client.alter_table(full_table_name=table_name, primary_keys=primary_keys,
-                                           timestamp_key=timestamp_key)
-            print(f"Successfully register table '{table_name}'")
-
-            try:
-                self._sync_table_info(table_name=name, action_name="create",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=False)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"sync table info failed. you need to sync table info manually. {str(e)}")
+            print(f"Successfully register table '{name}'")
+
         except ValueError as e:
             raise  # Re-raise known ValueError directly
         except Exception as e:
-            raise RuntimeError(f"Failed to modify properties for table '{table_name}': {str(e)}") from e
+            raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e
+


     def read_table(
-            self,
-            name: str,
-            database_name: Optional[str] = None,
-            is_online: bool = False,
-            online_config: Optional[RedisStoreConfig] = None,
-            entity_row: Optional[List[Dict[str, Any]]] = None
-    ) -> DataFrame:
+            self,
+            name: str,
+            database_name: Optional[str] = None,
+    ) -> DataFrame:

         """
         Read data from a feature table
@@ -566,9 +353,6 @@
         Args:
             name: feature table name (format: <table>)
             database_name: feature database name
-            is_online: whether to read the online table
-            online_config: online store configuration
-            entity_row: entity rows (used to filter online data; only effective when is_online is True)
         Returns:
             DataFrame containing the table data

@@ -581,23 +365,20 @@

         common_utils.validate_database(database_name)

+
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"Table '{name}' does not exist")

-            if is_online:
-                return self._read_online_table(
-                    table_name=name, database_name=database_name,
-                    online_config=online_config, entity_row=entity_row)
             # Read the table data
             return self._spark.read.table(table_name)

         except Exception as e:
-            raise
+            raise ValueError(f"Failed to read table '{name}': {str(e)}") from e

     def drop_table(self, name: str, database_name: Optional[str] = None) -> None:

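With the online path gone, read_table reduces to a plain Spark table read; for example (names hypothetical):

    df = client.read_table(name="user_click_features", database_name="feature_db")
    df.show(5)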
@@ -607,6 +388,7 @@
         Args:
             name: feature table name (format: <table>)
             database_name: feature database name
+
         Raises:
             ValueError: raised when the table does not exist
             RuntimeError: raised when the deletion fails
@@ -621,118 +403,22 @@

         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
+
         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 print(f"Table '{name}' does not exist")
                 return

-            try:
-                feature_view = self._feast_client.get_feature_view(table_name)
-            except Exception as e:
-                print(f"Table '{name}' is not a feature table, skip delete. {str(e)}")
-            else:
-                if feature_view.online:
-                    raise ValueError(f"Table '{name}' has a online table, please call drop_online_table first")
-            try:
-                self._sync_table_info(table_name=name, action_name="delete",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name="", engine_name=env_utils.get_engine_name(), is_try=True)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"Table '{name}' is can't delete. {str(e)}")
-
             # Perform the deletion
             self._spark.sql(f"DROP TABLE {table_name}")
             print(f"Table '{name}' dropped")
-            try:
-                self._feast_client.remove_offline_table(table_name=table_name)
-            except Exception as e:
-                raise
-                # raise ValueError(f"Failed to delete table '{name}' in feast: {str(e)}")
-            else:
-                print(f"Table '{name}' removed from feast")
-
-            try:
-                self._sync_table_info(table_name=name, action_name="delete",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name="", engine_name=env_utils.get_engine_name(), is_try=False)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                print(f"Failed to delete table information on the web interface. You need to delete it manually. Error: {str(e)}")
+
         except ValueError as e:
             raise  # Re-raise known ValueError directly
         except Exception as e:
             raise RuntimeError(f"Failed to delete table '{name}': {str(e)}") from e

-    def _sync_table_info(self, table_name: str, action_name: str, database_name: str,
-                         data_source_name: str, engine_name: str, is_try: bool):
-        return _refresh_table(project_id=self.project, secret_id=self.cloud_secret_id, secret_key=self.cloud_secret_key,
-                              region=self.region, table_name=table_name,
-                              action=action_name, database_name=database_name, data_source_name=data_source_name,
-                              engine_name=engine_name, is_try=is_try, data_source_type=env_utils.get_engine_type())
-
-    def _read_online_table(self,
-                           table_name: str, database_name: str, online_config: RedisStoreConfig,
-                           entity_row: List[Dict[str, Any]] = None):
-        full_table_name = common_utils.build_full_table_name(table_name, database_name)
-        primary_keys, timestamp_key = self._get_table_primary_keys_and_timestamp_key(full_table_name)
-        entity_row_dict = {}
-        if isinstance(entity_row, list):
-            for row in entity_row:
-                if not isinstance(row, dict):
-                    raise ValueError("Entity_row row must be a dictionary")
-                for key in row.keys():
-                    if key not in primary_keys:
-                        raise ValueError(f"Entity_row row key '{key}' is not a primary key")
-                    entity_row_dict[key] = key
-        elif isinstance(entity_row, dict):
-            for key in entity_row.keys():
-                if key not in primary_keys:
-                    raise ValueError(f"Entity_row row key '{key}' is not a primary key")
-            entity_row_dict = entity_row
-        else:
-            raise ValueError(f"Entity_row must be a list of dictionaries or a single dictionary. {type(entity_row)}")
-
-        tmp_schema = self._spark.table(tableName=full_table_name).schema
-        columns_name_list = []
-        tmp_schema_list = []
-        for field in tmp_schema.fields:
-            if field.name in primary_keys or field.name == timestamp_key:
-                if entity_row_dict.get(field.name):
-                    tmp_schema_list.append(field)
-                continue
-            columns_name_list.append(field.name)
-            tmp_schema_list.append(field)
-
-        schema_name_list = [field.name for field in tmp_schema_list]
-        schema = StructType(tmp_schema_list)
-        for field in schema:
-            print(f"{field.name} => {field.dataType}")
-
-        feast_client = FeastClient(offline_store=self._spark, online_store_config=online_config)
-        # Build the entity-based filter for the offline table
-        if not entity_row:
-            tbl_props = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}")
-            props = {row['key']: row['value'] for row in tbl_props.collect()}
-            primary_key = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY)
-            query_result = self._spark.sql(f"SELECT {primary_key} FROM {table_name} LIMIT 1")
-            result_row = query_result.first()
-            if result_row:
-                online_view = feast_client.get_online_table_view(
-                    full_table_name=full_table_name,
-                    columns_name=columns_name_list,
-                    entity_rows=[result_row.asDict()])
-                print("=====>read online dataframe:\n", online_view[schema_name_list])
-                return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)
-            else:
-                return self._spark.createDataFrame([])
-        else:
-            online_view = feast_client.get_online_table_view(
-                full_table_name=full_table_name,
-                columns_name=columns_name_list,
-                entity_rows=entity_row)
-            print("=====>read online dataframe:\n", online_view[schema_name_list])
-            return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)
-
     def get_table(
             self,
             name: str,
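drop_table likewise shrinks to a bare DROP TABLE without the feast and cloud bookkeeping; the call itself is unchanged (names hypothetical):

    client.drop_table(name="user_click_features", database_name="feature_db")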
@@ -740,8 +426,7 @@
             database_name: Optional[str] = None,
     ) -> FeatureTable:

-        """
-        Get feature table metadata
+        """Get feature table metadata

         Parameters:
             name: feature table name
@@ -760,13 +445,11 @@

         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
-        if not self._check_table_exists(full_table_name=table_name):
-            raise ValueError(f"Table '{name}' does not exist")
+
         try:
             return spark_client.get_feature_table(table_name)
         except Exception as e:
-            raise
-            # raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e
+            raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e

     def alter_table_tag(
             self,
@@ -774,8 +457,7 @@
             properties: Dict[str, str],
             database_name: Optional[str] = None,
     ):
-        """
-        Modify the table's TBLPROPERTIES (update if present, add if absent)
+        """Modify the table's TBLPROPERTIES (update if present, add if absent)

         Args:
             name: table name (format: <table>)
@@ -806,7 +488,7 @@

         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"table '{name}' not exists")

             # Build the property-setting clause
@@ -819,7 +501,6 @@

             # Apply the change
             self._spark.sql(alter_sql)
-            self._feast_client.modify_tags(table_name=table_name, tags=properties)
             print(f"Successfully updated properties for table '{name}': {list(properties.keys())}")

         except ValueError as e:
827
508
  except Exception as e:
828
509
  raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e
829
510
 
830
- def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
831
- database_name: Optional[str] = None,
832
- is_cycle: bool = False, cycle_obj: TaskSchedulerConfiguration = None,
833
- is_use_default_online: bool = True, online_config: RedisStoreConfig = None):
834
- """
835
- 将离线特征表发布为在线特征表
836
- Args:
837
- table_name: 离线特征表名称
838
- data_source_name: 数据源名称
839
- database_name: 数据库名称
840
- is_cycle: 是否周期性发布
841
- cycle_obj: 周期性任务配置
842
- is_use_default_online: 是否使用默认的在线存储配置
843
- online_config: 在线存储配置 (仅当is_use_default_online为False时生效)
844
- """
845
- # 构建完整表名
846
- full_table_name = common_utils.build_full_table_name(table_name, database_name)
847
-
848
- # 检查表是否存在
849
- if not self._check_table_exists(full_table_name):
850
- raise ValueError(f"Table '{full_table_name}' does not exist")
851
-
852
- # 检查是否已经发布,查看Redis中是否有值
853
- try:
854
- # 获取离线表的列名
855
- online_data = self._read_online_table(
856
- table_name=table_name,
857
- database_name=database_name,
858
- online_config=online_config)
859
- except Exception as e:
860
- print(f"Failed to get online table view for table '{full_table_name}': {str(e)}")
861
- else:
862
- if online_data:
863
- raise ValueError(f"Table '{full_table_name}' has already been published")
864
-
865
- # 配置周期性参数
866
- if is_cycle:
867
- if not isinstance(cycle_obj, TaskSchedulerConfiguration):
868
- raise ValueError("cycle_obj must be a TaskSchedulerConfiguration object when is_cycle is True")
869
-
870
- cycle_obj.CycleType = "CRONTAB_CYCLE"
871
- else:
872
- if isinstance(cycle_obj, TaskSchedulerConfiguration):
873
- cycle_obj.CycleType = "ONEOFF_CYCLE"
874
- else:
875
- cycle_obj = TaskSchedulerConfiguration()
876
- cycle_obj.CycleType = "ONEOFF_CYCLE"
877
- # 设置默认当前时间延后1分钟
878
- cycle_obj.CrontabExpression = (datetime.datetime.now() + datetime.timedelta(minutes=3)).strftime(
879
- "%M %H %d %m %w ? %y")
880
-
881
- if is_use_default_online:
882
- online_feature_config = OnlineFeatureConfiguration()
883
- online_feature_config.UserDefault = True
884
- else:
885
- if not isinstance(online_config, RedisStoreConfig):
886
- raise ValueError("online_config must be a RedisStoreConfig object when is_use_default_online is False")
887
-
888
- online_feature_config = OnlineFeatureConfiguration()
889
- online_feature_config.UserDefault = False
890
- online_feature_config.Host = online_config.host
891
- online_feature_config.Port = online_config.port
892
- online_feature_config.DB = online_config.db
893
-
894
- offline_feature_config = OfflineFeatureConfiguration()
895
- offline_feature_config.DatabaseName = env_utils.get_database_name(database_name)
896
- offline_feature_config.TableName = table_name
897
-
898
- offline_feature_config.PrimaryKeys, offline_feature_config.TimestampColumn = self._get_table_primary_keys_and_timestamp_key(
899
- full_table_name)
900
-
901
- offline_feature_config.DatasourceName = data_source_name
902
- offline_feature_config.DatasourceType = env_utils.get_engine_type()
903
- offline_feature_config.EngineName = env_utils.get_engine_name()
904
-
905
- api_requests = CreateOnlineFeatureTableRequest()
906
- api_requests.OfflineFeatureConfiguration = offline_feature_config
907
- api_requests.OnlineFeatureConfiguration = online_feature_config
908
- api_requests.TaskSchedulerConfiguration = cycle_obj
909
- api_requests.ProjectId = env_utils.get_project_id()
910
- region = env_utils.get_region()
911
- if not os.environ.get("RESOURCE_GROUP_ID", ""):
912
- res_group_item = _get_default_resource_group(
913
- api_requests.ProjectId, cloud_secret_id, cloud_secret_key, region)
914
- api_requests.ResourceGroupId = res_group_item.ExecutorGroupId
915
- else:
916
- api_requests.ResourceGroupId = os.environ.get("RESOURCE_GROUP_ID")
917
- client = FeatureCloudSDK(secret_id=cloud_secret_id, secret_key=cloud_secret_key, region=region)
918
- resp = client.CreateOnlineFeatureTable(api_requests)
919
- if cycle_obj.CycleType == "ONEOFF_CYCLE":
920
- print(f"publish online task create success. it will be execute after 3 min. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId} ")
921
- else:
922
- print(f"publish online task create success. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId} ")
923
-
924
- def drop_online_table(self, table_name: str, online_config: RedisStoreConfig, database_name: Optional[str] = None):
925
- # 构建完整表名
926
- full_table_name = common_utils.build_full_table_name(table_name, database_name)
927
- feast_client = FeastClient(self._spark, online_config)
928
- try:
929
- self._sync_table_info(table_name=table_name, database_name=database_name, action_name="delete_online",
930
- data_source_name="", engine_name=env_utils.get_engine_name(), is_try=True)
931
- except Exception as e:
932
- raise RuntimeError(f"drop online table failed. table_name: {full_table_name}. {str(e)}")
933
-
934
- feast_client.remove_online_table(full_table_name)
935
- try:
936
- self._sync_table_info(table_name=table_name, database_name=database_name, action_name="delete_online",
937
- data_source_name="", engine_name=env_utils.get_engine_name(), is_try=False)
938
- except Exception as e:
939
- raise RuntimeError(f"drop online table failed. table_name: {full_table_name}. {str(e)}")
940
- print(f"drop online table success. table_name: {full_table_name}")
941
-
942
- def _get_table_primary_keys_and_timestamp_key(self, full_table_name: str) -> 'str, str':
943
-
944
- tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {full_table_name}")
945
- props = {row['key']: row['value'] for row in tbl_pro.collect()}
946
-
947
- if props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, ""):
948
- primary_keys = props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, "")
949
- else:
950
- primary_keys = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY, "")
951
- primary_keys = primary_keys.split(",")
952
- timestamp_key = props.get(FEATURE_TABLE_TIMESTAMP, "")
953
- return primary_keys, timestamp_key
954
-
955
- def _check_table_exists(self, full_table_name: str) -> bool:
956
- return common_utils.check_spark_table_exists(self._spark, full_table_name)
957
-
958
-
959
- def _get_default_resource_group(project_id: str, secret_id: str, secret_key: str, region: str):
960
- client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
961
- request = DescribeNormalSchedulerExecutorGroupsRequest()
962
- request.ProjectId = project_id
963
- resp = client.DescribeNormalSchedulerExecutorGroups(request)
964
- # 默认取第一个健康可用的资源组进行执行
965
- for item in resp.Data:
966
- if item.Available:
967
- return item
968
- raise ValueError("No available resource group found")
969
-
970
-
971
- def _refresh_table(project_id: str, secret_id: str, secret_key: str, region: str, table_name: str,
972
- action: str, database_name: str, data_source_name: str, data_source_type: str,
973
- engine_name: str, is_try: bool):
974
- client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
975
- request = RefreshFeatureTableRequest()
976
- request.ProjectId = project_id
977
- request.TableName = table_name
978
- request.DatabaseName = database_name
979
- request.DatasourceName = data_source_name
980
- request.DatasourceType = data_source_type
981
- request.EngineName = engine_name
982
- request.ActionName = action
983
- request.IsTry = is_try
984
- resp = client.RefreshFeatureTable(request)
985
- return resp
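publish_table, drop_online_table, and the cloud SDK, feast, and tempo helpers are removed in 0.2.0 with no replacement in this file, so code that still publishes online tables needs the previous release. A defensive sketch:

    client = FeatureTableClient(spark=spark)
    if not hasattr(client, "publish_table"):
        # Removed in 0.2.0; pin the previous release to keep the online-store APIs:
        #   pip install tencent-wedata-feature-engineering-dev==0.1.50
        raise RuntimeError("publish_table was removed in 0.2.0")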