tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
wedata/feature_store/feature_table_client/feature_table_client.py
@@ -0,0 +1,969 @@
"""
Utility methods for feature table operations.
"""
import json
from typing import Union, List, Dict, Optional, Sequence, Any, Tuple

import tencentcloud.common.exception
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.streaming import StreamingQuery
from pyspark.sql.types import StructType
import os
import datetime
from wedata.feature_store.constants.constants import (
    APPEND, DEFAULT_WRITE_STREAM_TRIGGER, FEATURE_TABLE_KEY,
    FEATURE_TABLE_VALUE, FEATURE_TABLE_PROJECT, FEATURE_TABLE_TIMESTAMP,
    FEATURE_TABLE_BACKUP_PRIMARY_KEY, FEATURE_DLC_TABLE_PRIMARY_KEY)
from wedata.feature_store.constants.engine_types import EngineTypes
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
from wedata.feature_store.entities.feature_table import FeatureTable
from wedata.feature_store.spark_client.spark_client import SparkClient
from wedata.feature_store.utils import common_utils, env_utils
from wedata.feature_store.feast_client.feast_client import FeastClient
from wedata.feature_store.cloud_sdk_client.models import (
    TaskSchedulerConfiguration, OnlineFeatureConfiguration, OfflineFeatureConfiguration,
    CreateOnlineFeatureTableRequest, DescribeNormalSchedulerExecutorGroupsRequest, RefreshFeatureTableRequest)
from wedata.feature_store.cloud_sdk_client.client import FeatureCloudSDK


class FeatureTableClient:
    """Client for feature table operations."""

    def __init__(
            self,
            spark: SparkSession,
            cloud_secret_id: Optional[str] = None,
            cloud_secret_key: Optional[str] = None,
    ):
        self._spark = spark
        self._feast_client = FeastClient(spark)
        if cloud_secret_id and cloud_secret_key:
            self.__cloud_secret_id = cloud_secret_id
            self.__cloud_secret_key = cloud_secret_key
        else:
            self.__cloud_secret_id, self.__cloud_secret_key = env_utils.get_cloud_secret()
        self.__project = env_utils.get_project_id()
        self.__region = env_utils.get_region()

    @property
    def cloud_secret_id(self) -> str:
        if not self.__cloud_secret_id:
            raise ValueError("cloud_secret_id is empty. Please set it first.")
        return self.__cloud_secret_id

    @cloud_secret_id.setter
    def cloud_secret_id(self, cloud_secret_id: str):
        if not cloud_secret_id:
            raise ValueError("cloud_secret_id cannot be None")
        self.__cloud_secret_id = cloud_secret_id

    @property
    def cloud_secret_key(self) -> str:
        if not self.__cloud_secret_key:
            raise ValueError("cloud_secret_key is empty. Please set it first.")
        return self.__cloud_secret_key

    @cloud_secret_key.setter
    def cloud_secret_key(self, cloud_secret_key: str):
        if not cloud_secret_key:
            raise ValueError("cloud_secret_key cannot be None")
        self.__cloud_secret_key = cloud_secret_key

    @property
    def project(self) -> str:
        return self.__project

    @property
    def region(self) -> str:
        return self.__region

    @staticmethod
    def _normalize_params(
            param: Optional[Union[str, Sequence[str]]],
            default_type: type = list
    ) -> list:
        """Normalize a parameter that may be a single value or a sequence into a list."""
        if param is None:
            return default_type()
        # A bare string is a single value, not a sequence of characters
        if isinstance(param, str):
            return [param]
        return list(param) if isinstance(param, Sequence) else [param]

    @staticmethod
    def _validate_schema(df: DataFrame, schema: StructType):
        """Validate the DataFrame and schema and check that they are consistent."""
        # At least one of df and schema must be provided
        if df is None and schema is None:
            raise ValueError("Either DataFrame or schema must be provided")

        # Check that the schemas match
        if df is not None and schema is not None:
            df_schema = df.schema
            if df_schema != schema:
                diff_fields = set(df_schema.fieldNames()).symmetric_difference(set(schema.fieldNames()))
                raise ValueError(
                    f"DataFrame schema does not match. Differences: "
                    f"{diff_fields if diff_fields else 'field type mismatch'}"
                )

    @staticmethod
    def _validate_key_conflicts(primary_keys: List[str], timestamp_key: str):
        """Validate that the timestamp key does not conflict with the primary keys."""
        if timestamp_key in primary_keys:
            raise ValueError(f"Timestamp key conflicts with primary keys: {timestamp_key}")

    @staticmethod
    def _validate_key_exists(primary_keys: List[str], timestamp_key: str):
        """Validate that both primary keys and a timestamp key are provided."""
        if not primary_keys:
            raise ValueError("Primary keys cannot be empty")
        if not timestamp_key:
            raise ValueError("Timestamp key cannot be empty")

    @staticmethod
    def _escape_sql_value(value: str) -> str:
        """Escape special characters in SQL string values."""
        return value.replace("'", "''")

    def create_table(
            self,
            name: str,
            primary_keys: Union[str, List[str]],
            timestamp_key: str,
            engine_type: EngineTypes,
            data_source_name: str,
            database_name: Optional[str] = None,
            df: Optional[DataFrame] = None,
            *,
            partition_columns: Union[str, List[str], None] = None,
            schema: Optional[StructType] = None,
            description: Optional[str] = None,
            tags: Optional[Dict[str, str]] = None
    ) -> FeatureTable:

        """
        Create a feature table (supports batch and streaming writes).

        Args:
            name: Feature table name (format: <table>)
            primary_keys: Primary key column(s); composite keys are supported
            timestamp_key: Timestamp key (for temporal features)
            engine_type: Engine type (since version 1.33)
            data_source_name: Data source name
            database_name: Database name (optional)
            df: Initial data (optional, used to infer the schema)
            partition_columns: Partition columns (to optimize storage and queries)
            schema: Table schema (optional; required when df is not provided)
            description: Business description
            tags: Business tags

        Returns:
            A FeatureTable instance

        Raises:
            ValueError: If the schema does not match the data
        """

        # Normalize parameters
        primary_keys = self._normalize_params(primary_keys)
        partition_columns = self._normalize_params(partition_columns)

        # Validate metadata
        self._validate_schema(df, schema)
        self._validate_key_exists(primary_keys, timestamp_key)
        self._validate_key_conflicts(primary_keys, timestamp_key)

        # Validate the table and database names
        common_utils.validate_table_name(name)

        common_utils.validate_database(database_name)

        # Check for duplicate primary keys
        dup_list = common_utils.get_duplicates(primary_keys)
        if dup_list:
            raise ValueError(f"Primary keys have duplicates: {dup_list}")

        # Build the fully qualified table name
        table_name = common_utils.build_full_table_name(name, database_name)

        # Check whether the table already exists
        try:
            table_exists = self._check_table_exists(table_name)
        except Exception as e:
            raise ValueError(f"Error checking table existence: {str(e)}") from e
        if table_exists:
            raise ValueError(
                f"Table '{name}' already exists\n"
                "Solutions:\n"
                "1. Use a different table name\n"
                "2. Drop the existing table: spark.sql(f'DROP TABLE {name}')\n"
            )

        try:
            self._sync_table_info(table_name=name, action_name="CREATE",
                                  database_name=env_utils.get_database_name(database_name),
                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
                                  is_try=True)
        except tencentcloud.common.exception.TencentCloudSDKException as e:
            raise RuntimeError(f"Table '{name}' cannot be created. {str(e)}")

        # Infer the table schema
        table_schema = schema or df.schema

        # Extra tags taken from environment variables
        env_tags = {
            "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
            "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
            "user_uin": os.getenv("KERNEL_LOGIN_UIN", "")  # wedata user UIN
        }
        project_id = os.getenv("WEDATA_PROJECT_ID", "")
        # Build table properties (applied via TBLPROPERTIES)
        tbl_properties = {
            "wedata.feature_table": "true",
            FEATURE_TABLE_BACKUP_PRIMARY_KEY: ",".join(primary_keys),
            "wedata.feature_project_id": f"{json.dumps([project_id])}",
            FEATURE_TABLE_TIMESTAMP: timestamp_key,
            "comment": description or "",
            **{f"{k}": v for k, v in (tags or {}).items()},
            **{f"feature_{k}": v for k, v in (env_tags or {}).items()}
        }
        if engine_type == EngineTypes.ICEBERG_ENGINE:
            # The same Iceberg properties apply whether or not the table is partitioned
            tbl_properties.update({
                'format-version': '2',
                'write.upsert.enabled': 'true',
                'write.update.mode': 'merge-on-read',
                'write.merge.mode': 'merge-on-read',
                'write.parquet.bloom-filter-enabled.column.id': 'true',
                'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
                'write.distribution-mode': 'hash',
                'write.metadata.delete-after-commit.enabled': 'true',
                'write.metadata.previous-versions-max': '100',
                'write.metadata.metrics.default': 'full',
                'smart-optimizer.inherit': 'default',
            })

        # Build column definitions
        columns_ddl = []
        for field in table_schema.fields:
            data_type = field.dataType.simpleString().upper()
            col_def = f"`{field.name}` {data_type}"
            if not field.nullable:
                col_def += " NOT NULL"
            # Add a column comment if present in the field metadata
            if field.metadata and "comment" in field.metadata:
                comment = self._escape_sql_value(field.metadata["comment"])
                col_def += f" COMMENT '{comment}'"
            columns_ddl.append(col_def)

        # Build the partition expression
        partition_expr = (
            f"PARTITIONED BY ({', '.join([f'`{c}`' for c in partition_columns])})"
            if partition_columns else ""
        )
        # For local debugging, switch iceberg to PARQUET
        # Core CREATE TABLE statement
        if engine_type == EngineTypes.ICEBERG_ENGINE:
            ddl = f"""
                CREATE TABLE {table_name} (
                    {', '.join(columns_ddl)}
                )
                USING iceberg
                {partition_expr}
                TBLPROPERTIES (
                    {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
                )
            """
        elif engine_type == EngineTypes.HIVE_ENGINE:
            ddl = f"""
                CREATE TABLE {table_name} (
                    {', '.join(columns_ddl)}
                )
                {partition_expr}
                -- STORED AS PARQUET
                TBLPROPERTIES (
                    {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
                )
            """
        else:
            raise ValueError(f"Engine type {engine_type} is not supported")

        # Print the SQL
        print(f"create table ddl: {ddl}\n")

        # Execute the DDL
        try:
            self._spark.sql(ddl)
            if df is not None:
                df.write.insertInto(table_name)
        except Exception as e:
            raise ValueError(f"Failed to create table: {str(e)}") from e

        print("sync table info to feast")

        self._feast_client.create_table(
            table_name=table_name,
            primary_keys=primary_keys,
            timestamp_key=timestamp_key,
            df=df,
            schema=table_schema,
            tags=tags,
            description=description
        )

        print(f"create table {name} done")

        try:
            self._sync_table_info(table_name=name, action_name="CREATE",
                                  database_name=env_utils.get_database_name(database_name),
                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
                                  is_try=False)
        except tencentcloud.common.exception.TencentCloudSDKException as e:
            raise RuntimeError(f"Table '{name}' cannot be created. {str(e)}")

        # Build and return the FeatureTable object
        return FeatureTable(
            name=name,
            table_id=table_name,
            description=description or "",
            primary_keys=primary_keys,
            partition_columns=partition_columns or [],
            features=[field.name for field in table_schema.fields],
            timestamp_keys=timestamp_key or [],
            tags=dict(**(tags or {}), **env_tags)
        )

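    # Illustrative usage sketch (not part of the released module; all names below are
    # hypothetical and assume an existing SparkSession and a pre-built DataFrame):
    #
    #   client = FeatureTableClient(spark)
    #   client.create_table(
    #       name="user_features",
    #       primary_keys=["user_id"],
    #       timestamp_key="event_time",
    #       engine_type=EngineTypes.ICEBERG_ENGINE,
    #       data_source_name="my_dlc_datasource",
    #       database_name="feature_db",
    #       df=features_df,                      # DataFrame used to infer the schema
    #       partition_columns=["dt"],
    #       description="Daily user behaviour features",
    #   )
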
    def write_table(
            self,
            name: str,
            df: DataFrame,
            database_name: Optional[str] = None,
            mode: Optional[str] = APPEND,
            checkpoint_location: Optional[str] = None,
            trigger: Optional[Dict[str, Any]] = DEFAULT_WRITE_STREAM_TRIGGER
    ) -> Optional[StreamingQuery]:

        """
        Write data to a feature table (supports batch and streaming writes).

        Args:
            name: Feature table name (format: <table>)
            df: Data to write (DataFrame)
            database_name: Database name
            mode: Write mode (append/overwrite)
            checkpoint_location: Checkpoint location (streaming writes only)
            trigger: Trigger settings for streaming writes (streaming writes only)

        Returns:
            A StreamingQuery object for streaming writes, otherwise None

        Raises:
            ValueError: If the arguments are invalid
        """

        # Validate the write mode
        valid_modes = ["append", "overwrite"]
        if mode not in valid_modes:
            raise ValueError(f"Invalid write mode '{mode}', valid options: {valid_modes}")

        # Validate the table and database names
        common_utils.validate_table_name(name)

        common_utils.validate_database(database_name)

        # Build the fully qualified table name
        table_name = common_utils.build_full_table_name(name, database_name)

        # Check that the table exists
        if not self._check_table_exists(table_name):
            raise ValueError(f"Table '{name}' does not exist")

        # Determine whether this is a streaming DataFrame
        is_streaming = df.isStreaming

        try:
            if is_streaming:
                # Streaming write
                if not checkpoint_location:
                    raise ValueError("Streaming write requires checkpoint_location parameter")

                writer = df.writeStream \
                    .format("parquet") \
                    .outputMode(mode) \
                    .option("checkpointLocation", checkpoint_location)
                # .foreachBatch(process_batch)

                if trigger:
                    writer = writer.trigger(**trigger)

                return writer.toTable(table_name)
            else:
                # Batch write
                df.write \
                    .mode(mode) \
                    .insertInto(table_name)
                # self._feast_client.client.write_to_offline_store(feature_view_name=table_name, df=df.toPandas(), allow_registry_cache=False,)
                return None

        except Exception as e:
            raise
            # raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e

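    # Illustrative usage sketches (not part of the released module; names are hypothetical).
    # A batch append:
    #
    #   client.write_table("user_features", features_df, database_name="feature_db")
    #
    # A streaming write, which returns a StreamingQuery that should be awaited or stopped:
    #
    #   query = client.write_table(
    #       "user_features", streaming_df,
    #       database_name="feature_db",
    #       checkpoint_location="cosn://bucket/checkpoints/user_features",  # placeholder path
    #   )
    #   query.awaitTermination()
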
    def register_table(self, name, database_name, timestamp_key: str, engine_type: EngineTypes, data_source_name: str,
                       primary_keys: Union[str, List[str]]):
        """Register an existing table as a feature table.

        Args:
            name: Table name (format: <table>)
            database_name: Feature database name
            timestamp_key: Timestamp key
            engine_type: Engine type
            data_source_name: Data source name
            primary_keys: Primary key(s)

        Raises:
            ValueError: If the table does not exist or the arguments are invalid
            RuntimeError: If the registration fails

        Example:
            client.register_table(
                "user_features", "user_database",
                timestamp_key="event_time",
                engine_type=EngineTypes.ICEBERG_ENGINE,
                data_source_name="my_datasource",
                primary_keys=["user_id"],
            )
        """

        # Validate the table and database names
        common_utils.validate_table_name(name)
        common_utils.validate_database(database_name)

        # Normalize primary keys (a single string becomes a one-element list)
        primary_keys = self._normalize_params(primary_keys)

        # Build the fully qualified table name
        table_name = common_utils.build_full_table_name(name, database_name)

        try:
            # Check that the table exists
            if not self._check_table_exists(table_name):
                raise ValueError(f"Table '{name}' does not exist")
            tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}")
            props = {row['key']: row['value'] for row in tbl_pro.collect()}

            # Check that primary key and timestamp key information is present
            if engine_type == EngineTypes.ICEBERG_ENGINE and props.get("format-version", "") == "2":
                if props.get('dlc.ao.data.govern.sorted.keys', "") == "":
                    raise ValueError(
                        "table dlc.ao.data.govern.sorted.keys is empty. you must set dlc.ao.data.govern.sorted.keys")
                else:
                    primary_keys = props.get('dlc.ao.data.govern.sorted.keys').split(",")
            elif engine_type == EngineTypes.HIVE_ENGINE:
                if not primary_keys:
                    raise ValueError("primary_keys cannot be None for HIVE_ENGINE")

            if props.get("wedata.feature_table", "") == "true":
                raise ValueError("table is already a feature table")

            self._validate_key_conflicts(primary_keys, timestamp_key)
            # Check for duplicate primary keys
            dup_list = common_utils.get_duplicates(primary_keys)
            if dup_list:
                raise ValueError(f"primary_keys contains duplicates: {dup_list}")

            s = props.get(FEATURE_TABLE_PROJECT, "")
            if not s:  # s is an empty string
                project_ids = []
            else:
                project_ids = json.loads(s)
            current_project_id = os.getenv("WEDATA_PROJECT_ID")
            # If the stored list is non-empty and does not already contain the current project,
            # keep the stored value as-is; otherwise append the current project
            if current_project_id not in project_ids and len(project_ids):
                register_table_project_ids = props.get(FEATURE_TABLE_PROJECT)
            else:
                project_ids.append(current_project_id)
                register_table_project_ids = json.dumps(project_ids)
            tbl_properties = {
                FEATURE_TABLE_KEY: FEATURE_TABLE_VALUE,
                FEATURE_TABLE_PROJECT: register_table_project_ids,
                FEATURE_TABLE_TIMESTAMP: timestamp_key,
                FEATURE_TABLE_BACKUP_PRIMARY_KEY: ",".join(primary_keys),
            }

            env_tags = {
                "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
                "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
                "user_uin": os.getenv("KERNEL_LOGIN_UIN", "")  # wedata user UIN
            }
            for key, val in env_tags.items():
                if not props.get(f"feature_{key}", ""):
                    tbl_properties[f"feature_{key}"] = val

            # Build the property assignment clause
            props_str = ", ".join(
                f"'{k}'='{self._escape_sql_value(v)}'"
                for k, v in tbl_properties.items()
            )

            alter_sql = f"ALTER TABLE {table_name} SET TBLPROPERTIES ({props_str})"

            try:
                self._sync_table_info(table_name=name, action_name="CREATE",
                                      database_name=env_utils.get_database_name(database_name),
                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=True)
            except tencentcloud.common.exception.TencentCloudSDKException as e:
                raise RuntimeError(f"Table '{name}' cannot be registered. {str(e)}")

            # Execute the ALTER statement
            print("alter table sql", alter_sql)
            self._spark.sql(alter_sql)
            print("Execute sql done, start sync table info to feast")
            self._feast_client.alter_table(full_table_name=table_name, primary_keys=primary_keys,
                                           timestamp_key=timestamp_key)
            print(f"Successfully registered table '{table_name}'")

            try:
                self._sync_table_info(table_name=name, action_name="CREATE",
                                      database_name=env_utils.get_database_name(database_name),
                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=False)
            except tencentcloud.common.exception.TencentCloudSDKException as e:
                raise RuntimeError(f"sync table info failed. you need to sync table info manually. {str(e)}")
        except ValueError:
            raise  # Re-raise known ValueErrors as-is
        except Exception as e:
            raise RuntimeError(f"Failed to modify properties for table '{table_name}': {str(e)}") from e

    def read_table(
            self,
            name: str,
            database_name: Optional[str] = None,
            is_online: bool = False,
            online_config: Optional[RedisStoreConfig] = None,
            entity_row: Optional[List[Dict[str, Any]]] = None
    ) -> DataFrame:

        """
        Read data from a feature table.

        Args:
            name: Feature table name (format: <table>)
            database_name: Feature database name
            is_online: Whether to read from the online table
            online_config: Online store configuration
            entity_row: Entity rows (used to filter online data; only effective when is_online is True)

        Returns:
            A DataFrame containing the table data

        Raises:
            ValueError: If the table does not exist or the read fails
        """

        # Validate the table and database names
        common_utils.validate_table_name(name)

        common_utils.validate_database(database_name)

        # Build the fully qualified table name
        table_name = common_utils.build_full_table_name(name, database_name)

        try:
            # Check that the table exists
            if not self._check_table_exists(table_name):
                raise ValueError(f"Table '{name}' does not exist")

            if is_online:
                return self._read_online_table(
                    table_name=name, database_name=database_name,
                    online_config=online_config, entity_row=entity_row)
            # Read the offline table
            return self._spark.read.table(table_name)

        except Exception:
            raise

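    # Illustrative usage sketches (not part of the released module; names are hypothetical).
    # Reading the offline table:
    #
    #   offline_df = client.read_table("user_features", database_name="feature_db")
    #
    # Reading the online table, filtering by primary key values (requires a RedisStoreConfig):
    #
    #   online_df = client.read_table(
    #       "user_features", database_name="feature_db",
    #       is_online=True,
    #       online_config=redis_config,
    #       entity_row=[{"user_id": 42}],
    #   )
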
    def drop_table(self, name: str, database_name: Optional[str] = None) -> None:

        """
        Drop a feature table (prints a message and returns if the table does not exist).

        Args:
            name: Feature table name (format: <table>)
            database_name: Feature database name
        Raises:
            ValueError: If the table still has an online table
            RuntimeError: If the drop fails

        Example:
            drop_table("user_features")
        """

        # Validate the table name
        common_utils.validate_table_name(name)

        # Build the fully qualified table name
        table_name = common_utils.build_full_table_name(name, database_name)
        try:
            # Check that the table exists
            if not self._check_table_exists(table_name):
                print(f"Table '{name}' does not exist")
                return

            # Sanity check: resolve the table
            df = self._spark.table(tableName=table_name)

            try:
                feature_view = self._feast_client.get_feature_view(table_name)
            except Exception as e:
                print(f"Table '{name}' is not a feature table, skip delete. {str(e)}")
            else:
                if feature_view.online:
                    raise ValueError(f"Table '{name}' has an online table, please call drop_online_table first")
            try:
                self._sync_table_info(table_name=name, action_name="DELETE",
                                      database_name=env_utils.get_database_name(database_name),
                                      data_source_name="", engine_name=env_utils.get_engine_name(), is_try=True)
            except tencentcloud.common.exception.TencentCloudSDKException as e:
                raise RuntimeError(f"Table '{name}' cannot be deleted. {str(e)}")

            # Execute the drop
            self._spark.sql(f"DROP TABLE {table_name}")
            print(f"Table '{name}' dropped")
            try:
                self._feast_client.remove_offline_table(table_name=table_name)
            except Exception as e:
                raise
                # raise ValueError(f"Failed to delete table '{name}' in feast: {str(e)}")
            else:
                print(f"Table '{name}' removed from feast")

            try:
                self._sync_table_info(table_name=name, action_name="DELETE",
                                      database_name=env_utils.get_database_name(database_name),
                                      data_source_name="", engine_name=env_utils.get_engine_name(), is_try=False)
            except tencentcloud.common.exception.TencentCloudSDKException as e:
                print(f"Failed to delete table information on the web interface. You need to delete it manually. Error: {str(e)}")
        except ValueError:
            raise  # Re-raise known ValueErrors as-is
        except Exception as e:
            raise RuntimeError(f"Failed to delete table '{name}': {str(e)}") from e

    def _sync_table_info(self, table_name: str, action_name: str, database_name: str,
                         data_source_name: str, engine_name: str, is_try: bool):
        return _refresh_table(project_id=self.project, secret_id=self.cloud_secret_id, secret_key=self.cloud_secret_key,
                              region=self.region, table_name=table_name,
                              action=action_name, database_name=database_name, data_source_name=data_source_name,
                              engine_name=engine_name, is_try=is_try, data_source_type=env_utils.get_engine_type())

    def _read_online_table(self,
                           table_name: str, database_name: str, online_config: RedisStoreConfig,
                           entity_row: List[Dict[str, Any]] = None):
        full_table_name = common_utils.build_full_table_name(table_name, database_name)
        primary_keys, timestamp_key = self._get_table_primary_keys_and_timestamp_key(full_table_name)
        entity_row_dict = {}
        if entity_row is None:
            # No filter supplied; a sample row is taken from the offline table below
            pass
        elif isinstance(entity_row, list):
            for row in entity_row:
                if not isinstance(row, dict):
                    raise ValueError("entity_row items must be dictionaries")
                for key in row.keys():
                    if key not in primary_keys:
                        raise ValueError(f"entity_row key '{key}' is not a primary key")
                    entity_row_dict[key] = key
        elif isinstance(entity_row, dict):
            for key in entity_row.keys():
                if key not in primary_keys:
                    raise ValueError(f"entity_row key '{key}' is not a primary key")
            entity_row_dict = entity_row
        else:
            raise ValueError(f"entity_row must be a list of dictionaries or a single dictionary. {type(entity_row)}")

        tmp_schema = self._spark.table(tableName=full_table_name).schema
        columns_name_list = []
        tmp_schema_list = []
        for field in tmp_schema.fields:
            if field.name in primary_keys or field.name == timestamp_key:
                if entity_row_dict.get(field.name):
                    tmp_schema_list.append(field)
                continue
            columns_name_list.append(field.name)
            tmp_schema_list.append(field)

        schema_name_list = [field.name for field in tmp_schema_list]
        schema = StructType(tmp_schema_list)
        for field in schema:
            print(f"{field.name} => {field.dataType}")

        feast_client = FeastClient(offline_store=self._spark, online_store_config=online_config)
        # Build the entity filter from the offline table
        if not entity_row:
            tbl_props = self._spark.sql(f"SHOW TBLPROPERTIES {full_table_name}")
            props = {row['key']: row['value'] for row in tbl_props.collect()}
            primary_key = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY)
            query_result = self._spark.sql(f"SELECT {primary_key} FROM {full_table_name} LIMIT 1")
            result_row = query_result.first()
            if result_row:
                online_view = feast_client.get_online_table_view(
                    full_table_name=full_table_name,
                    columns_name=columns_name_list,
                    entity_rows=[result_row.asDict()])
                # print(online_view[schema_name_list])
                return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)
            else:
                # No rows in the offline table; return an empty DataFrame with the same schema
                return self._spark.createDataFrame([], schema=schema)
        else:
            online_view = feast_client.get_online_table_view(
                full_table_name=full_table_name,
                columns_name=columns_name_list,
                entity_rows=entity_row)
            # print(online_view[columns_name_list])
            return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)

    def get_table(
            self,
            name: str,
            spark_client: SparkClient,
            database_name: Optional[str] = None,
    ) -> FeatureTable:

        """
        Get feature table metadata.

        Args:
            name: Feature table name
            spark_client: Spark client
            database_name: Feature database name

        Returns:
            A FeatureTable object

        Raises:
            ValueError: If the table does not exist or the lookup fails
        """

        # Validate the table and database names
        common_utils.validate_table_name(name)
        common_utils.validate_database(database_name)

        # Build the fully qualified table name
        table_name = common_utils.build_full_table_name(name, database_name)
        if not self._check_table_exists(full_table_name=table_name):
            raise ValueError(f"Table '{name}' does not exist")
        try:
            return spark_client.get_feature_table(table_name)
        except Exception as e:
            raise
            # raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e

    def alter_table_tag(
            self,
            name: str,
            properties: Dict[str, str],
            database_name: Optional[str] = None,
    ):
        """
        Modify the table's TBLPROPERTIES (existing keys are updated, new keys are added).

        Args:
            name: Table name (format: <table>)
            properties: Properties to update or add
            database_name: Feature database name

        Raises:
            ValueError: If the table does not exist or the arguments are invalid
            RuntimeError: If the modification fails

        Example:
            client.alter_table_tag("user_features", {
                "comment": "Updated description",
                "owner": "data_team"
            })
        """
        # Validate the arguments
        if not properties:
            raise ValueError("properties must be a non-empty dictionary")

        # Validate the table and database names
        common_utils.validate_table_name(name)
        common_utils.validate_database(database_name)

        # Build the fully qualified table name
        table_name = common_utils.build_full_table_name(name, database_name)

        try:
            # Check that the table exists
            if not self._check_table_exists(table_name):
                raise ValueError(f"Table '{name}' does not exist")

            # Build the property assignment clause
            props_str = ", ".join(
                f"'{k}'='{self._escape_sql_value(v)}'"
                for k, v in properties.items()
            )

            alter_sql = f"ALTER TABLE {table_name} SET TBLPROPERTIES ({props_str})"

            # Execute the ALTER statement
            self._spark.sql(alter_sql)
            self._feast_client.modify_tags(table_name=table_name, tags=properties)
            print(f"Successfully updated properties for table '{name}': {list(properties.keys())}")

        except ValueError:
            raise  # Re-raise known ValueErrors as-is
        except Exception as e:
            raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e

    def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
                      database_name: Optional[str] = None,
                      is_cycle: bool = False, cycle_obj: TaskSchedulerConfiguration = None,
                      is_use_default_online: bool = True, online_config: RedisStoreConfig = None):
        """
        Publish an offline feature table as an online feature table.

        Args:
            table_name: Offline feature table name
            data_source_name: Data source name
            cloud_secret_id: Cloud API secret id used to call the scheduler service
            cloud_secret_key: Cloud API secret key used to call the scheduler service
            database_name: Database name
            is_cycle: Whether to publish on a recurring schedule
            cycle_obj: Scheduler configuration for recurring publishing
            is_use_default_online: Whether to use the default online store configuration
            online_config: Online store configuration (only effective when is_use_default_online is False)
        """
        # Build the fully qualified table name
        full_table_name = common_utils.build_full_table_name(table_name, database_name)

        # Check that the table exists
        if not self._check_table_exists(full_table_name):
            raise ValueError(f"Table '{full_table_name}' does not exist")

        # Check whether the table has already been published by looking for values in Redis
        try:
            # Try to read the online view of the table
            online_data = self._read_online_table(
                table_name=table_name,
                database_name=database_name,
                online_config=online_config)
        except Exception as e:
            print(f"Failed to get online table view for table '{full_table_name}': {str(e)}")
        else:
            # A non-empty online view means the table was already published
            if online_data is not None and online_data.head(1):
                raise ValueError(f"Table '{full_table_name}' has already been published")

        # Configure the scheduling parameters
        if is_cycle:
            if not isinstance(cycle_obj, TaskSchedulerConfiguration):
                raise ValueError("cycle_obj must be a TaskSchedulerConfiguration object when is_cycle is True")

            cycle_obj.CycleType = "CRONTAB_CYCLE"
        else:
            if isinstance(cycle_obj, TaskSchedulerConfiguration):
                cycle_obj.CycleType = "ONEOFF_CYCLE"
            else:
                cycle_obj = TaskSchedulerConfiguration()
                cycle_obj.CycleType = "ONEOFF_CYCLE"
                # Default to roughly 3 minutes from now
                cycle_obj.CrontabExpression = (datetime.datetime.now() + datetime.timedelta(minutes=3)).strftime(
                    "%M %H %d %m %w ? %y")

        if is_use_default_online:
            online_feature_config = OnlineFeatureConfiguration()
            online_feature_config.UserDefault = True
        else:
            if not isinstance(online_config, RedisStoreConfig):
                raise ValueError("online_config must be a RedisStoreConfig object when is_use_default_online is False")

            online_feature_config = OnlineFeatureConfiguration()
            online_feature_config.UserDefault = False
            online_feature_config.Host = online_config.host
            online_feature_config.Port = online_config.port
            online_feature_config.DB = online_config.db

        offline_feature_config = OfflineFeatureConfiguration()
        offline_feature_config.DatabaseName = database_name if database_name else os.environ.get(
            "WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
        offline_feature_config.TableName = table_name

        offline_feature_config.PrimaryKeys, offline_feature_config.TimestampColumn = self._get_table_primary_keys_and_timestamp_key(
            full_table_name)

        offline_feature_config.DatasourceName = data_source_name
        offline_feature_config.DatasourceType = "DLC" if os.environ.get("DLC_REGION") else "EMR"
        offline_feature_config.EngineName = os.environ.get("KERNEL_ENGINE")

        api_requests = CreateOnlineFeatureTableRequest()
        api_requests.OfflineFeatureConfiguration = offline_feature_config
        api_requests.OnlineFeatureConfiguration = online_feature_config
        api_requests.TaskSchedulerConfiguration = cycle_obj
        api_requests.ProjectId = os.environ.get("WEDATA_PROJECT_ID")
        if offline_feature_config.DatasourceType == "DLC":
            region = os.environ.get("DLC_REGION")
        else:
            region = os.environ.get("EMR_REGION")

        if not os.environ.get("RESOURCE_GROUP_ID", ""):
            res_group_item = _get_default_resource_group(
                api_requests.ProjectId, cloud_secret_id, cloud_secret_key, region)
            api_requests.ResourceGroupId = res_group_item.ExecutorGroupId
        else:
            api_requests.ResourceGroupId = os.environ.get("RESOURCE_GROUP_ID")
        client = FeatureCloudSDK(secret_id=cloud_secret_id, secret_key=cloud_secret_key, region=region)
        resp = client.CreateOnlineFeatureTable(api_requests)
        if cycle_obj.CycleType == "ONEOFF_CYCLE":
            print(f"Publish task created successfully; it will execute in about 3 minutes. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId}")
        else:
            print(f"Publish task created successfully. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId}")

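    # Illustrative usage sketch (not part of the released module; names, credentials and the
    # cron expression below are hypothetical). A one-off publish followed by a recurring one:
    #
    #   client.publish_table("user_features", "my_dlc_datasource",
    #                        cloud_secret_id, cloud_secret_key,
    #                        database_name="feature_db")
    #
    #   cycle = TaskSchedulerConfiguration()
    #   cycle.CrontabExpression = "0 2 * * * ? *"   # assumed 7-field cron (min hour day month weekday ? year): daily at 02:00
    #   client.publish_table("user_features", "my_dlc_datasource",
    #                        cloud_secret_id, cloud_secret_key,
    #                        database_name="feature_db",
    #                        is_cycle=True, cycle_obj=cycle)
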
    def drop_online_table(self, table_name: str, online_config: RedisStoreConfig, database_name: Optional[str] = None):
        # Build the fully qualified table name
        full_table_name = common_utils.build_full_table_name(table_name, database_name)
        feast_client = FeastClient(self._spark, online_config)
        feast_client.remove_online_table(full_table_name)
        print(f"Dropped online table: {full_table_name}")

    def _get_table_primary_keys_and_timestamp_key(self, full_table_name: str) -> Tuple[List[str], str]:

        tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {full_table_name}")
        props = {row['key']: row['value'] for row in tbl_pro.collect()}

        if props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, ""):
            primary_keys = props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, "")
        else:
            primary_keys = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY, "")
        primary_keys = primary_keys.split(",")
        timestamp_key = props.get(FEATURE_TABLE_TIMESTAMP, "")
        return primary_keys, timestamp_key

    def _check_table_exists(self, full_table_name: str) -> bool:
        return common_utils.check_spark_table_exists(self._spark, full_table_name)


def _get_default_resource_group(project_id: str, secret_id: str, secret_key: str, region: str):
    client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
    request = DescribeNormalSchedulerExecutorGroupsRequest()
    request.ProjectId = project_id
    resp = client.DescribeNormalSchedulerExecutorGroups(request)
    # By default, pick the first healthy, available resource group
    for item in resp.Data:
        if item.Available:
            return item
    raise ValueError("No available resource group found")


def _refresh_table(project_id: str, secret_id: str, secret_key: str, region: str, table_name: str,
                   action: str, database_name: str, data_source_name: str, data_source_type: str,
                   engine_name: str, is_try: bool):
    # NOTE: the early return below short-circuits this function, so the RefreshFeatureTable
    # call that follows is currently never executed.
    return
    client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
    request = RefreshFeatureTableRequest()
    request.ProjectId = project_id
    request.TableName = table_name
    request.DatabaseName = database_name
    request.DatasourceName = data_source_name
    request.DatasourceType = data_source_type
    request.EngineName = engine_name
    request.Action = action
    request.IsTry = is_try
    resp = client.RefreshFeatureTable(request)
    return resp
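
A typical end-to-end flow with this client, sketched with hypothetical table, database, and data source names (this is an illustrative sketch, not documentation from the package; it assumes a live SparkSession and WeData project credentials in the environment):

    from pyspark.sql import SparkSession
    from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient
    from wedata.feature_store.constants.engine_types import EngineTypes

    spark = SparkSession.builder.getOrCreate()
    client = FeatureTableClient(spark)

    features_df = spark.table("analytics.user_activity")   # placeholder source data

    client.create_table(
        name="user_features",
        primary_keys=["user_id"],
        timestamp_key="event_time",
        engine_type=EngineTypes.ICEBERG_ENGINE,
        data_source_name="my_dlc_datasource",
        database_name="feature_db",
        df=features_df,
    )
    client.write_table("user_features", features_df, database_name="feature_db")   # append fresh rows later
    training_df = client.read_table("user_features", database_name="feature_db")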