tencent-wedata-feature-engineering-dev 0.1.48__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/METADATA +14 -3
  2. tencent_wedata_feature_engineering_dev-0.2.5.dist-info/RECORD +78 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/WHEEL +1 -1
  4. wedata/__init__.py +1 -1
  5. wedata/common/base_table_client/__init__.py +1 -0
  6. wedata/common/base_table_client/base.py +58 -0
  7. wedata/common/cloud_sdk_client/__init__.py +2 -0
  8. wedata/{feature_store → common}/cloud_sdk_client/client.py +33 -3
  9. wedata/{feature_store → common}/cloud_sdk_client/models.py +212 -37
  10. wedata/{feature_store → common}/cloud_sdk_client/utils.py +7 -0
  11. wedata/{feature_store → common}/constants/constants.py +3 -2
  12. wedata/common/constants/engine_types.py +34 -0
  13. wedata/{feature_store → common}/entities/column_info.py +6 -5
  14. wedata/{feature_store → common}/entities/feature_column_info.py +2 -1
  15. wedata/{feature_store → common}/entities/feature_lookup.py +1 -1
  16. wedata/{feature_store → common}/entities/feature_spec.py +9 -9
  17. wedata/{feature_store → common}/entities/feature_table_info.py +1 -1
  18. wedata/{feature_store → common}/entities/function_info.py +2 -1
  19. wedata/{feature_store → common}/entities/on_demand_column_info.py +2 -1
  20. wedata/{feature_store → common}/entities/source_data_column_info.py +3 -1
  21. wedata/{feature_store → common}/entities/training_set.py +6 -6
  22. wedata/common/feast_client/__init__.py +1 -0
  23. wedata/{feature_store → common}/feast_client/feast_client.py +1 -1
  24. wedata/common/log/__init__.py +1 -0
  25. wedata/{feature_store/common → common}/log/logger.py +9 -5
  26. wedata/common/spark_client/__init__.py +1 -0
  27. wedata/{feature_store → common}/spark_client/spark_client.py +6 -7
  28. wedata/{feature_store → common}/utils/common_utils.py +7 -9
  29. wedata/{feature_store → common}/utils/env_utils.py +12 -0
  30. wedata/{feature_store → common}/utils/feature_lookup_utils.py +6 -6
  31. wedata/{feature_store → common}/utils/feature_spec_utils.py +13 -8
  32. wedata/{feature_store → common}/utils/feature_utils.py +5 -5
  33. wedata/{feature_store → common}/utils/on_demand_utils.py +5 -4
  34. wedata/{feature_store → common}/utils/schema_utils.py +1 -1
  35. wedata/{feature_store → common}/utils/signature_utils.py +4 -4
  36. wedata/{feature_store → common}/utils/training_set_utils.py +13 -13
  37. wedata/{feature_store → common}/utils/uc_utils.py +1 -1
  38. wedata/feature_engineering/__init__.py +1 -0
  39. wedata/feature_engineering/client.py +417 -0
  40. wedata/feature_engineering/ml_training_client/ml_training_client.py +569 -0
  41. wedata/feature_engineering/mlflow_model.py +9 -0
  42. wedata/feature_engineering/table_client/table_client.py +548 -0
  43. wedata/feature_store/client.py +11 -15
  44. wedata/feature_store/constants/engine_types.py +8 -30
  45. wedata/feature_store/feature_table_client/feature_table_client.py +73 -105
  46. wedata/feature_store/training_set_client/training_set_client.py +12 -23
  47. wedata/tempo/interpol.py +2 -2
  48. tencent_wedata_feature_engineering_dev-0.1.48.dist-info/RECORD +0 -66
  49. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/top_level.txt +0 -0
  50. /wedata/{feature_store/cloud_sdk_client → common}/__init__.py +0 -0
  51. /wedata/{feature_store/common/log → common/constants}/__init__.py +0 -0
  52. /wedata/{feature_store/common/protos → common/entities}/__init__.py +0 -0
  53. /wedata/{feature_store → common}/entities/environment_variables.py +0 -0
  54. /wedata/{feature_store → common}/entities/feature.py +0 -0
  55. /wedata/{feature_store → common}/entities/feature_function.py +0 -0
  56. /wedata/{feature_store → common}/entities/feature_spec_constants.py +0 -0
  57. /wedata/{feature_store → common}/entities/feature_table.py +0 -0
  58. /wedata/{feature_store/entities → common/protos}/__init__.py +0 -0
  59. /wedata/{feature_store/common → common}/protos/feature_store_pb2.py +0 -0
  60. /wedata/{feature_store/feast_client → common/utils}/__init__.py +0 -0
  61. /wedata/{feature_store → common}/utils/topological_sort.py +0 -0
  62. /wedata/{feature_store → common}/utils/validation_utils.py +0 -0
  63. /wedata/{feature_store/spark_client → feature_engineering/ml_training_client}/__init__.py +0 -0
  64. /wedata/{feature_store/utils → feature_engineering/table_client}/__init__.py +0 -0
@@ -0,0 +1,548 @@
1
+ """
2
+ Utility methods for feature table operations
3
+ """
4
+ import json
5
+ from typing import Union, List, Dict, Optional, Any
6
+
7
+ from pyspark.sql import DataFrame, SparkSession
8
+ from pyspark.sql.streaming import StreamingQuery
9
+ from pyspark.sql.types import StructType
10
+ import os
11
+ from wedata.common.constants.constants import (
12
+ APPEND, DEFAULT_WRITE_STREAM_TRIGGER,
13
+ FEATURE_ENGINEERING_TABLE_PRIMARY_KEY_WEDATA)
14
+ from wedata.common.log import get_logger
15
+ from wedata.common.entities.feature_table import FeatureTable
16
+ from wedata.common.constants.engine_types import EngineTypes
17
+ from wedata.common.spark_client import SparkClient
18
+ from wedata.common.utils import common_utils, env_utils
19
+ from wedata.common.feast_client.feast_client import FeastClient
20
+ from wedata.common.base_table_client import AbstractBaseTableClient
21
+
22
+
23
+ class FeatureEngineeringTableClient(AbstractBaseTableClient):
24
+ """特征表操作类"""
25
+ def __init__(
26
+ self,
27
+ spark: SparkSession,
28
+ cloud_secret_id: str = None,
29
+ cloud_secret_key: str = None,
30
+ ):
31
+ self._spark = spark
32
+ self._feast_client = FeastClient(spark)
33
+ if cloud_secret_id and cloud_secret_key:
34
+ self.__cloud_secret_id = cloud_secret_id
35
+ self.__cloud_secret_key = cloud_secret_key
36
+ else:
37
+ self.__cloud_secret_id, self.__cloud_secret_key = env_utils.get_cloud_secret()
38
+ self.__project = env_utils.get_project_id()
39
+ self.__region = env_utils.get_region()
40
+ self.__logger = get_logger()
41
+
42
+ @property
43
+ def cloud_secret_id(self) -> str:
44
+ if not self.__cloud_secret_id:
45
+ raise ValueError("cloud_secret_id is empty. please set it first.")
46
+ return self.__cloud_secret_id
47
+
48
+ @cloud_secret_id.setter
49
+ def cloud_secret_id(self, cloud_secret_id: str):
50
+ if not cloud_secret_id:
51
+ raise ValueError("cloud_secret_id cannot be None")
52
+ self.__cloud_secret_id = cloud_secret_id
53
+
54
+ @property
55
+ def cloud_secret_key(self) -> str:
56
+ if not self.__cloud_secret_key:
57
+ raise ValueError("cloud_secret_key is empty. please set it first.")
58
+ return self.__cloud_secret_key
59
+
60
+ @cloud_secret_key.setter
61
+ def cloud_secret_key(self, cloud_secret_key: str):
62
+ if not cloud_secret_key:
63
+ raise ValueError("cloud_secret_key cannot be None")
64
+ self.__cloud_secret_key = cloud_secret_key
65
+
66
+ @property
67
+ def project(self) -> str:
68
+ return self.__project
69
+
70
+ @property
71
+ def region(self) -> str:
72
+ return self.__region
73
+
74
+ def create_table(
75
+ self,
76
+ name: str,
77
+ primary_keys: Union[str, List[str]],
78
+ timestamp_key: str,
79
+ engine_type: EngineTypes,
80
+ data_source_name: str,
81
+ database_name: Optional[str] = None,
82
+ df: Optional[DataFrame] = None,
83
+ *,
84
+ partition_columns: Union[str, List[str], None] = None,
85
+ schema: Optional[StructType] = None,
86
+ description: Optional[str] = None,
87
+ tags: Optional[Dict[str, str]] = None
88
+ ) -> FeatureTable:
89
+
90
+ """
91
+ Create a feature table (supports batch and streaming writes)
92
+
93
+ Args:
94
+ name: feature table name (format: <table>)
95
+ primary_keys: primary key column name(s); composite keys are supported
96
+ database_name: database name (optional)
97
+ data_source_name: name of the data source
98
+ df: initial data (optional, used to infer the schema)
99
+ timestamp_key: timestamp key (used for temporal features)
100
+ partition_columns: partition columns (optimize storage and queries)
101
+ schema: table schema definition (optional; required when df is not provided)
102
+ description: business description
103
+ tags: business tags
104
+ Returns:
105
+ A FeatureTable instance
106
+
107
+ Raises:
108
+ ValueError: if the schema does not match the data
109
+ """
110
+
111
+ # Normalize parameters
112
+ primary_keys = self._normalize_params(primary_keys)
113
+ partition_columns = self._normalize_params(partition_columns)
114
+
115
+ assert self._check_sequence_element_type(primary_keys, str), "primary_keys must be a list of strings"
116
+ assert self._check_sequence_element_type(partition_columns, str), "partition_columns must be a list of strings"
117
+ assert isinstance(timestamp_key, str), "timestamp key must be string"
118
+
119
+ # Validate metadata
120
+ self._validate_schema(df, schema)
121
+ self._validate_key_exists(primary_keys, timestamp_key)
122
+ self._validate_key_conflicts(primary_keys, timestamp_key)
123
+
124
+ # Validate the table name
125
+ common_utils.validate_table_name(name)
126
+
127
+ common_utils.validate_database(database_name)
128
+
129
+ # Check primary keys for duplicates
130
+ dup_list = common_utils.get_duplicates(primary_keys)
131
+ if dup_list:
132
+ raise ValueError(f"Primary keys have duplicates: {dup_list}")
133
+
134
+ # Build the fully qualified table name
135
+ table_name = common_utils.build_full_table_name(name, database_name)
136
+
137
+ # Check whether the table already exists
138
+ try:
139
+ if self._check_table_exists(table_name):
140
+ raise ValueError(
141
+ f"Table '{name}' already exists\n"
142
+ "Solutions:\n"
143
+ "1. Use a different table name\n"
144
+ "2. Drop the existing table: spark.sql(f'DROP TABLE {name}')\n"
145
+ )
146
+ except Exception as e:
147
+ raise ValueError(f"Error checking table existence: {str(e)}") from e
148
+
149
+ # Infer the table schema
150
+ table_schema = schema or df.schema
151
+
152
+ # Build timestamp key properties
153
+
154
+ # Collect extra tags from environment variables
155
+ env_tags = {
156
+ "project_id": os.getenv("WEDATA_PROJECT_ID", ""), # wedata项目ID
157
+ "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""), # wedata引擎名称
158
+ "user_uin": os.getenv("KERNEL_LOGIN_UIN", "") # wedata用户UIN
159
+ }
160
+ projectId = os.getenv("WEDATA_PROJECT_ID", "")
161
+ # Build table properties (via TBLPROPERTIES)
162
+ tbl_properties = {
163
+ FEATURE_ENGINEERING_TABLE_PRIMARY_KEY_WEDATA: ",".join(primary_keys),
164
+ "wedata.feature_project_id": f"{json.dumps([projectId])}",
165
+ "comment": description or "",
166
+ **{f"{k}": v for k, v in (tags or {}).items()},
167
+ **{f"feature_{k}": v for k, v in (env_tags or {}).items()}
168
+ }
169
+ if engine_type == EngineTypes.ICEBERG_ENGINE:
170
+ if partition_columns:
171
+ tbl_properties.update({
172
+ 'format-version': '2',
173
+ 'write.upsert.enabled': 'true',
174
+ 'write.update.mode': 'merge-on-read',
175
+ 'write.merge.mode': 'merge-on-read',
176
+ 'write.parquet.bloom-filter-enabled.column.id': 'true',
177
+ 'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
178
+ 'write.distribution-mode': 'hash',
179
+ 'write.metadata.delete-after-commit.enabled': 'true',
180
+ 'write.metadata.previous-versions-max': '100',
181
+ 'write.metadata.metrics.default': 'full',
182
+ 'smart-optimizer.inherit': 'default',
183
+ })
184
+ else:
185
+ tbl_properties.update({
186
+ 'format-version': '2',
187
+ 'write.upsert.enabled': 'true',
188
+ 'write.update.mode': 'merge-on-read',
189
+ 'write.merge.mode': 'merge-on-read',
190
+ 'write.parquet.bloom-filter-enabled.column.id': 'true',
191
+ 'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
192
+ 'write.distribution-mode': 'hash',
193
+ 'write.metadata.delete-after-commit.enabled': 'true',
194
+ 'write.metadata.previous-versions-max': '100',
195
+ 'write.metadata.metrics.default': 'full',
196
+ 'smart-optimizer.inherit': 'default',
197
+ })
198
+
199
+ # Build column definitions
200
+ columns_ddl = []
201
+ for field in table_schema.fields:
202
+ data_type = field.dataType.simpleString().upper()
203
+ col_def = f"`{field.name}` {data_type}"
204
+ if not field.nullable:
205
+ col_def += " NOT NULL"
206
+ # Add a column comment (if the field metadata contains one)
207
+ if field.metadata and "comment" in field.metadata:
208
+ comment = self._escape_sql_value(field.metadata["comment"])
209
+ col_def += f" COMMENT '{comment}'"
210
+ columns_ddl.append(col_def)
211
+
212
+ # Build the partition expression
213
+ partition_expr = (
214
+ f"PARTITIONED BY ({', '.join([f'`{c}`' for c in partition_columns])})"
215
+ if partition_columns else ""
216
+ )
217
+ # Local debugging: iceberg -> PARQUET
218
+ # Core CREATE TABLE statement
219
+ if engine_type == EngineTypes.ICEBERG_ENGINE:
220
+ ddl = f"""
221
+ CREATE TABLE {table_name} (
222
+ {', '.join(columns_ddl)}
223
+ )
224
+ USING iceberg
225
+ {partition_expr}
226
+ TBLPROPERTIES (
227
+ {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
228
+ )
229
+ """
230
+ elif engine_type == EngineTypes.HIVE_ENGINE:
231
+ ddl = f"""
232
+ CREATE TABLE {table_name} (
233
+ {', '.join(columns_ddl)}
234
+ )
235
+ {partition_expr}
236
+ TBLPROPERTIES (
237
+ {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
238
+ )
239
+ """
240
+ else:
241
+ raise ValueError(f"Engine type {engine_type} is not supported")
242
+
243
+ # Log the SQL statement
244
+ self.__logger.info(f"create table ddl: {ddl}\n")
245
+
246
+ # Execute the DDL
247
+ try:
248
+ self._spark.sql(ddl)
249
+ if df is not None:
250
+ df.write.insertInto(table_name)
251
+ except Exception as e:
252
+ raise ValueError(f"Failed to create table: {str(e)}") from e
253
+
254
+ self._feast_client.create_table(
255
+ table_name=table_name,
256
+ primary_keys=primary_keys,
257
+ timestamp_key=timestamp_key,
258
+ df=df,
259
+ schema=table_schema,
260
+ tags=tags,
261
+ description=description
262
+ )
263
+
264
+ self.__logger.info(f"Table '{name}' created successfully. Starting web synchronization.")
265
+
266
+ # Build and return the FeatureTable object
267
+ return FeatureTable(
268
+ name=name,
269
+ table_id=table_name,
270
+ description=description or "",
271
+ primary_keys=primary_keys,
272
+ partition_columns=partition_columns or [],
273
+ features=[field.name for field in table_schema.fields],
274
+ timestamp_keys=timestamp_key or [],
275
+ tags=dict(**tags or {}, **env_tags)
276
+ )
277
+
278
+ def write_table(
279
+ self,
280
+ name: str,
281
+ df: DataFrame,
282
+ database_name: Optional[str] = None,
283
+ mode: Optional[str] = APPEND,
284
+ checkpoint_location: Optional[str] = None,
285
+ trigger: Optional[Dict[str, Any]] = DEFAULT_WRITE_STREAM_TRIGGER
286
+ ) -> Optional[StreamingQuery]:
287
+
288
+ """
289
+ Write data to a feature table (supports batch and streaming writes)
290
+
291
+ Args:
292
+ name: feature table name (format: <table>)
293
+ df: data to write (DataFrame)
294
+ database_name: database name
295
+ mode: write mode (append/overwrite)
296
+ checkpoint_location: checkpoint location for streaming writes (streaming only)
297
+ trigger: trigger settings for streaming writes (streaming only)
298
+
299
+ Returns:
300
+ A StreamingQuery object for streaming writes, otherwise None
301
+
302
+ Raises:
303
+ ValueError: if the arguments are invalid
304
+ """
305
+
306
+ # Validate the write mode
307
+ valid_modes = ["append", "overwrite"]
308
+ if mode not in valid_modes:
309
+ raise ValueError(f"Invalid write mode '{mode}', valid options: {valid_modes}")
310
+
311
+ # Validate the table name
312
+ common_utils.validate_table_name(name)
313
+
314
+ common_utils.validate_database(database_name)
315
+
316
+ # Build the fully qualified table name
317
+ table_name = common_utils.build_full_table_name(name, database_name)
318
+
319
+ # Check whether the table exists
320
+ if not self._check_table_exists(table_name):
321
+ raise ValueError(f"table '{name}' not exists")
322
+
323
+ # Determine whether the DataFrame is streaming
324
+ is_streaming = df.isStreaming
325
+
326
+ try:
327
+ if is_streaming:
328
+ # Streaming write
329
+ if not checkpoint_location:
330
+ raise ValueError("Streaming write requires checkpoint_location parameter")
331
+
332
+ writer = df.writeStream \
333
+ .format("parquet") \
334
+ .outputMode(mode) \
335
+ .option("checkpointLocation", checkpoint_location) \
336
+ # .foreachBatch(process_batch)
337
+
338
+ if trigger:
339
+ writer = writer.trigger(**trigger)
340
+
341
+ return writer.toTable(table_name)
342
+ else:
343
+ # Batch write
344
+ df.write \
345
+ .mode(mode) \
346
+ .insertInto(table_name)
347
+ # self._feast_client.client.write_to_offline_store(feature_view_name=table_name, df=df.toPandas(), allow_registry_cache=False,)
348
+ return None
349
+
350
+ except Exception:
351
+ raise
352
+ # raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e
353
+
354
+ def read_table(
355
+ self,
356
+ name: str,
357
+ database_name: Optional[str] = None,
358
+ ) -> DataFrame:
359
+
360
+ """
361
+ Read data from a feature table
362
+ Args:
363
+ name: feature table name (format: <table>)
364
+ database_name: feature database name
365
+ Returns:
366
+ A DataFrame containing the table data
367
+
368
+ Raises:
369
+ ValueError: if the table does not exist or the read fails
370
+ """
371
+
372
+ # Validate the table name
373
+ common_utils.validate_table_name(name)
374
+
375
+ common_utils.validate_database(database_name)
376
+
377
+ # Build the fully qualified table name
378
+ table_name = common_utils.build_full_table_name(name, database_name)
379
+
380
+ try:
381
+ # Check whether the table exists
382
+ if not self._check_table_exists(table_name):
383
+ raise ValueError(f"Table '{name}' does not exist")
384
+
385
+ # Read the table data
386
+ return self._spark.read.table(table_name)
387
+
388
+ except Exception as e:
389
+ raise
390
+
391
+ def drop_table(self, name: str, database_name: Optional[str] = None) -> None:
392
+
393
+ """
394
+ Drop a feature table (raises an exception if the table does not exist)
395
+
396
+ Args:
397
+ name: feature table name (format: <table>)
398
+ database_name: feature database name
399
+ Raises:
400
+ ValueError: if the table does not exist
401
+ RuntimeError: if the drop operation fails
402
+
403
+ Example:
404
+ # Basic drop
405
+ drop_table("user_features")
406
+ """
407
+
408
+ # Validate the table name
409
+ common_utils.validate_table_name(name)
410
+
411
+ # Build the fully qualified table name
412
+ table_name = common_utils.build_full_table_name(name, database_name)
413
+ try:
414
+ # Check whether the table exists
415
+ if not self._check_table_exists(table_name):
416
+ self.__logger.error(f"Table '{name}' does not exist")
417
+ return
418
+
419
+ try:
420
+ feature_view = self._feast_client.get_feature_view(table_name)
421
+ except Exception:
422
+ pass
423
+ # self.__logger.warning(f"Table '{name}' is not a feature table, skip delete. {str(e)}")
424
+ else:
425
+ if feature_view.online:
426
+ raise ValueError(f"Table '{name}' has a online table, please call drop_online_table first")
427
+
428
+ # Execute the drop
429
+ self._spark.sql(f"DROP TABLE {table_name}")
430
+ self.__logger.info(f"Table '{name}' dropped")
431
+ try:
432
+ self._feast_client.remove_offline_table(table_name=table_name)
433
+ except Exception:
434
+ raise
435
+ else:
436
+ self.__logger.info(f"Table '{name}' removed from feast")
437
+ except ValueError:
438
+ raise # Re-raise known ValueError as-is
439
+ except Exception as e:
440
+ raise RuntimeError(f"Failed to delete table '{name}': {str(e)}") from e
441
+
442
+ def get_table(
443
+ self,
444
+ name: str,
445
+ spark_client: SparkClient,
446
+ database_name: Optional[str] = None,
447
+ ) -> FeatureTable:
448
+
449
+ """
450
+ Get feature table metadata
451
+
452
+ Args:
453
+ name: feature table name
454
+ spark_client: Spark client
455
+
456
+ Returns:
457
+ A FeatureTable object
458
+
459
+ Raises:
460
+ ValueError: if the table does not exist or retrieval fails
461
+ """
462
+
463
+ # Validate the table name
464
+ common_utils.validate_table_name(name)
465
+ common_utils.validate_database(database_name)
466
+
467
+ # Build the fully qualified table name
468
+ table_name = common_utils.build_full_table_name(name, database_name)
469
+ if not self._check_table_exists(full_table_name=table_name):
470
+ raise ValueError(f"Table '{name}' does not exist")
471
+ try:
472
+ return spark_client.get_feature_table(table_name)
473
+ except Exception as e:
474
+ raise
475
+ # raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e
476
+
477
+ def alter_table_tag(
478
+ self,
479
+ name: str,
480
+ properties: Dict[str, str],
481
+ database_name: Optional[str] = None,
482
+ mode: str = "add"
483
+ ):
484
+ """
485
+ Modify the table's TBLPROPERTIES (updates existing keys, adds missing ones)
486
+
487
+ Args:
488
+ name: table name (format: <table>)
489
+ properties: dict of properties to modify or add
490
+ database_name: feature database name
491
+ mode: mode, add / delete
492
+
493
+ Raises:
494
+ ValueError: if the table does not exist or the arguments are invalid
495
+ RuntimeError: if the modification fails
496
+
497
+ Example:
498
+ # Modify table properties
499
+ client.alter_table_tag("user_features", {
500
+ "comment": "updated description",
501
+ "owner": "data_team"
502
+ })
503
+ """
504
+ # Validate arguments
505
+ if not properties:
506
+ raise ValueError("properties must be a non-empty dictionary")
507
+
508
+ # Validate the table name
509
+ common_utils.validate_table_name(name)
510
+ common_utils.validate_database(database_name)
511
+
512
+ # Build the fully qualified table name
513
+ table_name = common_utils.build_full_table_name(name, database_name)
514
+
515
+ try:
516
+ # Check whether the table exists
517
+ if not self._check_table_exists(table_name):
518
+ raise ValueError(f"table '{name}' not exists")
519
+
520
+ if mode == "add":
521
+ # Build the property assignment clause
522
+ props_str = ", ".join(
523
+ f"'{k}'='{self._escape_sql_value(v)}'"
524
+ for k, v in properties.items()
525
+ )
526
+
527
+ alter_sql = f"ALTER TABLE {table_name} SET TBLPROPERTIES ({props_str})"
528
+ elif mode == "delete":
529
+ props_str = ", ".join(f"'{k}'" for k in properties.keys())
530
+ alter_sql = f"ALTER TABLE {table_name} UNSET TBLPROPERTIES ({props_str})"
531
+ else:
532
+ raise ValueError(f"Invalid mode '{mode}', valid options: {['add', 'delete']}")
533
+
534
+ # Apply the change
535
+ self._spark.sql(alter_sql)
536
+ # Write the resulting properties back to feast
537
+ tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}").collect()
538
+ props = {row['key']: row['value'] for row in tbl_pro}
539
+ self._feast_client.modify_tags(table_name=table_name, tags=props)
540
+ print(f"Successfully updated properties for table '{name}': {list(properties.keys())}")
541
+
542
+ except ValueError as e:
543
+ raise # Re-raise known ValueError as-is
544
+ except Exception as e:
545
+ raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e
546
+
547
+ def _check_table_exists(self, full_table_name: str) -> bool:
548
+ return common_utils.check_spark_table_exists(self._spark, full_table_name)
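For orientation, here is a minimal usage sketch of the new FeatureEngineeringTableClient added above, assuming a live SparkSession and the method signatures shown in this diff; the table, database, and data-source names are hypothetical.

from pyspark.sql import SparkSession
from wedata.feature_engineering.table_client.table_client import FeatureEngineeringTableClient
from wedata.common.constants.engine_types import EngineTypes

spark = SparkSession.builder.getOrCreate()
# Credentials may be passed explicitly; otherwise they fall back to env_utils.get_cloud_secret()
client = FeatureEngineeringTableClient(spark)

# Sample data; "user_features", "feature_db" and "my_dlc_source" are hypothetical names
df = spark.createDataFrame(
    [(1001, "2024-01-01 00:00:00", 0.42)],
    "user_id BIGINT, event_time STRING, score DOUBLE",
)

# Create an Iceberg-backed feature table keyed by user_id with event_time as the timestamp key
client.create_table(
    name="user_features",
    primary_keys="user_id",
    timestamp_key="event_time",
    engine_type=EngineTypes.ICEBERG_ENGINE,
    data_source_name="my_dlc_source",
    database_name="feature_db",
    df=df,
    description="example feature table",
)

# Append more rows, then read them back
client.write_table("user_features", df, database_name="feature_db", mode="append")
client.read_table("user_features", database_name="feature_db").show()

# Drop the table (raises if an online table is still attached)
client.drop_table("user_features", database_name="feature_db")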
@@ -10,21 +10,21 @@ from pyspark.sql import DataFrame, SparkSession
10
10
  from pyspark.sql.streaming import StreamingQuery
11
11
  from pyspark.sql.types import StructType
12
12
  import mlflow
13
- from wedata.feature_store.constants.constants import FEATURE_STORE_CLIENT
13
+ from wedata.common.constants.constants import FEATURE_STORE_CLIENT
14
14
 
15
- from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
15
+ from wedata.common.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
16
16
  from wedata.feature_store.constants.engine_types import EngineTypes
17
17
  from wedata.feature_store.common.store_config.redis import RedisStoreConfig
18
- from wedata.feature_store.cloud_sdk_client.models import TaskSchedulerConfiguration
19
- from wedata.feature_store.entities.feature_function import FeatureFunction
20
- from wedata.feature_store.entities.feature_lookup import FeatureLookup
21
- from wedata.feature_store.entities.feature_table import FeatureTable
22
- from wedata.feature_store.entities.training_set import TrainingSet
18
+ from wedata.common.cloud_sdk_client.models import TaskSchedulerConfiguration
19
+ from wedata.common.entities.feature_function import FeatureFunction
20
+ from wedata.common.entities.feature_lookup import FeatureLookup
21
+ from wedata.common.entities.feature_table import FeatureTable
22
+ from wedata.common.entities.training_set import TrainingSet
23
23
  from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient
24
- from wedata.feature_store.spark_client.spark_client import SparkClient
24
+ from wedata.common.spark_client import SparkClient
25
25
  from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient
26
- from wedata.feature_store.utils import common_utils
27
- from wedata.feature_store.utils.feature_utils import format_feature_lookups_and_functions
26
+ from wedata.common.utils import common_utils
27
+ from wedata.common.utils.feature_utils import format_feature_lookups_and_functions
28
28
 
29
29
 
30
30
  class FeatureStoreClient:
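The hunk above is part of the 0.2.5 move of shared modules from wedata.feature_store.* into wedata.common.*; as a quick reference, the import paths touched here map as follows (old 0.1.48 imports commented out for comparison).

# 0.1.48
# from wedata.feature_store.cloud_sdk_client.models import TaskSchedulerConfiguration
# from wedata.feature_store.entities.feature_lookup import FeatureLookup
# from wedata.feature_store.spark_client.spark_client import SparkClient
# from wedata.feature_store.utils import common_utils

# 0.2.5
from wedata.common.cloud_sdk_client.models import TaskSchedulerConfiguration
from wedata.common.entities.feature_lookup import FeatureLookup
from wedata.common.spark_client import SparkClient
from wedata.common.utils import common_utils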
@@ -402,7 +402,7 @@ class FeatureStoreClient:
402
402
  timestamp_key=timestamp_key,
403
403
  )
404
404
 
405
- def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
405
+ def publish_table(self, table_name: str, data_source_name: str,
406
406
  database_name: Optional[str] = None,
407
407
  is_cycle: bool = False, cycle_obj: TaskSchedulerConfiguration = None,
408
408
  is_use_default_online: bool = True, online_config: RedisStoreConfig = None):
@@ -415,8 +415,6 @@ class FeatureStoreClient:
415
415
  Args:
416
416
  table_name: Name of the offline feature table
417
417
  data_source_name: Name of the data source
418
- cloud_secret_id: Cloud secret ID for authentication
419
- cloud_secret_key: Cloud secret key for authentication
420
418
  database_name: Database name (optional)
421
419
  is_cycle: Whether to enable periodic publishing (default: False)
422
420
  cycle_obj: Periodic task configuration object (required if is_cycle is True)
@@ -429,8 +427,6 @@ class FeatureStoreClient:
429
427
  """
430
428
  return self._feature_table_client.publish_table(table_name=table_name, database_name=database_name,
431
429
  data_source_name=data_source_name,
432
- cloud_secret_key=cloud_secret_key,
433
- cloud_secret_id=cloud_secret_id,
434
430
  is_cycle=is_cycle, cycle_obj=cycle_obj,
435
431
  is_use_default_online=is_use_default_online,
436
432
  online_config=online_config)
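With the hunks above, publish_table no longer accepts explicit credentials; the client is expected to resolve them itself (for example from the environment, as the new FeatureEngineeringTableClient constructor does). A before/after sketch, with hypothetical table and data-source names:

# 0.1.48: cloud_secret_id / cloud_secret_key had to be passed explicitly
# client.publish_table("user_features", "my_dlc_source", cloud_secret_id="...", cloud_secret_key="...")

# 0.2.5: credentials are resolved by the client
client.publish_table(
    table_name="user_features",
    data_source_name="my_dlc_source",
    database_name="feature_db",
    is_use_default_online=True,
)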
@@ -1,34 +1,12 @@
1
- from enum import Enum
2
- import os
1
+ import warnings
2
+ from wedata.common.constants.engine_types import (EngineTypes as _EngineTypes,
3
+ CalculateEngineTypes as _CalculateEngineTypes,
4
+ judge_engine_type as _judge_engine_type)
3
5
 
6
+ warnings.warn("engine_types.py is deprecated, please use wedata.common.constants.engine_types.py")
4
7
 
5
- class EngineTypes(Enum):
6
- HIVE_ENGINE = "hive"
7
- ICEBERG_ENGINE = "iceberg"
8
+ EngineTypes = _EngineTypes
8
9
 
9
- @classmethod
10
- def get_engine(cls, engine_name: str) -> 'EngineTypes':
11
- try:
12
- return cls(engine_name.lower())
13
- except ValueError:
14
- raise ValueError(f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}")
15
-
16
-
17
- class CalculateEngineTypes(Enum):
18
- DLC = "dlc"
19
- EMR = "emr"
20
-
21
- @classmethod
22
- def get_calculate_engine(cls, engine_name: str) -> 'CalculateEngineTypes':
23
- try:
24
- return cls(engine_name.lower())
25
- except ValueError:
26
- raise ValueError(f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}")
27
-
28
-
29
- def judge_engine_type() -> 'CalculateEngineTypes':
30
- if os.environ.get("DLC_REGION", ""):
31
- return CalculateEngineTypes.DLC
32
- else:
33
- return CalculateEngineTypes.EMR
10
+ CalculateEngineTypes = _CalculateEngineTypes
34
11
 
12
+ judge_engine_type = _judge_engine_type
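wedata/feature_store/constants/engine_types.py is now a thin compatibility shim: it re-exports the enums from wedata.common.constants.engine_types and emits a warning when imported. A short sketch of the preferred usage in 0.2.5, assuming the relocated module keeps the class methods shown in the removed code:

# Deprecated path (still works, but warns at import time)
# from wedata.feature_store.constants.engine_types import EngineTypes

# Preferred path
from wedata.common.constants.engine_types import EngineTypes, CalculateEngineTypes, judge_engine_type

engine = EngineTypes.get_engine("iceberg")              # -> EngineTypes.ICEBERG_ENGINE
runtime = CalculateEngineTypes.get_calculate_engine("dlc")
print(judge_engine_type())                              # DLC when DLC_REGION is set, otherwise EMR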