tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.

Files changed (64) hide show
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: tencent-wedata-feature-engineering-dev
3
+ Version: 0.1.0
4
+ Summary: Wedata Feature Engineering Library Development
5
+ Home-page:
6
+ Author: meahqian
7
+ Author-email:
8
+ License: Apache 2.0
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.7
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: pandas>=1.0.0
15
+ Requires-Dist: feast[redis]==0.49.0
16
+ Requires-Dist: grpcio==1.74.0
17
+ Requires-Dist: tencentcloud-sdk-python
18
+ Requires-Dist: ipython
19
+
@@ -0,0 +1,64 @@
1
+ wedata/__init__.py,sha256=GYxqkkgH0oH4QtNiOCZHuGkc0sSH1LgEqmhSX6sB4So,200
2
+ wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ wedata/feature_store/client.py,sha256=rXQfSC14vvxPed9P0PoboDTwhD47aPwmGndY825Sl6k,19484
4
+ wedata/feature_store/mlflow_model.py,sha256=OCUuccOoO0NXWSzIPoGeL03Ha1Q3aQTJW2RlJrTCmzc,554
5
+ wedata/feature_store/cloud_sdk_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ wedata/feature_store/cloud_sdk_client/client.py,sha256=mCmnhqEjbORNIhX7dF6Der0VOow9mt8cFpTeOiGI_Hg,3661
7
+ wedata/feature_store/cloud_sdk_client/models.py,sha256=7_QUq0kZcrcclRMsIYFoqBrlzVwaHoVY-yU5SHIrJWM,19789
8
+ wedata/feature_store/cloud_sdk_client/utils.py,sha256=6ESwVhlrftnp0h9ojTzbB-m-0hktLI1PLcyk6zpNgrs,857
9
+ wedata/feature_store/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ wedata/feature_store/common/protos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ wedata/feature_store/common/protos/feature_store_pb2.py,sha256=oMIUGGeGNP84g_nFqOQwTXjV1GiU2ehSOy7CyFu2__g,4207
12
+ wedata/feature_store/common/store_config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ wedata/feature_store/common/store_config/redis.py,sha256=9R5npM2s1u0o9IakmpbRsFdJC0vNar_uvA62OLWuXBs,1145
14
+ wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ wedata/feature_store/constants/constants.py,sha256=jRW8iWKF4cqdcsemxU683Rhd2shrWcZh0oFFgUrTack,1954
16
+ wedata/feature_store/constants/engine_types.py,sha256=42mI-kNDDtoA4_I3iqDe4FkF2M2l_Bt4Q1V6WUB-_k0,921
17
+ wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ wedata/feature_store/entities/column_info.py,sha256=-AR6EKHwgoqIkRHFyguxVEtnYt6fvusWHkEjF4kvS0A,5141
19
+ wedata/feature_store/entities/environment_variables.py,sha256=ZEFml5H9MQuzBKM074mUrFYu-Sga4Knmxqiwpke2WGc,1679
20
+ wedata/feature_store/entities/feature.py,sha256=wX8fTBlJq3GYdj9rrBDCY3kFgcVBBAiOOZdxEhnQkNQ,1241
21
+ wedata/feature_store/entities/feature_column_info.py,sha256=ZAS_I-MDg2ofCv3nwYvGCQrrpEljzrh_L1D-gqOV_mM,2407
22
+ wedata/feature_store/entities/feature_function.py,sha256=R17INrCE-U_Uj9KLbFz69aYlOkTETTwQHMMo470F4lQ,1865
23
+ wedata/feature_store/entities/feature_lookup.py,sha256=UYmYCzkQ1_KuooybS3F-7HDcjBMPZ72InL06UTHbEtw,8749
24
+ wedata/feature_store/entities/feature_spec.py,sha256=Z2SXE_LObjNY3q5yBVKPXGTUiMZy7zaI6-ZbAoFlwG8,21769
25
+ wedata/feature_store/entities/feature_spec_constants.py,sha256=YWDBfRiNDe6fUJFUBo3V4WYg2xsljoPAE-ZejfFZCgM,785
26
+ wedata/feature_store/entities/feature_table.py,sha256=nHCCd7WUryROt9oTJpYkT-KiGbKcQd7BEE9L2_1dhYw,4107
27
+ wedata/feature_store/entities/feature_table_info.py,sha256=yJ1P3AYaPiHW6ehCbMWhndzguHJqJKWfeFwYjwTLt2U,1481
28
+ wedata/feature_store/entities/function_info.py,sha256=yDwIzTrBR-ECWubgeoy48SYZfdY7P0JcraZnWGCW0ag,2752
29
+ wedata/feature_store/entities/on_demand_column_info.py,sha256=a44ep-f3FOruWNXl3c8v7733rNuoKXJaHTv1aqF905s,1739
30
+ wedata/feature_store/entities/source_data_column_info.py,sha256=FyBmBPUSvc2S2OPFTvsQf2AdS-KFGkYBmd4yL_Vur8M,702
31
+ wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
32
+ wedata/feature_store/feast_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ wedata/feature_store/feast_client/feast_client.py,sha256=nRk2XMmXaZOp-3PMYi-_ScxS4hXbJ_awDylu07T8L_s,20343
34
+ wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
+ wedata/feature_store/feature_table_client/feature_table_client.py,sha256=w0N-83pqs73jn6wIopCm6KEytDv1fhlhVlRLTp2lgNE,41350
36
+ wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ wedata/feature_store/spark_client/spark_client.py,sha256=B_6f_VVSdhft1SsoAohXCG08XnjokR3FgXql8cROtsI,11840
38
+ wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ wedata/feature_store/training_set_client/training_set_client.py,sha256=teaWM-xDgp2TwnadovUm0i4A26roTozgRefIZaHORko,23376
40
+ wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ wedata/feature_store/utils/common_utils.py,sha256=BRIU4Qb8uvhe5E3fwBMY_Q7t7XTt-aNqEfhUsHjQnqQ,12371
42
+ wedata/feature_store/utils/env_utils.py,sha256=72-KFPZ_L4SNthEZ-0ZH6ynNlmj0AboH5JnM1BVQoP8,2288
43
+ wedata/feature_store/utils/feature_lookup_utils.py,sha256=mNV6RhBdpv1iTZduCA9YwXwkeJCwU5MFQ1MkFeD9IhY,22003
44
+ wedata/feature_store/utils/feature_spec_utils.py,sha256=j8t-zel2_r8Q9m88BmFKkHMdkGNIduWJB-28OZDASRY,11613
45
+ wedata/feature_store/utils/feature_utils.py,sha256=KKq28bVB_lCuhnR9Hk6JegJBOVgcelWlvrRM-F9onkA,2796
46
+ wedata/feature_store/utils/on_demand_utils.py,sha256=pazZRG5c0Se08MV_inBddIeX4Q9xlVN_H9SC_WK3xzs,4260
47
+ wedata/feature_store/utils/schema_utils.py,sha256=y6EYY1pUxjVg6MP4C7avdW8ZEBBaDo1YTV2CmPF4i8o,4491
48
+ wedata/feature_store/utils/signature_utils.py,sha256=SZFufd19m0jmGnOLmAl3JPKZC-qHq-wQezh6G7HOMfc,7773
49
+ wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZsa-DjxJeTc4KUadtg,6475
50
+ wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
51
+ wedata/feature_store/utils/uc_utils.py,sha256=5jngdLT8quP1lfGHN_SSFQQlcOh_sUB9M1varCgdFwg,11436
52
+ wedata/feature_store/utils/validation_utils.py,sha256=lJe6HCg5v5CZxH_pvT-vpGhCpo66LT2erXraHE2T0iI,2584
53
+ wedata/tempo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
+ wedata/tempo/interpol.py,sha256=3JF8dwcdKv2o10FN45aefgvxR5DjlR6FJAXrbAiGCro,16423
55
+ wedata/tempo/intervals.py,sha256=L2ao7LlgQmfDTFwnBoFLXeuygSvwtIKXL52thiD80Yw,44078
56
+ wedata/tempo/io.py,sha256=KWIn6IBSkuBxr8QCcpdZ6NFX_49-8UQdGftmZgs_ujw,1872
57
+ wedata/tempo/ml.py,sha256=WtGa2szn6PditvZsTZoxo7wFDe4k1SRoMZ-jgNGIjvE,4323
58
+ wedata/tempo/resample.py,sha256=h81RVVmCl4ect-YKE-KZZHPDi1rGI3sh-YIb-Btz0ck,9698
59
+ wedata/tempo/tsdf.py,sha256=S4lZfxhSRFiezYoYS6gvGsl1mZA3zp-MWEKFHYZpDg0,70968
60
+ wedata/tempo/utils.py,sha256=I9I6l2DMwUoY213L04Yc1UR_zTWgSkj1BVo4ZwzQd4Y,7977
61
+ tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA,sha256=WkytgQT-gXM-02grnlDek4MVdritzory-IKuJmC9Myg,581
62
+ tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
63
+ tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
64
+ tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.45.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
wedata/__init__.py ADDED
@@ -0,0 +1,9 @@
1
"""
WeData Feature Engineering

A toolkit for automated feature engineering.
"""
from wedata.feature_store.constants import constants

# str() makes the intent explicit: the original used an f-string
# (f"{...}") whose only effect was string conversion of the constant.
__version__ = str(constants.FEATURE_LOOKUP_CLIENT_MAJOR_VERSION)
File without changes
@@ -0,0 +1,462 @@
1
+ """
2
+ Wedata FeatureStoreClient Python实现
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from types import ModuleType
8
+ from typing import Union, List, Dict, Optional, Any
9
+ from pyspark.sql import DataFrame, SparkSession
10
+ from pyspark.sql.streaming import StreamingQuery
11
+ from pyspark.sql.types import StructType
12
+ import mlflow
13
+ from wedata.feature_store.constants.constants import FEATURE_STORE_CLIENT
14
+
15
+ from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
16
+ from wedata.feature_store.constants.engine_types import EngineTypes
17
+ from wedata.feature_store.common.store_config.redis import RedisStoreConfig
18
+ from wedata.feature_store.cloud_sdk_client.models import TaskSchedulerConfiguration
19
+ from wedata.feature_store.entities.feature_function import FeatureFunction
20
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
21
+ from wedata.feature_store.entities.feature_table import FeatureTable
22
+ from wedata.feature_store.entities.training_set import TrainingSet
23
+ from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient
24
+ from wedata.feature_store.spark_client.spark_client import SparkClient
25
+ from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient
26
+ from wedata.feature_store.utils import common_utils
27
+ from wedata.feature_store.utils.feature_utils import format_feature_lookups_and_functions
28
+
29
+
30
class FeatureStoreClient:
    """Unified feature-store client providing full feature lifecycle management.

    Thin facade that delegates to the table / training-set / spark sub-clients.
    """

    def __init__(self, spark: Optional[SparkSession] = None, cloud_secret_id: Optional[str] = None,
                 cloud_secret_key: Optional[str] = None):
        """
        Args:
            spark: An initialized SparkSession. When None, one is created
                (or reused) via ``SparkSession.builder.getOrCreate()``.
            cloud_secret_id: Cloud API secret id, forwarded to the table client.
            cloud_secret_key: Cloud API secret key, forwarded to the table client.
        """
        if spark is None:
            spark = SparkSession.builder.getOrCreate()
        self._spark = spark
        self._spark_client = SparkClient(spark)
        self._feature_table_client = FeatureTableClient(spark, cloud_secret_id=cloud_secret_id,
                                                        cloud_secret_key=cloud_secret_key)
        self._training_set_client = TrainingSetClient(self._spark_client)

    def create_table(
        self,
        name: str,
        primary_keys: Union[str, List[str]],
        # NOTE: original annotations were list literals ([str], [EngineTypes]),
        # which are not valid type hints; corrected to the intended types.
        timestamp_key: str,
        engine_type: EngineTypes,
        data_source_name: str,
        database_name: Optional[str] = None,
        df: Optional[DataFrame] = None,
        *,
        partition_columns: Union[str, List[str], None] = None,
        schema: Optional[StructType] = None,
        description: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None
    ) -> FeatureTable:
        """
        Create a feature table (supports batch and streaming writes).

        Args:
            name: Feature table name (format: ``<table>``).
            primary_keys: Primary key column(s); composite keys supported.
            timestamp_key: Timestamp key used for temporal features.
            engine_type: Engine type, see
                ``wedata.feature_store.constants.engine_types.EngineTypes``.
            data_source_name: Name of the data source.
            database_name: Database name.
            df: Initial data (optional, used to infer the schema).
            partition_columns: Partition columns (optimizes storage/queries).
            schema: Table schema (required when ``df`` is not provided).
            description: Business description.
            tags: Business tags.

        Returns:
            A FeatureTable instance.

        Raises:
            ValueError: If the schema does not match the data.
        """
        return self._feature_table_client.create_table(
            name=name,
            primary_keys=primary_keys,
            engine_type=engine_type,
            database_name=database_name,
            data_source_name=data_source_name,
            df=df,
            timestamp_key=timestamp_key,
            partition_columns=partition_columns,
            schema=schema,
            description=description,
            tags=tags
        )

    def register_table(self, name: str, timestamp_key: str, engine_type: EngineTypes, data_source_name: str,
                       database_name: Optional[str] = None,
                       primary_keys: Optional[Union[str, List[str]]] = None) -> DataFrame:
        """
        Register an existing plain table as a feature table and return its data.

        Args:
            name: Feature table name.
            timestamp_key: Timestamp key (used for later offline/online sync).
            engine_type: Engine type, see
                ``wedata.feature_store.constants.engine_types.EngineTypes``.
            data_source_name: Name of the data source.
            database_name: Feature database name.
            primary_keys: Primary key column(s); composite keys supported.
                Only effective when ``engine_type`` is ``EngineTypes.HIVE_ENGINE``.

        Returns:
            DataFrame: The feature table's data.
        """
        return self._feature_table_client.register_table(name, database_name, timestamp_key=timestamp_key,
                                                         engine_type=engine_type, primary_keys=primary_keys,
                                                         data_source_name=data_source_name)

    def read_table(self, name: str, database_name: Optional[str] = None, is_online: bool = False,
                   online_config: Optional[RedisStoreConfig] = None,
                   entity_row: Optional[List[Dict[str, Any]]] = None) -> DataFrame:
        """
        Read data from a feature table.

        Args:
            name: Feature table name.
            database_name: Feature database name.
            is_online: Whether to read the online feature table (default False).
            online_config: Online store config (only effective when ``is_online``).
            entity_row: Entity rows (only effective when ``is_online``), shaped as
                ``[{primary_key1: [value1, value2]}, {primary_key2: [value1, value2]}]``.

        Returns:
            DataFrame: The feature table's data.
        """
        return self._feature_table_client.read_table(name=name, database_name=database_name, is_online=is_online,
                                                     online_config=online_config, entity_row=entity_row)

    def get_table(self, name: str, database_name: Optional[str] = None) -> FeatureTable:
        """
        Fetch feature table metadata.

        Args:
            name: Feature table name.
            database_name: Feature database name.

        Returns:
            FeatureTable: The table's metadata.
        """
        return self._feature_table_client.get_table(name, self._spark_client, database_name)

    def drop_table(self, name: str, database_name: Optional[str] = None) -> None:
        """
        Drop a feature table.

        Args:
            name: Name of the feature table to drop.
            database_name: Database name.
        """
        return self._feature_table_client.drop_table(name, database_name)

    def write_table(
        self,
        name: str,
        df: DataFrame,
        database_name: Optional[str] = None,
        mode: Optional[str] = APPEND,
        checkpoint_location: Optional[str] = None,
        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
    ) -> Optional[StreamingQuery]:
        """
        Write data to a feature table (batch or streaming).

        Args:
            name: Feature table name.
            df: The DataFrame to write.
            database_name: Feature database name.
            mode: Write mode (defaults to append).
            checkpoint_location: Streaming checkpoint location (optional).
            trigger: Streaming trigger configuration (defaults to the preset).

        Returns:
            A StreamingQuery for streaming writes, otherwise None.
        """
        return self._feature_table_client.write_table(
            name=name,
            df=df,
            database_name=database_name,
            mode=mode,
            checkpoint_location=checkpoint_location,
            trigger=trigger,
        )

    def create_training_set(
        self,
        df: DataFrame,
        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
        label: Union[str, List[str], None],
        exclude_columns: Optional[List[str]] = None,
        database_name: Optional[str] = None,
        **kwargs,
    ) -> TrainingSet:
        """
        Create a training set.

        Args:
            df: Base DataFrame.
            feature_lookups: List of feature lookups / feature functions.
            label: Label column name(s).
            exclude_columns: Columns to exclude from the final set.
            database_name: Database name used to qualify lookup table names.

        Returns:
            A TrainingSet instance.

        Raises:
            ValueError: If a FeatureLookup has no ``table_name``.
        """
        if exclude_columns is None:
            exclude_columns = []

        # For each FeatureLookup, validate its table_name and rewrite it to
        # the fully-qualified form before building the training set.
        for feature in feature_lookups:
            if isinstance(feature, FeatureLookup):
                if not feature.table_name:
                    raise ValueError("FeatureLookup must specify a table_name")
                # Validate the table name format first...
                common_utils.validate_table_name(feature.table_name)
                # ...then qualify it and assign back onto the lookup.
                feature.table_name = common_utils.build_full_table_name(feature.table_name, database_name)

        features = feature_lookups
        del feature_lookups

        features = format_feature_lookups_and_functions(self._spark_client, features)

        return self._training_set_client.create_training_set_from_feature_lookups(
            df=df,
            feature_lookups=features,
            label=label,
            exclude_columns=exclude_columns,
            **kwargs
        )

    def log_model(
        self,
        model: Any,
        artifact_path: str,
        *,
        flavor: ModuleType,
        training_set: Optional[TrainingSet] = None,
        registered_model_name: Optional[str] = None,
        model_registry_uri: Optional[str] = None,
        await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
        infer_input_example: bool = False,
        **kwargs,
    ):
        """
        Log an MLflow model together with its feature-lookup metadata.

        Note: the model must be trained on the DataFrame returned by
        ``TrainingSet.load_df``. Any modification of that DataFrame
        (scaling, added columns, ...) is NOT applied at inference time.

        Args:
            model: The model object to log.
            artifact_path: Artifact path for the model.
            flavor: MLflow flavor module (e.g. ``mlflow.sklearn``).
            training_set: TrainingSet used to train the model (optional).
            registered_model_name: Model name to register (optional).
            model_registry_uri: Model registry URI (optional).
            await_registration_for: Seconds to wait for registration
                (defaults to MLflow's DEFAULT_AWAIT_MAX_SLEEP_SECONDS).
            infer_input_example: Whether to auto-log an input example
                (default False).

        Returns:
            None
        """
        self._training_set_client.log_model(
            model=model,
            artifact_path=artifact_path,
            flavor=flavor,
            training_set=training_set,
            registered_model_name=registered_model_name,
            model_registry_uri=model_registry_uri,
            await_registration_for=await_registration_for,
            infer_input_example=infer_input_example,
            **kwargs
        )

    def score_batch(
        self, model_uri: str, df: DataFrame, result_type: str = "double"
    ) -> DataFrame:
        """
        Evaluate a model on the provided DataFrame.

        The model must have been logged with :meth:`log_model`, which packages
        it with feature metadata. Unless already present in ``df``, required
        features are looked up from the Feature Store and joined with ``df``
        before scoring; feature columns present in ``df`` take precedence
        over stored values.

        :param model_uri: URI of the MLflow model logged via
            :meth:`log_model`. One of:

            * ``runs:/<mlflow_run_id>/run-relative/path/to/model``
            * ``models:/<model_name>/<model_version>``
            * ``models:/<model_name>/<stage>``
        :param df: DataFrame to score. It must contain the lookup keys and
            source keys required by the model's ``feature_spec.yaml``
            artifact, and must NOT contain a ``prediction`` column (reserved
            for the model's output). Extra columns are allowed. Streaming
            DataFrames are not supported.
        :param result_type: Return type of the model; see
            :func:`mlflow.pyfunc.spark_udf` ``result_type``.
        :return: A DataFrame containing all columns of ``df``, all feature
            values retrieved from the Feature Store, and a ``prediction``
            column with the model output.
        """
        return self._training_set_client.score_batch(
            model_uri=model_uri,
            df=df,
            result_type=result_type,
            client_name=FEATURE_STORE_CLIENT,
        )

    def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
                      database_name: Optional[str] = None,
                      is_cycle: bool = False, cycle_obj: Optional[TaskSchedulerConfiguration] = None,
                      is_use_default_online: bool = True, online_config: Optional[RedisStoreConfig] = None):
        """
        Publish an offline feature table to an online feature table.

        This method synchronizes the offline feature table data to online
        storage for low-latency feature serving in real-time applications.

        Args:
            table_name: Name of the offline feature table.
            data_source_name: Name of the data source.
            cloud_secret_id: Cloud secret ID for authentication.
            cloud_secret_key: Cloud secret key for authentication.
            database_name: Database name (optional).
            is_cycle: Whether to enable periodic publishing (default False).
            cycle_obj: Periodic task configuration (required if ``is_cycle``).
            is_use_default_online: Use the default online storage
                configuration (default True).
            online_config: Custom online storage configuration (only
                effective when ``is_use_default_online`` is False).

        Returns:
            None
        """
        return self._feature_table_client.publish_table(table_name=table_name, database_name=database_name,
                                                        data_source_name=data_source_name,
                                                        cloud_secret_key=cloud_secret_key,
                                                        cloud_secret_id=cloud_secret_id,
                                                        is_cycle=is_cycle, cycle_obj=cycle_obj,
                                                        is_use_default_online=is_use_default_online,
                                                        online_config=online_config)

    def drop_online_table(self, table_name: str, online_config: RedisStoreConfig, database_name: Optional[str] = None):
        """
        Drop an online feature table.

        :param table_name: Name of the offline feature table.
        :param online_config: Online storage configuration of the table.
        :param database_name: Database name (optional).
        """
        self._feature_table_client.drop_online_table(table_name=table_name, database_name=database_name,
                                                     online_config=online_config)

    def create_feature_spec(
        self, name: str,
        features: List[Union[FeatureLookup, FeatureFunction]],
        exclude_columns: List[str]):
        """
        Create a feature spec file.

        :param name: Name of the feature spec.
        :param features: Features, each a FeatureLookup or FeatureFunction.
        :param exclude_columns: Columns to exclude from the final feature set.
        """
        return self._training_set_client.create_feature_spec(name, features, self._spark_client, exclude_columns)

    @property
    def spark(self):
        """The underlying SparkSession."""
        return self._spark
File without changes
@@ -0,0 +1,86 @@
1
+ import json
2
+
3
+ from tencentcloud.wedata.v20210820.wedata_client import WedataClient
4
+ from tencentcloud.common import credential
5
+ from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
6
+ from wedata.feature_store.cloud_sdk_client.utils import get_client_profile, set_request_header
7
+ import wedata.feature_store.cloud_sdk_client.models as models
8
+
9
+
10
class FeatureCloudSDK:
    """Thin wrapper around the Tencent Cloud WeData client for feature-store APIs."""

    def __init__(self, secret_id: str, secret_key: str, region: str):
        """
        Args:
            secret_id: Cloud API secret id.
            secret_key: Cloud API secret key.
            region: Cloud region identifier.
        """
        self._client = WedataClient(credential.Credential(secret_id, secret_key), region, get_client_profile())

    def _invoke(self, action: str, request, response_cls):
        """Serialize ``request``, call ``action``, deserialize into ``response_cls``.

        Centralizes the request/response plumbing previously duplicated in
        every public method. Non-SDK exceptions are wrapped in
        TencentCloudSDKException; SDK exceptions propagate unchanged.
        """
        try:
            params = request._serialize()
            headers = set_request_header(request.headers)
            # Debug output preserved from the original per-method code.
            print(f"{action} params: {params}")
            print(f"{action} headers: {headers}")
            body = self._client.call(action, params, headers=headers)
            response = json.loads(body)
            model = response_cls()
            model._deserialize(response["Response"])
            return model
        except TencentCloudSDKException:
            raise
        except Exception as e:
            # Wrap unexpected errors, keeping the original as the cause.
            raise TencentCloudSDKException(type(e).__name__, str(e)) from e

    def CreateOnlineFeatureTable(self, request: models.CreateOnlineFeatureTableRequest) -> 'models.CreateOnlineFeatureTableResponse':
        """
        Create an online feature table.

        Args:
            request: Creation request parameters.

        Returns:
            The creation response.
        """
        return self._invoke("CreateOnlineFeatureTable", request, models.CreateOnlineFeatureTableResponse)

    def DescribeNormalSchedulerExecutorGroups(self, request: models.DescribeNormalSchedulerExecutorGroupsRequest) -> 'models.DescribeNormalSchedulerExecutorGroupsResponse':
        """
        Query normal scheduler executor groups.

        Args:
            request: Query request parameters.

        Returns:
            The query response.
        """
        return self._invoke("DescribeNormalSchedulerExecutorGroups", request,
                            models.DescribeNormalSchedulerExecutorGroupsResponse)

    def RefreshFeatureTable(self, request: models.RefreshFeatureTableRequest) -> 'models.RefreshFeatureTableResponse':
        """
        Refresh a feature table.

        Args:
            request: Refresh request parameters.

        Returns:
            The refresh response.
        """
        return self._invoke("RefreshFeatureTable", request, models.RefreshFeatureTableResponse)