tencent-wedata-feature-engineering-dev 0.1.49__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
- tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
- wedata/feature_store/client.py +28 -92
- wedata/feature_store/constants/constants.py +2 -5
- wedata/feature_store/entities/feature_lookup.py +0 -17
- wedata/feature_store/entities/feature_spec.py +2 -2
- wedata/feature_store/entities/feature_table.py +1 -5
- wedata/feature_store/entities/function_info.py +4 -1
- wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
- wedata/feature_store/spark_client/spark_client.py +15 -41
- wedata/feature_store/training_set_client/training_set_client.py +10 -9
- wedata/feature_store/utils/common_utils.py +4 -48
- wedata/feature_store/utils/feature_lookup_utils.py +43 -37
- wedata/feature_store/utils/feature_spec_utils.py +1 -1
- wedata/feature_store/utils/uc_utils.py +1 -1
- tencent_wedata_feature_engineering_dev-0.1.49.dist-info/RECORD +0 -66
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +0 -108
- wedata/feature_store/cloud_sdk_client/models.py +0 -686
- wedata/feature_store/cloud_sdk_client/utils.py +0 -39
- wedata/feature_store/common/log/__init__.py +0 -0
- wedata/feature_store/common/log/logger.py +0 -40
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +0 -48
- wedata/feature_store/constants/engine_types.py +0 -34
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +0 -487
- wedata/feature_store/utils/env_utils.py +0 -108
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +0 -448
- wedata/tempo/intervals.py +0 -1331
- wedata/tempo/io.py +0 -61
- wedata/tempo/ml.py +0 -129
- wedata/tempo/resample.py +0 -318
- wedata/tempo/tsdf.py +0 -1720
- wedata/tempo/utils.py +0 -254
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: tencent-wedata-feature-engineering-dev
|
|
3
|
-
Version: 0.1.49
|
|
4
|
-
Summary: Wedata Feature Engineering Library
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Wedata Feature Engineering Library
|
|
5
5
|
Home-page:
|
|
6
6
|
Author: meahqian
|
|
7
7
|
Author-email:
|
|
@@ -12,8 +12,10 @@ Classifier: Operating System :: OS Independent
|
|
|
12
12
|
Requires-Python: >=3.7
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
Requires-Dist: pandas>=1.0.0
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: classifier
|
|
17
|
+
Dynamic: description-content-type
|
|
18
|
+
Dynamic: license
|
|
19
|
+
Dynamic: requires-dist
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
Dynamic: summary
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
wedata/__init__.py,sha256=GYxqkkgH0oH4QtNiOCZHuGkc0sSH1LgEqmhSX6sB4So,200
|
|
2
|
+
wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
wedata/feature_store/client.py,sha256=NkjwOdmynA9UkAQfIl9VHj6STPGPAF4xvF9zA9czUGU,14767
|
|
4
|
+
wedata/feature_store/mlflow_model.py,sha256=OCUuccOoO0NXWSzIPoGeL03Ha1Q3aQTJW2RlJrTCmzc,554
|
|
5
|
+
wedata/feature_store/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
wedata/feature_store/common/protos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
wedata/feature_store/common/protos/feature_store_pb2.py,sha256=oMIUGGeGNP84g_nFqOQwTXjV1GiU2ehSOy7CyFu2__g,4207
|
|
8
|
+
wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
wedata/feature_store/constants/constants.py,sha256=JzuahjljVFF-ySioQTBfBOMTCuKqN6fkSDC3Fvuxvy0,1774
|
|
10
|
+
wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
wedata/feature_store/entities/column_info.py,sha256=-AR6EKHwgoqIkRHFyguxVEtnYt6fvusWHkEjF4kvS0A,5141
|
|
12
|
+
wedata/feature_store/entities/environment_variables.py,sha256=ZEFml5H9MQuzBKM074mUrFYu-Sga4Knmxqiwpke2WGc,1679
|
|
13
|
+
wedata/feature_store/entities/feature.py,sha256=wX8fTBlJq3GYdj9rrBDCY3kFgcVBBAiOOZdxEhnQkNQ,1241
|
|
14
|
+
wedata/feature_store/entities/feature_column_info.py,sha256=ZAS_I-MDg2ofCv3nwYvGCQrrpEljzrh_L1D-gqOV_mM,2407
|
|
15
|
+
wedata/feature_store/entities/feature_function.py,sha256=R17INrCE-U_Uj9KLbFz69aYlOkTETTwQHMMo470F4lQ,1865
|
|
16
|
+
wedata/feature_store/entities/feature_lookup.py,sha256=YjYz8kLq42doFbgPzpmm1r3GPhPYkLsIss4H71x-KAo,8009
|
|
17
|
+
wedata/feature_store/entities/feature_spec.py,sha256=3tLLgZ95QL5qI01DQW5BxKVi4Bm5Qm31b2cFvO2DZh0,21767
|
|
18
|
+
wedata/feature_store/entities/feature_spec_constants.py,sha256=YWDBfRiNDe6fUJFUBo3V4WYg2xsljoPAE-ZejfFZCgM,785
|
|
19
|
+
wedata/feature_store/entities/feature_table.py,sha256=dHZHSDPD4HJ2XanLVIrVTkaCYUeqZ6eWEpA0d3YO71g,4010
|
|
20
|
+
wedata/feature_store/entities/feature_table_info.py,sha256=yJ1P3AYaPiHW6ehCbMWhndzguHJqJKWfeFwYjwTLt2U,1481
|
|
21
|
+
wedata/feature_store/entities/function_info.py,sha256=XRoobE5d9dz9NDjhLmUrEIHUNBiG44WXmFVWWzFJwtg,2942
|
|
22
|
+
wedata/feature_store/entities/on_demand_column_info.py,sha256=a44ep-f3FOruWNXl3c8v7733rNuoKXJaHTv1aqF905s,1739
|
|
23
|
+
wedata/feature_store/entities/source_data_column_info.py,sha256=FyBmBPUSvc2S2OPFTvsQf2AdS-KFGkYBmd4yL_Vur8M,702
|
|
24
|
+
wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
|
|
25
|
+
wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
wedata/feature_store/feature_table_client/feature_table_client.py,sha256=foBPizFye0ITj1kZfVDlbMwcNJ7SJwMf1Wl9iLxG3Qc,17554
|
|
27
|
+
wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
+
wedata/feature_store/spark_client/spark_client.py,sha256=wLF7NhQ_hZ1c_7opGO81z52XV5TT8PElRQeONJhuhss,10430
|
|
29
|
+
wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
+
wedata/feature_store/training_set_client/training_set_client.py,sha256=SQ7m3JUo1HF74SV4OohOE2DRYMDMA2pJCeZ3L0ONb_w,23744
|
|
31
|
+
wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
|
+
wedata/feature_store/utils/common_utils.py,sha256=a7msUfGpJz19EDNZ_L_W_-kCIHy9GrOotF77LW6Vg1Y,10491
|
|
33
|
+
wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
|
|
34
|
+
wedata/feature_store/utils/feature_spec_utils.py,sha256=GFwc-WT6nN1tnal5F2c0bgVDRhH-YW58S0GaHBPZEsQ,11624
|
|
35
|
+
wedata/feature_store/utils/feature_utils.py,sha256=KKq28bVB_lCuhnR9Hk6JegJBOVgcelWlvrRM-F9onkA,2796
|
|
36
|
+
wedata/feature_store/utils/on_demand_utils.py,sha256=pazZRG5c0Se08MV_inBddIeX4Q9xlVN_H9SC_WK3xzs,4260
|
|
37
|
+
wedata/feature_store/utils/schema_utils.py,sha256=y6EYY1pUxjVg6MP4C7avdW8ZEBBaDo1YTV2CmPF4i8o,4491
|
|
38
|
+
wedata/feature_store/utils/signature_utils.py,sha256=SZFufd19m0jmGnOLmAl3JPKZC-qHq-wQezh6G7HOMfc,7773
|
|
39
|
+
wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZsa-DjxJeTc4KUadtg,6475
|
|
40
|
+
wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
|
|
41
|
+
wedata/feature_store/utils/uc_utils.py,sha256=KwkiymVx_8AxODUL1RQCIMGU28WM98_i6BmIuQqrZ6o,11431
|
|
42
|
+
wedata/feature_store/utils/validation_utils.py,sha256=lJe6HCg5v5CZxH_pvT-vpGhCpo66LT2erXraHE2T0iI,2584
|
|
43
|
+
tencent_wedata_feature_engineering_dev-0.2.0.dist-info/METADATA,sha256=X-U-df9gDfIyIr16cz8C-xIyfsC1JqdFiK5e6FAZtdI,592
|
|
44
|
+
tencent_wedata_feature_engineering_dev-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
45
|
+
tencent_wedata_feature_engineering_dev-0.2.0.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
|
|
46
|
+
tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD,,
|
wedata/feature_store/client.py
CHANGED
|
@@ -4,6 +4,7 @@ Wedata FeatureStoreClient Python实现
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
+
import os
|
|
7
8
|
from types import ModuleType
|
|
8
9
|
from typing import Union, List, Dict, Optional, Any
|
|
9
10
|
from pyspark.sql import DataFrame, SparkSession
|
|
@@ -13,9 +14,6 @@ import mlflow
|
|
|
13
14
|
from wedata.feature_store.constants.constants import FEATURE_STORE_CLIENT
|
|
14
15
|
|
|
15
16
|
from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
|
|
16
|
-
from wedata.feature_store.constants.engine_types import EngineTypes
|
|
17
|
-
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
18
|
-
from wedata.feature_store.cloud_sdk_client.models import TaskSchedulerConfiguration
|
|
19
17
|
from wedata.feature_store.entities.feature_function import FeatureFunction
|
|
20
18
|
from wedata.feature_store.entities.feature_lookup import FeatureLookup
|
|
21
19
|
from wedata.feature_store.entities.feature_table import FeatureTable
|
|
@@ -30,7 +28,7 @@ from wedata.feature_store.utils.feature_utils import format_feature_lookups_and_
|
|
|
30
28
|
class FeatureStoreClient:
|
|
31
29
|
"""特征存储统一客户端,提供特征全生命周期管理能力"""
|
|
32
30
|
|
|
33
|
-
def __init__(self, spark: Optional[SparkSession] = None
|
|
31
|
+
def __init__(self, spark: Optional[SparkSession] = None):
|
|
34
32
|
"""
|
|
35
33
|
:param spark: 已初始化的SparkSession对象
|
|
36
34
|
"""
|
|
@@ -38,20 +36,19 @@ class FeatureStoreClient:
|
|
|
38
36
|
spark = SparkSession.builder.getOrCreate()
|
|
39
37
|
self._spark = spark
|
|
40
38
|
self._spark_client = SparkClient(spark)
|
|
41
|
-
self._feature_table_client = FeatureTableClient(spark
|
|
42
|
-
cloud_secret_key=cloud_secret_key)
|
|
39
|
+
self._feature_table_client = FeatureTableClient(spark)
|
|
43
40
|
self._training_set_client = TrainingSetClient(self._spark_client)
|
|
41
|
+
self._init_catalog()
|
|
44
42
|
|
|
45
43
|
def create_table(
|
|
46
44
|
self,
|
|
47
45
|
name: str,
|
|
48
46
|
primary_keys: Union[str, List[str]],
|
|
49
|
-
timestamp_key: [str],
|
|
50
|
-
engine_type: [EngineTypes],
|
|
51
|
-
data_source_name: [str],
|
|
52
47
|
database_name: Optional[str] = None,
|
|
48
|
+
location: Optional[str] = None,
|
|
53
49
|
df: Optional[DataFrame] = None,
|
|
54
50
|
*,
|
|
51
|
+
timestamp_keys: Union[str, List[str], None] = None,
|
|
55
52
|
partition_columns: Union[str, List[str], None] = None,
|
|
56
53
|
schema: Optional[StructType] = None,
|
|
57
54
|
description: Optional[str] = None,
|
|
@@ -63,11 +60,10 @@ class FeatureStoreClient:
|
|
|
63
60
|
Args:
|
|
64
61
|
name: 特征表全称(格式:<table>)
|
|
65
62
|
primary_keys: 主键列名(支持复合主键)
|
|
66
|
-
timestamp_key: 时间戳键(用于时态特征)
|
|
67
|
-
engine_type: 引擎类型 wedata.feature_store.constants.engine_types.EngineTypes
|
|
68
|
-
data_source_name: 数据源名称
|
|
69
63
|
database_name: 数据库名
|
|
64
|
+
location: cos存储位置
|
|
70
65
|
df: 初始数据(可选,用于推断schema)
|
|
66
|
+
timestamp_keys: 时间戳键(用于时态特征)
|
|
71
67
|
partition_columns: 分区列(优化存储查询)
|
|
72
68
|
schema: 表结构定义(可选,当不提供df时必需)
|
|
73
69
|
description: 业务描述
|
|
@@ -83,56 +79,43 @@ class FeatureStoreClient:
|
|
|
83
79
|
return self._feature_table_client.create_table(
|
|
84
80
|
name=name,
|
|
85
81
|
primary_keys=primary_keys,
|
|
86
|
-
engine_type=engine_type,
|
|
87
82
|
database_name=database_name,
|
|
88
|
-
|
|
83
|
+
location=location,
|
|
89
84
|
df=df,
|
|
90
|
-
|
|
85
|
+
timestamp_keys=timestamp_keys,
|
|
91
86
|
partition_columns=partition_columns,
|
|
92
87
|
schema=schema,
|
|
93
88
|
description=description,
|
|
94
89
|
tags=tags
|
|
95
90
|
)
|
|
96
91
|
|
|
97
|
-
def register_table(self, name: str,
|
|
98
|
-
database_name: Optional[str] = None,
|
|
99
|
-
primary_keys: Union[str, List[str]] = None) -> DataFrame:
|
|
92
|
+
def register_table(self, name: str, database_name: Optional[str] = None) -> DataFrame:
|
|
100
93
|
"""
|
|
101
|
-
|
|
94
|
+
读取特征表数据
|
|
102
95
|
|
|
103
96
|
Args:
|
|
104
97
|
name: 特征表名称
|
|
105
98
|
database_name: 特征库名称
|
|
106
|
-
|
|
107
|
-
engine_type: 引擎类型 wedata.feature_store.constants.engine_types.EngineTypes
|
|
108
|
-
data_source_name: 数据源名称
|
|
109
|
-
primary_keys: 主键列名(支持复合主键)(仅当engine_type为EngineTypes.HIVE_ENGINE时有效)
|
|
99
|
+
|
|
110
100
|
Returns:
|
|
111
101
|
DataFrame: 包含特征表数据的DataFrame对象
|
|
112
102
|
"""
|
|
113
103
|
|
|
114
|
-
return self._feature_table_client.register_table(name, database_name
|
|
115
|
-
engine_type=engine_type, primary_keys=primary_keys, data_source_name=data_source_name)
|
|
104
|
+
return self._feature_table_client.register_table(name, database_name)
|
|
116
105
|
|
|
117
|
-
def read_table(self, name: str, database_name: Optional[str] = None
|
|
118
|
-
online_config: Optional[RedisStoreConfig] = None,
|
|
119
|
-
entity_row: Optional[List[Dict[str, Any]]] = None) -> DataFrame:
|
|
106
|
+
def read_table(self, name: str, database_name: Optional[str] = None) -> DataFrame:
|
|
120
107
|
"""
|
|
121
108
|
读取特征表数据
|
|
122
109
|
|
|
123
110
|
Args:
|
|
124
111
|
name: 特征表名称
|
|
125
112
|
database_name: 特征库名称
|
|
126
|
-
|
|
127
|
-
online_config: 在线特征表配置(仅当is_online为True时有效)
|
|
128
|
-
entity_row: 实体行数据(仅当is_online为True时有效)
|
|
129
|
-
[{primary_key1: [value1, value2]}, {primary_key2: [value1, value2]}]
|
|
113
|
+
|
|
130
114
|
Returns:
|
|
131
115
|
DataFrame: 包含特征表数据的DataFrame对象
|
|
132
116
|
"""
|
|
133
117
|
|
|
134
|
-
return self._feature_table_client.read_table(name
|
|
135
|
-
online_config=online_config, entity_row=entity_row)
|
|
118
|
+
return self._feature_table_client.read_table(name, database_name)
|
|
136
119
|
|
|
137
120
|
def get_table(self, name: str, database_name: Optional[str] = None) -> FeatureTable:
|
|
138
121
|
"""
|
|
@@ -154,6 +137,7 @@ class FeatureStoreClient:
|
|
|
154
137
|
Args:
|
|
155
138
|
name: 要删除的特征表名称
|
|
156
139
|
database_name: database name
|
|
140
|
+
|
|
157
141
|
Returns:
|
|
158
142
|
None
|
|
159
143
|
"""
|
|
@@ -288,7 +272,7 @@ class FeatureStoreClient:
|
|
|
288
272
|
)
|
|
289
273
|
|
|
290
274
|
def score_batch(
|
|
291
|
-
|
|
275
|
+
self, model_uri: str, df: DataFrame, result_type: str = "double"
|
|
292
276
|
) -> DataFrame:
|
|
293
277
|
"""
|
|
294
278
|
Evaluate the model on the provided :class:`DataFrame <pyspark.sql.DataFrame>`.
|
|
@@ -399,65 +383,17 @@ class FeatureStoreClient:
|
|
|
399
383
|
df=df,
|
|
400
384
|
result_type=result_type,
|
|
401
385
|
client_name=FEATURE_STORE_CLIENT,
|
|
402
|
-
timestamp_key=timestamp_key,
|
|
403
386
|
)
|
|
404
387
|
|
|
405
|
-
def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
|
|
406
|
-
database_name: Optional[str] = None,
|
|
407
|
-
is_cycle: bool = False, cycle_obj: TaskSchedulerConfiguration = None,
|
|
408
|
-
is_use_default_online: bool = True, online_config: RedisStoreConfig = None):
|
|
409
|
-
"""
|
|
410
|
-
Publish an offline feature table to an online feature table.
|
|
411
|
-
|
|
412
|
-
This method synchronizes the offline feature table data to online storage
|
|
413
|
-
for low-latency feature serving in real-time applications.
|
|
414
|
-
|
|
415
|
-
Args:
|
|
416
|
-
table_name: Name of the offline feature table
|
|
417
|
-
data_source_name: Name of the data source
|
|
418
|
-
cloud_secret_id: Cloud secret ID for authentication
|
|
419
|
-
cloud_secret_key: Cloud secret key for authentication
|
|
420
|
-
database_name: Database name (optional)
|
|
421
|
-
is_cycle: Whether to enable periodic publishing (default: False)
|
|
422
|
-
cycle_obj: Periodic task configuration object (required if is_cycle is True)
|
|
423
|
-
is_use_default_online: Whether to use default online storage configuration (default: True)
|
|
424
|
-
online_config: Custom online storage configuration (only effective when is_use_default_online is False)
|
|
425
|
-
|
|
426
|
-
Returns:
|
|
427
|
-
None
|
|
428
|
-
|
|
429
|
-
"""
|
|
430
|
-
return self._feature_table_client.publish_table(table_name=table_name, database_name=database_name,
|
|
431
|
-
data_source_name=data_source_name,
|
|
432
|
-
cloud_secret_key=cloud_secret_key,
|
|
433
|
-
cloud_secret_id=cloud_secret_id,
|
|
434
|
-
is_cycle=is_cycle, cycle_obj=cycle_obj,
|
|
435
|
-
is_use_default_online=is_use_default_online,
|
|
436
|
-
online_config=online_config)
|
|
437
|
-
|
|
438
|
-
def drop_online_table(self, table_name: str, online_config: RedisStoreConfig, database_name: Optional[str] = None):
|
|
439
|
-
"""
|
|
440
|
-
Drop an online feature table.
|
|
441
|
-
:param table_name: Name of the offline feature table
|
|
442
|
-
:param database_name: Database name (optional)
|
|
443
|
-
:param online_config: Custom online storage configuration (only effective when is_use_default_online is False)
|
|
444
|
-
:return:
|
|
445
|
-
"""
|
|
446
|
-
self._feature_table_client.drop_online_table(table_name=table_name, database_name=database_name, online_config=online_config)
|
|
447
|
-
|
|
448
|
-
def create_feature_spec(
|
|
449
|
-
self, name: str,
|
|
450
|
-
features: List[Union[FeatureLookup, FeatureFunction]],
|
|
451
|
-
exclude_columns: List[str]):
|
|
452
|
-
|
|
453
|
-
"""
|
|
454
|
-
创建特征配置文件
|
|
455
|
-
:arg name: 特征配置文件名称
|
|
456
|
-
:arg features: 特征列表,可以是FeatureLookup(特征查找)或FeatureFunction(特征函数)
|
|
457
|
-
:arg exclude_columns: 需要从最终特征集中排除的列名列表
|
|
458
|
-
"""
|
|
459
|
-
return self._training_set_client.create_feature_spec(name, features, self._spark_client, exclude_columns)
|
|
460
|
-
|
|
461
388
|
@property
|
|
462
389
|
def spark(self):
|
|
463
390
|
return self._spark
|
|
391
|
+
|
|
392
|
+
def _init_catalog(self):
|
|
393
|
+
"""关联catalog"""
|
|
394
|
+
qcloud_region = os.getenv("QCLOUD_REGION") or os.getenv("REGION") or os.getenv("KERNEL_COS_REGION") \
|
|
395
|
+
or os.getenv("NOTEBOOK_COS_REGION")
|
|
396
|
+
if qcloud_region:
|
|
397
|
+
mlflow.set_registry_uri(f"tclake:{qcloud_region}")
|
|
398
|
+
|
|
399
|
+
|
|
@@ -43,7 +43,7 @@ ML_MODEL = "MLmodel"
|
|
|
43
43
|
FEATURE_LOOKUP_CLIENT_PIP_PACKAGE = "tencent-wedata-feature-engineering-dev"
|
|
44
44
|
|
|
45
45
|
# 特征查找版本号
|
|
46
|
-
FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = "0.
|
|
46
|
+
FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = "0.2.0"
|
|
47
47
|
|
|
48
48
|
# 特征存储内部数据目录
|
|
49
49
|
FEATURE_STORE_INTERNAL_DATA_DIR = "_wedata_internal/"
|
|
@@ -53,7 +53,4 @@ WEDATA_DEFAULT_FEATURE_STORE_DATABASE = "WEDATA_DEFAULT_FEATURE_STORE_DATABASE"
|
|
|
53
53
|
FEATURE_TABLE_KEY = "wedata.feature_table"
|
|
54
54
|
FEATURE_TABLE_VALUE = "true"
|
|
55
55
|
|
|
56
|
-
FEATURE_TABLE_PROJECT = "wedata.feature_project_id"
|
|
57
|
-
FEATURE_TABLE_TIMESTAMP = "timestampKeys"
|
|
58
|
-
FEATURE_TABLE_BACKUP_PRIMARY_KEY = "primaryKeys" # 备用标识,主键
|
|
59
|
-
FEATURE_DLC_TABLE_PRIMARY_KEY = "dlc.ao.data.govern.sorted.keys"
|
|
56
|
+
FEATURE_TABLE_PROJECT = "wedata.feature_project_id"
|
|
@@ -4,7 +4,6 @@ import logging
|
|
|
4
4
|
from typing import Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from wedata.feature_store.utils import common_utils
|
|
7
|
-
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
8
7
|
|
|
9
8
|
_logger = logging.getLogger(__name__)
|
|
10
9
|
|
|
@@ -20,8 +19,6 @@ class FeatureLookup:
|
|
|
20
19
|
|
|
21
20
|
- table_name:特征表的名称。
|
|
22
21
|
- lookup_key:用于在特征表和训练集之间进行联接的键。lookup_key必须是训练集中的列。lookup_key的类型和顺序必须与特征表的主键匹配。
|
|
23
|
-
- is_online:如果为True,则会使用在线特征表。如果为False,则会使用离线特征表。默认值为False。
|
|
24
|
-
- online_config:如果is_online为True,则会使用此配置来配置在线特征表。默认值为None。
|
|
25
22
|
- feature_names:要从特征表中查找的特征的名称。如果您的模型需要主键作为特征,则可以将它们声明为独立的FeatureLookups。
|
|
26
23
|
- rename_outputs:如果提供,则会将特征重命名为 :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`返回的 :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` 中的特征。
|
|
27
24
|
- timestamp_lookup_key:用于在特征表和训练集之间进行联接的时间戳键。timestamp_lookup_key必须是训练集中的列。timestamp_lookup_key的类型必须与特征表的时间戳键的类型匹配。
|
|
@@ -49,8 +46,6 @@ class FeatureLookup:
|
|
|
49
46
|
table_name: str,
|
|
50
47
|
lookup_key: Union[str, List[str]],
|
|
51
48
|
*,
|
|
52
|
-
is_online: bool = False,
|
|
53
|
-
online_config: RedisStoreConfig = None,
|
|
54
49
|
feature_names: Union[str, List[str], None] = None,
|
|
55
50
|
rename_outputs: Optional[Dict[str, str]] = None,
|
|
56
51
|
timestamp_lookup_key: Optional[str] = None,
|
|
@@ -109,8 +104,6 @@ class FeatureLookup:
|
|
|
109
104
|
self._lookup_key = copy.copy(lookup_key)
|
|
110
105
|
self._timestamp_lookup_key = copy.copy(timestamp_lookup_key)
|
|
111
106
|
self._lookback_window = copy.copy(lookback_window)
|
|
112
|
-
self._is_online = is_online
|
|
113
|
-
self._online_config = online_config
|
|
114
107
|
|
|
115
108
|
self._rename_outputs = {}
|
|
116
109
|
if rename_outputs is not None:
|
|
@@ -156,16 +149,6 @@ class FeatureLookup:
|
|
|
156
149
|
"""A lookback window applied only for point-in-time lookups."""
|
|
157
150
|
return self._lookback_window
|
|
158
151
|
|
|
159
|
-
@property
|
|
160
|
-
def is_online(self):
|
|
161
|
-
"""Whether to use online feature tables."""
|
|
162
|
-
return self._is_online
|
|
163
|
-
|
|
164
|
-
@property
|
|
165
|
-
def online_config(self):
|
|
166
|
-
"""The online feature table configuration."""
|
|
167
|
-
return self._online_config
|
|
168
|
-
|
|
169
152
|
def _get_feature_names(self):
|
|
170
153
|
return self._feature_names
|
|
171
154
|
|
|
@@ -138,14 +138,14 @@ class FeatureSpec:
|
|
|
138
138
|
|
|
139
139
|
# function_infos should not be duplicated
|
|
140
140
|
common_utils.validate_strings_unique(
|
|
141
|
-
[function_info.
|
|
141
|
+
[function_info.udf_name for function_info in self.function_infos],
|
|
142
142
|
"Internal Error: Expect all udf_names in function_infos to be unique. Found duplicates {}",
|
|
143
143
|
)
|
|
144
144
|
|
|
145
145
|
# Unique UDF names in function_infos must match those in column_infos.
|
|
146
146
|
# No version check is required as both fields were added simultaneously in FeatureSpec v5.
|
|
147
147
|
unique_udf_names = set(
|
|
148
|
-
[function_info.
|
|
148
|
+
[function_info.udf_name for function_info in self.function_infos]
|
|
149
149
|
)
|
|
150
150
|
unique_column_udf_names = set(
|
|
151
151
|
[odci.udf_name for odci in self.on_demand_column_infos]
|
|
@@ -75,15 +75,11 @@ class FeatureTable:
|
|
|
75
75
|
格式化的字符串,包含表名、ID、描述、主键、分区列、特征数量、
|
|
76
76
|
时间戳键、创建时间、数据源数量和标签数量等信息
|
|
77
77
|
"""
|
|
78
|
-
if self.description and len(self.description) > 50:
|
|
79
|
-
desc = self.description[:50] + "..."
|
|
80
|
-
else:
|
|
81
|
-
desc = self.description
|
|
82
78
|
return (
|
|
83
79
|
f"FeatureTable(\n"
|
|
84
80
|
f" name='{self.name}',\n"
|
|
85
81
|
f" table_id='{self.table_id}',\n"
|
|
86
|
-
f" description='{
|
|
82
|
+
f" description='{self.description[:50]}{'...' if len(self.description) > 50 else ''}',\n"
|
|
87
83
|
f" primary_keys={self.primary_keys},\n"
|
|
88
84
|
f" partition_columns={self.partition_columns},\n"
|
|
89
85
|
f" features={len(self.features)},\n"
|
|
@@ -1,6 +1,9 @@
|
|
|
1
|
-
|
|
1
|
+
from collections import defaultdict
|
|
2
2
|
from typing import List, Optional
|
|
3
3
|
|
|
4
|
+
from pyspark.sql import Column, DataFrame
|
|
5
|
+
from pyspark.sql.functions import isnull, when
|
|
6
|
+
from pyspark.sql.types import StringType, StructField, StructType
|
|
4
7
|
from wedata.feature_store.common.protos import feature_store_pb2
|
|
5
8
|
|
|
6
9
|
class FunctionParameterInfo():
|