tencent-wedata-feature-engineering-dev 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.
- tencent-wedata-feature-engineering-dev-0.1.0/PKG-INFO +13 -0
- tencent-wedata-feature-engineering-dev-0.1.0/README.md +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/setup.cfg +4 -0
- tencent-wedata-feature-engineering-dev-0.1.0/setup.py +33 -0
- tencent-wedata-feature-engineering-dev-0.1.0/tencent_wedata_feature_engineering_dev.egg-info/PKG-INFO +13 -0
- tencent-wedata-feature-engineering-dev-0.1.0/tencent_wedata_feature_engineering_dev.egg-info/SOURCES.txt +67 -0
- tencent-wedata-feature-engineering-dev-0.1.0/tencent_wedata_feature_engineering_dev.egg-info/dependency_links.txt +1 -0
- tencent-wedata-feature-engineering-dev-0.1.0/tencent_wedata_feature_engineering_dev.egg-info/requires.txt +5 -0
- tencent-wedata-feature-engineering-dev-0.1.0/tencent_wedata_feature_engineering_dev.egg-info/top_level.txt +1 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/__init__.py +9 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/client.py +462 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/cloud_sdk_client/client.py +86 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/cloud_sdk_client/models.py +686 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/cloud_sdk_client/utils.py +32 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/common/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/common/protos/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/common/store_config/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/common/store_config/redis.py +48 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/constants/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/constants/constants.py +59 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/constants/engine_types.py +34 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/column_info.py +138 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/environment_variables.py +55 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature.py +53 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature_column_info.py +72 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature_function.py +55 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature_lookup.py +200 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature_spec.py +489 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature_spec_constants.py +25 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature_table.py +111 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/feature_table_info.py +49 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/function_info.py +90 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/on_demand_column_info.py +57 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/source_data_column_info.py +24 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/entities/training_set.py +135 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/feast_client/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/feast_client/feast_client.py +482 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/feature_table_client/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/mlflow_model.py +17 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/spark_client/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/spark_client/spark_client.py +289 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/training_set_client/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/training_set_client/training_set_client.py +572 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/common_utils.py +352 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/env_utils.py +86 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/feature_lookup_utils.py +564 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/feature_spec_utils.py +286 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/feature_utils.py +73 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/on_demand_utils.py +107 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/schema_utils.py +117 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/signature_utils.py +202 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/topological_sort.py +158 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/training_set_utils.py +579 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/uc_utils.py +296 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/utils/validation_utils.py +79 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/__init__.py +0 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/interpol.py +448 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/intervals.py +1331 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/io.py +61 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/ml.py +129 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/resample.py +318 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/tsdf.py +1720 -0
- tencent-wedata-feature-engineering-dev-0.1.0/wedata/tempo/utils.py +254 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: tencent-wedata-feature-engineering-dev
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Wedata Feature Engineering Library Development
|
|
5
|
+
Home-page:
|
|
6
|
+
Author: meahqian
|
|
7
|
+
Author-email:
|
|
8
|
+
License: Apache 2.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.7
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
File without changes
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import os

from setuptools import setup, find_packages

# Resolve every path relative to this file so the build does not depend on the
# current working directory (the README used to be opened relative to the CWD).
HERE = os.path.abspath(os.path.dirname(__file__))

# Dynamically read version from wedata/__init__.py
# NOTE(review): exec() runs the whole package __init__ at build time; this only
# works while that file stays free of imports that need installed dependencies.
version = {}
with open(os.path.join(HERE, 'wedata', '__init__.py'), encoding='utf-8') as f:
    exec(f.read(), version)

# Read the long description through a context manager so the handle is closed.
with open(os.path.join(HERE, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name="tencent-wedata-feature-engineering-dev",
    version=version["__version__"],
    packages=find_packages(include=['wedata', 'wedata.*']),
    # NOTE(review): wedata/feature_store/client.py imports pyspark and mlflow,
    # which are not declared here — presumably provided by the runtime; confirm.
    install_requires=[
        'pandas>=1.0.0',
        'feast[redis]==0.49.0',
        'grpcio==1.74.0',
        'tencentcloud-sdk-python',
        'ipython',
    ],
    # NOTE(review): the pinned feast/grpcio releases likely require a newer
    # Python than 3.7 — confirm and raise this bound if so.
    python_requires='>=3.7',
    author="meahqian",
    author_email="",
    description="Wedata Feature Engineering Library Development",
    long_description=long_description,
    long_description_content_type="text/markdown",
    license="Apache 2.0",
    url="",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: tencent-wedata-feature-engineering-dev
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Wedata Feature Engineering Library Development
|
|
5
|
+
Home-page:
|
|
6
|
+
Author: meahqian
|
|
7
|
+
Author-email:
|
|
8
|
+
License: Apache 2.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.7
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
tencent_wedata_feature_engineering_dev.egg-info/PKG-INFO
|
|
4
|
+
tencent_wedata_feature_engineering_dev.egg-info/SOURCES.txt
|
|
5
|
+
tencent_wedata_feature_engineering_dev.egg-info/dependency_links.txt
|
|
6
|
+
tencent_wedata_feature_engineering_dev.egg-info/requires.txt
|
|
7
|
+
tencent_wedata_feature_engineering_dev.egg-info/top_level.txt
|
|
8
|
+
wedata/__init__.py
|
|
9
|
+
wedata/feature_store/__init__.py
|
|
10
|
+
wedata/feature_store/client.py
|
|
11
|
+
wedata/feature_store/mlflow_model.py
|
|
12
|
+
wedata/feature_store/cloud_sdk_client/__init__.py
|
|
13
|
+
wedata/feature_store/cloud_sdk_client/client.py
|
|
14
|
+
wedata/feature_store/cloud_sdk_client/models.py
|
|
15
|
+
wedata/feature_store/cloud_sdk_client/utils.py
|
|
16
|
+
wedata/feature_store/common/__init__.py
|
|
17
|
+
wedata/feature_store/common/protos/__init__.py
|
|
18
|
+
wedata/feature_store/common/protos/feature_store_pb2.py
|
|
19
|
+
wedata/feature_store/common/store_config/__init__.py
|
|
20
|
+
wedata/feature_store/common/store_config/redis.py
|
|
21
|
+
wedata/feature_store/constants/__init__.py
|
|
22
|
+
wedata/feature_store/constants/constants.py
|
|
23
|
+
wedata/feature_store/constants/engine_types.py
|
|
24
|
+
wedata/feature_store/entities/__init__.py
|
|
25
|
+
wedata/feature_store/entities/column_info.py
|
|
26
|
+
wedata/feature_store/entities/environment_variables.py
|
|
27
|
+
wedata/feature_store/entities/feature.py
|
|
28
|
+
wedata/feature_store/entities/feature_column_info.py
|
|
29
|
+
wedata/feature_store/entities/feature_function.py
|
|
30
|
+
wedata/feature_store/entities/feature_lookup.py
|
|
31
|
+
wedata/feature_store/entities/feature_spec.py
|
|
32
|
+
wedata/feature_store/entities/feature_spec_constants.py
|
|
33
|
+
wedata/feature_store/entities/feature_table.py
|
|
34
|
+
wedata/feature_store/entities/feature_table_info.py
|
|
35
|
+
wedata/feature_store/entities/function_info.py
|
|
36
|
+
wedata/feature_store/entities/on_demand_column_info.py
|
|
37
|
+
wedata/feature_store/entities/source_data_column_info.py
|
|
38
|
+
wedata/feature_store/entities/training_set.py
|
|
39
|
+
wedata/feature_store/feast_client/__init__.py
|
|
40
|
+
wedata/feature_store/feast_client/feast_client.py
|
|
41
|
+
wedata/feature_store/feature_table_client/__init__.py
|
|
42
|
+
wedata/feature_store/feature_table_client/feature_table_client.py
|
|
43
|
+
wedata/feature_store/spark_client/__init__.py
|
|
44
|
+
wedata/feature_store/spark_client/spark_client.py
|
|
45
|
+
wedata/feature_store/training_set_client/__init__.py
|
|
46
|
+
wedata/feature_store/training_set_client/training_set_client.py
|
|
47
|
+
wedata/feature_store/utils/__init__.py
|
|
48
|
+
wedata/feature_store/utils/common_utils.py
|
|
49
|
+
wedata/feature_store/utils/env_utils.py
|
|
50
|
+
wedata/feature_store/utils/feature_lookup_utils.py
|
|
51
|
+
wedata/feature_store/utils/feature_spec_utils.py
|
|
52
|
+
wedata/feature_store/utils/feature_utils.py
|
|
53
|
+
wedata/feature_store/utils/on_demand_utils.py
|
|
54
|
+
wedata/feature_store/utils/schema_utils.py
|
|
55
|
+
wedata/feature_store/utils/signature_utils.py
|
|
56
|
+
wedata/feature_store/utils/topological_sort.py
|
|
57
|
+
wedata/feature_store/utils/training_set_utils.py
|
|
58
|
+
wedata/feature_store/utils/uc_utils.py
|
|
59
|
+
wedata/feature_store/utils/validation_utils.py
|
|
60
|
+
wedata/tempo/__init__.py
|
|
61
|
+
wedata/tempo/interpol.py
|
|
62
|
+
wedata/tempo/intervals.py
|
|
63
|
+
wedata/tempo/io.py
|
|
64
|
+
wedata/tempo/ml.py
|
|
65
|
+
wedata/tempo/resample.py
|
|
66
|
+
wedata/tempo/tsdf.py
|
|
67
|
+
wedata/tempo/utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
wedata
|
|
File without changes
|
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wedata FeatureStoreClient Python实现
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from types import ModuleType
|
|
8
|
+
from typing import Union, List, Dict, Optional, Any
|
|
9
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
10
|
+
from pyspark.sql.streaming import StreamingQuery
|
|
11
|
+
from pyspark.sql.types import StructType
|
|
12
|
+
import mlflow
|
|
13
|
+
from wedata.feature_store.constants.constants import FEATURE_STORE_CLIENT
|
|
14
|
+
|
|
15
|
+
from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
|
|
16
|
+
from wedata.feature_store.constants.engine_types import EngineTypes
|
|
17
|
+
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
18
|
+
from wedata.feature_store.cloud_sdk_client.models import TaskSchedulerConfiguration
|
|
19
|
+
from wedata.feature_store.entities.feature_function import FeatureFunction
|
|
20
|
+
from wedata.feature_store.entities.feature_lookup import FeatureLookup
|
|
21
|
+
from wedata.feature_store.entities.feature_table import FeatureTable
|
|
22
|
+
from wedata.feature_store.entities.training_set import TrainingSet
|
|
23
|
+
from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient
|
|
24
|
+
from wedata.feature_store.spark_client.spark_client import SparkClient
|
|
25
|
+
from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient
|
|
26
|
+
from wedata.feature_store.utils import common_utils
|
|
27
|
+
from wedata.feature_store.utils.feature_utils import format_feature_lookups_and_functions
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class FeatureStoreClient:
    """Unified feature store client covering the full feature lifecycle.

    Every public method is a thin facade that forwards to one of three
    collaborators created in ``__init__``: ``SparkClient``,
    ``FeatureTableClient`` and ``TrainingSetClient``.
    """

    def __init__(self, spark: Optional[SparkSession] = None, cloud_secret_id: Optional[str] = None,
                 cloud_secret_key: Optional[str] = None):
        """
        Build the client and its collaborators.

        Args:
            spark: An already-initialized SparkSession. When ``None``, the session
                returned by ``SparkSession.builder.getOrCreate()`` is used.
            cloud_secret_id: Cloud secret ID, forwarded to ``FeatureTableClient``.
            cloud_secret_key: Cloud secret key, forwarded to ``FeatureTableClient``.
        """
        if spark is None:
            spark = SparkSession.builder.getOrCreate()
        self._spark = spark
        self._spark_client = SparkClient(spark)
        self._feature_table_client = FeatureTableClient(spark, cloud_secret_id=cloud_secret_id,
                                                        cloud_secret_key=cloud_secret_key)
        self._training_set_client = TrainingSetClient(self._spark_client)

    def create_table(
            self,
            name: str,
            primary_keys: Union[str, List[str]],
            timestamp_key: str,
            engine_type: EngineTypes,
            data_source_name: str,
            database_name: Optional[str] = None,
            df: Optional[DataFrame] = None,
            *,
            partition_columns: Union[str, List[str], None] = None,
            schema: Optional[StructType] = None,
            description: Optional[str] = None,
            tags: Optional[Dict[str, str]] = None
    ) -> FeatureTable:
        """
        Create a feature table (supports batch and streaming writes).

        Args:
            name: Full name of the feature table (format: <table>)
            primary_keys: Primary key column name(s) (composite keys are supported)
            timestamp_key: Timestamp key (used for temporal features)
            engine_type: Engine type, see wedata.feature_store.constants.engine_types.EngineTypes
            data_source_name: Name of the data source
            database_name: Name of the database
            df: Initial data (optional, used to infer the schema)
            partition_columns: Partition columns (to optimize storage and queries)
            schema: Table schema (optional; required when ``df`` is not provided)
            description: Business description
            tags: Business tags

        Returns:
            The created FeatureTable instance

        Raises:
            ValueError: When the schema does not match the data
        """

        return self._feature_table_client.create_table(
            name=name,
            primary_keys=primary_keys,
            engine_type=engine_type,
            database_name=database_name,
            data_source_name=data_source_name,
            df=df,
            timestamp_key=timestamp_key,
            partition_columns=partition_columns,
            schema=schema,
            description=description,
            tags=tags
        )

    def register_table(self, name: str, timestamp_key: str, engine_type: EngineTypes, data_source_name: str,
                       database_name: Optional[str] = None,
                       primary_keys: Union[str, List[str], None] = None) -> DataFrame:
        """
        Register an existing regular table as a feature table and return its data.

        Args:
            name: Name of the feature table
            database_name: Name of the feature database
            timestamp_key: Timestamp key (used later for offline/online feature synchronization)
            engine_type: Engine type, see wedata.feature_store.constants.engine_types.EngineTypes
            data_source_name: Name of the data source
            primary_keys: Primary key column name(s) (composite keys are supported);
                only effective when engine_type is EngineTypes.HIVE_ENGINE

        Returns:
            DataFrame: DataFrame containing the feature table data
        """

        # name and database_name are passed positionally, as the underlying client expects.
        return self._feature_table_client.register_table(
            name, database_name, timestamp_key=timestamp_key,
            engine_type=engine_type, primary_keys=primary_keys, data_source_name=data_source_name)

    def read_table(self, name: str, database_name: Optional[str] = None, is_online: bool = False,
                   online_config: Optional[RedisStoreConfig] = None,
                   entity_row: Optional[List[Dict[str, Any]]] = None) -> DataFrame:
        """
        Read the data of a feature table.

        Args:
            name: Name of the feature table
            database_name: Name of the feature database
            is_online: Whether to read the online feature table (offline by default)
            online_config: Online feature table configuration (only effective when is_online is True)
            entity_row: Entity rows (only effective when is_online is True), e.g.
                [{primary_key1: [value1, value2]}, {primary_key2: [value1, value2]}]

        Returns:
            DataFrame: DataFrame containing the feature table data
        """

        return self._feature_table_client.read_table(name=name, database_name=database_name, is_online=is_online,
                                                     online_config=online_config, entity_row=entity_row)

    def get_table(self, name: str, database_name: Optional[str] = None) -> FeatureTable:
        """
        Get the metadata of a feature table.

        Args:
            name: Name of the feature table
            database_name: Name of the feature database

        Returns:
            FeatureTable: FeatureTable object holding the feature table metadata
        """

        return self._feature_table_client.get_table(name, self._spark_client, database_name)

    def drop_table(self, name: str, database_name: Optional[str] = None) -> None:
        """
        Drop a feature table.

        Args:
            name: Name of the feature table to drop
            database_name: Name of the database

        Returns:
            None
        """

        return self._feature_table_client.drop_table(name, database_name)

    def write_table(
            self,
            name: str,
            df: DataFrame,
            database_name: Optional[str] = None,
            mode: Optional[str] = APPEND,
            checkpoint_location: Optional[str] = None,
            trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
    ) -> Optional[StreamingQuery]:
        """
        Write data to a feature table (supports batch and streaming processing).

        Args:
            name: Name of the feature table
            df: DataFrame holding the data to write
            database_name: Name of the feature database
            mode: Write mode (append by default)
            checkpoint_location: Checkpoint location for streaming writes (optional)
            trigger: Trigger configuration for streaming writes (system preset by default)

        Returns:
            A StreamingQuery object for a streaming write, otherwise None
        """

        return self._feature_table_client.write_table(
            name=name,
            df=df,
            database_name=database_name,
            mode=mode,
            checkpoint_location=checkpoint_location,
            trigger=trigger,
        )

    def create_training_set(
            self,
            df: DataFrame,
            feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
            label: Union[str, List[str], None],
            exclude_columns: Optional[List[str]] = None,
            database_name: Optional[str] = None,
            **kwargs,
    ) -> TrainingSet:
        """
        Create a training set.

        Args:
            df: Base data
            feature_lookups: List of feature lookups (FeatureLookup and/or FeatureFunction)
            label: Label column name(s)
            exclude_columns: Column names to exclude
            database_name: Name of the database
            **kwargs: Forwarded unchanged to the training set client

        Returns:
            The created TrainingSet instance

        Raises:
            ValueError: When a FeatureLookup does not specify a table_name
        """

        if exclude_columns is None:
            exclude_columns = []

        # For each FeatureLookup, validate its table_name and expand it to the full table name.
        # NOTE(review): this rewrites table_name on the caller's FeatureLookup objects in place,
        # so reusing the same objects in a second call presumably re-validates an already
        # qualified name — confirm against validate_table_name/build_full_table_name.
        for feature in feature_lookups:
            if isinstance(feature, FeatureLookup):
                if not feature.table_name:
                    raise ValueError("FeatureLookup must specify a table_name")
                # First check that the table name format is legal.
                common_utils.validate_table_name(feature.table_name)
                # Then build the full table name and assign it back to the FeatureLookup.
                feature.table_name = common_utils.build_full_table_name(feature.table_name, database_name)

        features = format_feature_lookups_and_functions(self._spark_client, feature_lookups)

        return self._training_set_client.create_training_set_from_feature_lookups(
            df=df,
            feature_lookups=features,
            label=label,
            exclude_columns=exclude_columns,
            **kwargs
        )

    def log_model(
            self,
            model: Any,
            artifact_path: str,
            *,
            flavor: ModuleType,
            training_set: Optional[TrainingSet] = None,
            registered_model_name: Optional[str] = None,
            model_registry_uri: Optional[str] = None,
            # NOTE(review): this default is read from a private MLflow module and may move between releases.
            await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
            infer_input_example: bool = False,
            **kwargs,
    ) -> None:
        """
        Log an MLflow model together with its feature lookup information.

        Note: the model must be trained on the DataFrame returned by TrainingSet.load_df;
        any modification of that DataFrame (e.g. normalization, added columns) will not
        be applied at inference time.

        Args:
            model: Model object to log
            artifact_path: Path under which the model is stored
            flavor: MLflow flavor module (e.g. mlflow.sklearn)
            training_set: TrainingSet used to train the model (optional)
            registered_model_name: Name under which to register the model (optional)
            model_registry_uri: Address of the model registry (optional)
            await_registration_for: Seconds to wait for model registration to complete
                (defaults to MLflow's DEFAULT_AWAIT_MAX_SLEEP_SECONDS)
            infer_input_example: Whether to record an input example automatically (default False)
            **kwargs: Forwarded unchanged to the training set client

        Returns:
            None
        """

        self._training_set_client.log_model(
            model=model,
            artifact_path=artifact_path,
            flavor=flavor,
            training_set=training_set,
            registered_model_name=registered_model_name,
            model_registry_uri=model_registry_uri,
            await_registration_for=await_registration_for,
            infer_input_example=infer_input_example,
            **kwargs
        )

    def score_batch(
            self, model_uri: str, df: DataFrame, result_type: str = "double"
    ) -> DataFrame:
        """
        Evaluate the model on the provided :class:`DataFrame <pyspark.sql.DataFrame>`.

        Additional features required for
        model evaluation will be automatically retrieved from :mod:`Feature Store <wedata.feature_store.client>`.

        .. todo::

            [ML-15539]: Replace the bitly URL in doc string

        The model must have been logged with :meth:`.FeatureStoreClient.log_model`,
        which packages the model with feature metadata. Unless present in ``df``,
        these features will be looked up from :mod:`Feature Store <wedata.feature_store.client>` and joined with ``df``
        prior to scoring the model.

        If a feature is included in ``df``, the provided feature values will be used rather
        than those stored in :mod:`Feature Store <wedata.feature_store.client>`.

        For example, if a model is trained on two features ``account_creation_date`` and
        ``num_lifetime_purchases``, as in:

        .. code-block:: python

            feature_lookups = [
                FeatureLookup(
                    table_name = 'trust_and_safety.customer_features',
                    feature_name = 'account_creation_date',
                    lookup_key = 'customer_id',
                ),
                FeatureLookup(
                    table_name = 'trust_and_safety.customer_features',
                    feature_name = 'num_lifetime_purchases',
                    lookup_key = 'customer_id'
                ),
            ]

            with mlflow.start_run():
                training_set = fs.create_training_set(
                    df,
                    feature_lookups = feature_lookups,
                    label = 'is_banned',
                    exclude_columns = ['customer_id']
                )
                ...
                fs.log_model(
                    model,
                    "model",
                    flavor=mlflow.sklearn,
                    training_set=training_set,
                    registered_model_name="example_model"
                )

        Then at inference time, the caller of :meth:`FeatureStoreClient.score_batch` must pass
        a :class:`DataFrame <pyspark.sql.DataFrame>` that includes ``customer_id``, the ``lookup_key`` specified in the
        ``FeatureLookups`` of the :mod:`training_set <wedata.feature_store.entities.training_set>`.
        If the :class:`DataFrame <pyspark.sql.DataFrame>` contains a column
        ``account_creation_date``, the values of this column will be used
        in lieu of those in :mod:`Feature Store <wedata.feature_store.client>`. As in:

        .. code-block:: python

            # batch_df has columns ['customer_id', 'account_creation_date']
            predictions = fs.score_batch(
                'models:/example_model/1',
                batch_df
            )

        :param model_uri: The location, in URI format, of the MLflow model logged using
          :meth:`FeatureStoreClient.log_model`. One of:

            * ``runs:/<mlflow_run_id>/run-relative/path/to/model``

            * ``models:/<model_name>/<model_version>``

            * ``models:/<model_name>/<stage>``

          For more information about URI schemes, see
          `Referencing Artifacts <https://bit.ly/3wnrseE>`_.
        :param df: The :class:`DataFrame <pyspark.sql.DataFrame>` to score the model on. :mod:`Feature Store <wedata.feature_store.client>` features will be joined with
          ``df`` prior to scoring the model. ``df`` must:

              1. Contain columns for lookup keys required to join feature data from Feature
              Store, as specified in the ``feature_spec.yaml`` artifact.

              2. Contain columns for all source keys required to score the model, as specified in
              the ``feature_spec.yaml`` artifact.

              3. Not contain a column ``prediction``, which is reserved for the model's predictions.
              ``df`` may contain additional columns.

          Streaming DataFrames are not supported.

        :param result_type: The return type of the model.
           See :func:`mlflow.pyfunc.spark_udf` result_type.
        :return: A :class:`DataFrame <pyspark.sql.DataFrame>`
           containing:

            1. All columns of ``df``.

            2. All feature values retrieved from Feature Store.

            3. A column ``prediction`` containing the output of the model.

        """
        return self._training_set_client.score_batch(
            model_uri=model_uri,
            df=df,
            result_type=result_type,
            client_name=FEATURE_STORE_CLIENT,
        )

    def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
                      database_name: Optional[str] = None,
                      is_cycle: bool = False, cycle_obj: Optional[TaskSchedulerConfiguration] = None,
                      is_use_default_online: bool = True, online_config: Optional[RedisStoreConfig] = None):
        """
        Publish an offline feature table to an online feature table.

        This method synchronizes the offline feature table data to online storage
        for low-latency feature serving in real-time applications.

        Args:
            table_name: Name of the offline feature table
            data_source_name: Name of the data source
            cloud_secret_id: Cloud secret ID for authentication
            cloud_secret_key: Cloud secret key for authentication
            database_name: Database name (optional)
            is_cycle: Whether to enable periodic publishing (default: False)
            cycle_obj: Periodic task configuration object (required if is_cycle is True)
            is_use_default_online: Whether to use default online storage configuration (default: True)
            online_config: Custom online storage configuration (only effective when is_use_default_online is False)

        Returns:
            Whatever FeatureTableClient.publish_table returns (documented as None)

        """
        return self._feature_table_client.publish_table(table_name=table_name, database_name=database_name,
                                                        data_source_name=data_source_name,
                                                        cloud_secret_key=cloud_secret_key,
                                                        cloud_secret_id=cloud_secret_id,
                                                        is_cycle=is_cycle, cycle_obj=cycle_obj,
                                                        is_use_default_online=is_use_default_online,
                                                        online_config=online_config)

    def drop_online_table(self, table_name: str, online_config: RedisStoreConfig,
                          database_name: Optional[str] = None) -> None:
        """
        Drop an online feature table.

        Args:
            table_name: Name of the offline feature table
            online_config: Redis online storage configuration, forwarded to the feature table client
            database_name: Database name (optional)

        Returns:
            None
        """
        self._feature_table_client.drop_online_table(table_name=table_name, database_name=database_name,
                                                     online_config=online_config)

    def create_feature_spec(
            self, name: str,
            features: List[Union[FeatureLookup, FeatureFunction]],
            exclude_columns: List[str]):
        """
        Create a feature spec.

        Args:
            name: Name of the feature spec
            features: List of features; each item is a FeatureLookup or a FeatureFunction
            exclude_columns: Column names to exclude from the final feature set

        Returns:
            Whatever TrainingSetClient.create_feature_spec returns
        """
        return self._training_set_client.create_feature_spec(name, features, self._spark_client, exclude_columns)

    @property
    def spark(self):
        """The SparkSession this client was built with (or obtained via getOrCreate)."""
        return self._spark
|
tencent-wedata-feature-engineering-dev-0.1.0/wedata/feature_store/cloud_sdk_client/__init__.py
ADDED
|
File without changes
|