wedata-feature-engineering 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wedata_feature_engineering-0.1.0/PKG-INFO +13 -0
- wedata_feature_engineering-0.1.0/README.md +0 -0
- wedata_feature_engineering-0.1.0/feature_store/__init__.py +6 -0
- wedata_feature_engineering-0.1.0/feature_store/client.py +169 -0
- wedata_feature_engineering-0.1.0/setup.cfg +4 -0
- wedata_feature_engineering-0.1.0/setup.py +25 -0
- wedata_feature_engineering-0.1.0/wedata_feature_engineering.egg-info/PKG-INFO +13 -0
- wedata_feature_engineering-0.1.0/wedata_feature_engineering.egg-info/SOURCES.txt +9 -0
- wedata_feature_engineering-0.1.0/wedata_feature_engineering.egg-info/dependency_links.txt +1 -0
- wedata_feature_engineering-0.1.0/wedata_feature_engineering.egg-info/requires.txt +3 -0
- wedata_feature_engineering-0.1.0/wedata_feature_engineering.egg-info/top_level.txt +1 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: wedata_feature_engineering
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Wedata Feature Engineering Library
|
5
|
+
Home-page:
|
6
|
+
Author: meahqian
|
7
|
+
Author-email:
|
8
|
+
License: Apache 2.0
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
11
|
+
Classifier: Operating System :: OS Independent
|
12
|
+
Requires-Python: >=3.7
|
13
|
+
Description-Content-Type: text/markdown
|
File without changes
|
@@ -0,0 +1,169 @@
|
|
1
|
+
"""
|
2
|
+
Wedata FeatureStoreClient Python实现
|
3
|
+
"""
|
4
|
+
|
5
|
+
from __future__ import annotations
|
6
|
+
from typing import Union, List, Dict, Optional, Any
|
7
|
+
from pyspark.sql import DataFrame, SparkSession
|
8
|
+
from pyspark.sql.streaming import StreamingQuery
|
9
|
+
from pyspark.sql.types import StructType
|
10
|
+
|
11
|
+
from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
|
12
|
+
from feature_store.entities.feature_function import FeatureFunction
|
13
|
+
from feature_store.entities.feature_lookup import FeatureLookup
|
14
|
+
from feature_store.entities.training_set import TrainingSet
|
15
|
+
from feature_store.feature_table_client.feature_table_client import FeatureTableClient
|
16
|
+
from feature_store.spark_client.spark_client import SparkClient
|
17
|
+
from feature_store.training_set_client.training_set_client import TrainingSetClient
|
18
|
+
from feature_store.utils.feature_utils import format_feature_lookups_and_functions
|
19
|
+
|
20
|
+
|
21
|
+
class FeatureStoreClient:
    """Unified feature store client for managing the feature lifecycle.

    This class is a thin facade: table operations are forwarded to a
    ``FeatureTableClient`` and training-set creation is forwarded to a
    ``TrainingSetClient``; no feature logic is implemented here.
    """

    def __init__(self, spark: SparkSession):
        """Wire up the helper clients around an existing Spark session.

        :param spark: an already initialized SparkSession object
        """
        self._spark = spark
        self._spark_client = SparkClient(spark)
        self._feature_table_client = FeatureTableClient(spark)

    def create_table(
        self,
        name: str,
        primary_keys: Union[str, List[str]],
        df: Optional[DataFrame] = None,
        *,
        timestamp_keys: Union[str, List[str], None] = None,
        partition_columns: Union[str, List[str], None] = None,
        schema: Optional[StructType] = None,
        description: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None
    ):
        """Create a feature table by delegating to the feature table client.

        Every argument is forwarded unchanged, by keyword, to
        ``FeatureTableClient.create_table``.

        Args:
            name: Full name of the feature table (format: <table>).
            primary_keys: Primary key column name(s); composite keys are
                given as a list.
            df: Optional initial data (used to infer the schema, according
                to the original documentation).
            timestamp_keys: Timestamp key(s), used for time-aware features.
            partition_columns: Partition column(s).
            schema: Optional ``StructType`` passed through as-is.
                NOTE(review): presumably an explicit table schema; confirm
                against ``FeatureTableClient.create_table``.
            description: Business description of the table.
            tags: Business tags.

        Returns:
            Whatever the feature table client returns (documented by the
            original author as a FeatureTable instance).

        Raises:
            ValueError: When the schema does not match the data. This is
                raised inside the delegated call, not by this wrapper.
        """
        table_client = self._feature_table_client
        created_table = table_client.create_table(
            name=name,
            primary_keys=primary_keys,
            df=df,
            timestamp_keys=timestamp_keys,
            partition_columns=partition_columns,
            schema=schema,
            description=description,
            tags=tags,
        )
        return created_table

    def read_table(self, name: str) -> DataFrame:
        """Read the data of a feature table.

        Args:
            name: Name of the feature table.

        Returns:
            The DataFrame produced by ``FeatureTableClient.read_table``.
        """
        table_client = self._feature_table_client
        return table_client.read_table(name)

    def drop_table(self, name: str) -> None:
        """Drop a feature table.

        Args:
            name: Name of the feature table to delete.

        Returns:
            The result of ``FeatureTableClient.drop_table`` is passed
            through (annotated as None).
        """
        table_client = self._feature_table_client
        return table_client.drop_table(name)

    def create_training_set(
        self,
        df: DataFrame,
        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
        label: Union[str, List[str], None],
        exclude_columns: Optional[List[str]] = None,
        **kwargs,
    ) -> TrainingSet:
        """Create a training set from base data and feature lookups.

        Args:
            df: Base data.
            feature_lookups: Feature lookups and/or feature functions.
            label: Label column name(s).
            exclude_columns: Column names to exclude; ``None`` is replaced
                by an empty list before delegation.
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``TrainingSetClient.create_training_set_from_feature_lookups``.

        Returns:
            The TrainingSet built by the training set client.
        """
        columns_to_exclude = [] if exclude_columns is None else exclude_columns

        # Normalise the lookups/functions first; the training set client
        # receives the formatted list rather than the caller's raw input.
        formatted_features = format_feature_lookups_and_functions(
            self._spark_client, feature_lookups
        )

        # A fresh TrainingSetClient is created for every call.
        builder = TrainingSetClient(self._spark_client)
        return builder.create_training_set_from_feature_lookups(
            df=df,
            feature_lookups=formatted_features,
            label=label,
            exclude_columns=columns_to_exclude,
            **kwargs
        )

    def write_table(
        self,
        name: str,
        df: DataFrame,
        mode: str = APPEND,
        checkpoint_location: Optional[str] = None,
        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
    ) -> Optional[StreamingQuery]:
        """Write data to a feature table (batch or streaming).

        All arguments are forwarded by keyword to
        ``FeatureTableClient.write_table``.

        Args:
            name: Name of the feature table.
            df: DataFrame holding the data to write.
            mode: Write mode (defaults to ``APPEND``).
            checkpoint_location: Optional checkpoint location for
                streaming writes.
            trigger: Trigger configuration for streaming writes (defaults
                to ``DEFAULT_WRITE_STREAM_TRIGGER``).

        Returns:
            A StreamingQuery for a streaming write, otherwise None
            (per the original documentation; determined by the delegate).
        """
        table_client = self._feature_table_client
        write_result = table_client.write_table(
            name=name,
            df=df,
            mode=mode,
            checkpoint_location=checkpoint_location,
            trigger=trigger,
        )
        return write_result
|
@@ -0,0 +1,25 @@
|
|
1
|
+
from setuptools import setup, find_packages
|
2
|
+
|
3
|
+
# Read the long description up front: the context manager closes the file
# handle deterministically, and the explicit encoding keeps the build from
# depending on the platform's locale default.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="wedata_feature_engineering",  # distribution name
    version="0.1.0",  # release version
    packages=find_packages(exclude=["tests*"]),  # auto-discover packages, skipping tests
    install_requires=[  # runtime dependencies
        'pyspark>=3.0.0',
        'delta-spark>=1.0.0',
        'pandas>=1.0.0'  # commonly used data-processing dependency
    ],
    python_requires='>=3.7',  # minimum supported Python version
    author="meahqian",  # author
    author_email="",  # author e-mail (recommended to fill in)
    description="Wedata Feature Engineering Library",  # short summary
    long_description=long_description,  # full description (requires README.md)
    long_description_content_type="text/markdown",  # format of the long description
    license="Apache 2.0",  # license
    url="",  # project URL (recommended to fill in)
    classifiers=[  # trove classifiers
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
)
|
@@ -0,0 +1,13 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: wedata-feature-engineering
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Wedata Feature Engineering Library
|
5
|
+
Home-page:
|
6
|
+
Author: meahqian
|
7
|
+
Author-email:
|
8
|
+
License: Apache 2.0
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
11
|
+
Classifier: Operating System :: OS Independent
|
12
|
+
Requires-Python: >=3.7
|
13
|
+
Description-Content-Type: text/markdown
|
@@ -0,0 +1,9 @@
|
|
1
|
+
README.md
|
2
|
+
setup.py
|
3
|
+
feature_store/__init__.py
|
4
|
+
feature_store/client.py
|
5
|
+
wedata_feature_engineering.egg-info/PKG-INFO
|
6
|
+
wedata_feature_engineering.egg-info/SOURCES.txt
|
7
|
+
wedata_feature_engineering.egg-info/dependency_links.txt
|
8
|
+
wedata_feature_engineering.egg-info/requires.txt
|
9
|
+
wedata_feature_engineering.egg-info/top_level.txt
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
feature_store
|