wedata-feature-engineering 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- wedata/__init__.py +1 -1
- wedata/feature_store/client.py +3 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +8 -12
- wedata/feature_store/spark_client/spark_client.py +36 -37
- wedata/feature_store/training_set_client/training_set_client.py +6 -7
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/METADATA +9 -3
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/RECORD +9 -9
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/WHEEL +1 -1
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/top_level.txt +0 -0
wedata/__init__.py
CHANGED
wedata/feature_store/client.py
CHANGED
@@ -202,6 +202,7 @@ class FeatureStoreClient:
|
|
202
202
|
flavor: ModuleType,
|
203
203
|
training_set: Optional[TrainingSet] = None,
|
204
204
|
registered_model_name: Optional[str] = None,
|
205
|
+
model_registry_uri: Optional[str] = None,
|
205
206
|
await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
|
206
207
|
infer_input_example: bool = False,
|
207
208
|
**kwargs,
|
@@ -218,6 +219,7 @@ class FeatureStoreClient:
|
|
218
219
|
flavor: MLflow模型类型模块(如mlflow.sklearn)
|
219
220
|
training_set: 训练模型使用的TrainingSet对象(可选)
|
220
221
|
registered_model_name: 要注册的模型名称(可选)
|
222
|
+
model_registry_uri: 模型注册中心地址(可选)
|
221
223
|
await_registration_for: 等待模型注册完成的秒数(默认300秒)
|
222
224
|
infer_input_example: 是否自动记录输入示例(默认False)
|
223
225
|
|
@@ -231,6 +233,7 @@ class FeatureStoreClient:
|
|
231
233
|
flavor=flavor,
|
232
234
|
training_set=training_set,
|
233
235
|
registered_model_name=registered_model_name,
|
236
|
+
model_registry_uri=model_registry_uri,
|
234
237
|
await_registration_for=await_registration_for,
|
235
238
|
infer_input_example=infer_input_example,
|
236
239
|
**kwargs
|
@@ -113,7 +113,7 @@ class FeatureTableClient:
|
|
113
113
|
try:
|
114
114
|
if self._spark.catalog.tableExists(table_name):
|
115
115
|
raise ValueError(
|
116
|
-
f"Table '{
|
116
|
+
f"Table '{name}' already exists\n"
|
117
117
|
"Solutions:\n"
|
118
118
|
"1. Use a different table name\n"
|
119
119
|
"2. Drop the existing table: spark.sql(f'DROP TABLE {name}')\n"
|
@@ -125,11 +125,6 @@ class FeatureTableClient:
|
|
125
125
|
table_schema = schema or df.schema
|
126
126
|
|
127
127
|
# 构建时间戳键属性
|
128
|
-
timestamp_keys_ddl = []
|
129
|
-
for timestamp_key in timestamp_keys:
|
130
|
-
if timestamp_key not in primary_keys:
|
131
|
-
raise ValueError(f"Timestamp key '{timestamp_key}' must be a primary key")
|
132
|
-
timestamp_keys_ddl.append(f"`{timestamp_key}` TIMESTAMP")
|
133
128
|
|
134
129
|
#从环境变量获取额外标签
|
135
130
|
env_tags = {
|
@@ -142,6 +137,7 @@ class FeatureTableClient:
|
|
142
137
|
tbl_properties = {
|
143
138
|
"feature_table": "TRUE",
|
144
139
|
"primaryKeys": ",".join(primary_keys),
|
140
|
+
"timestampKeys": ",".join(timestamp_keys) if timestamp_keys else "",
|
145
141
|
"comment": description or "",
|
146
142
|
**{f"{k}": v for k, v in (tags or {}).items()},
|
147
143
|
**{f"feature_{k}": v for k, v in (env_tags or {}).items()}
|
@@ -171,7 +167,7 @@ class FeatureTableClient:
|
|
171
167
|
CREATE TABLE {table_name} (
|
172
168
|
{', '.join(columns_ddl)}
|
173
169
|
)
|
174
|
-
USING
|
170
|
+
USING iceberg
|
175
171
|
{partition_expr}
|
176
172
|
TBLPROPERTIES (
|
177
173
|
{', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
|
@@ -293,13 +289,13 @@ class FeatureTableClient:
|
|
293
289
|
try:
|
294
290
|
# 检查表是否存在
|
295
291
|
if not self._spark.catalog.tableExists(table_name):
|
296
|
-
raise ValueError(f"
|
292
|
+
raise ValueError(f"Table '{name}' does not exist")
|
297
293
|
|
298
294
|
# 读取表数据
|
299
295
|
return self._spark.read.table(table_name)
|
300
296
|
|
301
297
|
except Exception as e:
|
302
|
-
raise ValueError(f"
|
298
|
+
raise ValueError(f"Failed to read table '{name}': {str(e)}") from e
|
303
299
|
|
304
300
|
def drop_table(self, name: str):
|
305
301
|
|
@@ -327,7 +323,7 @@ class FeatureTableClient:
|
|
327
323
|
try:
|
328
324
|
# 检查表是否存在
|
329
325
|
if not self._spark.catalog.tableExists(table_name):
|
330
|
-
raise ValueError(f"
|
326
|
+
raise ValueError(f"Table '{name}' does not exist")
|
331
327
|
|
332
328
|
# 执行删除
|
333
329
|
self._spark.sql(f"DROP TABLE {table_name}")
|
@@ -335,7 +331,7 @@ class FeatureTableClient:
|
|
335
331
|
except ValueError as e:
|
336
332
|
raise # 直接抛出已知的ValueError
|
337
333
|
except Exception as e:
|
338
|
-
raise RuntimeError(f"
|
334
|
+
raise RuntimeError(f"Failed to delete table '{name}': {str(e)}") from e
|
339
335
|
|
340
336
|
def get_table(
|
341
337
|
self,
|
@@ -365,4 +361,4 @@ class FeatureTableClient:
|
|
365
361
|
try:
|
366
362
|
return spark_client.get_feature_table(table_name)
|
367
363
|
except Exception as e:
|
368
|
-
raise ValueError(f"
|
364
|
+
raise ValueError(f"Failed to get metadata for table '{table_name}': {str(e)}") from e
|
@@ -9,13 +9,39 @@ from pyspark.sql.types import StructType, StringType, StructField
|
|
9
9
|
from wedata.feature_store.entities.feature import Feature
|
10
10
|
from wedata.feature_store.entities.feature_table import FeatureTable
|
11
11
|
from wedata.feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
|
12
|
-
from wedata.feature_store.utils.common_utils import unsanitize_identifier
|
12
|
+
from wedata.feature_store.utils.common_utils import unsanitize_identifier
|
13
13
|
|
14
14
|
|
15
15
|
class SparkClient:
|
16
16
|
def __init__(self, spark: SparkSession):
|
17
17
|
self._spark = spark
|
18
18
|
|
19
|
+
def _parse_table_name(self, table_name):
|
20
|
+
"""解析表名并返回表名部分
|
21
|
+
|
22
|
+
参数:
|
23
|
+
table_name: 完整表名,支持格式: catalog.schema.table、schema.table 或 table
|
24
|
+
|
25
|
+
返回:
|
26
|
+
str: 解析后的表名部分
|
27
|
+
"""
|
28
|
+
if not isinstance(table_name, str):
|
29
|
+
raise ValueError("Table name must be string type")
|
30
|
+
|
31
|
+
table_name = table_name.strip()
|
32
|
+
if not table_name:
|
33
|
+
raise ValueError("Table name cannot be empty")
|
34
|
+
|
35
|
+
parts = table_name.split('.')
|
36
|
+
if len(parts) == 3:
|
37
|
+
# 对于三部分名称(catalog.schema.table),只使用表名部分
|
38
|
+
return parts[2]
|
39
|
+
elif len(parts) == 2:
|
40
|
+
# 对于两部分名称(schema.table),只使用表名部分
|
41
|
+
return parts[1]
|
42
|
+
else:
|
43
|
+
# 单表名,直接使用
|
44
|
+
return table_name
|
19
45
|
|
20
46
|
def get_current_catalog(self):
|
21
47
|
"""
|
@@ -66,19 +92,13 @@ class SparkClient:
|
|
66
92
|
"""
|
67
93
|
try:
|
68
94
|
# 解析表名
|
69
|
-
|
70
|
-
if len(parts) == 3:
|
71
|
-
catalog, schema, table = parts
|
72
|
-
elif len(parts) == 2:
|
73
|
-
schema, table = parts
|
74
|
-
else:
|
75
|
-
table = table_name
|
95
|
+
schema_table_name = self._parse_table_name(table_name)
|
76
96
|
|
77
97
|
# 验证表是否存在
|
78
|
-
if not self._spark.catalog.tableExists(
|
98
|
+
if not self._spark.catalog.tableExists(schema_table_name):
|
79
99
|
raise ValueError(f"表不存在: {table_name}")
|
80
100
|
|
81
|
-
return self._spark.table(
|
101
|
+
return self._spark.table(schema_table_name)
|
82
102
|
|
83
103
|
except Exception as e:
|
84
104
|
raise ValueError(f"读取表 {table_name} 失败: {str(e)}")
|
@@ -86,23 +106,10 @@ class SparkClient:
|
|
86
106
|
|
87
107
|
def get_features(self, table_name):
|
88
108
|
# 解析表名
|
89
|
-
|
90
|
-
if len(parts) == 3:
|
91
|
-
# 对于三部分名称(catalog.schema.table),使用schema.table格式
|
92
|
-
_, schema, table = parts
|
93
|
-
full_table_name = f"{schema}.{table}"
|
94
|
-
elif len(parts) == 2:
|
95
|
-
# 对于两部分名称(schema.table),直接使用
|
96
|
-
full_table_name = table_name
|
97
|
-
else:
|
98
|
-
# 单表名,使用当前数据库
|
99
|
-
current_db = self.get_current_database()
|
100
|
-
if not current_db:
|
101
|
-
raise ValueError("无法确定当前数据库")
|
102
|
-
full_table_name = f"{current_db}.{table_name}"
|
109
|
+
schema_table_name = self._parse_table_name(table_name)
|
103
110
|
|
104
111
|
# 使用dbName.tableName格式查询列信息
|
105
|
-
columns = self._spark.catalog.listColumns(tableName=
|
112
|
+
columns = self._spark.catalog.listColumns(tableName=schema_table_name)
|
106
113
|
return [
|
107
114
|
Feature(
|
108
115
|
feature_table=table_name,
|
@@ -114,22 +121,14 @@ class SparkClient:
|
|
114
121
|
]
|
115
122
|
|
116
123
|
def get_feature_table(self, table_name):
|
124
|
+
# 解析表名
|
125
|
+
schema_table_name = self._parse_table_name(table_name)
|
117
126
|
|
118
127
|
# 获取表元数据
|
119
|
-
table = self._spark.catalog.getTable(
|
128
|
+
table = self._spark.catalog.getTable(schema_table_name)
|
120
129
|
|
121
|
-
parts = table_name.split('.')
|
122
|
-
if len(parts) == 3:
|
123
|
-
# 对于三部分名称(catalog.schema.table),只使用表名部分
|
124
|
-
table_to_describe = parts[2]
|
125
|
-
elif len(parts) == 2:
|
126
|
-
# 对于两部分名称(schema.table),只使用表名部分
|
127
|
-
table_to_describe = parts[1]
|
128
|
-
else:
|
129
|
-
# 单表名,直接使用
|
130
|
-
table_to_describe = table_name
|
131
130
|
# 获取表详细信息
|
132
|
-
table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {
|
131
|
+
table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {schema_table_name}").collect()
|
133
132
|
|
134
133
|
table_properties = {}
|
135
134
|
for row in table_details:
|
@@ -186,6 +186,7 @@ class TrainingSetClient:
|
|
186
186
|
flavor: ModuleType,
|
187
187
|
training_set: Optional[TrainingSet],
|
188
188
|
registered_model_name: Optional[str],
|
189
|
+
model_registry_uri: Optional[str],
|
189
190
|
await_registration_for: int,
|
190
191
|
infer_input_example: bool,
|
191
192
|
**kwargs,
|
@@ -334,8 +335,7 @@ class TrainingSetClient:
|
|
334
335
|
except Exception:
|
335
336
|
input_example = None
|
336
337
|
|
337
|
-
|
338
|
-
#feature_spec.save(data_path)
|
338
|
+
feature_spec.save(data_path)
|
339
339
|
|
340
340
|
# Log the packaged model. If no run is active, this call will create an active run.
|
341
341
|
mlflow.pyfunc.log_model(
|
@@ -355,13 +355,12 @@ class TrainingSetClient:
|
|
355
355
|
# If the user provided an explicit model_registry_uri when constructing the FeatureStoreClient,
|
356
356
|
# we respect this by setting the registry URI prior to reading the model from Model
|
357
357
|
# Registry.
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
# mlflow.set_registry_uri(self._model_registry_uri)
|
358
|
+
if model_registry_uri is not None:
|
359
|
+
# This command will override any previously set registry_uri.
|
360
|
+
mlflow.set_registry_uri(model_registry_uri)
|
362
361
|
|
363
362
|
mlflow.register_model(
|
364
363
|
"runs:/%s/%s" % (run_id, artifact_path),
|
365
364
|
registered_model_name,
|
366
365
|
await_registration_for=await_registration_for,
|
367
|
-
|
366
|
+
)
|
{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: wedata-feature-engineering
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.10
|
4
4
|
Summary: Wedata Feature Engineering Library
|
5
5
|
Home-page:
|
6
6
|
Author: meahqian
|
@@ -14,4 +14,10 @@ Description-Content-Type: text/markdown
|
|
14
14
|
Requires-Dist: pyspark>=3.0.0
|
15
15
|
Requires-Dist: delta-spark>=1.0.0
|
16
16
|
Requires-Dist: pandas>=1.0.0
|
17
|
-
|
17
|
+
Dynamic: author
|
18
|
+
Dynamic: classifier
|
19
|
+
Dynamic: description-content-type
|
20
|
+
Dynamic: license
|
21
|
+
Dynamic: requires-dist
|
22
|
+
Dynamic: requires-python
|
23
|
+
Dynamic: summary
|
{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/RECORD
RENAMED
@@ -1,6 +1,6 @@
|
|
1
|
-
wedata/__init__.py,sha256=
|
1
|
+
wedata/__init__.py,sha256=_M49ivoMq-NogMzHKd9DW6GfjUBWL18mb4gB5dK1Vbw,102
|
2
2
|
wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
wedata/feature_store/client.py,sha256=
|
3
|
+
wedata/feature_store/client.py,sha256=DO68yHiaJQ3LmrZ-owWEuRjuwM6vUjcaEdAcF65mdhs,8271
|
4
4
|
wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
wedata/feature_store/constants/constants.py,sha256=b4tgcSt66YIq0Fg7pMbqvbqPOI77Cz8znLVZ4ihUKss,1479
|
6
6
|
wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -20,11 +20,11 @@ wedata/feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipB
|
|
20
20
|
wedata/feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
|
21
21
|
wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
|
22
22
|
wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
wedata/feature_store/feature_table_client/feature_table_client.py,sha256=
|
23
|
+
wedata/feature_store/feature_table_client/feature_table_client.py,sha256=AoqlXWsR95UgrKuh7QNBUF4ygNmAgTQ_bRsJpmajRmc,11938
|
24
24
|
wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
|
-
wedata/feature_store/spark_client/spark_client.py,sha256=
|
25
|
+
wedata/feature_store/spark_client/spark_client.py,sha256=SwMf-TsAeV7_8pDmh4927pKEwwKcIFK3JJ-J8rzUp_Q,10129
|
26
26
|
wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
wedata/feature_store/training_set_client/training_set_client.py,sha256=
|
27
|
+
wedata/feature_store/training_set_client/training_set_client.py,sha256=CVcdgqfHL2S-fSCkfDwQgqtMhkB8haGEi1kEjbudDOk,15087
|
28
28
|
wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
29
29
|
wedata/feature_store/utils/common_utils.py,sha256=cR3Vd49sWZrclaXvNO6B52Sk2v88iXmYmCIhi9xWsPM,10000
|
30
30
|
wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
|
@@ -37,7 +37,7 @@ wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZ
|
|
37
37
|
wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
|
38
38
|
wedata/feature_store/utils/uc_utils.py,sha256=A-W8Cd8yvTmAMEWaHeWmGmcIDMvUtjAfx2G2x_di1QE,10774
|
39
39
|
wedata/feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
|
40
|
-
wedata_feature_engineering-0.1.
|
41
|
-
wedata_feature_engineering-0.1.
|
42
|
-
wedata_feature_engineering-0.1.
|
43
|
-
wedata_feature_engineering-0.1.
|
40
|
+
wedata_feature_engineering-0.1.10.dist-info/METADATA,sha256=a3hj-GU81Glxtr14wsUdmothFfw5h9vdGYb2PWL5G5A,645
|
41
|
+
wedata_feature_engineering-0.1.10.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
42
|
+
wedata_feature_engineering-0.1.10.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
|
43
|
+
wedata_feature_engineering-0.1.10.dist-info/RECORD,,
|
File without changes
|