wedata-feature-engineering 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wedata/__init__.py +1 -1
- wedata/feature_store/client.py +3 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +2 -2
- wedata/feature_store/spark_client/spark_client.py +36 -37
- wedata/feature_store/training_set_client/training_set_client.py +6 -7
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.9.dist-info}/METADATA +9 -3
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.9.dist-info}/RECORD +9 -9
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.9.dist-info}/WHEEL +1 -1
- {wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.9.dist-info}/top_level.txt +0 -0
wedata/__init__.py
CHANGED
wedata/feature_store/client.py
CHANGED
@@ -202,6 +202,7 @@ class FeatureStoreClient:
|
|
202
202
|
flavor: ModuleType,
|
203
203
|
training_set: Optional[TrainingSet] = None,
|
204
204
|
registered_model_name: Optional[str] = None,
|
205
|
+
model_registry_uri: Optional[str] = None,
|
205
206
|
await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
|
206
207
|
infer_input_example: bool = False,
|
207
208
|
**kwargs,
|
@@ -218,6 +219,7 @@ class FeatureStoreClient:
|
|
218
219
|
flavor: MLflow模型类型模块(如mlflow.sklearn)
|
219
220
|
training_set: 训练模型使用的TrainingSet对象(可选)
|
220
221
|
registered_model_name: 要注册的模型名称(可选)
|
222
|
+
model_registry_uri: 模型注册中心地址(可选)
|
221
223
|
await_registration_for: 等待模型注册完成的秒数(默认300秒)
|
222
224
|
infer_input_example: 是否自动记录输入示例(默认False)
|
223
225
|
|
@@ -231,6 +233,7 @@ class FeatureStoreClient:
|
|
231
233
|
flavor=flavor,
|
232
234
|
training_set=training_set,
|
233
235
|
registered_model_name=registered_model_name,
|
236
|
+
model_registry_uri=model_registry_uri,
|
234
237
|
await_registration_for=await_registration_for,
|
235
238
|
infer_input_example=infer_input_example,
|
236
239
|
**kwargs
|
@@ -171,7 +171,7 @@ class FeatureTableClient:
|
|
171
171
|
CREATE TABLE {table_name} (
|
172
172
|
{', '.join(columns_ddl)}
|
173
173
|
)
|
174
|
-
USING
|
174
|
+
USING iceberg
|
175
175
|
{partition_expr}
|
176
176
|
TBLPROPERTIES (
|
177
177
|
{', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
|
@@ -365,4 +365,4 @@ class FeatureTableClient:
|
|
365
365
|
try:
|
366
366
|
return spark_client.get_feature_table(table_name)
|
367
367
|
except Exception as e:
|
368
|
-
raise ValueError(f"获取表'{
|
368
|
+
raise ValueError(f"获取表'{table_name}'元数据失败: {str(e)}") from e
|
@@ -9,13 +9,39 @@ from pyspark.sql.types import StructType, StringType, StructField
|
|
9
9
|
from wedata.feature_store.entities.feature import Feature
|
10
10
|
from wedata.feature_store.entities.feature_table import FeatureTable
|
11
11
|
from wedata.feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
|
12
|
-
from wedata.feature_store.utils.common_utils import unsanitize_identifier
|
12
|
+
from wedata.feature_store.utils.common_utils import unsanitize_identifier
|
13
13
|
|
14
14
|
|
15
15
|
class SparkClient:
|
16
16
|
def __init__(self, spark: SparkSession):
|
17
17
|
self._spark = spark
|
18
18
|
|
19
|
+
def _parse_table_name(self, table_name):
|
20
|
+
"""解析表名并返回表名部分
|
21
|
+
|
22
|
+
参数:
|
23
|
+
table_name: 完整表名,支持格式: catalog.schema.table、schema.table 或 table
|
24
|
+
|
25
|
+
返回:
|
26
|
+
str: 解析后的表名部分
|
27
|
+
"""
|
28
|
+
if not isinstance(table_name, str):
|
29
|
+
raise ValueError("Table name must be string type")
|
30
|
+
|
31
|
+
table_name = table_name.strip()
|
32
|
+
if not table_name:
|
33
|
+
raise ValueError("Table name cannot be empty")
|
34
|
+
|
35
|
+
parts = table_name.split('.')
|
36
|
+
if len(parts) == 3:
|
37
|
+
# 对于三部分名称(catalog.schema.table),只使用表名部分
|
38
|
+
return parts[2]
|
39
|
+
elif len(parts) == 2:
|
40
|
+
# 对于两部分名称(schema.table),只使用表名部分
|
41
|
+
return parts[1]
|
42
|
+
else:
|
43
|
+
# 单表名,直接使用
|
44
|
+
return table_name
|
19
45
|
|
20
46
|
def get_current_catalog(self):
|
21
47
|
"""
|
@@ -66,19 +92,13 @@ class SparkClient:
|
|
66
92
|
"""
|
67
93
|
try:
|
68
94
|
# 解析表名
|
69
|
-
|
70
|
-
if len(parts) == 3:
|
71
|
-
catalog, schema, table = parts
|
72
|
-
elif len(parts) == 2:
|
73
|
-
schema, table = parts
|
74
|
-
else:
|
75
|
-
table = table_name
|
95
|
+
schema_table_name = self._parse_table_name(table_name)
|
76
96
|
|
77
97
|
# 验证表是否存在
|
78
|
-
if not self._spark.catalog.tableExists(
|
98
|
+
if not self._spark.catalog.tableExists(schema_table_name):
|
79
99
|
raise ValueError(f"表不存在: {table_name}")
|
80
100
|
|
81
|
-
return self._spark.table(
|
101
|
+
return self._spark.table(schema_table_name)
|
82
102
|
|
83
103
|
except Exception as e:
|
84
104
|
raise ValueError(f"读取表 {table_name} 失败: {str(e)}")
|
@@ -86,23 +106,10 @@ class SparkClient:
|
|
86
106
|
|
87
107
|
def get_features(self, table_name):
|
88
108
|
# 解析表名
|
89
|
-
|
90
|
-
if len(parts) == 3:
|
91
|
-
# 对于三部分名称(catalog.schema.table),使用schema.table格式
|
92
|
-
_, schema, table = parts
|
93
|
-
full_table_name = f"{schema}.{table}"
|
94
|
-
elif len(parts) == 2:
|
95
|
-
# 对于两部分名称(schema.table),直接使用
|
96
|
-
full_table_name = table_name
|
97
|
-
else:
|
98
|
-
# 单表名,使用当前数据库
|
99
|
-
current_db = self.get_current_database()
|
100
|
-
if not current_db:
|
101
|
-
raise ValueError("无法确定当前数据库")
|
102
|
-
full_table_name = f"{current_db}.{table_name}"
|
109
|
+
schema_table_name = self._parse_table_name(table_name)
|
103
110
|
|
104
111
|
# 使用dbName.tableName格式查询列信息
|
105
|
-
columns = self._spark.catalog.listColumns(tableName=
|
112
|
+
columns = self._spark.catalog.listColumns(tableName=schema_table_name)
|
106
113
|
return [
|
107
114
|
Feature(
|
108
115
|
feature_table=table_name,
|
@@ -114,22 +121,14 @@ class SparkClient:
|
|
114
121
|
]
|
115
122
|
|
116
123
|
def get_feature_table(self, table_name):
|
124
|
+
# 解析表名
|
125
|
+
schema_table_name = self._parse_table_name(table_name)
|
117
126
|
|
118
127
|
# 获取表元数据
|
119
|
-
table = self._spark.catalog.getTable(
|
128
|
+
table = self._spark.catalog.getTable(schema_table_name)
|
120
129
|
|
121
|
-
parts = table_name.split('.')
|
122
|
-
if len(parts) == 3:
|
123
|
-
# 对于三部分名称(catalog.schema.table),只使用表名部分
|
124
|
-
table_to_describe = parts[2]
|
125
|
-
elif len(parts) == 2:
|
126
|
-
# 对于两部分名称(schema.table),只使用表名部分
|
127
|
-
table_to_describe = parts[1]
|
128
|
-
else:
|
129
|
-
# 单表名,直接使用
|
130
|
-
table_to_describe = table_name
|
131
130
|
# 获取表详细信息
|
132
|
-
table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {
|
131
|
+
table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {schema_table_name}").collect()
|
133
132
|
|
134
133
|
table_properties = {}
|
135
134
|
for row in table_details:
|
@@ -186,6 +186,7 @@ class TrainingSetClient:
|
|
186
186
|
flavor: ModuleType,
|
187
187
|
training_set: Optional[TrainingSet],
|
188
188
|
registered_model_name: Optional[str],
|
189
|
+
model_registry_uri: Optional[str],
|
189
190
|
await_registration_for: int,
|
190
191
|
infer_input_example: bool,
|
191
192
|
**kwargs,
|
@@ -334,8 +335,7 @@ class TrainingSetClient:
|
|
334
335
|
except Exception:
|
335
336
|
input_example = None
|
336
337
|
|
337
|
-
|
338
|
-
#feature_spec.save(data_path)
|
338
|
+
feature_spec.save(data_path)
|
339
339
|
|
340
340
|
# Log the packaged model. If no run is active, this call will create an active run.
|
341
341
|
mlflow.pyfunc.log_model(
|
@@ -355,13 +355,12 @@ class TrainingSetClient:
|
|
355
355
|
# If the user provided an explicit model_registry_uri when constructing the FeatureStoreClient,
|
356
356
|
# we respect this by setting the registry URI prior to reading the model from Model
|
357
357
|
# Registry.
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
# mlflow.set_registry_uri(self._model_registry_uri)
|
358
|
+
if model_registry_uri is not None:
|
359
|
+
# This command will override any previously set registry_uri.
|
360
|
+
mlflow.set_registry_uri(model_registry_uri)
|
362
361
|
|
363
362
|
mlflow.register_model(
|
364
363
|
"runs:/%s/%s" % (run_id, artifact_path),
|
365
364
|
registered_model_name,
|
366
365
|
await_registration_for=await_registration_for,
|
367
|
-
|
366
|
+
)
|
{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.9.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: wedata-feature-engineering
|
3
|
-
Version: 0.1.8
|
3
|
+
Version: 0.1.9
|
4
4
|
Summary: Wedata Feature Engineering Library
|
5
5
|
Home-page:
|
6
6
|
Author: meahqian
|
@@ -14,4 +14,10 @@ Description-Content-Type: text/markdown
|
|
14
14
|
Requires-Dist: pyspark>=3.0.0
|
15
15
|
Requires-Dist: delta-spark>=1.0.0
|
16
16
|
Requires-Dist: pandas>=1.0.0
|
17
|
-
|
17
|
+
Dynamic: author
|
18
|
+
Dynamic: classifier
|
19
|
+
Dynamic: description-content-type
|
20
|
+
Dynamic: license
|
21
|
+
Dynamic: requires-dist
|
22
|
+
Dynamic: requires-python
|
23
|
+
Dynamic: summary
|
{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.9.dist-info}/RECORD
RENAMED
@@ -1,6 +1,6 @@
|
|
1
|
-
wedata/__init__.py,sha256=
|
1
|
+
wedata/__init__.py,sha256=QDgjssRv3Fu3e8OTEm5m1qWnkzAdYKW5vMtvkocJCmI,101
|
2
2
|
wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
wedata/feature_store/client.py,sha256=
|
3
|
+
wedata/feature_store/client.py,sha256=DO68yHiaJQ3LmrZ-owWEuRjuwM6vUjcaEdAcF65mdhs,8271
|
4
4
|
wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
wedata/feature_store/constants/constants.py,sha256=b4tgcSt66YIq0Fg7pMbqvbqPOI77Cz8znLVZ4ihUKss,1479
|
6
6
|
wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -20,11 +20,11 @@ wedata/feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipB
|
|
20
20
|
wedata/feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
|
21
21
|
wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
|
22
22
|
wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
wedata/feature_store/feature_table_client/feature_table_client.py,sha256=
|
23
|
+
wedata/feature_store/feature_table_client/feature_table_client.py,sha256=tsVPB3IZIsfGxPd_kQna9b20zY494wuzLofv2j1w-so,12142
|
24
24
|
wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
|
-
wedata/feature_store/spark_client/spark_client.py,sha256=
|
25
|
+
wedata/feature_store/spark_client/spark_client.py,sha256=SwMf-TsAeV7_8pDmh4927pKEwwKcIFK3JJ-J8rzUp_Q,10129
|
26
26
|
wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
wedata/feature_store/training_set_client/training_set_client.py,sha256=
|
27
|
+
wedata/feature_store/training_set_client/training_set_client.py,sha256=CVcdgqfHL2S-fSCkfDwQgqtMhkB8haGEi1kEjbudDOk,15087
|
28
28
|
wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
29
29
|
wedata/feature_store/utils/common_utils.py,sha256=cR3Vd49sWZrclaXvNO6B52Sk2v88iXmYmCIhi9xWsPM,10000
|
30
30
|
wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
|
@@ -37,7 +37,7 @@ wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZ
|
|
37
37
|
wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
|
38
38
|
wedata/feature_store/utils/uc_utils.py,sha256=A-W8Cd8yvTmAMEWaHeWmGmcIDMvUtjAfx2G2x_di1QE,10774
|
39
39
|
wedata/feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
|
40
|
-
wedata_feature_engineering-0.1.
|
41
|
-
wedata_feature_engineering-0.1.
|
42
|
-
wedata_feature_engineering-0.1.
|
43
|
-
wedata_feature_engineering-0.1.
|
40
|
+
wedata_feature_engineering-0.1.9.dist-info/METADATA,sha256=u_UnAZHKL603djFWLyfvZDIL220cwLEwyjYLjjXKRbw,644
|
41
|
+
wedata_feature_engineering-0.1.9.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
|
42
|
+
wedata_feature_engineering-0.1.9.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
|
43
|
+
wedata_feature_engineering-0.1.9.dist-info/RECORD,,
|
File without changes
|