wedata-feature-engineering 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wedata/__init__.py CHANGED
@@ -3,4 +3,4 @@ WeData Feature Engineering
3
3
  A toolkit for automated feature engineering
4
4
  """
5
5
 
6
- __version__ = "0.1.8"
6
+ __version__ = "0.1.9"
@@ -202,6 +202,7 @@ class FeatureStoreClient:
202
202
  flavor: ModuleType,
203
203
  training_set: Optional[TrainingSet] = None,
204
204
  registered_model_name: Optional[str] = None,
205
+ model_registry_uri: Optional[str] = None,
205
206
  await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
206
207
  infer_input_example: bool = False,
207
208
  **kwargs,
@@ -218,6 +219,7 @@ class FeatureStoreClient:
218
219
  flavor: MLflow模型类型模块(如mlflow.sklearn)
219
220
  training_set: 训练模型使用的TrainingSet对象(可选)
220
221
  registered_model_name: 要注册的模型名称(可选)
222
+ model_registry_uri: 模型注册中心地址(可选)
221
223
  await_registration_for: 等待模型注册完成的秒数(默认300秒)
222
224
  infer_input_example: 是否自动记录输入示例(默认False)
223
225
 
@@ -231,6 +233,7 @@ class FeatureStoreClient:
231
233
  flavor=flavor,
232
234
  training_set=training_set,
233
235
  registered_model_name=registered_model_name,
236
+ model_registry_uri=model_registry_uri,
234
237
  await_registration_for=await_registration_for,
235
238
  infer_input_example=infer_input_example,
236
239
  **kwargs
@@ -171,7 +171,7 @@ class FeatureTableClient:
171
171
  CREATE TABLE {table_name} (
172
172
  {', '.join(columns_ddl)}
173
173
  )
174
- USING PARQUET
174
+ USING iceberg
175
175
  {partition_expr}
176
176
  TBLPROPERTIES (
177
177
  {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
@@ -365,4 +365,4 @@ class FeatureTableClient:
365
365
  try:
366
366
  return spark_client.get_feature_table(table_name)
367
367
  except Exception as e:
368
- raise ValueError(f"获取表'{name}'元数据失败: {str(e)}") from e
368
+ raise ValueError(f"获取表'{table_name}'元数据失败: {str(e)}") from e
@@ -9,13 +9,39 @@ from pyspark.sql.types import StructType, StringType, StructField
9
9
  from wedata.feature_store.entities.feature import Feature
10
10
  from wedata.feature_store.entities.feature_table import FeatureTable
11
11
  from wedata.feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
12
- from wedata.feature_store.utils.common_utils import unsanitize_identifier, sanitize_multi_level_name
12
+ from wedata.feature_store.utils.common_utils import unsanitize_identifier
13
13
 
14
14
 
15
15
  class SparkClient:
16
16
  def __init__(self, spark: SparkSession):
17
17
  self._spark = spark
18
18
 
19
+ def _parse_table_name(self, table_name):
20
+ """解析表名并返回表名部分
21
+
22
+ 参数:
23
+ table_name: 完整表名,支持格式: catalog.schema.table、schema.table 或 table
24
+
25
+ 返回:
26
+ str: 解析后的表名部分
27
+ """
28
+ if not isinstance(table_name, str):
29
+ raise ValueError("Table name must be string type")
30
+
31
+ table_name = table_name.strip()
32
+ if not table_name:
33
+ raise ValueError("Table name cannot be empty")
34
+
35
+ parts = table_name.split('.')
36
+ if len(parts) == 3:
37
+ # 对于三部分名称(catalog.schema.table),只使用表名部分
38
+ return parts[2]
39
+ elif len(parts) == 2:
40
+ # 对于两部分名称(schema.table),只使用表名部分
41
+ return parts[1]
42
+ else:
43
+ # 单表名,直接使用
44
+ return table_name
19
45
 
20
46
  def get_current_catalog(self):
21
47
  """
@@ -66,19 +92,13 @@ class SparkClient:
66
92
  """
67
93
  try:
68
94
  # 解析表名
69
- parts = table_name.split('.')
70
- if len(parts) == 3:
71
- catalog, schema, table = parts
72
- elif len(parts) == 2:
73
- schema, table = parts
74
- else:
75
- table = table_name
95
+ schema_table_name = self._parse_table_name(table_name)
76
96
 
77
97
  # 验证表是否存在
78
- if not self._spark.catalog.tableExists(table):
98
+ if not self._spark.catalog.tableExists(schema_table_name):
79
99
  raise ValueError(f"表不存在: {table_name}")
80
100
 
81
- return self._spark.table(table)
101
+ return self._spark.table(schema_table_name)
82
102
 
83
103
  except Exception as e:
84
104
  raise ValueError(f"读取表 {table_name} 失败: {str(e)}")
@@ -86,23 +106,10 @@ class SparkClient:
86
106
 
87
107
  def get_features(self, table_name):
88
108
  # 解析表名
89
- parts = table_name.split('.')
90
- if len(parts) == 3:
91
- # 对于三部分名称(catalog.schema.table),使用schema.table格式
92
- _, schema, table = parts
93
- full_table_name = f"{schema}.{table}"
94
- elif len(parts) == 2:
95
- # 对于两部分名称(schema.table),直接使用
96
- full_table_name = table_name
97
- else:
98
- # 单表名,使用当前数据库
99
- current_db = self.get_current_database()
100
- if not current_db:
101
- raise ValueError("无法确定当前数据库")
102
- full_table_name = f"{current_db}.{table_name}"
109
+ schema_table_name = self._parse_table_name(table_name)
103
110
 
104
111
  # 使用dbName.tableName格式查询列信息
105
- columns = self._spark.catalog.listColumns(tableName=full_table_name)
112
+ columns = self._spark.catalog.listColumns(tableName=schema_table_name)
106
113
  return [
107
114
  Feature(
108
115
  feature_table=table_name,
@@ -114,22 +121,14 @@ class SparkClient:
114
121
  ]
115
122
 
116
123
  def get_feature_table(self, table_name):
124
+ # 解析表名
125
+ schema_table_name = self._parse_table_name(table_name)
117
126
 
118
127
  # 获取表元数据
119
- table = self._spark.catalog.getTable(table_name)
128
+ table = self._spark.catalog.getTable(schema_table_name)
120
129
 
121
- parts = table_name.split('.')
122
- if len(parts) == 3:
123
- # 对于三部分名称(catalog.schema.table),只使用表名部分
124
- table_to_describe = parts[2]
125
- elif len(parts) == 2:
126
- # 对于两部分名称(schema.table),只使用表名部分
127
- table_to_describe = parts[1]
128
- else:
129
- # 单表名,直接使用
130
- table_to_describe = table_name
131
130
  # 获取表详细信息
132
- table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {table_to_describe}").collect()
131
+ table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {schema_table_name}").collect()
133
132
 
134
133
  table_properties = {}
135
134
  for row in table_details:
@@ -186,6 +186,7 @@ class TrainingSetClient:
186
186
  flavor: ModuleType,
187
187
  training_set: Optional[TrainingSet],
188
188
  registered_model_name: Optional[str],
189
+ model_registry_uri: Optional[str],
189
190
  await_registration_for: int,
190
191
  infer_input_example: bool,
191
192
  **kwargs,
@@ -334,8 +335,7 @@ class TrainingSetClient:
334
335
  except Exception:
335
336
  input_example = None
336
337
 
337
- # todo:
338
- #feature_spec.save(data_path)
338
+ feature_spec.save(data_path)
339
339
 
340
340
  # Log the packaged model. If no run is active, this call will create an active run.
341
341
  mlflow.pyfunc.log_model(
@@ -355,13 +355,12 @@ class TrainingSetClient:
355
355
  # If the user provided an explicit model_registry_uri when constructing the FeatureStoreClient,
356
356
  # we respect this by setting the registry URI prior to reading the model from Model
357
357
  # Registry.
358
- # todo:
359
- # if self._model_registry_uri:
360
- # # This command will override any previously set registry_uri.
361
- # mlflow.set_registry_uri(self._model_registry_uri)
358
+ if model_registry_uri is not None:
359
+ # This command will override any previously set registry_uri.
360
+ mlflow.set_registry_uri(model_registry_uri)
362
361
 
363
362
  mlflow.register_model(
364
363
  "runs:/%s/%s" % (run_id, artifact_path),
365
364
  registered_model_name,
366
365
  await_registration_for=await_registration_for,
367
- )
366
+ )
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: wedata-feature-engineering
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: Wedata Feature Engineering Library
5
5
  Home-page:
6
6
  Author: meahqian
@@ -14,4 +14,10 @@ Description-Content-Type: text/markdown
14
14
  Requires-Dist: pyspark>=3.0.0
15
15
  Requires-Dist: delta-spark>=1.0.0
16
16
  Requires-Dist: pandas>=1.0.0
17
-
17
+ Dynamic: author
18
+ Dynamic: classifier
19
+ Dynamic: description-content-type
20
+ Dynamic: license
21
+ Dynamic: requires-dist
22
+ Dynamic: requires-python
23
+ Dynamic: summary
@@ -1,6 +1,6 @@
1
- wedata/__init__.py,sha256=yIceuEY46nh56GEjtGNrDMIKTYtBHEf-Wj5Rc-cJS-g,101
1
+ wedata/__init__.py,sha256=QDgjssRv3Fu3e8OTEm5m1qWnkzAdYKW5vMtvkocJCmI,101
2
2
  wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- wedata/feature_store/client.py,sha256=7a-9C8HIBHnQNQD6I4W3UtBQwkJE8G-Q7N24zydjpkY,8100
3
+ wedata/feature_store/client.py,sha256=DO68yHiaJQ3LmrZ-owWEuRjuwM6vUjcaEdAcF65mdhs,8271
4
4
  wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  wedata/feature_store/constants/constants.py,sha256=b4tgcSt66YIq0Fg7pMbqvbqPOI77Cz8znLVZ4ihUKss,1479
6
6
  wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -20,11 +20,11 @@ wedata/feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipB
20
20
  wedata/feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
21
21
  wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
22
22
  wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- wedata/feature_store/feature_table_client/feature_table_client.py,sha256=W6_TJ6PNc5o6SotVppBmu6VWZ7q_lPgIeg9Xsbr9r-g,12136
23
+ wedata/feature_store/feature_table_client/feature_table_client.py,sha256=tsVPB3IZIsfGxPd_kQna9b20zY494wuzLofv2j1w-so,12142
24
24
  wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- wedata/feature_store/spark_client/spark_client.py,sha256=DBCYjLsFrIVRvLErTNyfLIHRul3v0y9uZIY2JR1N92s,10323
25
+ wedata/feature_store/spark_client/spark_client.py,sha256=SwMf-TsAeV7_8pDmh4927pKEwwKcIFK3JJ-J8rzUp_Q,10129
26
26
  wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- wedata/feature_store/training_set_client/training_set_client.py,sha256=gHeZU0rvvUcyNTfroXD3LAinFPdhDpnwTOIWj6z84Tc,15102
27
+ wedata/feature_store/training_set_client/training_set_client.py,sha256=CVcdgqfHL2S-fSCkfDwQgqtMhkB8haGEi1kEjbudDOk,15087
28
28
  wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  wedata/feature_store/utils/common_utils.py,sha256=cR3Vd49sWZrclaXvNO6B52Sk2v88iXmYmCIhi9xWsPM,10000
30
30
  wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
@@ -37,7 +37,7 @@ wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZ
37
37
  wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
38
38
  wedata/feature_store/utils/uc_utils.py,sha256=A-W8Cd8yvTmAMEWaHeWmGmcIDMvUtjAfx2G2x_di1QE,10774
39
39
  wedata/feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
40
- wedata_feature_engineering-0.1.8.dist-info/METADATA,sha256=HX42mSJie1KwNQnrB3temigb7fmxxEqShuE065NDcL8,493
41
- wedata_feature_engineering-0.1.8.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
42
- wedata_feature_engineering-0.1.8.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
43
- wedata_feature_engineering-0.1.8.dist-info/RECORD,,
40
+ wedata_feature_engineering-0.1.9.dist-info/METADATA,sha256=u_UnAZHKL603djFWLyfvZDIL220cwLEwyjYLjjXKRbw,644
41
+ wedata_feature_engineering-0.1.9.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
42
+ wedata_feature_engineering-0.1.9.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
43
+ wedata_feature_engineering-0.1.9.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.45.1)
2
+ Generator: setuptools (79.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5