tencent-wedata-feature-engineering-dev 0.1.48__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff reflects the changes between two publicly released versions of the package, as they appear in the supported public registries, and is provided for informational purposes only.
Files changed (64)
  1. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/METADATA +14 -3
  2. tencent_wedata_feature_engineering_dev-0.2.5.dist-info/RECORD +78 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/WHEEL +1 -1
  4. wedata/__init__.py +1 -1
  5. wedata/common/base_table_client/__init__.py +1 -0
  6. wedata/common/base_table_client/base.py +58 -0
  7. wedata/common/cloud_sdk_client/__init__.py +2 -0
  8. wedata/{feature_store → common}/cloud_sdk_client/client.py +33 -3
  9. wedata/{feature_store → common}/cloud_sdk_client/models.py +212 -37
  10. wedata/{feature_store → common}/cloud_sdk_client/utils.py +7 -0
  11. wedata/{feature_store → common}/constants/constants.py +3 -2
  12. wedata/common/constants/engine_types.py +34 -0
  13. wedata/{feature_store → common}/entities/column_info.py +6 -5
  14. wedata/{feature_store → common}/entities/feature_column_info.py +2 -1
  15. wedata/{feature_store → common}/entities/feature_lookup.py +1 -1
  16. wedata/{feature_store → common}/entities/feature_spec.py +9 -9
  17. wedata/{feature_store → common}/entities/feature_table_info.py +1 -1
  18. wedata/{feature_store → common}/entities/function_info.py +2 -1
  19. wedata/{feature_store → common}/entities/on_demand_column_info.py +2 -1
  20. wedata/{feature_store → common}/entities/source_data_column_info.py +3 -1
  21. wedata/{feature_store → common}/entities/training_set.py +6 -6
  22. wedata/common/feast_client/__init__.py +1 -0
  23. wedata/{feature_store → common}/feast_client/feast_client.py +1 -1
  24. wedata/common/log/__init__.py +1 -0
  25. wedata/{feature_store/common → common}/log/logger.py +9 -5
  26. wedata/common/spark_client/__init__.py +1 -0
  27. wedata/{feature_store → common}/spark_client/spark_client.py +6 -7
  28. wedata/{feature_store → common}/utils/common_utils.py +7 -9
  29. wedata/{feature_store → common}/utils/env_utils.py +12 -0
  30. wedata/{feature_store → common}/utils/feature_lookup_utils.py +6 -6
  31. wedata/{feature_store → common}/utils/feature_spec_utils.py +13 -8
  32. wedata/{feature_store → common}/utils/feature_utils.py +5 -5
  33. wedata/{feature_store → common}/utils/on_demand_utils.py +5 -4
  34. wedata/{feature_store → common}/utils/schema_utils.py +1 -1
  35. wedata/{feature_store → common}/utils/signature_utils.py +4 -4
  36. wedata/{feature_store → common}/utils/training_set_utils.py +13 -13
  37. wedata/{feature_store → common}/utils/uc_utils.py +1 -1
  38. wedata/feature_engineering/__init__.py +1 -0
  39. wedata/feature_engineering/client.py +417 -0
  40. wedata/feature_engineering/ml_training_client/ml_training_client.py +569 -0
  41. wedata/feature_engineering/mlflow_model.py +9 -0
  42. wedata/feature_engineering/table_client/table_client.py +548 -0
  43. wedata/feature_store/client.py +11 -15
  44. wedata/feature_store/constants/engine_types.py +8 -30
  45. wedata/feature_store/feature_table_client/feature_table_client.py +73 -105
  46. wedata/feature_store/training_set_client/training_set_client.py +12 -23
  47. wedata/tempo/interpol.py +2 -2
  48. tencent_wedata_feature_engineering_dev-0.1.48.dist-info/RECORD +0 -66
  49. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/top_level.txt +0 -0
  50. /wedata/{feature_store/cloud_sdk_client → common}/__init__.py +0 -0
  51. /wedata/{feature_store/common/log → common/constants}/__init__.py +0 -0
  52. /wedata/{feature_store/common/protos → common/entities}/__init__.py +0 -0
  53. /wedata/{feature_store → common}/entities/environment_variables.py +0 -0
  54. /wedata/{feature_store → common}/entities/feature.py +0 -0
  55. /wedata/{feature_store → common}/entities/feature_function.py +0 -0
  56. /wedata/{feature_store → common}/entities/feature_spec_constants.py +0 -0
  57. /wedata/{feature_store → common}/entities/feature_table.py +0 -0
  58. /wedata/{feature_store/entities → common/protos}/__init__.py +0 -0
  59. /wedata/{feature_store/common → common}/protos/feature_store_pb2.py +0 -0
  60. /wedata/{feature_store/feast_client → common/utils}/__init__.py +0 -0
  61. /wedata/{feature_store → common}/utils/topological_sort.py +0 -0
  62. /wedata/{feature_store → common}/utils/validation_utils.py +0 -0
  63. /wedata/{feature_store/spark_client → feature_engineering/ml_training_client}/__init__.py +0 -0
  64. /wedata/{feature_store/utils → feature_engineering/table_client}/__init__.py +0 -0
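
Most of this release is a package-level refactor: shared modules move from wedata/feature_store/... to wedata/common/..., and a new wedata/feature_engineering package (client, table client, ML training client) is added. For downstream code that imported the moved modules directly, a minimal migration sketch follows; the try/except fallback is illustrative only, the new paths are taken from the rename entries above, and the old SparkClient path assumes the pre-0.2 module layout:

    # Illustrative only: import from the new 0.2.x layout, falling back to the
    # 0.1.x layout for older installs.
    try:
        from wedata.common.entities.feature_lookup import FeatureLookup
        from wedata.common.spark_client import SparkClient
    except ImportError:
        from wedata.feature_store.entities.feature_lookup import FeatureLookup
        from wedata.feature_store.spark_client.spark_client import SparkClient

The diff below is the new wedata/feature_engineering/client.py in full.
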
wedata/feature_engineering/client.py (new file)
@@ -0,0 +1,417 @@
+ from __future__ import annotations
+
+ __doc__ = """
+ WeData 3.0 Feature Engineering Client
+ """
+
+ from types import ModuleType
+ from typing import Union, List, Dict, Optional, Any
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.streaming import StreamingQuery
+ from pyspark.sql.types import StructType
+ import mlflow
+
+ from wedata.common.constants.constants import (
+     APPEND, DEFAULT_WRITE_STREAM_TRIGGER, FEATURE_LOOKUP_CLIENT_PIP_PACKAGE, FEATURE_STORE_CLIENT)
+ from wedata.feature_store.constants.engine_types import EngineTypes
+ from wedata.common.entities.feature_function import FeatureFunction
+ from wedata.common.entities.feature_lookup import FeatureLookup
+ from wedata.common.entities.feature_table import FeatureTable
+ from wedata.common.entities.training_set import TrainingSet
+ from wedata.feature_engineering.table_client.table_client import FeatureEngineeringTableClient
+ from wedata.common.spark_client import SparkClient
+ from wedata.feature_engineering.ml_training_client.ml_training_client import MLTrainingClient
+ from wedata.common.utils import common_utils, env_utils
+ from wedata.common.utils.feature_utils import format_feature_lookups_and_functions
+
+
+ _i, _v, _ = common_utils.check_package_version("mlflow", "3.0.0", ">=")
+ if not _v:
+     raise ImportError(f"mlflow version must be greater than or equal to 3.0.0. "
+                       f"current version is {mlflow.__version__}. "
+                       f"please install {FEATURE_LOOKUP_CLIENT_PIP_PACKAGE}[mlflow3]")
+
+
+ class FeatureEngineeringClient:
+     def __init__(self, spark: Optional[SparkSession] = None):
+         if spark is None:
+             spark = SparkSession.builder.getOrCreate()
+         self._spark = spark
+         self._spark_client = SparkClient(spark)
+         cloud_secret_id, cloud_secret_key = env_utils.get_cloud_secret()
+         self._feature_table_client = FeatureEngineeringTableClient(
+             spark, cloud_secret_id=cloud_secret_id, cloud_secret_key=cloud_secret_key)
+         self._training_set_client = MLTrainingClient(self._spark_client)
+
+     def create_table(
+         self,
+         name: str,
+         primary_keys: Union[str, List[str]],
+         timestamp_key: str,
+         engine_type: EngineTypes,
+         data_source_name: str,
+         database_name: Optional[str] = None,
+         df: Optional[DataFrame] = None,
+         *,
+         partition_columns: Union[str, List[str], None] = None,
+         schema: Optional[StructType] = None,
+         description: Optional[str] = None,
+         tags: Optional[Dict[str, str]] = None
+     ) -> FeatureTable:
+         """
+         Create a feature table (supports batch and streaming writes).
+
+         Args:
+             name: Feature table name (format: <table>)
+             primary_keys: Primary key column name(s) (composite keys supported)
+             timestamp_key: Timestamp key (for temporal features)
+             engine_type: Engine type, see wedata.feature_store.constants.engine_types.EngineTypes
+             data_source_name: Data source name
+             database_name: Database name
+             df: Initial data (optional, used to infer the schema)
+             partition_columns: Partition columns (optimize storage and queries)
+             schema: Table schema definition (optional; required when df is not provided)
+             description: Business description
+             tags: Business tags
+
+         Returns:
+             A FeatureTable instance
+
+         Raises:
+             ValueError: If the schema does not match the data
+         """
+
+         return self._feature_table_client.create_table(
+             name=name,
+             primary_keys=primary_keys,
+             engine_type=engine_type,
+             database_name=database_name,
+             data_source_name=data_source_name,
+             df=df,
+             timestamp_key=timestamp_key,
+             partition_columns=partition_columns,
+             schema=schema,
+             description=description,
+             tags=tags
+         )
+
+     def read_table(self, name: str, database_name: Optional[str] = None) -> DataFrame:
+         """
+         Read data from a feature table.
+
+         Args:
+             name: Feature table name
+             database_name: Feature database name
+
+         Returns:
+             DataFrame: A DataFrame containing the feature table data
+         """
+
+         return self._feature_table_client.read_table(name=name, database_name=database_name)
+
+     def get_table(self, name: str, database_name: Optional[str] = None) -> FeatureTable:
+         """
+         Get feature table metadata.
+
+         Args:
+             name: Feature table name
+             database_name: Feature database name
+
+         Returns:
+             FeatureTable: A FeatureTable object containing the table's metadata
+         """
+
+         return self._feature_table_client.get_table(name, self._spark_client, database_name)
+
+     def drop_table(self, name: str, database_name: Optional[str] = None) -> None:
+         """
+         Drop a feature table.
+
+         Args:
+             name: Name of the feature table to drop
+             database_name: Database name
+
+         Returns:
+             None
+         """
+
+         return self._feature_table_client.drop_table(name, database_name)
+
+     def write_table(
+         self,
+         name: str,
+         df: DataFrame,
+         database_name: Optional[str] = None,
+         mode: Optional[str] = APPEND,
+         checkpoint_location: Optional[str] = None,
+         trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
+     ) -> Optional[StreamingQuery]:
+         """
+         Write data to a feature table (supports batch and streaming).
+
+         Args:
+             name: Feature table name
+             df: DataFrame to write
+             database_name: Feature database name
+             mode: Write mode (defaults to append)
+             checkpoint_location: Checkpoint location for streaming writes (optional)
+             trigger: Streaming trigger configuration (defaults to the system preset)
+
+         Returns:
+             A StreamingQuery object for streaming writes, otherwise None
+         """
+
+         return self._feature_table_client.write_table(
+             name=name,
+             df=df,
+             database_name=database_name,
+             mode=mode,
+             checkpoint_location=checkpoint_location,
+             trigger=trigger,
+         )
+
+     def create_training_set(
+         self,
+         df: DataFrame,
+         feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
+         label: Union[str, List[str], None],
+         exclude_columns: Optional[List[str]] = None,
+         database_name: Optional[str] = None,
+         **kwargs,
+     ) -> TrainingSet:
+         """
+         Create a training set.
+
+         Args:
+             df: Base data
+             feature_lookups: List of feature lookups
+             label: Label column name(s)
+             exclude_columns: Columns to exclude
+             database_name: Database name
+
+         Returns:
+             A TrainingSet instance
+         """
+
+         if exclude_columns is None:
+             exclude_columns = []
+
+         # For FeatureLookup entries, validate the table_name and build the full table name
+         for feature in feature_lookups:
+             if isinstance(feature, FeatureLookup):
+                 if not feature.table_name:
+                     raise ValueError("FeatureLookup must specify a table_name")
+                 # First check that the table name format is valid
+                 common_utils.validate_table_name(feature.table_name)
+                 # Then build the full table name and assign it back to the FeatureLookup
+                 feature.table_name = common_utils.build_full_table_name(feature.table_name, database_name)
+
+         features = feature_lookups
+         del feature_lookups
+
+         features = format_feature_lookups_and_functions(self._spark_client, features)
+
+         return self._training_set_client.create_training_set_from_feature_lookups(
+             df=df,
+             feature_lookups=features,
+             label=label,
+             exclude_columns=exclude_columns,
+             **kwargs
+         )
+
+     def log_model(
+         self,
+         model: Any,
+         artifact_path: str,
+         *,
+         flavor: ModuleType,
+         training_set: Optional[TrainingSet] = None,
+         registered_model_name: Optional[str] = None,
+         model_registry_uri: Optional[str] = None,
+         await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
+         infer_input_example: bool = False,
+         **kwargs,
+     ):
+         """
+         Log an MLflow model and associate it with feature lookup metadata.
+
+         Note: the model must be trained on the DataFrame returned by TrainingSet.load_df;
+         any modification of that DataFrame (e.g. normalization, added columns) will not be
+         applied at inference time.
+
+         Args:
+             model: Model object to log
+             artifact_path: Model storage path
+             flavor: MLflow model flavor module (e.g. mlflow.sklearn)
+             training_set: TrainingSet used to train the model (optional)
+             registered_model_name: Name under which to register the model (optional)
+             model_registry_uri: Model registry URI (optional)
+             await_registration_for: Seconds to wait for model registration to complete (default: 300)
+             infer_input_example: Whether to log an inferred input example (default: False)
+
+         Returns:
+             None
+         """
+
+         self._training_set_client.log_model(
+             model=model,
+             artifact_path=artifact_path,
+             flavor=flavor,
+             training_set=training_set,
+             registered_model_name=registered_model_name,
+             model_registry_uri=model_registry_uri,
+             await_registration_for=await_registration_for,
+             infer_input_example=infer_input_example,
+             **kwargs
+         )
+
+     def score_batch(
+         self, model_uri: str, df: DataFrame, result_type: str = "double", timestamp_key: Optional[str] = None
+     ) -> DataFrame:
+         """
+         Evaluate the model on the provided :class:`DataFrame <pyspark.sql.DataFrame>`.
+
+         Additional features required for model evaluation will be automatically
+         retrieved from :mod:`Feature Store <databricks.feature_store.client>`.
+
+         .. todo::
+
+            [ML-15539]: Replace the bitly URL in doc string
+
+         The model must have been logged with :meth:`.FeatureStoreClient.log_model`,
+         which packages the model with feature metadata. Unless present in ``df``,
+         these features will be looked up from :mod:`Feature Store <databricks.feature_store.client>`
+         and joined with ``df`` prior to scoring the model.
+
+         If a feature is included in ``df``, the provided feature values will be used rather
+         than those stored in :mod:`Feature Store <databricks.feature_store.client>`.
+
+         For example, if a model is trained on two features ``account_creation_date`` and
+         ``num_lifetime_purchases``, as in:
+
+         .. code-block:: python
+
+             feature_lookups = [
+                 FeatureLookup(
+                     table_name='trust_and_safety.customer_features',
+                     feature_name='account_creation_date',
+                     lookup_key='customer_id',
+                 ),
+                 FeatureLookup(
+                     table_name='trust_and_safety.customer_features',
+                     feature_name='num_lifetime_purchases',
+                     lookup_key='customer_id',
+                 ),
+             ]
+
+             with mlflow.start_run():
+                 training_set = fs.create_training_set(
+                     df,
+                     feature_lookups=feature_lookups,
+                     label='is_banned',
+                     exclude_columns=['customer_id'],
+                 )
+                 ...
+                 fs.log_model(
+                     model,
+                     "model",
+                     flavor=mlflow.sklearn,
+                     training_set=training_set,
+                     registered_model_name="example_model",
+                 )
+
+         Then at inference time, the caller of :meth:`FeatureStoreClient.score_batch` must pass
+         a :class:`DataFrame <pyspark.sql.DataFrame>` that includes ``customer_id``, the
+         ``lookup_key`` specified in the ``FeatureLookups`` of the
+         :mod:`training_set <databricks.feature_engineering.training_set>`.
+         If the :class:`DataFrame <pyspark.sql.DataFrame>` contains a column
+         ``account_creation_date``, the values of this column will be used
+         in lieu of those in :mod:`Feature Store <databricks.feature_store.client>`. As in:
+
+         .. code-block:: python
+
+             # batch_df has columns ['customer_id', 'account_creation_date']
+             predictions = fs.score_batch(
+                 'models:/example_model/1',
+                 batch_df
+             )
+
+         :param model_uri: The location, in URI format, of the MLflow model logged using
+             :meth:`FeatureStoreClient.log_model`. One of:
+
+             * ``runs:/<mlflow_run_id>/run-relative/path/to/model``
+             * ``models:/<model_name>/<model_version>``
+             * ``models:/<model_name>/<stage>``
+
+             For more information about URI schemes, see
+             `Referencing Artifacts <https://bit.ly/3wnrseE>`_.
+         :param df: The :class:`DataFrame <pyspark.sql.DataFrame>` to score the model on.
+             :mod:`Feature Store <databricks.feature_store.client>` features will be joined
+             with ``df`` prior to scoring the model. ``df`` must:
+
+             1. Contain columns for lookup keys required to join feature data from Feature
+                Store, as specified in the ``feature_spec.yaml`` artifact.
+
+             2. Contain columns for all source keys required to score the model, as specified in
+                the ``feature_spec.yaml`` artifact.
+
+             3. Not contain a column ``prediction``, which is reserved for the model's predictions.
+                ``df`` may contain additional columns.
+
+             Streaming DataFrames are not supported.
+         :param result_type: The return type of the model.
+             See :func:`mlflow.pyfunc.spark_udf` result_type.
+         :param timestamp_key: Timestamp key used for temporal feature lookup (optional).
+         :return: A :class:`DataFrame <pyspark.sql.DataFrame>` containing:
+
+             1. All columns of ``df``.
+
+             2. All feature values retrieved from Feature Store.
+
+             3. A column ``prediction`` containing the output of the model.
+         """
+         return self._training_set_client.score_batch(
+             model_uri=model_uri,
+             df=df,
+             result_type=result_type,
+             client_name=FEATURE_STORE_CLIENT,
+             timestamp_key=timestamp_key,
+         )
+
+     def set_feature_table_tag(self, name: str, database_name: str, key: str, value: str):
+         """
+         Set a feature table tag.
+
+         Args:
+             name: Feature table name
+             database_name: Database name
+             key: Tag key
+             value: Tag value
+
+         Returns:
+             None
+         """
+         self._feature_table_client.alter_table_tag(
+             name=name,
+             database_name=database_name,
+             properties={key: value},
+             mode="add",
+         )
+
+     def delete_feature_table_tag(self, name: str, database_name: str, key: str):
+         """
+         Delete a feature table tag.
+
+         Args:
+             name: Feature table name
+             database_name: Database name
+             key: Tag key
+
+         Returns:
+             None
+         """
+         self._feature_table_client.alter_table_tag(
+             name=name,
+             database_name=database_name,
+             properties={key: ""},
+             mode="delete"
+         )
+
+     @property
+     def spark(self):
+         return self._spark
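
Taken together, the new client exposes the same create/read/write, training-set, and scoring workflow as the existing feature-store client. A minimal end-to-end sketch follows, assuming a running Spark session; all table, database, column, data-source, and model names are illustrative, and the EngineTypes member is assumed (see wedata/feature_store/constants/engine_types.py for the actual values):

    from pyspark.sql import SparkSession
    import mlflow

    from wedata.feature_engineering.client import FeatureEngineeringClient
    from wedata.feature_store.constants.engine_types import EngineTypes
    from wedata.common.entities.feature_lookup import FeatureLookup

    spark = SparkSession.builder.getOrCreate()
    fe = FeatureEngineeringClient(spark=spark)

    # 1. Register a feature table from an existing DataFrame.
    fe.create_table(
        name="customer_features",                # illustrative table name
        primary_keys="customer_id",
        timestamp_key="event_ts",
        engine_type=EngineTypes.ICEBERG_ENGINE,  # assumed member name
        data_source_name="my_source",            # hypothetical data source
        database_name="feature_db",
        df=spark.table("raw.customers"),         # hypothetical source table
    )

    # 2. Join looked-up features onto a label DataFrame; argument names follow
    #    the FeatureLookup example in the score_batch docstring.
    lookups = [
        FeatureLookup(
            table_name="customer_features",
            feature_name="account_age_days",     # illustrative feature column
            lookup_key="customer_id",
        ),
    ]
    training_set = fe.create_training_set(
        df=spark.table("raw.churn_labels"),      # hypothetical label table
        feature_lookups=lookups,
        label="churned",
        exclude_columns=["customer_id"],
        database_name="feature_db",
    )

    # 3. Train on training_set.load_df() (required by log_model) and log the model.
    with mlflow.start_run():
        model = ...  # placeholder: fit an estimator on training_set.load_df()
        fe.log_model(
            model=model,
            artifact_path="model",
            flavor=mlflow.sklearn,
            training_set=training_set,
            registered_model_name="churn_model",
        )

    # 4. Batch scoring: required features are looked up and joined automatically.
    predictions = fe.score_batch("models:/churn_model/1", spark.table("raw.to_score"))

Because score_batch resolves feature lookups from the feature_spec.yaml packaged by log_model, the scoring DataFrame only needs to supply the lookup keys (plus any source columns), not the feature values themselves.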