wedata-feature-engineering 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/PKG-INFO +1 -1
  2. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/__init__.py +1 -1
  3. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/client.py +113 -41
  4. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/constants/constants.py +19 -0
  5. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/column_info.py +4 -4
  6. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/feature_lookup.py +5 -1
  7. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/feature_spec.py +46 -46
  8. wedata-feature-engineering-0.1.7/wedata/feature_store/entities/feature_table.py +107 -0
  9. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/training_set.py +13 -12
  10. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/feature_table_client/feature_table_client.py +86 -31
  11. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/spark_client/spark_client.py +30 -56
  12. wedata-feature-engineering-0.1.7/wedata/feature_store/training_set_client/training_set_client.py +367 -0
  13. wedata-feature-engineering-0.1.5/wedata/feature_store/utils/utils.py → wedata-feature-engineering-0.1.7/wedata/feature_store/utils/common_utils.py +108 -54
  14. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/feature_lookup_utils.py +6 -6
  15. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/feature_spec_utils.py +6 -6
  16. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/feature_utils.py +5 -5
  17. wedata-feature-engineering-0.1.7/wedata/feature_store/utils/on_demand_utils.py +107 -0
  18. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/schema_utils.py +1 -1
  19. wedata-feature-engineering-0.1.7/wedata/feature_store/utils/signature_utils.py +205 -0
  20. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/training_set_utils.py +18 -19
  21. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/uc_utils.py +1 -1
  22. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata_feature_engineering.egg-info/PKG-INFO +1 -1
  23. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata_feature_engineering.egg-info/SOURCES.txt +2 -1
  24. wedata-feature-engineering-0.1.5/wedata/feature_store/entities/feature_table.py +0 -164
  25. wedata-feature-engineering-0.1.5/wedata/feature_store/training_set_client/training_set_client.py +0 -196
  26. wedata-feature-engineering-0.1.5/wedata/feature_store/utils/common_utils.py +0 -96
  27. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/README.md +0 -0
  28. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/setup.cfg +0 -0
  29. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/setup.py +0 -0
  30. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/__init__.py +0 -0
  31. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/constants/__init__.py +0 -0
  32. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/__init__.py +0 -0
  33. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/data_type.py +0 -0
  34. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/environment_variables.py +0 -0
  35. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/feature.py +0 -0
  36. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/feature_column_info.py +0 -0
  37. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/feature_function.py +0 -0
  38. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/feature_spec_constants.py +0 -0
  39. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/feature_table_info.py +0 -0
  40. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/function_info.py +0 -0
  41. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/on_demand_column_info.py +0 -0
  42. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/entities/source_data_column_info.py +0 -0
  43. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/feature_table_client/__init__.py +0 -0
  44. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/spark_client/__init__.py +0 -0
  45. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/training_set_client/__init__.py +0 -0
  46. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/__init__.py +0 -0
  47. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/topological_sort.py +0 -0
  48. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata/feature_store/utils/validation_utils.py +0 -0
  49. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata_feature_engineering.egg-info/dependency_links.txt +0 -0
  50. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata_feature_engineering.egg-info/requires.txt +0 -0
  51. {wedata-feature-engineering-0.1.5 → wedata-feature-engineering-0.1.7}/wedata_feature_engineering.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wedata-feature-engineering
-Version: 0.1.5
+Version: 0.1.7
 Summary: Wedata Feature Engineering Library
 Home-page:
 Author: meahqian
@@ -3,4 +3,4 @@ WeData Feature Engineering
 A toolkit for automated feature engineering
 """
 
-__version__ = "0.1.5"
+__version__ = "0.1.7"
@@ -3,19 +3,24 @@ Wedata FeatureStoreClient Python implementation
 """
 
 from __future__ import annotations
+
+from types import ModuleType
 from typing import Union, List, Dict, Optional, Any
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.streaming import StreamingQuery
 from pyspark.sql.types import StructType
+import mlflow
 
-from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
-from feature_store.entities.feature_function import FeatureFunction
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.training_set import TrainingSet
-from feature_store.feature_table_client.feature_table_client import FeatureTableClient
-from feature_store.spark_client.spark_client import SparkClient
-from feature_store.training_set_client.training_set_client import TrainingSetClient
-from feature_store.utils.feature_utils import format_feature_lookups_and_functions
+from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
+from wedata.feature_store.entities.feature_function import FeatureFunction
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.training_set import TrainingSet
+from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient
+from wedata.feature_store.spark_client.spark_client import SparkClient
+from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient
+from wedata.feature_store.utils import common_utils
+from wedata.feature_store.utils.feature_utils import format_feature_lookups_and_functions
 
 
 class FeatureStoreClient:
@@ -25,9 +30,11 @@ class FeatureStoreClient:
         """
        :param spark: an initialized SparkSession object
         """
+
         self._spark = spark
         self._spark_client = SparkClient(spark)
         self._feature_table_client = FeatureTableClient(spark)
+        self._training_set_client = TrainingSetClient(self._spark_client)
 
     def create_table(
         self,
@@ -40,7 +47,7 @@
         schema: Optional[StructType] = None,
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None
-    ):
+    ) -> FeatureTable:
         """
        Create a feature table (supports batch and streaming writes)
 
@@ -50,6 +57,7 @@
            df: initial data (optional, used to infer the schema)
            timestamp_keys: timestamp keys (for temporal features)
            partition_columns: partition columns (to optimize storage and queries)
+            schema: table schema definition (optional; required when df is not provided)
            description: business description
            tags: business tags
 
@@ -71,7 +79,6 @@
             tags=tags
         )
 
-
     def read_table(self, name: str) -> DataFrame:
         """
        Read feature table data
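With these changes, create_table accepts an explicit schema and returns a FeatureTable. A minimal usage sketch (table and column names are hypothetical, and the primary_keys parameter is assumed from the method's Args docstring):

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, StringType, DoubleType
    from wedata.feature_store.client import FeatureStoreClient

    spark = SparkSession.builder.getOrCreate()
    client = FeatureStoreClient(spark)

    # Schema supplied up front, so no initial DataFrame is needed
    schema = StructType([
        StructField("user_id", StringType(), False),
        StructField("avg_spend", DoubleType(), True),
    ])

    table = client.create_table(
        name="user_features",
        primary_keys=["user_id"],
        schema=schema,
        description="Aggregated user spend features",
    )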
@@ -82,8 +89,20 @@
        Returns:
            DataFrame: a DataFrame containing the feature table data
         """
+
         return self._feature_table_client.read_table(name)
 
+    def get_table(self, name: str) -> FeatureTable:
+        """
+        Get feature table metadata
+        Args:
+            name: feature table name
+
+        Returns:
+            FeatureTable: a FeatureTable object containing the table's metadata
+        """
+
+        return self._feature_table_client.get_table(name, self._spark_client)
 
     def drop_table(self, name: str) -> None:
         """
@@ -95,8 +114,38 @@
        Returns:
            None
         """
+
         return self._feature_table_client.drop_table(name)
 
+    def write_table(
+        self,
+        name: str,
+        df: DataFrame,
+        mode: str = APPEND,
+        checkpoint_location: Optional[str] = None,
+        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
+    ) -> Optional[StreamingQuery]:
+        """
+        Write data to a feature table (supports batch and streaming)
+
+        Args:
+            name: feature table name
+            df: the DataFrame to write
+            mode: write mode (defaults to append)
+            checkpoint_location: checkpoint location for streaming (optional)
+            trigger: streaming trigger configuration (defaults to the system preset)
+
+        Returns:
+            A StreamingQuery object for streaming writes, otherwise None
+        """
+
+        return self._feature_table_client.write_table(
+            name=name,
+            df=df,
+            mode=mode,
+            checkpoint_location=checkpoint_location,
+            trigger=trigger,
+        )
 
     def create_training_set(
         self,
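write_table keeps its old behavior in its new position: batch writes return None, streaming writes return a StreamingQuery. A sketch, where batch_df and streaming_df are hypothetical batch and streaming DataFrames:

    # Batch write; appends by default and returns None
    client.write_table(name="user_features", df=batch_df)

    # Streaming write; the checkpoint path is hypothetical
    query = client.write_table(
        name="user_features",
        df=streaming_df,
        checkpoint_location="/tmp/checkpoints/user_features",
    )
    if query is not None:
        query.awaitTermination()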
@@ -106,7 +155,6 @@
         exclude_columns: Optional[List[str]] = None,
         **kwargs,
     ) -> TrainingSet:
-
         """
        Create a training set
 
@@ -123,13 +171,22 @@
         if exclude_columns is None:
             exclude_columns = []
 
+        # If it is a FeatureLookup, validate its table_name and build the fully qualified table name
+        for feature in feature_lookups:
+            if isinstance(feature, FeatureLookup):
+                if not feature.table_name:
+                    raise ValueError("FeatureLookup must specify a table_name")
+                # Validate the table name format first
+                common_utils.validate_table_name(feature.table_name)
+                # Then build the fully qualified table name
+                feature.table_name = common_utils.build_full_table_name(feature.table_name)
+
         features = feature_lookups
         del feature_lookups
 
         features = format_feature_lookups_and_functions(self._spark_client, features)
-        # Create a TrainingSetClient instance
-        training_set_client = TrainingSetClient(self._spark_client)
-        return training_set_client.create_training_set_from_feature_lookups(
+
+        return self._training_set_client.create_training_set_from_feature_lookups(
             df=df,
             feature_lookups=features,
             label=label,
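The new loop validates each FeatureLookup's table_name and rewrites it in place with a fully qualified name before features are resolved. A sketch of the call path it supports; the FeatureLookup constructor arguments follow the Databricks-style API this package mirrors and are assumptions, and label_df is a hypothetical DataFrame with user_id and label columns:

    from wedata.feature_store.entities.feature_lookup import FeatureLookup

    lookup = FeatureLookup(
        table_name="user_features",      # short name; the client qualifies it
        lookup_key="user_id",
        feature_names=["avg_spend"],
    )

    training_set = client.create_training_set(
        df=label_df,
        feature_lookups=[lookup],
        label="label",
    )
    # Afterwards lookup.table_name holds the qualified name produced by
    # common_utils.build_full_table_name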
@@ -137,33 +194,48 @@
             **kwargs
         )
 
-    def write_table(
+    def log_model(
         self,
-        name: str,
-        df: DataFrame,
-        mode: str = APPEND,
-        checkpoint_location: Optional[str] = None,
-        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
-    ) -> Optional[StreamingQuery]:
-
-        """
-        Write data to a feature table (supports batch and streaming)
-
-        Args:
-            name: feature table name
-            df: the DataFrame to write
-            mode: write mode (defaults to append)
-            checkpoint_location: checkpoint location for streaming (optional)
-            trigger: streaming trigger configuration (defaults to the system preset)
-
-        Returns:
-            A StreamingQuery object for streaming writes, otherwise None
+        model: Any,
+        artifact_path: str,
+        *,
+        flavor: ModuleType,
+        training_set: Optional[TrainingSet] = None,
+        registered_model_name: Optional[str] = None,
+        await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
+        infer_input_example: bool = False,
+        **kwargs,
+    ):
         """
+        Log an MLflow model and associate it with feature lookup information
+
+        Note: the model must be trained on the DataFrame returned by TrainingSet.load_df;
+        any modification of that DataFrame (normalization, added columns, etc.) is not applied at inference time
+
+        Args:
+            model: the model object to log
+            artifact_path: artifact path for the model
+            flavor: MLflow model flavor module (e.g. mlflow.sklearn)
+            training_set: the TrainingSet used to train the model (optional)
+            registered_model_name: name under which to register the model (optional)
+            await_registration_for: seconds to wait for model registration to complete (default 300)
+            infer_input_example: whether to log an input example automatically (default False)
+
+        Returns:
+            None
+        """
+
+        self._training_set_client.log_model(
+            model=model,
+            artifact_path=artifact_path,
+            flavor=flavor,
+            training_set=training_set,
+            registered_model_name=registered_model_name,
+            await_registration_for=await_registration_for,
+            infer_input_example=infer_input_example,
+            **kwargs
+        )
 
-        return self._feature_table_client.write_table(
-            name=name,
-            df=df,
-            mode=mode,
-            checkpoint_location=checkpoint_location,
-            trigger=trigger,
-        )
+    @property
+    def spark(self):
+        return self._spark
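log_model delegates to TrainingSetClient and packages the feature lookup metadata with the model. A sketch assuming scikit-learn is available and training_set comes from the previous sketch; the registered model name is hypothetical:

    import mlflow
    from sklearn.ensemble import RandomForestRegressor

    # Train on the exact DataFrame returned by load_df; transformations applied
    # afterwards are not replayed at inference time
    train_pdf = training_set.load_df().toPandas()
    model = RandomForestRegressor().fit(train_pdf[["avg_spend"]], train_pdf["label"])

    client.log_model(
        model=model,
        artifact_path="model",
        flavor=mlflow.sklearn,
        training_set=training_set,
        registered_model_name="user_spend_model",
    )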
@@ -26,3 +26,22 @@ _NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
 _USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
 _PREBUILT_ENV_URI = "prebuilt_env_uri"
 
+# MLflow model-related constants (formerly mlflow_model_constants.py)
+# Module name of the original mlflow_model
+MLFLOW_MODEL_NAME = "databricks.feature_store.mlflow_model"
+
+# FeatureStoreClient.log_model logs a model that contains a 'raw_model' folder,
+# which stores the original model's MLmodel file, used for inference
+RAW_MODEL_FOLDER = "raw_model"
+
+# MLmodel file name constant
+ML_MODEL = "MLmodel"
+
+# PyPI package name of the feature lookup client
+FEATURE_LOOKUP_CLIENT_PIP_PACKAGE = "databricks-feature-lookup"
+
+# Major version of the feature lookup client
+FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = 1
+
+# Feature store internal data directory
+FEATURE_STORE_INTERNAL_DATA_DIR = "_databricks_internal/"
@@ -1,11 +1,11 @@
 import copy
 from typing import Optional, Union
 
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
     ON_DEMAND_COLUMN_INFO
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
 
 
 class ColumnInfo:
@@ -3,7 +3,7 @@ import datetime
 import logging
 from typing import Dict, List, Optional, Union
 
-from feature_store.utils import common_utils
+from wedata.feature_store.utils import common_utils
 
 _logger = logging.getLogger(__name__)
 
@@ -177,3 +177,7 @@ class FeatureLookup:
             f'The output_name parameter is deprecated. Use "rename_outputs".'
         )
         self._rename_outputs[self._feature_names[0]] = self._output_name_deprecated
+
+    @table_name.setter
+    def table_name(self, value):
+        self._table_name = value
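This setter is what lets FeatureStoreClient.create_training_set rewrite a lookup's table name in place; previously table_name was read-only. A two-line sketch (constructor arguments as in the earlier sketch, qualified name hypothetical):

    lookup = FeatureLookup(table_name="user_features", lookup_key="user_id")
    lookup.table_name = "feature_db.user_features"  # assignment now permitted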
@@ -6,9 +6,9 @@ from databricks.sdk.service.catalog import FunctionInfo
 from google.protobuf.json_format import MessageToDict, ParseDict
 from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
 
-from feature_store.entities.column_info import ColumnInfo
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_spec_constants import (
+from wedata.feature_store.entities.column_info import ColumnInfo
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_spec_constants import (
     BOUND_TO,
     DATA_TYPE,
     FEATURE_COLUMN_INFO,
@@ -31,10 +31,10 @@ from feature_store.entities.feature_spec_constants import (
     TRAINING_DATA,
     UDF_NAME,
 )
-from feature_store.entities.feature_table_info import FeatureTableInfo
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-from feature_store.utils import common_utils
+from wedata.feature_store.entities.feature_table_info import FeatureTableInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+from wedata.feature_store.utils import common_utils
 
 # Change log for serialization version. Please update for each serialization version.
 # 1. Initial.
@@ -383,45 +383,45 @@ class FeatureSpec:
             column_info_dict[INCLUDE] = column_data.pop(INCLUDE)
         return column_info_dict
 
-    # @classmethod
-    # def _from_dict(cls, spec_dict):
-    #     """
-    #     Convert YAML artifact to FeatureSpec. Transforms YAML artifact to dict keyed by
-    #     source_data_column_info or feature_column_info, such that ParseDict can convert the dict to
-    #     a proto message, and from_proto can convert the proto message to a FeatureSpec object
-    #     :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
-    #     """
-    #     if INPUT_COLUMNS not in spec_dict:
-    #         raise ValueError(
-    #             f"{INPUT_COLUMNS} must be a key in {cls.FEATURE_ARTIFACT_FILE}."
-    #         )
-    #     if not spec_dict[INPUT_COLUMNS]:
-    #         raise ValueError(
-    #             f"{INPUT_COLUMNS} in {cls.FEATURE_ARTIFACT_FILE} must be non-empty."
-    #         )
-    #     spec_dict[INPUT_COLUMNS] = [
-    #         cls._input_columns_yaml_to_proto_dict(column_info)
-    #         for column_info in spec_dict[INPUT_COLUMNS]
-    #     ]
-    #
-    #     # feature_spec.yaml doesn't include input_tables, input_functions if any are true:
-    #     # 1. The YAML is written by an older client that does not support the functionality.
-    #     # 2. The FeatureSpec does not contain FeatureLookups (input_tables), FeatureFunctions (input_functions).
-    #     input_tables = []
-    #     for input_table in spec_dict.get(INPUT_TABLES, []):
-    #         table_name, attributes = list(input_table.items())[0]
-    #         input_tables.append({TABLE_NAME: table_name, **attributes})
-    #     spec_dict[INPUT_TABLES] = input_tables
-    #
-    #     input_functions = []
-    #     for input_function in spec_dict.get(INPUT_FUNCTIONS, []):
-    #         udf_name, attributes = list(input_function.items())[0]
-    #         input_functions.append({UDF_NAME: udf_name, **attributes})
-    #     spec_dict[INPUT_FUNCTIONS] = input_functions
-    #
-    #     return cls.from_proto(
-    #         ParseDict(spec_dict, ProtoFeatureSpec(), ignore_unknown_fields=True)
-    #     )
+    @classmethod
+    def _from_dict(cls, spec_dict):
+        """
+        Convert YAML artifact to FeatureSpec. Transforms YAML artifact to dict keyed by
+        source_data_column_info or feature_column_info, such that ParseDict can convert the dict to
+        a proto message, and from_proto can convert the proto message to a FeatureSpec object
+        :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+        """
+        if INPUT_COLUMNS not in spec_dict:
+            raise ValueError(
+                f"{INPUT_COLUMNS} must be a key in {cls.FEATURE_ARTIFACT_FILE}."
+            )
+        if not spec_dict[INPUT_COLUMNS]:
+            raise ValueError(
+                f"{INPUT_COLUMNS} in {cls.FEATURE_ARTIFACT_FILE} must be non-empty."
+            )
+        spec_dict[INPUT_COLUMNS] = [
+            cls._input_columns_yaml_to_proto_dict(column_info)
+            for column_info in spec_dict[INPUT_COLUMNS]
+        ]
+
+        # feature_spec.yaml doesn't include input_tables, input_functions if any are true:
+        # 1. The YAML is written by an older client that does not support the functionality.
+        # 2. The FeatureSpec does not contain FeatureLookups (input_tables), FeatureFunctions (input_functions).
+        input_tables = []
+        for input_table in spec_dict.get(INPUT_TABLES, []):
+            table_name, attributes = list(input_table.items())[0]
+            input_tables.append({TABLE_NAME: table_name, **attributes})
+        spec_dict[INPUT_TABLES] = input_tables
+
+        input_functions = []
+        for input_function in spec_dict.get(INPUT_FUNCTIONS, []):
+            udf_name, attributes = list(input_function.items())[0]
+            input_functions.append({UDF_NAME: udf_name, **attributes})
+        spec_dict[INPUT_FUNCTIONS] = input_functions
+
+        return cls.from_proto(
+            ParseDict(spec_dict, ProtoFeatureSpec(), ignore_unknown_fields=True)
+        )
 
     @classmethod
     def _read_file(cls, path: str):
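The restored _from_dict flattens the YAML's list-of-single-key mappings before ParseDict builds the proto message. A standalone sketch of that reshaping; the sample entry and the literal "table_name" key are illustrative assumptions:

    # feature_spec.yaml stores each input table as {table_name: {attributes}}
    input_tables_yaml = [
        {"feature_db.user_features": {"table_id": "abc123"}},
    ]

    flattened = []
    for input_table in input_tables_yaml:
        table_name, attributes = list(input_table.items())[0]
        flattened.append({"table_name": table_name, **attributes})

    print(flattened)
    # [{'table_name': 'feature_db.user_features', 'table_id': 'abc123'}]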
@@ -0,0 +1,107 @@
+from typing import Dict
+
+
+class FeatureTable:
+    """
+    Feature table entity class
+    """
+
+    def __init__(
+        self,
+        name,
+        table_id,
+        description,
+        primary_keys,
+        partition_columns,
+        features,
+        creation_timestamp=None,
+        online_stores=None,
+        notebook_producers=None,
+        job_producers=None,
+        table_data_sources=None,
+        path_data_sources=None,
+        custom_data_sources=None,
+        timestamp_keys=None,
+        tags=None,
+    ):
+        """Initialize a FeatureTable object."""
+        """Initialize a feature table object
+
+        :param name: table name
+        :param table_id: table ID
+        :param description: description
+        :param primary_keys: list of primary keys
+        :param partition_columns: list of partition columns
+        :param features: list of feature columns
+        :param creation_timestamp: creation timestamp (optional)
+        :param online_stores: online store configuration (optional)
+        :param notebook_producers: list of notebook producers (optional)
+        :param job_producers: list of job producers (optional)
+        :param table_data_sources: list of table data sources (optional)
+        :param path_data_sources: list of path data sources (optional)
+        :param custom_data_sources: list of custom data sources (optional)
+        :param timestamp_keys: list of timestamp keys (optional)
+        :param tags: tag dictionary (optional)
+        """
+        self.name = name
+        self.table_id = table_id
+        self.description = description
+        self.primary_keys = primary_keys
+        self.partition_columns = partition_columns
+        self.features = features
+        self.creation_timestamp = creation_timestamp
+        self.online_stores = online_stores if online_stores is not None else []
+        self.notebook_producers = (
+            notebook_producers if notebook_producers is not None else []
+        )
+        self.job_producers = job_producers if job_producers is not None else []
+        self.table_data_sources = (
+            table_data_sources if table_data_sources is not None else []
+        )
+        self.path_data_sources = (
+            path_data_sources if path_data_sources is not None else []
+        )
+        self.custom_data_sources = (
+            custom_data_sources if custom_data_sources is not None else []
+        )
+        self.timestamp_keys = timestamp_keys if timestamp_keys is not None else []
+        self._tags = tags
+
+    def __str__(self):
+        """
+        Return a string representation of the feature table instance, covering all key attributes
+
+        Returns:
+            A formatted string with the table name, ID, description, primary keys, partition columns,
+            feature count, timestamp keys, creation time, data source counts and tag count
+        """
+        return (
+            f"FeatureTable(\n"
+            f"  name='{self.name}',\n"
+            f"  table_id='{self.table_id}',\n"
+            f"  description='{self.description[:50]}{'...' if len(self.description) > 50 else ''}',\n"
+            f"  primary_keys={self.primary_keys},\n"
+            f"  partition_columns={self.partition_columns},\n"
+            f"  features={len(self.features)},\n"
+            f"  timestamp_keys={self.timestamp_keys},\n"
+            f"  creation_timestamp={self.creation_timestamp},\n"
+            f"  data_sources=[table:{len(self.table_data_sources)} "
+            f"path:{len(self.path_data_sources)} custom:{len(self.custom_data_sources)}],\n"
+            f"  tags={len(self.tags) if self._tags else 0}\n"
+            f")"
+        )
+
+    @property
+    def tags(self) -> Dict[str, str]:
+        """
+        Get the tags associated with the feature table.
+
+        :return a Dictionary of all tags associated with the feature table as key/value pairs
+        """
+        if self._tags is None:
+            # If no tags are set, self._tags is expected to be an empty dictionary.
+            raise ValueError(
+                "Internal error: tags have not been fetched for this FeatureTable instance"
+            )
+        return self._tags
+
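A construction sketch for the new entity class (all values hypothetical); note that reading tags on an instance created with tags=None raises ValueError by design:

    table = FeatureTable(
        name="user_features",
        table_id="tbl-001",
        description="Aggregated user spend features",
        primary_keys=["user_id"],
        partition_columns=["dt"],
        features=["avg_spend", "txn_count"],
        tags={"owner": "data-team"},
    )
    print(table)       # multi-line summary from __str__
    print(table.tags)  # {'owner': 'data-team'}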
@@ -2,19 +2,20 @@ from typing import Dict, List, Optional
 
 from pyspark.sql import DataFrame
 
-from feature_store.entities.feature_table import FeatureTable
-from feature_store.entities.function_info import FunctionInfo
-from feature_store.utils.feature_lookup_utils import (
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.function_info import FunctionInfo
+from wedata.feature_store.utils.feature_lookup_utils import (
     join_feature_data_if_not_overridden,
 )
 
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.utils.feature_spec_utils import (
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.utils.feature_spec_utils import (
     COLUMN_INFO_TYPE_FEATURE,
     COLUMN_INFO_TYPE_ON_DEMAND,
     COLUMN_INFO_TYPE_SOURCE,
     get_feature_execution_groups,
 )
+from wedata.feature_store.utils.on_demand_utils import apply_functions_if_not_overridden
 
 
 class TrainingSet:
@@ -89,13 +90,13 @@ class TrainingSet:
                 feature_table_data_map=self._feature_table_data_map,
                 use_spark_native_join=self._use_spark_native_join,
             )
-            # elif execution_group.type == COLUMN_INFO_TYPE_ON_DEMAND:
-            #     # Apply all on-demand UDFs
-            #     result_df = apply_functions_if_not_overridden(
-            #         df=result_df,
-            #         functions_to_apply=execution_group.features,
-            #         uc_function_infos=self._uc_function_infos,
-            #     )
+            elif execution_group.type == COLUMN_INFO_TYPE_ON_DEMAND:
+                # Apply all on-demand UDFs
+                result_df = apply_functions_if_not_overridden(
+                    df=result_df,
+                    functions_to_apply=execution_group.features,
+                    uc_function_infos=self._uc_function_infos,
+                )
             else:
                 # This should never be reached.
                 raise Exception("Unknown feature execution type:", execution_group.type)
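With the on-demand branch re-enabled, TrainingSet.load_df applies FeatureFunction UDFs in addition to table lookups. A sketch of a mixed training set; the FeatureFunction arguments follow the Databricks-style API this package mirrors and are assumptions:

    from wedata.feature_store.entities.feature_function import FeatureFunction

    fn = FeatureFunction(
        udf_name="feature_db.spend_ratio",      # hypothetical SQL UDF
        input_bindings={"spend": "avg_spend"},
        output_name="spend_ratio",
    )

    training_set = client.create_training_set(
        df=label_df,
        feature_lookups=[lookup, fn],  # lookups and functions share one list
        label="label",
    )
    df = training_set.load_df()        # on-demand UDFs are applied here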