wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {feature_store → wedata}/__init__.py +1 -1
- {feature_store → wedata/feature_store}/client.py +113 -41
- {feature_store → wedata/feature_store}/constants/constants.py +19 -0
- {feature_store → wedata/feature_store}/entities/column_info.py +4 -4
- {feature_store → wedata/feature_store}/entities/feature_lookup.py +5 -1
- {feature_store → wedata/feature_store}/entities/feature_spec.py +46 -46
- wedata/feature_store/entities/feature_table.py +107 -0
- {feature_store → wedata/feature_store}/entities/training_set.py +13 -12
- {feature_store → wedata/feature_store}/feature_table_client/feature_table_client.py +85 -30
- {feature_store → wedata/feature_store}/spark_client/spark_client.py +30 -56
- wedata/feature_store/training_set_client/training_set_client.py +367 -0
- wedata/feature_store/utils/__init__.py +0 -0
- feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py +108 -54
- {feature_store → wedata/feature_store}/utils/feature_lookup_utils.py +6 -6
- {feature_store → wedata/feature_store}/utils/feature_spec_utils.py +6 -6
- {feature_store → wedata/feature_store}/utils/feature_utils.py +5 -5
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- {feature_store → wedata/feature_store}/utils/schema_utils.py +1 -1
- wedata/feature_store/utils/signature_utils.py +205 -0
- {feature_store → wedata/feature_store}/utils/training_set_utils.py +18 -19
- {feature_store → wedata/feature_store}/utils/uc_utils.py +1 -1
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
- wedata_feature_engineering-0.1.6.dist-info/top_level.txt +1 -0
- feature_store/entities/feature_table.py +0 -164
- feature_store/training_set_client/training_set_client.py +0 -196
- feature_store/utils/common_utils.py +0 -96
- wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
- wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
- {feature_store/constants → wedata/feature_store}/__init__.py +0 -0
- {feature_store/entities → wedata/feature_store/constants}/__init__.py +0 -0
- {feature_store/feature_table_client → wedata/feature_store/entities}/__init__.py +0 -0
- {feature_store → wedata/feature_store}/entities/data_type.py +0 -0
- {feature_store → wedata/feature_store}/entities/environment_variables.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_column_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_function.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_spec_constants.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_table_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/function_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/on_demand_column_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/source_data_column_info.py +0 -0
- {feature_store/spark_client → wedata/feature_store/feature_table_client}/__init__.py +0 -0
- {feature_store/training_set_client → wedata/feature_store/spark_client}/__init__.py +0 -0
- {feature_store/utils → wedata/feature_store/training_set_client}/__init__.py +0 -0
- {feature_store → wedata/feature_store}/utils/topological_sort.py +0 -0
- {feature_store → wedata/feature_store}/utils/validation_utils.py +0 -0
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
{feature_store → wedata/feature_store}/client.py

```diff
@@ -3,19 +3,24 @@ Wedata FeatureStoreClient Python implementation
 """
 
 from __future__ import annotations
+
+from types import ModuleType
 from typing import Union, List, Dict, Optional, Any
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.streaming import StreamingQuery
 from pyspark.sql.types import StructType
+import mlflow
 
-from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
-from feature_store.entities.feature_function import FeatureFunction
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.…
-from feature_store.…
-from feature_store.…
-from feature_store.…
-from feature_store.…
+from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
+from wedata.feature_store.entities.feature_function import FeatureFunction
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.training_set import TrainingSet
+from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient
+from wedata.feature_store.spark_client.spark_client import SparkClient
+from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient
+from wedata.feature_store.utils import common_utils
+from wedata.feature_store.utils.feature_utils import format_feature_lookups_and_functions
 
 
 class FeatureStoreClient:
@@ -25,9 +30,11 @@ class FeatureStoreClient:
         """
         :param spark: an initialized SparkSession object
         """
+
        self._spark = spark
        self._spark_client = SparkClient(spark)
        self._feature_table_client = FeatureTableClient(spark)
+       self._training_set_client = TrainingSetClient(self._spark_client)
 
     def create_table(
         self,
@@ -40,7 +47,7 @@ class FeatureStoreClient:
         schema: Optional[StructType] = None,
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None
-    ):
+    ) -> FeatureTable:
         """
         Create a feature table (supports batch and streaming writes)
 
@@ -50,6 +57,7 @@
             df: initial data (optional, used to infer the schema)
             timestamp_keys: timestamp keys (for temporal features)
             partition_columns: partition columns (to optimize storage and queries)
+            schema: table schema definition (optional; required when df is not provided)
             description: business description
             tags: business tags
 
@@ -71,7 +79,6 @@ class FeatureStoreClient:
             tags=tags
         )
 
-
     def read_table(self, name: str) -> DataFrame:
         """
         Read feature table data
@@ -82,8 +89,20 @@
         Returns:
             DataFrame: a DataFrame object containing the feature table data
         """
+
         return self._feature_table_client.read_table(name)
 
+    def get_table(self, name: str) -> FeatureTable:
+        """
+        Get feature table metadata
+        Args:
+            name: feature table name
+
+        Returns:
+            FeatureTable: a FeatureTable object containing the table metadata
+        """
+
+        return self._feature_table_client.get_table(name, self._spark_client)
 
     def drop_table(self, name: str) -> None:
         """
@@ -95,8 +114,38 @@
         Returns:
             None
         """
+
         return self._feature_table_client.drop_table(name)
 
+    def write_table(
+        self,
+        name: str,
+        df: DataFrame,
+        mode: str = APPEND,
+        checkpoint_location: Optional[str] = None,
+        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
+    ) -> Optional[StreamingQuery]:
+        """
+        Write data to a feature table (supports batch and streaming)
+
+        Args:
+            name: feature table name
+            df: the DataFrame to write
+            mode: write mode (append by default)
+            checkpoint_location: checkpoint location for streaming (optional)
+            trigger: streaming trigger configuration (system preset by default)
+
+        Returns:
+            a StreamingQuery object for streaming writes, otherwise None
+        """
+
+        return self._feature_table_client.write_table(
+            name=name,
+            df=df,
+            mode=mode,
+            checkpoint_location=checkpoint_location,
+            trigger=trigger,
+        )
 
     def create_training_set(
         self,
@@ -106,7 +155,6 @@
         exclude_columns: Optional[List[str]] = None,
         **kwargs,
     ) -> TrainingSet:
-
         """
         Create a training set
 
@@ -123,13 +171,22 @@
         if exclude_columns is None:
             exclude_columns = []
 
+        # For FeatureLookup entries, validate table_name and build the full table name
+        for feature in feature_lookups:
+            if isinstance(feature, FeatureLookup):
+                if not feature.table_name:
+                    raise ValueError("FeatureLookup must specify a table_name")
+                # First validate that the table name format is legal
+                common_utils.validate_table_name(feature.table_name)
+                # Then build the full table name
+                feature.table_name = common_utils.build_full_table_name(feature.table_name)
+
         features = feature_lookups
         del feature_lookups
 
         features = format_feature_lookups_and_functions(self._spark_client, features)
-
-
-        return training_set_client.create_training_set_from_feature_lookups(
+
+        return self._training_set_client.create_training_set_from_feature_lookups(
             df=df,
             feature_lookups=features,
             label=label,
@@ -137,33 +194,48 @@
             **kwargs
         )
 
-    def …
+    def log_model(
         self,
-… (10 removed lines; content not captured in this extraction)
-        Args:
-            name: feature table name
-            df: the DataFrame to write
-            mode: write mode (append by default)
-            checkpoint_location: checkpoint location for streaming (optional)
-            trigger: streaming trigger configuration (system preset by default)
-
-        Returns:
-            a StreamingQuery object for streaming writes, otherwise None
+        model: Any,
+        artifact_path: str,
+        *,
+        flavor: ModuleType,
+        training_set: Optional[TrainingSet] = None,
+        registered_model_name: Optional[str] = None,
+        await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
+        infer_input_example: bool = False,
+        **kwargs,
+    ):
         """
+        Log an MLflow model and associate it with feature lookup information
+
+        Note: the model must be trained on the DataFrame returned by TrainingSet.load_df;
+        any modification of that DataFrame (normalization, added columns, etc.) is not applied at inference time
+
+        Args:
+            model: the model object to log
+            artifact_path: the model storage path
+            flavor: the MLflow model flavor module (e.g. mlflow.sklearn)
+            training_set: the TrainingSet object used to train the model (optional)
+            registered_model_name: the model name to register (optional)
+            await_registration_for: seconds to wait for model registration to complete (default 300)
+            infer_input_example: whether to automatically log an input example (default False)
+
+        Returns:
+            None
+        """
+
+        self._training_set_client.log_model(
+            model=model,
+            artifact_path=artifact_path,
+            flavor=flavor,
+            training_set=training_set,
+            registered_model_name=registered_model_name,
+            await_registration_for=await_registration_for,
+            infer_input_example=infer_input_example,
+            **kwargs
+        )
 
-… (3 removed lines; content not captured in this extraction)
-            mode=mode,
-            checkpoint_location=checkpoint_location,
-            trigger=trigger,
-        )
+    @property
+    def spark(self):
+        return self._spark
```
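Taken together, the client.py hunks above move every import under the `wedata.` namespace and grow the public client surface with `get_table`, `write_table`, `log_model`, and a `spark` property. The sketch below exercises that 0.1.6 surface; it is based only on the signatures visible in this diff, and the Spark session setup, the table name, the columns, and the `lookup_key` argument to `FeatureLookup` are illustrative assumptions rather than package documentation.

```python
from pyspark.sql import SparkSession

# 0.1.4 imported these as `feature_store.*`; 0.1.6 nests them under `wedata.`.
from wedata.feature_store.client import FeatureStoreClient
from wedata.feature_store.entities.feature_lookup import FeatureLookup

spark = SparkSession.builder.getOrCreate()
client = FeatureStoreClient(spark)

# Batch write; mode defaults to APPEND. For a streaming DataFrame the same
# call returns a StreamingQuery instead of None.
df = spark.createDataFrame([(1, 0.5)], ["user_id", "score"])  # hypothetical data
client.write_table(name="user_features", df=df)

# get_table is new in 0.1.6 and returns the FeatureTable entity added in this release.
table = client.get_table("user_features")

# create_training_set now validates each FeatureLookup.table_name and expands
# it to a fully qualified name before delegating to TrainingSetClient.
training_set = client.create_training_set(
    df=df,
    feature_lookups=[FeatureLookup(table_name="user_features", lookup_key="user_id")],
    label="score",
)
```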
{feature_store → wedata/feature_store}/constants/constants.py

```diff
@@ -26,3 +26,22 @@ _NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
 _USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
 _PREBUILT_ENV_URI = "prebuilt_env_uri"
 
+# MLflow model constants (formerly mlflow_model_constants.py)
+# Module name of the original mlflow_model
+MLFLOW_MODEL_NAME = "databricks.feature_store.mlflow_model"
+
+# FeatureStoreClient.log_model logs models that contain a 'raw_model' folder,
+# which stores the original model's MLmodel file and is used for inference
+RAW_MODEL_FOLDER = "raw_model"
+
+# MLmodel file name constant
+ML_MODEL = "MLmodel"
+
+# PyPI package name of the feature lookup client
+FEATURE_LOOKUP_CLIENT_PIP_PACKAGE = "databricks-feature-lookup"
+
+# Major version of the feature lookup client
+FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = 1
+
+# Internal data directory of the feature store
+FEATURE_STORE_INTERNAL_DATA_DIR = "_databricks_internal/"
```
{feature_store → wedata/feature_store}/entities/column_info.py

```diff
@@ -1,11 +1,11 @@
 import copy
 from typing import Optional, Union
 
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
     ON_DEMAND_COLUMN_INFO
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
 
 
 class ColumnInfo:
```
{feature_store → wedata/feature_store}/entities/feature_lookup.py

```diff
@@ -3,7 +3,7 @@ import datetime
 import logging
 from typing import Dict, List, Optional, Union
 
-from feature_store.utils import common_utils
+from wedata.feature_store.utils import common_utils
 
 _logger = logging.getLogger(__name__)
 
@@ -177,3 +177,7 @@ class FeatureLookup:
                 f'The output_name parameter is deprecated. Use "rename_outputs".'
             )
             self._rename_outputs[self._feature_names[0]] = self._output_name_deprecated
+
+    @table_name.setter
+    def table_name(self, value):
+        self._table_name = value
```
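The `table_name` setter added here is what lets `create_training_set` (see the client.py hunk above) rewrite a lookup's table name in place. A minimal sketch of that normalization path, assuming `validate_table_name` raises on malformed names and `build_full_table_name` prefixes a default qualifier — behavior inferred from the helper names, not shown in this diff:

```python
from wedata.feature_store.entities.feature_lookup import FeatureLookup
from wedata.feature_store.utils import common_utils

lookup = FeatureLookup(table_name="user_features", lookup_key="user_id")  # hypothetical lookup

# Validate the short name first, then qualify it; the assignment works only
# because of the table_name setter introduced in this hunk.
common_utils.validate_table_name(lookup.table_name)
lookup.table_name = common_utils.build_full_table_name(lookup.table_name)
```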
{feature_store → wedata/feature_store}/entities/feature_spec.py

```diff
@@ -6,9 +6,9 @@ from databricks.sdk.service.catalog import FunctionInfo
 from google.protobuf.json_format import MessageToDict, ParseDict
 from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
 
-from feature_store.entities.column_info import ColumnInfo
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_spec_constants import (
+from wedata.feature_store.entities.column_info import ColumnInfo
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_spec_constants import (
     BOUND_TO,
     DATA_TYPE,
     FEATURE_COLUMN_INFO,
@@ -31,10 +31,10 @@ from feature_store.entities.feature_spec_constants import (
     TRAINING_DATA,
     UDF_NAME,
 )
-from feature_store.entities.feature_table_info import FeatureTableInfo
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-from feature_store.utils import common_utils
+from wedata.feature_store.entities.feature_table_info import FeatureTableInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+from wedata.feature_store.utils import common_utils
 
 # Change log for serialization version. Please update for each serialization version.
 # 1. Initial.
@@ -383,45 +383,45 @@ class FeatureSpec:
             column_info_dict[INCLUDE] = column_data.pop(INCLUDE)
         return column_info_dict
 
-… (39 removed lines; content not captured in this extraction)
+    @classmethod
+    def _from_dict(cls, spec_dict):
+        """
+        Convert YAML artifact to FeatureSpec. Transforms YAML artifact to dict keyed by
+        source_data_column_info or feature_column_info, such that ParseDict can convert the dict to
+        a proto message, and from_proto can convert the proto message to a FeatureSpec object
+        :return: :py:class:`~databricks.ml_features_common.entities.feature_spec.FeatureSpec`
+        """
+        if INPUT_COLUMNS not in spec_dict:
+            raise ValueError(
+                f"{INPUT_COLUMNS} must be a key in {cls.FEATURE_ARTIFACT_FILE}."
+            )
+        if not spec_dict[INPUT_COLUMNS]:
+            raise ValueError(
+                f"{INPUT_COLUMNS} in {cls.FEATURE_ARTIFACT_FILE} must be non-empty."
+            )
+        spec_dict[INPUT_COLUMNS] = [
+            cls._input_columns_yaml_to_proto_dict(column_info)
+            for column_info in spec_dict[INPUT_COLUMNS]
+        ]
+
+        # feature_spec.yaml doesn't include input_tables, input_functions if any are true:
+        # 1. The YAML is written by an older client that does not support the functionality.
+        # 2. The FeatureSpec does not contain FeatureLookups (input_tables), FeatureFunctions (input_functions).
+        input_tables = []
+        for input_table in spec_dict.get(INPUT_TABLES, []):
+            table_name, attributes = list(input_table.items())[0]
+            input_tables.append({TABLE_NAME: table_name, **attributes})
+        spec_dict[INPUT_TABLES] = input_tables
+
+        input_functions = []
+        for input_function in spec_dict.get(INPUT_FUNCTIONS, []):
+            udf_name, attributes = list(input_function.items())[0]
+            input_functions.append({UDF_NAME: udf_name, **attributes})
+        spec_dict[INPUT_FUNCTIONS] = input_functions
+
+        return cls.from_proto(
+            ParseDict(spec_dict, ProtoFeatureSpec(), ignore_unknown_fields=True)
+        )
 
     @classmethod
     def _read_file(cls, path: str):
```
wedata/feature_store/entities/feature_table.py

```diff
@@ -0,0 +1,107 @@
+from typing import Dict
+
+
+class FeatureTable:
+    """
+    Feature table entity class
+    """
+
+    def __init__(
+        self,
+        name,
+        table_id,
+        description,
+        primary_keys,
+        partition_columns,
+        features,
+        creation_timestamp=None,
+        online_stores=None,
+        notebook_producers=None,
+        job_producers=None,
+        table_data_sources=None,
+        path_data_sources=None,
+        custom_data_sources=None,
+        timestamp_keys=None,
+        tags=None,
+    ):
+        """Initialize a FeatureTable object."""
+        """Initialize the feature table object
+
+        :param name: table name
+        :param table_id: table ID
+        :param description: description
+        :param primary_keys: list of primary keys
+        :param partition_columns: list of partition columns
+        :param features: list of feature columns
+        :param creation_timestamp: creation timestamp (optional)
+        :param online_stores: online store configuration (optional)
+        :param notebook_producers: list of notebook producers (optional)
+        :param job_producers: list of job producers (optional)
+        :param table_data_sources: list of table data sources (optional)
+        :param path_data_sources: list of path data sources (optional)
+        :param custom_data_sources: list of custom data sources (optional)
+        :param timestamp_keys: list of timestamp keys (optional)
+        :param tags: tag dictionary (optional)
+        """
+        self.name = name
+        self.table_id = table_id
+        self.description = description
+        self.primary_keys = primary_keys
+        self.partition_columns = partition_columns
+        self.features = features
+        self.creation_timestamp = creation_timestamp
+        self.online_stores = online_stores if online_stores is not None else []
+        self.notebook_producers = (
+            notebook_producers if notebook_producers is not None else []
+        )
+        self.job_producers = job_producers if job_producers is not None else []
+        self.table_data_sources = (
+            table_data_sources if table_data_sources is not None else []
+        )
+        self.path_data_sources = (
+            path_data_sources if path_data_sources is not None else []
+        )
+        self.custom_data_sources = (
+            custom_data_sources if custom_data_sources is not None else []
+        )
+        self.timestamp_keys = timestamp_keys if timestamp_keys is not None else []
+        self._tags = tags
+
+    def __str__(self):
+        """
+        Return the string representation of this feature table instance, covering all key attributes
+
+        Returns:
+            a formatted string including the table name, ID, description, primary keys, partition columns,
+            feature count, timestamp keys, creation time, data source counts and tag count
+        """
+        return (
+            f"FeatureTable(\n"
+            f" name='{self.name}',\n"
+            f" table_id='{self.table_id}',\n"
+            f" description='{self.description[:50]}{'...' if len(self.description) > 50 else ''}',\n"
+            f" primary_keys={self.primary_keys},\n"
+            f" partition_columns={self.partition_columns},\n"
+            f" features={len(self.features)},\n"
+            f" timestamp_keys={self.timestamp_keys},\n"
+            f" creation_timestamp={self.creation_timestamp},\n"
+            f" data_sources=[table:{len(self.table_data_sources)} "
+            f"path:{len(self.path_data_sources)} custom:{len(self.custom_data_sources)}],\n"
+            f" tags={len(self.tags) if self._tags else 0}\n"
+            f")"
+        )
+
+    @property
+    def tags(self) -> Dict[str, str]:
+        """
+        Get the tags associated with the feature table.
+
+        :return a Dictionary of all tags associated with the feature table as key/value pairs
+        """
+        if self._tags is None:
+            # If no tags are set, self._tags is expected an empty dictionary.
+            raise ValueError(
+                "Internal error: tags have not been fetched for this FeatureTable instance"
+            )
+        return self._tags
+
```
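Since FeatureTable is a plain entity class, its behavior is easy to see in isolation. A small sketch with hypothetical field values; note that the `tags` property raises rather than returning None when tags were never fetched:

```python
from wedata.feature_store.entities.feature_table import FeatureTable

table = FeatureTable(
    name="user_features",          # all values below are hypothetical
    table_id="tbl-0001",
    description="Demo feature table",
    primary_keys=["user_id"],
    partition_columns=[],
    features=["score"],
    tags={"owner": "data-team"},
)

print(table)       # __str__ summarizes keys, features, data sources and tag count
print(table.tags)  # {'owner': 'data-team'}; ValueError if constructed with tags=None
```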
{feature_store → wedata/feature_store}/entities/training_set.py

```diff
@@ -2,19 +2,20 @@ from typing import Dict, List, Optional
 
 from pyspark.sql import DataFrame
 
-from feature_store.entities.feature_table import FeatureTable
-from feature_store.entities.function_info import FunctionInfo
-from feature_store.utils.feature_lookup_utils import (
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.function_info import FunctionInfo
+from wedata.feature_store.utils.feature_lookup_utils import (
     join_feature_data_if_not_overridden,
 )
 
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.utils.feature_spec_utils import (
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.utils.feature_spec_utils import (
     COLUMN_INFO_TYPE_FEATURE,
     COLUMN_INFO_TYPE_ON_DEMAND,
     COLUMN_INFO_TYPE_SOURCE,
     get_feature_execution_groups,
 )
+from wedata.feature_store.utils.on_demand_utils import apply_functions_if_not_overridden
 
 
 class TrainingSet:
@@ -89,13 +90,13 @@ class TrainingSet:
                 feature_table_data_map=self._feature_table_data_map,
                 use_spark_native_join=self._use_spark_native_join,
             )
-… (7 removed lines; content not captured in this extraction)
+            elif execution_group.type == COLUMN_INFO_TYPE_ON_DEMAND:
+                # Apply all on-demand UDFs
+                result_df = apply_functions_if_not_overridden(
+                    df=result_df,
+                    functions_to_apply=execution_group.features,
+                    uc_function_infos=self._uc_function_infos,
+                )
             else:
                 # This should never be reached.
                 raise Exception("Unknown feature execution type:", execution_group.type)
```