wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wedata/__init__.py +6 -0
- wedata/feature_store/__init__.py +0 -0
- wedata/feature_store/client.py +169 -0
- wedata/feature_store/constants/__init__.py +0 -0
- wedata/feature_store/constants/constants.py +28 -0
- wedata/feature_store/entities/__init__.py +0 -0
- wedata/feature_store/entities/column_info.py +117 -0
- wedata/feature_store/entities/data_type.py +92 -0
- wedata/feature_store/entities/environment_variables.py +55 -0
- wedata/feature_store/entities/feature.py +53 -0
- wedata/feature_store/entities/feature_column_info.py +64 -0
- wedata/feature_store/entities/feature_function.py +55 -0
- wedata/feature_store/entities/feature_lookup.py +179 -0
- wedata/feature_store/entities/feature_spec.py +454 -0
- wedata/feature_store/entities/feature_spec_constants.py +25 -0
- wedata/feature_store/entities/feature_table.py +164 -0
- wedata/feature_store/entities/feature_table_info.py +40 -0
- wedata/feature_store/entities/function_info.py +184 -0
- wedata/feature_store/entities/on_demand_column_info.py +44 -0
- wedata/feature_store/entities/source_data_column_info.py +21 -0
- wedata/feature_store/entities/training_set.py +134 -0
- wedata/feature_store/feature_table_client/__init__.py +0 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +313 -0
- wedata/feature_store/spark_client/__init__.py +0 -0
- wedata/feature_store/spark_client/spark_client.py +286 -0
- wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata/feature_store/training_set_client/training_set_client.py +196 -0
- wedata/feature_store/utils/__init__.py +0 -0
- wedata/feature_store/utils/common_utils.py +96 -0
- wedata/feature_store/utils/feature_lookup_utils.py +570 -0
- wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata/feature_store/utils/feature_utils.py +73 -0
- wedata/feature_store/utils/schema_utils.py +117 -0
- wedata/feature_store/utils/topological_sort.py +158 -0
- wedata/feature_store/utils/training_set_utils.py +580 -0
- wedata/feature_store/utils/uc_utils.py +281 -0
- wedata/feature_store/utils/utils.py +252 -0
- wedata/feature_store/utils/validation_utils.py +55 -0
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.5.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.5.dist-info/RECORD +79 -0
- wedata_feature_engineering-0.1.5.dist-info/top_level.txt +1 -0
- wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
- wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.5.dist-info}/WHEEL +0 -0
wedata/__init__.py
ADDED
File without changes
wedata/feature_store/client.py
ADDED
@@ -0,0 +1,169 @@
+"""
+Wedata FeatureStoreClient Python implementation
+"""
+
+from __future__ import annotations
+from typing import Union, List, Dict, Optional, Any
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.streaming import StreamingQuery
+from pyspark.sql.types import StructType
+
+from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
+from feature_store.entities.feature_function import FeatureFunction
+from feature_store.entities.feature_lookup import FeatureLookup
+from feature_store.entities.training_set import TrainingSet
+from feature_store.feature_table_client.feature_table_client import FeatureTableClient
+from feature_store.spark_client.spark_client import SparkClient
+from feature_store.training_set_client.training_set_client import TrainingSetClient
+from feature_store.utils.feature_utils import format_feature_lookups_and_functions
+
+
+class FeatureStoreClient:
+    """Unified feature store client providing full-lifecycle feature management."""
+
+    def __init__(self, spark: SparkSession):
+        """
+        :param spark: An already initialized SparkSession object
+        """
+        self._spark = spark
+        self._spark_client = SparkClient(spark)
+        self._feature_table_client = FeatureTableClient(spark)
+
+    def create_table(
+        self,
+        name: str,
+        primary_keys: Union[str, List[str]],
+        df: Optional[DataFrame] = None,
+        *,
+        timestamp_keys: Union[str, List[str], None] = None,
+        partition_columns: Union[str, List[str], None] = None,
+        schema: Optional[StructType] = None,
+        description: Optional[str] = None,
+        tags: Optional[Dict[str, str]] = None
+    ):
+        """
+        Create a feature table (supports batch and streaming writes).
+
+        Args:
+            name: Full name of the feature table (format: <table>)
+            primary_keys: Primary key column(s) (composite keys supported)
+            df: Initial data (optional, used to infer the schema)
+            timestamp_keys: Timestamp key(s) (for temporal features)
+            partition_columns: Partition columns (to optimize storage and queries)
+            description: Business description
+            tags: Business tags
+
+        Returns:
+            A FeatureTable instance
+
+        Raises:
+            ValueError: If the schema does not match the data
+        """
+
+        return self._feature_table_client.create_table(
+            name=name,
+            primary_keys=primary_keys,
+            df=df,
+            timestamp_keys=timestamp_keys,
+            partition_columns=partition_columns,
+            schema=schema,
+            description=description,
+            tags=tags
+        )
+
+
+    def read_table(self, name: str) -> DataFrame:
+        """
+        Read data from a feature table.
+
+        Args:
+            name: Feature table name
+
+        Returns:
+            DataFrame: A DataFrame containing the feature table data
+        """
+        return self._feature_table_client.read_table(name)
+
+
+    def drop_table(self, name: str) -> None:
+        """
+        Drop a feature table.
+
+        Args:
+            name: Name of the feature table to drop
+
+        Returns:
+            None
+        """
+        return self._feature_table_client.drop_table(name)
+
+
+    def create_training_set(
+        self,
+        df: DataFrame,
+        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
+        label: Union[str, List[str], None],
+        exclude_columns: Optional[List[str]] = None,
+        **kwargs,
+    ) -> TrainingSet:
+
+        """
+        Create a training set.
+
+        Args:
+            df: Base DataFrame
+            feature_lookups: List of feature lookups
+            label: Label column name(s)
+            exclude_columns: Column names to exclude
+
+        Returns:
+            A TrainingSet instance
+        """
+
+        if exclude_columns is None:
+            exclude_columns = []
+
+        features = feature_lookups
+        del feature_lookups
+
+        features = format_feature_lookups_and_functions(self._spark_client, features)
+        # Create a TrainingSetClient instance
+        training_set_client = TrainingSetClient(self._spark_client)
+        return training_set_client.create_training_set_from_feature_lookups(
+            df=df,
+            feature_lookups=features,
+            label=label,
+            exclude_columns=exclude_columns,
+            **kwargs
+        )
+
+    def write_table(
+        self,
+        name: str,
+        df: DataFrame,
+        mode: str = APPEND,
+        checkpoint_location: Optional[str] = None,
+        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
+    ) -> Optional[StreamingQuery]:
+
+        """
+        Write data to a feature table (supports batch and streaming).
+
+        Args:
+            name: Feature table name
+            df: DataFrame of data to write
+            mode: Write mode (defaults to append)
+            checkpoint_location: Checkpoint location for streaming writes (optional)
+            trigger: Streaming trigger configuration (defaults to the preset)
+
+        Returns:
+            A StreamingQuery object for streaming writes, otherwise None
+        """
+
+        return self._feature_table_client.write_table(
+            name=name,
+            df=df,
+            mode=mode,
+            checkpoint_location=checkpoint_location,
+            trigger=trigger,
+        )
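
For orientation, the following is a minimal usage sketch based only on the signatures above. The table and column names are hypothetical, the FeatureLookup arguments are assumed from the Databricks-style API this package mirrors (feature_lookup.py is listed but its contents are not displayed in this diff), and the import paths simply mirror the module's own feature_store-prefixed imports, so they may need adjusting to the installed package layout.

    from pyspark.sql import SparkSession

    from feature_store.client import FeatureStoreClient            # import path assumed
    from feature_store.entities.feature_lookup import FeatureLookup

    spark = SparkSession.builder.getOrCreate()
    fs = FeatureStoreClient(spark)

    # Create a feature table from an existing DataFrame; the schema is inferred from df.
    user_df = spark.createDataFrame([(1, 0.5), (2, 0.9)], ["user_id", "score"])
    fs.create_table(
        name="user_features",                # hypothetical table name
        primary_keys="user_id",
        df=user_df,
        description="Example feature table",
    )

    # Join looked-up features onto a label DataFrame to build a training set.
    label_df = spark.createDataFrame([(1, 1), (2, 0)], ["user_id", "label"])
    training_set = fs.create_training_set(
        df=label_df,
        feature_lookups=[
            # Constructor arguments assumed; FeatureLookup is not shown in this excerpt.
            FeatureLookup(table_name="user_features", lookup_key="user_id"),
        ],
        label="label",
    )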
File without changes
wedata/feature_store/constants/constants.py
ADDED
@@ -0,0 +1,28 @@
+
+OVERWRITE = "overwrite"
+APPEND = "append"
+PATH = "path"
+TABLE = "table"
+CUSTOM = "custom"
+PREDICTION_COLUMN_NAME = "prediction"
+MODEL_DATA_PATH_ROOT = "feature_store"
+UTF8_BYTES_PER_CHAR = 4
+MAX_PRIMARY_KEY_STRING_LENGTH_CHARS = 100
+MAX_PRIMARY_KEY_STRING_LENGTH_BYTES = (
+    MAX_PRIMARY_KEY_STRING_LENGTH_CHARS * UTF8_BYTES_PER_CHAR
+)
+STREAMING_TRIGGER_CONTINUOUS = "continuous"
+STREAMING_TRIGGER_ONCE = "once"
+STREAMING_TRIGGER_PROCESSING_TIME = "processingTime"
+DEFAULT_WRITE_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 seconds"}
+_DEFAULT_PUBLISH_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 minutes"}
+
+
+_WARN = "WARN"
+_ERROR = "ERROR"
+_SOURCE_FORMAT_DELTA = "delta"
+
+_NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
+_USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
+_PREBUILT_ENV_URI = "prebuilt_env_uri"
+
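
The trigger constants are plain dictionaries whose keys match keyword arguments of Spark's DataStreamWriter.trigger(), so they can be splatted straight into that call; a small illustration of the shape (the streaming source, sink, and checkpoint path below are hypothetical, and the key-length constants work out to 100 * 4 = 400 bytes):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    stream_df = spark.readStream.format("rate").load()   # hypothetical streaming source

    trigger = {"processingTime": "5 seconds"}             # same shape as DEFAULT_WRITE_STREAM_TRIGGER
    query = (
        stream_df.writeStream
        .outputMode("append")                              # the APPEND mode constant
        .trigger(**trigger)                                # unpacks to trigger(processingTime="5 seconds")
        .option("checkpointLocation", "/tmp/example_checkpoint")
        .format("console")
        .start()
    )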
File without changes
wedata/feature_store/entities/column_info.py
ADDED
@@ -0,0 +1,117 @@
+import copy
+from typing import Optional, Union
+
+from feature_store.entities.feature_column_info import FeatureColumnInfo
+from feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
+    ON_DEMAND_COLUMN_INFO
+from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+
+class ColumnInfo:
+    """
+    ColumnInfo's structure and properties are mapped 1:1 to the ColumnInfo proto message, unless specified otherwise.
+    """
+
+    def __init__(
+        self,
+        info: Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo],
+        include: bool,
+        data_type: Optional[str] = None,
+        topological_ordering: Optional[int] = None,
+    ):
+        if not isinstance(
+            info, (SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo)
+        ):
+            raise ValueError(
+                "info must be one of SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo."
+            )
+        self._info = info
+        self._include = include
+        self._data_type = data_type
+        self._topological_ordering = topological_ordering
+
+    @property
+    def info(
+        self,
+    ) -> Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo]:
+        return self._info
+
+    @property
+    def include(self) -> bool:
+        return self._include
+
+    @property
+    def data_type(self) -> Optional[str]:
+        """
+        FeatureSpecs before v7 are not required to have data types.
+        """
+        return self._data_type
+
+    @property
+    def topological_ordering(self) -> Optional[int]:
+        """
+        FeatureSpecs before v8 are not required to have topological ordering.
+        """
+        return self._topological_ordering
+
+    @property
+    def output_name(self) -> str:
+        """
+        This field does not exist in the proto, and is provided for convenience.
+        """
+        return self.info.output_name
+
+    def with_topological_ordering(self, ordering: int):
+        new_column_info = copy.copy(self)
+        new_column_info._topological_ordering = ordering
+        return new_column_info
+
+    @classmethod
+    def from_proto(cls, column_info_proto):
+        if column_info_proto.HasField(SOURCE_DATA_COLUMN_INFO):
+            info = SourceDataColumnInfo.from_proto(
+                column_info_proto.source_data_column_info
+            )
+        elif column_info_proto.HasField(FEATURE_COLUMN_INFO):
+            info = FeatureColumnInfo.from_proto(column_info_proto.feature_column_info)
+        elif column_info_proto.HasField(ON_DEMAND_COLUMN_INFO):
+            info = OnDemandColumnInfo.from_proto(
+                column_info_proto.on_demand_column_info
+            )
+        else:
+            raise ValueError("Unsupported info type: " + str(column_info_proto))
+
+        data_type = (
+            column_info_proto.data_type
+            if column_info_proto.HasField("data_type")
+            else None
+        )
+        topological_ordering = (
+            column_info_proto.topological_ordering
+            if column_info_proto.HasField("topological_ordering")
+            else None
+        )
+        return ColumnInfo(
+            info=info,
+            include=column_info_proto.include,
+            data_type=data_type,
+            topological_ordering=topological_ordering,
+        )
+
+    # def to_proto(self):
+    #     column_info = ProtoColumnInfo(
+    #         include=self.include,
+    #         data_type=self.data_type,
+    #         topological_ordering=self.topological_ordering,
+    #     )
+    #     if isinstance(self.info, SourceDataColumnInfo):
+    #         column_info.source_data_column_info.CopyFrom(self.info.to_proto())
+    #     elif isinstance(self.info, FeatureColumnInfo):
+    #         column_info.feature_column_info.CopyFrom(self.info.to_proto())
+    #     elif isinstance(self.info, OnDemandColumnInfo):
+    #         column_info.on_demand_column_info.CopyFrom(self.info.to_proto())
+    #     else:
+    #         raise ValueError("Unsupported info type: " + str(self.info))
+    #
+    #     return column_info
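
A short sketch of how these entities compose, using only the two constructors shown in this diff (ColumnInfo here and FeatureColumnInfo further below); the table and column names are hypothetical and the import paths mirror the module's own feature_store-prefixed imports.

    from feature_store.entities.column_info import ColumnInfo
    from feature_store.entities.feature_column_info import FeatureColumnInfo

    feature_info = FeatureColumnInfo(
        table_name="user_features",      # hypothetical names
        feature_name="score",
        lookup_key=["user_id"],
        output_name="score",
    )

    col = ColumnInfo(info=feature_info, include=True, data_type="DOUBLE")
    ordered = col.with_topological_ordering(0)   # returns a copy; the original is unchanged

    print(ordered.output_name)            # "score", delegated to the wrapped info object
    print(ordered.topological_ordering)   # 0
    print(col.topological_ordering)       # None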
wedata/feature_store/entities/data_type.py
ADDED
@@ -0,0 +1,92 @@
+import json
+import re
+from typing import Any
+
+from pyspark.sql.types import ArrayType, DataType, DecimalType, MapType, StructType
+
+
+
+class DataType(_ProtoEnumEntity):
+    """Online store types."""
+
+    INTEGER = ProtoDataType.Value("INTEGER")
+    FLOAT = ProtoDataType.Value("FLOAT")
+    BOOLEAN = ProtoDataType.Value("BOOLEAN")
+    STRING = ProtoDataType.Value("STRING")
+    DOUBLE = ProtoDataType.Value("DOUBLE")
+    LONG = ProtoDataType.Value("LONG")
+    TIMESTAMP = ProtoDataType.Value("TIMESTAMP")
+    DATE = ProtoDataType.Value("DATE")
+    SHORT = ProtoDataType.Value("SHORT")
+    ARRAY = ProtoDataType.Value("ARRAY")
+    MAP = ProtoDataType.Value("MAP")
+    BINARY = ProtoDataType.Value("BINARY")
+    DECIMAL = ProtoDataType.Value("DECIMAL")
+    STRUCT = ProtoDataType.Value("STRUCT")
+
+    _FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
+
+    @classmethod
+    def _enum_type(cls) -> Any:
+        return ProtoDataType
+
+    @classmethod
+    def from_spark_type(cls, spark_type):
+        return cls.from_string(spark_type.typeName())
+
+    @classmethod
+    def spark_type_to_string(cls, spark_type):
+        return DataType.to_string(DataType.from_spark_type(spark_type))
+
+    @classmethod
+    def top_level_type_supported(cls, spark_type: DataType) -> bool:
+        """
+        Checks whether the provided Spark data type is supported by Feature Store, only considering
+        the top-level type for nested data types.
+
+        Details on nested types:
+          ArrayType: The elementType is not checked. Will return True.
+          MapType: The keyType and valueType are not checked. Will return True.
+          StructType: The struct fields are not checked. Will return True.
+        """
+        cls.init()
+        return spark_type.typeName().upper() in cls._STRING_TO_ENUM
+
+    @classmethod
+    def to_complex_spark_type(cls, json_value):
+        """
+        Constructs a complex Spark DataType from its compact JSON representation.
+
+        Examples:
+            - Input: '"decimal(1,2)"'
+              Output: DecimalType(1,2)
+            - Input: '{"containsNull":false,"elementType":"integer","type":"array"}'
+              Output: ArrayType(IntegerType,false)
+            - Input: '{"keyType":"integer","type":"map","valueContainsNull":True,"valueType":"integer"}'
+              Output: MapType(IntegerType,IntegerType,true)
+        """
+        if not json_value:
+            raise ValueError("Empty JSON value cannot be converted to Spark DataType")
+
+        json_data = json.loads(json_value)
+        if not isinstance(json_data, dict):
+            # DecimalType does not have fromJson() method
+            if json_value == "decimal":
+                return DecimalType()
+            if cls._FIXED_DECIMAL.match(json_data):
+                m = cls._FIXED_DECIMAL.match(json_data)
+                return DecimalType(int(m.group(1)), int(m.group(2)))
+
+        if json_data["type"].upper() == cls.to_string(cls.ARRAY):
+            return ArrayType.fromJson(json_data)
+
+        if json_data["type"].upper() == cls.to_string(cls.MAP):
+            return MapType.fromJson(json_data)
+
+        if json_data["type"].upper() == cls.to_string(cls.STRUCT):
+            return StructType.fromJson(json_data)
+
+        else:
+            raise ValueError(
+                f"Spark type {json_data['type']} cannot be converted to a complex Spark DataType"
+            )
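
DataType above references proto helpers (_ProtoEnumEntity, ProtoDataType) that are not part of this diff, so the class is not runnable from this excerpt alone; the complex-type conversions its docstring describes reduce to standard pyspark.sql.types calls, sketched below (exact reprs vary slightly by pyspark version).

    import json

    from pyspark.sql.types import ArrayType, MapType, StructType

    array_json = '{"containsNull":false,"elementType":"integer","type":"array"}'
    print(ArrayType.fromJson(json.loads(array_json)))    # ArrayType(IntegerType(), False)

    map_json = '{"keyType":"integer","type":"map","valueContainsNull":true,"valueType":"integer"}'
    print(MapType.fromJson(json.loads(map_json)))        # MapType(IntegerType(), IntegerType(), True)

    struct_json = '{"type":"struct","fields":[{"name":"a","type":"integer","nullable":true,"metadata":{}}]}'
    print(StructType.fromJson(json.loads(struct_json)))  # StructType([StructField('a', IntegerType(), True)])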
wedata/feature_store/entities/environment_variables.py
ADDED
@@ -0,0 +1,55 @@
+import os
+
+
+class _EnvironmentVariable:
+    """
+    Represents an environment variable for the feature store client for custom configurations as needed.
+    """
+
+    def __init__(self, name, type_, default):
+        self.name = name
+        self.type = type_
+        self.default = default
+
+    @property
+    def defined(self):
+        return self.name in os.environ
+
+    def get_raw(self):
+        return os.getenv(self.name)
+
+    def set(self, value):
+        os.environ[self.name] = str(value)
+
+    def unset(self):
+        os.environ.pop(self.name, None)
+
+    def get(self):
+        """
+        Reads the value of the environment variable if it exists and converts it to the desired
+        type. Otherwise, returns the default value.
+        """
+        if (val := self.get_raw()) is not None:
+            try:
+                return self.type(val)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to convert {val!r} to {self.type} for {self.name}: {e}"
+                )
+        return self.default
+
+    def __str__(self):
+        return f"{self.name} (default: {self.default}, type: {self.type.__name__})"
+
+    def __repr__(self):
+        return repr(self.name)
+
+    def __format__(self, format_spec: str) -> str:
+        return self.name.__format__(format_spec)
+
+
+# The threshold where a broadcast join will be performed for the asof join for point in time feature join.
+# Default is 20MB as benchmarks show diminishing returns with broadcast past this value. The default Spark broadcast join threshold is 10MB.
+BROADCAST_JOIN_THRESHOLD = _EnvironmentVariable(
+    "BROADCAST_JOIN_THRESHOLD", int, 20 * 1024 * 1024
+)
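
The accessors above can be exercised directly, since they only touch os.environ; a brief sketch (the import path mirrors the module's own feature_store-prefixed imports):

    from feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD

    print(BROADCAST_JOIN_THRESHOLD.defined)   # False unless the variable is already set
    print(BROADCAST_JOIN_THRESHOLD.get())     # 20971520, the 20 * 1024 * 1024 default

    BROADCAST_JOIN_THRESHOLD.set(50 * 1024 * 1024)   # stored in os.environ as a string
    print(BROADCAST_JOIN_THRESHOLD.get())            # 52428800, converted back to int

    BROADCAST_JOIN_THRESHOLD.unset()
    print(BROADCAST_JOIN_THRESHOLD.get())            # back to the default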
wedata/feature_store/entities/feature.py
ADDED
@@ -0,0 +1,53 @@
+
+
+class Feature:
+    def __init__(
+        self,
+        feature_table,
+        feature_id,
+        name,
+        data_type,
+        description,
+        data_type_details=None,
+    ):
+        self._feature_table = feature_table
+        self._name = name
+        self._data_type = data_type
+        self._description = description
+        self._data_type_details = data_type_details
+        self._feature_id = feature_id
+
+    @property
+    def feature_table(self):
+        return self._feature_table
+
+    @property
+    def feature_id(self):
+        return self._feature_id
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def data_type(self):
+        return self._data_type
+
+    @property
+    def data_type_details(self):
+        return self._data_type_details
+
+    @property
+    def description(self):
+        return self._description
+
+    @classmethod
+    def from_proto(cls, feature_proto):
+        return cls(
+            feature_table=feature_proto.table,
+            feature_id=feature_proto.id,
+            name=feature_proto.name,
+            data_type=feature_proto.data_type,
+            data_type_details=feature_proto.data_type_details,
+            description=feature_proto.description,
+        )
wedata/feature_store/entities/feature_column_info.py
ADDED
@@ -0,0 +1,64 @@
+from typing import List, Optional
+
+
+
+class FeatureColumnInfo:
+    def __init__(
+        self,
+        table_name: str,
+        feature_name: str,
+        lookup_key: List[str],
+        output_name: str,
+        timestamp_lookup_key: Optional[List[str]] = None,
+    ):
+        if timestamp_lookup_key is None:
+            timestamp_lookup_key = []
+        if not table_name:
+            raise ValueError("table_name must be non-empty.")
+        if not feature_name:
+            raise ValueError("feature_name must be non-empty.")
+        if not isinstance(lookup_key, list):
+            raise ValueError("lookup_key must be a list.")
+        if not lookup_key or "" in lookup_key or None in lookup_key:
+            raise ValueError("lookup_key must be non-empty.")
+        if not output_name:
+            raise ValueError("output_name must be non-empty.")
+        if not isinstance(timestamp_lookup_key, list):
+            raise ValueError("timestamp_lookup_key must be a list.")
+
+        self._table_name = table_name
+        self._feature_name = feature_name
+        self._lookup_key = lookup_key
+        self._output_name = output_name
+        self._timestamp_lookup_key = timestamp_lookup_key
+
+    @property
+    def table_name(self):
+        return self._table_name
+
+    @property
+    def lookup_key(self):
+        return self._lookup_key
+
+    @property
+    def feature_name(self):
+        return self._feature_name
+
+    @property
+    def output_name(self):
+        return self._output_name
+
+    @property
+    def timestamp_lookup_key(self):
+        return self._timestamp_lookup_key
+
+    @classmethod
+    def from_proto(cls, feature_column_info_proto):
+        return cls(
+            table_name=feature_column_info_proto.table_name,
+            feature_name=feature_column_info_proto.feature_name,
+            lookup_key=list(feature_column_info_proto.lookup_key),
+            output_name=feature_column_info_proto.output_name,
+            timestamp_lookup_key=list(feature_column_info_proto.timestamp_lookup_key),
+        )
+
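
The constructor validation above can be exercised directly; the column names below are hypothetical.

    from feature_store.entities.feature_column_info import FeatureColumnInfo

    try:
        FeatureColumnInfo(
            table_name="user_features",   # hypothetical
            feature_name="score",
            lookup_key=[],                # an empty lookup_key is rejected
            output_name="score",
        )
    except ValueError as err:
        print(err)   # lookup_key must be non-empty.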
wedata/feature_store/entities/feature_function.py
ADDED
@@ -0,0 +1,55 @@
+from typing import Dict, Optional
+
+
+class FeatureFunction:
+
+    """
+    Feature function.
+
+    A feature function is a user-defined function that combines features from feature tables into new features. It can be any user-defined function, for example a Python UDF.
+
+    A feature function has the following attributes:
+    - udf_name: the name of the Python UDF to call.
+    - input_bindings: a dictionary mapping the Python UDF's inputs to features in the training set.
+    - output_name: if provided, the feature is renamed to this name in the :class:`TrainingSet <databricks.ml_features.training_set.TrainingSet>` returned by :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+
+    """
+
+    def __init__(
+        self,
+        *,
+        udf_name: str,
+        input_bindings: Optional[Dict[str, str]] = None,
+        output_name: Optional[str] = None,
+    ):
+        """Initialize a FeatureFunction object. See class documentation."""
+        # UC function names are always lowercase.
+        self._udf_name = udf_name.lower()
+        self._input_bindings = input_bindings if input_bindings else {}
+        self._output_name = output_name
+
+    @property
+    def udf_name(self) -> str:
+        """
+        The name of the Python UDF called by this FeatureFunction.
+        """
+        return self._udf_name
+
+    @property
+    def input_bindings(self) -> Dict[str, str]:
+        """
+        The input to use for each argument of the Python UDF.
+
+        For example:
+
+        `{"x": "feature1", "y": "input1"}`
+        """
+        return self._input_bindings
+
+    @property
+    def output_name(self) -> Optional[str]:
+        """
+        The output name to use for the results of this FeatureFunction.
+        If empty, defaults to the fully qualified `udf_name` when evaluated.
+        """
+        return self._output_name
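
A minimal construction sketch; the UDF name is hypothetical, and the bindings follow the input_bindings example in the docstring above.

    from feature_store.entities.feature_function import FeatureFunction

    fn = FeatureFunction(
        udf_name="my_catalog.my_schema.Compute_Ratio",   # hypothetical UDF
        input_bindings={"x": "feature1", "y": "input1"},
        output_name="ratio",
    )

    print(fn.udf_name)        # my_catalog.my_schema.compute_ratio, lower-cased by __init__
    print(fn.input_bindings)  # {'x': 'feature1', 'y': 'input1'}
    print(fn.output_name)     # ratio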