tencent-wedata-feature-engineering-dev 0.1.48__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (64)
  1. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/METADATA +14 -3
  2. tencent_wedata_feature_engineering_dev-0.2.5.dist-info/RECORD +78 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/WHEEL +1 -1
  4. wedata/__init__.py +1 -1
  5. wedata/common/base_table_client/__init__.py +1 -0
  6. wedata/common/base_table_client/base.py +58 -0
  7. wedata/common/cloud_sdk_client/__init__.py +2 -0
  8. wedata/{feature_store → common}/cloud_sdk_client/client.py +33 -3
  9. wedata/{feature_store → common}/cloud_sdk_client/models.py +212 -37
  10. wedata/{feature_store → common}/cloud_sdk_client/utils.py +7 -0
  11. wedata/{feature_store → common}/constants/constants.py +3 -2
  12. wedata/common/constants/engine_types.py +34 -0
  13. wedata/{feature_store → common}/entities/column_info.py +6 -5
  14. wedata/{feature_store → common}/entities/feature_column_info.py +2 -1
  15. wedata/{feature_store → common}/entities/feature_lookup.py +1 -1
  16. wedata/{feature_store → common}/entities/feature_spec.py +9 -9
  17. wedata/{feature_store → common}/entities/feature_table_info.py +1 -1
  18. wedata/{feature_store → common}/entities/function_info.py +2 -1
  19. wedata/{feature_store → common}/entities/on_demand_column_info.py +2 -1
  20. wedata/{feature_store → common}/entities/source_data_column_info.py +3 -1
  21. wedata/{feature_store → common}/entities/training_set.py +6 -6
  22. wedata/common/feast_client/__init__.py +1 -0
  23. wedata/{feature_store → common}/feast_client/feast_client.py +1 -1
  24. wedata/common/log/__init__.py +1 -0
  25. wedata/{feature_store/common → common}/log/logger.py +9 -5
  26. wedata/common/spark_client/__init__.py +1 -0
  27. wedata/{feature_store → common}/spark_client/spark_client.py +6 -7
  28. wedata/{feature_store → common}/utils/common_utils.py +7 -9
  29. wedata/{feature_store → common}/utils/env_utils.py +12 -0
  30. wedata/{feature_store → common}/utils/feature_lookup_utils.py +6 -6
  31. wedata/{feature_store → common}/utils/feature_spec_utils.py +13 -8
  32. wedata/{feature_store → common}/utils/feature_utils.py +5 -5
  33. wedata/{feature_store → common}/utils/on_demand_utils.py +5 -4
  34. wedata/{feature_store → common}/utils/schema_utils.py +1 -1
  35. wedata/{feature_store → common}/utils/signature_utils.py +4 -4
  36. wedata/{feature_store → common}/utils/training_set_utils.py +13 -13
  37. wedata/{feature_store → common}/utils/uc_utils.py +1 -1
  38. wedata/feature_engineering/__init__.py +1 -0
  39. wedata/feature_engineering/client.py +417 -0
  40. wedata/feature_engineering/ml_training_client/ml_training_client.py +569 -0
  41. wedata/feature_engineering/mlflow_model.py +9 -0
  42. wedata/feature_engineering/table_client/table_client.py +548 -0
  43. wedata/feature_store/client.py +11 -15
  44. wedata/feature_store/constants/engine_types.py +8 -30
  45. wedata/feature_store/feature_table_client/feature_table_client.py +73 -105
  46. wedata/feature_store/training_set_client/training_set_client.py +12 -23
  47. wedata/tempo/interpol.py +2 -2
  48. tencent_wedata_feature_engineering_dev-0.1.48.dist-info/RECORD +0 -66
  49. {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/top_level.txt +0 -0
  50. /wedata/{feature_store/cloud_sdk_client → common}/__init__.py +0 -0
  51. /wedata/{feature_store/common/log → common/constants}/__init__.py +0 -0
  52. /wedata/{feature_store/common/protos → common/entities}/__init__.py +0 -0
  53. /wedata/{feature_store → common}/entities/environment_variables.py +0 -0
  54. /wedata/{feature_store → common}/entities/feature.py +0 -0
  55. /wedata/{feature_store → common}/entities/feature_function.py +0 -0
  56. /wedata/{feature_store → common}/entities/feature_spec_constants.py +0 -0
  57. /wedata/{feature_store → common}/entities/feature_table.py +0 -0
  58. /wedata/{feature_store/entities → common/protos}/__init__.py +0 -0
  59. /wedata/{feature_store/common → common}/protos/feature_store_pb2.py +0 -0
  60. /wedata/{feature_store/feast_client → common/utils}/__init__.py +0 -0
  61. /wedata/{feature_store → common}/utils/topological_sort.py +0 -0
  62. /wedata/{feature_store → common}/utils/validation_utils.py +0 -0
  63. /wedata/{feature_store/spark_client → feature_engineering/ml_training_client}/__init__.py +0 -0
  64. /wedata/{feature_store/utils → feature_engineering/table_client}/__init__.py +0 -0
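
The bulk of this release is a package reorganization: shared modules move from wedata.feature_store into a new wedata.common package, and a new wedata.feature_engineering package adds a client, an ML training client, and a table client. A minimal sketch of how downstream imports shift under the new layout (paths are taken from the file list above; whether the old wedata.feature_store paths remain importable as aliases is not shown in this diff):

    # 0.1.48 layout: shared helpers lived under wedata.feature_store
    # from wedata.feature_store.spark_client.spark_client import SparkClient
    # from wedata.feature_store.entities.feature_lookup import FeatureLookup

    # 0.2.5 layout: the same modules are imported from wedata.common
    from wedata.common.spark_client import SparkClient
    from wedata.common.entities.feature_lookup import FeatureLookup
    from wedata.common.entities.feature_function import FeatureFunction
    from wedata.common.utils import common_utils, training_set_utils
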
wedata/feature_engineering/ml_training_client/ml_training_client.py (new file, +569 lines)
@@ -0,0 +1,569 @@
+ import logging
+ import os
+ from types import ModuleType
+ from typing import Any, List, Optional, Union, Dict
+
+ import mlflow
+ from mlflow.models import Model
+ from mlflow.utils.file_utils import TempDir, read_yaml
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import struct
+
+ from wedata.common.constants import constants
+ from wedata.common.entities.feature_function import FeatureFunction
+ from wedata.common.entities.feature_lookup import FeatureLookup
+ from wedata.common.entities.feature_spec import FeatureSpec
+ from wedata.common.entities.training_set import TrainingSet
+ from wedata.feature_engineering.mlflow_model import _FeatureEngineeringModelWrapper
+ from wedata.common.spark_client import SparkClient
+ from wedata.common.utils import validation_utils, common_utils, training_set_utils
+ from wedata.common.entities.feature_table import FeatureTable
+
+ from wedata.common.constants.constants import (
+     _NO_RESULT_TYPE_PASSED,
+     _USE_SPARK_NATIVE_JOIN,
+     MODEL_DATA_PATH_ROOT,
+     PREDICTION_COLUMN_NAME,
+     _PREBUILT_ENV_URI
+ )
+
+ from wedata.common.utils import uc_utils
+ from wedata.common.utils.signature_utils import get_mlflow_signature_from_feature_spec, \
+     drop_signature_inputs_and_invalid_params
+
+ _logger = logging.getLogger(__name__)
+
+ FEATURE_SPEC_GRAPH_MAX_COLUMN_INFO = 1000
+
+
+ class MLTrainingClient:
+     def __init__(
+         self,
+         spark_client: SparkClient
+     ):
+         self._spark_client = spark_client
+
+     def create_training_set(
+         self,
+         feature_spec: FeatureSpec,
+         label_names: List[str],
+         df: DataFrame,
+         ft_metadata: training_set_utils._FeatureTableMetadata,
+         kwargs,
+     ):
+         uc_function_infos = training_set_utils.get_uc_function_infos(
+             self._spark_client,
+             {odci.udf_name for odci in feature_spec.on_demand_column_infos},
+         )
+
+         training_set_utils.warn_if_non_photon_for_native_spark(
+             kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
+         )
+         return TrainingSet(
+             feature_spec,
+             df,
+             label_names,
+             ft_metadata.feature_table_metadata_map,
+             ft_metadata.feature_table_data_map,
+             uc_function_infos,
+             kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
+         )
+
+     def create_training_set_from_feature_lookups(
+         self,
+         df: DataFrame,
+         feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
+         label: Union[str, List[str], None],
+         exclude_columns: List[str],
+         **kwargs,
+     ) -> TrainingSet:
+
+         # Split the inputs into feature lookups and feature functions
+         features = feature_lookups
+         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+         # At most MAX_FEATURE_FUNCTIONS FeatureFunctions are supported
+         if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+             raise ValueError(
+                 f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+             )
+
+         # If no label is provided, initialize label_names with an empty list
+         label_names = common_utils.as_list(label, [])
+         del label
+
+         # Validate the DataFrame and labels
+         training_set_utils.verify_df_and_labels(df, label_names, exclude_columns)
+
+         # Fetch feature table metadata
+         ft_metadata = training_set_utils.get_table_metadata(
+             self._spark_client,
+             {fl.table_name for fl in feature_lookups}
+         )
+
+         column_infos = training_set_utils.get_column_infos(
+             feature_lookups,
+             feature_functions,
+             ft_metadata,
+             df_columns=df.columns,
+             label_names=label_names,
+         )
+
+         training_set_utils.validate_column_infos(
+             self._spark_client,
+             ft_metadata,
+             column_infos.source_data_column_infos,
+             column_infos.feature_column_infos,
+             column_infos.on_demand_column_infos,
+             label_names,
+         )
+
+         # Build feature_spec locally for comparison with the feature spec yaml generated by the
+         # FeatureStore backend. This will be removed once the migration is validated.
+         feature_spec = training_set_utils.build_feature_spec(
+             feature_lookups,
+             ft_metadata,
+             column_infos,
+             exclude_columns
+         )
+
+         return self.create_training_set(
+             feature_spec,
+             label_names,
+             df,
+             ft_metadata,
+             kwargs=kwargs,
+         )
+
+
+     def create_feature_spec(
+         self,
+         name: str,
+         features: List[Union[FeatureLookup, FeatureFunction]],
+         sparkClient: SparkClient,
+         exclude_columns: List[str] = [],
+     ) -> FeatureSpec:
+
+         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+         # Maximum of 100 FeatureFunctions is supported
+         if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+             raise ValueError(
+                 f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+             )
+
+         # Get feature table metadata and column infos
+         ft_metadata = training_set_utils.get_table_metadata(
+             self._spark_client,
+             {fl.table_name for fl in feature_lookups}
+         )
+         column_infos = training_set_utils.get_column_infos(
+             feature_lookups,
+             feature_functions,
+             ft_metadata,
+         )
+
+         column_infos = training_set_utils.add_inferred_source_columns(column_infos)
+
+         training_set_utils.validate_column_infos(
+             self._spark_client,
+             ft_metadata,
+             column_infos.source_data_column_infos,
+             column_infos.feature_column_infos,
+             column_infos.on_demand_column_infos,
+         )
+
+         feature_spec = training_set_utils.build_feature_spec(
+             feature_lookups,
+             ft_metadata,
+             column_infos,
+             exclude_columns
+         )
+
+         return feature_spec
+
+
+     def log_model(
+         self,
+         model: Any,
+         artifact_path: str,
+         *,
+         flavor: ModuleType,
+         training_set: Optional[TrainingSet],
+         registered_model_name: Optional[str],
+         model_registry_uri: Optional[str],
+         await_registration_for: int,
+         infer_input_example: bool,
+         **kwargs,
+     ):
+         # Verify that a training_set was provided
+         if training_set is None:
+             raise ValueError(
+                 "'training_set' must be provided."
+             )
+
+         # Get the feature spec and reformat the full table names
+         # training_set.feature_spec is guaranteed to be in 3L format because it comes from FeatureStoreClient.create_training_set
+         feature_spec = uc_utils.get_feature_spec_with_reformat_full_table_names(
+             training_set.feature_spec
+         )
+
+         # Get the label type map
+         label_type_map = training_set._label_data_types
+
+         # Collect all feature column names
+         feature_columns = [
+             feature_column.output_name
+             for feature_column in feature_spec.feature_column_infos
+         ]
+         df_head = training_set.load_df().select(*feature_columns).head()
+
+         # Handle the output schema and params
+         override_output_schema = kwargs.pop("output_schema", None)
+         params = kwargs.pop("params", {})
+         params["result_type"] = params.get("result_type", _NO_RESULT_TYPE_PASSED)
+
+         # Try to derive an MLflow signature
+         try:
+             signature = get_mlflow_signature_from_feature_spec(
+                 feature_spec, label_type_map, override_output_schema, params
+             )
+         except Exception as e:
+             _logger.warning(f"Model could not be logged with a signature: {e}")
+             signature = None
+
+         with TempDir() as tmp_location:
+             # wedata data_path: record table paths instead, iterating over the table names to build an array
+             data_path = os.path.join(tmp_location.path(), "feature_store")
+             os.makedirs(data_path, exist_ok=True)
+
+             # Create the raw MLflow model
+             raw_mlflow_model = Model(
+                 signature=drop_signature_inputs_and_invalid_params(signature)
+             )
+             raw_model_path = os.path.join(data_path, constants.RAW_MODEL_FOLDER)
+
+             # Save the model according to its flavor
+             if flavor.FLAVOR_NAME != mlflow.pyfunc.FLAVOR_NAME:
+                 flavor.save_model(
+                     model, raw_model_path, mlflow_model=raw_mlflow_model, **kwargs
+                 )
+             else:
+                 flavor.save_model(
+                     raw_model_path,
+                     mlflow_model=raw_mlflow_model,
+                     python_model=model,
+                     **kwargs,
+                 )
+
+             # Verify that the model supports the python_function flavor
+             if "python_function" not in raw_mlflow_model.flavors:
+                 raise ValueError(
+                     f"FeatureStoreClient.log_model does not support '{flavor.__name__}' "
+                     f"since it does not have a python_function model flavor."
+                 )
+
+             # Fetch and process the conda environment configuration
+             model_env = raw_mlflow_model.flavors["python_function"][mlflow.pyfunc.ENV]
+             if isinstance(model_env, dict):
+                 # mlflow 2.0 has multiple supported environments
+                 conda_file = model_env[mlflow.pyfunc.EnvType.CONDA]
+             else:
+                 conda_file = model_env
+
+             conda_env = read_yaml(raw_model_path, conda_file)
+             # TODO: the databricks-feature-lookup package is not needed for now; it causes Python environment creation to fail
+             # Check if databricks-feature-lookup version is specified in conda_env
+             lookup_client_version_specified = False
+             for dependency in conda_env.get("dependencies", []):
+                 if isinstance(dependency, dict):
+                     for pip_dep in dependency.get("pip", []):
+                         if pip_dep.startswith(
+                             constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE
+                         ):
+                             lookup_client_version_specified = True
+                             break
+             # TODO: the databricks-feature-lookup package is not needed for now; it causes Python environment creation to fail
+             # If databricks-feature-lookup version is not specified, add default version
+             if not lookup_client_version_specified:
+                 # Get the pip package string for the databricks-feature-lookup client
+                 default_wedata_feature_lookup_pip_package = common_utils.pip_depependency_pinned_version(
+                     pip_package_name=constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE,
+                     version=constants.FEATURE_LOOKUP_CLIENT_MAJOR_VERSION,
+                 )
+                 common_utils.add_mlflow_pip_depependency(
+                     conda_env, default_wedata_feature_lookup_pip_package
+                 )
+
+             # Try to build an input example
+             input_example = None
+             try:
+                 if df_head is not None and infer_input_example:
+                     input_example = df_head.asDict()
+             except Exception:
+                 pass
+
+             feature_spec.save(data_path)
+
+             print(f'artifact_path:{artifact_path},data_path:{data_path},conda_env:{conda_env},'
+                   f'signature:{signature},input_example:{input_example}')
+
+             mlflow.pyfunc.log_model(
+                 artifact_path=artifact_path,
+                 python_model=_FeatureEngineeringModelWrapper(model),
+                 # data_path=data_path,
+                 artifacts={"feature_store": data_path},
+                 code_path=None,
+                 conda_env=conda_env,
+                 signature=signature,
+                 input_example=input_example,
+                 registered_model_name=registered_model_name
+             )
+
+             # mlflow.pyfunc.log_model(
+             #     artifact_path=artifact_path,
+             #     loader_module=constants.MLFLOW_MODEL_NAME,
+             #     data_path=data_path,
+             #     conda_env=conda_env,
+             #     signature=signature,
+             #     input_example=input_example,
+             # )
+
+             # Register the model
+             # if registered_model_name is not None:
+             #     run_id = mlflow.tracking.fluent.active_run().info.run_id
+             #     if model_registry_uri is not None:
+             #         mlflow.set_registry_uri(model_registry_uri)
+             #
+             #     mlflow.register_model(
+             #         f"runs:/{run_id}/{artifact_path}",
+             #         registered_model_name,
+             #         await_registration_for=await_registration_for,
+             #     )
+             #
+             #     print(f"Model registered successfully: {registered_model_name}")
+
+             # # Verify that the model was registered
+             # from mlflow.tracking import MlflowClient
+             # client = MlflowClient()
+             # model_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]
+             # print(f"Registered model version: {model_version.version}")
+
+     def score_batch(
+         self,
+         model_uri: Optional[str],
+         df: DataFrame,
+         result_type: str,
+         env_manager: Optional[str] = None,
+         local_uri: Optional[str] = None,
+         params: Optional[dict[str, Any]] = None,
+         timestamp_key: Optional[str] = None,
+         **kwargs,
+     ) -> DataFrame:
+         # TODO: ML - to be confirmed whether this is needed
+         # req_context = RequestContext(request_context.SCORE_BATCH, client_name)
+
+         # Validate the input DataFrame
+         validation_utils.check_dataframe_type(df)
+         if (model_uri is None) == (local_uri is None):
+             raise ValueError(
+                 "Either 'model_uri' or 'local_uri' must be provided, but not both."
+             )
+         if df.isStreaming:
+             raise ValueError("Streaming DataFrames are not supported.")
+
+         # The result contains a new "prediction" column holding the predictions; the input data must not already use this name
+         if PREDICTION_COLUMN_NAME in df.columns:
+             raise ValueError(
+                 "FeatureStoreClient.score_batch returns a DataFrame with a new column "
+                 f'"{PREDICTION_COLUMN_NAME}". df already has a column with name '
+                 f'"{PREDICTION_COLUMN_NAME}".'
+             )
+
+         # Check for duplicate column names
+         validation_utils.validate_strings_unique(
+             df.columns,
+             "The provided DataFrame for scoring must have unique column names. Found duplicates {}.",
+         )
+         artifact_path = os.path.join("artifacts", MODEL_DATA_PATH_ROOT)
+         with TempDir() as tmp_location:
+             local_path = (
+                 local_uri
+                 if local_uri
+                 else common_utils.download_model_artifacts(model_uri, tmp_location.path())
+             )
+             model_data_path = os.path.join(local_path, artifact_path)
+
+             # Augment local workspace metastore tables from 2L to 3L,
+             # this will prevent us from erroneously reading data from other catalogs
+             feature_spec = uc_utils.get_feature_spec_with_full_table_names(
+                 FeatureSpec.load(model_data_path)
+             )
+
+             raw_model_path = os.path.join(
+                 model_data_path, constants.RAW_MODEL_FOLDER
+             )
+             print(f"raw_model_path: {raw_model_path}")
+             # Build the predict UDF
+             predict_udf = self._spark_client.get_predict_udf(
+                 raw_model_path,
+                 result_type=result_type,
+                 env_manager=env_manager,
+                 params=params,
+                 prebuilt_env_uri=kwargs.get(_PREBUILT_ENV_URI, None))
+             # TODO (ML-17260) Consider reading the timestamp from the backend instead of feature store artifacts
+             ml_model = Model.load(
+                 os.path.join(local_path, constants.ML_MODEL)
+             )
+
+             # Validate that columns needed for joining feature tables exist and are not duplicates.
+             feature_input_keys = []
+             for fci in feature_spec.feature_column_infos:
+                 feature_input_keys.extend([k for k in fci.lookup_key])
+
+             on_demand_input_names = uc_utils.get_unique_list_order(
+                 [
+                     input_name
+                     for odci in feature_spec.on_demand_column_infos
+                     for input_name in odci.input_bindings.values()
+                 ]
+             )
+             intermediate_inputs = set(feature_input_keys + on_demand_input_names)
+             source_data_names = [
+                 sdci.name for sdci in feature_spec.source_data_column_infos
+             ]
+
+             feature_output_names = [
+                 fci.output_name for fci in feature_spec.feature_column_infos
+             ]
+             on_demand_output_names = [
+                 odci.output_name for odci in feature_spec.on_demand_column_infos
+             ]
+             all_output_names = set(
+                 source_data_names + feature_output_names + on_demand_output_names
+             )
+             required_cols = intermediate_inputs.difference(all_output_names)
+             required_cols.update(source_data_names)
+
+             missing_required_columns = [
+                 col for col in required_cols if col not in df.columns
+             ]
+             if missing_required_columns:
+                 missing_columns_formatted = ", ".join(
+                     [f"'{s}'" for s in missing_required_columns]
+                 )
+                 raise ValueError(
+                     f"DataFrame is missing required columns {missing_columns_formatted}."
+                 )
+
+             table_names = {fci.table_name for fci in feature_spec.feature_column_infos}
+             feature_table_features_map = training_set_utils.get_features_for_tables(
+                 self._spark_client, table_names=table_names
+             )
+             feature_table_metadata_map = (
+                 training_set_utils.get_feature_table_metadata_for_tables(
+                     self._spark_client,
+                     table_names=table_names,
+                 )
+             )
+             feature_table_data_map = training_set_utils.load_feature_data_for_tables(
+                 self._spark_client, table_names=table_names
+             )
+             training_set_utils.validate_feature_column_infos_data(
+                 self._spark_client,
+                 feature_spec.feature_column_infos,
+                 feature_table_features_map,
+                 feature_table_data_map,
+             )
+
+             uc_function_infos = training_set_utils.get_uc_function_infos(
+                 self._spark_client,
+                 {odci.udf_name for odci in feature_spec.on_demand_column_infos},
+             )
+
+             # Required source data and feature lookup keys have been validated to exist in `df`.
+             # No additional validation is required before resolving FeatureLookups and applying FeatureFunctions.
+             training_set_utils.warn_if_non_photon_for_native_spark(
+                 kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
+             )
+
+             augmented_df = TrainingSet(
+                 feature_spec=feature_spec,
+                 df=df,
+                 labels=[],
+                 feature_table_metadata_map=feature_table_metadata_map,
+                 feature_table_data_map=feature_table_data_map,
+                 uc_function_infos=uc_function_infos,
+                 use_spark_native_join=kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
+             )._augment_df()
+             # Only included FeatureSpec columns should be part of UDF inputs for scoring.
+             # Note: extra `df` columns not in FeatureSpec should be preserved.
+
+             udf_input_columns = [
+                 ci.output_name for ci in feature_spec.column_infos if ci.include
+             ]
+             print(f"udf_input_columns:{udf_input_columns}")
+             # Apply predictions.
+             df_with_predictions = augmented_df.withColumn(
+                 PREDICTION_COLUMN_NAME, predict_udf(struct(*udf_input_columns))
+             )
+             # Reorder `df_with_predictions` to include:
+             # 1. Preserved `df` columns, in `df` column order.
+             # 2. Computed model input columns, in `FeatureSpec` column order.
+             # 3. Prediction column.
+             output_column_order = (
+                 df.columns
+                 + [col for col in udf_input_columns if col not in df.columns]
+                 + [PREDICTION_COLUMN_NAME]
+             )
+             return_df = df_with_predictions.select(output_column_order)
+             return return_df
+
+     def _warn_if_tables_mismatched_for_model(
+         self,
+         feature_spec: FeatureSpec,
+         feature_table_metadata_map: Dict[str, FeatureTable],
+         model_creation_timestamp_ms: float,
+     ):
+         """
+         Helper method to warn if feature tables were deleted and recreated after a model was logged.
+         For newer FeatureSpec versions >=3, we can compare the FeatureSpec and current table ids.
+         Otherwise, we compare the model and table creation timestamps.
+         """
+         # 1. Compare feature table ids
+         # Check for feature_spec logged with client versions that support table_infos
+         if len(feature_spec.table_infos) > 0:
+             # When feature_spec.yaml is parsed, FeatureSpec.load will assert
+             # that the listed table names in input_tables match table names in input_columns.
+             # The following code assumes this as invariant and only checks for the table IDs.
+             mismatched_tables = []
+             for table_info in feature_spec.table_infos:
+                 feature_table = feature_table_metadata_map[table_info.table_name]
+                 if feature_table:
+                     mismatched_tables.append(table_info.table_name)
+             if len(mismatched_tables) > 0:
+                 plural = len(mismatched_tables) > 1
+                 _logger.warning(
+                     f"Feature table{'s' if plural else ''} {', '.join(mismatched_tables)} "
+                     f"{'were' if plural else 'was'} deleted and recreated after "
+                     f"the model was trained. Model performance may be affected if the features "
+                     f"used in scoring have drifted from the features used in training."
+                 )
+
+         # 2. Creation timestamps are not available, so this check is skipped
+         # feature_tables_created_after_model = []
+         # for name, metadata in feature_table_metadata_map.items():
+         #     if model_creation_timestamp_ms < metadata.creation_timestamp:
+         #         feature_tables_created_after_model.append(name)
+         #
+         # if len(feature_tables_created_after_model) > 0:
+         #     plural = len(feature_tables_created_after_model) > 1
+         #     message = (
+         #         f"Feature table{'s' if plural else ''} {', '.join(feature_tables_created_after_model)} "
+         #         f"{'were' if plural else 'was'} created after the model was logged. "
+         #         f"Model performance may be affected if the features used in scoring have drifted "
+         #         f"from the features used in training."
+         #     )
+         #     _logger.warning(message)
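
For orientation, here is a hedged usage sketch of the MLTrainingClient added above. The class path, method names, and keyword arguments come from this diff; the SparkClient constructor, the FeatureLookup arguments, and all table/column names are assumptions for illustration only (the package also adds a higher-level client in wedata/feature_engineering/client.py, not shown here):

    import mlflow.sklearn
    from sklearn.linear_model import LogisticRegression

    from wedata.common.spark_client import SparkClient
    from wedata.common.entities.feature_lookup import FeatureLookup
    from wedata.feature_engineering.ml_training_client.ml_training_client import MLTrainingClient

    spark_client = SparkClient(spark)        # assumed constructor; wraps an active SparkSession
    client = MLTrainingClient(spark_client)

    # Join stored features onto a raw DataFrame (table and column names are illustrative)
    training_set = client.create_training_set_from_feature_lookups(
        df=raw_df,
        feature_lookups=[FeatureLookup(table_name="db.user_features", lookup_key="user_id")],
        label="label",
        exclude_columns=[],
    )

    pdf = training_set.load_df().toPandas()
    model = LogisticRegression().fit(pdf.drop(columns=["label"]), pdf["label"])

    # Log the model together with its feature spec so score_batch can rebuild the features
    client.log_model(
        model,
        "model",
        flavor=mlflow.sklearn,
        training_set=training_set,
        registered_model_name=None,
        model_registry_uri=None,
        await_registration_for=0,
        infer_input_example=False,
    )

    # Batch scoring: joins the features, applies the predict UDF, and appends a "prediction" column
    scored_df = client.score_batch(
        model_uri="runs:/<run_id>/model",    # illustrative URI
        df=raw_df,
        result_type="double",
    )
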
wedata/feature_engineering/mlflow_model.py (new file, +9 lines)
@@ -0,0 +1,9 @@
+ import mlflow
+
+ class _FeatureEngineeringModelWrapper(mlflow.pyfunc.PythonModel):
+     def __init__(self, model):
+         self.model = model
+
+     def predict(self, context, model_input):
+         return self.model.predict(model_input)
+
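
_FeatureEngineeringModelWrapper is the object that log_model hands to mlflow.pyfunc.log_model as python_model: when the logged model is later loaded, MLflow calls predict(context, model_input) on the wrapper, which simply delegates to the wrapped model's own predict. A minimal sketch of loading such a model directly (the run URI and input frame are illustrative; loading this way does not re-run the feature joins, which is what score_batch is for):

    import mlflow.pyfunc

    # MLflow reconstructs the _FeatureEngineeringModelWrapper behind the pyfunc interface
    loaded = mlflow.pyfunc.load_model("runs:/<run_id>/model")

    # predict() delegates to the wrapped model, so the input must already contain
    # the model's feature columns (score_batch performs the feature joins for you)
    predictions = loaded.predict(features_pdf)
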