tencent-wedata-feature-engineering-dev 0.1.50__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (38)
  1. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
  2. tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
  4. wedata/feature_store/client.py +28 -92
  5. wedata/feature_store/constants/constants.py +2 -5
  6. wedata/feature_store/entities/feature_lookup.py +0 -17
  7. wedata/feature_store/entities/feature_spec.py +2 -2
  8. wedata/feature_store/entities/feature_table.py +1 -5
  9. wedata/feature_store/entities/function_info.py +4 -1
  10. wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
  11. wedata/feature_store/spark_client/spark_client.py +15 -41
  12. wedata/feature_store/training_set_client/training_set_client.py +10 -9
  13. wedata/feature_store/utils/common_utils.py +4 -48
  14. wedata/feature_store/utils/feature_lookup_utils.py +43 -37
  15. wedata/feature_store/utils/feature_spec_utils.py +1 -1
  16. wedata/feature_store/utils/uc_utils.py +1 -1
  17. tencent_wedata_feature_engineering_dev-0.1.50.dist-info/RECORD +0 -66
  18. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  19. wedata/feature_store/cloud_sdk_client/client.py +0 -108
  20. wedata/feature_store/cloud_sdk_client/models.py +0 -686
  21. wedata/feature_store/cloud_sdk_client/utils.py +0 -39
  22. wedata/feature_store/common/log/__init__.py +0 -0
  23. wedata/feature_store/common/log/logger.py +0 -40
  24. wedata/feature_store/common/store_config/__init__.py +0 -0
  25. wedata/feature_store/common/store_config/redis.py +0 -48
  26. wedata/feature_store/constants/engine_types.py +0 -34
  27. wedata/feature_store/feast_client/__init__.py +0 -0
  28. wedata/feature_store/feast_client/feast_client.py +0 -487
  29. wedata/feature_store/utils/env_utils.py +0 -108
  30. wedata/tempo/__init__.py +0 -0
  31. wedata/tempo/interpol.py +0 -448
  32. wedata/tempo/intervals.py +0 -1331
  33. wedata/tempo/io.py +0 -61
  34. wedata/tempo/ml.py +0 -129
  35. wedata/tempo/resample.py +0 -318
  36. wedata/tempo/tsdf.py +0 -1720
  37. wedata/tempo/utils.py +0 -254
  38. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
@@ -3,79 +3,26 @@
 """
 import json
 from typing import Union, List, Dict, Optional, Sequence, Any
-
-import tencentcloud.common.exception
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.streaming import StreamingQuery
 from pyspark.sql.types import StructType
 import os
-import datetime
-from wedata.feature_store.constants.constants import (
-    APPEND, DEFAULT_WRITE_STREAM_TRIGGER, FEATURE_TABLE_KEY,
-    FEATURE_TABLE_VALUE, FEATURE_TABLE_PROJECT, FEATURE_TABLE_TIMESTAMP,
-    FEATURE_TABLE_BACKUP_PRIMARY_KEY, FEATURE_DLC_TABLE_PRIMARY_KEY)
-from wedata.feature_store.constants.engine_types import EngineTypes
-from wedata.feature_store.common.store_config.redis import RedisStoreConfig
+
+from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER, FEATURE_TABLE_KEY, \
+    FEATURE_TABLE_VALUE, FEATURE_TABLE_PROJECT
 from wedata.feature_store.entities.feature_table import FeatureTable
 from wedata.feature_store.spark_client.spark_client import SparkClient
-from wedata.feature_store.utils import common_utils, env_utils
-from wedata.feature_store.feast_client.feast_client import FeastClient
-from wedata.feature_store.cloud_sdk_client.models import (
-    TaskSchedulerConfiguration, OnlineFeatureConfiguration, OfflineFeatureConfiguration,
-    CreateOnlineFeatureTableRequest, DescribeNormalSchedulerExecutorGroupsRequest, RefreshFeatureTableRequest)
-from wedata.feature_store.cloud_sdk_client.client import FeatureCloudSDK
+from wedata.feature_store.utils import common_utils


 class FeatureTableClient:
     """Feature table operations class"""

     def __init__(
-            self,
-            spark: SparkSession,
-            cloud_secret_id: str = None,
-            cloud_secret_key: str = None,
+            self,
+            spark: SparkSession
     ):
         self._spark = spark
-        self._feast_client = FeastClient(spark)
-        if cloud_secret_id and cloud_secret_key:
-            self.__cloud_secret_id = cloud_secret_id
-            self.__cloud_secret_key = cloud_secret_key
-        else:
-            self.__cloud_secret_id, self.__cloud_secret_key = env_utils.get_cloud_secret()
-        self.__project = env_utils.get_project_id()
-        self.__region = env_utils.get_region()
-
-    @property
-    def cloud_secret_id(self) -> str:
-        if not self.__cloud_secret_id:
-            raise ValueError("cloud_secret_id is empty. please set it first.")
-        return self.__cloud_secret_id
-
-    @cloud_secret_id.setter
-    def cloud_secret_id(self, cloud_secret_id: str):
-        if not cloud_secret_id:
-            raise ValueError("cloud_secret_id cannot be None")
-        self.__cloud_secret_id = cloud_secret_id
-
-    @property
-    def cloud_secret_key(self) -> str:
-        if not self.__cloud_secret_key:
-            raise ValueError("cloud_secret_key is empty. please set it first.")
-        return self.__cloud_secret_key
-
-    @cloud_secret_key.setter
-    def cloud_secret_key(self, cloud_secret_key: str):
-        if not cloud_secret_key:
-            raise ValueError("cloud_secret_key cannot be None")
-        self.__cloud_secret_key = cloud_secret_key
-
-    @property
-    def project(self) -> str:
-        return self.__project
-
-    @property
-    def region(self) -> str:
-        return self.__region

     @staticmethod
     def _normalize_params(
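The 0.2.0 constructor drops the cloud-credential plumbing entirely, so constructing the client reduces to passing a SparkSession. A minimal sketch (the import path follows the file list above; the SparkSession setup is the only assumption):

    from pyspark.sql import SparkSession
    from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient

    spark = SparkSession.builder.getOrCreate()
    client = FeatureTableClient(spark=spark)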
@@ -105,39 +52,26 @@ class FeatureTableClient:
         )

     @staticmethod
-    def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: str):
+    def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: List[str]):
         """Validate that primary keys and timestamp keys do not conflict"""
-        if timestamp_keys in primary_keys:
-            raise ValueError(f"Timestamp keys conflict with primary keys: {timestamp_keys}")
-
-    @staticmethod
-    def _validate_key_exists(primary_keys: List[str], timestamp_keys: str):
-        """Validate that primary keys and timestamp keys are present"""
-        if not primary_keys:
-            raise ValueError("Primary keys cannot be empty")
-        if not timestamp_keys:
-            raise ValueError("Timestamp keys cannot be empty")
+        conflict_keys = set(timestamp_keys) & set(primary_keys)
+        if conflict_keys:
+            raise ValueError(f"Timestamp keys conflict with primary keys: {conflict_keys}")

     @staticmethod
     def _escape_sql_value(value: str) -> str:
         """Escape special characters in SQL values"""
         return value.replace("'", "''")

-    @staticmethod
-    def _check_sequence_element_type(sequence: Sequence[Any], element_type: type) -> bool:
-        """Check whether every element in a sequence is of the given type"""
-        return all(isinstance(element, element_type) for element in sequence)
-
     def create_table(
             self,
             name: str,
             primary_keys: Union[str, List[str]],
-            timestamp_key: str,
-            engine_type: EngineTypes,
-            data_source_name: str,
             database_name: Optional[str] = None,
+            location: Optional[str] = None,
             df: Optional[DataFrame] = None,
             *,
+            timestamp_keys: Union[str, List[str], None] = None,
             partition_columns: Union[str, List[str], None] = None,
             schema: Optional[StructType] = None,
             description: Optional[str] = None,
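The list-based rewrite of _validate_key_conflicts also catches overlaps when several timestamp keys are supplied. A standalone sketch of the same set-intersection logic, with hypothetical key names:

    primary_keys = ["user_id", "event_date"]
    timestamp_keys = ["event_date"]

    conflict_keys = set(timestamp_keys) & set(primary_keys)
    if conflict_keys:
        raise ValueError(f"Timestamp keys conflict with primary keys: {conflict_keys}")
    # ValueError: Timestamp keys conflict with primary keys: {'event_date'}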
@@ -151,14 +85,14 @@
             name: full feature table name (format: <table>)
             primary_keys: primary key column names (composite keys supported)
             database_name: Optional[str] = None,
-            data_source_name: data source name,
+            location: Optional[str] = None,
             df: initial data (optional, used to infer the schema)
-            timestamp_key: timestamp key (for temporal features)
-            engine_type: engine type  version:: 1.33
+            timestamp_keys: timestamp keys (for temporal features)
             partition_columns: partition columns (to optimize storage and queries)
             schema: table schema definition (optional; required when df is not provided)
             description: business description
             tags: business tags
+
         Returns:
             FeatureTable instance

@@ -168,33 +102,24 @@

         # Normalize parameters
         primary_keys = self._normalize_params(primary_keys)
+        timestamp_keys = self._normalize_params(timestamp_keys)
         partition_columns = self._normalize_params(partition_columns)

-        assert self._check_sequence_element_type(primary_keys, str), "primary_keys must be a list of strings"
-        assert self._check_sequence_element_type(partition_columns, str), "partition_columns must be a list of strings"
-        assert isinstance(timestamp_key, str), "timestamp key must be string"
-
         # Validate metadata
         self._validate_schema(df, schema)
-        self._validate_key_exists(primary_keys, timestamp_key)
-        self._validate_key_conflicts(primary_keys, timestamp_key)
+        self._validate_key_conflicts(primary_keys, timestamp_keys)

         # Validate the table name
         common_utils.validate_table_name(name)

         common_utils.validate_database(database_name)

-        # Check primary keys for duplicates
-        dup_list = common_utils.get_duplicates(primary_keys)
-        if dup_list:
-            raise ValueError(f"Primary keys have duplicates: {dup_list}")
-
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

         # Check whether the table exists
         try:
-            if self._check_table_exists(table_name):
+            if self._spark.catalog.tableExists(table_name):
                 raise ValueError(
                     f"Table '{name}' already exists\n"
                     "Solutions:\n"
@@ -204,20 +129,12 @@
         except Exception as e:
             raise ValueError(f"Error checking table existence: {str(e)}") from e

-        try:
-            self._sync_table_info(table_name=name, action_name="create",
-                                  database_name=env_utils.get_database_name(database_name),
-                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
-                                  is_try=True)
-        except tencentcloud.common.exception.TencentCloudSDKException as e:
-            raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
         # Infer the table schema
         table_schema = schema or df.schema

         # Build timestamp key properties

-        # Fetch extra tags from environment variables
+        #Fetch extra tags from environment variables
         env_tags = {
             "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
             "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
@@ -227,42 +144,13 @@
         # Build table properties (via TBLPROPERTIES)
         tbl_properties = {
             "wedata.feature_table": "true",
-            FEATURE_TABLE_BACKUP_PRIMARY_KEY: ",".join(primary_keys),
+            "primaryKeys": ",".join(primary_keys),
             "wedata.feature_project_id": f"{json.dumps([projectId])}",
-            FEATURE_TABLE_TIMESTAMP: timestamp_key,
+            "timestampKeys": ",".join(timestamp_keys) if timestamp_keys else "",
             "comment": description or "",
             **{f"{k}": v for k, v in (tags or {}).items()},
             **{f"feature_{k}": v for k, v in (env_tags or {}).items()}
         }
-        if engine_type == EngineTypes.ICEBERG_ENGINE:
-            if partition_columns:
-                tbl_properties.update({
-                    'format-version': '2',
-                    'write.upsert.enabled': 'true',
-                    'write.update.mode': 'merge-on-read',
-                    'write.merge.mode': 'merge-on-read',
-                    'write.parquet.bloom-filter-enabled.column.id': 'true',
-                    'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
-                    'write.distribution-mode': 'hash',
-                    'write.metadata.delete-after-commit.enabled': 'true',
-                    'write.metadata.previous-versions-max': '100',
-                    'write.metadata.metrics.default': 'full',
-                    'smart-optimizer.inherit': 'default',
-                })
-            else:
-                tbl_properties.update({
-                    'format-version': '2',
-                    'write.upsert.enabled': 'true',
-                    'write.update.mode': 'merge-on-read',
-                    'write.merge.mode': 'merge-on-read',
-                    'write.parquet.bloom-filter-enabled.column.id': 'true',
-                    'dlc.ao.data.govern.sorted.keys': ",".join(primary_keys),
-                    'write.distribution-mode': 'hash',
-                    'write.metadata.delete-after-commit.enabled': 'true',
-                    'write.metadata.previous-versions-max': '100',
-                    'write.metadata.metrics.default': 'full',
-                    'smart-optimizer.inherit': 'default',
-                })

         # Build column definitions
         columns_ddl = []
@@ -284,8 +172,7 @@
         )
         # Local debugging: iceberg -> PARQUET
         # Core CREATE TABLE statement
-        if engine_type == EngineTypes.ICEBERG_ENGINE:
-            ddl = f"""
+        ddl = f"""
             CREATE TABLE {table_name} (
                 {', '.join(columns_ddl)}
             )
@@ -294,20 +181,7 @@
             TBLPROPERTIES (
                 {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
             )
-            """
-        elif engine_type == EngineTypes.HIVE_ENGINE:
-            ddl = f"""
-            CREATE TABLE {table_name} (
-                {', '.join(columns_ddl)}
-            )
-            {partition_expr}
-            -- STORED AS PARQUET
-            TBLPROPERTIES (
-                {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
-            )
-            """
-        else:
-            raise ValueError(f"Engine type {engine_type} is not supported")
+            """

         # Print the SQL
         print(f"create table ddl: {ddl}\n")
@@ -320,28 +194,8 @@
         except Exception as e:
             raise ValueError(f"Failed to create table: {str(e)}") from e

-        print("async table info to feast")
-
-        self._feast_client.create_table(
-            table_name=table_name,
-            primary_keys=primary_keys,
-            timestamp_key=timestamp_key,
-            df=df,
-            schema=table_schema,
-            tags=tags,
-            description=description
-        )
-
         print(f"create table {name} done")

-        try:
-            self._sync_table_info(table_name=name, action_name="create",
-                                  database_name=env_utils.get_database_name(database_name),
-                                  data_source_name=data_source_name, engine_name=env_utils.get_engine_name(),
-                                  is_try=False)
-        except tencentcloud.common.exception.TencentCloudSDKException as e:
-            raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
         # Build and return the FeatureTable object
         return FeatureTable(
             name=name,
@@ -350,7 +204,7 @@
             primary_keys=primary_keys,
             partition_columns=partition_columns or [],
             features=[field.name for field in table_schema.fields],
-            timestamp_keys=timestamp_key or [],
+            timestamp_keys=timestamp_keys or [],
             tags=dict(**tags or {}, **env_tags)
         )

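Taken together, create_table in 0.2.0 loses engine_type and data_source_name and makes timestamp_keys keyword-only and optional. A hedged sketch of a call under the new signature (table, database, and column names are hypothetical):

    from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType

    schema = StructType([
        StructField("user_id", StringType(), False),
        StructField("event_time", TimestampType(), False),
        StructField("clicks_7d", DoubleType(), True),
    ])

    table = client.create_table(
        name="user_click_features",    # hypothetical table name
        primary_keys="user_id",
        database_name="feature_db",    # hypothetical database
        schema=schema,                 # or pass df= to infer the schema instead
        timestamp_keys="event_time",   # keyword-only in 0.2.0
        partition_columns="event_time",
        description="7-day click aggregates",
    )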
@@ -395,10 +249,6 @@
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

-        # Check whether the table exists
-        if not self._check_table_exists(table_name):
-            raise ValueError(f"table '{name}' not exists")
-
         # Determine whether this is a streaming DataFrame
         is_streaming = df.isStreaming

@@ -411,8 +261,7 @@
             writer = df.writeStream \
                 .format("parquet") \
                 .outputMode(mode) \
-                .option("checkpointLocation", checkpoint_location) \
-                # .foreachBatch(process_batch)
+                .option("checkpointLocation", checkpoint_location)

             if trigger:
                 writer = writer.trigger(**trigger)
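The surrounding write method's def line falls outside the hunks shown, so its exact name is an assumption here. Assuming it is the client's write path with the mode, checkpoint_location, and trigger parameters used above, a streaming call might look like:

    # Hypothetical call; the trigger dict is unpacked into writer.trigger(**trigger).
    query = client.write_table(
        name="user_click_features",
        df=streaming_df,  # a streaming DataFrame
        mode="append",
        checkpoint_location="/tmp/ckpt/user_click_features",  # hypothetical path
        trigger={"processingTime": "5 minutes"},
    )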
@@ -423,23 +272,18 @@
             df.write \
                 .mode(mode) \
                 .insertInto(table_name)
-            # self._feast_client.client.write_to_offline_store(feature_view_name=table_name, df=df.toPandas(), allow_registry_cache=False,)
             return None

         except Exception as e:
-            raise
-            # raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e
+            raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e
+

-    def register_table(self, name, database_name, timestamp_key: str, engine_type: EngineTypes, data_source_name: str,
-                       primary_keys: Union[str, List[str]]):
+    def register_table(self, name, database_name):
         """Register an existing table as a feature table
         Args:
             name: table name (format: <table>)
             database_name: feature database name
-            data_source_name: data source name
-            engine_type: engine type
-            timestamp_key: timestamp key
-            primary_keys: primary keys
+
         Raises:
             ValueError: raised when the table does not exist or parameters are invalid
             RuntimeError: raised when the modification fails
@@ -453,49 +297,23 @@
         common_utils.validate_table_name(name)
         common_utils.validate_database(database_name)

-        if primary_keys:
-            assert self._check_sequence_element_type(primary_keys, str), "primary_keys must be a list of strings"
-        assert isinstance(timestamp_key, str), "timestamp key must be string"
-
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"table '{name}' not exists")
             tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}")
             props = {row['key']: row['value'] for row in tbl_pro.collect()}
-
-            # Check that the primary key and timestamp key are not empty
-            if engine_type == engine_type.ICEBERG_ENGINE and props.get("format-version", "") == "2":
-                if not primary_keys:
-                    if props.get('dlc.ao.data.govern.sorted.keys', "") == "":
-                        raise ValueError(
-                            "table dlc.ao.data.govern.sorted.keys is empty. you must set dlc.ao.data.govern.sorted.keys")
-                    else:
-                        primary_keys = props.get('dlc.ao.data.govern.sorted.keys').split(",")
-            elif engine_type == engine_type.HIVE_ENGINE:
-                if not primary_keys:
-                    raise ValueError("primary_keys cannot be None for HIVE_ENGINE")
-
-            if props.get("wedata.feature_table", "") == "true":
-                raise ValueError("table is already a feature table")
-
-            self._validate_key_conflicts(primary_keys, timestamp_key)
-            # Check primary keys for duplicates
-            dup_list = common_utils.get_duplicates(primary_keys)
-            if dup_list:
-                raise ValueError(f"primary_keys contains duplicates: {dup_list}")
-
             s = props.get(FEATURE_TABLE_PROJECT, "")
             if not s:  # if s is an empty string
                 projectIds = []
             else:
                 projectIds = json.loads(s)
             current_project_id = os.getenv("WEDATA_PROJECT_ID")
-            # Check membership (only when projectIds is non-empty)
-            if current_project_id not in projectIds and len(projectIds):
+            # Check membership
+            if current_project_id not in projectIds:
                 register_table_project_ids = props.get(FEATURE_TABLE_PROJECT)
             else:
                 projectIds.append(current_project_id)
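register_table now takes only the table and database names; primary keys, timestamp key, engine type, and data source are no longer passed. A minimal sketch (names hypothetical):

    client.register_table(name="user_click_features", database_name="feature_db")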
@@ -503,62 +321,31 @@
             tbl_properties = {
                 FEATURE_TABLE_KEY: FEATURE_TABLE_VALUE,
                 FEATURE_TABLE_PROJECT: register_table_project_ids,
-                FEATURE_TABLE_TIMESTAMP: timestamp_key,
-                FEATURE_TABLE_BACKUP_PRIMARY_KEY: ",".join(primary_keys),
-            }
-
-            env_tags = {
-                "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
-                "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
-                "user_uin": os.getenv("KERNEL_LOGIN_UIN", "")  # wedata user UIN
             }
-            for key, val in env_tags.items():
-                if not props.get(f"feature_{key}", ""):
-                    tbl_properties[f"feature_{key}"] = val

             # Build the property-setting clause
             props_str = ", ".join(
                 f"'{k}'='{self._escape_sql_value(v)}'"
-                for k, v in tbl_properties.items()
+                for k, v in tbl_properties
             )

-
             alter_sql = f"ALTER TABLE {table_name} SET TBLPROPERTIES ({props_str})"

-            try:
-                self._sync_table_info(table_name=name, action_name="create",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=True)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"Table '{name}' is can't create. {str(e)}")
-
             # Apply the change
-            print("alter table sql", alter_sql)
             self._spark.sql(alter_sql)
-            print("Execute sql done, start sync table info to feast")
-            self._feast_client.alter_table(full_table_name=table_name, primary_keys=primary_keys,
-                                           timestamp_key=timestamp_key)
-            print(f"Successfully register table '{table_name}'")
-
-            try:
-                self._sync_table_info(table_name=name, action_name="create",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name=data_source_name, engine_name=env_utils.get_engine_name(), is_try=False)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"sync table info failed. you need to sync table info manually. {str(e)}")
+            print(f"Successfully register table '{name}'")
+
         except ValueError as e:
             raise  # Re-raise known ValueError directly
         except Exception as e:
-            raise RuntimeError(f"Failed to modify properties for table '{table_name}': {str(e)}") from e
+            raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e
+


     def read_table(
-            self,
-            name: str,
-            database_name: Optional[str] = None,
-            is_online: bool = False,
-            online_config: Optional[RedisStoreConfig] = None,
-            entity_row: Optional[List[Dict[str, Any]]] = None
-    ) -> DataFrame:
+            self,
+            name: str,
+            database_name: Optional[str] = None,
+    ) -> DataFrame:

         """
         Read data from a feature table
@@ -566,9 +353,6 @@
         Args:
             name: feature table name (format: <table>)
             database_name: feature database name
-            is_online: whether to read the online table
-            online_config: online store configuration
-            entity_row: entity rows (used to filter online data; only effective when is_online is True)
         Returns:
             DataFrame containing the table data

@@ -581,23 +365,20 @@

         common_utils.validate_database(database_name)

+
         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)

         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"Table '{name}' does not exist")

-            if is_online:
-                return self._read_online_table(
-                    table_name=name, database_name=database_name,
-                    online_config=online_config, entity_row=entity_row)
             # Read the table data
             return self._spark.read.table(table_name)

         except Exception as e:
-            raise
+            raise ValueError(f"Failed to read table '{name}': {str(e)}") from e

     def drop_table(self, name: str, database_name: Optional[str] = None) -> None:

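With the online path gone, read_table reduces to a plain Spark table read; for example (names hypothetical):

    df = client.read_table(name="user_click_features", database_name="feature_db")
    df.show(5)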
@@ -607,6 +388,7 @@
         Args:
             name: feature table name (format: <table>)
             database_name: feature database name
+
         Raises:
             ValueError: raised when the table does not exist
             RuntimeError: raised when the deletion fails
@@ -621,118 +403,22 @@

         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
+
         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 print(f"Table '{name}' does not exist")
                 return

-            try:
-                feature_view = self._feast_client.get_feature_view(table_name)
-            except Exception as e:
-                print(f"Table '{name}' is not a feature table, skip delete. {str(e)}")
-            else:
-                if feature_view.online:
-                    raise ValueError(f"Table '{name}' has a online table, please call drop_online_table first")
-            try:
-                self._sync_table_info(table_name=name, action_name="delete",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name="", engine_name=env_utils.get_engine_name(), is_try=True)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                raise RuntimeError(f"Table '{name}' is can't delete. {str(e)}")
-
             # Perform the deletion
             self._spark.sql(f"DROP TABLE {table_name}")
             print(f"Table '{name}' dropped")
-            try:
-                self._feast_client.remove_offline_table(table_name=table_name)
-            except Exception as e:
-                raise
-                # raise ValueError(f"Failed to delete table '{name}' in feast: {str(e)}")
-            else:
-                print(f"Table '{name}' removed from feast")
-
-            try:
-                self._sync_table_info(table_name=name, action_name="delete",
-                                      database_name=env_utils.get_database_name(database_name),
-                                      data_source_name="", engine_name=env_utils.get_engine_name(), is_try=False)
-            except tencentcloud.common.exception.TencentCloudSDKException as e:
-                print(f"Failed to delete table information on the web interface. You need to delete it manually. Error: {str(e)}")
+
         except ValueError as e:
             raise  # Re-raise known ValueError directly
         except Exception as e:
             raise RuntimeError(f"Failed to delete table '{name}': {str(e)}") from e

-    def _sync_table_info(self, table_name: str, action_name: str, database_name: str,
-                         data_source_name: str, engine_name: str, is_try: bool):
-        return _refresh_table(project_id=self.project, secret_id=self.cloud_secret_id, secret_key=self.cloud_secret_key,
-                              region=self.region, table_name=table_name,
-                              action=action_name, database_name=database_name, data_source_name=data_source_name,
-                              engine_name=engine_name, is_try=is_try, data_source_type=env_utils.get_engine_type())
-
-    def _read_online_table(self,
-                           table_name: str, database_name: str, online_config: RedisStoreConfig,
-                           entity_row: List[Dict[str, Any]] = None):
-        full_table_name = common_utils.build_full_table_name(table_name, database_name)
-        primary_keys, timestamp_key = self._get_table_primary_keys_and_timestamp_key(full_table_name)
-        entity_row_dict = {}
-        if isinstance(entity_row, list):
-            for row in entity_row:
-                if not isinstance(row, dict):
-                    raise ValueError("Entity_row row must be a dictionary")
-                for key in row.keys():
-                    if key not in primary_keys:
-                        raise ValueError(f"Entity_row row key '{key}' is not a primary key")
-                    entity_row_dict[key] = key
-        elif isinstance(entity_row, dict):
-            for key in entity_row.keys():
-                if key not in primary_keys:
-                    raise ValueError(f"Entity_row row key '{key}' is not a primary key")
-            entity_row_dict = entity_row
-        else:
-            raise ValueError(f"Entity_row must be a list of dictionaries or a single dictionary. {type(entity_row)}")
-
-        tmp_schema = self._spark.table(tableName=full_table_name).schema
-        columns_name_list = []
-        tmp_schema_list = []
-        for field in tmp_schema.fields:
-            if field.name in primary_keys or field.name == timestamp_key:
-                if entity_row_dict.get(field.name):
-                    tmp_schema_list.append(field)
-                continue
-            columns_name_list.append(field.name)
-            tmp_schema_list.append(field)
-
-        schema_name_list = [field.name for field in tmp_schema_list]
-        schema = StructType(tmp_schema_list)
-        for field in schema:
-            print(f"{field.name} => {field.dataType}")
-
-        feast_client = FeastClient(offline_store=self._spark, online_store_config=online_config)
-        # Build the entity-based filter for the offline table
-        if not entity_row:
-            tbl_props = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}")
-            props = {row['key']: row['value'] for row in tbl_props.collect()}
-            primary_key = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY)
-            query_result = self._spark.sql(f"SELECT {primary_key} FROM {table_name} LIMIT 1")
-            result_row = query_result.first()
-            if result_row:
-                online_view = feast_client.get_online_table_view(
-                    full_table_name=full_table_name,
-                    columns_name=columns_name_list,
-                    entity_rows=[result_row.asDict()])
-                print("=====>read online dataframe:\n", online_view[schema_name_list])
-                return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)
-            else:
-                return self._spark.createDataFrame([])
-        else:
-            online_view = feast_client.get_online_table_view(
-                full_table_name=full_table_name,
-                columns_name=columns_name_list,
-                entity_rows=entity_row)
-            print("=====>read online dataframe:\n", online_view[schema_name_list])
-            return self._spark.createDataFrame(online_view[schema_name_list], schema=schema, verifySchema=False)
-
     def get_table(
             self,
             name: str,
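drop_table likewise shrinks to a bare DROP TABLE without the feast and cloud bookkeeping; the call itself is unchanged (names hypothetical):

    client.drop_table(name="user_click_features", database_name="feature_db")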
@@ -740,8 +426,7 @@
             database_name: Optional[str] = None,
     ) -> FeatureTable:

-        """
-        Get feature table metadata
+        """Get feature table metadata

         Parameters:
             name: feature table name
@@ -760,13 +445,11 @@

         # Build the full table name
         table_name = common_utils.build_full_table_name(name, database_name)
-        if not self._check_table_exists(full_table_name=table_name):
-            raise ValueError(f"Table '{name}' does not exist")
+
         try:
             return spark_client.get_feature_table(table_name)
         except Exception as e:
-            raise
-            # raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e
+            raise ValueError(f"Failed to get metadata for table '{name}': {str(e)}") from e

     def alter_table_tag(
             self,
@@ -774,8 +457,7 @@
             properties: Dict[str, str],
             database_name: Optional[str] = None,
     ):
-        """
-        Modify the table's TBLPROPERTIES (update if present, add if absent)
+        """Modify the table's TBLPROPERTIES (update if present, add if absent)

         Args:
             name: table name (format: <table>)
@@ -806,7 +488,7 @@

         try:
             # Check whether the table exists
-            if not self._check_table_exists(table_name):
+            if not self._spark.catalog.tableExists(table_name):
                 raise ValueError(f"table '{name}' not exists")

             # Build the property-setting clause
@@ -819,7 +501,6 @@

             # Apply the change
             self._spark.sql(alter_sql)
-            self._feast_client.modify_tags(table_name=table_name, tags=properties)
             print(f"Successfully updated properties for table '{name}': {list(properties.keys())}")

         except ValueError as e:
827
508
  except Exception as e:
828
509
  raise RuntimeError(f"Failed to modify properties for table '{name}': {str(e)}") from e
829
510
 
830
- def publish_table(self, table_name: str, data_source_name: str, cloud_secret_id: str, cloud_secret_key: str,
831
- database_name: Optional[str] = None,
832
- is_cycle: bool = False, cycle_obj: TaskSchedulerConfiguration = None,
833
- is_use_default_online: bool = True, online_config: RedisStoreConfig = None):
834
- """
835
- 将离线特征表发布为在线特征表
836
- Args:
837
- table_name: 离线特征表名称
838
- data_source_name: 数据源名称
839
- database_name: 数据库名称
840
- is_cycle: 是否周期性发布
841
- cycle_obj: 周期性任务配置
842
- is_use_default_online: 是否使用默认的在线存储配置
843
- online_config: 在线存储配置 (仅当is_use_default_online为False时生效)
844
- """
845
- # 构建完整表名
846
- full_table_name = common_utils.build_full_table_name(table_name, database_name)
847
-
848
- # 检查表是否存在
849
- if not self._check_table_exists(full_table_name):
850
- raise ValueError(f"Table '{full_table_name}' does not exist")
851
-
852
- # 检查是否已经发布,查看Redis中是否有值
853
- try:
854
- # 获取离线表的列名
855
- online_data = self._read_online_table(
856
- table_name=table_name,
857
- database_name=database_name,
858
- online_config=online_config)
859
- except Exception as e:
860
- print(f"Failed to get online table view for table '{full_table_name}': {str(e)}")
861
- else:
862
- if online_data:
863
- raise ValueError(f"Table '{full_table_name}' has already been published")
864
-
865
- # 配置周期性参数
866
- if is_cycle:
867
- if not isinstance(cycle_obj, TaskSchedulerConfiguration):
868
- raise ValueError("cycle_obj must be a TaskSchedulerConfiguration object when is_cycle is True")
869
-
870
- cycle_obj.CycleType = "CRONTAB_CYCLE"
871
- else:
872
- if isinstance(cycle_obj, TaskSchedulerConfiguration):
873
- cycle_obj.CycleType = "ONEOFF_CYCLE"
874
- else:
875
- cycle_obj = TaskSchedulerConfiguration()
876
- cycle_obj.CycleType = "ONEOFF_CYCLE"
877
- # 设置默认当前时间延后1分钟
878
- cycle_obj.CrontabExpression = (datetime.datetime.now() + datetime.timedelta(minutes=3)).strftime(
879
- "%M %H %d %m %w ? %y")
880
-
881
- if is_use_default_online:
882
- online_feature_config = OnlineFeatureConfiguration()
883
- online_feature_config.UserDefault = True
884
- else:
885
- if not isinstance(online_config, RedisStoreConfig):
886
- raise ValueError("online_config must be a RedisStoreConfig object when is_use_default_online is False")
887
-
888
- online_feature_config = OnlineFeatureConfiguration()
889
- online_feature_config.UserDefault = False
890
- online_feature_config.Host = online_config.host
891
- online_feature_config.Port = online_config.port
892
- online_feature_config.DB = online_config.db
893
-
894
- offline_feature_config = OfflineFeatureConfiguration()
895
- offline_feature_config.DatabaseName = env_utils.get_database_name(database_name)
896
- offline_feature_config.TableName = table_name
897
-
898
- offline_feature_config.PrimaryKeys, offline_feature_config.TimestampColumn = self._get_table_primary_keys_and_timestamp_key(
899
- full_table_name)
900
-
901
- offline_feature_config.DatasourceName = data_source_name
902
- offline_feature_config.DatasourceType = env_utils.get_engine_type()
903
- offline_feature_config.EngineName = env_utils.get_engine_name()
904
-
905
- api_requests = CreateOnlineFeatureTableRequest()
906
- api_requests.OfflineFeatureConfiguration = offline_feature_config
907
- api_requests.OnlineFeatureConfiguration = online_feature_config
908
- api_requests.TaskSchedulerConfiguration = cycle_obj
909
- api_requests.ProjectId = env_utils.get_project_id()
910
- region = env_utils.get_region()
911
- if not os.environ.get("RESOURCE_GROUP_ID", ""):
912
- res_group_item = _get_default_resource_group(
913
- api_requests.ProjectId, cloud_secret_id, cloud_secret_key, region)
914
- api_requests.ResourceGroupId = res_group_item.ExecutorGroupId
915
- else:
916
- api_requests.ResourceGroupId = os.environ.get("RESOURCE_GROUP_ID")
917
- client = FeatureCloudSDK(secret_id=cloud_secret_id, secret_key=cloud_secret_key, region=region)
918
- resp = client.CreateOnlineFeatureTable(api_requests)
919
- if cycle_obj.CycleType == "ONEOFF_CYCLE":
920
- print(f"publish online task create success. it will be execute after 3 min. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId} ")
921
- else:
922
- print(f"publish online task create success. {resp.Data.OnlineTableId} {resp.Data.OfflineTableId} ")
923
-
924
- def drop_online_table(self, table_name: str, online_config: RedisStoreConfig, database_name: Optional[str] = None):
925
- # 构建完整表名
926
- full_table_name = common_utils.build_full_table_name(table_name, database_name)
927
- feast_client = FeastClient(self._spark, online_config)
928
- try:
929
- self._sync_table_info(table_name=table_name, database_name=database_name, action_name="delete_online",
930
- data_source_name="", engine_name=env_utils.get_engine_name(), is_try=True)
931
- except Exception as e:
932
- raise RuntimeError(f"drop online table failed. table_name: {full_table_name}. {str(e)}")
933
-
934
- feast_client.remove_online_table(full_table_name)
935
- try:
936
- self._sync_table_info(table_name=table_name, database_name=database_name, action_name="delete_online",
937
- data_source_name="", engine_name=env_utils.get_engine_name(), is_try=False)
938
- except Exception as e:
939
- raise RuntimeError(f"drop online table failed. table_name: {full_table_name}. {str(e)}")
940
- print(f"drop online table success. table_name: {full_table_name}")
941
-
942
- def _get_table_primary_keys_and_timestamp_key(self, full_table_name: str) -> 'str, str':
943
-
944
- tbl_pro = self._spark.sql(f"SHOW TBLPROPERTIES {full_table_name}")
945
- props = {row['key']: row['value'] for row in tbl_pro.collect()}
946
-
947
- if props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, ""):
948
- primary_keys = props.get(FEATURE_DLC_TABLE_PRIMARY_KEY, "")
949
- else:
950
- primary_keys = props.get(FEATURE_TABLE_BACKUP_PRIMARY_KEY, "")
951
- primary_keys = primary_keys.split(",")
952
- timestamp_key = props.get(FEATURE_TABLE_TIMESTAMP, "")
953
- return primary_keys, timestamp_key
954
-
955
- def _check_table_exists(self, full_table_name: str) -> bool:
956
- return common_utils.check_spark_table_exists(self._spark, full_table_name)
957
-
958
-
959
- def _get_default_resource_group(project_id: str, secret_id: str, secret_key: str, region: str):
960
- client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
961
- request = DescribeNormalSchedulerExecutorGroupsRequest()
962
- request.ProjectId = project_id
963
- resp = client.DescribeNormalSchedulerExecutorGroups(request)
964
- # 默认取第一个健康可用的资源组进行执行
965
- for item in resp.Data:
966
- if item.Available:
967
- return item
968
- raise ValueError("No available resource group found")
969
-
970
-
971
- def _refresh_table(project_id: str, secret_id: str, secret_key: str, region: str, table_name: str,
972
- action: str, database_name: str, data_source_name: str, data_source_type: str,
973
- engine_name: str, is_try: bool):
974
- client = FeatureCloudSDK(secret_id=secret_id, secret_key=secret_key, region=region)
975
- request = RefreshFeatureTableRequest()
976
- request.ProjectId = project_id
977
- request.TableName = table_name
978
- request.DatabaseName = database_name
979
- request.DatasourceName = data_source_name
980
- request.DatasourceType = data_source_type
981
- request.EngineName = engine_name
982
- request.ActionName = action
983
- request.IsTry = is_try
984
- resp = client.RefreshFeatureTable(request)
985
- return resp
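publish_table, drop_online_table, and the cloud SDK, feast, and tempo helpers are removed in 0.2.0 with no replacement in this file, so code that still publishes online tables needs the previous release. A defensive sketch:

    client = FeatureTableClient(spark=spark)
    if not hasattr(client, "publish_table"):
        # Removed in 0.2.0; pin the previous release to keep the online-store APIs:
        #   pip install tencent-wedata-feature-engineering-dev==0.1.50
        raise RuntimeError("publish_table was removed in 0.2.0")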