wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (48)
  1. {feature_store → wedata}/__init__.py +1 -1
  2. {feature_store → wedata/feature_store}/client.py +113 -41
  3. {feature_store → wedata/feature_store}/constants/constants.py +19 -0
  4. {feature_store → wedata/feature_store}/entities/column_info.py +4 -4
  5. {feature_store → wedata/feature_store}/entities/feature_lookup.py +5 -1
  6. {feature_store → wedata/feature_store}/entities/feature_spec.py +46 -46
  7. wedata/feature_store/entities/feature_table.py +107 -0
  8. {feature_store → wedata/feature_store}/entities/training_set.py +13 -12
  9. {feature_store → wedata/feature_store}/feature_table_client/feature_table_client.py +85 -30
  10. {feature_store → wedata/feature_store}/spark_client/spark_client.py +30 -56
  11. wedata/feature_store/training_set_client/training_set_client.py +367 -0
  12. wedata/feature_store/utils/__init__.py +0 -0
  13. feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py +108 -54
  14. {feature_store → wedata/feature_store}/utils/feature_lookup_utils.py +6 -6
  15. {feature_store → wedata/feature_store}/utils/feature_spec_utils.py +6 -6
  16. {feature_store → wedata/feature_store}/utils/feature_utils.py +5 -5
  17. wedata/feature_store/utils/on_demand_utils.py +107 -0
  18. {feature_store → wedata/feature_store}/utils/schema_utils.py +1 -1
  19. wedata/feature_store/utils/signature_utils.py +205 -0
  20. {feature_store → wedata/feature_store}/utils/training_set_utils.py +18 -19
  21. {feature_store → wedata/feature_store}/utils/uc_utils.py +1 -1
  22. {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
  23. wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
  24. wedata_feature_engineering-0.1.6.dist-info/top_level.txt +1 -0
  25. feature_store/entities/feature_table.py +0 -164
  26. feature_store/training_set_client/training_set_client.py +0 -196
  27. feature_store/utils/common_utils.py +0 -96
  28. wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
  29. wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
  30. {feature_store/constants → wedata/feature_store}/__init__.py +0 -0
  31. {feature_store/entities → wedata/feature_store/constants}/__init__.py +0 -0
  32. {feature_store/feature_table_client → wedata/feature_store/entities}/__init__.py +0 -0
  33. {feature_store → wedata/feature_store}/entities/data_type.py +0 -0
  34. {feature_store → wedata/feature_store}/entities/environment_variables.py +0 -0
  35. {feature_store → wedata/feature_store}/entities/feature.py +0 -0
  36. {feature_store → wedata/feature_store}/entities/feature_column_info.py +0 -0
  37. {feature_store → wedata/feature_store}/entities/feature_function.py +0 -0
  38. {feature_store → wedata/feature_store}/entities/feature_spec_constants.py +0 -0
  39. {feature_store → wedata/feature_store}/entities/feature_table_info.py +0 -0
  40. {feature_store → wedata/feature_store}/entities/function_info.py +0 -0
  41. {feature_store → wedata/feature_store}/entities/on_demand_column_info.py +0 -0
  42. {feature_store → wedata/feature_store}/entities/source_data_column_info.py +0 -0
  43. {feature_store/spark_client → wedata/feature_store/feature_table_client}/__init__.py +0 -0
  44. {feature_store/training_set_client → wedata/feature_store/spark_client}/__init__.py +0 -0
  45. {feature_store/utils → wedata/feature_store/training_set_client}/__init__.py +0 -0
  46. {feature_store → wedata/feature_store}/utils/topological_sort.py +0 -0
  47. {feature_store → wedata/feature_store}/utils/validation_utils.py +0 -0
  48. {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
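Nearly every module in the list above moves from the top-level feature_store package into the wedata.feature_store namespace, and the dist-info top_level.txt is replaced to match. A minimal sketch of what that means for downstream imports, using module paths taken from the file list; the surrounding usage is illustrative only:

# 0.1.4: modules lived under the top-level "feature_store" package
from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
from feature_store.entities.feature_table import FeatureTable

# 0.1.6: the same modules now ship under the "wedata" namespace
from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
from wedata.feature_store.entities.feature_table import FeatureTable
from wedata.feature_store.utils import common_utils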
{feature_store → wedata/feature_store}/feature_table_client/feature_table_client.py

@@ -8,15 +8,18 @@ from pyspark.sql.streaming import StreamingQuery
 from pyspark.sql.types import StructType
 import os
 
-from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
+from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.spark_client.spark_client import SparkClient
+from wedata.feature_store.utils import common_utils
 
 
 class FeatureTableClient:
     """特征表操作类"""
 
     def __init__(
-        self,
-        spark: SparkSession
+            self,
+            spark: SparkSession
     ):
         self._spark = spark
 
@@ -46,12 +49,6 @@ class FeatureTableClient:
                 f"DataFrame与schema不匹配。差异字段: {diff_fields if diff_fields else '字段类型不一致'}"
             )
 
-    @staticmethod
-    def _validate_table_name(name: str):
-        """验证特征表命名规范"""
-        if name.count('.') < 2:
-            raise ValueError("特征表名称需符合<catalog>.<schema>.<table>格式")
-
     @staticmethod
     def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: List[str]):
         """校验主键与时间戳键是否冲突"""
@@ -75,7 +72,8 @@ class FeatureTableClient:
             schema: Optional[StructType] = None,
             description: Optional[str] = None,
             tags: Optional[Dict[str, str]] = None
-    ):
+    ) -> FeatureTable:
+
         """
         创建特征表(支持批流数据写入)
 
@@ -85,6 +83,7 @@ class FeatureTableClient:
             df: 初始数据(可选,用于推断schema)
             timestamp_keys: 时间戳键(用于时态特征)
             partition_columns: 分区列(优化存储查询)
+            schema: 表结构定义(可选,当不提供df时必需)
             description: 业务描述
             tags: 业务标签
 
@@ -94,6 +93,7 @@ class FeatureTableClient:
         Raises:
             ValueError: 当schema与数据不匹配时
         """
+
         # 参数标准化
         primary_keys = self._normalize_params(primary_keys)
         timestamp_keys = self._normalize_params(timestamp_keys)
@@ -101,23 +101,25 @@ class FeatureTableClient:
 
         # 元数据校验
         self._validate_schema(df, schema)
-        #self._validate_table_name(name)
         self._validate_key_conflicts(primary_keys, timestamp_keys)
 
-        # 表名 格式:<catalog>.<schema>.<table> catalog默认值:DataLakeCatalog,schema默认值:feature_store
-        table_name = f'DataLakeCatalog.feature_store.{name}'
+        # 表名校验
+        common_utils.validate_table_name(name)
+
+        # 构建完整表名
+        table_name = common_utils.build_full_table_name(name)
 
         # 检查表是否存在
         try:
             if self._spark.catalog.tableExists(table_name):
                 raise ValueError(
-                    f" '{table_name}' 已存在\n"
-                    "解决方案:\n"
-                    "1. 使用不同的表名\n"
-                    "2. 删除现有表: spark.sql(f'DROP TABLE {name}')\n"
+                    f"Table '{table_name}' already exists\n"
+                    "Solutions:\n"
+                    "1. Use a different table name\n"
+                    "2. Drop the existing table: spark.sql(f'DROP TABLE {name}')\n"
                 )
         except Exception as e:
-            raise ValueError(f"检查表存在性时出错: {str(e)}") from e
+            raise ValueError(f"Error checking table existence: {str(e)}") from e
 
         # 推断表schema
         table_schema = schema or df.schema
@@ -126,7 +128,7 @@ class FeatureTableClient:
         timestamp_keys_ddl = []
         for timestamp_key in timestamp_keys:
             if timestamp_key not in primary_keys:
-                raise ValueError(f"时间戳键 '{timestamp_key}' 必须是主键")
+                raise ValueError(f"Timestamp key '{timestamp_key}' must be a primary key")
             timestamp_keys_ddl.append(f"`{timestamp_key}` TIMESTAMP")
 
         #从环境变量获取额外标签
@@ -185,7 +187,19 @@ class FeatureTableClient:
             if df is not None:
                 df.write.insertInto(table_name)
         except Exception as e:
-            raise ValueError(f"建表失败: {str(e)}") from e
+            raise ValueError(f"Failed to create table: {str(e)}") from e
+
+        # 构建并返回FeatureTable对象
+        return FeatureTable(
+            name=name,
+            table_id=table_name,
+            description=description or "",
+            primary_keys=primary_keys,
+            partition_columns=partition_columns or [],
+            features=[field.name for field in table_schema.fields],
+            timestamp_keys=timestamp_keys or [],
+            tags=dict(**tags or {}, **env_tags)
+        )
 
     def write_table(
             self,
@@ -195,6 +209,7 @@ class FeatureTableClient:
             checkpoint_location: Optional[str] = None,
             trigger: Optional[Dict[str, Any]] = DEFAULT_WRITE_STREAM_TRIGGER
     ) -> Optional[StreamingQuery]:
+
         """
         写入特征表数据(支持批处理和流式写入)
 
@@ -215,10 +230,13 @@ class FeatureTableClient:
         # 验证写入模式
         valid_modes = ["append", "overwrite"]
         if mode not in valid_modes:
-            raise ValueError(f"无效的写入模式 '{mode}',可选值: {valid_modes}")
+            raise ValueError(f"Invalid write mode '{mode}', valid options: {valid_modes}")
+
+        # 表名校验
+        common_utils.validate_table_name(name)
 
-        # 完整表名格式:<catalog>.<schema>.<table>
-        table_name = f'DataLakeCatalog.feature_store.{name}'
+        # 构建完整表名
+        table_name = common_utils.build_full_table_name(name)
 
         # 判断是否是流式DataFrame
         is_streaming = df.isStreaming
@@ -227,7 +245,7 @@ class FeatureTableClient:
         if is_streaming:
             # 流式写入
             if not checkpoint_location:
-                raise ValueError("流式写入必须提供checkpoint_location参数")
+                raise ValueError("Streaming write requires checkpoint_location parameter")
 
             writer = df.writeStream \
                 .format("parquet") \
@@ -252,6 +270,7 @@ class FeatureTableClient:
             self,
             name: str
     ) -> DataFrame:
+
         """
         从特征表中读取数据
 
@@ -264,8 +283,12 @@ class FeatureTableClient:
         Raises:
             ValueError: 当表不存在或读取失败时抛出
         """
+
+        # 表名校验
+        common_utils.validate_table_name(name)
+
         # 构建完整表名
-        table_name = f'DataLakeCatalog.feature_store.{name}'
+        table_name = common_utils.build_full_table_name(name)
 
         try:
             # 检查表是否存在
@@ -278,10 +301,8 @@ class FeatureTableClient:
         except Exception as e:
             raise ValueError(f"读取表 '{table_name}' 失败: {str(e)}") from e
 
-    def drop_table(
-            self,
-            name: str
-    ) -> None:
+    def drop_table(self, name: str):
+
         """
         删除特征表(表不存在时抛出异常)
 
@@ -296,8 +317,12 @@ class FeatureTableClient:
         # 基本删除
         drop_table("user_features")
         """
+
+        # 表名校验
+        common_utils.validate_table_name(name)
+
         # 构建完整表名
-        table_name = f'DataLakeCatalog.feature_store.{name}'
+        table_name = common_utils.build_full_table_name(name)
 
         try:
             # 检查表是否存在
@@ -311,3 +336,33 @@ class FeatureTableClient:
             raise  # 直接抛出已知的ValueError
         except Exception as e:
             raise RuntimeError(f"删除表 '{table_name}' 失败: {str(e)}") from e
+
+    def get_table(
+            self,
+            name: str,
+            spark_client: SparkClient
+    ) -> FeatureTable:
+
+        """获取特征表元数据信息
+
+        参数:
+            name: 特征表名称
+            spark_client: Spark客户端
+
+        返回:
+            FeatureTable对象
+
+        异常:
+            ValueError: 当表不存在或获取失败时抛出
+        """
+
+        # 表名校验
+        common_utils.validate_table_name(name)
+
+        # 构建完整表名
+        table_name = common_utils.build_full_table_name(name)
+
+        try:
+            return spark_client.get_feature_table(table_name)
+        except Exception as e:
+            raise ValueError(f"获取表'{name}'元数据失败: {str(e)}") from e
{feature_store → wedata/feature_store}/spark_client/spark_client.py

@@ -6,73 +6,52 @@ from pyspark.sql.catalog import Column
 from pyspark.sql.functions import when, isnull
 from pyspark.sql.types import StructType, StringType, StructField
 
-from feature_store.entities.feature import Feature
-from feature_store.entities.feature_table import FeatureTable
-from feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
-from feature_store.utils.common_utils import unsanitize_identifier
-from feature_store.utils.utils import sanitize_multi_level_name
+from wedata.feature_store.entities.feature import Feature
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
+from wedata.feature_store.utils.common_utils import unsanitize_identifier, sanitize_multi_level_name
 
 
 class SparkClient:
     def __init__(self, spark: SparkSession):
         self._spark = spark
 
-    def createDataFrame(self, data, schema) -> DataFrame:
-        return self._spark.createDataFrame(data, schema)
-
-    def read_table(
-        self, qualified_table_name, as_of_delta_timestamp=None, streaming=False
-    ):
-        """
-        Reads a Delta table, optionally as of some timestamp.
-        """
-        if streaming and as_of_delta_timestamp:
-            raise ValueError(
-                "Internal error: as_of_delta_timestamp cannot be specified when"
-                " streaming=True."
-            )
-
-        base_reader = (
-            # By default, Structured Streaming only handles append operations. Because
-            # we have a notion of primary keys, most offline feature store operations
-            # are not appends. For example, FeatureStoreClient.write_table(mode=MERGE)
-            # will issue a MERGE operation.
-            # In order to propagate the non-append operations to the
-            # readStream, we set ignoreChanges to "true".
-            # For more information,
-            # see https://docs.databricks.com/delta/delta-streaming.html#ignore-updates-and-deletes
-            self._spark.readStream.format("delta").option("ignoreChanges", "true")
-            if streaming
-            else self._spark.read.format("delta")
-        )
-
-        if as_of_delta_timestamp:
-            return base_reader.option("timestampAsOf", as_of_delta_timestamp).table(
-                sanitize_multi_level_name(qualified_table_name)
-            )
-        else:
-            return base_reader.table(sanitize_multi_level_name(qualified_table_name))
 
     def get_current_catalog(self):
         """
-        Get current set catalog in the spark context.
+        获取当前Spark会话的catalog名称(使用spark.catalog.currentCatalog属性)
+
+        返回:
+            str: 当前catalog名称,如果未设置则返回None
         """
         try:
-            df = self._spark.sql("SELECT CURRENT_CATALOG()").collect()
-            return unsanitize_identifier(df[0][0])
-        except Exception as e:
+            return unsanitize_identifier(self._spark.catalog.currentCatalog())
+        except Exception:
             return None
 
     def get_current_database(self):
         """
-        Get current set database in the spark context.
+        获取Spark上下文中当前设置的数据库名称
+
+        返回:
+            str: 当前数据库名称,如果获取失败则返回None
         """
         try:
-            df = self._spark.sql("SELECT CURRENT_DATABASE()").collect()
-            return unsanitize_identifier(df[0][0])
-        except Exception as e:
+            # 使用Spark SQL查询当前数据库
+            df = self._spark.sql("SELECT CURRENT_DATABASE()")
+            # 获取第一行第一列的值并去除特殊字符
+            return unsanitize_identifier(df.first()[0])
+        except Exception:
+            # 捕获所有异常并返回None
             return None
 
+
+
+
+    def createDataFrame(self, data, schema) -> DataFrame:
+        return self._spark.createDataFrame(data, schema)
+
+
     def read_table(self, table_name):
         """读取Spark表数据
 
@@ -134,11 +113,6 @@ class SparkClient:
             ) for row in columns
         ]
 
-    def get_online_stores(self, table_name):
-        return None
-
-
-
     def get_feature_table(self, table_name):
 
         # 获取表元数据
@@ -170,19 +144,19 @@ class SparkClient:
         return FeatureTable(
             name=table_name,
             table_id=table_properties.get("table_id", table_name),
-            description=table.description or table_properties.get("description", table_name),
+            description=table.description or table_properties.get("comment", table_name),
             primary_keys=table_properties.get("primaryKeys", "").split(",") if table_properties.get("primaryKeys") else [],
             partition_columns=table.partitionColumnNames if hasattr(table, 'partitionColumnNames') else [],
             features=features,
             creation_timestamp=None,  # Spark表元数据不包含创建时间戳
-            online_stores=self.get_online_stores(table_name),
+            online_stores=None,
             notebook_producers=None,
             job_producers=None,
             table_data_sources=None,
             path_data_sources=None,
             custom_data_sources=None,
             timestamp_keys=table_properties.get("timestamp_keys"),
-            tags=table_properties.get("tags")
+            tags=table_properties
         )
 
     def _get_routines_with_parameters(self, full_routine_names: List[str]) -> DataFrame:
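The SparkClient rework drops the Delta-specific read_table overload and the get_online_stores stub, resolves the current catalog through spark.catalog.currentCatalog() (PySpark 3.4+) instead of a SQL query, and has get_feature_table take the description from the "comment" table property while returning the raw property map as tags. A minimal sketch of the surviving helpers, assuming an active SparkSession and an already-created table; the fully qualified name shown follows the DataLakeCatalog.feature_store prefix hard-coded in 0.1.4, whereas 0.1.6 builds it via common_utils.build_full_table_name:

from pyspark.sql import SparkSession

from wedata.feature_store.spark_client.spark_client import SparkClient

spark = SparkSession.builder.getOrCreate()
client = SparkClient(spark)

# Both helpers now swallow errors and return None instead of raising
print(client.get_current_catalog())   # e.g. "spark_catalog", or None
print(client.get_current_database())  # e.g. "default", or None

# get_feature_table expects a fully qualified table name and returns a FeatureTable
feature_table = client.get_feature_table("DataLakeCatalog.feature_store.user_features")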