PyPI - wedata-feature-engineering - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

wedata-feature-engineering 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

wedata/__init__.py CHANGED Viewed

@@ -3,4 +3,4 @@ WeData Feature Engineering
 A toolkit for automated feature engineering
 """
-__version__ = "0.1.8"
+__version__ = "0.1.10"

wedata/feature_store/client.py CHANGED Viewed

@@ -202,6 +202,7 @@ class FeatureStoreClient:
             flavor: ModuleType,
             training_set: Optional[TrainingSet] = None,
             registered_model_name: Optional[str] = None,
+            model_registry_uri: Optional[str] = None,
             await_registration_for: int = mlflow.tracking._model_registry.DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
             infer_input_example: bool = False,
             **kwargs,
@@ -218,6 +219,7 @@ class FeatureStoreClient:
              flavor: MLflow模型类型模块(如mlflow.sklearn)
              training_set: 训练模型使用的TrainingSet对象(可选)
              registered_model_name: 要注册的模型名称(可选)
+             model_registry_uri: 模型注册中心地址(可选)
              await_registration_for: 等待模型注册完成的秒数(默认300秒)
              infer_input_example: 是否自动记录输入示例(默认False)
@@ -231,6 +233,7 @@ class FeatureStoreClient:
             flavor=flavor,
             training_set=training_set,
             registered_model_name=registered_model_name,
+            model_registry_uri=model_registry_uri,
             await_registration_for=await_registration_for,
             infer_input_example=infer_input_example,
             **kwargs

wedata/feature_store/feature_table_client/feature_table_client.py CHANGED Viewed

@@ -113,7 +113,7 @@ class FeatureTableClient:
         try:
             if self._spark.catalog.tableExists(table_name):
                 raise ValueError(
-                    f"Table '{table_name}' already exists\n"
+                    f"Table '{name}' already exists\n"
                     "Solutions:\n"
                     "1. Use a different table name\n"
                     "2. Drop the existing table: spark.sql(f'DROP TABLE {name}')\n"
@@ -125,11 +125,6 @@ class FeatureTableClient:
         table_schema = schema or df.schema
         # 构建时间戳键属性
-        timestamp_keys_ddl = []
-        for timestamp_key in timestamp_keys:
-            if timestamp_key not in primary_keys:
-                raise ValueError(f"Timestamp key '{timestamp_key}' must be a primary key")
-            timestamp_keys_ddl.append(f"`{timestamp_key}` TIMESTAMP")
         #从环境变量获取额外标签
         env_tags = {
@@ -142,6 +137,7 @@ class FeatureTableClient:
         tbl_properties = {
             "feature_table": "TRUE",
             "primaryKeys": ",".join(primary_keys),
+            "timestampKeys": ",".join(timestamp_keys) if timestamp_keys else "",
             "comment": description or "",
             **{f"{k}": v for k, v in (tags or {}).items()},
             **{f"feature_{k}": v for k, v in (env_tags or {}).items()}
@@ -171,7 +167,7 @@ class FeatureTableClient:
         CREATE TABLE {table_name} (
             {', '.join(columns_ddl)}
         )
-        USING PARQUET
+        USING iceberg
         {partition_expr}
         TBLPROPERTIES (
             {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
@@ -293,13 +289,13 @@ class FeatureTableClient:
         try:
             # 检查表是否存在
             if not self._spark.catalog.tableExists(table_name):
-                raise ValueError(f"表 '{table_name}' 不存在")
+                raise ValueError(f"Table '{name}' does not exist")
             # 读取表数据
             return self._spark.read.table(table_name)
         except Exception as e:
-            raise ValueError(f"读取表 '{table_name}' 失败: {str(e)}") from e
+            raise ValueError(f"Failed to read table '{name}': {str(e)}") from e
     def drop_table(self, name: str):
@@ -327,7 +323,7 @@ class FeatureTableClient:
         try:
             # 检查表是否存在
             if not self._spark.catalog.tableExists(table_name):
-                raise ValueError(f"表 '{table_name}' 不存在")
+                raise ValueError(f"Table '{name}' does not exist")
             # 执行删除
             self._spark.sql(f"DROP TABLE {table_name}")
@@ -335,7 +331,7 @@ class FeatureTableClient:
         except ValueError as e:
             raise  # 直接抛出已知的ValueError
         except Exception as e:
-            raise RuntimeError(f"删除表 '{table_name}' 失败: {str(e)}") from e
+            raise RuntimeError(f"Failed to delete table '{name}': {str(e)}") from e
     def get_table(
             self,
@@ -365,4 +361,4 @@ class FeatureTableClient:
         try:
             return spark_client.get_feature_table(table_name)
         except Exception as e:
-            raise ValueError(f"获取表'{name}'元数据失败: {str(e)}") from e
+            raise ValueError(f"Failed to get metadata for table '{table_name}': {str(e)}") from e

wedata/feature_store/spark_client/spark_client.py CHANGED Viewed

@@ -9,13 +9,39 @@ from pyspark.sql.types import StructType, StringType, StructField
 from wedata.feature_store.entities.feature import Feature
 from wedata.feature_store.entities.feature_table import FeatureTable
 from wedata.feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
-from wedata.feature_store.utils.common_utils import unsanitize_identifier, sanitize_multi_level_name
+from wedata.feature_store.utils.common_utils import unsanitize_identifier
 class SparkClient:
     def __init__(self, spark: SparkSession):
         self._spark = spark
+    def _parse_table_name(self, table_name):
+        """解析表名并返回表名部分
+        参数:
+            table_name: 完整表名，支持格式: catalog.schema.table、schema.table 或 table
+        返回:
+            str: 解析后的表名部分
+        """
+        if not isinstance(table_name, str):
+            raise ValueError("Table name must be string type")
+        table_name = table_name.strip()
+        if not table_name:
+            raise ValueError("Table name cannot be empty")
+        parts = table_name.split('.')
+        if len(parts) == 3:
+            # 对于三部分名称(catalog.schema.table)，只使用表名部分
+            return parts[2]
+        elif len(parts) == 2:
+            # 对于两部分名称(schema.table)，只使用表名部分
+            return parts[1]
+        else:
+            # 单表名，直接使用
+            return table_name
     def get_current_catalog(self):
         """
@@ -66,19 +92,13 @@ class SparkClient:
         """
         try:
             # 解析表名
-            parts = table_name.split('.')
-            if len(parts) == 3:
-                catalog, schema, table = parts
-            elif len(parts) == 2:
-                schema, table = parts
-            else:
-                table = table_name
+            schema_table_name = self._parse_table_name(table_name)
             # 验证表是否存在
-            if not self._spark.catalog.tableExists(table):
+            if not self._spark.catalog.tableExists(schema_table_name):
                 raise ValueError(f"表不存在: {table_name}")
-            return self._spark.table(table)
+            return self._spark.table(schema_table_name)
         except Exception as e:
             raise ValueError(f"读取表 {table_name} 失败: {str(e)}")
@@ -86,23 +106,10 @@ class SparkClient:
     def get_features(self, table_name):
         # 解析表名
-        parts = table_name.split('.')
-        if len(parts) == 3:
-            # 对于三部分名称(catalog.schema.table)，使用schema.table格式
-            _, schema, table = parts
-            full_table_name = f"{schema}.{table}"
-        elif len(parts) == 2:
-            # 对于两部分名称(schema.table)，直接使用
-            full_table_name = table_name
-        else:
-            # 单表名，使用当前数据库
-            current_db = self.get_current_database()
-            if not current_db:
-                raise ValueError("无法确定当前数据库")
-            full_table_name = f"{current_db}.{table_name}"
+        schema_table_name = self._parse_table_name(table_name)
         # 使用dbName.tableName格式查询列信息
-        columns = self._spark.catalog.listColumns(tableName=full_table_name)
+        columns = self._spark.catalog.listColumns(tableName=schema_table_name)
         return [
             Feature(
                 feature_table=table_name,
@@ -114,22 +121,14 @@ class SparkClient:
         ]
     def get_feature_table(self, table_name):
+        # 解析表名
+        schema_table_name = self._parse_table_name(table_name)
         # 获取表元数据
-        table = self._spark.catalog.getTable(table_name)
+        table = self._spark.catalog.getTable(schema_table_name)
-        parts = table_name.split('.')
-        if len(parts) == 3:
-            # 对于三部分名称(catalog.schema.table)，只使用表名部分
-            table_to_describe = parts[2]
-        elif len(parts) == 2:
-            # 对于两部分名称(schema.table)，只使用表名部分
-            table_to_describe = parts[1]
-        else:
-            # 单表名，直接使用
-            table_to_describe = table_name
         # 获取表详细信息
-        table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {table_to_describe}").collect()
+        table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {schema_table_name}").collect()
         table_properties = {}
         for row in table_details:

wedata/feature_store/training_set_client/training_set_client.py CHANGED Viewed

@@ -186,6 +186,7 @@ class TrainingSetClient:
             flavor: ModuleType,
             training_set: Optional[TrainingSet],
             registered_model_name: Optional[str],
+            model_registry_uri: Optional[str],
             await_registration_for: int,
             infer_input_example: bool,
             **kwargs,
@@ -334,8 +335,7 @@ class TrainingSetClient:
             except Exception:
                 input_example = None
-            # todo:
-            #feature_spec.save(data_path)
+            feature_spec.save(data_path)
             # Log the packaged model. If no run is active, this call will create an active run.
             mlflow.pyfunc.log_model(
@@ -355,13 +355,12 @@ class TrainingSetClient:
             # If the user provided an explicit model_registry_uri when constructing the FeatureStoreClient,
             # we respect this by setting the registry URI prior to reading the model from Model
             # Registry.
-            # todo:
-            # if self._model_registry_uri:
-            #     # This command will override any previously set registry_uri.
-            #     mlflow.set_registry_uri(self._model_registry_uri)
+        if model_registry_uri is not None:
+             # This command will override any previously set registry_uri.
+            mlflow.set_registry_uri(model_registry_uri)
             mlflow.register_model(
                 "runs:/%s/%s" % (run_id, artifact_path),
                 registered_model_name,
                 await_registration_for=await_registration_for,
-                )
+            )

{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: wedata-feature-engineering
-Version: 0.1.8
+Version: 0.1.10
 Summary: Wedata Feature Engineering Library
 Home-page:
 Author: meahqian
@@ -14,4 +14,10 @@ Description-Content-Type: text/markdown
 Requires-Dist: pyspark>=3.0.0
 Requires-Dist: delta-spark>=1.0.0
 Requires-Dist: pandas>=1.0.0
+Dynamic: author
+Dynamic: classifier
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
-wedata/__init__.py,sha256=yIceuEY46nh56GEjtGNrDMIKTYtBHEf-Wj5Rc-cJS-g,101
+wedata/__init__.py,sha256=_M49ivoMq-NogMzHKd9DW6GfjUBWL18mb4gB5dK1Vbw,102
 wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wedata/feature_store/client.py,sha256=7a-9C8HIBHnQNQD6I4W3UtBQwkJE8G-Q7N24zydjpkY,8100
+wedata/feature_store/client.py,sha256=DO68yHiaJQ3LmrZ-owWEuRjuwM6vUjcaEdAcF65mdhs,8271
 wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wedata/feature_store/constants/constants.py,sha256=b4tgcSt66YIq0Fg7pMbqvbqPOI77Cz8znLVZ4ihUKss,1479
 wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -20,11 +20,11 @@ wedata/feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipB
 wedata/feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
 wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
 wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wedata/feature_store/feature_table_client/feature_table_client.py,sha256=W6_TJ6PNc5o6SotVppBmu6VWZ7q_lPgIeg9Xsbr9r-g,12136
+wedata/feature_store/feature_table_client/feature_table_client.py,sha256=AoqlXWsR95UgrKuh7QNBUF4ygNmAgTQ_bRsJpmajRmc,11938
 wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wedata/feature_store/spark_client/spark_client.py,sha256=DBCYjLsFrIVRvLErTNyfLIHRul3v0y9uZIY2JR1N92s,10323
+wedata/feature_store/spark_client/spark_client.py,sha256=SwMf-TsAeV7_8pDmh4927pKEwwKcIFK3JJ-J8rzUp_Q,10129
 wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wedata/feature_store/training_set_client/training_set_client.py,sha256=gHeZU0rvvUcyNTfroXD3LAinFPdhDpnwTOIWj6z84Tc,15102
+wedata/feature_store/training_set_client/training_set_client.py,sha256=CVcdgqfHL2S-fSCkfDwQgqtMhkB8haGEi1kEjbudDOk,15087
 wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wedata/feature_store/utils/common_utils.py,sha256=cR3Vd49sWZrclaXvNO6B52Sk2v88iXmYmCIhi9xWsPM,10000
 wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
@@ -37,7 +37,7 @@ wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZ
 wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
 wedata/feature_store/utils/uc_utils.py,sha256=A-W8Cd8yvTmAMEWaHeWmGmcIDMvUtjAfx2G2x_di1QE,10774
 wedata/feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
-wedata_feature_engineering-0.1.8.dist-info/METADATA,sha256=HX42mSJie1KwNQnrB3temigb7fmxxEqShuE065NDcL8,493
-wedata_feature_engineering-0.1.8.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-wedata_feature_engineering-0.1.8.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
-wedata_feature_engineering-0.1.8.dist-info/RECORD,,
+wedata_feature_engineering-0.1.10.dist-info/METADATA,sha256=a3hj-GU81Glxtr14wsUdmothFfw5h9vdGYb2PWL5G5A,645
+wedata_feature_engineering-0.1.10.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+wedata_feature_engineering-0.1.10.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
+wedata_feature_engineering-0.1.10.dist-info/RECORD,,

{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.45.1)
+Generator: setuptools (79.0.1)
 Root-Is-Purelib: true
 Tag: py3-none-any

{wedata_feature_engineering-0.1.8.dist-info → wedata_feature_engineering-0.1.10.dist-info}/top_level.txt RENAMED Viewed

File without changes

wedata-feature-engineering 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

wedata-feature-engineering 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl