PyPI - tencent-wedata-feature-engineering-dev - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.

Files changed (64) hide show

tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
wedata/__init__.py +9 -0
wedata/feature_store/__init__.py +0 -0
wedata/feature_store/client.py +462 -0
wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
wedata/feature_store/cloud_sdk_client/client.py +86 -0
wedata/feature_store/cloud_sdk_client/models.py +686 -0
wedata/feature_store/cloud_sdk_client/utils.py +32 -0
wedata/feature_store/common/__init__.py +0 -0
wedata/feature_store/common/protos/__init__.py +0 -0
wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
wedata/feature_store/common/store_config/__init__.py +0 -0
wedata/feature_store/common/store_config/redis.py +48 -0
wedata/feature_store/constants/__init__.py +0 -0
wedata/feature_store/constants/constants.py +59 -0
wedata/feature_store/constants/engine_types.py +34 -0
wedata/feature_store/entities/__init__.py +0 -0
wedata/feature_store/entities/column_info.py +138 -0
wedata/feature_store/entities/environment_variables.py +55 -0
wedata/feature_store/entities/feature.py +53 -0
wedata/feature_store/entities/feature_column_info.py +72 -0
wedata/feature_store/entities/feature_function.py +55 -0
wedata/feature_store/entities/feature_lookup.py +200 -0
wedata/feature_store/entities/feature_spec.py +489 -0
wedata/feature_store/entities/feature_spec_constants.py +25 -0
wedata/feature_store/entities/feature_table.py +111 -0
wedata/feature_store/entities/feature_table_info.py +49 -0
wedata/feature_store/entities/function_info.py +90 -0
wedata/feature_store/entities/on_demand_column_info.py +57 -0
wedata/feature_store/entities/source_data_column_info.py +24 -0
wedata/feature_store/entities/training_set.py +135 -0
wedata/feature_store/feast_client/__init__.py +0 -0
wedata/feature_store/feast_client/feast_client.py +482 -0
wedata/feature_store/feature_table_client/__init__.py +0 -0
wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
wedata/feature_store/mlflow_model.py +17 -0
wedata/feature_store/spark_client/__init__.py +0 -0
wedata/feature_store/spark_client/spark_client.py +289 -0
wedata/feature_store/training_set_client/__init__.py +0 -0
wedata/feature_store/training_set_client/training_set_client.py +572 -0
wedata/feature_store/utils/__init__.py +0 -0
wedata/feature_store/utils/common_utils.py +352 -0
wedata/feature_store/utils/env_utils.py +86 -0
wedata/feature_store/utils/feature_lookup_utils.py +564 -0
wedata/feature_store/utils/feature_spec_utils.py +286 -0
wedata/feature_store/utils/feature_utils.py +73 -0
wedata/feature_store/utils/on_demand_utils.py +107 -0
wedata/feature_store/utils/schema_utils.py +117 -0
wedata/feature_store/utils/signature_utils.py +202 -0
wedata/feature_store/utils/topological_sort.py +158 -0
wedata/feature_store/utils/training_set_utils.py +579 -0
wedata/feature_store/utils/uc_utils.py +296 -0
wedata/feature_store/utils/validation_utils.py +79 -0
wedata/tempo/__init__.py +0 -0
wedata/tempo/interpol.py +448 -0
wedata/tempo/intervals.py +1331 -0
wedata/tempo/io.py +61 -0
wedata/tempo/ml.py +129 -0
wedata/tempo/resample.py +318 -0
wedata/tempo/tsdf.py +1720 -0
wedata/tempo/utils.py +254 -0

wedata/feature_store/utils/common_utils.py ADDED Viewed

@@ -0,0 +1,352 @@
+"""
+通用工具函数
+"""
+import os
+from collections import Counter
+from datetime import datetime, timezone
+from functools import wraps
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
+from mlflow.exceptions import RestException
+from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository
+from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
+from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
+from mlflow.utils import databricks_utils
+from wedata.feature_store.constants import constants
+from wedata.feature_store.constants.constants import MODEL_DATA_PATH_ROOT
+from pyspark.sql import SparkSession
+import logging
+# Configure logging (optional; adjust to the actual deployment).
+logging.basicConfig(level=logging.ERROR)
+def validate_table_name(name: str):
+    """
+    Validate a feature table name.
+    Only a bare table name is accepted; dotted names such as
+    <catalog>.<schema>.<table> are rejected.
+    Args:
+        name: Table name to validate.
+    Raises:
+        ValueError: If the name is empty or not a string, contains a dot,
+            does not start with a letter, or contains characters other than
+            letters, digits and underscores.
+    """
+    if not (name and isinstance(name, str)):
+        raise ValueError("Table name must be a non-empty string")
+    if '.' in name:
+        raise ValueError("Feature table name only supports single table name, cannot contain dots (e.g. <catalog>.<schema>.<table>)")
+    first_char = name[0]
+    if not first_char.isalpha():
+        raise ValueError("Table name must start with a letter")
+    for ch in name:
+        if ch != '_' and not ch.isalnum():
+            raise ValueError("Table name can only contain letters, numbers and underscores")
+def build_full_table_name(table_name: str, database_name: Optional[str] = None) -> str:
+    """
+    Build a qualified table name of the form `<database>.<table>`.
+    Args:
+        table_name: Table name; it is appended to the database name as-is.
+        database_name: Database to use. When empty, the value of the
+            WEDATA_DEFAULT_FEATURE_STORE_DATABASE environment variable is used.
+    Returns:
+        The qualified table name (`<database>.<table>`).
+    Raises:
+        RuntimeError: If neither database_name nor the environment variable
+            provides a database name.
+    """
+    # An explicit argument takes precedence over the environment default.
+    resolved_database = database_name if database_name else os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
+    if not resolved_database:
+        logging.error("The current user has not configured a default feature database. Please contact the manager account to configure it.")
+        raise RuntimeError("Feature store is not configured! Please contact the main account to configure it.")
+    logging.info("feature database:{}".format(resolved_database))
+    return f"{resolved_database}.{table_name}"
+def enable_if(condition):
+    """
+    Build a decorator that lets a function run only while `condition()` is truthy.
+    The condition is evaluated on every call of the decorated function; when it
+    is falsy the call raises NotImplementedError instead of running the function.
+    :param condition: A zero-argument callable returning a truthy or falsy value.
+    """
+    def decorator(func):
+        @wraps(func)
+        def guarded(*args, **kwargs):
+            if condition():
+                return func(*args, **kwargs)
+            raise NotImplementedError
+        return guarded
+    return decorator
+def is_empty(target: str):
+    """
+    Return True when `target` is None or consists only of whitespace.
+    """
+    if target is None:
+        return True
+    return not target.strip()
+class _NoDbutilsError(Exception):
+    """Raised by _get_dbutils when no dbutils object can be obtained."""
+def _get_dbutils():
+    """
+    Return the `dbutils` object from the active IPython shell's user namespace.
+    Raises:
+        _NoDbutilsError: If IPython cannot be imported, no IPython shell is
+            active, or the namespace lookup fails with KeyError.
+    """
+    try:
+        import IPython
+        shell = IPython.get_ipython()
+        if shell is None:
+            raise _NoDbutilsError
+        user_namespace = shell.ns_table["user_global"]
+        return user_namespace["dbutils"]
+    except (ImportError, KeyError):
+        raise _NoDbutilsError
+def utc_timestamp_ms_from_iso_datetime_string(date_string: str) -> int:
+    """
+    Convert an ISO 8601 datetime string to whole milliseconds since the Unix epoch.
+    Args:
+        date_string: A string accepted by datetime.fromisoformat. A value
+            without a UTC offset is interpreted as UTC; a value with an offset
+            is converted to UTC.
+    Returns:
+        Milliseconds since 1970-01-01T00:00:00Z as an int.
+    Raises:
+        ValueError: If date_string is not a valid ISO format string.
+    """
+    from datetime import timedelta
+    dt = datetime.fromisoformat(date_string)
+    if dt.utcoffset() is None:
+        # Naive input carries no offset; treat the wall-clock value as UTC.
+        utc_dt = dt.replace(tzinfo=timezone.utc)
+    else:
+        # Aware input: convert instead of overwriting tzinfo, otherwise the
+        # offset is silently discarded and the instant shifts.
+        utc_dt = dt.astimezone(timezone.utc)
+    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
+    # Integer timedelta division avoids float rounding from timestamp() * 1000.
+    return (utc_dt - epoch) // timedelta(milliseconds=1)
+def pip_depependency_pinned_major_version(pip_package_name, major_version):
+    """
+    Build a pip requirement string pinned to a major version.
+    Example: ("databricks-feature-lookup", 0) gives "databricks-feature-lookup==0.*".
+    """
+    return "{}=={}.*".format(pip_package_name, major_version)
+def pip_depependency_pinned_version(pip_package_name, version):
+    """
+    Build a pip requirement string pinned to one exact version.
+    Example: ("databricks-feature-lookup", "0.1.2") gives "databricks-feature-lookup==0.1.2".
+    """
+    return "{}=={}".format(pip_package_name, version)
+def add_mlflow_pip_depependency(conda_env, pip_package_name):
+    """
+    Add a pip requirement, in place, to the conda environment taken from the raw MLflow model.
+    Args:
+        conda_env: Conda environment dict, or None. Nothing is done when it
+            is None or has no "dependencies" entry.
+        pip_package_name: Requirement string to append to every pip section
+            that does not already contain it.
+    Raises:
+        ValueError: If pip_package_name is missing or empty, or if conda_env
+            has a "dependencies" entry without any pip section.
+    """
+    if not pip_package_name:
+        raise ValueError(
+            "Unexpected input: missing or empty pip_package_name parameter"
+        )
+    # Check for the key before iterating: indexing a missing "dependencies"
+    # key would raise KeyError instead of being tolerated.
+    if conda_env is None or "dependencies" not in conda_env:
+        return
+    found_pip_dependency = False
+    for dep in conda_env["dependencies"]:
+        # The pip section is the dict entry of the form {"pip": [...]}.
+        if isinstance(dep, dict) and "pip" in dep:
+            found_pip_dependency = True
+            pip_deps = dep["pip"]
+            if pip_package_name not in pip_deps:
+                pip_deps.append(pip_package_name)
+    if not found_pip_dependency:
+        raise ValueError(
+            "Unexpected input: mlflow conda_env did not contain pip as a dependency"
+        )
+def download_model_artifacts(model_uri, dir):
+    """
+    Download the artifacts of the model at model_uri into dir.
+    Args:
+        model_uri: A ``models:/`` or ``runs:/`` URI, as accepted by is_artifact_uri.
+        dir: Destination directory handed to the artifact repository.
+    Returns:
+        The value returned by the repository's download_artifacts call
+        (presumably the local download path; confirm against MLflow).
+    Raises:
+        ValueError: If model_uri is not a models/runs URI, if resolving the
+            artifact repository raises RestException, or if nothing is listed
+            under the Feature Store metadata path of the model.
+    """
+    if not is_artifact_uri(model_uri):
+        raise ValueError(
+            f"Invalid model URI '{model_uri}'. "
+            f"Use ``models:/<model_name>/<version_number>`` or "
+            f"``runs:/<mlflow_run_id>/run-relative/path/to/model``."
+        )
+    try:
+        repo = get_artifact_repository(model_uri)
+    except RestException as e:
+        # Chain the cause instead of passing it as a second exception argument,
+        # so str(error) stays a plain message and the traceback keeps the origin.
+        raise ValueError(f"The model at '{model_uri}' does not exist.") from e
+    # Models packaged with Feature Store metadata keep it under this sub-path.
+    artifact_path = os.path.join("artifacts", MODEL_DATA_PATH_ROOT)
+    if len(repo.list_artifacts(artifact_path)) == 0:
+        raise ValueError(
+            f"No suitable model found at '{model_uri}'. Either no model exists in this "
+            f"artifact location or an existing model was not packaged with Feature Store metadata. "
+            f"Only models logged by FeatureStoreClient.log_model can be used in inference."
+        )
+    return repo.download_artifacts(artifact_path="", dst_path=dir)
+def validate_params_non_empty(params: Dict[str, Any], expected_params: List[str]):
+    """
+    Check that every expected parameter is present in `params` with a truthy value.
+    Raises:
+        ValueError: If an expected parameter is absent from `params`, or its
+            value is falsy.
+    """
+    for param_name in expected_params:
+        if param_name in params:
+            if params[param_name]:
+                continue
+            raise ValueError(f'Parameter "{param_name}" cannot be empty')
+        raise ValueError(
+            f'Internal error: expected parameter "{param_name}" not found in params dictionary'
+        )
+def get_workspace_url() -> Optional[str]:
+    """
+    Return the workspace URL reported by mlflow.utils.databricks_utils.get_workspace_url(),
+    prefixed with "https://" when the reported value has no URL scheme.
+    A falsy value from MLflow is returned unchanged.
+    """
+    url = databricks_utils.get_workspace_url()
+    if not url:
+        return url
+    if urlparse(url).scheme:
+        return url
+    return "https://" + url
+def is_artifact_uri(uri):
+    """
+    Tell whether `uri` is an MLflow models URI or runs URI.
+    The URI may be a model URI, a model URI plus subdirectory, or a model URI
+    plus the path to an artifact file.
+    """
+    is_models = ModelsArtifactRepository.is_models_uri(uri)
+    # The runs check is evaluated only when the models check is falsy.
+    return is_models or RunsArtifactRepository.is_runs_uri(uri)
+def as_list(obj, default=None):
+    """
+    Normalize `obj` to a list.
+    Returns `default` when `obj` is falsy, `obj` itself when it already is a
+    list, and a one-element list wrapping `obj` otherwise.
+    """
+    if not obj:
+        return default
+    return obj if isinstance(obj, list) else [obj]
+def get_duplicates(elements: List[Any]) -> List[Any]:
+    """
+    Return the elements that occur more than once, ordered by first appearance.
+    """
+    counts = Counter(elements)
+    return [element for element, count in counts.items() if count > 1]
+def validate_strings_unique(strings: List[str], error_template: str):
+    """
+    Raise ValueError when `strings` contains duplicates.
+    The duplicates are single-quoted, joined with ", " and passed as the one
+    positional argument to `error_template.format`.
+    """
+    repeated = get_duplicates(strings)
+    if not repeated:
+        return
+    quoted = ", ".join("'{}'".format(s) for s in repeated)
+    raise ValueError(error_template.format(quoted))
+def sanitize_identifier(identifier: str):
+    """
+    Wrap an identifier in backquotes, doubling any backquote it contains.
+    For example, "a`b" becomes "`a``b`". Intended for identifiers such as
+    column names used in SQL and PySpark.
+    """
+    escaped = identifier.replace('`', '``')
+    return "`" + escaped + "`"
+def sanitize_identifiers(identifiers: List[str]):
+    """
+    Apply sanitize_identifier to each identifier and return the results as a list.
+    """
+    return list(map(sanitize_identifier, identifiers))
+def sanitize_multi_level_name(multi_level_name: str):
+    """
+    Sanitize a dot-separated name (such as a Unity Catalog table name) segment by segment.
+    Each segment is sanitized and the results are joined with dots again.
+    For example, "ca+t.fo`o.ba$r" becomes "`ca+t`.`fo``o`.`ba$r`".
+    """
+    quoted_segments = sanitize_identifiers(multi_level_name.split("."))
+    return ".".join(quoted_segments)
+def unsanitize_identifier(identifier: str):
+    """
+    Strip the surrounding backquotes from an identifier and collapse doubled backquotes.
+    An identifier that is not wrapped in backquotes is returned unchanged.
+    Note: the input is not validated. For an invalid sanitized identifier such
+    as `foo`` the output is invalid as well.
+    """
+    is_wrapped = len(identifier) >= 2 and identifier.startswith("`") and identifier.endswith("`")
+    if not is_wrapped:
+        return identifier
+    return identifier[1:-1].replace("``", "`")
+# Strings containing \ or ' can break SQL statements, so both are backslash-escaped.
+def escape_sql_string(input_str: str) -> str:
+    """
+    Return `input_str` with every backslash doubled and every single quote
+    preceded by a backslash.
+    """
+    # Backslashes first, so the ones added for quotes are not doubled again.
+    backslash_escaped = input_str.replace("\\", "\\\\")
+    return backslash_escaped.replace("'", "\\'")
+def get_unique_list_order(elements: List[Any]) -> List[Any]:
+    """
+    Return the distinct elements, ordered by first appearance.
+    """
+    first_seen = {}
+    for element in elements:
+        # Dict keys keep insertion order, so later repeats do not move an entry.
+        first_seen.setdefault(element, None)
+    return list(first_seen)
+def validate_database(database_name):
+    """
+    Check that a database name is available.
+    When `database_name` is None, the environment variable named by
+    constants.WEDATA_DEFAULT_FEATURE_STORE_DATABASE is consulted instead.
+    Returns:
+        True when a name is available from either source.
+    Raises:
+        ValueError: If `database_name` is None and the environment variable is unset.
+    """
+    resolved = database_name
+    if resolved is None:
+        resolved = os.environ.get(constants.WEDATA_DEFAULT_FEATURE_STORE_DATABASE)
+        if resolved is None:
+            raise ValueError("Database_name variable or default database is not set.")
+    return True
+def check_package_version(package_name, expected_version, op="=="):
+    """
+    Check whether the installed version of a package satisfies a comparison
+    against an expected version.
+    Args:
+        package_name: Name of the installed distribution to look up.
+        expected_version: Version to compare against, e.g. "3.5.5".
+        op: Comparison operator, one of '==', '>', '<', '>=', '<=', '!=', '~='.
+            Defaults to "==". Surrounding whitespace is ignored.
+    Returns:
+        A tuple (found, matched, installed_version):
+        (True, True, installed_version) when the comparison holds,
+        (True, False, installed_version) when it does not, and
+        (False, False, None) when the package is not installed.
+    Raises:
+        ValueError: If the package is installed and `op` is not one of the
+            supported operators.
+    """
+    import importlib.metadata
+    import operator
+    try:
+        installed_version = importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return False, False, None
+    # Dispatch table instead of eval(): eval executed whatever text was passed
+    # as `op` and could not evaluate the advertised '~=' operator at all.
+    comparators = {
+        "==": operator.eq,
+        "!=": operator.ne,
+        ">": operator.gt,
+        ">=": operator.ge,
+        "<": operator.lt,
+        "<=": operator.le,
+    }
+    normalized_op = op.strip() if isinstance(op, str) else ""
+    if normalized_op != "~=" and normalized_op not in comparators:
+        raise ValueError(f"Invalid op: {op}. need be in ['==', '>', '<', '>=', '<=', '!=', '~=']")
+    # packaging is imported only once a comparison is actually needed.
+    from packaging import version
+    installed = version.parse(installed_version)
+    if normalized_op == "~=":
+        # Compatible-release matching is defined on specifiers, not on Version
+        # objects; prereleases=True keeps pre-release installs comparable.
+        from packaging.specifiers import SpecifierSet
+        matched = SpecifierSet(f"~={expected_version}").contains(installed, prereleases=True)
+    else:
+        matched = comparators[normalized_op](installed, version.parse(expected_version))
+    return True, matched, installed_version
+def check_spark_table_exists(spark_client: SparkSession, full_table_name: str) -> bool:
+    """
+    Check whether a table exists.
+    Uses spark_client.catalog.tableExists when available and falls back to a
+    SHOW TABLES query when that access raises AttributeError.
+    Args:
+        spark_client: Spark session used for the catalog call or the SQL fallback.
+        full_table_name: Table name, optionally qualified with dots.
+    Returns:
+        True when the table is reported to exist, False otherwise.
+    """
+    try:
+        return spark_client.catalog.tableExists(full_table_name)
+    except AttributeError:
+        parts = full_table_name.split(".")
+        if len(parts) == 2:
+            query = f"SHOW TABLES IN {parts[0]} LIKE '{parts[1]}'"
+        elif len(parts) == 3:
+            # NOTE(review): the first segment is ignored here; confirm that
+            # dropping it is intended for three-part names.
+            query = f"SHOW TABLES IN {parts[1]} LIKE '{parts[2]}'"
+        else:
+            query = f"SHOW TABLES LIKE '{full_table_name}'"
+        # Pass the query through a %s placeholder; an extra argument without a
+        # placeholder makes the logging call fail to format the record.
+        logging.debug("check table sql: ======= %s", query)
+        return spark_client.sql(query).count() > 0

wedata/feature_store/utils/env_utils.py ADDED Viewed

@@ -0,0 +1,86 @@
+import os
+class EnvironmentError(Exception):
+    """
+    Raised by the helpers in this module when a required environment variable is not set.
+    NOTE: inside this module the name shadows the builtin EnvironmentError
+    (an alias of OSError); this class derives from Exception only.
+    """
+def get_project_id() -> str:
+    """
+    Return the current project ID from the WEDATA_PROJECT_ID environment variable.
+    Returns:
+        str: The project ID.
+    Raises:
+        EnvironmentError: If WEDATA_PROJECT_ID is unset or empty.
+    """
+    project_id = os.environ.get("WEDATA_PROJECT_ID")
+    if not project_id:
+        raise EnvironmentError("environment variable WEDATA_PROJECT_ID is not set, please check environment configuration")
+    return project_id
+def get_cloud_secret() -> (str, str):
+    """
+    Read the temporary cloud credentials from the environment.
+    Returns:
+        tuple: (secret_id, secret_key) taken from WEDATA_CLOUD_TEMP_SECRET_ID
+        and WEDATA_CLOUD_TEMP_SECRET_KEY; an entry is None when its variable
+        is not set.
+    """
+    return (
+        os.environ.get("WEDATA_CLOUD_TEMP_SECRET_ID"),
+        os.environ.get("WEDATA_CLOUD_TEMP_SECRET_KEY"),
+    )
+def get_region() -> str:
+    """
+    Return the current region.
+    DLC_REGION is preferred; EMR_REGION is used when DLC_REGION is unset or empty.
+    Raises:
+        EnvironmentError: If neither variable holds a non-empty value.
+    """
+    region = os.environ.get("DLC_REGION") or os.environ.get("EMR_REGION")
+    if region:
+        return region
+    raise EnvironmentError("environment variable DLC_REGION or EMR_REGION is not set, "
+                           "please check environment configuration")
+def get_database_name(database_name: str) -> str:
+    """
+    Resolve the database name to use.
+    Args:
+        database_name: Explicit database name; returned as-is when non-empty.
+    Returns:
+        str: `database_name` when given, otherwise the value of the
+        WEDATA_DEFAULT_FEATURE_STORE_DATABASE environment variable.
+    Raises:
+        EnvironmentError: If `database_name` is empty and
+            WEDATA_DEFAULT_FEATURE_STORE_DATABASE is unset or empty.
+    """
+    if database_name:
+        return database_name
+    default_database = os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
+    if default_database:
+        return default_database
+    raise EnvironmentError("environment variable WEDATA_DEFAULT_FEATURE_STORE_DATABASE is not set, "
+                           "please check environment configuration")
+def get_engine_name() -> str:
+    """
+    Return the engine name from the KERNEL_ENGINE environment variable.
+    Raises:
+        EnvironmentError: If KERNEL_ENGINE is unset or empty.
+    """
+    engine_name = os.environ.get("KERNEL_ENGINE")
+    if not engine_name:
+        raise EnvironmentError("environment variable KERNEL_ENGINE is not set, please check environment configuration")
+    return engine_name
+def get_engine_type() -> str:
+    """
+    Determine the engine type: "DLC" when the DLC_REGION environment variable
+    holds a non-empty value, otherwise "EMR".
+    """
+    if os.environ.get("DLC_REGION"):
+        return "DLC"
+    return "EMR"