tencent-wedata-feature-engineering-dev 0.1.42__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tencent_wedata_feature_engineering_dev-0.1.42.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/METADATA +14 -3
- tencent_wedata_feature_engineering_dev-0.2.5.dist-info/RECORD +78 -0
- {tencent_wedata_feature_engineering_dev-0.1.42.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/WHEEL +1 -1
- wedata/__init__.py +1 -1
- wedata/common/base_table_client/__init__.py +1 -0
- wedata/common/base_table_client/base.py +58 -0
- wedata/common/cloud_sdk_client/__init__.py +2 -0
- wedata/{feature_store → common}/cloud_sdk_client/client.py +56 -12
- wedata/{feature_store → common}/cloud_sdk_client/models.py +212 -37
- wedata/{feature_store → common}/cloud_sdk_client/utils.py +14 -0
- wedata/{feature_store → common}/constants/constants.py +3 -2
- wedata/common/constants/engine_types.py +34 -0
- wedata/{feature_store → common}/entities/column_info.py +6 -5
- wedata/{feature_store → common}/entities/feature_column_info.py +2 -1
- wedata/{feature_store → common}/entities/feature_lookup.py +1 -1
- wedata/{feature_store → common}/entities/feature_spec.py +9 -9
- wedata/{feature_store → common}/entities/feature_table_info.py +1 -1
- wedata/{feature_store → common}/entities/function_info.py +2 -1
- wedata/{feature_store → common}/entities/on_demand_column_info.py +2 -1
- wedata/{feature_store → common}/entities/source_data_column_info.py +3 -1
- wedata/{feature_store → common}/entities/training_set.py +6 -6
- wedata/common/feast_client/__init__.py +1 -0
- wedata/{feature_store → common}/feast_client/feast_client.py +3 -4
- wedata/common/log/__init__.py +1 -0
- wedata/common/log/logger.py +44 -0
- wedata/common/spark_client/__init__.py +1 -0
- wedata/{feature_store → common}/spark_client/spark_client.py +6 -9
- wedata/{feature_store → common}/utils/common_utils.py +7 -9
- wedata/{feature_store → common}/utils/env_utils.py +31 -10
- wedata/{feature_store → common}/utils/feature_lookup_utils.py +6 -6
- wedata/{feature_store → common}/utils/feature_spec_utils.py +13 -8
- wedata/{feature_store → common}/utils/feature_utils.py +5 -5
- wedata/{feature_store → common}/utils/on_demand_utils.py +5 -4
- wedata/{feature_store → common}/utils/schema_utils.py +1 -1
- wedata/{feature_store → common}/utils/signature_utils.py +4 -4
- wedata/{feature_store → common}/utils/training_set_utils.py +13 -13
- wedata/{feature_store → common}/utils/uc_utils.py +1 -1
- wedata/feature_engineering/__init__.py +1 -0
- wedata/feature_engineering/client.py +417 -0
- wedata/feature_engineering/ml_training_client/ml_training_client.py +569 -0
- wedata/feature_engineering/mlflow_model.py +9 -0
- wedata/feature_engineering/table_client/__init__.py +0 -0
- wedata/feature_engineering/table_client/table_client.py +548 -0
- wedata/feature_store/client.py +13 -16
- wedata/feature_store/constants/engine_types.py +8 -30
- wedata/feature_store/feature_table_client/feature_table_client.py +98 -108
- wedata/feature_store/training_set_client/training_set_client.py +14 -17
- wedata/tempo/interpol.py +2 -2
- tencent_wedata_feature_engineering_dev-0.1.42.dist-info/RECORD +0 -64
- {tencent_wedata_feature_engineering_dev-0.1.42.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/top_level.txt +0 -0
- /wedata/{feature_store/cloud_sdk_client → common}/__init__.py +0 -0
- /wedata/{feature_store/common/protos → common/constants}/__init__.py +0 -0
- /wedata/{feature_store → common}/entities/__init__.py +0 -0
- /wedata/{feature_store → common}/entities/environment_variables.py +0 -0
- /wedata/{feature_store → common}/entities/feature.py +0 -0
- /wedata/{feature_store → common}/entities/feature_function.py +0 -0
- /wedata/{feature_store → common}/entities/feature_spec_constants.py +0 -0
- /wedata/{feature_store → common}/entities/feature_table.py +0 -0
- /wedata/{feature_store/feast_client → common/protos}/__init__.py +0 -0
- /wedata/{feature_store/common → common}/protos/feature_store_pb2.py +0 -0
- /wedata/{feature_store/spark_client → common/utils}/__init__.py +0 -0
- /wedata/{feature_store → common}/utils/topological_sort.py +0 -0
- /wedata/{feature_store → common}/utils/validation_utils.py +0 -0
- /wedata/{feature_store/utils → feature_engineering/ml_training_client}/__init__.py +0 -0
|
@@ -40,10 +40,10 @@ RAW_MODEL_FOLDER = "raw_model"
|
|
|
40
40
|
ML_MODEL = "MLmodel"
|
|
41
41
|
|
|
42
42
|
# 特征查找客户端的PyPI包名
|
|
43
|
-
FEATURE_LOOKUP_CLIENT_PIP_PACKAGE = "tencent-wedata-feature-engineering
|
|
43
|
+
FEATURE_LOOKUP_CLIENT_PIP_PACKAGE = "tencent-wedata-feature-engineering"
|
|
44
44
|
|
|
45
45
|
# 特征查找版本号
|
|
46
|
-
FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = "0.
|
|
46
|
+
FEATURE_LOOKUP_CLIENT_MAJOR_VERSION = "0.2.5"
|
|
47
47
|
|
|
48
48
|
# 特征存储内部数据目录
|
|
49
49
|
FEATURE_STORE_INTERNAL_DATA_DIR = "_wedata_internal/"
|
|
@@ -56,4 +56,5 @@ FEATURE_TABLE_VALUE = "true"
|
|
|
56
56
|
FEATURE_TABLE_PROJECT = "wedata.feature_project_id"
|
|
57
57
|
FEATURE_TABLE_TIMESTAMP = "timestampKeys"
|
|
58
58
|
FEATURE_TABLE_BACKUP_PRIMARY_KEY = "primaryKeys" # 备用标识,主键
|
|
59
|
+
FEATURE_ENGINEERING_TABLE_PRIMARY_KEY_WEDATA = "primary-key" # 用于Wedata3
|
|
59
60
|
FEATURE_DLC_TABLE_PRIMARY_KEY = "dlc.ao.data.govern.sorted.keys"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EngineTypes(Enum):
    """Table engine identifiers; each member's value is the lowercase engine name."""

    HIVE_ENGINE = "hive"
    ICEBERG_ENGINE = "iceberg"

    @classmethod
    def get_engine(cls, engine_name: str) -> 'EngineTypes':
        """Resolve an engine name to the matching ``EngineTypes`` member.

        The lookup is case-insensitive. An existing ``EngineTypes`` member is
        returned unchanged.

        Args:
            engine_name: Engine name such as ``"hive"`` or ``"ICEBERG"``.

        Returns:
            The member whose value equals the lowercased name.

        Raises:
            ValueError: If ``engine_name`` is not a string (for example
                ``None``) or does not match any member.
        """
        if isinstance(engine_name, cls):
            return engine_name
        try:
            return cls(engine_name.lower())
        except (AttributeError, ValueError):
            # AttributeError covers None / non-string input, which has no
            # .lower(); report it the same way as an unknown name.
            raise ValueError(
                f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}"
            ) from None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CalculateEngineTypes(Enum):
    """Compute engine identifiers; each member's value is the lowercase engine name."""

    DLC = "dlc"
    EMR = "emr"

    @classmethod
    def get_calculate_engine(cls, engine_name: str) -> 'CalculateEngineTypes':
        """Resolve an engine name to the matching ``CalculateEngineTypes`` member.

        The lookup is case-insensitive. An existing ``CalculateEngineTypes``
        member is returned unchanged.

        Args:
            engine_name: Engine name such as ``"dlc"`` or ``"EMR"``.

        Returns:
            The member whose value equals the lowercased name.

        Raises:
            ValueError: If ``engine_name`` is not a string (for example
                ``None``) or does not match any member.
        """
        if isinstance(engine_name, cls):
            return engine_name
        try:
            return cls(engine_name.lower())
        except (AttributeError, ValueError):
            # AttributeError covers None / non-string input, which has no
            # .lower(); report it the same way as an unknown name.
            raise ValueError(
                f"Invalid engine type: {engine_name}. Supported engine types: {list(cls)}"
            ) from None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def judge_engine_type() -> 'CalculateEngineTypes':
    """Choose the compute engine from the process environment.

    Returns:
        ``CalculateEngineTypes.DLC`` when the ``DLC_REGION`` environment
        variable holds a non-empty value, otherwise
        ``CalculateEngineTypes.EMR``.
    """
    running_on_dlc = bool(os.environ.get("DLC_REGION", ""))
    return CalculateEngineTypes.DLC if running_on_dlc else CalculateEngineTypes.EMR
|
|
34
|
+
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
from typing import Optional, Union
|
|
3
3
|
|
|
4
|
-
from wedata.
|
|
5
|
-
from wedata.
|
|
4
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
5
|
+
from wedata.common.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
|
|
6
6
|
ON_DEMAND_COLUMN_INFO
|
|
7
|
-
from wedata.
|
|
8
|
-
from wedata.
|
|
7
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
8
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
9
|
+
|
|
10
|
+
from wedata.common.protos import feature_store_pb2
|
|
9
11
|
|
|
10
|
-
from wedata.feature_store.common.protos import feature_store_pb2
|
|
11
12
|
|
|
12
13
|
class ColumnInfo:
|
|
13
14
|
"""
|
|
@@ -3,7 +3,7 @@ import datetime
|
|
|
3
3
|
import logging
|
|
4
4
|
from typing import Dict, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from wedata.
|
|
6
|
+
from wedata.common.utils import common_utils
|
|
7
7
|
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
8
8
|
|
|
9
9
|
_logger = logging.getLogger(__name__)
|
|
@@ -7,11 +7,11 @@ import mlflow
|
|
|
7
7
|
from google.protobuf.json_format import MessageToDict, ParseDict
|
|
8
8
|
from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
|
|
9
9
|
|
|
10
|
-
from wedata.
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
13
|
-
from wedata.
|
|
14
|
-
from wedata.
|
|
10
|
+
from wedata.common.protos import feature_store_pb2
|
|
11
|
+
from wedata.common.entities.column_info import ColumnInfo
|
|
12
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
13
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
14
|
+
from wedata.common.entities.feature_spec_constants import (
|
|
15
15
|
BOUND_TO,
|
|
16
16
|
DATA_TYPE,
|
|
17
17
|
FEATURE_COLUMN_INFO,
|
|
@@ -34,10 +34,10 @@ from wedata.feature_store.entities.feature_spec_constants import (
|
|
|
34
34
|
TRAINING_DATA,
|
|
35
35
|
UDF_NAME,
|
|
36
36
|
)
|
|
37
|
-
from wedata.
|
|
38
|
-
from wedata.
|
|
39
|
-
from wedata.
|
|
40
|
-
from wedata.
|
|
37
|
+
from wedata.common.entities.feature_table_info import FeatureTableInfo
|
|
38
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
39
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
40
|
+
from wedata.common.utils import common_utils
|
|
41
41
|
|
|
42
42
|
# Change log for serialization version. Please update for each serialization version.
|
|
43
43
|
# 1. Initial.
|
|
@@ -2,20 +2,20 @@ from typing import Dict, List, Optional
|
|
|
2
2
|
|
|
3
3
|
from pyspark.sql import DataFrame
|
|
4
4
|
|
|
5
|
-
from wedata.
|
|
6
|
-
from wedata.
|
|
7
|
-
from wedata.
|
|
5
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
6
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
7
|
+
from wedata.common.utils.feature_lookup_utils import (
|
|
8
8
|
join_feature_data_if_not_overridden,
|
|
9
9
|
)
|
|
10
10
|
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
11
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
12
|
+
from wedata.common.utils.feature_spec_utils import (
|
|
13
13
|
COLUMN_INFO_TYPE_FEATURE,
|
|
14
14
|
COLUMN_INFO_TYPE_ON_DEMAND,
|
|
15
15
|
COLUMN_INFO_TYPE_SOURCE,
|
|
16
16
|
get_feature_execution_groups,
|
|
17
17
|
)
|
|
18
|
-
from wedata.
|
|
18
|
+
from wedata.common.utils.on_demand_utils import apply_functions_if_not_overridden
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class TrainingSet:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .feast_client import FeastClient
|
|
@@ -20,6 +20,7 @@ import pytz
|
|
|
20
20
|
from feast import FeatureStore, RepoConfig, FeatureView
|
|
21
21
|
from pyspark.sql import DataFrame, SparkSession
|
|
22
22
|
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
23
|
+
from wedata.common.utils import env_utils
|
|
23
24
|
from feast import Entity, FeatureService
|
|
24
25
|
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import SparkSource
|
|
25
26
|
from feast.infra.online_stores.redis import RedisOnlineStore
|
|
@@ -37,8 +38,8 @@ TEMP_FILE_PATH = "/tmp/feast_data/"
|
|
|
37
38
|
class FeastClient:
|
|
38
39
|
|
|
39
40
|
def __init__(self, offline_store: SparkSession, online_store_config: RedisStoreConfig = None):
|
|
40
|
-
project_id =
|
|
41
|
-
remote_path =
|
|
41
|
+
project_id = env_utils.get_project_id()
|
|
42
|
+
remote_path = env_utils.get_feast_remote_url()
|
|
42
43
|
if offline_store is None or not isinstance(offline_store, SparkSession):
|
|
43
44
|
raise ValueError("offline_store must be provided SparkSession instance")
|
|
44
45
|
|
|
@@ -146,8 +147,6 @@ class FeastClient:
|
|
|
146
147
|
self._client.apply(feature_service)
|
|
147
148
|
self._client.registry.delete_feature_view(feast_table_name, self._client.project)
|
|
148
149
|
|
|
149
|
-
|
|
150
|
-
|
|
151
150
|
def get_feature_view(self, table_name: str):
|
|
152
151
|
feast_table_name = translate_table_name_to_feast(table_name)
|
|
153
152
|
return self._client.get_feature_view(feast_table_name)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .logger import get_logger
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class LoggerSingleton:
    """Holder for the shared package logger.

    Every instantiation returns the same object; the wrapped
    ``logging.Logger`` is configured once, on the first instantiation.
    """

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            # Publish the instance only after it is fully configured, so a
            # failure during setup does not leave a half-built singleton.
            instance = super().__new__(cls)
            instance._initialize_logger()
            cls._instance = instance
        return cls._instance

    def _initialize_logger(self):
        """Configure the "wedata-feature-engineering" logger with one stdout handler."""
        self.logger = logging.getLogger("wedata-feature-engineering")
        self.logger.setLevel(logging.INFO)

        # Drop handlers that are already attached so records are not duplicated.
        if self.logger.handlers:
            self.logger.handlers.clear()

        # Formatter fields: timestamp, logger name, level and message.
        formatter = logging.Formatter(
            fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

        # Emit records on stdout.
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(formatter)

        self.logger.addHandler(handler)

        # Keep records from propagating to parent loggers.
        self.logger.propagate = False

    def get_logger(self, level=None):
        """Return the shared logger, optionally changing its level.

        Args:
            level: Logging level to apply. ``None`` (the default) leaves the
                current level untouched, so a plain lookup does not undo a
                level chosen by an earlier caller. The logger starts at
                ``logging.INFO``.

        Returns:
            The shared ``logging.Logger``.
        """
        if level is not None:
            self.logger.setLevel(level)
        return self.logger


def get_logger(level=None):
    """Return the singleton package logger.

    Args:
        level: Logging level to apply; ``None`` (the default) keeps the
            logger's current level.

    Returns:
        The shared ``logging.Logger`` held by ``LoggerSingleton``.
    """
    return LoggerSingleton().get_logger(level)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .spark_client import SparkClient
|
|
@@ -1,21 +1,20 @@
|
|
|
1
1
|
from collections import defaultdict
|
|
2
|
-
from typing import Optional, Any,
|
|
2
|
+
from typing import Optional, Any, List
|
|
3
3
|
|
|
4
|
-
import logging
|
|
5
4
|
from pyspark.sql import SparkSession, DataFrame
|
|
6
5
|
from pyspark.sql.catalog import Column
|
|
7
6
|
from pyspark.sql.functions import when, isnull
|
|
8
7
|
from pyspark.sql.types import StructType, StringType, StructField
|
|
9
8
|
from mlflow.pyfunc import spark_udf
|
|
10
9
|
|
|
11
|
-
from wedata.
|
|
10
|
+
from wedata.common.constants.constants import (
|
|
12
11
|
_PREBUILT_ENV_URI
|
|
13
12
|
)
|
|
14
13
|
|
|
15
|
-
from wedata.
|
|
16
|
-
from wedata.
|
|
17
|
-
from wedata.
|
|
18
|
-
from wedata.
|
|
14
|
+
from wedata.common.entities.feature import Feature
|
|
15
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
16
|
+
from wedata.common.entities.function_info import FunctionParameterInfo, FunctionInfo
|
|
17
|
+
from wedata.common.utils.common_utils import unsanitize_identifier, check_spark_table_exists, check_package_version
|
|
19
18
|
|
|
20
19
|
|
|
21
20
|
class SparkClient:
|
|
@@ -36,7 +35,6 @@ class SparkClient:
|
|
|
36
35
|
catalog = self._spark.sql("SELECT current_catalog()").first()[0]
|
|
37
36
|
return unsanitize_identifier(catalog)
|
|
38
37
|
|
|
39
|
-
|
|
40
38
|
def get_current_database(self):
|
|
41
39
|
"""
|
|
42
40
|
获取Spark上下文中当前设置的数据库名称
|
|
@@ -56,7 +54,6 @@ class SparkClient:
|
|
|
56
54
|
def createDataFrame(self, data, schema) -> DataFrame:
|
|
57
55
|
return self._spark.createDataFrame(data, schema)
|
|
58
56
|
|
|
59
|
-
|
|
60
57
|
def read_table(self, table_name):
|
|
61
58
|
"""读取Spark表数据
|
|
62
59
|
|
|
@@ -14,15 +14,11 @@ from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
|
|
|
14
14
|
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
|
|
15
15
|
from mlflow.utils import databricks_utils
|
|
16
16
|
|
|
17
|
-
from wedata.
|
|
18
|
-
from wedata.
|
|
17
|
+
from wedata.common.constants import constants
|
|
18
|
+
from wedata.common.constants.constants import MODEL_DATA_PATH_ROOT
|
|
19
|
+
from wedata.common.log import get_logger
|
|
19
20
|
from pyspark.sql import SparkSession
|
|
20
21
|
|
|
21
|
-
import logging
|
|
22
|
-
|
|
23
|
-
# 配置日志(可选,根据实际情况配置)
|
|
24
|
-
logging.basicConfig(level=logging.ERROR)
|
|
25
|
-
|
|
26
22
|
|
|
27
23
|
def validate_table_name(name: str):
|
|
28
24
|
"""
|
|
@@ -57,14 +53,16 @@ def build_full_table_name(table_name: str, database_name: Optional[str] = None)
|
|
|
57
53
|
"""
|
|
58
54
|
|
|
59
55
|
feature_store_database_name = os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
|
|
56
|
+
logger = get_logger()
|
|
60
57
|
if database_name:
|
|
61
58
|
feature_store_database_name = database_name
|
|
62
59
|
|
|
63
60
|
if not feature_store_database_name:
|
|
64
|
-
|
|
61
|
+
logger.error("The current user has not configured a default feature database. "
|
|
62
|
+
"Please contact the manager account to configure it.")
|
|
65
63
|
raise RuntimeError("Feature store is not configured! Please contact the main account to configure it.")
|
|
66
64
|
|
|
67
|
-
|
|
65
|
+
logger.debug("feature database:{}".format(feature_store_database_name))
|
|
68
66
|
|
|
69
67
|
feature_store_database = f"{feature_store_database_name}.{table_name}"
|
|
70
68
|
|
|
@@ -38,10 +38,10 @@ def get_region() -> str:
|
|
|
38
38
|
获取当前地域
|
|
39
39
|
"""
|
|
40
40
|
region_dlc = os.environ.get("DLC_REGION")
|
|
41
|
-
region_emr = os.environ.get("
|
|
41
|
+
region_emr = os.environ.get("KERNEL_REGION")
|
|
42
42
|
region = region_dlc if region_dlc else region_emr
|
|
43
43
|
if not region:
|
|
44
|
-
raise EnvironmentError("environment variable DLC_REGION or
|
|
44
|
+
raise EnvironmentError("environment variable DLC_REGION or KERNEL_REGION is not set, "
|
|
45
45
|
"please check environment configuration")
|
|
46
46
|
return region
|
|
47
47
|
|
|
@@ -68,24 +68,30 @@ def get_database_name(database_name: str) -> str:
|
|
|
68
68
|
"please check environment configuration")
|
|
69
69
|
|
|
70
70
|
|
|
71
|
+
def set_default_database(database_name: str):
    """Set the default database name for the current process.

    The name is stored in the ``WEDATA_DEFAULT_FEATURE_STORE_DATABASE``
    environment variable.

    Args:
        database_name: Non-empty database name.

    Raises:
        ValueError: If ``database_name`` is not a string, or is empty or
            whitespace-only. A blank value is rejected because readers that
            test the variable for truthiness cannot tell it from "unset".
    """
    if not isinstance(database_name, str):
        raise ValueError("database_name must be a string")
    if not database_name.strip():
        raise ValueError("database_name must not be empty")
    os.environ["WEDATA_DEFAULT_FEATURE_STORE_DATABASE"] = database_name
|
|
78
|
+
|
|
79
|
+
|
|
71
80
|
def get_engine_name() -> str:
|
|
72
81
|
"""
|
|
73
82
|
获取引擎名称
|
|
74
83
|
"""
|
|
75
|
-
|
|
76
|
-
if
|
|
77
|
-
return
|
|
78
|
-
|
|
84
|
+
# 因为DLC有特殊,所以先判断DLC,如果没有再判断EMR
|
|
85
|
+
if get_engine_type() == "DLC":
|
|
86
|
+
return _get_variable("KERNEL_ENGINE")
|
|
87
|
+
return _get_variable("KERNEL_ENGINE_NAME")
|
|
79
88
|
|
|
80
89
|
|
|
81
90
|
def get_engine_id() -> str:
|
|
82
91
|
"""
|
|
83
92
|
获取引擎ID
|
|
84
93
|
"""
|
|
85
|
-
|
|
86
|
-
if engine_id:
|
|
87
|
-
return engine_id
|
|
88
|
-
raise EnvironmentError("environment variable KERNEL_ENGINE is not set, please check environment configuration")
|
|
94
|
+
return _get_variable("KERNEL_ENGINE")
|
|
89
95
|
|
|
90
96
|
|
|
91
97
|
def get_engine_type() -> str:
|
|
@@ -94,3 +100,18 @@ def get_engine_type() -> str:
|
|
|
94
100
|
"""
|
|
95
101
|
return "DLC" if os.environ.get("DLC_REGION") else "EMR"
|
|
96
102
|
|
|
103
|
+
|
|
104
|
+
def get_feast_remote_url() -> str:
    """Return the Feast remote URL.

    The value is read from the ``KERNEL_FEAST_REMOTE_ADDRESS`` environment
    variable through ``_get_variable`` with its default arguments.
    """
    remote_url = _get_variable("KERNEL_FEAST_REMOTE_ADDRESS")
    return remote_url
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _get_variable(variable_key: str, is_raise: bool = True, default_value: str = None) -> str:
    """Read an environment variable, with an optional fallback.

    Args:
        variable_key: Name of the environment variable.
        is_raise: When true, raise if no non-empty value is available.
        default_value: Value used when the variable is unset or empty.

    Returns:
        The variable's value, or ``default_value`` when the variable is unset
        or empty. May be ``None`` or an empty string when ``is_raise`` is
        false and no value is available.

    Raises:
        EnvironmentError: If ``is_raise`` is true and neither the variable
            nor ``default_value`` supplies a non-empty value.
    """
    # Use `or` instead of get()'s default argument so that a variable that is
    # set but empty also falls back to default_value.
    val = os.environ.get(variable_key) or default_value
    if not val and is_raise:
        raise EnvironmentError(f"environment variable {variable_key} is not set, "
                               "please check environment configuration")
    return val
|
|
@@ -11,13 +11,13 @@ from pyspark.sql import functions as F
|
|
|
11
11
|
import pyspark.sql.functions as psf
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
from wedata.
|
|
15
|
-
from wedata.
|
|
16
|
-
from wedata.
|
|
17
|
-
from wedata.
|
|
18
|
-
from wedata.
|
|
14
|
+
from wedata.common.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
|
|
15
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
16
|
+
from wedata.common.entities.feature_lookup import FeatureLookup
|
|
17
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
18
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
19
19
|
|
|
20
|
-
from wedata.
|
|
20
|
+
from wedata.common.utils import uc_utils
|
|
21
21
|
|
|
22
22
|
_logger = logging.getLogger(__name__)
|
|
23
23
|
|
|
@@ -4,14 +4,19 @@ from functools import reduce
|
|
|
4
4
|
from typing import Dict, List, Tuple, Union
|
|
5
5
|
|
|
6
6
|
import yaml
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
from
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
from wedata.
|
|
7
|
+
|
|
8
|
+
# MLflow 3.x compatibility: YamlSafeDumper was removed, use yaml.SafeDumper directly
|
|
9
|
+
try:
|
|
10
|
+
from mlflow.utils.file_utils import YamlSafeDumper
|
|
11
|
+
except ImportError:
|
|
12
|
+
YamlSafeDumper = yaml.SafeDumper
|
|
13
|
+
|
|
14
|
+
from wedata.common.entities.column_info import ColumnInfo
|
|
15
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
16
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
17
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
18
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
19
|
+
from wedata.common.utils.topological_sort import topological_sort
|
|
15
20
|
|
|
16
21
|
DEFAULT_GRAPH_DEPTH_LIMIT = 5
|
|
17
22
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
from typing import List, Union
|
|
3
3
|
|
|
4
|
-
from wedata.
|
|
5
|
-
from wedata.
|
|
6
|
-
from wedata.
|
|
7
|
-
from wedata.
|
|
8
|
-
from wedata.
|
|
4
|
+
from wedata.common.entities.feature_function import FeatureFunction
|
|
5
|
+
from wedata.common.entities.feature_lookup import FeatureLookup
|
|
6
|
+
from wedata.common.spark_client import SparkClient
|
|
7
|
+
from wedata.common.utils import uc_utils
|
|
8
|
+
from wedata.common.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def format_feature_lookups_and_functions(
|
|
@@ -4,10 +4,11 @@ from typing import Dict, List
|
|
|
4
4
|
from pyspark.sql import DataFrame
|
|
5
5
|
from pyspark.sql.functions import expr
|
|
6
6
|
|
|
7
|
-
from wedata.
|
|
8
|
-
from wedata.
|
|
9
|
-
from wedata.
|
|
10
|
-
from wedata.
|
|
7
|
+
from wedata.common.entities.feature_function import FeatureFunction
|
|
8
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
9
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
10
|
+
from wedata.common.utils import common_utils
|
|
11
|
+
from wedata.common.utils import uc_utils
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def _udf_expr(udf_name: str, arguments: List[str]) -> expr:
|
|
@@ -7,10 +7,10 @@ from mlflow.types import ColSpec
|
|
|
7
7
|
from mlflow.types import DataType as MlflowDataType
|
|
8
8
|
from mlflow.types import ParamSchema, Schema
|
|
9
9
|
|
|
10
|
-
from wedata.
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
13
|
-
from wedata.
|
|
10
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
11
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
12
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
13
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
14
14
|
|
|
15
15
|
_logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
@@ -4,19 +4,19 @@ from typing import Dict, List, Optional, Set
|
|
|
4
4
|
|
|
5
5
|
from pyspark.sql import DataFrame
|
|
6
6
|
|
|
7
|
-
from wedata.
|
|
8
|
-
from wedata.
|
|
9
|
-
from wedata.
|
|
10
|
-
from wedata.
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
13
|
-
from wedata.
|
|
14
|
-
from wedata.
|
|
15
|
-
from wedata.
|
|
16
|
-
from wedata.
|
|
17
|
-
|
|
18
|
-
from wedata.
|
|
19
|
-
from wedata.
|
|
7
|
+
from wedata.common.entities.column_info import ColumnInfo
|
|
8
|
+
from wedata.common.entities.feature import Feature
|
|
9
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
10
|
+
from wedata.common.entities.feature_lookup import FeatureLookup
|
|
11
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
12
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
13
|
+
from wedata.common.entities.feature_table_info import FeatureTableInfo
|
|
14
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
15
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
16
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
17
|
+
|
|
18
|
+
from wedata.common.utils import common_utils, validation_utils
|
|
19
|
+
from wedata.common.utils.feature_spec_utils import assign_topological_ordering
|
|
20
20
|
|
|
21
21
|
_logger = logging.getLogger(__name__)
|
|
22
22
|
|
|
@@ -3,7 +3,7 @@ import re
|
|
|
3
3
|
from typing import Optional, Set, Any, List
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
|
|
6
|
-
from wedata.
|
|
6
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
7
7
|
|
|
8
8
|
SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
|
|
9
9
|
TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .client import FeatureEngineeringClient
|