tencent-wedata-feature-engineering-dev 0.1.48__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/METADATA +14 -3
- tencent_wedata_feature_engineering_dev-0.2.5.dist-info/RECORD +78 -0
- {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/WHEEL +1 -1
- wedata/__init__.py +1 -1
- wedata/common/base_table_client/__init__.py +1 -0
- wedata/common/base_table_client/base.py +58 -0
- wedata/common/cloud_sdk_client/__init__.py +2 -0
- wedata/{feature_store → common}/cloud_sdk_client/client.py +33 -3
- wedata/{feature_store → common}/cloud_sdk_client/models.py +212 -37
- wedata/{feature_store → common}/cloud_sdk_client/utils.py +7 -0
- wedata/{feature_store → common}/constants/constants.py +3 -2
- wedata/common/constants/engine_types.py +34 -0
- wedata/{feature_store → common}/entities/column_info.py +6 -5
- wedata/{feature_store → common}/entities/feature_column_info.py +2 -1
- wedata/{feature_store → common}/entities/feature_lookup.py +1 -1
- wedata/{feature_store → common}/entities/feature_spec.py +9 -9
- wedata/{feature_store → common}/entities/feature_table_info.py +1 -1
- wedata/{feature_store → common}/entities/function_info.py +2 -1
- wedata/{feature_store → common}/entities/on_demand_column_info.py +2 -1
- wedata/{feature_store → common}/entities/source_data_column_info.py +3 -1
- wedata/{feature_store → common}/entities/training_set.py +6 -6
- wedata/common/feast_client/__init__.py +1 -0
- wedata/{feature_store → common}/feast_client/feast_client.py +1 -1
- wedata/common/log/__init__.py +1 -0
- wedata/{feature_store/common → common}/log/logger.py +9 -5
- wedata/common/spark_client/__init__.py +1 -0
- wedata/{feature_store → common}/spark_client/spark_client.py +6 -7
- wedata/{feature_store → common}/utils/common_utils.py +7 -9
- wedata/{feature_store → common}/utils/env_utils.py +12 -0
- wedata/{feature_store → common}/utils/feature_lookup_utils.py +6 -6
- wedata/{feature_store → common}/utils/feature_spec_utils.py +13 -8
- wedata/{feature_store → common}/utils/feature_utils.py +5 -5
- wedata/{feature_store → common}/utils/on_demand_utils.py +5 -4
- wedata/{feature_store → common}/utils/schema_utils.py +1 -1
- wedata/{feature_store → common}/utils/signature_utils.py +4 -4
- wedata/{feature_store → common}/utils/training_set_utils.py +13 -13
- wedata/{feature_store → common}/utils/uc_utils.py +1 -1
- wedata/feature_engineering/__init__.py +1 -0
- wedata/feature_engineering/client.py +417 -0
- wedata/feature_engineering/ml_training_client/ml_training_client.py +569 -0
- wedata/feature_engineering/mlflow_model.py +9 -0
- wedata/feature_engineering/table_client/table_client.py +548 -0
- wedata/feature_store/client.py +11 -15
- wedata/feature_store/constants/engine_types.py +8 -30
- wedata/feature_store/feature_table_client/feature_table_client.py +73 -105
- wedata/feature_store/training_set_client/training_set_client.py +12 -23
- wedata/tempo/interpol.py +2 -2
- tencent_wedata_feature_engineering_dev-0.1.48.dist-info/RECORD +0 -66
- {tencent_wedata_feature_engineering_dev-0.1.48.dist-info → tencent_wedata_feature_engineering_dev-0.2.5.dist-info}/top_level.txt +0 -0
- /wedata/{feature_store/cloud_sdk_client → common}/__init__.py +0 -0
- /wedata/{feature_store/common/log → common/constants}/__init__.py +0 -0
- /wedata/{feature_store/common/protos → common/entities}/__init__.py +0 -0
- /wedata/{feature_store → common}/entities/environment_variables.py +0 -0
- /wedata/{feature_store → common}/entities/feature.py +0 -0
- /wedata/{feature_store → common}/entities/feature_function.py +0 -0
- /wedata/{feature_store → common}/entities/feature_spec_constants.py +0 -0
- /wedata/{feature_store → common}/entities/feature_table.py +0 -0
- /wedata/{feature_store/entities → common/protos}/__init__.py +0 -0
- /wedata/{feature_store/common → common}/protos/feature_store_pb2.py +0 -0
- /wedata/{feature_store/feast_client → common/utils}/__init__.py +0 -0
- /wedata/{feature_store → common}/utils/topological_sort.py +0 -0
- /wedata/{feature_store → common}/utils/validation_utils.py +0 -0
- /wedata/{feature_store/spark_client → feature_engineering/ml_training_client}/__init__.py +0 -0
- /wedata/{feature_store/utils → feature_engineering/table_client}/__init__.py +0 -0
|
@@ -3,7 +3,7 @@ import datetime
|
|
|
3
3
|
import logging
|
|
4
4
|
from typing import Dict, List, Optional, Union
|
|
5
5
|
|
|
6
|
-
from wedata.
|
|
6
|
+
from wedata.common.utils import common_utils
|
|
7
7
|
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
8
8
|
|
|
9
9
|
_logger = logging.getLogger(__name__)
|
|
@@ -7,11 +7,11 @@ import mlflow
|
|
|
7
7
|
from google.protobuf.json_format import MessageToDict, ParseDict
|
|
8
8
|
from mlflow.utils.file_utils import TempDir, read_yaml, write_yaml
|
|
9
9
|
|
|
10
|
-
from wedata.
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
13
|
-
from wedata.
|
|
14
|
-
from wedata.
|
|
10
|
+
from wedata.common.protos import feature_store_pb2
|
|
11
|
+
from wedata.common.entities.column_info import ColumnInfo
|
|
12
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
13
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
14
|
+
from wedata.common.entities.feature_spec_constants import (
|
|
15
15
|
BOUND_TO,
|
|
16
16
|
DATA_TYPE,
|
|
17
17
|
FEATURE_COLUMN_INFO,
|
|
@@ -34,10 +34,10 @@ from wedata.feature_store.entities.feature_spec_constants import (
|
|
|
34
34
|
TRAINING_DATA,
|
|
35
35
|
UDF_NAME,
|
|
36
36
|
)
|
|
37
|
-
from wedata.
|
|
38
|
-
from wedata.
|
|
39
|
-
from wedata.
|
|
40
|
-
from wedata.
|
|
37
|
+
from wedata.common.entities.feature_table_info import FeatureTableInfo
|
|
38
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
39
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
40
|
+
from wedata.common.utils import common_utils
|
|
41
41
|
|
|
42
42
|
# Change log for serialization version. Please update for each serialization version.
|
|
43
43
|
# 1. Initial.
|
|
@@ -2,20 +2,20 @@ from typing import Dict, List, Optional
|
|
|
2
2
|
|
|
3
3
|
from pyspark.sql import DataFrame
|
|
4
4
|
|
|
5
|
-
from wedata.
|
|
6
|
-
from wedata.
|
|
7
|
-
from wedata.
|
|
5
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
6
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
7
|
+
from wedata.common.utils.feature_lookup_utils import (
|
|
8
8
|
join_feature_data_if_not_overridden,
|
|
9
9
|
)
|
|
10
10
|
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
11
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
12
|
+
from wedata.common.utils.feature_spec_utils import (
|
|
13
13
|
COLUMN_INFO_TYPE_FEATURE,
|
|
14
14
|
COLUMN_INFO_TYPE_ON_DEMAND,
|
|
15
15
|
COLUMN_INFO_TYPE_SOURCE,
|
|
16
16
|
get_feature_execution_groups,
|
|
17
17
|
)
|
|
18
|
-
from wedata.
|
|
18
|
+
from wedata.common.utils.on_demand_utils import apply_functions_if_not_overridden
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class TrainingSet:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .feast_client import FeastClient
|
|
@@ -20,7 +20,7 @@ import pytz
|
|
|
20
20
|
from feast import FeatureStore, RepoConfig, FeatureView
|
|
21
21
|
from pyspark.sql import DataFrame, SparkSession
|
|
22
22
|
from wedata.feature_store.common.store_config.redis import RedisStoreConfig
|
|
23
|
-
from wedata.
|
|
23
|
+
from wedata.common.utils import env_utils
|
|
24
24
|
from feast import Entity, FeatureService
|
|
25
25
|
from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import SparkSource
|
|
26
26
|
from feast.infra.online_stores.redis import RedisOnlineStore
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .logger import get_logger
|
|
@@ -14,14 +14,14 @@ class LoggerSingleton:
|
|
|
14
14
|
def _initialize_logger(self):
|
|
15
15
|
self.logger = logging.getLogger("wedata-feature-engineering")
|
|
16
16
|
self.logger.setLevel(logging.INFO)
|
|
17
|
-
|
|
17
|
+
|
|
18
18
|
# 清除已有的handler,避免重复添加
|
|
19
19
|
if self.logger.handlers:
|
|
20
20
|
self.logger.handlers.clear()
|
|
21
21
|
|
|
22
22
|
# 创建formatter,包含时间、文件名和行号
|
|
23
23
|
formatter = logging.Formatter(
|
|
24
|
-
fmt='%(asctime)s - %(name)s - %(levelname)s - %(
|
|
24
|
+
fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
25
25
|
datefmt='%Y-%m-%d %H:%M:%S'
|
|
26
26
|
)
|
|
27
27
|
|
|
@@ -30,11 +30,15 @@ class LoggerSingleton:
|
|
|
30
30
|
handler.setFormatter(formatter)
|
|
31
31
|
|
|
32
32
|
self.logger.addHandler(handler)
|
|
33
|
+
|
|
34
|
+
# 防止消息传播到父级logger
|
|
35
|
+
self.logger.propagate = False
|
|
33
36
|
|
|
34
|
-
def get_logger(self):
|
|
37
|
+
def get_logger(self, level=logging.INFO):
|
|
38
|
+
self.logger.setLevel(level)
|
|
35
39
|
return self.logger
|
|
36
40
|
|
|
37
41
|
|
|
38
|
-
def get_logger():
|
|
42
|
+
def get_logger(level=logging.INFO):
|
|
39
43
|
"""获取单例logger实例"""
|
|
40
|
-
return LoggerSingleton().get_logger()
|
|
44
|
+
return LoggerSingleton().get_logger(level)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .spark_client import SparkClient
|
|
@@ -1,21 +1,20 @@
|
|
|
1
1
|
from collections import defaultdict
|
|
2
|
-
from typing import Optional, Any,
|
|
2
|
+
from typing import Optional, Any, List
|
|
3
3
|
|
|
4
|
-
import logging
|
|
5
4
|
from pyspark.sql import SparkSession, DataFrame
|
|
6
5
|
from pyspark.sql.catalog import Column
|
|
7
6
|
from pyspark.sql.functions import when, isnull
|
|
8
7
|
from pyspark.sql.types import StructType, StringType, StructField
|
|
9
8
|
from mlflow.pyfunc import spark_udf
|
|
10
9
|
|
|
11
|
-
from wedata.
|
|
10
|
+
from wedata.common.constants.constants import (
|
|
12
11
|
_PREBUILT_ENV_URI
|
|
13
12
|
)
|
|
14
13
|
|
|
15
|
-
from wedata.
|
|
16
|
-
from wedata.
|
|
17
|
-
from wedata.
|
|
18
|
-
from wedata.
|
|
14
|
+
from wedata.common.entities.feature import Feature
|
|
15
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
16
|
+
from wedata.common.entities.function_info import FunctionParameterInfo, FunctionInfo
|
|
17
|
+
from wedata.common.utils.common_utils import unsanitize_identifier, check_spark_table_exists, check_package_version
|
|
19
18
|
|
|
20
19
|
|
|
21
20
|
class SparkClient:
|
|
@@ -14,15 +14,11 @@ from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
|
|
|
14
14
|
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
|
|
15
15
|
from mlflow.utils import databricks_utils
|
|
16
16
|
|
|
17
|
-
from wedata.
|
|
18
|
-
from wedata.
|
|
17
|
+
from wedata.common.constants import constants
|
|
18
|
+
from wedata.common.constants.constants import MODEL_DATA_PATH_ROOT
|
|
19
|
+
from wedata.common.log import get_logger
|
|
19
20
|
from pyspark.sql import SparkSession
|
|
20
21
|
|
|
21
|
-
import logging
|
|
22
|
-
|
|
23
|
-
# 配置日志(可选,根据实际情况配置)
|
|
24
|
-
logging.basicConfig(level=logging.ERROR)
|
|
25
|
-
|
|
26
22
|
|
|
27
23
|
def validate_table_name(name: str):
|
|
28
24
|
"""
|
|
@@ -57,14 +53,16 @@ def build_full_table_name(table_name: str, database_name: Optional[str] = None)
|
|
|
57
53
|
"""
|
|
58
54
|
|
|
59
55
|
feature_store_database_name = os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
|
|
56
|
+
logger = get_logger()
|
|
60
57
|
if database_name:
|
|
61
58
|
feature_store_database_name = database_name
|
|
62
59
|
|
|
63
60
|
if not feature_store_database_name:
|
|
64
|
-
|
|
61
|
+
logger.error("The current user has not configured a default feature database. "
|
|
62
|
+
"Please contact the manager account to configure it.")
|
|
65
63
|
raise RuntimeError("Feature store is not configured! Please contact the main account to configure it.")
|
|
66
64
|
|
|
67
|
-
|
|
65
|
+
logger.debug("feature database:{}".format(feature_store_database_name))
|
|
68
66
|
|
|
69
67
|
feature_store_database = f"{feature_store_database_name}.{table_name}"
|
|
70
68
|
|
|
@@ -68,10 +68,22 @@ def get_database_name(database_name: str) -> str:
|
|
|
68
68
|
"please check environment configuration")
|
|
69
69
|
|
|
70
70
|
|
|
71
|
+
def set_default_database(database_name: str):
|
|
72
|
+
"""
|
|
73
|
+
设置默认数据库名称
|
|
74
|
+
"""
|
|
75
|
+
if not isinstance(database_name, str):
|
|
76
|
+
raise ValueError("database_name must be a string")
|
|
77
|
+
os.environ["WEDATA_DEFAULT_FEATURE_STORE_DATABASE"] = database_name
|
|
78
|
+
|
|
79
|
+
|
|
71
80
|
def get_engine_name() -> str:
|
|
72
81
|
"""
|
|
73
82
|
获取引擎名称
|
|
74
83
|
"""
|
|
84
|
+
# 因为DLC有特殊,所以先判断DLC,如果没有再判断EMR
|
|
85
|
+
if get_engine_type() == "DLC":
|
|
86
|
+
return _get_variable("KERNEL_ENGINE")
|
|
75
87
|
return _get_variable("KERNEL_ENGINE_NAME")
|
|
76
88
|
|
|
77
89
|
|
|
@@ -11,13 +11,13 @@ from pyspark.sql import functions as F
|
|
|
11
11
|
import pyspark.sql.functions as psf
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
from wedata.
|
|
15
|
-
from wedata.
|
|
16
|
-
from wedata.
|
|
17
|
-
from wedata.
|
|
18
|
-
from wedata.
|
|
14
|
+
from wedata.common.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
|
|
15
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
16
|
+
from wedata.common.entities.feature_lookup import FeatureLookup
|
|
17
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
18
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
19
19
|
|
|
20
|
-
from wedata.
|
|
20
|
+
from wedata.common.utils import uc_utils
|
|
21
21
|
|
|
22
22
|
_logger = logging.getLogger(__name__)
|
|
23
23
|
|
|
@@ -4,14 +4,19 @@ from functools import reduce
|
|
|
4
4
|
from typing import Dict, List, Tuple, Union
|
|
5
5
|
|
|
6
6
|
import yaml
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
from
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
from wedata.
|
|
7
|
+
|
|
8
|
+
# MLflow 3.x compatibility: YamlSafeDumper was removed, use yaml.SafeDumper directly
|
|
9
|
+
try:
|
|
10
|
+
from mlflow.utils.file_utils import YamlSafeDumper
|
|
11
|
+
except ImportError:
|
|
12
|
+
YamlSafeDumper = yaml.SafeDumper
|
|
13
|
+
|
|
14
|
+
from wedata.common.entities.column_info import ColumnInfo
|
|
15
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
16
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
17
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
18
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
19
|
+
from wedata.common.utils.topological_sort import topological_sort
|
|
15
20
|
|
|
16
21
|
DEFAULT_GRAPH_DEPTH_LIMIT = 5
|
|
17
22
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
from typing import List, Union
|
|
3
3
|
|
|
4
|
-
from wedata.
|
|
5
|
-
from wedata.
|
|
6
|
-
from wedata.
|
|
7
|
-
from wedata.
|
|
8
|
-
from wedata.
|
|
4
|
+
from wedata.common.entities.feature_function import FeatureFunction
|
|
5
|
+
from wedata.common.entities.feature_lookup import FeatureLookup
|
|
6
|
+
from wedata.common.spark_client import SparkClient
|
|
7
|
+
from wedata.common.utils import uc_utils
|
|
8
|
+
from wedata.common.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def format_feature_lookups_and_functions(
|
|
@@ -4,10 +4,11 @@ from typing import Dict, List
|
|
|
4
4
|
from pyspark.sql import DataFrame
|
|
5
5
|
from pyspark.sql.functions import expr
|
|
6
6
|
|
|
7
|
-
from wedata.
|
|
8
|
-
from wedata.
|
|
9
|
-
from wedata.
|
|
10
|
-
from wedata.
|
|
7
|
+
from wedata.common.entities.feature_function import FeatureFunction
|
|
8
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
9
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
10
|
+
from wedata.common.utils import common_utils
|
|
11
|
+
from wedata.common.utils import uc_utils
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
def _udf_expr(udf_name: str, arguments: List[str]) -> expr:
|
|
@@ -7,10 +7,10 @@ from mlflow.types import ColSpec
|
|
|
7
7
|
from mlflow.types import DataType as MlflowDataType
|
|
8
8
|
from mlflow.types import ParamSchema, Schema
|
|
9
9
|
|
|
10
|
-
from wedata.
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
13
|
-
from wedata.
|
|
10
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
11
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
12
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
13
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
14
14
|
|
|
15
15
|
_logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
@@ -4,19 +4,19 @@ from typing import Dict, List, Optional, Set
|
|
|
4
4
|
|
|
5
5
|
from pyspark.sql import DataFrame
|
|
6
6
|
|
|
7
|
-
from wedata.
|
|
8
|
-
from wedata.
|
|
9
|
-
from wedata.
|
|
10
|
-
from wedata.
|
|
11
|
-
from wedata.
|
|
12
|
-
from wedata.
|
|
13
|
-
from wedata.
|
|
14
|
-
from wedata.
|
|
15
|
-
from wedata.
|
|
16
|
-
from wedata.
|
|
17
|
-
|
|
18
|
-
from wedata.
|
|
19
|
-
from wedata.
|
|
7
|
+
from wedata.common.entities.column_info import ColumnInfo
|
|
8
|
+
from wedata.common.entities.feature import Feature
|
|
9
|
+
from wedata.common.entities.feature_column_info import FeatureColumnInfo
|
|
10
|
+
from wedata.common.entities.feature_lookup import FeatureLookup
|
|
11
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
12
|
+
from wedata.common.entities.feature_table import FeatureTable
|
|
13
|
+
from wedata.common.entities.feature_table_info import FeatureTableInfo
|
|
14
|
+
from wedata.common.entities.function_info import FunctionInfo
|
|
15
|
+
from wedata.common.entities.on_demand_column_info import OnDemandColumnInfo
|
|
16
|
+
from wedata.common.entities.source_data_column_info import SourceDataColumnInfo
|
|
17
|
+
|
|
18
|
+
from wedata.common.utils import common_utils, validation_utils
|
|
19
|
+
from wedata.common.utils.feature_spec_utils import assign_topological_ordering
|
|
20
20
|
|
|
21
21
|
_logger = logging.getLogger(__name__)
|
|
22
22
|
|
|
@@ -3,7 +3,7 @@ import re
|
|
|
3
3
|
from typing import Optional, Set, Any, List
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
|
|
6
|
-
from wedata.
|
|
6
|
+
from wedata.common.entities.feature_spec import FeatureSpec
|
|
7
7
|
|
|
8
8
|
SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
|
|
9
9
|
TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .client import FeatureEngineeringClient
|