tencent-wedata-feature-engineering-dev 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic.
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
- wedata/__init__.py +9 -0
- wedata/feature_store/__init__.py +0 -0
- wedata/feature_store/client.py +462 -0
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +86 -0
- wedata/feature_store/cloud_sdk_client/models.py +686 -0
- wedata/feature_store/cloud_sdk_client/utils.py +32 -0
- wedata/feature_store/common/__init__.py +0 -0
- wedata/feature_store/common/protos/__init__.py +0 -0
- wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +48 -0
- wedata/feature_store/constants/__init__.py +0 -0
- wedata/feature_store/constants/constants.py +59 -0
- wedata/feature_store/constants/engine_types.py +34 -0
- wedata/feature_store/entities/__init__.py +0 -0
- wedata/feature_store/entities/column_info.py +138 -0
- wedata/feature_store/entities/environment_variables.py +55 -0
- wedata/feature_store/entities/feature.py +53 -0
- wedata/feature_store/entities/feature_column_info.py +72 -0
- wedata/feature_store/entities/feature_function.py +55 -0
- wedata/feature_store/entities/feature_lookup.py +200 -0
- wedata/feature_store/entities/feature_spec.py +489 -0
- wedata/feature_store/entities/feature_spec_constants.py +25 -0
- wedata/feature_store/entities/feature_table.py +111 -0
- wedata/feature_store/entities/feature_table_info.py +49 -0
- wedata/feature_store/entities/function_info.py +90 -0
- wedata/feature_store/entities/on_demand_column_info.py +57 -0
- wedata/feature_store/entities/source_data_column_info.py +24 -0
- wedata/feature_store/entities/training_set.py +135 -0
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +482 -0
- wedata/feature_store/feature_table_client/__init__.py +0 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
- wedata/feature_store/mlflow_model.py +17 -0
- wedata/feature_store/spark_client/__init__.py +0 -0
- wedata/feature_store/spark_client/spark_client.py +289 -0
- wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata/feature_store/training_set_client/training_set_client.py +572 -0
- wedata/feature_store/utils/__init__.py +0 -0
- wedata/feature_store/utils/common_utils.py +352 -0
- wedata/feature_store/utils/env_utils.py +86 -0
- wedata/feature_store/utils/feature_lookup_utils.py +564 -0
- wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata/feature_store/utils/feature_utils.py +73 -0
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- wedata/feature_store/utils/schema_utils.py +117 -0
- wedata/feature_store/utils/signature_utils.py +202 -0
- wedata/feature_store/utils/topological_sort.py +158 -0
- wedata/feature_store/utils/training_set_utils.py +579 -0
- wedata/feature_store/utils/uc_utils.py +296 -0
- wedata/feature_store/utils/validation_utils.py +79 -0
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +448 -0
- wedata/tempo/intervals.py +1331 -0
- wedata/tempo/io.py +61 -0
- wedata/tempo/ml.py +129 -0
- wedata/tempo/resample.py +318 -0
- wedata/tempo/tsdf.py +1720 -0
- wedata/tempo/utils.py +254 -0
wedata/feature_store/mlflow_model.py
@@ -0,0 +1,17 @@
+import json
+from typing import Optional, Dict, Any
+import mlflow
+import os
+
+class _FeatureStoreModelWrapper(mlflow.pyfunc.PythonModel):
+    def __init__(self, model):
+        self.model = model
+
+    def predict(self, context, model_input):
+        return self.model.predict(model_input)
+
+# def _load_pyfunc(path):
+#     # Path provided by mlflow is subdirectory of path needed by score_batch
+#     artifact_path = os.path.join(mlflow.pyfunc.DATA, "feature_store")
+#     index = path.find(artifact_path)
+#     return _FeatureStoreModelWrapper(path[:index])
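Note: the wrapper above follows the standard mlflow.pyfunc custom-model pattern. A minimal usage sketch, not part of the package (the dummy model and artifact path below are hypothetical):

import mlflow
from wedata.feature_store.mlflow_model import _FeatureStoreModelWrapper

class _DummyModel:
    # Hypothetical stand-in for any fitted estimator that exposes predict()
    def predict(self, model_input):
        return [0] * len(model_input)

# Wrap the raw model and log it through the regular mlflow.pyfunc API so it
# can later be reloaded with mlflow.pyfunc.load_model and used for scoring.
wrapped = _FeatureStoreModelWrapper(_DummyModel())
with mlflow.start_run():
    mlflow.pyfunc.log_model(artifact_path="feature_store_model", python_model=wrapped)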
File without changes
wedata/feature_store/spark_client/spark_client.py
@@ -0,0 +1,289 @@
+from collections import defaultdict
+from typing import Optional, Any, Dict, List
+
+import logging
+from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql.catalog import Column
+from pyspark.sql.functions import when, isnull
+from pyspark.sql.types import StructType, StringType, StructField
+from mlflow.pyfunc import spark_udf
+
+from wedata.feature_store.constants.constants import (
+    _PREBUILT_ENV_URI
+)
+
+from wedata.feature_store.entities.feature import Feature
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
+from wedata.feature_store.utils.common_utils import unsanitize_identifier, check_spark_table_exists, check_package_version
+
+
+class SparkClient:
+    def __init__(self, spark: SparkSession):
+        self._spark = spark
+
+    def get_current_catalog(self):
+        """
+        Get the catalog name of the current Spark session (via spark.catalog.currentCatalog).
+
+        Returns:
+            str: the current catalog name, or None if not set
+        """
+        _, ok, _ = check_package_version("pyspark", "3.4.0", ">=")
+        print(f"pyspark >= 3.4.0 {ok}")
+        if ok:
+            return unsanitize_identifier(self._spark.catalog.currentCatalog())
+        else:
+            catalog = self._spark.sql("SELECT current_catalog()").first()[0]
+            return unsanitize_identifier(catalog)
+
+
+    def get_current_database(self):
+        """
+        Get the database name currently set in the Spark context.
+
+        Returns:
+            str: the current database name, or None if it cannot be determined
+        """
+        try:
+            # Query the current database via Spark SQL
+            df = self._spark.sql("SELECT CURRENT_DATABASE()")
+            # Take the first row's first column and strip special characters
+            return unsanitize_identifier(df.first()[0])
+        except Exception:
+            # Swallow any exception and return None
+            return None
+
+    def createDataFrame(self, data, schema) -> DataFrame:
+        return self._spark.createDataFrame(data, schema)
+
+
+    def read_table(self, table_name):
+        """Read data from a Spark table.
+
+        Args:
+            table_name: table name; supported formats: catalog.schema.table, schema.table
+
+        Returns:
+            DataFrame: the table data
+
+        Raises:
+            ValueError: raised when the table does not exist or cannot be read
+        """
+        table_name = _translate_spark_table_name(table_name)
+        try:
+            # Verify that the table exists
+            if not check_spark_table_exists(self._spark, table_name):
+                raise ValueError(f"Table does not exist: {table_name}")
+            return self._spark.table(table_name)
+
+        except Exception as e:
+            raise ValueError(f"Failed to read table {table_name}: {str(e)}")
+
+    def get_features(self, table_name):
+        # Query column information
+        table_name = _translate_spark_table_name(table_name)
+        split = table_name.split(".")
+        if len(split) == 2:
+            # db.table_name
+            columns = self._spark.catalog.listColumns(tableName=split[1], dbName=split[0])
+        elif len(split) == 3:
+            # catalog.db.table_name
+            columns = self._spark.catalog.listColumns(tableName=split[2], dbName=split[1])
+        else:
+            columns = self._spark.catalog.listColumns(tableName=table_name)
+        return [
+            Feature(
+                feature_table=table_name,
+                feature_id=f"{table_name}_{row.name}",
+                name=row.name,
+                data_type=row.dataType,
+                description=row.description or ""
+            ) for row in columns
+        ]
+
+    def get_feature_table(self, table_name):
+        """
+        DLC supports table_name in the form catalog.schema.table;
+        EMR supports table_name in the form schema.table.
+        """
+
+        table_name = _translate_spark_table_name(table_name)
+        # table = self._spark.catalog.getTable(table_name)
+        # Read the table properties
+        properties = self._spark.sql(f"SHOW TBLPROPERTIES {table_name}").collect()
+        primary_key_str = next((row.value for row in properties if row.key == "primaryKeys"), None)
+        primary_keys = primary_key_str.split(",") if primary_key_str else []
+        table_id = next((row.value for row in properties if row.key == "table_id"), table_name)
+        description = next((row.value for row in properties if row.key == "comment"), None)
+        timestamp_keys_str = next((row.value for row in properties if row.key == "timestampKeys"), None)
+        timestamp_keys = timestamp_keys_str.split(",") if timestamp_keys_str else []
+        # Read the partition column information
+        desc_df = self._spark.sql(f"DESCRIBE EXTENDED {table_name}")
+        partition_info = desc_df.filter("col_name LIKE '_partition%'").collect()
+        partition_columns = []
+        if partition_info:
+            partition_str = partition_info[0]["data_type"]
+            # Extract the partition columns from the partition string
+            if partition_str.startswith("struct<") and partition_str.endswith(">"):
+                # Strip the struct<> wrapper
+                fields_str = partition_str[7:-1]
+                # Split the individual field definitions
+                field_defs = [f.strip() for f in fields_str.split(",") if f.strip()]
+                # Extract the field names
+                partition_columns = [f.split(":")[0].strip() for f in field_defs]
+
+        # Collect the feature column information
+        features = self.get_features(table_name)
+
+        # Build the complete FeatureTable object
+        return FeatureTable(
+            name=table_name,
+            table_id=table_id,
+            description=description,
+            primary_keys=primary_keys,
+            partition_columns=partition_columns,
+            features=features,
+            creation_timestamp=None,  # Spark table metadata does not include a creation timestamp
+            online_stores=None,
+            notebook_producers=None,
+            job_producers=None,
+            table_data_sources=None,
+            path_data_sources=None,
+            custom_data_sources=None,
+            timestamp_keys=timestamp_keys,
+            tags=None
+        )
+
+    def _get_routines_with_parameters(self, full_routine_names: List[str]) -> DataFrame:
+        """
+        Retrieve the routines with their parameters from information_schema.routines, information_schema.parameters.
+        The returned DataFrame only contains routines that 1. exist and 2. the caller has GetFunction permission on.
+
+        Note: The returned DataFrame contains the cartesian product of routines and parameters.
+        For efficiency, routines table columns are only present in the first row for each routine.
+        """
+        routine_name_schema = StructType(
+            [
+                StructField("specific_catalog", StringType(), False),
+                StructField("specific_schema", StringType(), False),
+                StructField("specific_name", StringType(), False),
+            ]
+        )
+        routine_names_df = self.createDataFrame(
+            [full_routine_name.split(".") for full_routine_name in full_routine_names],
+            routine_name_schema,
+        )
+        routines_table = self.read_table(
+            "system.information_schema.routines"
+        )
+        parameters_table = self.read_table(
+            "system.information_schema.parameters"
+        )
+
+        # Inner join routines table to filter out non-existent routines.
+        # Left join parameters as routines may have no parameters.
+        full_routines_with_parameters_df = routine_names_df.join(
+            routines_table, on=routine_names_df.columns, how="inner"
+        ).join(parameters_table, on=routine_names_df.columns, how="left")
+
+        # Return only relevant metadata from information_schema, sorted by routine name + parameter order.
+        # For efficiency, only preserve routine column values in the first of each routine's result rows.
+        # The first row will have parameter.ordinal_position as None (no parameters) or 0 (first parameter).
+        def select_if_first_row(col: Column) -> Column:
+            return when(
+                isnull(parameters_table.ordinal_position)
+                | (parameters_table.ordinal_position == 0),
+                col,
+            ).otherwise(None)
+
+        return full_routines_with_parameters_df.select(
+            routine_names_df.columns
+            + [
+                select_if_first_row(routines_table.routine_definition).alias(
+                    "routine_definition"
+                ),
+                select_if_first_row(routines_table.external_language).alias(
+                    "external_language"
+                ),
+                parameters_table.ordinal_position,
+                parameters_table.parameter_name,
+                parameters_table.full_data_type,
+            ]
+        ).sort(routine_names_df.columns + [parameters_table.ordinal_position])
+
+    def get_functions(self, full_function_names: List[str]) -> List[FunctionInfo]:
+        """
+        Retrieves and maps Unity Catalog functions' metadata as FunctionInfos.
+        """
+        # Avoid unnecessary Spark calls and return if empty.
+        if not full_function_names:
+            return []
+
+        # Collect dict of routine name -> DataFrame rows describing the routine.
+        routines_with_parameters_df = self._get_routines_with_parameters(
+            full_routine_names=full_function_names
+        )
+        routine_infos = defaultdict(list)
+        for r in routines_with_parameters_df.collect():
+            routine_name = f"{r.specific_catalog}.{r.specific_schema}.{r.specific_name}"
+            routine_infos[routine_name].append(r)
+
+        # Mock GetFunction DNE error, since information_schema does not throw.
+        for function_name in full_function_names:
+            if function_name not in routine_infos:
+                raise ValueError(f"Function '{function_name}' does not exist.")
+
+        # Map routine_infos into FunctionInfos.
+        function_infos = []
+        for function_name in full_function_names:
+            routine_info = routine_infos[function_name][0]
+            input_params = [
+                FunctionParameterInfo(name=p.parameter_name, type_text=p.full_data_type)
+                for p in routine_infos[function_name]
+                if p.ordinal_position is not None
+            ]
+            function_infos.append(
+                FunctionInfo(
+                    full_name=function_name,
+                    input_params=input_params,
+                    routine_definition=routine_info.routine_definition,
+                    external_language=routine_info.external_language,
+                )
+            )
+        return function_infos
+
+    def get_predict_udf(
+        self,
+        model_uri,
+        result_type=None,
+        env_manager=None,
+        params: Optional[Dict[str, Any]] = None,
+        prebuilt_env_uri: Optional[str] = None,
+    ):
+        kwargs = {}
+        if result_type:
+            kwargs["result_type"] = result_type
+        if env_manager:
+            kwargs["env_manager"] = env_manager
+        if params:
+            kwargs["params"] = params
+        if prebuilt_env_uri:
+            kwargs[_PREBUILT_ENV_URI] = prebuilt_env_uri
+
+        return spark_udf(self._spark, model_uri, **kwargs)
+
+
+def _translate_spark_table_name(table_name):
+    from wedata.feature_store.constants.engine_types import judge_engine_type, CalculateEngineTypes
+    # Get table metadata
+    if judge_engine_type() == CalculateEngineTypes.EMR:
+        split_names = table_name.split(".")
+        # print(f"==== EMR TABLE split len({len(split_names)})")
+        if len(split_names) <= 2:
+            return table_name
+        else:
+            table_name = ".".join(table_name.split(".")[1:])
+            return table_name
+    return table_name
+
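For orientation, a minimal sketch of how SparkClient might be exercised, not part of the package (the database and table names are hypothetical and assume an active Spark session with the wheel installed):

from pyspark.sql import SparkSession
from wedata.feature_store.spark_client.spark_client import SparkClient

spark = SparkSession.builder.getOrCreate()
client = SparkClient(spark)

# Resolve the session's current catalog and database.
print(client.get_current_catalog(), client.get_current_database())

# Read a (hypothetical) feature table and inspect its metadata and columns.
feature_table = client.get_feature_table("demo_db.user_features")
print(feature_table.primary_keys, [f.name for f in feature_table.features])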
File without changes