wedata-feature-engineering 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff compares the contents of publicly released package versions from one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- wedata/__init__.py +1 -1
- wedata/feature_store/client.py +113 -41
- wedata/feature_store/constants/constants.py +19 -0
- wedata/feature_store/entities/column_info.py +4 -4
- wedata/feature_store/entities/feature_lookup.py +5 -1
- wedata/feature_store/entities/feature_spec.py +46 -46
- wedata/feature_store/entities/feature_table.py +42 -99
- wedata/feature_store/entities/training_set.py +13 -12
- wedata/feature_store/feature_table_client/feature_table_client.py +86 -31
- wedata/feature_store/spark_client/spark_client.py +30 -56
- wedata/feature_store/training_set_client/training_set_client.py +209 -38
- wedata/feature_store/utils/common_utils.py +213 -3
- wedata/feature_store/utils/feature_lookup_utils.py +6 -6
- wedata/feature_store/utils/feature_spec_utils.py +6 -6
- wedata/feature_store/utils/feature_utils.py +5 -5
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- wedata/feature_store/utils/schema_utils.py +1 -1
- wedata/feature_store/utils/signature_utils.py +205 -0
- wedata/feature_store/utils/training_set_utils.py +18 -19
- wedata/feature_store/utils/uc_utils.py +1 -1
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.7.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.7.dist-info/RECORD +43 -0
- feature_store/__init__.py +0 -6
- feature_store/client.py +0 -169
- feature_store/constants/__init__.py +0 -0
- feature_store/constants/constants.py +0 -28
- feature_store/entities/__init__.py +0 -0
- feature_store/entities/column_info.py +0 -117
- feature_store/entities/data_type.py +0 -92
- feature_store/entities/environment_variables.py +0 -55
- feature_store/entities/feature.py +0 -53
- feature_store/entities/feature_column_info.py +0 -64
- feature_store/entities/feature_function.py +0 -55
- feature_store/entities/feature_lookup.py +0 -179
- feature_store/entities/feature_spec.py +0 -454
- feature_store/entities/feature_spec_constants.py +0 -25
- feature_store/entities/feature_table.py +0 -164
- feature_store/entities/feature_table_info.py +0 -40
- feature_store/entities/function_info.py +0 -184
- feature_store/entities/on_demand_column_info.py +0 -44
- feature_store/entities/source_data_column_info.py +0 -21
- feature_store/entities/training_set.py +0 -134
- feature_store/feature_table_client/__init__.py +0 -0
- feature_store/feature_table_client/feature_table_client.py +0 -313
- feature_store/spark_client/__init__.py +0 -0
- feature_store/spark_client/spark_client.py +0 -286
- feature_store/training_set_client/__init__.py +0 -0
- feature_store/training_set_client/training_set_client.py +0 -196
- feature_store/utils/__init__.py +0 -0
- feature_store/utils/common_utils.py +0 -96
- feature_store/utils/feature_lookup_utils.py +0 -570
- feature_store/utils/feature_spec_utils.py +0 -286
- feature_store/utils/feature_utils.py +0 -73
- feature_store/utils/schema_utils.py +0 -117
- feature_store/utils/topological_sort.py +0 -158
- feature_store/utils/training_set_utils.py +0 -580
- feature_store/utils/uc_utils.py +0 -281
- feature_store/utils/utils.py +0 -252
- feature_store/utils/validation_utils.py +0 -55
- wedata/feature_store/utils/utils.py +0 -252
- wedata_feature_engineering-0.1.5.dist-info/RECORD +0 -79
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.7.dist-info}/WHEEL +0 -0
- {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.7.dist-info}/top_level.txt +0 -0
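The file list above shows the old top-level feature_store package being dropped and its modules consolidated under the wedata namespace. A hedged sketch of the corresponding import change follows, assuming the moved modules keep their public names (the new module contents are not shown in this diff):

    # Hypothetical import migration from 0.1.5 to 0.1.7 (illustrative only).
    # old layout (0.1.5), removed in this release:
    from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
    from feature_store.feature_table_client.feature_table_client import FeatureTableClient
    # presumed new layout (0.1.7), under the wedata namespace:
    from wedata.feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
    from wedata.feature_store.feature_table_client.feature_table_client import FeatureTableClient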
feature_store/feature_table_client/feature_table_client.py (entire file removed in 0.1.7)
@@ -1,313 +0,0 @@
"""
Utility methods for feature table operations
"""

from typing import Union, List, Dict, Optional, Sequence, Any
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.streaming import StreamingQuery
from pyspark.sql.types import StructType
import os

from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER


class FeatureTableClient:
    """Client for feature table operations"""

    def __init__(
        self,
        spark: SparkSession
    ):
        self._spark = spark

    @staticmethod
    def _normalize_params(
        param: Optional[Union[str, Sequence[str]]],
        default_type: type = list
    ) -> list:
        """Normalize parameters into a list"""
        if param is None:
            return default_type()
        return list(param) if isinstance(param, Sequence) else [param]

    @staticmethod
    def _validate_schema(df: DataFrame, schema: StructType):
        """Validate the DataFrame and schema and check that they are consistent"""
        # Ensure at least one of them is provided
        if df is None and schema is None:
            raise ValueError("Either a DataFrame or a schema must be provided")

        # Check that the schema matches
        if df is not None and schema is not None:
            df_schema = df.schema
            if df_schema != schema:
                diff_fields = set(df_schema.fieldNames()).symmetric_difference(set(schema.fieldNames()))
                raise ValueError(
                    f"DataFrame does not match the schema. Differing fields: {diff_fields if diff_fields else 'field types differ'}"
                )

    @staticmethod
    def _validate_table_name(name: str):
        """Validate the feature table naming convention"""
        if name.count('.') < 2:
            raise ValueError("Feature table name must follow the <catalog>.<schema>.<table> format")

    @staticmethod
    def _validate_key_conflicts(primary_keys: List[str], timestamp_keys: List[str]):
        """Check for conflicts between primary keys and timestamp keys"""
        conflict_keys = set(timestamp_keys) & set(primary_keys)
        if conflict_keys:
            raise ValueError(f"Timestamp keys conflict with primary keys: {conflict_keys}")

    @staticmethod
    def _escape_sql_value(value: str) -> str:
        """Escape special characters in SQL values"""
        return value.replace("'", "''")

    def create_table(
        self,
        name: str,
        primary_keys: Union[str, List[str]],
        df: Optional[DataFrame] = None,
        *,
        timestamp_keys: Union[str, List[str], None] = None,
        partition_columns: Union[str, List[str], None] = None,
        schema: Optional[StructType] = None,
        description: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None
    ):
        """
        Create a feature table (supports batch and streaming writes)

        Args:
            name: full feature table name (format: <table>)
            primary_keys: primary key column(s) (composite keys supported)
            df: initial data (optional, used to infer the schema)
            timestamp_keys: timestamp key(s) (for temporal features)
            partition_columns: partition columns (to optimize storage and queries)
            description: business description
            tags: business tags

        Returns:
            A FeatureTable instance

        Raises:
            ValueError: if the schema does not match the data
        """
        # Normalize parameters
        primary_keys = self._normalize_params(primary_keys)
        timestamp_keys = self._normalize_params(timestamp_keys)
        partition_columns = self._normalize_params(partition_columns)

        # Validate metadata
        self._validate_schema(df, schema)
        #self._validate_table_name(name)
        self._validate_key_conflicts(primary_keys, timestamp_keys)

        # Table name format: <catalog>.<schema>.<table>; catalog defaults to DataLakeCatalog, schema defaults to feature_store
        table_name = f'DataLakeCatalog.feature_store.{name}'

        # Check whether the table already exists
        try:
            if self._spark.catalog.tableExists(table_name):
                raise ValueError(
                    f"Table '{table_name}' already exists\n"
                    "Possible solutions:\n"
                    "1. Use a different table name\n"
                    "2. Drop the existing table: spark.sql(f'DROP TABLE {name}')\n"
                )
        except Exception as e:
            raise ValueError(f"Error while checking table existence: {str(e)}") from e

        # Infer the table schema
        table_schema = schema or df.schema

        # Build timestamp key attributes
        timestamp_keys_ddl = []
        for timestamp_key in timestamp_keys:
            if timestamp_key not in primary_keys:
                raise ValueError(f"Timestamp key '{timestamp_key}' must also be a primary key")
            timestamp_keys_ddl.append(f"`{timestamp_key}` TIMESTAMP")

        # Extra tags taken from environment variables
        env_tags = {
            "project_id": os.getenv("WEDATA_PROJECT_ID", ""),  # wedata project ID
            "engine_name": os.getenv("WEDATA_NOTEBOOK_ENGINE", ""),  # wedata engine name
            "user_uin": os.getenv("WEDATA_USER_UIN", "")  # wedata user UIN
        }

        # Build table properties (via TBLPROPERTIES)
        tbl_properties = {
            "feature_table": "TRUE",
            "primaryKeys": ",".join(primary_keys),
            "comment": description or "",
            **{f"{k}": v for k, v in (tags or {}).items()},
            **{f"feature_{k}": v for k, v in (env_tags or {}).items()}
        }

        # Build column definitions
        columns_ddl = []
        for field in table_schema.fields:
            data_type = field.dataType.simpleString().upper()
            col_def = f"`{field.name}` {data_type}"
            if not field.nullable:
                col_def += " NOT NULL"
            # Add a column comment if present in the field metadata
            if field.metadata and "comment" in field.metadata:
                comment = self._escape_sql_value(field.metadata["comment"])
                col_def += f" COMMENT '{comment}'"
            columns_ddl.append(col_def)

        # Build the partition clause
        partition_expr = (
            f"PARTITIONED BY ({', '.join([f'`{c}`' for c in partition_columns])})"
            if partition_columns else ""
        )

        # Core CREATE TABLE statement
        ddl = f"""
        CREATE TABLE {table_name} (
            {', '.join(columns_ddl)}
        )
        USING PARQUET
        {partition_expr}
        TBLPROPERTIES (
            {', '.join(f"'{k}'='{self._escape_sql_value(v)}'" for k, v in tbl_properties.items())}
        )
        """

        # Print the SQL
        print(f"create table ddl: {ddl}")

        # Execute the DDL
        try:
            self._spark.sql(ddl)
            if df is not None:
                df.write.insertInto(table_name)
        except Exception as e:
            raise ValueError(f"Failed to create table: {str(e)}") from e

    def write_table(
        self,
        name: str,
        df: DataFrame,
        mode: str = APPEND,
        checkpoint_location: Optional[str] = None,
        trigger: Optional[Dict[str, Any]] = DEFAULT_WRITE_STREAM_TRIGGER
    ) -> Optional[StreamingQuery]:
        """
        Write data to a feature table (supports batch and streaming writes)

        Args:
            name: feature table name (format: <table>)
            df: data to write (DataFrame)
            mode: write mode (append/overwrite)
            checkpoint_location: checkpoint location for streaming writes (streaming only)
            trigger: trigger settings for streaming writes (streaming only)

        Returns:
            A StreamingQuery object for streaming writes, otherwise None

        Raises:
            ValueError: if the arguments are invalid
        """

        # Validate the write mode
        valid_modes = ["append", "overwrite"]
        if mode not in valid_modes:
            raise ValueError(f"Invalid write mode '{mode}', valid options: {valid_modes}")

        # Full table name format: <catalog>.<schema>.<table>
        table_name = f'DataLakeCatalog.feature_store.{name}'

        # Determine whether the DataFrame is streaming
        is_streaming = df.isStreaming

        try:
            if is_streaming:
                # Streaming write
                if not checkpoint_location:
                    raise ValueError("checkpoint_location is required for streaming writes")

                writer = df.writeStream \
                    .format("parquet") \
                    .outputMode(mode) \
                    .option("checkpointLocation", checkpoint_location)

                if trigger:
                    writer = writer.trigger(**trigger)

                return writer.toTable(table_name)
            else:
                # Batch write
                df.write \
                    .mode(mode) \
                    .insertInto(table_name)
                return None

        except Exception as e:
            raise ValueError(f"Failed to write to table '{table_name}': {str(e)}") from e

    def read_table(
        self,
        name: str
    ) -> DataFrame:
        """
        Read data from a feature table

        Args:
            name: feature table name (format: <table>)

        Returns:
            A DataFrame containing the table data

        Raises:
            ValueError: if the table does not exist or the read fails
        """
        # Build the full table name
        table_name = f'DataLakeCatalog.feature_store.{name}'

        try:
            # Check whether the table exists
            if not self._spark.catalog.tableExists(table_name):
                raise ValueError(f"Table '{table_name}' does not exist")

            # Read the table data
            return self._spark.read.table(table_name)

        except Exception as e:
            raise ValueError(f"Failed to read table '{table_name}': {str(e)}") from e

    def drop_table(
        self,
        name: str
    ) -> None:
        """
        Drop a feature table (raises if the table does not exist)

        Args:
            name: feature table name (format: <table>)

        Raises:
            ValueError: if the table does not exist
            RuntimeError: if the drop operation fails

        Example:
            # Basic drop
            drop_table("user_features")
        """
        # Build the full table name
        table_name = f'DataLakeCatalog.feature_store.{name}'

        try:
            # Check whether the table exists
            if not self._spark.catalog.tableExists(table_name):
                raise ValueError(f"Table '{table_name}' does not exist")

            # Execute the drop
            self._spark.sql(f"DROP TABLE {table_name}")

        except ValueError as e:
            raise  # re-raise the known ValueError
        except Exception as e:
            raise RuntimeError(f"Failed to drop table '{table_name}': {str(e)}") from e
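For orientation, a minimal usage sketch of the removed FeatureTableClient API, based only on the code above. The SparkSession and sample DataFrame are assumed; this snippet is illustrative and not part of the package:

    # Hypothetical usage of the removed FeatureTableClient (assumes a running Spark environment).
    from pyspark.sql import SparkSession
    from feature_store.feature_table_client.feature_table_client import FeatureTableClient

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, 0.5)], ["user_id", "score"])  # assumed sample data

    client = FeatureTableClient(spark)

    # Creates DataLakeCatalog.feature_store.user_features as a Parquet table,
    # storing primary keys and tags in TBLPROPERTIES.
    client.create_table(
        name="user_features",
        primary_keys=["user_id"],
        df=df,
        description="demo feature table",
    )

    client.write_table("user_features", df, mode="append")  # batch append
    features = client.read_table("user_features")            # read back as a DataFrame
    client.drop_table("user_features")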
feature_store/spark_client/spark_client.py (entire file removed in 0.1.7)
@@ -1,286 +0,0 @@
from collections import defaultdict
from typing import List

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.catalog import Column
from pyspark.sql.functions import when, isnull
from pyspark.sql.types import StructType, StringType, StructField

from feature_store.entities.feature import Feature
from feature_store.entities.feature_table import FeatureTable
from feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
from feature_store.utils.common_utils import unsanitize_identifier
from feature_store.utils.utils import sanitize_multi_level_name


class SparkClient:
    def __init__(self, spark: SparkSession):
        self._spark = spark

    def createDataFrame(self, data, schema) -> DataFrame:
        return self._spark.createDataFrame(data, schema)

    def read_table(
        self, qualified_table_name, as_of_delta_timestamp=None, streaming=False
    ):
        """
        Reads a Delta table, optionally as of some timestamp.
        """
        if streaming and as_of_delta_timestamp:
            raise ValueError(
                "Internal error: as_of_delta_timestamp cannot be specified when"
                " streaming=True."
            )

        base_reader = (
            # By default, Structured Streaming only handles append operations. Because
            # we have a notion of primary keys, most offline feature store operations
            # are not appends. For example, FeatureStoreClient.write_table(mode=MERGE)
            # will issue a MERGE operation.
            # In order to propagate the non-append operations to the
            # readStream, we set ignoreChanges to "true".
            # For more information,
            # see https://docs.databricks.com/delta/delta-streaming.html#ignore-updates-and-deletes
            self._spark.readStream.format("delta").option("ignoreChanges", "true")
            if streaming
            else self._spark.read.format("delta")
        )

        if as_of_delta_timestamp:
            return base_reader.option("timestampAsOf", as_of_delta_timestamp).table(
                sanitize_multi_level_name(qualified_table_name)
            )
        else:
            return base_reader.table(sanitize_multi_level_name(qualified_table_name))

    def get_current_catalog(self):
        """
        Get current set catalog in the spark context.
        """
        try:
            df = self._spark.sql("SELECT CURRENT_CATALOG()").collect()
            return unsanitize_identifier(df[0][0])
        except Exception as e:
            return None

    def get_current_database(self):
        """
        Get current set database in the spark context.
        """
        try:
            df = self._spark.sql("SELECT CURRENT_DATABASE()").collect()
            return unsanitize_identifier(df[0][0])
        except Exception as e:
            return None

    def read_table(self, table_name):
        """Read data from a Spark table

        Args:
            table_name: table name; supported formats: catalog.schema.table, schema.table, or table

        Returns:
            DataFrame: the table data

        Raises:
            ValueError: if the table does not exist or the read fails
        """
        try:
            # Parse the table name
            parts = table_name.split('.')
            if len(parts) == 3:
                catalog, schema, table = parts
            elif len(parts) == 2:
                schema, table = parts
            else:
                table = table_name

            # Verify that the table exists
            if not self._spark.catalog.tableExists(table):
                raise ValueError(f"Table does not exist: {table_name}")

            return self._spark.table(table)

        except Exception as e:
            raise ValueError(f"Failed to read table {table_name}: {str(e)}")


    def get_features(self, table_name):
        # Parse the table name
        parts = table_name.split('.')
        if len(parts) == 3:
            # For three-part names (catalog.schema.table), use the schema.table form
            _, schema, table = parts
            full_table_name = f"{schema}.{table}"
        elif len(parts) == 2:
            # For two-part names (schema.table), use them as-is
            full_table_name = table_name
        else:
            # Bare table name: use the current database
            current_db = self.get_current_database()
            if not current_db:
                raise ValueError("Unable to determine the current database")
            full_table_name = f"{current_db}.{table_name}"

        # Query column information using the dbName.tableName form
        columns = self._spark.catalog.listColumns(tableName=full_table_name)
        return [
            Feature(
                feature_table=table_name,
                feature_id=f"{table_name}_{row.name}",
                name=row.name,
                data_type=row.dataType,
                description=row.description or ""
            ) for row in columns
        ]

    def get_online_stores(self, table_name):
        return None


    def get_feature_table(self, table_name):

        # Fetch table metadata
        table = self._spark.catalog.getTable(table_name)

        parts = table_name.split('.')
        if len(parts) == 3:
            # For three-part names (catalog.schema.table), use only the table part
            table_to_describe = parts[2]
        elif len(parts) == 2:
            # For two-part names (schema.table), use only the table part
            table_to_describe = parts[1]
        else:
            # Bare table name: use it as-is
            table_to_describe = table_name
        # Fetch detailed table information
        table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {table_to_describe}").collect()

        table_properties = {}
        for row in table_details:
            if row.col_name == "Table Properties":
                props = row.data_type[1:-1].split(", ")
                table_properties = dict(p.split("=") for p in props if "=" in p)

        # Fetch feature column information
        features = self.get_features(table_name)

        # Build the full FeatureTable object
        return FeatureTable(
            name=table_name,
            table_id=table_properties.get("table_id", table_name),
            description=table.description or table_properties.get("description", table_name),
            primary_keys=table_properties.get("primaryKeys", "").split(",") if table_properties.get("primaryKeys") else [],
            partition_columns=table.partitionColumnNames if hasattr(table, 'partitionColumnNames') else [],
            features=features,
            creation_timestamp=None,  # Spark table metadata has no creation timestamp
            online_stores=self.get_online_stores(table_name),
            notebook_producers=None,
            job_producers=None,
            table_data_sources=None,
            path_data_sources=None,
            custom_data_sources=None,
            timestamp_keys=table_properties.get("timestamp_keys"),
            tags=table_properties.get("tags")
        )

    def _get_routines_with_parameters(self, full_routine_names: List[str]) -> DataFrame:
        """
        Retrieve the routines with their parameters from information_schema.routines, information_schema.parameters.
        Return DataFrame only contains routines that 1. exist and 2. the caller has GetFunction permission on.

        Note: The returned DataFrame contains the cartesian product of routines and parameters.
        For efficiency, routines table columns are only present in the first row for each routine.
        """
        routine_name_schema = StructType(
            [
                StructField("specific_catalog", StringType(), False),
                StructField("specific_schema", StringType(), False),
                StructField("specific_name", StringType(), False),
            ]
        )
        routine_names_df = self.createDataFrame(
            [full_routine_name.split(".") for full_routine_name in full_routine_names],
            routine_name_schema,
        )
        routines_table = self.read_table(
            "system.information_schema.routines"
        )
        parameters_table = self.read_table(
            "system.information_schema.parameters"
        )

        # Inner join routines table to filter out non-existent routines.
        # Left join parameters as routines may have no parameters.
        full_routines_with_parameters_df = routine_names_df.join(
            routines_table, on=routine_names_df.columns, how="inner"
        ).join(parameters_table, on=routine_names_df.columns, how="left")

        # Return only relevant metadata from information_schema, sorted by routine name + parameter order.
        # For efficiency, only preserve routine column values in the first of each routine's result rows.
        # The first row will have parameter.ordinal_value is None (no parameters) or equals 0 (first parameter).
        def select_if_first_row(col: Column) -> Column:
            return when(
                isnull(parameters_table.ordinal_position)
                | (parameters_table.ordinal_position == 0),
                col,
            ).otherwise(None)

        return full_routines_with_parameters_df.select(
            routine_names_df.columns
            + [
                select_if_first_row(routines_table.routine_definition).alias(
                    "routine_definition"
                ),
                select_if_first_row(routines_table.external_language).alias(
                    "external_language"
                ),
                parameters_table.ordinal_position,
                parameters_table.parameter_name,
                parameters_table.full_data_type,
            ]
        ).sort(routine_names_df.columns + [parameters_table.ordinal_position])

    def get_functions(self, full_function_names: List[str]) -> List[FunctionInfo]:
        """
        Retrieves and maps Unity Catalog functions' metadata as FunctionInfos.
        """
        # Avoid unnecessary Spark calls and return if empty.
        if not full_function_names:
            return []

        # Collect dict of routine name -> DataFrame rows describing the routine.
        routines_with_parameters_df = self._get_routines_with_parameters(
            full_routine_names=full_function_names
        )
        routine_infos = defaultdict(list)
        for r in routines_with_parameters_df.collect():
            routine_name = f"{r.specific_catalog}.{r.specific_schema}.{r.specific_name}"
            routine_infos[routine_name].append(r)

        # Mock GetFunction DNE error, since information_schema does not throw.
        for function_name in full_function_names:
            if not function_name in routine_infos:
                raise ValueError(f"Function '{function_name}' does not exist.")

        # Map routine_infos into FunctionInfos.
        function_infos = []
        for function_name in full_function_names:
            routine_info = routine_infos[function_name][0]
            input_params = [
                FunctionParameterInfo(name=p.parameter_name, type_text=p.full_data_type)
                for p in routine_infos[function_name]
                if p.ordinal_position is not None
            ]
            function_infos.append(
                FunctionInfo(
                    full_name=function_name,
                    input_params=input_params,
                    routine_definition=routine_info.routine_definition,
                    external_language=routine_info.external_language,
                )
            )
        return function_infos
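Similarly, a minimal usage sketch of the removed SparkClient, again illustrative and assuming an active SparkSession. Note that read_table is defined twice in the class, so the later single-argument definition is the one in effect; attribute access on the returned FeatureTable assumes the entity exposes its constructor arguments as attributes:

    # Hypothetical usage of the removed SparkClient (assumes a running Spark environment).
    from pyspark.sql import SparkSession
    from feature_store.spark_client.spark_client import SparkClient

    spark = SparkSession.builder.getOrCreate()
    client = SparkClient(spark)

    print(client.get_current_catalog(), client.get_current_database())

    df = client.read_table("feature_store.user_features")            # plain Spark table read
    table = client.get_feature_table("feature_store.user_features")  # metadata from TBLPROPERTIES
    print(table.primary_keys, [f.name for f in table.features])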