wedata-feature-engineering 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/PKG-INFO +1 -1
- wedata-feature-engineering-0.1.4/feature_store/utils/__init__.py +0 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/common_utils.py +96 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/feature_lookup_utils.py +570 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/feature_spec_utils.py +286 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/feature_utils.py +73 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/schema_utils.py +117 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/topological_sort.py +158 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/training_set_utils.py +580 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/uc_utils.py +281 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/utils.py +252 -0
- wedata-feature-engineering-0.1.4/feature_store/utils/validation_utils.py +55 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/setup.py +1 -1
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/wedata_feature_engineering.egg-info/PKG-INFO +1 -1
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/wedata_feature_engineering.egg-info/SOURCES.txt +11 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/README.md +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/client.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/constants/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/constants/constants.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/data_type.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/environment_variables.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature_column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature_function.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature_lookup.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature_spec.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature_spec_constants.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature_table.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/feature_table_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/function_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/on_demand_column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/source_data_column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/entities/training_set.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/feature_table_client/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/feature_table_client/feature_table_client.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/spark_client/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/spark_client/spark_client.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/training_set_client/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/feature_store/training_set_client/training_set_client.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/setup.cfg +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/wedata_feature_engineering.egg-info/dependency_links.txt +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/wedata_feature_engineering.egg-info/requires.txt +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.4}/wedata_feature_engineering.egg-info/top_level.txt +0 -0
wedata-feature-engineering-0.1.4/feature_store/utils/common_utils.py
@@ -0,0 +1,96 @@
+"""
+Common utility functions
+"""
+
+from collections import Counter
+from typing import Any, List
+
+from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
+from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
+
+
+def is_artifact_uri(uri):
+    """
+    Checks whether the artifact URI is associated with an MLflow model or run.
+    The actual URI can be a model URI, model URI + subdirectory, or model URI + path to artifact file.
+    """
+    return ModelsArtifactRepository.is_models_uri(
+        uri
+    ) or RunsArtifactRepository.is_runs_uri(uri)
+
+def as_list(obj, default=None):
+    if not obj:
+        return default
+    elif isinstance(obj, list):
+        return obj
+    else:
+        return [obj]
+
+def get_duplicates(elements: List[Any]) -> List[Any]:
+    """
+    Returns duplicate elements in the order they first appear.
+    """
+    element_counts = Counter(elements)
+    duplicates = []
+    for e in element_counts.keys():
+        if element_counts[e] > 1:
+            duplicates.append(e)
+    return duplicates
+
+def validate_strings_unique(strings: List[str], error_template: str):
+    """
+    Validates all strings are unique, otherwise raises ValueError with the error template and duplicates.
+    Passes single-quoted, comma-delimited duplicates to the error template.
+    """
+    duplicate_strings = get_duplicates(strings)
+    if duplicate_strings:
+        duplicates_formatted = ", ".join([f"'{s}'" for s in duplicate_strings])
+        raise ValueError(error_template.format(duplicates_formatted))
+
+def sanitize_identifier(identifier: str):
+    """
+    Sanitize and wrap an identifier with backquotes. For example, "a`b" becomes "`a``b`".
+    Use this function to sanitize identifiers such as column names in SQL and PySpark.
+    """
+    return f"`{identifier.replace('`', '``')}`"
+
+
+def sanitize_identifiers(identifiers: List[str]):
+    """
+    Sanitize and wrap the identifiers in a list with backquotes.
+    """
+    return [sanitize_identifier(i) for i in identifiers]
+
+
+def sanitize_multi_level_name(multi_level_name: str):
+    """
+    Sanitize a multi-level name (such as a Unity Catalog table name) by sanitizing each segment
+    and joining the results. For example, "ca+t.fo`o.ba$r" becomes "`ca+t`.`fo``o`.`ba$r`".
+    """
+    segments = multi_level_name.split(".")
+    return ".".join(sanitize_identifiers(segments))
+
+
+def unsanitize_identifier(identifier: str):
+    """
+    Unsanitize an identifier. Useful when we get a possibly sanitized identifier from Spark or
+    somewhere else, but we need an unsanitized one.
+    Note: This function does not check the correctness of the identifier passed in. e.g. `foo``
+    is not a valid sanitized identifier. When given such invalid input, this function returns
+    invalid output.
+    """
+    if len(identifier) >= 2 and identifier[0] == "`" and identifier[-1] == "`":
+        return identifier[1:-1].replace("``", "`")
+    else:
+        return identifier
+
+
+# Strings containing \ or ' can break SQL statements, so escape them.
+def escape_sql_string(input_str: str) -> str:
+    return input_str.replace("\\", "\\\\").replace("'", "\\'")
+
+def get_unique_list_order(elements: List[Any]) -> List[Any]:
+    """
+    Returns unique elements in the order they first appear.
+    """
+    return list(dict.fromkeys(elements))
wedata-feature-engineering-0.1.4/feature_store/utils/feature_lookup_utils.py
@@ -0,0 +1,570 @@
+import copy
+import datetime
+import logging
+import re
+from collections import defaultdict
+from functools import reduce
+from typing import Dict, List, Optional, Tuple
+
+from pyspark.sql import DataFrame, Window
+from pyspark.sql import functions as F
+from pyspark.sql.functions import sum, unix_timestamp
+
+from feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
+from feature_store.entities.feature_column_info import FeatureColumnInfo
+from feature_store.entities.feature_lookup import FeatureLookup
+from feature_store.entities.feature_spec import FeatureSpec
+from feature_store.entities.feature_table import FeatureTable
+
+from feature_store.utils import common_utils, validation_utils, uc_utils
+
+_logger = logging.getLogger(__name__)
+
+
+def _spark_asof_join_features(
+    df: DataFrame,
+    df_lookup_keys: List[str],
+    df_timestamp_lookup_key: str,
+    feature_table_data: DataFrame,
+    feature_table_keys: List[str],
+    feature_table_timestamp_key: str,
+    feature_to_output_name: Dict[str, str],
+    lookback_window_seconds: Optional[float] = None,
+    use_spark_native_join: Optional[bool] = False,
+) -> DataFrame:
+    # Alias feature table's keys to DataFrame lookup keys
+    ft_key_aliases = [
+        feature_table_data[ft_key].alias(df_key)
+        for (ft_key, df_key) in zip(feature_table_keys, df_lookup_keys)
+    ]
+    # Alias features to corresponding output names
+    ft_features = [
+        (feature_name, output_name)
+        for feature_name, output_name in feature_to_output_name.items()
+        # Skip the join if the feature is already in the DataFrame and therefore overridden
+        if output_name not in df.columns
+    ]
+    ft_feature_aliases = [
+        feature_table_data[feature_name].alias(output_name)
+        for feature_name, output_name in ft_features
+    ]
+    # Alias feature table's timestamp key to DataFrame timestamp lookup keys
+    ft_timestamp_key_aliases = [
+        feature_table_data[feature_table_timestamp_key].alias(df_timestamp_lookup_key)
+    ]
+    # Select key, timestamp key, and feature columns from feature table
+    feature_and_keys = feature_table_data.select(
+        ft_key_aliases + ft_timestamp_key_aliases + ft_feature_aliases
+    )
+
+    _logger.debug(
+        "Using native spark for point in time join"
+        if use_spark_native_join
+        else "Using tempo for point in time join"
+    )
+
+    if use_spark_native_join:
+        joined_df = _spark_asof_join_features_native(
+            labels_df=df,
+            features_df=feature_and_keys,
+            primary_keys=df_lookup_keys,
+            timestamp_key=df_timestamp_lookup_key,
+            lookback_window_seconds=lookback_window_seconds,
+        )
+    else:
+        joined_df = _spark_asof_join_features_tempo(
+            df=df,
+            df_lookup_keys=df_lookup_keys,
+            df_timestamp_lookup_key=df_timestamp_lookup_key,
+            feature_and_keys=feature_and_keys,
+            ft_features=ft_features,
+            lookback_window_seconds=lookback_window_seconds,
+        )
+    return joined_df
+
+def _spark_asof_join_features_tempo(
+    df: DataFrame,
+    feature_df: DataFrame,
+    lookup_keys: List[str],
+    timestamp_key: str,
+    lookback_window: Optional[float] = None
+) -> DataFrame:
+    """
+    Custom implementation of an as-of join.
+    :param df: primary (label) DataFrame
+    :param feature_df: feature table DataFrame
+    :param lookup_keys: list of join keys
+    :param timestamp_key: timestamp column name
+    :param lookback_window: maximum lookback window in seconds
+    :return: joined DataFrame
+    """
+
+    # 1. Keep only the key columns and the timestamp column
+    df_keys = df.select(lookup_keys + [timestamp_key])
+    feature_keys = feature_df.select(lookup_keys + [timestamp_key])
+
+    # 2. Build the join condition
+    join_cond = [df_keys[k] == feature_keys[k] for k in lookup_keys]
+    join_cond = reduce(lambda x, y: x & y, join_cond)
+    join_cond &= (df_keys[timestamp_key] >= feature_keys[timestamp_key])
+
+    if lookback_window:
+        join_cond &= (
+            (F.unix_timestamp(df_keys[timestamp_key]) -
+             F.unix_timestamp(feature_keys[timestamp_key])) <= lookback_window
+        )
+
+    # 3. Perform the join and find the latest feature record for each primary-table record
+    joined = df_keys.join(feature_keys, join_cond, "left")
+
+    # Group by the primary-table keys and find the maximum feature timestamp
+    window = Window.partitionBy(lookup_keys).orderBy(F.desc(timestamp_key))
+    latest_features = (
+        joined
+        .withColumn("rn", F.row_number().over(window))
+        .filter(F.col("rn") == 1)
+        .drop("rn")
+    )
+
+    # 4. Final join to retrieve the full feature data
+    result = df.join(
+        latest_features.select(lookup_keys + [timestamp_key, "feature_col"]),
+        lookup_keys + [timestamp_key],
+        "left"
+    )
+
+    return result
+
+def _spark_asof_join_features_native(
+    labels_df: DataFrame,
+    features_df: DataFrame,
+    primary_keys: List[str],
+    timestamp_key: str,
+    lookback_window_seconds: Optional[float] = None,
+):
+    """
+    Performs an as-of join operation between two dataframes using native Spark operations.
+    Uses a broadcast join for the label dataset when it is within a size threshold to improve
+    the efficiency of the join, under the assumption that size(labels_df) << size(features_df).
+    TODO(ML-40580): automatically switch labels_df and features_df based on size
+    The join operation is performed as follows:
+    1. Drop non-join key (primary and timestamp keys) columns from labels and features DataFrames
+    2. Broadcast join labels onto features DataFrame if within broadcast threshold.
+    3. Select maximum timestamp for each primary key
+    4. Rejoin non-primary key columns from features DataFrame to get features data
+    5. Rejoin non-primary key columns from labels DataFrame to get joint data
+
+    Parameters:
+    labels_df (DataFrame): The labels dataframe to join.
+    features_df (DataFrame): The features dataframe to join.
+    primary_keys (List[str]): The primary keys used for joining.
+    timestamp_key (str): The timestamp key used for joining.
+    lookback_window_seconds (Optional[float]): The lookback window in seconds.
+        If provided, the join operation will only consider records within this window.
+
+    Returns:
+    DataFrame: The result of the as-of join operation.
+    """
+    labels_df_keys_only = labels_df.select(
+        [F.col(key) for key in primary_keys] + [F.col(timestamp_key)]
+    )
+
+    # Broadcast labels DataFrame if within the broadcast threshold
+    if _df_in_size_threshold(labels_df_keys_only, BROADCAST_JOIN_THRESHOLD.get()):
+        labels_df_keys_only = F.broadcast(labels_df_keys_only)
+
+    # Drop non-primary key columns from features DataFrame
+    features_df_keys_only = features_df.select(
+        [F.col(key).alias(f"__features_pk_{key}") for key in primary_keys]
+        + [F.col(timestamp_key).alias("__features_tk")]
+    )
+
+    # Create join conditions
+    join_conditions = [
+        labels_df_keys_only[key] == features_df_keys_only[f"__features_pk_{key}"]
+        for key in primary_keys
+    ]
+    join_conditions = reduce(lambda x, y: x & y, join_conditions)
+    join_conditions &= (
+        labels_df_keys_only[timestamp_key] >= features_df_keys_only["__features_tk"]
+    )
+    if lookback_window_seconds is not None:
+        join_conditions &= (
+            unix_timestamp(labels_df_keys_only[timestamp_key])
+            - unix_timestamp(features_df_keys_only["__features_tk"])
+        ) <= lookback_window_seconds
+
+    # Join labels and features DataFrames
+    labels_df_keys_with_features_keys = labels_df_keys_only.join(
+        features_df_keys_only, on=join_conditions, how="left"
+    )
+
+    # Find the feature's max timestamp for each primary key and timestamp key in labels
+    labels_df_keys_with_features_keys = labels_df_keys_with_features_keys.groupBy(
+        [labels_df_keys_only[key] for key in primary_keys] + [F.col(timestamp_key)]
+    ).agg(F.max("__features_tk").alias("__max_ts"))
+
+    if _df_in_size_threshold(
+        labels_df_keys_with_features_keys, BROADCAST_JOIN_THRESHOLD.get()
+    ):
+        labels_df_keys_with_features_keys = F.broadcast(
+            labels_df_keys_with_features_keys
+        )
+
+    # Rejoin features DataFrame to get the features data
+    join_conditions = [
+        features_df[key] == labels_df_keys_with_features_keys[key]
+        for key in primary_keys
+    ]
+    join_conditions = reduce(lambda x, y: x & y, join_conditions)
+    join_conditions &= (
+        features_df[timestamp_key] == labels_df_keys_with_features_keys["__max_ts"]
+    )
+
+    features = features_df.join(
+        labels_df_keys_with_features_keys,
+        on=join_conditions,
+        how="inner",
+    )
+
+    pk_columns_to_drop = [
+        labels_df_keys_with_features_keys[key] for key in primary_keys
+    ]
+    features = features.drop(*pk_columns_to_drop).drop(
+        features_df[timestamp_key], labels_df_keys_with_features_keys["__max_ts"]
+    )
+    features = features.dropDuplicates(primary_keys + [timestamp_key])
+    # Rejoin labels DataFrame if columns were dropped
+    joint_df = labels_df.join(features, on=primary_keys + [timestamp_key], how="left")
+    return joint_df
+
+
+def _df_in_size_threshold(df, threshold) -> float:
+    # Default to within threshold if the size cannot be determined
+    try:
+        num_bytes = _get_df_size_from_spark_plan(df)
+    except Exception as e:
+        num_bytes = 0
+    return num_bytes <= threshold
+
+
+def _get_df_size_from_spark_plan(df: DataFrame) -> float:
+    """
+    Get the estimated size of a DataFrame in bytes.
+    Alternative approach: read the query plan information directly from the DataFrame's SparkSession.
+
+    Parameters:
+        df: the Spark DataFrame whose size should be estimated
+
+    Returns:
+        float: the estimated size in bytes
+
+    Raises:
+        ValueError: if the size information cannot be parsed from the query plan
+    """
+    # Get the SparkSession directly from the DataFrame
+    spark = df.sql_ctx.sparkSession
+
+    # Create a temporary view
+    df.createOrReplaceTempView("temp_view_for_size")
+
+    # Get the query plan
+    plan = spark.sql("explain cost select * from temp_view_for_size").collect()[0][0]
+
+    # Parse the size information
+    search_result = re.search(r"sizeInBytes=.*(['\)])", plan, re.MULTILINE)
+    if search_result is None:
+        raise ValueError("Unable to obtain sizeInBytes from the Spark query plan")
+
+    # Extract the size and units
+    result = search_result.group(0).replace(")", "")
+    size, units = result.split("=")[1].split()
+
+    # Unit conversion map
+    units_map = {
+        "TiB": 1024**4,  # tebibytes
+        "GiB": 1024**3,  # gibibytes
+        "MiB": 1024**2,  # mebibytes
+        "KiB": 1024,  # kibibytes
+        "B": 1  # bytes (handles the case where no unit is given)
+    }
+
+    # Clean up the unit string and convert
+    clean_units = units.rstrip(",")
+    return float(size) * units_map.get(clean_units, 1)  # fall back to the raw value
+
+
+def _spark_join_features(
+    df: DataFrame,
+    df_keys: List[str],
+    feature_table_data: DataFrame,
+    feature_table_keys: List[str],
+    feature_to_output_name: Dict[str, str],
+) -> DataFrame:
+    """
+    Helper to join `feature_name` from `feature_table_data` into `df`.
+
+    This join uses a temporary table that contains only the keys and feature
+    from the feature table. The temporary table aliases the keys to match
+    the lookup keys and the feature to match the output_name.
+
+    Aliasing the keys allows us to join on name instead of by column,
+    which prevents duplicate column names after the join.
+    (see: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html)
+
+    The joined-in feature is guaranteed to be unique because FeatureSpec
+    columns must be unique and the join is skipped if the feature
+    already exists in the DataFrame.
+    """
+
+    # Alias feature table's keys to DataFrame lookup keys
+    ft_key_aliases = [
+        feature_table_data[ft_key].alias(df_key)
+        for (ft_key, df_key) in zip(feature_table_keys, df_keys)
+    ]
+    # Alias features to corresponding output names
+    ft_feature_aliases = [
+        feature_table_data[feature_name].alias(output_name)
+        for feature_name, output_name in feature_to_output_name.items()
+        # Skip the join if the feature is already in the DataFrame and therefore overridden
+        if output_name not in df.columns
+    ]
+    # Select key and feature columns from feature table
+    feature_and_keys = feature_table_data.select(ft_key_aliases + ft_feature_aliases)
+    # Join features into the DataFrame
+    return df.join(feature_and_keys, df_keys, how="left")
+
+
+def _validate_join_keys(
+    feature_column_info: FeatureColumnInfo,
+    df: DataFrame,
+    feature_table_metadata: FeatureTable,
+    feature_table_data: DataFrame,
+    is_timestamp_key: bool = False,
+):
+    join_error_phrase = (
+        f"Unable to join feature table '{feature_column_info.table_name}'"
+    )
+    feature_column_info_keys = (
+        feature_column_info.timestamp_lookup_key
+        if is_timestamp_key
+        else feature_column_info.lookup_key
+    )
+    feature_table_keys = (
+        feature_table_metadata.timestamp_keys
+        if is_timestamp_key
+        else feature_table_metadata.primary_keys
+    )
+
+    lookup_key_kind = "timestamp lookup key" if is_timestamp_key else "lookup key"
+    feature_table_key_kind = "timestamp key" if is_timestamp_key else "primary key"
+
+    # Validate df has necessary keys
+    missing_df_keys = list(
+        filter(lambda df_key: df_key not in df.columns, feature_column_info_keys)
+    )
+    if missing_df_keys:
+        missing_keys = ", ".join([f"'{key}'" for key in missing_df_keys])
+        raise ValueError(
+            f"{join_error_phrase} because {lookup_key_kind} {missing_keys} not found in DataFrame."
+        )
+    # Validate feature table has necessary keys
+    missing_ft_keys = list(
+        filter(
+            lambda ft_key: ft_key not in feature_table_data.columns, feature_table_keys
+        )
+    )
+    if missing_ft_keys:
+        missing_keys = ", ".join([f"'{key}'" for key in missing_ft_keys])
+        raise ValueError(
+            f"{join_error_phrase} because {feature_table_key_kind} {missing_keys} not found in feature table."
+        )
+
+    # Validate number of feature table keys matches number of df lookup keys
+    if len(feature_column_info_keys) != len(feature_table_keys):
+        raise ValueError(
+            f"{join_error_phrase} because "
+            f"number of {feature_table_key_kind}s ({feature_table_keys}) "
+            f"does not match "
+            f"number of {lookup_key_kind}s ({feature_column_info_keys})."
+        )
+
+    # Validate feature table keys match types of df keys. The number of keys is expected to be the same.
+    # for (df_key, ft_key) in zip(feature_column_info_keys, feature_table_keys):
+    #     df_key_type = DataType.from_spark_type(df.schema[df_key].dataType)
+    #     ft_key_type = DataType.from_spark_type(
+    #         feature_table_data.schema[ft_key].dataType
+    #     )
+    #     if df_key_type != ft_key_type:
+    #         raise ValueError(
+    #             f"{join_error_phrase} because {feature_table_key_kind} '{ft_key}' has type '{DataType.to_string(ft_key_type)}' "
+    #             f"but corresponding {lookup_key_kind} '{df_key}' has type '{DataType.to_string(df_key_type)}' in DataFrame."
+    #         )
+
+
+def _validate_join_feature_data(
+    df: DataFrame,
+    features_to_join: List[FeatureColumnInfo],
+    feature_table_metadata_map: Dict[str, FeatureTable],
+    feature_table_data_map: Dict[str, DataFrame],
+):
+    for feature_info in features_to_join:
+        feature_table_metadata = feature_table_metadata_map[feature_info.table_name]
+        feature_table_data = feature_table_data_map[feature_info.table_name]
+        # Validate feature table primary keys match length/type of df lookup keys
+        _validate_join_keys(
+            feature_info,
+            df,
+            feature_table_metadata,
+            feature_table_data,
+            is_timestamp_key=False,
+        )
+        # Validate feature table timestamp keys match length/type of df timestamp lookup keys
+        _validate_join_keys(
+            feature_info,
+            df,
+            feature_table_metadata,
+            feature_table_data,
+            is_timestamp_key=True,
+        )
+
+
+def join_feature_data_if_not_overridden(
+    feature_spec: FeatureSpec,
+    df: DataFrame,
+    features_to_join: List[FeatureColumnInfo],
+    feature_table_metadata_map: Dict[str, FeatureTable],
+    feature_table_data_map: Dict[str, DataFrame],
+    use_spark_native_join: Optional[bool] = False,
+) -> DataFrame:
+    """
+    Joins `df` with features specified by `feature_spec.feature_column_infos` if they do not already exist.
+
+    Return column order is df.columns + newly joined features. The newly joined feature order is not guaranteed to
+    match `feature_spec.feature_column_infos` as feature lookups are first grouped by table for efficiency.
+
+    Before joining, it checks that:
+    1. Feature table keys match length and types of `df` lookup keys specified by FeatureSpec
+    2. `df` contains lookup keys specified by FeatureSpec
+    3. Feature table timestamp lookup keys match length and types of `df` timestamp lookup keys if specified by FeatureSpec
+    4. `df` contains timestamp lookup keys if specified by FeatureSpec
+    """
+    _validate_join_feature_data(
+        df=df,
+        features_to_join=features_to_join,
+        feature_table_metadata_map=feature_table_metadata_map,
+        feature_table_data_map=feature_table_data_map,
+    )
+
+    # Helper class to group all unique combinations of feature table names and lookup keys.
+    # All features in each of these groups will be JOINed with the training df using a single JOIN.
+    class JoinDataKey:
+        def __init__(
+            self,
+            feature_table: str,
+            lookup_key: List[str],
+            timestamp_lookup_key: List[str],
+            lookback_window: Optional[datetime.timedelta] = None,
+        ):
+            self.feature_table = feature_table
+            self.lookup_key = lookup_key
+            self.timestamp_lookup_key = timestamp_lookup_key
+            self.lookback_window = lookback_window
+
+        def __hash__(self):
+            return (
+                hash(self.feature_table)
+                + hash(tuple(self.lookup_key))
+                + hash(tuple(self.timestamp_lookup_key))
+                + hash(self.lookback_window)
+            )
+
+        def __eq__(self, other):
+            return (
+                self.feature_table == other.feature_table
+                and self.lookup_key == other.lookup_key
+                and self.timestamp_lookup_key == other.timestamp_lookup_key
+                and self.lookback_window == other.lookback_window
+            )
+
+    # Iterate through the list of FeatureColumnInfo and group features by name of the
+    # feature table and lookup key(s) and timestamp lookup key(s)
+    table_join_data = defaultdict(dict)
+    lookback_windows = {
+        t.table_name: t.lookback_window for t in feature_spec.table_infos
+    }
+    for feature_info in features_to_join:
+        join_data_key = JoinDataKey(
+            feature_info.table_name,
+            feature_info.lookup_key,
+            feature_info.timestamp_lookup_key,
+            lookback_windows[feature_info.table_name],
+        )
+        table_join_data[join_data_key][
+            feature_info.feature_name
+        ] = feature_info.output_name
+
+    for join_data_key, feature_to_output_name in table_join_data.items():
+
+        feature_table_metadata = feature_table_metadata_map[join_data_key.feature_table]
+        feature_table_data = feature_table_data_map[join_data_key.feature_table]
+
+        if join_data_key.timestamp_lookup_key:
+            # If lookback window is set to 0, then perform exact join instead of asof join to get perf benefits.
+            if (
+                join_data_key.lookback_window is not None
+                and join_data_key.lookback_window == 0
+            ):
+                df = _spark_join_features(
+                    df=df,
+                    df_keys=join_data_key.lookup_key
+                    + join_data_key.timestamp_lookup_key,
+                    feature_table_data=feature_table_data,
+                    feature_table_keys=feature_table_metadata.primary_keys
+                    + feature_table_metadata.timestamp_keys,
+                    feature_to_output_name=feature_to_output_name,
+                )
+            else:
+                df = _spark_asof_join_features(
+                    df=df,
+                    df_lookup_keys=join_data_key.lookup_key,
+                    df_timestamp_lookup_key=join_data_key.timestamp_lookup_key[0],
+                    feature_table_data=feature_table_data,
+                    feature_table_keys=feature_table_metadata.primary_keys,
+                    feature_table_timestamp_key=feature_table_metadata.timestamp_keys[
+                        0
+                    ],
+                    feature_to_output_name=feature_to_output_name,
+                    lookback_window_seconds=join_data_key.lookback_window,
+                    use_spark_native_join=use_spark_native_join,
+                )
+        else:
+            df = _spark_join_features(
+                df=df,
+                df_keys=join_data_key.lookup_key,
+                feature_table_data=feature_table_data,
+                feature_table_keys=feature_table_metadata.primary_keys,
+                feature_to_output_name=feature_to_output_name,
+            )
+    return df
+
+
+def get_feature_lookups_with_full_table_names(
+    feature_lookups: List[FeatureLookup], current_catalog: str, current_schema: str
+) -> List[FeatureLookup]:
+    """
+    Takes in a list of FeatureLookups, and returns copies with reformatted table names.
+    """
+    table_names = {fl.table_name for fl in feature_lookups}
+    uc_utils._check_qualified_table_names(table_names)
+    uc_utils._verify_all_tables_are_either_in_uc_or_in_hms(
+        table_names, current_catalog, current_schema
+    )
+    standardized_feature_lookups = []
+    for fl in feature_lookups:
+        fl_copy = copy.deepcopy(fl)
+        fl_copy._table_name = uc_utils.get_full_table_name(
+            fl_copy.table_name, current_catalog, current_schema
+        )
+        standardized_feature_lookups.append(fl_copy)
+    return standardized_feature_lookups