wedata-feature-engineering 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. wedata/__init__.py +1 -1
  2. wedata/feature_store/client.py +113 -41
  3. wedata/feature_store/constants/constants.py +19 -0
  4. wedata/feature_store/entities/column_info.py +4 -4
  5. wedata/feature_store/entities/feature_lookup.py +5 -1
  6. wedata/feature_store/entities/feature_spec.py +46 -46
  7. wedata/feature_store/entities/feature_table.py +42 -99
  8. wedata/feature_store/entities/training_set.py +13 -12
  9. wedata/feature_store/feature_table_client/feature_table_client.py +85 -30
  10. wedata/feature_store/spark_client/spark_client.py +30 -56
  11. wedata/feature_store/training_set_client/training_set_client.py +209 -38
  12. wedata/feature_store/utils/common_utils.py +213 -3
  13. wedata/feature_store/utils/feature_lookup_utils.py +6 -6
  14. wedata/feature_store/utils/feature_spec_utils.py +6 -6
  15. wedata/feature_store/utils/feature_utils.py +5 -5
  16. wedata/feature_store/utils/on_demand_utils.py +107 -0
  17. wedata/feature_store/utils/schema_utils.py +1 -1
  18. wedata/feature_store/utils/signature_utils.py +205 -0
  19. wedata/feature_store/utils/training_set_utils.py +18 -19
  20. wedata/feature_store/utils/uc_utils.py +1 -1
  21. {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
  22. wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
  23. feature_store/__init__.py +0 -6
  24. feature_store/client.py +0 -169
  25. feature_store/constants/__init__.py +0 -0
  26. feature_store/constants/constants.py +0 -28
  27. feature_store/entities/__init__.py +0 -0
  28. feature_store/entities/column_info.py +0 -117
  29. feature_store/entities/data_type.py +0 -92
  30. feature_store/entities/environment_variables.py +0 -55
  31. feature_store/entities/feature.py +0 -53
  32. feature_store/entities/feature_column_info.py +0 -64
  33. feature_store/entities/feature_function.py +0 -55
  34. feature_store/entities/feature_lookup.py +0 -179
  35. feature_store/entities/feature_spec.py +0 -454
  36. feature_store/entities/feature_spec_constants.py +0 -25
  37. feature_store/entities/feature_table.py +0 -164
  38. feature_store/entities/feature_table_info.py +0 -40
  39. feature_store/entities/function_info.py +0 -184
  40. feature_store/entities/on_demand_column_info.py +0 -44
  41. feature_store/entities/source_data_column_info.py +0 -21
  42. feature_store/entities/training_set.py +0 -134
  43. feature_store/feature_table_client/__init__.py +0 -0
  44. feature_store/feature_table_client/feature_table_client.py +0 -313
  45. feature_store/spark_client/__init__.py +0 -0
  46. feature_store/spark_client/spark_client.py +0 -286
  47. feature_store/training_set_client/__init__.py +0 -0
  48. feature_store/training_set_client/training_set_client.py +0 -196
  49. feature_store/utils/__init__.py +0 -0
  50. feature_store/utils/common_utils.py +0 -96
  51. feature_store/utils/feature_lookup_utils.py +0 -570
  52. feature_store/utils/feature_spec_utils.py +0 -286
  53. feature_store/utils/feature_utils.py +0 -73
  54. feature_store/utils/schema_utils.py +0 -117
  55. feature_store/utils/topological_sort.py +0 -158
  56. feature_store/utils/training_set_utils.py +0 -580
  57. feature_store/utils/uc_utils.py +0 -281
  58. feature_store/utils/utils.py +0 -252
  59. feature_store/utils/validation_utils.py +0 -55
  60. wedata/feature_store/utils/utils.py +0 -252
  61. wedata_feature_engineering-0.1.5.dist-info/RECORD +0 -79
  62. {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
  63. {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/top_level.txt +0 -0
@@ -1,570 +0,0 @@
- import copy
- import datetime
- import logging
- import re
- from collections import defaultdict
- from functools import reduce
- from typing import Dict, List, Optional, Tuple
-
- from pyspark.sql import DataFrame, Window
- from pyspark.sql import functions as F
- from pyspark.sql.functions import sum, unix_timestamp
-
- from feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
- from feature_store.entities.feature_column_info import FeatureColumnInfo
- from feature_store.entities.feature_lookup import FeatureLookup
- from feature_store.entities.feature_spec import FeatureSpec
- from feature_store.entities.feature_table import FeatureTable
-
- from feature_store.utils import common_utils, validation_utils, uc_utils
-
- _logger = logging.getLogger(__name__)
-
-
- def _spark_asof_join_features(
-     df: DataFrame,
-     df_lookup_keys: List[str],
-     df_timestamp_lookup_key: str,
-     feature_table_data: DataFrame,
-     feature_table_keys: List[str],
-     feature_table_timestamp_key: str,
-     feature_to_output_name: Dict[str, str],
-     lookback_window_seconds: Optional[float] = None,
-     use_spark_native_join: Optional[bool] = False,
- ) -> DataFrame:
-     # Alias feature table's keys to DataFrame lookup keys
-     ft_key_aliases = [
-         feature_table_data[ft_key].alias(df_key)
-         for (ft_key, df_key) in zip(feature_table_keys, df_lookup_keys)
-     ]
-     # Alias features to corresponding output names
-     ft_features = [
-         (feature_name, output_name)
-         for feature_name, output_name in feature_to_output_name.items()
-         # Skip the join if the feature is already in the DataFrame and therefore overridden
-         if output_name not in df.columns
-     ]
-     ft_feature_aliases = [
-         feature_table_data[feature_name].alias(output_name)
-         for feature_name, output_name in ft_features
-     ]
-     # Alias feature table's timestamp key to the DataFrame timestamp lookup key
-     ft_timestamp_key_aliases = [
-         feature_table_data[feature_table_timestamp_key].alias(df_timestamp_lookup_key)
-     ]
-     # Select key, timestamp key, and feature columns from the feature table
-     feature_and_keys = feature_table_data.select(
-         ft_key_aliases + ft_timestamp_key_aliases + ft_feature_aliases
-     )
-
-     _logger.debug(
-         "Using native spark for point in time join"
-         if use_spark_native_join
-         else "Using tempo for point in time join"
-     )
-
-     if use_spark_native_join:
-         joined_df = _spark_asof_join_features_native(
-             labels_df=df,
-             features_df=feature_and_keys,
-             primary_keys=df_lookup_keys,
-             timestamp_key=df_timestamp_lookup_key,
-             lookback_window_seconds=lookback_window_seconds,
-         )
-     else:
-         joined_df = _spark_asof_join_features_tempo(
-             df=df,
-             df_lookup_keys=df_lookup_keys,
-             df_timestamp_lookup_key=df_timestamp_lookup_key,
-             feature_and_keys=feature_and_keys,
-             ft_features=ft_features,
-             lookback_window_seconds=lookback_window_seconds,
-         )
-     return joined_df
-
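
For orientation (not part of the diff): the helper above dispatches a point-in-time join in which each label row picks up the most recent feature row with a matching key and an earlier timestamp. Below is a minimal, self-contained PySpark sketch of that behaviour; the table and column names are invented for illustration.

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()

    # Toy label and feature data (names are illustrative only)
    labels = spark.createDataFrame(
        [("u1", "2024-01-03"), ("u1", "2024-01-10")], ["user_id", "ts"]
    ).withColumn("ts", F.to_timestamp("ts"))
    features = spark.createDataFrame(
        [("u1", "2024-01-01", 0.2), ("u1", "2024-01-08", 0.9)],
        ["f_user_id", "feature_ts", "score"],
    ).withColumn("feature_ts", F.to_timestamp("feature_ts"))

    # As-of join: for each label row, keep the latest feature row observed at or before ts
    cond = (labels["user_id"] == features["f_user_id"]) & (labels["ts"] >= features["feature_ts"])
    joined = labels.join(features, cond, "left")
    w = Window.partitionBy("user_id", "ts").orderBy(F.col("feature_ts").desc())
    result = (
        joined.withColumn("rn", F.row_number().over(w))
        .filter(F.col("rn") == 1)
        .select("user_id", "ts", "score")
    )
    result.show()  # 2024-01-03 -> 0.2, 2024-01-10 -> 0.9
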
- def _spark_asof_join_features_tempo(
-     df: DataFrame,
-     feature_df: DataFrame,
-     lookup_keys: List[str],
-     timestamp_key: str,
-     lookback_window: Optional[float] = None
- ) -> DataFrame:
-     """
-     Custom implementation of the as-of join.
-     :param df: primary DataFrame
-     :param feature_df: feature table DataFrame
-     :param lookup_keys: list of join keys
-     :param timestamp_key: timestamp column name
-     :param lookback_window: maximum lookback window in seconds
-     :return: joined DataFrame
-     """
-
-     # 1. Keep only the key columns and the timestamp column
-     df_keys = df.select(lookup_keys + [timestamp_key])
-     feature_keys = feature_df.select(lookup_keys + [timestamp_key])
-
-     # 2. Build the join condition
-     join_cond = [df_keys[k] == feature_keys[k] for k in lookup_keys]
-     join_cond = reduce(lambda x, y: x & y, join_cond)
-     join_cond &= (df_keys[timestamp_key] >= feature_keys[timestamp_key])
-
-     if lookback_window:
-         join_cond &= (
-             (F.unix_timestamp(df_keys[timestamp_key]) -
-              F.unix_timestamp(feature_keys[timestamp_key])) <= lookback_window
-         )
-
-     # 3. Perform the join and find the latest feature record for each primary-table record
-     joined = df_keys.join(feature_keys, join_cond, "left")
-
-     # Group by the primary-table keys and pick the maximum feature timestamp
-     window = Window.partitionBy(lookup_keys).orderBy(F.desc(timestamp_key))
-     latest_features = (
-         joined
-         .withColumn("rn", F.row_number().over(window))
-         .filter(F.col("rn") == 1)
-         .drop("rn")
-     )
-
-     # 4. Final join to pull in the full feature data
-     result = df.join(
-         latest_features.select(lookup_keys + [timestamp_key, "feature_col"]),
-         lookup_keys + [timestamp_key],
-         "left"
-     )
-
-     return result
-
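
The lookback filter above is expressed in seconds over epoch timestamps. A tiny standalone illustration of that condition on toy data (column names invented):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()

    # Toy joined rows: (label timestamp, candidate feature timestamp)
    rows = spark.createDataFrame(
        [("2024-01-10 00:00:00", "2024-01-08 00:00:00"),
         ("2024-01-10 00:00:00", "2023-11-01 00:00:00")],
        ["label_ts", "feature_ts"],
    )
    lookback_window_seconds = 7 * 24 * 3600  # keep only features from the last 7 days (illustrative)
    within_window = rows.filter(
        (F.unix_timestamp("label_ts") - F.unix_timestamp("feature_ts")) <= lookback_window_seconds
    )
    within_window.show()  # only the 2024-01-08 candidate survives
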
- def _spark_asof_join_features_native(
-     labels_df: DataFrame,
-     features_df: DataFrame,
-     primary_keys: List[str],
-     timestamp_key: str,
-     lookback_window_seconds: Optional[float] = None,
- ):
-     """
-     Performs an as-of join between two DataFrames using native Spark operations.
-     Uses a broadcast join for the label dataset when it is within a size threshold to improve
-     the efficiency of the join, under the assumption that size(labels_df) << size(features_df).
-     TODO(ML-40580): automatically switch labels_df and features_df based on size
-     The join operation is performed as follows:
-         1. Drop non-join-key (primary and timestamp key) columns from the labels and features DataFrames.
-         2. Broadcast join labels onto the features DataFrame if within the broadcast threshold.
-         3. Select the maximum timestamp for each primary key.
-         4. Rejoin non-primary-key columns from the features DataFrame to get the features data.
-         5. Rejoin non-primary-key columns from the labels DataFrame to get the joint data.
-
-     Parameters:
-         labels_df (DataFrame): The labels dataframe to join.
-         features_df (DataFrame): The features dataframe to join.
-         primary_keys (List[str]): The primary keys used for joining.
-         timestamp_key (str): The timestamp key used for joining.
-         lookback_window_seconds (Optional[float]): The lookback window in seconds.
-             If provided, the join operation will only consider records within this window.
-
-     Returns:
-         DataFrame: The result of the as-of join operation.
-     """
-     labels_df_keys_only = labels_df.select(
-         [F.col(key) for key in primary_keys] + [F.col(timestamp_key)]
-     )
-
-     # Broadcast labels DataFrame if within the broadcast threshold
-     if _df_in_size_threshold(labels_df_keys_only, BROADCAST_JOIN_THRESHOLD.get()):
-         labels_df_keys_only = F.broadcast(labels_df_keys_only)
-
-     # Drop non-primary key columns from features DataFrame
-     features_df_keys_only = features_df.select(
-         [F.col(key).alias(f"__features_pk_{key}") for key in primary_keys]
-         + [F.col(timestamp_key).alias("__features_tk")]
-     )
-
-     # Create join conditions
-     join_conditions = [
-         labels_df_keys_only[key] == features_df_keys_only[f"__features_pk_{key}"]
-         for key in primary_keys
-     ]
-     join_conditions = reduce(lambda x, y: x & y, join_conditions)
-     join_conditions &= (
-         labels_df_keys_only[timestamp_key] >= features_df_keys_only["__features_tk"]
-     )
-     if lookback_window_seconds is not None:
-         join_conditions &= (
-             unix_timestamp(labels_df_keys_only[timestamp_key])
-             - unix_timestamp(features_df_keys_only["__features_tk"])
-         ) <= lookback_window_seconds
-
-     # Join labels and features DataFrames
-     labels_df_keys_with_features_keys = labels_df_keys_only.join(
-         features_df_keys_only, on=join_conditions, how="left"
-     )
-
-     # Find the features' max timestamp for each primary key and timestamp key in labels
-     labels_df_keys_with_features_keys = labels_df_keys_with_features_keys.groupBy(
-         [labels_df_keys_only[key] for key in primary_keys] + [F.col(timestamp_key)]
-     ).agg(F.max("__features_tk").alias("__max_ts"))
-
-     if _df_in_size_threshold(
-         labels_df_keys_with_features_keys, BROADCAST_JOIN_THRESHOLD.get()
-     ):
-         labels_df_keys_with_features_keys = F.broadcast(
-             labels_df_keys_with_features_keys
-         )
-
-     # Rejoin features DataFrame to get the features data
-     join_conditions = [
-         features_df[key] == labels_df_keys_with_features_keys[key]
-         for key in primary_keys
-     ]
-     join_conditions = reduce(lambda x, y: x & y, join_conditions)
-     join_conditions &= (
-         features_df[timestamp_key] == labels_df_keys_with_features_keys["__max_ts"]
-     )
-
-     features = features_df.join(
-         labels_df_keys_with_features_keys,
-         on=join_conditions,
-         how="inner",
-     )
-
-     pk_columns_to_drop = [
-         labels_df_keys_with_features_keys[key] for key in primary_keys
-     ]
-     features = features.drop(*pk_columns_to_drop).drop(
-         features_df[timestamp_key], labels_df_keys_with_features_keys["__max_ts"]
-     )
-     features = features.dropDuplicates(primary_keys + [timestamp_key])
-     # Rejoin labels DataFrame if columns were dropped
-     joint_df = labels_df.join(features, on=primary_keys + [timestamp_key], how="left")
-     return joint_df
-
-
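
The native path above avoids a window function by reducing to one row per label key and then rejoining. Here is a small, self-contained sketch of that reduce step, with invented column names; the `BROADCAST_JOIN_THRESHOLD` size check is replaced by an unconditional broadcast hint.

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()

    # Toy output of the key-only join: one label key may match several feature timestamps
    joined_keys = spark.createDataFrame(
        [("u1", "2024-01-10", "2024-01-01"),
         ("u1", "2024-01-10", "2024-01-08")],
        ["user_id", "ts", "feature_ts"],
    )

    # Step 3 of the docstring above: keep only the newest matching feature timestamp per label key
    max_ts_per_key = joined_keys.groupBy("user_id", "ts").agg(
        F.max("feature_ts").alias("__max_ts")
    )

    # The reduced table is small, so it is a natural broadcast candidate for the rejoin in step 4
    max_ts_per_key = F.broadcast(max_ts_per_key)
    max_ts_per_key.show()  # u1 / 2024-01-10 -> __max_ts 2024-01-08
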
- def _df_in_size_threshold(df, threshold) -> float:
-     # Default to within threshold if the size cannot be determined
-     try:
-         num_bytes = _get_df_size_from_spark_plan(df)
-     except Exception as e:
-         num_bytes = 0
-     return num_bytes <= threshold
-
-
- def _get_df_size_from_spark_plan(df: DataFrame) -> float:
-     """
-     Get the estimated size of a DataFrame in bytes.
-     Alternative approach: read the query plan information directly from the DataFrame's SparkSession.
-
-     Parameters:
-         df: the Spark DataFrame to measure
-
-     Returns:
-         float: the estimated number of bytes
-
-     Raises:
-         ValueError: if the size information cannot be parsed from the query plan
-     """
-     # Get the SparkSession directly from the DataFrame
-     spark = df.sql_ctx.sparkSession
-
-     # Create a temporary view
-     df.createOrReplaceTempView("temp_view_for_size")
-
-     # Get the query plan
-     plan = spark.sql("explain cost select * from temp_view_for_size").collect()[0][0]
-
-     # Parse the size information
-     search_result = re.search(r"sizeInBytes=.*(['\)])", plan, re.MULTILINE)
-     if search_result is None:
-         raise ValueError("Unable to extract sizeInBytes from the Spark query plan")
-
-     # Extract the size and its unit
-     result = search_result.group(0).replace(")", "")
-     size, units = result.split("=")[1].split()
-
-     # Unit conversion map
-     units_map = {
-         "TiB": 1024**4,  # tebibytes
-         "GiB": 1024**3,  # gibibytes
-         "MiB": 1024**2,  # mebibytes
-         "KiB": 1024,     # kibibytes
-         "B": 1           # bytes (handles values without a unit)
-     }
-
-     # Clean up the unit string and convert
-     clean_units = units.rstrip(",")
-     return float(size) * units_map.get(clean_units, 1)  # fall back to the raw value
-
-
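
A standalone illustration of the plan parsing performed above, run against a hard-coded plan fragment rather than a live `explain cost` output (the fragment is invented; the regex and unit arithmetic mirror the function):

    import re

    plan = "Join Inner, Statistics(sizeInBytes=12.5 MiB)"
    match = re.search(r"sizeInBytes=.*(['\)])", plan, re.MULTILINE)
    result = match.group(0).replace(")", "")      # "sizeInBytes=12.5 MiB"
    size, units = result.split("=")[1].split()    # "12.5", "MiB"
    units_map = {"TiB": 1024**4, "GiB": 1024**3, "MiB": 1024**2, "KiB": 1024, "B": 1}
    num_bytes = float(size) * units_map.get(units.rstrip(","), 1)
    print(num_bytes)  # 13107200.0
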
- def _spark_join_features(
-     df: DataFrame,
-     df_keys: List[str],
-     feature_table_data: DataFrame,
-     feature_table_keys: List[str],
-     feature_to_output_name: Dict[str, str],
- ) -> DataFrame:
-     """
-     Helper to join `feature_name` from `feature_table_data` into `df`.
-
-     This join uses a temporary table that contains only the keys and feature
-     from the feature table. The temporary table aliases the keys to match
-     the lookup keys and the feature to match the output_name.
-
-     Aliasing the keys allows us to join on name instead of by column,
-     which prevents duplicate column names after the join.
-     (see: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html)
-
-     The joined-in feature is guaranteed to be unique because FeatureSpec
-     columns must be unique and the join is skipped if the feature
-     already exists in the DataFrame.
-     """
-
-     # Alias feature table's keys to DataFrame lookup keys
-     ft_key_aliases = [
-         feature_table_data[ft_key].alias(df_key)
-         for (ft_key, df_key) in zip(feature_table_keys, df_keys)
-     ]
-     # Alias features to corresponding output names
-     ft_feature_aliases = [
-         feature_table_data[feature_name].alias(output_name)
-         for feature_name, output_name in feature_to_output_name.items()
-         # Skip the join if the feature is already in the DataFrame and therefore overridden
-         if output_name not in df.columns
-     ]
-     # Select key and feature columns from the feature table
-     feature_and_keys = feature_table_data.select(ft_key_aliases + ft_feature_aliases)
-     # Join the features onto the DataFrame
-     return df.join(feature_and_keys, df_keys, how="left")
-
-
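
A compact, self-contained illustration of the aliasing trick described in the docstring above (table and column names are invented): the feature table's primary key is renamed to the DataFrame's lookup key and the feature to its output name, so the join can be done by column name without producing duplicate columns.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    df = spark.createDataFrame([("u1", 1)], ["customer_id", "label"])
    feature_table = spark.createDataFrame([("u1", 0.42)], ["user_id", "score"])

    # Alias key -> lookup key and feature -> output name, then join by name
    feature_and_keys = feature_table.select(
        feature_table["user_id"].alias("customer_id"),
        feature_table["score"].alias("user_score"),
    )
    joined = df.join(feature_and_keys, ["customer_id"], how="left")
    joined.show()  # columns: customer_id, label, user_score (no duplicated key column)
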
- def _validate_join_keys(
-     feature_column_info: FeatureColumnInfo,
-     df: DataFrame,
-     feature_table_metadata: FeatureTable,
-     feature_table_data: DataFrame,
-     is_timestamp_key: bool = False,
- ):
-     join_error_phrase = (
-         f"Unable to join feature table '{feature_column_info.table_name}'"
-     )
-     feature_column_info_keys = (
-         feature_column_info.timestamp_lookup_key
-         if is_timestamp_key
-         else feature_column_info.lookup_key
-     )
-     feature_table_keys = (
-         feature_table_metadata.timestamp_keys
-         if is_timestamp_key
-         else feature_table_metadata.primary_keys
-     )
-
-     lookup_key_kind = "timestamp lookup key" if is_timestamp_key else "lookup key"
-     feature_table_key_kind = "timestamp key" if is_timestamp_key else "primary key"
-
-     # Validate df has necessary keys
-     missing_df_keys = list(
-         filter(lambda df_key: df_key not in df.columns, feature_column_info_keys)
-     )
-     if missing_df_keys:
-         missing_keys = ", ".join([f"'{key}'" for key in missing_df_keys])
-         raise ValueError(
-             f"{join_error_phrase} because {lookup_key_kind} {missing_keys} not found in DataFrame."
-         )
-     # Validate feature table has necessary keys
-     missing_ft_keys = list(
-         filter(
-             lambda ft_key: ft_key not in feature_table_data.columns, feature_table_keys
-         )
-     )
-     if missing_ft_keys:
-         missing_keys = ", ".join([f"'{key}'" for key in missing_ft_keys])
-         raise ValueError(
-             f"{join_error_phrase} because {feature_table_key_kind} {missing_keys} not found in feature table."
-         )
-
-     # Validate number of feature table keys matches number of df lookup keys
-     if len(feature_column_info_keys) != len(feature_table_keys):
-         raise ValueError(
-             f"{join_error_phrase} because "
-             f"number of {feature_table_key_kind}s ({feature_table_keys}) "
-             f"does not match "
-             f"number of {lookup_key_kind}s ({feature_column_info_keys})."
-         )
-
-     # Validate feature table keys match types of df keys. The number of keys is expected to be the same.
-     # for (df_key, ft_key) in zip(feature_column_info_keys, feature_table_keys):
-     #     df_key_type = DataType.from_spark_type(df.schema[df_key].dataType)
-     #     ft_key_type = DataType.from_spark_type(
-     #         feature_table_data.schema[ft_key].dataType
-     #     )
-     #     if df_key_type != ft_key_type:
-     #         raise ValueError(
-     #             f"{join_error_phrase} because {feature_table_key_kind} '{ft_key}' has type '{DataType.to_string(ft_key_type)}' "
-     #             f"but corresponding {lookup_key_kind} '{df_key}' has type '{DataType.to_string(df_key_type)}' in DataFrame."
-     #         )
-
-
- def _validate_join_feature_data(
-     df: DataFrame,
-     features_to_join: List[FeatureColumnInfo],
-     feature_table_metadata_map: Dict[str, FeatureTable],
-     feature_table_data_map: Dict[str, DataFrame],
- ):
-     for feature_info in features_to_join:
-         feature_table_metadata = feature_table_metadata_map[feature_info.table_name]
-         feature_table_data = feature_table_data_map[feature_info.table_name]
-         # Validate feature table primary keys match length/type of df lookup keys
-         _validate_join_keys(
-             feature_info,
-             df,
-             feature_table_metadata,
-             feature_table_data,
-             is_timestamp_key=False,
-         )
-         # Validate feature table timestamp keys match length/type of df timestamp lookup keys
-         _validate_join_keys(
-             feature_info,
-             df,
-             feature_table_metadata,
-             feature_table_data,
-             is_timestamp_key=True,
-         )
-
-
- def join_feature_data_if_not_overridden(
-     feature_spec: FeatureSpec,
-     df: DataFrame,
-     features_to_join: List[FeatureColumnInfo],
-     feature_table_metadata_map: Dict[str, FeatureTable],
-     feature_table_data_map: Dict[str, DataFrame],
-     use_spark_native_join: Optional[bool] = False,
- ) -> DataFrame:
-     """
-     Joins `df` with features specified by `feature_spec.feature_column_infos` if they do not already exist.
-
-     Return column order is df.columns + newly joined features. The newly joined feature order is not guaranteed to
-     match `feature_spec.feature_column_infos` as feature lookups are first grouped by table for efficiency.
-
-     Before joining, it checks that:
-         1. Feature table keys match length and types of `df` lookup keys specified by FeatureSpec
-         2. `df` contains lookup keys specified by FeatureSpec
-         3. Feature table timestamp lookup keys match length and types of `df` timestamp lookup keys if specified by FeatureSpec
-         4. `df` contains timestamp lookup keys if specified by FeatureSpec
-     """
-     _validate_join_feature_data(
-         df=df,
-         features_to_join=features_to_join,
-         feature_table_metadata_map=feature_table_metadata_map,
-         feature_table_data_map=feature_table_data_map,
-     )
-
-     # Helper class to group all unique combinations of feature table names and lookup keys.
-     # All features in each of these groups will be JOINed with the training df using a single JOIN.
-     class JoinDataKey:
-         def __init__(
-             self,
-             feature_table: str,
-             lookup_key: List[str],
-             timestamp_lookup_key: List[str],
-             lookback_window: Optional[datetime.timedelta] = None,
-         ):
-             self.feature_table = feature_table
-             self.lookup_key = lookup_key
-             self.timestamp_lookup_key = timestamp_lookup_key
-             self.lookback_window = lookback_window
-
-         def __hash__(self):
-             return (
-                 hash(self.feature_table)
-                 + hash(tuple(self.lookup_key))
-                 + hash(tuple(self.timestamp_lookup_key))
-                 + hash(self.lookback_window)
-             )
-
-         def __eq__(self, other):
-             return (
-                 self.feature_table == other.feature_table
-                 and self.lookup_key == other.lookup_key
-                 and self.timestamp_lookup_key == other.timestamp_lookup_key
-                 and self.lookback_window == other.lookback_window
-             )
-
-     # Iterate through the list of FeatureColumnInfo and group features by name of the
-     # feature table and lookup key(s) and timestamp lookup key(s)
-     table_join_data = defaultdict(dict)
-     lookback_windows = {
-         t.table_name: t.lookback_window for t in feature_spec.table_infos
-     }
-     for feature_info in features_to_join:
-         join_data_key = JoinDataKey(
-             feature_info.table_name,
-             feature_info.lookup_key,
-             feature_info.timestamp_lookup_key,
-             lookback_windows[feature_info.table_name],
-         )
-         table_join_data[join_data_key][
-             feature_info.feature_name
-         ] = feature_info.output_name
-
-     for join_data_key, feature_to_output_name in table_join_data.items():
-
-         feature_table_metadata = feature_table_metadata_map[join_data_key.feature_table]
-         feature_table_data = feature_table_data_map[join_data_key.feature_table]
-
-         if join_data_key.timestamp_lookup_key:
-             # If lookback window is set to 0, then perform exact join instead of asof join to get perf benefits.
-             if (
-                 join_data_key.lookback_window is not None
-                 and join_data_key.lookback_window == 0
-             ):
-                 df = _spark_join_features(
-                     df=df,
-                     df_keys=join_data_key.lookup_key
-                     + join_data_key.timestamp_lookup_key,
-                     feature_table_data=feature_table_data,
-                     feature_table_keys=feature_table_metadata.primary_keys
-                     + feature_table_metadata.timestamp_keys,
-                     feature_to_output_name=feature_to_output_name,
-                 )
-             else:
-                 df = _spark_asof_join_features(
-                     df=df,
-                     df_lookup_keys=join_data_key.lookup_key,
-                     df_timestamp_lookup_key=join_data_key.timestamp_lookup_key[0],
-                     feature_table_data=feature_table_data,
-                     feature_table_keys=feature_table_metadata.primary_keys,
-                     feature_table_timestamp_key=feature_table_metadata.timestamp_keys[0],
-                     feature_to_output_name=feature_to_output_name,
-                     lookback_window_seconds=join_data_key.lookback_window,
-                     use_spark_native_join=use_spark_native_join,
-                 )
-         else:
-             df = _spark_join_features(
-                 df=df,
-                 df_keys=join_data_key.lookup_key,
-                 feature_table_data=feature_table_data,
-                 feature_table_keys=feature_table_metadata.primary_keys,
-                 feature_to_output_name=feature_to_output_name,
-             )
-     return df
-
-
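
To make the grouping concrete, here is a toy, non-Spark illustration of how features that share a table and lookup key end up in the same join group (all names invented):

    from collections import defaultdict

    lookups = [
        ("trips", ("user_id",), "avg_fare", "avg_fare"),
        ("trips", ("user_id",), "trip_count", "trip_count"),
        ("drivers", ("driver_id",), "rating", "driver_rating"),
    ]
    table_join_data = defaultdict(dict)
    for table, lookup_key, feature, output in lookups:
        table_join_data[(table, lookup_key)][feature] = output

    # {('trips', ('user_id',)): {'avg_fare': 'avg_fare', 'trip_count': 'trip_count'},
    #  ('drivers', ('driver_id',)): {'rating': 'driver_rating'}}
    # -> two joins instead of three
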
- def get_feature_lookups_with_full_table_names(
-     feature_lookups: List[FeatureLookup], current_catalog: str, current_schema: str
- ) -> List[FeatureLookup]:
-     """
-     Takes in a list of FeatureLookups, and returns copies with reformatted table names.
-     """
-     table_names = {fl.table_name for fl in feature_lookups}
-     uc_utils._check_qualified_table_names(table_names)
-     uc_utils._verify_all_tables_are_either_in_uc_or_in_hms(
-         table_names, current_catalog, current_schema
-     )
-     standardized_feature_lookups = []
-     for fl in feature_lookups:
-         fl_copy = copy.deepcopy(fl)
-         fl_copy._table_name = uc_utils.get_full_table_name(
-             fl_copy.table_name, current_catalog, current_schema
-         )
-         standardized_feature_lookups.append(fl_copy)
-     return standardized_feature_lookups
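
Finally, an illustrative stand-in for the table-name standardization performed above: short table names are padded to three levels against the current catalog and schema, while fully qualified names pass through unchanged. `uc_utils.get_full_table_name` is the real implementation; `qualify` below is only a sketch of the assumed behaviour.

    def qualify(table_name: str, current_catalog: str, current_schema: str) -> str:
        """Toy stand-in for uc_utils.get_full_table_name (assumed behaviour)."""
        parts = table_name.split(".")
        if len(parts) == 1:
            return f"{current_catalog}.{current_schema}.{table_name}"
        if len(parts) == 2:
            return f"{current_catalog}.{table_name}"
        return table_name

    print(qualify("user_features", "prod", "fs"))           # prod.fs.user_features
    print(qualify("fs.user_features", "prod", "fs"))         # prod.fs.user_features
    print(qualify("prod.fs.user_features", "prod", "fs"))    # prod.fs.user_features (unchanged)
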