tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0

wedata/feature_store/utils/feature_lookup_utils.py
@@ -0,0 +1,564 @@
+ import copy
+ import datetime
+ import logging
+ import re
+ from collections import defaultdict
+ from functools import reduce
+ from typing import Dict, List, Optional, Tuple
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql import functions as F
+ import pyspark.sql.functions as psf
+
+
+ from wedata.feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
+ from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.entities.feature_table import FeatureTable
+
+ from wedata.feature_store.utils import uc_utils
+
+ _logger = logging.getLogger(__name__)
+
+
+ def _spark_asof_join_features(
+     df: DataFrame,
+     df_lookup_keys: List[str],
+     df_timestamp_lookup_key: str,
+     feature_table_data: DataFrame,
+     feature_table_keys: List[str],
+     feature_table_timestamp_key: str,
+     feature_to_output_name: Dict[str, str],
+     lookback_window_seconds: Optional[float] = None,
+     use_spark_native_join: Optional[bool] = False,
+ ) -> DataFrame:
+     # Alias feature table's keys to DataFrame lookup keys
+     ft_key_aliases = [
+         feature_table_data[ft_key].alias(df_key)
+         for (ft_key, df_key) in zip(feature_table_keys, df_lookup_keys)
+     ]
+     # Alias features to corresponding output names
+     ft_features = [
+         (feature_name, output_name)
+         for feature_name, output_name in feature_to_output_name.items()
+         # Skip the join if the feature is already in the DataFrame and therefore overridden
+         if output_name not in df.columns
+     ]
+     ft_feature_aliases = [
+         feature_table_data[feature_name].alias(output_name)
+         for feature_name, output_name in ft_features
+     ]
+     # Alias feature table's timestamp key to DataFrame timestamp lookup keys
+     ft_timestamp_key_aliases = [
+         feature_table_data[feature_table_timestamp_key].alias(df_timestamp_lookup_key)
+     ]
+     # Select key, timestamp key, and feature columns from feature table
+     feature_and_keys = feature_table_data.select(
+         ft_key_aliases + ft_timestamp_key_aliases + ft_feature_aliases
+     )
+
+     _logger.debug(
+         "Using native spark for point in time join"
+         if use_spark_native_join
+         else "Using tempo for point in time join"
+     )
+
+     if use_spark_native_join:
+         joined_df = _spark_asof_join_features_native(
+             labels_df=df,
+             features_df=feature_and_keys,
+             primary_keys=df_lookup_keys,
+             timestamp_key=df_timestamp_lookup_key,
+             lookback_window_seconds=lookback_window_seconds,
+         )
+     else:
+         joined_df = _spark_asof_join_features_tempo(
+             df=df,
+             feature_df=feature_and_keys,
+             lookup_keys=df_lookup_keys,
+             timestamp_key=df_timestamp_lookup_key,
+             lookback_window=lookback_window_seconds,
+             ft_features=ft_features,
+         )
+     return joined_df
+
+
+ def _spark_asof_join_features_tempo(
+     df: DataFrame,
+     feature_df: DataFrame,
+     lookup_keys: List[str],
+     timestamp_key: str,
+     ft_features: List[Tuple[str, str]],
+     lookback_window: Optional[float] = None
+ ) -> DataFrame:
+     """
+     Custom as-of join implementation.
+     :param df: primary DataFrame
+     :param feature_df: feature table DataFrame
+     :param lookup_keys: list of join keys
+     :param timestamp_key: timestamp column name
+     :param lookback_window: maximum lookback window (seconds)
+     :return: joined DataFrame
+     """
+     from wedata.tempo.tsdf import TSDF
+     # 1. Keep only the key columns and the timestamp column
+     df_tsdf = TSDF(df, ts_col=timestamp_key, partition_cols=lookup_keys)
+     ft_tsdf = TSDF(feature_df, ts_col=timestamp_key, partition_cols=lookup_keys)
+     # Perform the as-of join
+     joined_df = df_tsdf.asofJoin(
+         ft_tsdf,
+         left_prefix="left",
+         right_prefix="right",
+         skipNulls=False,
+         tolerance=lookback_window
+         if lookback_window is not None
+         else None,
+     ).df
+
+     # Strip the prefixes to restore the original column names
+     left_aliases = [
+         joined_df[f"left_{column_name}"].alias(column_name)
+         for column_name in df.columns
+         if column_name not in lookup_keys
+     ]
+     right_aliases = [
+         joined_df[f"right_{output_name}"].alias(output_name)
+         for (_, output_name) in ft_features
+     ]
+     return joined_df.select(lookup_keys + left_aliases + right_aliases)
+
+
+ def _spark_asof_join_features_native(
+     labels_df: DataFrame,
+     features_df: DataFrame,
+     primary_keys: List[str],
+     timestamp_key: str,
+     lookback_window_seconds: Optional[float] = None,
+ ):
+     """
+     Performs an as-of join operation between two dataframes using native Spark operations.
+     Uses a broadcast join for the labels dataset when it is within a size threshold to improve
+     the efficiency of the join, under the assumption that size(labels_df) << size(features_df).
+     TODO(ML-40580): automatically switch labels_df and features_df based on size
+     The join operation is performed as follows:
+     1. Drop non-join key (primary and timestamp keys) columns from labels and features DataFrames
+     2. Broadcast join labels onto features DataFrame if within broadcast threshold.
+     3. Select maximum timestamp for each primary key
+     4. Rejoin non-primary key columns from features DataFrame to get features data
+     5. Rejoin non-primary key columns from labels DataFrame to get joint data
+
+     Parameters:
+         labels_df (DataFrame): The labels dataframe to join.
+         features_df (DataFrame): The features dataframe to join.
+         primary_keys (List[str]): The primary keys used for joining.
+         timestamp_key (str): The timestamp key used for joining.
+         lookback_window_seconds (Optional[float]): The lookback window in seconds.
+             If provided, the join operation will only consider records within this window.
+
+     Returns:
+         DataFrame: The result of the as-of join operation.
+     """
+     labels_df_keys_only = labels_df.select(
+         [F.col(key) for key in primary_keys] + [F.col(timestamp_key)]
+     )
+
+     # Broadcast labels DataFrame if within the broadcast threshold
+     if _df_in_size_threshold(labels_df_keys_only, BROADCAST_JOIN_THRESHOLD.get()):
+         labels_df_keys_only = F.broadcast(labels_df_keys_only)
+
+     # Drop non-primary key columns from features DataFrame
+     features_df_keys_only = features_df.select(
+         [F.col(key).alias(f"__features_pk_{key}") for key in primary_keys]
+         + [F.col(timestamp_key).alias("__features_tk")]
+     )
+
+     # Create join conditions
+     join_conditions = [
+         labels_df_keys_only[key] == features_df_keys_only[f"__features_pk_{key}"]
+         for key in primary_keys
+     ]
+     join_conditions = reduce(lambda x, y: x & y, join_conditions)
+     join_conditions &= (
+         labels_df_keys_only[timestamp_key] >= features_df_keys_only["__features_tk"]
+     )
+     if lookback_window_seconds is not None:
+         join_conditions &= (
+             psf.unix_timestamp(labels_df_keys_only[timestamp_key])
+             - psf.unix_timestamp(features_df_keys_only["__features_tk"])
+         ) <= lookback_window_seconds
+
+     # Join labels and features DataFrames
+     labels_df_keys_with_features_keys = labels_df_keys_only.join(
+         features_df_keys_only, on=join_conditions, how="left"
+     )
+
+     # Find the max feature timestamp for each primary key and timestamp key in labels
+     labels_df_keys_with_features_keys = labels_df_keys_with_features_keys.groupBy(
+         [labels_df_keys_only[key] for key in primary_keys] + [F.col(timestamp_key)]
+     ).agg(F.max("__features_tk").alias("__max_ts"))
+
+     if _df_in_size_threshold(
+         labels_df_keys_with_features_keys, BROADCAST_JOIN_THRESHOLD.get()
+     ):
+         labels_df_keys_with_features_keys = F.broadcast(
+             labels_df_keys_with_features_keys
+         )
+
+     # Rejoin features DataFrame to get the features data
+     join_conditions = [
+         features_df[key] == labels_df_keys_with_features_keys[key]
+         for key in primary_keys
+     ]
+     join_conditions = reduce(lambda x, y: x & y, join_conditions)
+     join_conditions &= (
+         features_df[timestamp_key] == labels_df_keys_with_features_keys["__max_ts"]
+     )
+
+     features = features_df.join(
+         labels_df_keys_with_features_keys,
+         on=join_conditions,
+         how="inner",
+     )
+
+     pk_columns_to_drop = [
+         labels_df_keys_with_features_keys[key] for key in primary_keys
+     ]
+     features = features.drop(*pk_columns_to_drop).drop(
+         features_df[timestamp_key], labels_df_keys_with_features_keys["__max_ts"]
+     )
+     features = features.dropDuplicates(primary_keys + [timestamp_key])
+     # Rejoin labels DataFrame if columns were dropped
+     joint_df = labels_df.join(features, on=primary_keys + [timestamp_key], how="left")
+     return joint_df
+
+
+ def _df_in_size_threshold(df, threshold) -> bool:
+     # Default to within the threshold if the size cannot be determined
+     try:
+         num_bytes = _get_df_size_from_spark_plan(df)
+     except Exception:
+         num_bytes = 0
+     return num_bytes <= threshold
+
+
+ def _get_df_size_from_spark_plan(df: DataFrame) -> float:
+     """
+     Estimate the size of a DataFrame in bytes.
+     Alternative approach: read the execution plan directly from the DataFrame's SparkSession.
+
+     Parameters:
+         df: the Spark DataFrame to size
+
+     Returns:
+         float: the estimated number of bytes
+
+     Raises:
+         ValueError: if the size information cannot be parsed from the execution plan
+     """
+     # Get the SparkSession directly from the DataFrame
+     spark = df.sql_ctx.sparkSession
+
+     # Create a temporary view
+     df.createOrReplaceTempView("temp_view_for_size")
+
+     # Fetch the execution plan
+     plan = spark.sql("explain cost select * from temp_view_for_size").collect()[0][0]
+
+     # Parse the size information
+     search_result = re.search(r"sizeInBytes=.*(['\)])", plan, re.MULTILINE)
+     if search_result is None:
+         raise ValueError("Unable to find sizeInBytes in the Spark execution plan")
+
+     # Extract the size and its units
+     result = search_result.group(0).replace(")", "")
+     size, units = result.split("=")[1].split()
+
+     # Unit conversion map
+     units_map = {
+         "TiB": 1024**4,  # tebibytes
+         "GiB": 1024**3,  # gibibytes
+         "MiB": 1024**2,  # mebibytes
+         "KiB": 1024,     # kibibytes
+         "B": 1           # bytes (handles the unitless case)
+     }
+
+     # Clean up the unit string and convert
+     clean_units = units.rstrip(",")
+     return float(size) * units_map.get(clean_units, 1)  # fall back to the raw value
+
+
+ def _spark_join_features(
+     df: DataFrame,
+     df_keys: List[str],
+     feature_table_data: DataFrame,
+     feature_table_keys: List[str],
+     feature_to_output_name: Dict[str, str],
+ ) -> DataFrame:
+     """
+     Helper to join `feature_name` from `feature_table_data` into `df`.
+
+     This join uses a temporary table that contains only the keys and feature
+     from the feature table. The temporary table aliases the keys to match
+     the lookup keys and the feature to match the output_name.
+
+     Aliasing the keys allows us to join on name instead of by column,
+     which prevents duplicate column names after the join.
+     (see: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html)
+
+     The joined-in feature is guaranteed to be unique because FeatureSpec
+     columns must be unique and the join is skipped if the feature
+     already exists in the DataFrame.
+     """
+
+     # Alias feature table's keys to DataFrame lookup keys
+     ft_key_aliases = [
+         feature_table_data[ft_key].alias(df_key)
+         for (ft_key, df_key) in zip(feature_table_keys, df_keys)
+     ]
+     # Alias features to corresponding output names
+     ft_feature_aliases = [
+         feature_table_data[feature_name].alias(output_name)
+         for feature_name, output_name in feature_to_output_name.items()
+         # Skip the join if the feature is already in the DataFrame and therefore overridden
+         if output_name not in df.columns
+     ]
+     # Select key and feature columns from feature table
+     feature_and_keys = feature_table_data.select(ft_key_aliases + ft_feature_aliases)
+     # Join the features onto the DataFrame
+     return df.join(feature_and_keys, df_keys, how="left")
+
+
+ def _validate_join_keys(
+     feature_column_info: FeatureColumnInfo,
+     df: DataFrame,
+     feature_table_metadata: FeatureTable,
+     feature_table_data: DataFrame,
+     is_timestamp_key: bool = False,
+ ):
+     join_error_phrase = (
+         f"Unable to join feature table '{feature_column_info.table_name}'"
+     )
+     feature_column_info_keys = (
+         feature_column_info.timestamp_lookup_key
+         if is_timestamp_key
+         else feature_column_info.lookup_key
+     )
+     feature_table_keys = (
+         feature_table_metadata.timestamp_keys
+         if is_timestamp_key
+         else feature_table_metadata.primary_keys
+     )
+
+     lookup_key_kind = "timestamp lookup key" if is_timestamp_key else "lookup key"
+     feature_table_key_kind = "timestamp key" if is_timestamp_key else "primary key"
+
+     # Validate df has necessary keys
+     missing_df_keys = list(
+         filter(lambda df_key: df_key not in df.columns, feature_column_info_keys)
+     )
+     if missing_df_keys:
+         missing_keys = ", ".join([f"'{key}'" for key in missing_df_keys])
+         raise ValueError(
+             f"{join_error_phrase} because {lookup_key_kind} {missing_keys} not found in DataFrame."
+         )
+     # Validate feature table has necessary keys
+     missing_ft_keys = list(
+         filter(
+             lambda ft_key: ft_key not in feature_table_data.columns, feature_table_keys
+         )
+     )
+     if missing_ft_keys:
+         missing_keys = ", ".join([f"'{key}'" for key in missing_ft_keys])
+         raise ValueError(
+             f"{join_error_phrase} because {feature_table_key_kind} {missing_keys} not found in feature table."
+         )
+
+     # Validate number of feature table keys matches number of df lookup keys
+     if len(feature_column_info_keys) != len(feature_table_keys):
+         raise ValueError(
+             f"{join_error_phrase} because "
+             f"number of {feature_table_key_kind}s ({feature_table_keys}) "
+             f"does not match "
+             f"number of {lookup_key_kind}s ({feature_column_info_keys})."
+         )
+
+     # Validate feature table keys match types of df keys. The number of keys is expected to be the same.
+     # for (df_key, ft_key) in zip(feature_column_info_keys, feature_table_keys):
+     #     df_key_type = DataType.from_spark_type(df.schema[df_key].dataType)
+     #     ft_key_type = DataType.from_spark_type(
+     #         feature_table_data.schema[ft_key].dataType
+     #     )
+     #     if df_key_type != ft_key_type:
+     #         raise ValueError(
+     #             f"{join_error_phrase} because {feature_table_key_kind} '{ft_key}' has type '{DataType.to_string(ft_key_type)}' "
+     #             f"but corresponding {lookup_key_kind} '{df_key}' has type '{DataType.to_string(df_key_type)}' in DataFrame."
+     #         )
+
+
+ def _validate_join_feature_data(
+     df: DataFrame,
+     features_to_join: List[FeatureColumnInfo],
+     feature_table_metadata_map: Dict[str, FeatureTable],
+     feature_table_data_map: Dict[str, DataFrame],
+ ):
+     for feature_info in features_to_join:
+         feature_table_metadata = feature_table_metadata_map[feature_info.table_name]
+         feature_table_data = feature_table_data_map[feature_info.table_name]
+         # Validate feature table primary keys match length/type of df lookup keys
+         _validate_join_keys(
+             feature_info,
+             df,
+             feature_table_metadata,
+             feature_table_data,
+             is_timestamp_key=False,
+         )
+         # Validate feature table timestamp keys match length/type of df timestamp lookup keys
+         _validate_join_keys(
+             feature_info,
+             df,
+             feature_table_metadata,
+             feature_table_data,
+             is_timestamp_key=True,
+         )
+
+
+ def join_feature_data_if_not_overridden(
+     feature_spec: FeatureSpec,
+     df: DataFrame,
+     features_to_join: List[FeatureColumnInfo],
+     feature_table_metadata_map: Dict[str, FeatureTable],
+     feature_table_data_map: Dict[str, DataFrame],
+     use_spark_native_join: Optional[bool] = False,
+ ) -> DataFrame:
+     """
+     Joins `df` with features specified by `feature_spec.feature_column_infos` if they do not already exist.
+
+     Return column order is df.columns + newly joined features. The newly joined feature order is not guaranteed to
+     match `feature_spec.feature_column_infos` as feature lookups are first grouped by table for efficiency.
+
+     Before joining, it checks that:
+     1. Feature table keys match length and types of `df` lookup keys specified by FeatureSpec
+     2. `df` contains lookup keys specified by FeatureSpec
+     3. Feature table timestamp lookup keys match length and types of `df` timestamp lookup keys if specified by FeatureSpec
+     4. `df` contains timestamp lookup keys if specified by FeatureSpec
+     """
+     _validate_join_feature_data(
+         df=df,
+         features_to_join=features_to_join,
+         feature_table_metadata_map=feature_table_metadata_map,
+         feature_table_data_map=feature_table_data_map,
+     )
+
+     # Helper class to group all unique combinations of feature table names and lookup keys.
+     # All features in each of these groups will be JOINed with the training df using a single JOIN.
+     class JoinDataKey:
+         def __init__(
+             self,
+             feature_table: str,
+             lookup_key: List[str],
+             timestamp_lookup_key: List[str],
+             lookback_window: Optional[datetime.timedelta] = None,
+         ):
+             self.feature_table = feature_table
+             self.lookup_key = lookup_key
+             self.timestamp_lookup_key = timestamp_lookup_key
+             self.lookback_window = lookback_window
+
+         def __hash__(self):
+             return (
+                 hash(self.feature_table)
+                 + hash(tuple(self.lookup_key))
+                 + hash(tuple(self.timestamp_lookup_key))
+                 + hash(self.lookback_window)
+             )
+
+         def __eq__(self, other):
+             return (
+                 self.feature_table == other.feature_table
+                 and self.lookup_key == other.lookup_key
+                 and self.timestamp_lookup_key == other.timestamp_lookup_key
+                 and self.lookback_window == other.lookback_window
+             )
+
+     # Iterate through the list of FeatureColumnInfo and group features by name of the
+     # feature table and lookup key(s) and timestamp lookup key(s)
+     table_join_data = defaultdict(dict)
+     lookback_windows = {
+         t.table_name: t.lookback_window for t in feature_spec.table_infos
+     }
+     for feature_info in features_to_join:
+         join_data_key = JoinDataKey(
+             feature_info.table_name,
+             feature_info.lookup_key,
+             feature_info.timestamp_lookup_key,
+             lookback_windows[feature_info.table_name],
+         )
+         table_join_data[join_data_key][
+             feature_info.feature_name
+         ] = feature_info.output_name
+
+     for join_data_key, feature_to_output_name in table_join_data.items():
+
+         feature_table_metadata = feature_table_metadata_map[join_data_key.feature_table]
+         feature_table_data = feature_table_data_map[join_data_key.feature_table]
+
+         if join_data_key.timestamp_lookup_key:
+             # If lookback window is set to 0, then perform exact join instead of asof join to get perf benefits.
+             if (
+                 join_data_key.lookback_window is not None
+                 and join_data_key.lookback_window == 0
+             ):
+                 df = _spark_join_features(
+                     df=df,
+                     df_keys=join_data_key.lookup_key
+                     + join_data_key.timestamp_lookup_key,
+                     feature_table_data=feature_table_data,
+                     feature_table_keys=feature_table_metadata.primary_keys
+                     + feature_table_metadata.timestamp_keys,
+                     feature_to_output_name=feature_to_output_name,
+                 )
+             else:
+                 df = _spark_asof_join_features(
+                     df=df,
+                     df_lookup_keys=join_data_key.lookup_key,
+                     df_timestamp_lookup_key=join_data_key.timestamp_lookup_key[0],
+                     feature_table_data=feature_table_data,
+                     feature_table_keys=feature_table_metadata.primary_keys,
+                     feature_table_timestamp_key=feature_table_metadata.timestamp_keys[
+                         0
+                     ],
+                     feature_to_output_name=feature_to_output_name,
+                     lookback_window_seconds=join_data_key.lookback_window,
+                     use_spark_native_join=use_spark_native_join,
+                 )
+         else:
+             df = _spark_join_features(
+                 df=df,
+                 df_keys=join_data_key.lookup_key,
+                 feature_table_data=feature_table_data,
+                 feature_table_keys=feature_table_metadata.primary_keys,
+                 feature_to_output_name=feature_to_output_name,
+             )
+     return df
+
+
+ def get_feature_lookups_with_full_table_names(
+     feature_lookups: List[FeatureLookup], current_catalog: str, current_schema: str
+ ) -> List[FeatureLookup]:
+     """
+     Takes in a list of FeatureLookups, and returns copies with reformatted table names.
+     """
+     table_names = {fl.table_name for fl in feature_lookups}
+     uc_utils._check_qualified_table_names(table_names)
+     uc_utils._verify_all_tables_are_either_in_uc_or_in_hms(
+         table_names, current_catalog, current_schema
+     )
+     standardized_feature_lookups = []
+     for fl in feature_lookups:
+         fl_copy = copy.deepcopy(fl)
+         fl_copy._table_name = uc_utils.get_full_table_name(
+             fl_copy.table_name, current_catalog, current_schema
+         )
+         standardized_feature_lookups.append(fl_copy)
+     return standardized_feature_lookups