wedata-feature-engineering 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,580 @@
+ import logging
+ from collections import defaultdict
+ from typing import Dict, List, Optional, Set
+
+ from pyspark.sql import DataFrame
+
+ from feature_store.entities.column_info import ColumnInfo
+ from feature_store.entities.feature import Feature
+ from feature_store.entities.feature_column_info import FeatureColumnInfo
+ from feature_store.entities.feature_lookup import FeatureLookup
+ from feature_store.entities.feature_spec import FeatureSpec
+ from feature_store.entities.feature_table import FeatureTable
+ from feature_store.entities.feature_table_info import FeatureTableInfo
+ from feature_store.entities.function_info import FunctionInfo
+ from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+ from feature_store.constants.constants import _ERROR, _WARN
+
+ from feature_store.utils import common_utils, validation_utils, uc_utils, schema_utils, utils
+ from feature_store.utils.feature_spec_utils import assign_topological_ordering
+
+ _logger = logging.getLogger(__name__)
+
+ MAX_FEATURE_FUNCTIONS = 100
+
+
+ def get_features_for_tables(
+     spark_client, table_names: Set[str]
+ ) -> Dict[str, List[Feature]]:
+     """
+     Look up features in the feature catalog for all table_names and return a
+     dictionary of table name -> list of features.
+     """
+     return {
+         table_name: spark_client.get_features(table_name)
+         for table_name in table_names
+     }
+
+
+ def get_feature_table_metadata_for_tables(
+     spark_client,
+     table_names: Set[str],
+ ) -> Dict[str, FeatureTable]:
+     """
+     Look up FeatureTable metadata in the feature catalog for all table_names and
+     return a dictionary of table name -> FeatureTable.
+     """
+     feature_table_metadata = {}
+     for table_name in table_names:
+         feature_table_metadata[table_name] = spark_client.get_feature_table(
+             table_name
+         )
+     return feature_table_metadata
+
+
+ def explode_feature_lookups(
+     feature_lookups: List[FeatureLookup],
+     feature_table_features_map: Dict[str, List[Feature]],
+     feature_table_metadata_map: Dict[str, FeatureTable],
+ ) -> List[FeatureColumnInfo]:
+     """
+     Explode FeatureLookups and collect them into FeatureColumnInfos. A FeatureLookup may explode into either:
+     1. A single FeatureColumnInfo, when only a single feature name is specified.
+     2. Multiple FeatureColumnInfos, when either multiple or all feature names are specified.
+
+     When all features are specified in a FeatureLookup (feature_names is None),
+     FeatureColumnInfos will be created for all features except primary and timestamp keys.
+     The order of the returned FeatureColumnInfos is the same as the order returned by GetFeatures:
+     1. All partition keys that are not primary keys, in partition key order.
+     2. All other non-key features in alphabetical order.
+     """
+     feature_column_infos = []
+     for feature_lookup in feature_lookups:
+         feature_column_infos_for_feature_lookup = _explode_feature_lookup(
+             feature_lookup=feature_lookup,
+             features=feature_table_features_map[feature_lookup.table_name],
+             feature_table=feature_table_metadata_map[feature_lookup.table_name],
+         )
+         feature_column_infos += feature_column_infos_for_feature_lookup
+     return feature_column_infos
+
+
+ def _explode_feature_lookup(
+     feature_lookup: FeatureLookup,
+     features: List[Feature],
+     feature_table: FeatureTable,
+ ) -> List[FeatureColumnInfo]:
+     feature_names = []
+     if feature_lookup._get_feature_names():
+         # If the user explicitly passed in a feature name or list of feature names, use that
+         feature_names.extend(feature_lookup._get_feature_names())
+     else:
+         # Otherwise assume the user wants all columns in the feature table
+         keys = {*feature_table.primary_keys, *feature_table.timestamp_keys}
+         feature_names.extend(
+             [feature.name for feature in features if feature.name not in keys]
+         )
+
+     return [
+         FeatureColumnInfo(
+             table_name=feature_lookup.table_name,
+             feature_name=feature_name,
+             lookup_key=utils.as_list(feature_lookup.lookup_key),
+             output_name=(feature_lookup._get_output_name(feature_name)),
+             timestamp_lookup_key=utils.as_list(
+                 feature_lookup.timestamp_lookup_key, default=[]
+             ),
+         )
+         for feature_name in feature_names
+     ]
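+
+ # A minimal sketch of how a lookup explodes (illustrative only; the table name,
+ # key column, and FeatureLookup constructor arguments are assumptions inferred
+ # from the fields read above):
+ #
+ #     lookup = FeatureLookup(table_name="ml.recs.user_features", lookup_key="user_id")
+ #     fcis = explode_feature_lookups(
+ #         [lookup], feature_table_features_map, feature_table_metadata_map
+ #     )
+ #     # With no feature_names given, fcis holds one FeatureColumnInfo per feature
+ #     # in ml.recs.user_features, excluding its primary and timestamp keys.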
+
+
+ def load_feature_data_for_tables(
+     spark_client, table_names: Set[str]
+ ) -> Dict[str, DataFrame]:
+     """
+     Load feature DataFrame objects for all table_names and return a dictionary
+     of table name -> DataFrame.
+     """
+     return {
+         table_name: spark_client.read_table(table_name) for table_name in table_names
+     }
+
+
+ def _validate_and_convert_lookback_windows(
+     feature_lookups: List[FeatureLookup],
+ ) -> Dict[str, Optional[float]]:
+     """
+     Collects lookback_window values from all feature_lookups, validates that the values
+     are consistent per feature table, converts each window to total seconds, and returns
+     a dictionary of table name -> lookback window in seconds. When no lookback_window is
+     defined for a table, the mapping is table name -> None.
+     """
+     table_lookback_windows_map = defaultdict(set)
+     for fl in feature_lookups:
+         table_lookback_windows_map[fl.table_name].add(fl.lookback_window)
+
+     for table_name, lookback_windows in table_lookback_windows_map.items():
+         if len(lookback_windows) > 1:
+             if None in lookback_windows:
+                 raise ValueError(
+                     f"lookback_window values must be consistently defined per feature table. '{table_name}' has "
+                     f"missing lookback_window values: {lookback_windows}."
+                 )
+             else:
+                 raise ValueError(
+                     f"Only one value for lookback_window can be defined per feature table. '{table_name}' has "
+                     f"conflicting lookback_window values: {lookback_windows}."
+                 )
+
+     # Convert lookback windows to seconds
+     for table_name, lookback_windows in table_lookback_windows_map.items():
+         # Get the only element of the single-member set
+         window = next(iter(lookback_windows))
+         table_lookback_windows_map[table_name] = (
+             window.total_seconds() if window is not None else None
+         )
+
+     return table_lookback_windows_map
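+
+ # Sketch of the consistency rule (illustrative; assumes lookback_window holds a
+ # datetime.timedelta, as implied by total_seconds() above):
+ #
+ #     from datetime import timedelta
+ #     fl_a = FeatureLookup(table_name="ml.recs.clicks", lookup_key="user_id",
+ #                          lookback_window=timedelta(hours=1))
+ #     fl_b = FeatureLookup(table_name="ml.recs.clicks", lookup_key="item_id",
+ #                          lookback_window=timedelta(hours=1))
+ #     _validate_and_convert_lookback_windows([fl_a, fl_b])
+ #     # -> {"ml.recs.clicks": 3600.0}; mixing timedelta(hours=1) with None or
+ #     # timedelta(hours=2) on the same table raises ValueError instead.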
+
+
+ def validate_feature_column_infos_data(
+     spark_client,
+     feature_column_infos: List[FeatureColumnInfo],
+     features_by_table: Dict[str, List[Feature]],
+     feature_table_data_map: Dict[str, DataFrame],
+ ):
+     """
+     Validates required FeatureLookup data: checks that every feature consumed by
+     the FeatureSpec exists both in the Feature Catalog and in the Delta table.
+     """
+     table_to_features = defaultdict(list)
+     for fci in feature_column_infos:
+         table_to_features[fci.table_name].append(fci.feature_name)
+
+     for table_name, features_in_spec in table_to_features.items():
+         catalog_features = features_by_table[table_name]
+         feature_table_data = feature_table_data_map[table_name]
+         catalog_schema = {
+             feature.name: feature.data_type for feature in catalog_features
+         }
+         delta_schema = {
+             feature.name: feature.dataType
+             for feature in feature_table_data.schema
+         }
+
+         for feature_name in features_in_spec:
+             if feature_name not in catalog_schema:
+                 raise ValueError(
+                     f"Unable to find feature '{feature_name}' from feature table '{table_name}' in Feature Catalog."
+                 )
+             if feature_name not in delta_schema:
+                 raise ValueError(
+                     f"Unable to find feature '{feature_name}' from feature table '{table_name}' in Delta."
+                 )
+
+
+ def verify_df_and_labels(
+     df: DataFrame,
+     label_names: List[str],
+     exclude_columns: List[str],
+ ):
+     # Verify DataFrame type and column uniqueness
+     validation_utils.check_dataframe_type(df)
+     common_utils.validate_strings_unique(
+         df.columns, "Found duplicate DataFrame column names {}."
+     )
+
+     # Validate that label_names are unique
+     common_utils.validate_strings_unique(label_names, "Found duplicate label names {}.")
+     # Verify that each label is in the DataFrame and not in exclude_columns
+     for label_name in label_names:
+         if label_name not in df.columns:
+             raise ValueError(f"Label column '{label_name}' was not found in DataFrame")
+         if label_name in exclude_columns:
+             raise ValueError(f"Label column '{label_name}' cannot be excluded")
+
+
+ def get_uc_function_infos(
+     spark_client, udf_names: Set[str]
+ ) -> Dict[str, FunctionInfo]:
+     # Note: Only GetFunction ACLs are required here. The ExecuteFunction ACL will be checked at SQL execution.
+     function_infos = spark_client.get_functions(list(udf_names))
+     return {function_info.full_name: function_info for function_info in function_infos}
+
+
+ def _validate_on_demand_column_info_udfs(
+     on_demand_column_infos: List[OnDemandColumnInfo],
+     uc_function_infos: Dict[str, FunctionInfo],
+ ):
+     """
+     Validates that OnDemandColumnInfo UDFs can be applied as on-demand features. Checks:
+     1. The UDF is defined in Python.
+     2. The UDF input parameters are consistent with its input bindings.
+
+     Note: Provided UC FunctionInfos not required by the OnDemandColumnInfos are not validated.
+     """
+     for odci in on_demand_column_infos:
+         function_info = uc_function_infos[odci.udf_name]
+         if function_info.external_language != FunctionInfo.PYTHON:
+             raise ValueError(
+                 f"FeatureFunction UDF '{odci.udf_name}' is not a Python UDF. Only Python UDFs are supported."
+             )
+
+         udf_input_params = [p.name for p in function_info.input_params]
+         if odci.input_bindings.keys() != set(udf_input_params):
+             raise ValueError(
+                 f"FeatureFunction UDF '{odci.udf_name}' input parameters {udf_input_params} "
+                 f"do not match input bindings {odci.input_bindings}."
+             )
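+
+ # Sketch of the binding check (illustrative; the UDF and column names are made up):
+ #
+ #     odci = OnDemandColumnInfo(
+ #         udf_name="ml.recs.haversine",
+ #         input_bindings={"lat": "pickup_lat", "lon": "pickup_lon"},
+ #         output_name="trip_distance",
+ #     )
+ #     # Passes only if ml.recs.haversine is a Python UDF whose declared parameters
+ #     # are exactly {lat, lon}; any extra, missing, or renamed parameter raises.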
+
+
+ class _FeatureTableMetadata:
+     def __init__(
+         self,
+         feature_table_features_map,
+         feature_table_metadata_map,
+         feature_table_data_map,
+     ):
+         self.feature_table_features_map = feature_table_features_map
+         self.feature_table_metadata_map = feature_table_metadata_map
+         self.feature_table_data_map = feature_table_data_map
+
+
+ def warn_if_non_photon_for_native_spark(use_native_spark, spark_client):
+     if use_native_spark and not spark_client.is_photon_cluster():
+         _logger.warning(
+             "Native Spark join is significantly more performant on Photon-enabled clusters. Consider "
+             "switching to a Photon-enabled cluster if performance is an issue."
+         )
+
+
+ def get_table_metadata(
+     spark_client, table_names
+ ):
+     # Look up all features for the given feature table names (table_names)
+     feature_table_features_map = get_features_for_tables(
+         spark_client, table_names=table_names
+     )
+
+     feature_table_metadata_map = get_feature_table_metadata_for_tables(
+         spark_client, table_names=table_names
+     )
+     feature_table_data_map = load_feature_data_for_tables(
+         spark_client, table_names=table_names
+     )
+     return _FeatureTableMetadata(
+         feature_table_features_map,
+         feature_table_metadata_map,
+         feature_table_data_map,
+     )
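+
+ # Typical call pattern (illustrative): gather every table referenced by the
+ # FeatureLookups, then fetch catalog features, table metadata, and Delta data
+ # in one pass.
+ #
+ #     table_names = {fl.table_name for fl in feature_lookups}
+ #     ft_metadata = get_table_metadata(spark_client, table_names)
+ #     ft_metadata.feature_table_data_map["ml.recs.user_features"].printSchema()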
+
+
+ class _ColumnInfos:
+     def __init__(
+         self,
+         source_data_column_infos,
+         feature_column_infos,
+         on_demand_column_infos,
+     ):
+         self.source_data_column_infos = source_data_column_infos
+         self.feature_column_infos = feature_column_infos
+         self.on_demand_column_infos = on_demand_column_infos
+
+
+ def get_column_infos(
+     feature_lookups, feature_functions, ft_metadata, df_columns=(), label_names=()
+ ):
+     # Collect SourceDataColumnInfos
+     source_data_column_infos = [
+         SourceDataColumnInfo(col) for col in df_columns if col not in label_names
+     ]
+
+     # Collect FeatureColumnInfos
+     feature_column_infos = explode_feature_lookups(
+         feature_lookups,
+         ft_metadata.feature_table_features_map,
+         ft_metadata.feature_table_metadata_map,
+     )
+
+     # Collect OnDemandColumnInfos
+     on_demand_column_infos = [
+         OnDemandColumnInfo(
+             udf_name=feature_function.udf_name,
+             input_bindings=feature_function.input_bindings,
+             output_name=feature_function.output_name,
+         )
+         for feature_function in feature_functions
+     ]
+     return _ColumnInfos(
+         source_data_column_infos, feature_column_infos, on_demand_column_infos
+     )
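+
+ # Sketch of assembling the three ColumnInfo groups for a training set
+ # (illustrative; the DataFrame and label names are assumptions):
+ #
+ #     infos = get_column_infos(
+ #         feature_lookups,
+ #         feature_functions,
+ #         ft_metadata,
+ #         df_columns=df.columns,
+ #         label_names=["label"],
+ #     )
+ #     # infos.source_data_column_infos covers df columns minus "label";
+ #     # feature and on-demand outputs come from the lookups and functions.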
+
+
+ def validate_column_infos(
+     spark_client,
+     ft_metadata,
+     source_column_infos,
+     feature_column_infos,
+     on_demand_column_infos,
+     label_names=()
+ ):
+     source_data_names = [sdci.name for sdci in source_column_infos]
+
+     # Verify features have unique output names
+     feature_output_names = [fci.output_name for fci in feature_column_infos]
+     common_utils.validate_strings_unique(
+         feature_output_names, "Found duplicate feature output names {}."
+     )
+
+     # Verify labels do not collide with feature output names
+     for label_name in label_names:
+         if label_name in feature_output_names:
+             raise ValueError(
+                 f"Feature cannot have same output name as label '{label_name}'."
+             )
+
+     # Verify that FeatureLookup output names do not conflict with source data names
+     feature_conflicts = [
+         name for name in feature_output_names if name in source_data_names
+     ]
+     if len(feature_conflicts) > 0:
+         feature_conflicts_str = ", ".join([f"'{name}'" for name in feature_conflicts])
+         raise ValueError(
+             f"DataFrame contains column names that match feature output names specified"
+             f" in FeatureLookups: {feature_conflicts_str}. Either remove these columns"
+             f" from the DataFrame or FeatureLookups."
+         )
+
+     # Validate FeatureLookup data exists (including for columns that will be excluded).
+     validate_feature_column_infos_data(
+         spark_client,
+         feature_column_infos,
+         ft_metadata.feature_table_features_map,
+         ft_metadata.feature_table_data_map,
+     )
+
+     on_demand_input_names = common_utils.get_unique_list_order(
+         [
+             input_name
+             for odci in on_demand_column_infos
+             for input_name in odci.input_bindings.values()
+         ]
+     )
+     on_demand_output_names = [odci.output_name for odci in on_demand_column_infos]
+
+     # Verify on-demand features have unique output names
+     common_utils.validate_strings_unique(
+         on_demand_output_names, "Found duplicate on-demand feature output names {}."
+     )
+
+     # Verify labels do not collide with on-demand output names
+     for label_name in label_names:
+         if label_name in on_demand_output_names:
+             raise ValueError(
+                 f"On-demand feature cannot have same output name as label '{label_name}'."
+             )
+
+     # Verify on-demand feature output names do not conflict with source data or feature output names
+     source_data_and_feature_output_names = set(source_data_names + feature_output_names)
+     on_demand_conflicts = [
+         name
+         for name in on_demand_output_names
+         if name in source_data_and_feature_output_names
+     ]
+     if len(on_demand_conflicts) > 0:
+         conflicting_on_demand_feature_names = ", ".join(
+             f"'{name}'" for name in on_demand_conflicts
+         )
+         raise ValueError(
+             f"FeatureFunctions contains output names that match either DataFrame column names "
+             f"or feature output names specified in FeatureLookups: {conflicting_on_demand_feature_names}. "
+             f"Either remove these columns from the DataFrame, FeatureLookups, or FeatureFunctions."
+         )
+
+     # Validate that on-demand inputs exist in source data, feature outputs, or
+     # function outputs from previous levels
+     all_output_names = source_data_and_feature_output_names.union(
+         on_demand_output_names
+     )
+     missing_on_demand_inputs = set(on_demand_input_names).difference(all_output_names)
+     if len(missing_on_demand_inputs) > 0:
+         missing_on_demand_inputs_names = ", ".join(
+             [f"'{name}'" for name in sorted(missing_on_demand_inputs)]
+         )
+         raise ValueError(
+             f"Could not find input binding columns {missing_on_demand_inputs_names} required "
+             "by FeatureFunctions."
+         )
+
+     feature_input_names = common_utils.get_unique_list_order(
+         [input_name for fci in feature_column_infos for input_name in fci.lookup_key]
+     )
+     # Validate that lookup keys exist in source data, feature outputs, or function outputs
+     missing_lookup_inputs = set(feature_input_names).difference(all_output_names)
+     if len(missing_lookup_inputs) > 0:
+         missing_input_names = ", ".join(
+             [f"'{name}'" for name in sorted(missing_lookup_inputs)]
+         )
+         raise ValueError(
+             f"Could not find lookup key columns {missing_input_names} required by "
+             "FeatureLookups."
+         )
+
+     # uc_function_infos = get_uc_function_infos(
+     #     spark_client,
+     #     {odci.udf_name for odci in on_demand_column_infos},
+     # )
+     # Validate FeatureFunctions UDFs (including for columns that will be excluded).
+     # _validate_on_demand_column_info_udfs(
+     #     on_demand_column_infos=on_demand_column_infos,
+     #     uc_function_infos=uc_function_infos,
+     # )
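+
+ # Validation order above, end to end (illustrative; names are made up):
+ #
+ #     validate_column_infos(
+ #         spark_client,
+ #         ft_metadata,
+ #         infos.source_data_column_infos,
+ #         infos.feature_column_infos,
+ #         infos.on_demand_column_infos,
+ #         label_names=["label"],
+ #     )
+ #     # Raises ValueError on duplicate or colliding output names, on features
+ #     # missing from the catalog or Delta, and on lookup keys or input bindings
+ #     # that no source column, feature, or function output can satisfy.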
+
+
+ def build_feature_spec(
+     feature_lookups,
+     ft_metadata,
+     all_column_infos,
+     exclude_columns
+ ):
+     # The order of ColumnInfos in feature_spec.yaml should be:
+     # 1. SourceDataColumnInfos: non-label and non-excluded columns from the input DataFrame
+     # 2. FeatureColumnInfos: features retrieved through FeatureLookups
+     # 3. OnDemandColumnInfos: features created by FeatureFunctions
+     column_infos = [
+         ColumnInfo(info=info, include=info.output_name not in exclude_columns)
+         for info in all_column_infos.source_data_column_infos
+         + all_column_infos.feature_column_infos
+         + all_column_infos.on_demand_column_infos
+     ]
+     # Excluded columns that are on-demand inputs or feature lookup keys
+     # should still appear in feature_spec.yaml with include=False.
+     on_demand_input_names = common_utils.get_unique_list_order(
+         [
+             input_name
+             for odci in all_column_infos.on_demand_column_infos
+             for input_name in odci.input_bindings.values()
+         ]
+     )
+     lookup_keys_and_on_demand_inputs = set(on_demand_input_names)
+     for fci in all_column_infos.feature_column_infos:
+         lookup_keys_and_on_demand_inputs.update(fci.lookup_key)
+
+     column_infos = [
+         ci
+         for ci in column_infos
+         if ci.include or ci.output_name in lookup_keys_and_on_demand_inputs
+     ]
+
+     # Sort table_infos by table_name and function_infos by udf_name, so they appear sorted
+     # in feature_spec.yaml. Exclude unnecessary table_infos and function_infos from the
+     # FeatureSpec: when a FeatureLookup or FeatureFunction output feature is excluded, the
+     # underlying table or UDF is not required in the FeatureSpec.
+     consumed_table_names = [
+         ci.info.table_name
+         for ci in column_infos
+         if isinstance(ci.info, FeatureColumnInfo)
+     ]
+     consumed_table_names = sorted(set(consumed_table_names))
+     consumed_udf_names = [
+         ci.info.udf_name
+         for ci in column_infos
+         if isinstance(ci.info, OnDemandColumnInfo)
+     ]
+     consumed_udf_names = sorted(set(consumed_udf_names))
+
+     # Collect lookback windows
+     table_lookback_window_map = _validate_and_convert_lookback_windows(feature_lookups)
+
+     table_infos = [
+         FeatureTableInfo(
+             table_name=table_name,
+             table_id=ft_metadata.feature_table_metadata_map[table_name].table_id,
+             lookback_window=table_lookback_window_map[table_name],
+         )
+         for table_name in consumed_table_names
+     ]
+     function_infos = [
+         FunctionInfo(udf_name=udf_name) for udf_name in consumed_udf_names
+     ]
+
+     # Build the FeatureSpec
+     feature_spec = FeatureSpec(
+         column_infos=assign_topological_ordering(
+             column_infos=column_infos,
+         ),
+         table_infos=table_infos,
+         function_infos=function_infos,
+         serialization_version=FeatureSpec.SERIALIZATION_VERSION_NUMBER,
+     )
+
+     return feature_spec
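+
+ # Sketch of the include/exclude behavior (illustrative): excluding a feature's
+ # output drops its table from table_infos, while an excluded column that is
+ # still needed as a lookup key stays in the spec with include=False.
+ #
+ #     spec = build_feature_spec(
+ #         feature_lookups, ft_metadata, all_column_infos, exclude_columns={"user_id"}
+ #     )
+ #     # "user_id" remains in spec.column_infos (include=False) if any lookup
+ #     # uses it as a key; fully unused excluded columns are filtered out.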
+
+
+ def add_inferred_source_columns(column_infos):
+     on_demand_input_names = common_utils.get_unique_list_order(
+         [
+             input_name
+             for odci in column_infos.on_demand_column_infos
+             for input_name in odci.input_bindings.values()
+         ]
+     )
+     on_demand_output_names = [
+         odci.output_name for odci in column_infos.on_demand_column_infos
+     ]
+
+     feature_input_names = common_utils.get_unique_list_order(
+         [
+             input_name
+             for fci in column_infos.feature_column_infos
+             for input_name in fci.lookup_key
+         ]
+     )
+     feature_output_names = [
+         fci.output_name for fci in column_infos.feature_column_infos
+     ]
+
+     all_output_names = feature_output_names + on_demand_output_names
+
+     # Lookup keys and input bindings that no feature or function produces must come
+     # from the source data, so they are inferred as SourceDataColumnInfos.
+     missing_lookup_inputs = [
+         feature_input_name
+         for feature_input_name in feature_input_names
+         if feature_input_name not in all_output_names
+     ]
+     missing_on_demand_inputs = [
+         on_demand_input_name
+         for on_demand_input_name in on_demand_input_names
+         if on_demand_input_name not in all_output_names
+     ]
+
+     inferred_inputs = common_utils.get_unique_list_order(
+         missing_lookup_inputs + missing_on_demand_inputs
+     )
+     source_data_column_infos = [SourceDataColumnInfo(col) for col in inferred_inputs]
+
+     return _ColumnInfos(
+         source_data_column_infos=column_infos.source_data_column_infos
+         + source_data_column_infos,
+         feature_column_infos=column_infos.feature_column_infos,
+         on_demand_column_infos=column_infos.on_demand_column_infos,
+     )
+ )