tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
@@ -0,0 +1,572 @@
+ import logging
+ import os
+ from types import ModuleType
+ from typing import Any, List, Optional, Set, Union, Dict
+
+ import mlflow
+ from mlflow.models import Model
+ from mlflow.utils.file_utils import TempDir, read_yaml
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import struct
+
+ from wedata.feature_store.constants import constants
+ from wedata.feature_store.entities.feature_function import FeatureFunction
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.entities.feature_table import FeatureTable
+ from wedata.feature_store.entities.training_set import TrainingSet
+ from wedata.feature_store.mlflow_model import _FeatureStoreModelWrapper
+ from wedata.feature_store.spark_client.spark_client import SparkClient
+ from wedata.feature_store.utils import validation_utils
+
+ from wedata.feature_store.constants.constants import (
+     _NO_RESULT_TYPE_PASSED,
+     _USE_SPARK_NATIVE_JOIN,
+     MODEL_DATA_PATH_ROOT,
+     PREDICTION_COLUMN_NAME,
+     _PREBUILT_ENV_URI
+ )
+
+ from wedata.feature_store.utils import common_utils, training_set_utils, uc_utils
+ from wedata.feature_store.utils.signature_utils import get_mlflow_signature_from_feature_spec, \
+     drop_signature_inputs_and_invalid_params
+
+ _logger = logging.getLogger(__name__)
+
+ FEATURE_SPEC_GRAPH_MAX_COLUMN_INFO = 1000
+
+
+ class TrainingSetClient:
+     def __init__(
+         self,
+         spark_client: SparkClient
+     ):
+         self._spark_client = spark_client
+
+     def create_training_set(
+         self,
+         feature_spec: FeatureSpec,
+         label_names: List[str],
+         df: DataFrame,
+         ft_metadata: training_set_utils._FeatureTableMetadata,
+         kwargs,
+     ):
+         uc_function_infos = training_set_utils.get_uc_function_infos(
+             self._spark_client,
+             {odci.udf_name for odci in feature_spec.on_demand_column_infos},
+         )
+
+         training_set_utils.warn_if_non_photon_for_native_spark(
+             kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
+         )
+         return TrainingSet(
+             feature_spec,
+             df,
+             label_names,
+             ft_metadata.feature_table_metadata_map,
+             ft_metadata.feature_table_data_map,
+             uc_function_infos,
+             kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
+         )
+
+     def create_training_set_from_feature_lookups(
+         self,
+         df: DataFrame,
+         feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
+         label: Union[str, List[str], None],
+         exclude_columns: List[str],
+         **kwargs,
+     ) -> TrainingSet:
+
+         # Split the provided features into FeatureLookups and FeatureFunctions
+         features = feature_lookups
+         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+         # A maximum of 100 FeatureFunctions is supported
+         if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+             raise ValueError(
+                 f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+             )
+
+         # If no label is provided, initialize label_names as an empty list
+         label_names = common_utils.as_list(label, [])
+         del label
+
+         # Validate the DataFrame and labels
+         training_set_utils.verify_df_and_labels(df, label_names, exclude_columns)
+
+         # Get feature table metadata
+         ft_metadata = training_set_utils.get_table_metadata(
+             self._spark_client,
+             {fl.table_name for fl in feature_lookups}
+         )
+
+         column_infos = training_set_utils.get_column_infos(
+             feature_lookups,
+             feature_functions,
+             ft_metadata,
+             df_columns=df.columns,
+             label_names=label_names,
+         )
+
+         training_set_utils.validate_column_infos(
+             self._spark_client,
+             ft_metadata,
+             column_infos.source_data_column_infos,
+             column_infos.feature_column_infos,
+             column_infos.on_demand_column_infos,
+             label_names,
+         )
+
+         # Build feature_spec locally for comparison with the feature spec yaml generated by the
+         # FeatureStore backend. This will be removed once the migration is validated.
+         feature_spec = training_set_utils.build_feature_spec(
+             feature_lookups,
+             ft_metadata,
+             column_infos,
+             exclude_columns
+         )
+
+         return self.create_training_set(
+             feature_spec,
+             label_names,
+             df,
+             ft_metadata,
+             kwargs=kwargs,
+         )
+
+
+     def create_feature_spec(
+         self,
+         name: str,
+         features: List[Union[FeatureLookup, FeatureFunction]],
+         sparkClient: SparkClient,
+         exclude_columns: List[str] = [],
+     ) -> FeatureSpec:
+
+         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+         # Maximum of 100 FeatureFunctions is supported
+         if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+             raise ValueError(
+                 f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+             )
+
+         # Get feature table metadata and column infos
+         ft_metadata = training_set_utils.get_table_metadata(
+             self._spark_client,
+             {fl.table_name for fl in feature_lookups}
+         )
+         column_infos = training_set_utils.get_column_infos(
+             feature_lookups,
+             feature_functions,
+             ft_metadata,
+         )
+
+         column_infos = training_set_utils.add_inferred_source_columns(column_infos)
+
+         training_set_utils.validate_column_infos(
+             self._spark_client,
+             ft_metadata,
+             column_infos.source_data_column_infos,
+             column_infos.feature_column_infos,
+             column_infos.on_demand_column_infos,
+         )
+
+         feature_spec = training_set_utils.build_feature_spec(
+             feature_lookups,
+             ft_metadata,
+             column_infos,
+             exclude_columns
+         )
+
+         return feature_spec
+
+
+     def log_model(
+         self,
+         model: Any,
+         artifact_path: str,
+         *,
+         flavor: ModuleType,
+         training_set: Optional[TrainingSet],
+         registered_model_name: Optional[str],
+         model_registry_uri: Optional[str],
+         await_registration_for: int,
+         infer_input_example: bool,
+         **kwargs,
+     ):
+         # Validate that a training_set was provided
+         if training_set is None:
+             raise ValueError(
+                 "'training_set' must be provided."
+             )
+
+         # Get the feature spec and reformat the full table names
+         # training_set.feature_spec is guaranteed to come from FeatureStoreClient.create_training_set in 3L format
+         feature_spec = uc_utils.get_feature_spec_with_reformat_full_table_names(
+             training_set.feature_spec
+         )
+
+         # Get the label type map and labels
+         label_type_map = training_set._label_data_types
+
+         # Collect all feature column names
+         feature_columns = [
+             feature_column.output_name
+             for feature_column in feature_spec.feature_column_infos
+         ]
+         df_head = training_set.load_df().select(*feature_columns).head()
+
+         # Handle output schema and params
+         override_output_schema = kwargs.pop("output_schema", None)
+         params = kwargs.pop("params", {})
+         params["result_type"] = params.get("result_type", _NO_RESULT_TYPE_PASSED)
+
+         # Try to derive an MLflow signature
+         try:
+             signature = get_mlflow_signature_from_feature_spec(
+                 feature_spec, label_type_map, override_output_schema, params
+             )
+         except Exception as e:
+             _logger.warning(f"Model could not be logged with a signature: {e}")
+             signature = None
+
+         with TempDir() as tmp_location:
+             # wedata data_path: changed to record table paths; iterate over table names to build an array
+             data_path = os.path.join(tmp_location.path(), "feature_store")
+             os.makedirs(data_path, exist_ok=True)
+
+             # Create the raw MLflow model
+             raw_mlflow_model = Model(
+                 signature=drop_signature_inputs_and_invalid_params(signature)
+             )
+             raw_model_path = os.path.join(data_path, constants.RAW_MODEL_FOLDER)
+
+             # Save the model according to its flavor
+             if flavor.FLAVOR_NAME != mlflow.pyfunc.FLAVOR_NAME:
+                 flavor.save_model(
+                     model, raw_model_path, mlflow_model=raw_mlflow_model, **kwargs
+                 )
+             else:
+                 flavor.save_model(
+                     raw_model_path,
+                     mlflow_model=raw_mlflow_model,
+                     python_model=model,
+                     **kwargs,
+                 )
+
+             # Validate that the model supports the python_function flavor
+             if "python_function" not in raw_mlflow_model.flavors:
+                 raise ValueError(
+                     f"FeatureStoreClient.log_model does not support '{flavor.__name__}' "
+                     f"since it does not have a python_function model flavor."
+                 )
+
+             # Get and process the conda environment configuration
+             model_env = raw_mlflow_model.flavors["python_function"][mlflow.pyfunc.ENV]
+             if isinstance(model_env, dict):
+                 # mlflow 2.0 has multiple supported environments
+                 conda_file = model_env[mlflow.pyfunc.EnvType.CONDA]
+             else:
+                 conda_file = model_env
+
+             conda_env = read_yaml(raw_model_path, conda_file)
+             # TODO: the databricks-feature-lookup package is not needed for now; it causes Python environment creation to fail
+             # Check if databricks-feature-lookup version is specified in conda_env
+             lookup_client_version_specified = False
+             for dependency in conda_env.get("dependencies", []):
+                 if isinstance(dependency, dict):
+                     for pip_dep in dependency.get("pip", []):
+                         if pip_dep.startswith(
+                             constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE
+                         ):
+                             lookup_client_version_specified = True
+                             break
+             # TODO: the databricks-feature-lookup package is not needed for now; it causes Python environment creation to fail
+             # If databricks-feature-lookup version is not specified, add default version
+             if not lookup_client_version_specified:
+                 # Get the pip package string for the databricks-feature-lookup client
+                 default_wedata_feature_lookup_pip_package = common_utils.pip_depependency_pinned_version(
+                     pip_package_name=constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE,
+                     version=constants.FEATURE_LOOKUP_CLIENT_MAJOR_VERSION,
+                 )
+                 common_utils.add_mlflow_pip_depependency(
+                     conda_env, default_wedata_feature_lookup_pip_package
+                 )
+
+             # Try to build an input example
+             input_example = None
+             try:
+                 if df_head is not None and infer_input_example:
+                     input_example = df_head.asDict()
+             except Exception:
+                 pass
+
+             feature_spec.save(data_path)
+
+             print(f'artifact_path:{artifact_path},data_path:{data_path},conda_env:{conda_env},'
+                   f'signature:{signature},input_example:{input_example}')
+
+             mlflow.pyfunc.log_model(
+                 artifact_path=artifact_path,
+                 python_model=_FeatureStoreModelWrapper(model),
+                 # data_path=data_path,
+                 artifacts={"feature_store": data_path},
+                 code_path=None,
+                 conda_env=conda_env,
+                 signature=signature,
+                 input_example=input_example,
+                 registered_model_name=registered_model_name
+             )
+
+             # mlflow.pyfunc.log_model(
+             #     artifact_path=artifact_path,
+             #     loader_module=constants.MLFLOW_MODEL_NAME,
+             #     data_path=data_path,
+             #     conda_env=conda_env,
+             #     signature=signature,
+             #     input_example=input_example,
+             # )
+
+             # Register the model
+             # if registered_model_name is not None:
+             #     run_id = mlflow.tracking.fluent.active_run().info.run_id
+             #     if model_registry_uri is not None:
+             #         mlflow.set_registry_uri(model_registry_uri)
+             #
+             #     mlflow.register_model(
+             #         f"runs:/{run_id}/{artifact_path}",
+             #         registered_model_name,
+             #         await_registration_for=await_registration_for,
+             #     )
+             #
+             #     print(f"Model registered successfully: {registered_model_name}")
+
+             # # Verify that the model was registered
+             # from mlflow.tracking import MlflowClient
+             # client = MlflowClient()
+             # model_version = client.get_latest_versions(registered_model_name, stages=["None"])[0]
+             # print(f"Registered model version: {model_version.version}")
+
+     def score_batch(
+         self,
+         model_uri: Optional[str],
+         df: DataFrame,
+         result_type: str,
+         env_manager: Optional[str] = None,
+         local_uri: Optional[str] = None,
+         params: Optional[dict[str, Any]] = None,
+         **kwargs,
+     ) -> DataFrame:
+         # TODO: ML - to be confirmed whether this is needed
+         # req_context = RequestContext(request_context.SCORE_BATCH, client_name)
+
+         # Validate the input DataFrame
+         validation_utils.check_dataframe_type(df)
+         if (model_uri is None) == (local_uri is None):
+             raise ValueError(
+                 "Either 'model_uri' or 'local_uri' must be provided, but not both."
+             )
+         if df.isStreaming:
+             raise ValueError("Streaming DataFrames are not supported.")
+
+         # The returned result contains a new column named "prediction" holding the predictions,
+         # so the input data must not already use this name
+         if PREDICTION_COLUMN_NAME in df.columns:
+             raise ValueError(
+                 "FeatureStoreClient.score_batch returns a DataFrame with a new column "
+                 f'"{PREDICTION_COLUMN_NAME}". df already has a column with name '
+                 f'"{PREDICTION_COLUMN_NAME}".'
+             )
+
+         # Validate that there are no duplicate column names
+         validation_utils.validate_strings_unique(
+             df.columns,
+             "The provided DataFrame for scoring must have unique column names. Found duplicates {}.",
+         )
+         artifact_path = os.path.join("artifacts", MODEL_DATA_PATH_ROOT)
+         # print(f"artifact_path: {artifact_path}")
+         with TempDir() as tmp_location:
+             local_path = (
+                 local_uri
+                 if local_uri
+                 else common_utils.download_model_artifacts(model_uri, tmp_location.path())
+             )
+             # print(f"wedata local_path:{local_path}")
+             model_data_path = os.path.join(local_path, artifact_path)
+             # print(f"artifact_path: {artifact_path}")
+
+             # Augment local workspace metastore tables from 2L to 3L,
+             # this will prevent us from erroneously reading data from other catalogs
+             feature_spec = uc_utils.get_feature_spec_with_full_table_names(
+                 FeatureSpec.load(model_data_path)
+             )
+
+             raw_model_path = os.path.join(
+                 model_data_path, constants.RAW_MODEL_FOLDER
+             )
+             print(f"raw_model_path: {raw_model_path}")
+             # Build the predict UDF
+             predict_udf = self._spark_client.get_predict_udf(
+                 raw_model_path,
+                 result_type=result_type,
+                 env_manager=env_manager,
+                 params=params,
+                 prebuilt_env_uri=kwargs.get(_PREBUILT_ENV_URI, None))
+             # TODO (ML-17260) Consider reading the timestamp from the backend instead of feature store artifacts
+             ml_model = Model.load(
+                 os.path.join(local_path, constants.ML_MODEL)
+             )
+
+             # Validate that columns needed for joining feature tables exist and are not duplicates.
+             feature_input_keys = []
+             for fci in feature_spec.feature_column_infos:
+                 feature_input_keys.extend([k for k in fci.lookup_key])
+             on_demand_input_names = uc_utils.get_unique_list_order(
+                 [
+                     input_name
+                     for odci in feature_spec.on_demand_column_infos
+                     for input_name in odci.input_bindings.values()
+                 ]
+             )
+             intermediate_inputs = set(feature_input_keys + on_demand_input_names)
+             source_data_names = [
+                 sdci.name for sdci in feature_spec.source_data_column_infos
+             ]
+             # print(f"wedata source_data_names:{source_data_names}")
+
+             feature_output_names = [
+                 fci.output_name for fci in feature_spec.feature_column_infos
+             ]
+             on_demand_output_names = [
+                 odci.output_name for odci in feature_spec.on_demand_column_infos
+             ]
+             all_output_names = set(
+                 source_data_names + feature_output_names + on_demand_output_names
+             )
+             required_cols = intermediate_inputs.difference(all_output_names)
+             required_cols.update(source_data_names)
+
+             missing_required_columns = [
+                 col for col in required_cols if col not in df.columns
+             ]
+             if missing_required_columns:
+                 missing_columns_formatted = ", ".join(
+                     [f"'{s}'" for s in missing_required_columns]
+                 )
+                 raise ValueError(
+                     f"DataFrame is missing required columns {missing_columns_formatted}."
+                 )
+
+             table_names = {fci.table_name for fci in feature_spec.feature_column_infos}
+             feature_table_features_map = training_set_utils.get_features_for_tables(
+                 self._spark_client, table_names=table_names
+             )
+             feature_table_metadata_map = (
+                 training_set_utils.get_feature_table_metadata_for_tables(
+                     self._spark_client,
+                     table_names=table_names,
+                 )
+             )
+             feature_table_data_map = training_set_utils.load_feature_data_for_tables(
+                 self._spark_client, table_names=table_names
+             )
+             training_set_utils.validate_feature_column_infos_data(
+                 self._spark_client,
+                 feature_spec.feature_column_infos,
+                 feature_table_features_map,
+                 feature_table_data_map,
+             )
+
+             uc_function_infos = training_set_utils.get_uc_function_infos(
+                 self._spark_client,
+                 {odci.udf_name for odci in feature_spec.on_demand_column_infos},
+             )
+
+             # Required source data and feature lookup keys have been validated to exist in `df`.
+             # No additional validation is required before resolving FeatureLookups and applying FeatureFunctions.
+             training_set_utils.warn_if_non_photon_for_native_spark(
+                 kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
+             )
+
+             augmented_df = TrainingSet(
+                 feature_spec=feature_spec,
+                 df=df,
+                 labels=[],
+                 feature_table_metadata_map=feature_table_metadata_map,
+                 feature_table_data_map=feature_table_data_map,
+                 uc_function_infos=uc_function_infos,
+                 use_spark_native_join=kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
+             )._augment_df()
+             # Only included FeatureSpec columns should be part of UDF inputs for scoring.
+             # Note: extra `df` columns not in FeatureSpec should be preserved.
+
+             udf_input_columns = [
+                 ci.output_name for ci in feature_spec.column_infos if ci.include
+             ]
+             print(f"udf_input_columns:{udf_input_columns}")
+             # Apply predictions.
+             df_with_predictions = augmented_df.withColumn(
+                 PREDICTION_COLUMN_NAME, predict_udf(struct(*udf_input_columns))
+             )
+             # Reorder `df_with_predictions` to include:
+             # 1. Preserved `df` columns, in `df` column order.
+             # 2. Computed model input columns, in `FeatureSpec` column order.
+             # 3. Prediction column.
+             output_column_order = (
+                 df.columns
+                 + [col for col in udf_input_columns if col not in df.columns]
+                 + [PREDICTION_COLUMN_NAME]
+             )
+             return_df = df_with_predictions.select(output_column_order)
+             return return_df
+
+     def _warn_if_tables_mismatched_for_model(
+         self,
+         feature_spec: FeatureSpec,
+         feature_table_metadata_map: Dict[str, FeatureTable],
+         model_creation_timestamp_ms: float,
+     ):
+         """
+         Helper method to warn if feature tables were deleted and recreated after a model was logged.
+         For newer FeatureSpec versions >=3, we can compare the FeatureSpec and current table ids.
+         Otherwise, we compare the model and table creation timestamps.
+         """
+         # 1. Compare feature table ids
+         # Check for feature_spec logged with client versions that support table_infos
+         if len(feature_spec.table_infos) > 0:
+             # When feature_spec.yaml is parsed, FeatureSpec.load will assert
+             # that the listed table names in input_tables match table names in input_columns.
+             # The following code assumes this as invariant and only checks for the table IDs.
+             mismatched_tables = []
+             for table_info in feature_spec.table_infos:
+                 feature_table = feature_table_metadata_map[table_info.table_name]
+                 if feature_table:
+                     mismatched_tables.append(table_info.table_name)
+             if len(mismatched_tables) > 0:
+                 plural = len(mismatched_tables) > 1
+                 _logger.warning(
+                     f"Feature table{'s' if plural else ''} {', '.join(mismatched_tables)} "
+                     f"{'were' if plural else 'was'} deleted and recreated after "
+                     f"the model was trained. Model performance may be affected if the features "
+                     f"used in scoring have drifted from the features used in training."
+                 )
+
+         # 2. Creation timestamps are not available, so no validation is performed
+         # feature_tables_created_after_model = []
+         # for name, metadata in feature_table_metadata_map.items():
+         #     if model_creation_timestamp_ms < metadata.creation_timestamp:
+         #         feature_tables_created_after_model.append(name)
+         #
+         # if len(feature_tables_created_after_model) > 0:
+         #     plural = len(feature_tables_created_after_model) > 1
+         #     message = (
+         #         f"Feature table{'s' if plural else ''} {', '.join(feature_tables_created_after_model)} "
+         #         f"{'were' if plural else 'was'} created after the model was logged. "
+         #         f"Model performance may be affected if the features used in scoring have drifted "
+         #         f"from the features used in training."
+         #     )
+         #     _logger.warning(message)
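For orientation, the sketch below shows how the TrainingSetClient surface added in this diff might be exercised. It is a minimal, hypothetical example: the FeatureLookup constructor arguments (table_name, lookup_key, feature_names) and the SparkClient construction are assumptions based on the Databricks-style API this package mirrors, and are not confirmed by the diff itself.

# Hypothetical usage sketch of the TrainingSetClient added in this release.
# Assumes a running SparkSession; constructor arguments for FeatureLookup and
# SparkClient are assumed, not confirmed by this diff.
from pyspark.sql import SparkSession

from wedata.feature_store.entities.feature_lookup import FeatureLookup
from wedata.feature_store.spark_client.spark_client import SparkClient
from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient

spark = SparkSession.builder.getOrCreate()
label_df = spark.table("my_catalog.my_db.labels")  # hypothetical label table

lookups = [
    FeatureLookup(  # arguments assumed, mirroring the Databricks-style API
        table_name="my_catalog.my_db.user_features",
        lookup_key=["user_id"],
        feature_names=["age", "recent_purchases"],
    )
]

client = TrainingSetClient(SparkClient())  # SparkClient construction is assumed
training_set = client.create_training_set_from_feature_lookups(
    df=label_df,
    feature_lookups=lookups,
    label="label",
    exclude_columns=[],
)
train_df = training_set.load_df()  # load_df() is the accessor used inside log_model in this diff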