wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {feature_store → wedata}/__init__.py +1 -1
- {feature_store → wedata/feature_store}/client.py +113 -41
- {feature_store → wedata/feature_store}/constants/constants.py +19 -0
- {feature_store → wedata/feature_store}/entities/column_info.py +4 -4
- {feature_store → wedata/feature_store}/entities/feature_lookup.py +5 -1
- {feature_store → wedata/feature_store}/entities/feature_spec.py +46 -46
- wedata/feature_store/entities/feature_table.py +107 -0
- {feature_store → wedata/feature_store}/entities/training_set.py +13 -12
- {feature_store → wedata/feature_store}/feature_table_client/feature_table_client.py +85 -30
- {feature_store → wedata/feature_store}/spark_client/spark_client.py +30 -56
- wedata/feature_store/training_set_client/training_set_client.py +367 -0
- wedata/feature_store/utils/__init__.py +0 -0
- feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py +108 -54
- {feature_store → wedata/feature_store}/utils/feature_lookup_utils.py +6 -6
- {feature_store → wedata/feature_store}/utils/feature_spec_utils.py +6 -6
- {feature_store → wedata/feature_store}/utils/feature_utils.py +5 -5
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- {feature_store → wedata/feature_store}/utils/schema_utils.py +1 -1
- wedata/feature_store/utils/signature_utils.py +205 -0
- {feature_store → wedata/feature_store}/utils/training_set_utils.py +18 -19
- {feature_store → wedata/feature_store}/utils/uc_utils.py +1 -1
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
- wedata_feature_engineering-0.1.6.dist-info/top_level.txt +1 -0
- feature_store/entities/feature_table.py +0 -164
- feature_store/training_set_client/training_set_client.py +0 -196
- feature_store/utils/common_utils.py +0 -96
- wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
- wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
- {feature_store/constants → wedata/feature_store}/__init__.py +0 -0
- {feature_store/entities → wedata/feature_store/constants}/__init__.py +0 -0
- {feature_store/feature_table_client → wedata/feature_store/entities}/__init__.py +0 -0
- {feature_store → wedata/feature_store}/entities/data_type.py +0 -0
- {feature_store → wedata/feature_store}/entities/environment_variables.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_column_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_function.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_spec_constants.py +0 -0
- {feature_store → wedata/feature_store}/entities/feature_table_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/function_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/on_demand_column_info.py +0 -0
- {feature_store → wedata/feature_store}/entities/source_data_column_info.py +0 -0
- {feature_store/spark_client → wedata/feature_store/feature_table_client}/__init__.py +0 -0
- {feature_store/training_set_client → wedata/feature_store/spark_client}/__init__.py +0 -0
- {feature_store/utils → wedata/feature_store/training_set_client}/__init__.py +0 -0
- {feature_store → wedata/feature_store}/utils/topological_sort.py +0 -0
- {feature_store → wedata/feature_store}/utils/validation_utils.py +0 -0
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
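Between 0.1.4 and 0.1.6 every module moves from the top-level `feature_store` package into `wedata.feature_store` (the wheel's `top_level.txt` appears to export `wedata` instead of `feature_store`). A minimal sketch of how downstream imports change, using modules that appear in the import diffs below:

```python
# Before (0.1.4): modules were importable from the top-level "feature_store" package
from feature_store.entities.feature_lookup import FeatureLookup
from feature_store.utils import uc_utils

# After (0.1.6): the same modules live under the "wedata" top-level package
from wedata.feature_store.entities.feature_lookup import FeatureLookup
from wedata.feature_store.utils import uc_utils
```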
wedata/feature_store/training_set_client/training_set_client.py (new file)

@@ -0,0 +1,367 @@
+import logging
+import os
+from types import ModuleType
+from typing import Any, List, Optional, Set, Union
+
+import mlflow
+from mlflow.models import Model
+from mlflow.utils.file_utils import TempDir, read_yaml
+from pyspark.sql import DataFrame
+
+from wedata.feature_store.constants import constants
+from wedata.feature_store.entities.feature_function import FeatureFunction
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.training_set import TrainingSet
+from wedata.feature_store.spark_client.spark_client import SparkClient
+
+from wedata.feature_store.constants.constants import (
+    _NO_RESULT_TYPE_PASSED,
+    _USE_SPARK_NATIVE_JOIN
+)
+
+from wedata.feature_store.utils import common_utils, training_set_utils, uc_utils
+from wedata.feature_store.utils.signature_utils import get_mlflow_signature_from_feature_spec, \
+    drop_signature_inputs_and_invalid_params
+
+_logger = logging.getLogger(__name__)
+
+FEATURE_SPEC_GRAPH_MAX_COLUMN_INFO = 1000
+
+
+class TrainingSetClient:
+    def __init__(
+        self,
+        spark_client: SparkClient
+    ):
+        self._spark_client = spark_client
+
+    def create_training_set(
+        self,
+        feature_spec: FeatureSpec,
+        label_names: List[str],
+        df: DataFrame,
+        ft_metadata: training_set_utils._FeatureTableMetadata,
+        kwargs,
+    ):
+        uc_function_infos = training_set_utils.get_uc_function_infos(
+            self._spark_client,
+            {odci.udf_name for odci in feature_spec.on_demand_column_infos},
+        )
+
+        training_set_utils.warn_if_non_photon_for_native_spark(
+            kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
+        )
+        return TrainingSet(
+            feature_spec,
+            df,
+            label_names,
+            ft_metadata.feature_table_metadata_map,
+            ft_metadata.feature_table_data_map,
+            uc_function_infos,
+            kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
+        )
+
+    def create_training_set_from_feature_lookups(
+        self,
+        df: DataFrame,
+        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
+        label: Union[str, List[str], None],
+        exclude_columns: List[str],
+        **kwargs,
+    ) -> TrainingSet:
+
+        # Get the list of feature lookups and the list of feature functions
+        features = feature_lookups
+        feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+        feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+        # At most 100 FeatureFunctions are supported
+        if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+            raise ValueError(
+                f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+            )
+
+        # If no label is provided, initialize label_names with an empty list
+        label_names = common_utils.as_list(label, [])
+        del label
+
+        # Validate the DataFrame and labels
+        training_set_utils.verify_df_and_labels(df, label_names, exclude_columns)
+
+        # Get feature table metadata
+        ft_metadata = training_set_utils.get_table_metadata(
+            self._spark_client,
+            {fl.table_name for fl in feature_lookups}
+        )
+
+        column_infos = training_set_utils.get_column_infos(
+            feature_lookups,
+            feature_functions,
+            ft_metadata,
+            df_columns=df.columns,
+            label_names=label_names,
+        )
+
+        training_set_utils.validate_column_infos(
+            self._spark_client,
+            ft_metadata,
+            column_infos.source_data_column_infos,
+            column_infos.feature_column_infos,
+            column_infos.on_demand_column_infos,
+            label_names,
+        )
+
+        # Build feature_spec locally for comparison with the feature spec yaml generated by the
+        # FeatureStore backend. This will be removed once the migration is validated.
+        feature_spec = training_set_utils.build_feature_spec(
+            feature_lookups,
+            ft_metadata,
+            column_infos,
+            exclude_columns
+        )
+
+        return self.create_training_set(
+            feature_spec,
+            label_names,
+            df,
+            ft_metadata,
+            kwargs=kwargs,
+        )
+
+
+    def create_feature_spec(
+        self,
+        name: str,
+        features: List[Union[FeatureLookup, FeatureFunction]],
+        sparkClient: SparkClient,
+        exclude_columns: List[str] = [],
+    ) -> FeatureSpec:
+
+        feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+        feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+        # Maximum of 100 FeatureFunctions is supported
+        if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+            raise ValueError(
+                f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+            )
+
+        # Get feature table metadata and column infos
+        ft_metadata = training_set_utils.get_table_metadata(
+            self._spark_client,
+            {fl.table_name for fl in feature_lookups}
+        )
+        column_infos = training_set_utils.get_column_infos(
+            feature_lookups,
+            feature_functions,
+            ft_metadata,
+        )
+
+        column_infos = training_set_utils.add_inferred_source_columns(column_infos)
+
+        training_set_utils.validate_column_infos(
+            self._spark_client,
+            ft_metadata,
+            column_infos.source_data_column_infos,
+            column_infos.feature_column_infos,
+            column_infos.on_demand_column_infos,
+        )
+
+        feature_spec = training_set_utils.build_feature_spec(
+            feature_lookups,
+            ft_metadata,
+            column_infos,
+            exclude_columns
+        )
+
+        return feature_spec
+
+
+    def log_model(
+        self,
+        model: Any,
+        artifact_path: str,
+        *,
+        flavor: ModuleType,
+        training_set: Optional[TrainingSet],
+        registered_model_name: Optional[str],
+        await_registration_for: int,
+        infer_input_example: bool,
+        **kwargs,
+    ):
+        # Validate only one of the training_set, feature_spec_path arguments is provided.
+        # Retrieve the FeatureSpec, then remove training_set, feature_spec_path from local scope.
+        feature_spec_path = kwargs.pop("feature_spec_path", None)
+        if (training_set is None) == (feature_spec_path is None):
+            raise ValueError(
+                "Either 'training_set' or 'feature_spec_path' must be provided, but not both."
+            )
+        # Retrieve the FeatureSpec and then reformat tables in local metastore to 2L before serialization.
+        # This will make sure the format of the feature spec with local metastore tables is always consistent.
+        if training_set:
+            all_uc_tables = all(
+                [
+                    uc_utils.is_uc_entity(table_info.table_name)
+                    for table_info in training_set.feature_spec.table_infos
+                ]
+            )
+            # training_set.feature_spec is guaranteed to be 3L from FeatureStoreClient.create_training_set.
+            feature_spec = uc_utils.get_feature_spec_with_reformat_full_table_names(
+                training_set.feature_spec
+            )
+            label_type_map = training_set._label_data_types
+
+            labels = training_set._labels
+            df_head = training_set._df.drop(*labels).head()
+        else:
+            # FeatureSpec.load expects the root directory of feature_spec.yaml
+            root_dir, file_name = os.path.split(feature_spec_path)
+            if file_name != FeatureSpec.FEATURE_ARTIFACT_FILE:
+                raise ValueError(
+                    f"'feature_spec_path' must be a path to {FeatureSpec.FEATURE_ARTIFACT_FILE}."
+                )
+            feature_spec = FeatureSpec.load(root_dir)
+
+            # The loaded FeatureSpec is not guaranteed to be 3L.
+            # First call get_feature_spec_with_full_table_names to append the default metastore to 2L names,
+            # as get_feature_spec_with_reformat_full_table_names expects full 3L table names and throws otherwise.
+            # TODO (ML-26593): Consolidate this into a single function that allows either 2L/3L names.
+            feature_spec_with_full_table_names = (
+                uc_utils.get_feature_spec_with_full_table_names(feature_spec)
+            )
+            all_uc_tables = all(
+                [
+                    uc_utils.is_uc_entity(table_info.table_name)
+                    for table_info in feature_spec_with_full_table_names.table_infos
+                ]
+            )
+            feature_spec = uc_utils.get_feature_spec_with_reformat_full_table_names(
+                feature_spec_with_full_table_names
+            )
+            label_type_map = None
+            df_head = None
+        del training_set, feature_spec_path
+
+        override_output_schema = kwargs.pop("output_schema", None)
+        params = kwargs.pop("params", {})
+        params["result_type"] = params.get("result_type", _NO_RESULT_TYPE_PASSED)
+        # Signatures will only be supported for UC-table-only models to
+        # mitigate new online scoring behavior from being a breaking regression for older
+        # models.
+        # See https://docs.google.com/document/d/1L5tLY-kRreRefDfuAM3crXvYlirkcPuUUppU8uIMVM0/edit#
+        try:
+            if all_uc_tables:
+                signature = get_mlflow_signature_from_feature_spec(
+                    feature_spec, label_type_map, override_output_schema, params
+                )
+            else:
+                _logger.warning(
+                    "Model could not be logged with a signature because the training set uses feature tables in "
+                    "Hive Metastore. Migrate the feature tables to Unity Catalog for model to be logged "
+                    "with a signature. "
+                    "See https://docs.databricks.com/en/machine-learning/feature-store/uc/upgrade-feature-table-to-uc.html for more details."
+                )
+                signature = None
+        except Exception as e:
+            _logger.warning(f"Model could not be logged with a signature: {e}")
+            signature = None
+
+        with TempDir() as tmp_location:
+            data_path = os.path.join(tmp_location.path(), "feature_store")
+            raw_mlflow_model = Model(
+                signature=drop_signature_inputs_and_invalid_params(signature)
+            )
+            raw_model_path = os.path.join(
+                data_path, constants.RAW_MODEL_FOLDER
+            )
+            if flavor.FLAVOR_NAME != mlflow.pyfunc.FLAVOR_NAME:
+                flavor.save_model(
+                    model, raw_model_path, mlflow_model=raw_mlflow_model, **kwargs
+                )
+            else:
+                flavor.save_model(
+                    raw_model_path,
+                    mlflow_model=raw_mlflow_model,
+                    python_model=model,
+                    **kwargs,
+                )
+            if not "python_function" in raw_mlflow_model.flavors:
+                raise ValueError(
+                    f"FeatureStoreClient.log_model does not support '{flavor.__name__}' "
+                    f"since it does not have a python_function model flavor."
+                )
+
+            # Re-use the conda environment from the raw model for the packaged model. Later, we may
+            # add an additional requirement for the Feature Store library. At the moment, however,
+            # the databricks-feature-store package is not available via conda or pip.
+            model_env = raw_mlflow_model.flavors["python_function"][mlflow.pyfunc.ENV]
+            if isinstance(model_env, dict):
+                # mlflow 2.0 has multiple supported environments
+                conda_file = model_env[mlflow.pyfunc.EnvType.CONDA]
+            else:
+                conda_file = model_env
+
+            conda_env = read_yaml(raw_model_path, conda_file)
+
+            # Check if databricks-feature-lookup version is specified in conda_env
+            lookup_client_version_specified = False
+            for dependency in conda_env.get("dependencies", []):
+                if isinstance(dependency, dict):
+                    for pip_dep in dependency.get("pip", []):
+                        if pip_dep.startswith(
+                            constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE
+                        ):
+                            lookup_client_version_specified = True
+                            break
+
+            # If databricks-feature-lookup version is not specified, add default version
+            if not lookup_client_version_specified:
+                # Get the pip package string for the databricks-feature-lookup client
+                default_databricks_feature_lookup_pip_package = common_utils.pip_depependency_pinned_major_version(
+                    pip_package_name=constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE,
+                    major_version=constants.FEATURE_LOOKUP_CLIENT_MAJOR_VERSION,
+                )
+                common_utils.add_mlflow_pip_depependency(
+                    conda_env, default_databricks_feature_lookup_pip_package
+                )
+
+            try:
+                if df_head is not None and infer_input_example:
+                    input_example = df_head.asDict()
+                else:
+                    input_example = None
+            except Exception:
+                input_example = None
+
+            # todo:
+            #feature_spec.save(data_path)
+
+            # Log the packaged model. If no run is active, this call will create an active run.
+            mlflow.pyfunc.log_model(
+                artifact_path=artifact_path,
+                loader_module=constants.MLFLOW_MODEL_NAME,
+                data_path=data_path,
+                code_path=None,
+                conda_env=conda_env,
+                signature=signature,
+                input_example=input_example,
+            )
+            if registered_model_name is not None:
+                # The call to mlflow.pyfunc.log_model will create an active run, so it is safe to
+                # obtain the run_id for the active run.
+                run_id = mlflow.tracking.fluent.active_run().info.run_id
+
+                # If the user provided an explicit model_registry_uri when constructing the FeatureStoreClient,
+                # we respect this by setting the registry URI prior to reading the model from Model
+                # Registry.
+                # todo:
+                # if self._model_registry_uri:
+                #     # This command will override any previously set registry_uri.
+                #     mlflow.set_registry_uri(self._model_registry_uri)
+
+                mlflow.register_model(
+                    "runs:/%s/%s" % (run_id, artifact_path),
+                    registered_model_name,
+                    await_registration_for=await_registration_for,
+                )
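The new `TrainingSetClient` above wraps training-set assembly and model logging, replacing the 0.1.4 `feature_store/training_set_client/training_set_client.py` removed in this release. A rough usage sketch follows; the `SparkClient` constructor arguments, the `FeatureLookup` keyword arguments, and the example table, column, and model names are assumptions, since those definitions are not part of this diff:

```python
import mlflow.sklearn
from pyspark.sql import SparkSession
from sklearn.linear_model import LogisticRegression

from wedata.feature_store.entities.feature_lookup import FeatureLookup
from wedata.feature_store.spark_client.spark_client import SparkClient
from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient

spark = SparkSession.builder.getOrCreate()
labels_df = spark.table("orders_with_labels")       # assumed label DataFrame

client = TrainingSetClient(SparkClient(spark))      # SparkClient signature assumed

lookups = [
    FeatureLookup(                                  # constructor kwargs assumed
        table_name="customer_features",
        lookup_key="customer_id",
    ),
]

# Joins the referenced feature tables onto labels_df and returns a TrainingSet
training_set = client.create_training_set_from_feature_lookups(
    df=labels_df,
    feature_lookups=lookups,
    label="label",
    exclude_columns=[],
)

# log_model packages the raw model together with the feature spec; exactly one
# of training_set / feature_spec_path (passed via kwargs) must be provided.
client.log_model(
    model=LogisticRegression(),                     # placeholder model
    artifact_path="model",
    flavor=mlflow.sklearn,
    training_set=training_set,
    registered_model_name=None,
    await_registration_for=300,
    infer_input_example=False,
)
```

Note the Unity-Catalog-only guard in `log_model`: a model signature is inferred only when every table in the feature spec is a UC entity; otherwise the model is logged without a signature and a warning is emitted.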
wedata/feature_store/utils/__init__.py: file without changes
feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py (renamed)

@@ -1,4 +1,8 @@
+"""
+Common utility functions
+"""
 import os
+from collections import Counter
 from datetime import datetime, timezone
 from functools import wraps
 from typing import Any, Dict, List, Optional
@@ -7,8 +11,54 @@ from urllib.parse import urlparse
 import mlflow
 from mlflow.exceptions import RestException
 from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository
+from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
+from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
 from mlflow.utils import databricks_utils

+from wedata.feature_store.constants.constants import MODEL_DATA_PATH_ROOT
+
+
+def validate_table_name(name: str):
+    """
+    Validate the feature table name. Only a single table name is supported; it must not contain dots (e.g. <catalog>.<schema>.<table>).
+
+    Args:
+        name: The table name to validate
+
+    Raises:
+        ValueError: If the table name contains dots or does not follow the naming rules
+    """
+    if not name or not isinstance(name, str):
+        raise ValueError("Table name must be a non-empty string")
+    if name.count('.') > 0:
+        raise ValueError("Feature table name only supports single table name, cannot contain dots (e.g. <catalog>.<schema>.<table>)")
+    if not name[0].isalpha():
+        raise ValueError("Table name must start with a letter")
+    if not all(c.isalnum() or c == '_' for c in name):
+        raise ValueError("Table name can only contain letters, numbers and underscores")
+
+
+def build_full_table_name(table_name: str) -> str:
+    """
+    Build the full table name, formatted as `<database>.<table>`.
+
+    Args:
+        table_name: The input table name (either a short table name or a full table name).
+
+    Returns:
+        The full table name (`<database>.<table>`).
+    """
+
+    # Get the current owner account (UIN) from the environment
+    owner_uin = os.environ.get("WEDATA_OWNER_UIN", "default")
+
+    # Raise an error if owner_uin is empty
+    if not owner_uin:
+        raise ValueError("WEDATA_OWNER_UIN environment variable is not set")
+
+    feature_store_database = f"{owner_uin}.{table_name}"
+
+    return feature_store_database


 def enable_if(condition):
@@ -31,21 +81,6 @@ def enable_if(condition):
     return decorator


-def as_list(obj, default=None):
-    if not obj:
-        return default
-    elif isinstance(obj, list):
-        return obj
-    else:
-        return [obj]
-
-
-def as_directory(path):
-    if path.endswith("/"):
-        return path
-    return f"{path}/"
-
-
 def is_empty(target: str):
     return target is None or len(target.strip()) == 0

@@ -68,10 +103,7 @@ def _get_dbutils():
     raise _NoDbutilsError


-
 def utc_timestamp_ms_from_iso_datetime_string(date_string: str) -> int:
-    # Python uses seconds for its time granularity, so we multiply by 1000 to convert to milliseconds.
-    # The Feature Store backend returns timestamps in milliseconds, so this allows for direct comparisons.
     dt = datetime.fromisoformat(date_string)
     utc_dt = dt.replace(tzinfo=timezone.utc)
     return 1000 * utc_dt.timestamp()
@@ -86,12 +118,7 @@ def pip_depependency_pinned_major_version(pip_package_name, major_version):

 def add_mlflow_pip_depependency(conda_env, pip_package_name):
     """
-    Add a new pip dependency to the conda environment taken from the raw MLflow model.
-    called for conda environments created by MLflow rather than for generic conda environments, because it assumes
-    the conda environment already contains pip as a dependency. In the case of MLflow models, this is a safe
-    assumption because MLflow always needs to add "mlflow" to the conda environment's pip dependencies.
-
-    This is idempotent and will not add a pip package that is already present in the list of pip packages.
+    Add a new pip dependency to the conda environment taken from the raw MLflow model.
     """
     if pip_package_name is None or len(pip_package_name) == 0:
         raise ValueError(
@@ -106,7 +133,6 @@ def add_mlflow_pip_depependency(conda_env, pip_package_name):
             pip_deps = dep["pip"]
             if pip_package_name not in pip_deps:
                 pip_deps.append(pip_package_name)
-    # Fail early rather than at model inference time
     if "dependencies" in conda_env and not found_pip_dependency:
         raise ValueError(
             "Unexpected input: mlflow conda_env did not contain pip as a dependency"
@@ -115,12 +141,7 @@ def add_mlflow_pip_depependency(conda_env, pip_package_name):

 def download_model_artifacts(model_uri, dir):
     """
-    Downloads model artifacts from model_uri to dir.
-
-    :param model_uri: The location, in URI format, of a model. Must be either in the model registry
-      (``models:/<model_name>/<model_version>``, ``models:/<model_name>/<stage>``) or the MLflow
-      artifact store (``runs:/<mlflow_run_id>/run-relative/path/to/model``).
-    :param dir: Location to place downloaded model artifacts.
+    Downloads model artifacts from model_uri to dir.
     """
     if not is_artifact_uri(model_uri):
         raise ValueError(
@@ -147,17 +168,7 @@ def download_model_artifacts(model_uri, dir):

 def validate_params_non_empty(params: Dict[str, Any], expected_params: List[str]):
     """
-    Validate that none of the expected parameters are empty
-    for the first encountered empty parameter.
-
-    Tested with the following param types:
-
-    - str
-    - Dict
-    - List
-
-    :param params: A dictionary of param names -> param values, for example as returned by locals()
-    :param expected_params: List of params to check as non_empty
+    Validate that none of the expected parameters are empty.
     """
     for expected_param in expected_params:
         if expected_param not in params:
@@ -171,8 +182,7 @@ def validate_params_non_empty(params: Dict[str, Any], expected_params: List[str]):

 def is_in_databricks_job():
     """
-    Overrides the behavior of the mlflow databricks_utils.is_in_databricks_job()
-    some jobs have job_id but no run_id, for example one-time job runs.
+    Overrides the behavior of the mlflow databricks_utils.is_in_databricks_job().
     """
     try:
         return databricks_utils.get_job_id() is not None
@@ -182,10 +192,7 @@ def is_in_databricks_job():

 def get_workspace_url() -> Optional[str]:
     """
-    Overrides the behavior of the mlflow.utils.databricks_utils.get_workspace_url()
-    as get_workspace_url does not always return URLs with defined schemes.
-
-    TODO (ML-32050): Refactor this implementation to mlflow, and bump minimum required mlflow version.
+    Overrides the behavior of the mlflow.utils.databricks_utils.get_workspace_url().
     """
     workspace_url = databricks_utils.get_workspace_url()
     if workspace_url and not urlparse(workspace_url).scheme:
@@ -195,20 +202,60 @@ def get_workspace_url() -> Optional[str]:

 def is_in_databricks_env():
     """
-    Determine if we are running in a Databricks environment
-
-    If any invoked methods raise an exception, swallow the exception and return False out of an abundance of caution.
+    Determine if we are running in a Databricks environment.
     """
     try:
         return (
-
-
-
+            is_in_databricks_job()
+            or databricks_utils.is_in_databricks_notebook()
+            or databricks_utils.is_in_databricks_runtime()
         )
     except Exception:
         return False


+def is_artifact_uri(uri):
+    """
+    Checks the artifact URI is associated with a MLflow model or run.
+    The actual URI can be a model URI, model URI + subdirectory, or model URI + path to artifact file.
+    """
+    return ModelsArtifactRepository.is_models_uri(
+        uri
+    ) or RunsArtifactRepository.is_runs_uri(uri)
+
+
+def as_list(obj, default=None):
+    if not obj:
+        return default
+    elif isinstance(obj, list):
+        return obj
+    else:
+        return [obj]
+
+
+def get_duplicates(elements: List[Any]) -> List[Any]:
+    """
+    Returns duplicate elements in the order they first appear.
+    """
+    element_counts = Counter(elements)
+    duplicates = []
+    for e in element_counts.keys():
+        if element_counts[e] > 1:
+            duplicates.append(e)
+    return duplicates
+
+
+def validate_strings_unique(strings: List[str], error_template: str):
+    """
+    Validates all strings are unique, otherwise raise ValueError with the error template and duplicates.
+    Passes single-quoted, comma delimited duplicates to the error template.
+    """
+    duplicate_strings = get_duplicates(strings)
+    if duplicate_strings:
+        duplicates_formatted = ", ".join([f"'{s}'" for s in duplicate_strings])
+        raise ValueError(error_template.format(duplicates_formatted))
+
+
 def sanitize_identifier(identifier: str):
     """
     Sanitize and wrap an identifier with backquotes. For example, "a`b" becomes "`a``b`".
@@ -250,3 +297,10 @@ def unsanitize_identifier(identifier: str):
 # strings containing \ or ' can break sql statements, so escape them.
 def escape_sql_string(input_str: str) -> str:
     return input_str.replace("\\", "\\\\").replace("'", "\\'")
+
+
+def get_unique_list_order(elements: List[Any]) -> List[Any]:
+    """
+    Returns unique elements in the order they first appear.
+    """
+    return list(dict.fromkeys(elements))
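Beyond the rename, `common_utils.py` gains several helpers (`validate_table_name`, `build_full_table_name`, `is_artifact_uri`, `get_duplicates`, `validate_strings_unique`, `get_unique_list_order`). An illustrative example of the table-name and uniqueness helpers, based only on the code shown above; the `WEDATA_OWNER_UIN` value is a placeholder:

```python
import os

from wedata.feature_store.utils.common_utils import (
    build_full_table_name,
    get_duplicates,
    validate_strings_unique,
    validate_table_name,
)

os.environ["WEDATA_OWNER_UIN"] = "100012345678"     # placeholder owner account

validate_table_name("customer_features")            # ok: single name, no dots
print(build_full_table_name("customer_features"))   # "100012345678.customer_features"

print(get_duplicates(["id", "age", "id"]))          # ["id"]
try:
    validate_strings_unique(["id", "age", "id"], "Duplicate columns: {}.")
except ValueError as err:
    print(err)                                      # Duplicate columns: 'id'.
```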
wedata/feature_store/utils/feature_lookup_utils.py

@@ -10,13 +10,13 @@ from pyspark.sql import DataFrame, Window
 from pyspark.sql import functions as F
 from pyspark.sql.functions import sum, unix_timestamp

-from feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.feature_table import FeatureTable

-from feature_store.utils import common_utils, validation_utils, uc_utils
+from wedata.feature_store.utils import common_utils, validation_utils, uc_utils

 _logger = logging.getLogger(__name__)

wedata/feature_store/utils/feature_spec_utils.py

@@ -6,12 +6,12 @@ from typing import Dict, List, Set, Tuple, Type, Union
 import yaml
 from mlflow.utils.file_utils import YamlSafeDumper

-from feature_store.entities.column_info import ColumnInfo
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-from feature_store.utils.topological_sort import topological_sort
+from wedata.feature_store.entities.column_info import ColumnInfo
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+from wedata.feature_store.utils.topological_sort import topological_sort

 DEFAULT_GRAPH_DEPTH_LIMIT = 5

wedata/feature_store/utils/feature_utils.py

@@ -1,11 +1,11 @@
 import copy
 from typing import List, Union

-from feature_store.entities.feature_function import FeatureFunction
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.spark_client.spark_client import SparkClient
-from feature_store.utils import uc_utils
-from feature_store.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names
+from wedata.feature_store.entities.feature_function import FeatureFunction
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.spark_client.spark_client import SparkClient
+from wedata.feature_store.utils import uc_utils
+from wedata.feature_store.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names


 def format_feature_lookups_and_functions(