wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {feature_store → wedata}/__init__.py +1 -1
  2. {feature_store → wedata/feature_store}/client.py +113 -41
  3. {feature_store → wedata/feature_store}/constants/constants.py +19 -0
  4. {feature_store → wedata/feature_store}/entities/column_info.py +4 -4
  5. {feature_store → wedata/feature_store}/entities/feature_lookup.py +5 -1
  6. {feature_store → wedata/feature_store}/entities/feature_spec.py +46 -46
  7. wedata/feature_store/entities/feature_table.py +107 -0
  8. {feature_store → wedata/feature_store}/entities/training_set.py +13 -12
  9. {feature_store → wedata/feature_store}/feature_table_client/feature_table_client.py +85 -30
  10. {feature_store → wedata/feature_store}/spark_client/spark_client.py +30 -56
  11. wedata/feature_store/training_set_client/training_set_client.py +367 -0
  12. wedata/feature_store/utils/__init__.py +0 -0
  13. feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py +108 -54
  14. {feature_store → wedata/feature_store}/utils/feature_lookup_utils.py +6 -6
  15. {feature_store → wedata/feature_store}/utils/feature_spec_utils.py +6 -6
  16. {feature_store → wedata/feature_store}/utils/feature_utils.py +5 -5
  17. wedata/feature_store/utils/on_demand_utils.py +107 -0
  18. {feature_store → wedata/feature_store}/utils/schema_utils.py +1 -1
  19. wedata/feature_store/utils/signature_utils.py +205 -0
  20. {feature_store → wedata/feature_store}/utils/training_set_utils.py +18 -19
  21. {feature_store → wedata/feature_store}/utils/uc_utils.py +1 -1
  22. {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
  23. wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
  24. wedata_feature_engineering-0.1.6.dist-info/top_level.txt +1 -0
  25. feature_store/entities/feature_table.py +0 -164
  26. feature_store/training_set_client/training_set_client.py +0 -196
  27. feature_store/utils/common_utils.py +0 -96
  28. wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
  29. wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
  30. {feature_store/constants → wedata/feature_store}/__init__.py +0 -0
  31. {feature_store/entities → wedata/feature_store/constants}/__init__.py +0 -0
  32. {feature_store/feature_table_client → wedata/feature_store/entities}/__init__.py +0 -0
  33. {feature_store → wedata/feature_store}/entities/data_type.py +0 -0
  34. {feature_store → wedata/feature_store}/entities/environment_variables.py +0 -0
  35. {feature_store → wedata/feature_store}/entities/feature.py +0 -0
  36. {feature_store → wedata/feature_store}/entities/feature_column_info.py +0 -0
  37. {feature_store → wedata/feature_store}/entities/feature_function.py +0 -0
  38. {feature_store → wedata/feature_store}/entities/feature_spec_constants.py +0 -0
  39. {feature_store → wedata/feature_store}/entities/feature_table_info.py +0 -0
  40. {feature_store → wedata/feature_store}/entities/function_info.py +0 -0
  41. {feature_store → wedata/feature_store}/entities/on_demand_column_info.py +0 -0
  42. {feature_store → wedata/feature_store}/entities/source_data_column_info.py +0 -0
  43. {feature_store/spark_client → wedata/feature_store/feature_table_client}/__init__.py +0 -0
  44. {feature_store/training_set_client → wedata/feature_store/spark_client}/__init__.py +0 -0
  45. {feature_store/utils → wedata/feature_store/training_set_client}/__init__.py +0 -0
  46. {feature_store → wedata/feature_store}/utils/topological_sort.py +0 -0
  47. {feature_store → wedata/feature_store}/utils/validation_utils.py +0 -0
  48. {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
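Note: the headline change in 0.1.6 is the new package root. Modules that previously lived under feature_store now live under wedata.feature_store (most of the entries above are path renames, and top_level.txt now lists wedata), so downstream code mainly needs its imports rewritten. A minimal before/after sketch, using entity imports that appear verbatim in the diffs below:

    # 0.1.4
    from feature_store.entities.feature_lookup import FeatureLookup
    from feature_store.entities.feature_function import FeatureFunction

    # 0.1.6
    from wedata.feature_store.entities.feature_lookup import FeatureLookup
    from wedata.feature_store.entities.feature_function import FeatureFunction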
wedata/feature_store/training_set_client/training_set_client.py (new file)
@@ -0,0 +1,367 @@
+ import logging
+ import os
+ from types import ModuleType
+ from typing import Any, List, Optional, Set, Union
+
+ import mlflow
+ from mlflow.models import Model
+ from mlflow.utils.file_utils import TempDir, read_yaml
+ from pyspark.sql import DataFrame
+
+ from wedata.feature_store.constants import constants
+ from wedata.feature_store.entities.feature_function import FeatureFunction
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.entities.training_set import TrainingSet
+ from wedata.feature_store.spark_client.spark_client import SparkClient
+
+ from wedata.feature_store.constants.constants import (
+     _NO_RESULT_TYPE_PASSED,
+     _USE_SPARK_NATIVE_JOIN
+ )
+
+ from wedata.feature_store.utils import common_utils, training_set_utils, uc_utils
+ from wedata.feature_store.utils.signature_utils import get_mlflow_signature_from_feature_spec, \
+     drop_signature_inputs_and_invalid_params
+
+ _logger = logging.getLogger(__name__)
+
+ FEATURE_SPEC_GRAPH_MAX_COLUMN_INFO = 1000
+
+
+ class TrainingSetClient:
+     def __init__(
+         self,
+         spark_client: SparkClient
+     ):
+         self._spark_client = spark_client
+
+     def create_training_set(
+         self,
+         feature_spec: FeatureSpec,
+         label_names: List[str],
+         df: DataFrame,
+         ft_metadata: training_set_utils._FeatureTableMetadata,
+         kwargs,
+     ):
+         uc_function_infos = training_set_utils.get_uc_function_infos(
+             self._spark_client,
+             {odci.udf_name for odci in feature_spec.on_demand_column_infos},
+         )
+
+         training_set_utils.warn_if_non_photon_for_native_spark(
+             kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
+         )
+         return TrainingSet(
+             feature_spec,
+             df,
+             label_names,
+             ft_metadata.feature_table_metadata_map,
+             ft_metadata.feature_table_data_map,
+             uc_function_infos,
+             kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
+         )
+
+     def create_training_set_from_feature_lookups(
+         self,
+         df: DataFrame,
+         feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
+         label: Union[str, List[str], None],
+         exclude_columns: List[str],
+         **kwargs,
+     ) -> TrainingSet:
+
+         # Split the inputs into feature lookups and feature functions
+         features = feature_lookups
+         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+         # At most 100 FeatureFunctions are supported
+         if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+             raise ValueError(
+                 f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+             )
+
+         # If no label is provided, initialize label_names as an empty list
+         label_names = common_utils.as_list(label, [])
+         del label
+
+         # Validate the DataFrame and the labels
+         training_set_utils.verify_df_and_labels(df, label_names, exclude_columns)
+
+         # Fetch feature table metadata
+         ft_metadata = training_set_utils.get_table_metadata(
+             self._spark_client,
+             {fl.table_name for fl in feature_lookups}
+         )
+
+         column_infos = training_set_utils.get_column_infos(
+             feature_lookups,
+             feature_functions,
+             ft_metadata,
+             df_columns=df.columns,
+             label_names=label_names,
+         )
+
+         training_set_utils.validate_column_infos(
+             self._spark_client,
+             ft_metadata,
+             column_infos.source_data_column_infos,
+             column_infos.feature_column_infos,
+             column_infos.on_demand_column_infos,
+             label_names,
+         )
+
+         # Build feature_spec locally for comparison with the feature spec yaml generated by the
+         # FeatureStore backend. This will be removed once the migration is validated.
+         feature_spec = training_set_utils.build_feature_spec(
+             feature_lookups,
+             ft_metadata,
+             column_infos,
+             exclude_columns
+         )
+
+         return self.create_training_set(
+             feature_spec,
+             label_names,
+             df,
+             ft_metadata,
+             kwargs=kwargs,
+         )
+
+
+     def create_feature_spec(
+         self,
+         name: str,
+         features: List[Union[FeatureLookup, FeatureFunction]],
+         sparkClient: SparkClient,
+         exclude_columns: List[str] = [],
+     ) -> FeatureSpec:
+
+         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+         # Maximum of 100 FeatureFunctions is supported
+         if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+             raise ValueError(
+                 f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+             )
+
+         # Get feature table metadata and column infos
+         ft_metadata = training_set_utils.get_table_metadata(
+             self._spark_client,
+             {fl.table_name for fl in feature_lookups}
+         )
+         column_infos = training_set_utils.get_column_infos(
+             feature_lookups,
+             feature_functions,
+             ft_metadata,
+         )
+
+         column_infos = training_set_utils.add_inferred_source_columns(column_infos)
+
+         training_set_utils.validate_column_infos(
+             self._spark_client,
+             ft_metadata,
+             column_infos.source_data_column_infos,
+             column_infos.feature_column_infos,
+             column_infos.on_demand_column_infos,
+         )
+
+         feature_spec = training_set_utils.build_feature_spec(
+             feature_lookups,
+             ft_metadata,
+             column_infos,
+             exclude_columns
+         )
+
+         return feature_spec
+
+
+     def log_model(
+         self,
+         model: Any,
+         artifact_path: str,
+         *,
+         flavor: ModuleType,
+         training_set: Optional[TrainingSet],
+         registered_model_name: Optional[str],
+         await_registration_for: int,
+         infer_input_example: bool,
+         **kwargs,
+     ):
+         # Validate only one of the training_set, feature_spec_path arguments is provided.
+         # Retrieve the FeatureSpec, then remove training_set, feature_spec_path from local scope.
+         feature_spec_path = kwargs.pop("feature_spec_path", None)
+         if (training_set is None) == (feature_spec_path is None):
+             raise ValueError(
+                 "Either 'training_set' or 'feature_spec_path' must be provided, but not both."
+             )
+         # Retrieve the FeatureSpec and then reformat tables in local metastore to 2L before serialization.
+         # This will make sure the format of the feature spec with local metastore tables is always consistent.
+         if training_set:
+             all_uc_tables = all(
+                 [
+                     uc_utils.is_uc_entity(table_info.table_name)
+                     for table_info in training_set.feature_spec.table_infos
+                 ]
+             )
+             # training_set.feature_spec is guaranteed to be 3L from FeatureStoreClient.create_training_set.
+             feature_spec = uc_utils.get_feature_spec_with_reformat_full_table_names(
+                 training_set.feature_spec
+             )
+             label_type_map = training_set._label_data_types
+
+             labels = training_set._labels
+             df_head = training_set._df.drop(*labels).head()
+         else:
+             # FeatureSpec.load expects the root directory of feature_spec.yaml
+             root_dir, file_name = os.path.split(feature_spec_path)
+             if file_name != FeatureSpec.FEATURE_ARTIFACT_FILE:
+                 raise ValueError(
+                     f"'feature_spec_path' must be a path to {FeatureSpec.FEATURE_ARTIFACT_FILE}."
+                 )
+             feature_spec = FeatureSpec.load(root_dir)
+
+             # The loaded FeatureSpec is not guaranteed to be 3L.
+             # First call get_feature_spec_with_full_table_names to append the default metastore to 2L names,
+             # as get_feature_spec_with_reformat_full_table_names expects full 3L table names and throws otherwise.
+             # TODO (ML-26593): Consolidate this into a single function that allows either 2L/3L names.
+             feature_spec_with_full_table_names = (
+                 uc_utils.get_feature_spec_with_full_table_names(feature_spec)
+             )
+             all_uc_tables = all(
+                 [
+                     uc_utils.is_uc_entity(table_info.table_name)
+                     for table_info in feature_spec_with_full_table_names.table_infos
+                 ]
+             )
+             feature_spec = uc_utils.get_feature_spec_with_reformat_full_table_names(
+                 feature_spec_with_full_table_names
+             )
+             label_type_map = None
+             df_head = None
+         del training_set, feature_spec_path
+
+         override_output_schema = kwargs.pop("output_schema", None)
+         params = kwargs.pop("params", {})
+         params["result_type"] = params.get("result_type", _NO_RESULT_TYPE_PASSED)
+         # Signatures will only be supported for UC-table-only models to
+         # mitigate new online scoring behavior from being a breaking regression for older
+         # models.
+         # See https://docs.google.com/document/d/1L5tLY-kRreRefDfuAM3crXvYlirkcPuUUppU8uIMVM0/edit#
+         try:
+             if all_uc_tables:
+                 signature = get_mlflow_signature_from_feature_spec(
+                     feature_spec, label_type_map, override_output_schema, params
+                 )
+             else:
+                 _logger.warning(
+                     "Model could not be logged with a signature because the training set uses feature tables in "
+                     "Hive Metastore. Migrate the feature tables to Unity Catalog for model to be logged "
+                     "with a signature. "
+                     "See https://docs.databricks.com/en/machine-learning/feature-store/uc/upgrade-feature-table-to-uc.html for more details."
+                 )
+                 signature = None
+         except Exception as e:
+             _logger.warning(f"Model could not be logged with a signature: {e}")
+             signature = None
+
+         with TempDir() as tmp_location:
+             data_path = os.path.join(tmp_location.path(), "feature_store")
+             raw_mlflow_model = Model(
+                 signature=drop_signature_inputs_and_invalid_params(signature)
+             )
+             raw_model_path = os.path.join(
+                 data_path, constants.RAW_MODEL_FOLDER
+             )
+             if flavor.FLAVOR_NAME != mlflow.pyfunc.FLAVOR_NAME:
+                 flavor.save_model(
+                     model, raw_model_path, mlflow_model=raw_mlflow_model, **kwargs
+                 )
+             else:
+                 flavor.save_model(
+                     raw_model_path,
+                     mlflow_model=raw_mlflow_model,
+                     python_model=model,
+                     **kwargs,
+                 )
+             if not "python_function" in raw_mlflow_model.flavors:
+                 raise ValueError(
+                     f"FeatureStoreClient.log_model does not support '{flavor.__name__}' "
+                     f"since it does not have a python_function model flavor."
+                 )
+
+             # Re-use the conda environment from the raw model for the packaged model. Later, we may
+             # add an additional requirement for the Feature Store library. At the moment, however,
+             # the databricks-feature-store package is not available via conda or pip.
+             model_env = raw_mlflow_model.flavors["python_function"][mlflow.pyfunc.ENV]
+             if isinstance(model_env, dict):
+                 # mlflow 2.0 has multiple supported environments
+                 conda_file = model_env[mlflow.pyfunc.EnvType.CONDA]
+             else:
+                 conda_file = model_env
+
+             conda_env = read_yaml(raw_model_path, conda_file)
+
+             # Check if databricks-feature-lookup version is specified in conda_env
+             lookup_client_version_specified = False
+             for dependency in conda_env.get("dependencies", []):
+                 if isinstance(dependency, dict):
+                     for pip_dep in dependency.get("pip", []):
+                         if pip_dep.startswith(
+                             constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE
+                         ):
+                             lookup_client_version_specified = True
+                             break
+
+             # If databricks-feature-lookup version is not specified, add default version
+             if not lookup_client_version_specified:
+                 # Get the pip package string for the databricks-feature-lookup client
+                 default_databricks_feature_lookup_pip_package = common_utils.pip_depependency_pinned_major_version(
+                     pip_package_name=constants.FEATURE_LOOKUP_CLIENT_PIP_PACKAGE,
+                     major_version=constants.FEATURE_LOOKUP_CLIENT_MAJOR_VERSION,
+                 )
+                 common_utils.add_mlflow_pip_depependency(
+                     conda_env, default_databricks_feature_lookup_pip_package
+                 )
+
+             try:
+                 if df_head is not None and infer_input_example:
+                     input_example = df_head.asDict()
+                 else:
+                     input_example = None
+             except Exception:
+                 input_example = None
+
+             # todo:
+             #feature_spec.save(data_path)
+
+             # Log the packaged model. If no run is active, this call will create an active run.
+             mlflow.pyfunc.log_model(
+                 artifact_path=artifact_path,
+                 loader_module=constants.MLFLOW_MODEL_NAME,
+                 data_path=data_path,
+                 code_path=None,
+                 conda_env=conda_env,
+                 signature=signature,
+                 input_example=input_example,
+             )
+         if registered_model_name is not None:
+             # The call to mlflow.pyfunc.log_model will create an active run, so it is safe to
+             # obtain the run_id for the active run.
+             run_id = mlflow.tracking.fluent.active_run().info.run_id
+
+             # If the user provided an explicit model_registry_uri when constructing the FeatureStoreClient,
+             # we respect this by setting the registry URI prior to reading the model from Model
+             # Registry.
+             # todo:
+             # if self._model_registry_uri:
+             #     # This command will override any previously set registry_uri.
+             #     mlflow.set_registry_uri(self._model_registry_uri)
+
+             mlflow.register_model(
+                 "runs:/%s/%s" % (run_id, artifact_path),
+                 registered_model_name,
+                 await_registration_for=await_registration_for,
+             )
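Usage note: the new training_set_client module packages the training-set workflow (create_training_set_from_feature_lookups, create_feature_spec, log_model) on top of SparkClient. A hedged sketch of the intended call pattern follows; only the TrainingSetClient method names and keyword arguments are taken from the diff above, while the SparkClient constructor and the FeatureLookup keyword names are assumptions modeled on the Databricks-style API, and the table names are placeholders:

    from pyspark.sql import SparkSession

    from wedata.feature_store.entities.feature_lookup import FeatureLookup
    from wedata.feature_store.spark_client.spark_client import SparkClient
    from wedata.feature_store.training_set_client.training_set_client import TrainingSetClient

    spark = SparkSession.builder.getOrCreate()

    # Assumption: SparkClient wraps the active SparkSession; its constructor is not shown in this diff.
    client = TrainingSetClient(spark_client=SparkClient(spark))

    # Assumption: FeatureLookup keywords follow the Databricks-style API.
    lookups = [
        FeatureLookup(
            table_name="user_features",           # single table name; see validate_table_name below
            lookup_key="user_id",
            feature_names=["age", "num_orders"],
        )
    ]

    # Placeholder label table containing the lookup key and the label column.
    label_df = spark.table("training_labels")
    training_set = client.create_training_set_from_feature_lookups(
        df=label_df,
        feature_lookups=lookups,
        label="clicked",
        exclude_columns=[],
    )

The same client exposes log_model, which wraps mlflow.pyfunc.log_model, attaches an MLflow signature only when every referenced table is a Unity Catalog entity, and pins a databricks-feature-lookup pip dependency into the model's conda environment when one is not already present.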
wedata/feature_store/utils/__init__.py: File without changes
feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py
@@ -1,4 +1,8 @@
+ """
+ Common utility functions
+ """
  import os
+ from collections import Counter
  from datetime import datetime, timezone
  from functools import wraps
  from typing import Any, Dict, List, Optional
@@ -7,8 +11,54 @@ from urllib.parse import urlparse
  import mlflow
  from mlflow.exceptions import RestException
  from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository
+ from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
+ from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
  from mlflow.utils import databricks_utils
 
+ from wedata.feature_store.constants.constants import MODEL_DATA_PATH_ROOT
+
+
+ def validate_table_name(name: str):
+     """
+     Validate the feature table name. Only a single table name is supported; it must not contain dots (e.g. <catalog>.<schema>.<table>).
+
+     Args:
+         name: The table name to validate
+
+     Raises:
+         ValueError: If the table name contains dots or is otherwise invalid
+     """
+     if not name or not isinstance(name, str):
+         raise ValueError("Table name must be a non-empty string")
+     if name.count('.') > 0:
+         raise ValueError("Feature table name only supports single table name, cannot contain dots (e.g. <catalog>.<schema>.<table>)")
+     if not name[0].isalpha():
+         raise ValueError("Table name must start with a letter")
+     if not all(c.isalnum() or c == '_' for c in name):
+         raise ValueError("Table name can only contain letters, numbers and underscores")
+
+
+ def build_full_table_name(table_name: str) -> str:
+     """
+     Build the full table name, formatted as `<database>.<table>`.
+
+     Args:
+         table_name: The input table name (either a short or a fully qualified name).
+
+     Returns:
+         The full table name (`<database>.<table>`).
+     """
+
+     # Read the current owner account name from the environment
+     owner_uin = os.environ.get("WEDATA_OWNER_UIN", "default")
+
+     # Raise an error if owner_uin is empty
+     if not owner_uin:
+         raise ValueError("WEDATA_OWNER_UIN environment variable is not set")
+
+     feature_store_database = f"{owner_uin}.{table_name}"
+
+     return feature_store_database
 
 
  def enable_if(condition):
@@ -31,21 +81,6 @@ def enable_if(condition):
      return decorator
 
 
- def as_list(obj, default=None):
-     if not obj:
-         return default
-     elif isinstance(obj, list):
-         return obj
-     else:
-         return [obj]
-
-
- def as_directory(path):
-     if path.endswith("/"):
-         return path
-     return f"{path}/"
-
-
  def is_empty(target: str):
      return target is None or len(target.strip()) == 0
 
@@ -68,10 +103,7 @@ def _get_dbutils():
      raise _NoDbutilsError
 
 
-
  def utc_timestamp_ms_from_iso_datetime_string(date_string: str) -> int:
-     # Python uses seconds for its time granularity, so we multiply by 1000 to convert to milliseconds.
-     # The Feature Store backend returns timestamps in milliseconds, so this allows for direct comparisons.
      dt = datetime.fromisoformat(date_string)
      utc_dt = dt.replace(tzinfo=timezone.utc)
      return 1000 * utc_dt.timestamp()
@@ -86,12 +118,7 @@ def pip_depependency_pinned_major_version(pip_package_name, major_version):
 
  def add_mlflow_pip_depependency(conda_env, pip_package_name):
      """
-     Add a new pip dependency to the conda environment taken from the raw MLflow model. This method should only be
-     called for conda environments created by MLflow rather than for generic conda environments, because it assumes
-     the conda environment already contains pip as a dependency. In the case of MLflow models, this is a safe
-     assumption because MLflow always needs to add "mlflow" to the conda environment's pip dependencies.
-
-     This is idempotent and will not add a pip package that is already present in the list of pip packages.
+     Add a new pip dependency to the conda environment taken from the raw MLflow model.
      """
      if pip_package_name is None or len(pip_package_name) == 0:
          raise ValueError(
@@ -106,7 +133,6 @@ def add_mlflow_pip_depependency(conda_env, pip_package_name):
              pip_deps = dep["pip"]
              if pip_package_name not in pip_deps:
                  pip_deps.append(pip_package_name)
-     # Fail early rather than at model inference time
      if "dependencies" in conda_env and not found_pip_dependency:
          raise ValueError(
              "Unexpected input: mlflow conda_env did not contain pip as a dependency"
@@ -115,12 +141,7 @@
 
  def download_model_artifacts(model_uri, dir):
      """
-     Downloads model artifacts from model_uri to dir. Intended for use only with Feature Store packaged models.
-
-     :param model_uri: The location, in URI format, of a model. Must be either in the model registry
-         (``models:/<model_name>/<model_version>``, ``models:/<model_name>/<stage>``) or the MLflow
-         artifact store (``runs:/<mlflow_run_id>/run-relative/path/to/model``).
-     :param dir: Location to place downloaded model artifacts.
+     Downloads model artifacts from model_uri to dir.
      """
      if not is_artifact_uri(model_uri):
          raise ValueError(
@@ -147,17 +168,7 @@
 
  def validate_params_non_empty(params: Dict[str, Any], expected_params: List[str]):
      """
-     Validate that none of the expected parameters are empty, otherwise raise a Value error
-     for the first encountered empty parameter.
-
-     Tested with the following param types:
-
-     - str
-     - Dict
-     - List
-
-     :param params: A dictionary of param names -> param values, for example as returned by locals()
-     :param expected_params: List of params to check as non_empty
+     Validate that none of the expected parameters are empty.
      """
      for expected_param in expected_params:
          if expected_param not in params:
@@ -171,8 +182,7 @@ def validate_params_non_empty(params: Dict[str, Any], expected_params: List[str]
 
  def is_in_databricks_job():
      """
-     Overrides the behavior of the mlflow databricks_utils.is_in_databricks_job() to account for the fact that
-     some jobs have job_id but no run_id, for example one-time job runs.
+     Overrides the behavior of the mlflow databricks_utils.is_in_databricks_job().
      """
      try:
          return databricks_utils.get_job_id() is not None
@@ -182,10 +192,7 @@ def is_in_databricks_job():
 
  def get_workspace_url() -> Optional[str]:
      """
-     Overrides the behavior of the mlflow.utils.databricks_utils.get_workspace_url(),
-     as get_workspace_url does not always return URLs with defined schemes.
-
-     TODO (ML-32050): Refactor this implementation to mlflow, and bump minimum required mlflow version.
+     Overrides the behavior of the mlflow.utils.databricks_utils.get_workspace_url().
      """
      workspace_url = databricks_utils.get_workspace_url()
      if workspace_url and not urlparse(workspace_url).scheme:
@@ -195,20 +202,60 @@ def get_workspace_url() -> Optional[str]:
 
  def is_in_databricks_env():
      """
-     Determine if we are running in a Databricks environment (DBR, MLR, DLT, DCS, Mlflow Projects, Run Cmd 1.2 API, etc)
-
-     If any invoked methods raise an exception, swallow the exception and return False out of an abundance of caution.
+     Determine if we are running in a Databricks environment.
      """
      try:
          return (
-             is_in_databricks_job()
-             or databricks_utils.is_in_databricks_notebook()
-             or databricks_utils.is_in_databricks_runtime()
+             is_in_databricks_job()
+             or databricks_utils.is_in_databricks_notebook()
+             or databricks_utils.is_in_databricks_runtime()
          )
      except Exception:
          return False
 
 
+ def is_artifact_uri(uri):
+     """
+     Checks the artifact URI is associated with a MLflow model or run.
+     The actual URI can be a model URI, model URI + subdirectory, or model URI + path to artifact file.
+     """
+     return ModelsArtifactRepository.is_models_uri(
+         uri
+     ) or RunsArtifactRepository.is_runs_uri(uri)
+
+
+ def as_list(obj, default=None):
+     if not obj:
+         return default
+     elif isinstance(obj, list):
+         return obj
+     else:
+         return [obj]
+
+
+ def get_duplicates(elements: List[Any]) -> List[Any]:
+     """
+     Returns duplicate elements in the order they first appear.
+     """
+     element_counts = Counter(elements)
+     duplicates = []
+     for e in element_counts.keys():
+         if element_counts[e] > 1:
+             duplicates.append(e)
+     return duplicates
+
+
+ def validate_strings_unique(strings: List[str], error_template: str):
+     """
+     Validates all strings are unique, otherwise raise ValueError with the error template and duplicates.
+     Passes single-quoted, comma delimited duplicates to the error template.
+     """
+     duplicate_strings = get_duplicates(strings)
+     if duplicate_strings:
+         duplicates_formatted = ", ".join([f"'{s}'" for s in duplicate_strings])
+         raise ValueError(error_template.format(duplicates_formatted))
+
+
  def sanitize_identifier(identifier: str):
      """
      Sanitize and wrap an identifier with backquotes. For example, "a`b" becomes "`a``b`".
@@ -250,3 +297,10 @@ def unsanitize_identifier(identifier: str):
  # strings containing \ or ' can break sql statements, so escape them.
  def escape_sql_string(input_str: str) -> str:
      return input_str.replace("\\", "\\\\").replace("'", "\\'")
+
+
+ def get_unique_list_order(elements: List[Any]) -> List[Any]:
+     """
+     Returns unique elements in the order they first appear.
+     """
+     return list(dict.fromkeys(elements))
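Usage note: besides the move from feature_store/utils/utils.py, common_utils picks up WeData-specific naming helpers (validate_table_name and build_full_table_name, the latter driven by the WEDATA_OWNER_UIN environment variable) plus small list and URI helpers (is_artifact_uri, as_list, get_duplicates, validate_strings_unique, get_unique_list_order). A short sketch of their behavior, based only on the definitions above; the account id is a placeholder:

    import os

    from wedata.feature_store.utils import common_utils

    os.environ["WEDATA_OWNER_UIN"] = "100012345678"           # placeholder owner account

    common_utils.validate_table_name("user_features")         # passes: single name, letters/digits/underscores
    common_utils.build_full_table_name("user_features")       # -> "100012345678.user_features"

    common_utils.as_list("clicked")                           # -> ["clicked"]
    common_utils.get_duplicates(["a", "b", "a", "c", "b"])    # -> ["a", "b"]
    common_utils.get_unique_list_order(["a", "b", "a", "c"])  # -> ["a", "b", "c"]

    try:
        common_utils.validate_strings_unique(
            ["user_id", "user_id"], "Duplicate output columns: {}"
        )
    except ValueError as err:
        print(err)                                            # Duplicate output columns: 'user_id'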
{feature_store → wedata/feature_store}/utils/feature_lookup_utils.py
@@ -10,13 +10,13 @@ from pyspark.sql import DataFrame, Window
  from pyspark.sql import functions as F
  from pyspark.sql.functions import sum, unix_timestamp
 
- from feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
- from feature_store.entities.feature_column_info import FeatureColumnInfo
- from feature_store.entities.feature_lookup import FeatureLookup
- from feature_store.entities.feature_spec import FeatureSpec
- from feature_store.entities.feature_table import FeatureTable
+ from wedata.feature_store.entities.environment_variables import BROADCAST_JOIN_THRESHOLD
+ from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.entities.feature_table import FeatureTable
 
- from feature_store.utils import common_utils, validation_utils, uc_utils
+ from wedata.feature_store.utils import common_utils, validation_utils, uc_utils
 
  _logger = logging.getLogger(__name__)
 
{feature_store → wedata/feature_store}/utils/feature_spec_utils.py
@@ -6,12 +6,12 @@ from typing import Dict, List, Set, Tuple, Type, Union
  import yaml
  from mlflow.utils.file_utils import YamlSafeDumper
 
- from feature_store.entities.column_info import ColumnInfo
- from feature_store.entities.feature_column_info import FeatureColumnInfo
- from feature_store.entities.feature_spec import FeatureSpec
- from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
- from feature_store.entities.source_data_column_info import SourceDataColumnInfo
- from feature_store.utils.topological_sort import topological_sort
+ from wedata.feature_store.entities.column_info import ColumnInfo
+ from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+ from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+ from wedata.feature_store.utils.topological_sort import topological_sort
 
  DEFAULT_GRAPH_DEPTH_LIMIT = 5
 
{feature_store → wedata/feature_store}/utils/feature_utils.py
@@ -1,11 +1,11 @@
  import copy
  from typing import List, Union
 
- from feature_store.entities.feature_function import FeatureFunction
- from feature_store.entities.feature_lookup import FeatureLookup
- from feature_store.spark_client.spark_client import SparkClient
- from feature_store.utils import uc_utils
- from feature_store.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names
+ from wedata.feature_store.entities.feature_function import FeatureFunction
+ from wedata.feature_store.entities.feature_lookup import FeatureLookup
+ from wedata.feature_store.spark_client.spark_client import SparkClient
+ from wedata.feature_store.utils import uc_utils
+ from wedata.feature_store.utils.feature_lookup_utils import get_feature_lookups_with_full_table_names
 
 
  def format_feature_lookups_and_functions(