wedata-feature-engineering 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,281 @@
+ import copy
+ import re
+ from typing import Optional, Set
+
+ from feature_store.entities.feature_spec import FeatureSpec
+
+ SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
+ TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"
+ THREE_LEVEL_NAMESPACE_REGEX = (
+     r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)(\.[^\. \/\x00-\x1F\x7F]+)$"
+ )
+
+ HIVE_METASTORE_NAME = "hive_metastore"
+ # These two catalog names both point to the workspace-local default HMS (Hive metastore).
+ LOCAL_METASTORE_NAMES = [HIVE_METASTORE_NAME, "spark_catalog"]
+
+
+ # Get the full table name in the form <catalog_name>.<schema_name>.<table_name>,
+ # given a user-specified table name and the current catalog and schema.
+ def get_full_table_name(
+     table_name: str,
+     current_catalog: str,
+     current_schema: str,
+ ) -> str:
+     _check_qualified_table_names({table_name})
+     return _get_full_name_for_entity(
+         name=table_name,
+         current_catalog=current_catalog,
+         current_schema=current_schema,
+         entity_type="table",
+     )
+
+
+ # Get the full UDF name in the form <catalog_name>.<schema_name>.<udf_name>,
+ # given a user-specified UDF name and the current catalog and schema.
+ def get_full_udf_name(
+     udf_name: str,
+     current_catalog: str,
+     current_schema: str,
+ ) -> str:
+     _check_qualified_udf_names({udf_name})
+     return _get_full_name_for_entity(
+         name=udf_name,
+         current_catalog=current_catalog,
+         current_schema=current_schema,
+         entity_type="UDF",
+     )
+
+
+ def _get_full_name_for_entity(
+     name: str,
+     current_catalog: str,
+     current_schema: str,
+     entity_type: str,
+ ) -> str:
+     if not _is_single_level_name(current_catalog) or not _is_single_level_name(
+         current_schema
+     ):
+         raise ValueError(
+             f"Invalid catalog '{current_catalog}' or "
+             f"schema '{current_schema}' name for {entity_type} '{name}'."
+         )
+     if _is_single_level_name(name):
+         full_name = f"{current_catalog}.{current_schema}.{name}"
+     elif _is_two_level_name(name):
+         full_name = f"{current_catalog}.{name}"
+     elif _is_three_level_name(name):
+         full_name = name
+     else:
+         raise _invalid_names_error({name}, entity_type)
+
+     catalog, schema, name = full_name.split(".")
+     if catalog in LOCAL_METASTORE_NAMES:
+         return f"{HIVE_METASTORE_NAME}.{schema}.{name}"
+     return full_name
+
+
+ def _replace_catalog_name(full_name: str, catalog: Optional[str]) -> str:
+     if catalog is None:
+         return full_name
+     name_sec = full_name.split(".")
+     name_sec[0] = catalog
+     return ".".join(name_sec)
+
+
+ # Local metastore tables in feature_spec.yaml are all stored in 2L.
+ # Standardize table names to be all in 3L to avoid erroneously reading data from UC tables.
+ def get_feature_spec_with_full_table_names(
+     feature_spec: FeatureSpec, catalog_name_override: Optional[str] = None
+ ) -> FeatureSpec:
+     column_info_table_names = [
+         column_info.table_name for column_info in feature_spec.feature_column_infos
+     ]
+     table_info_table_names = [
+         table_info.table_name for table_info in feature_spec.table_infos
+     ]
+     _check_qualified_table_names(set(column_info_table_names))
+     _check_qualified_table_names(set(table_info_table_names))
+     invalid_table_names = list(
+         filter(_is_single_level_name, column_info_table_names)
+     ) + list(filter(_is_single_level_name, table_info_table_names))
+     if len(invalid_table_names) > 0:
+         raise _invalid_names_error(set(invalid_table_names), "table")
+     standardized_feature_spec = copy.deepcopy(feature_spec)
+     for column_info in standardized_feature_spec.feature_column_infos:
+         if _is_two_level_name(column_info.table_name):
+             column_info._table_name = f"{HIVE_METASTORE_NAME}.{column_info.table_name}"
+         column_info._table_name = _replace_catalog_name(
+             column_info.table_name, catalog_name_override
+         )
+     for column_info in standardized_feature_spec.on_demand_column_infos:
+         if _is_two_level_name(column_info.udf_name):
+             column_info._udf_name = f"{HIVE_METASTORE_NAME}.{column_info.udf_name}"
+         column_info._udf_name = _replace_catalog_name(
+             column_info.udf_name, catalog_name_override
+         )
+     for table_info in standardized_feature_spec.table_infos:
+         if _is_two_level_name(table_info.table_name):
+             table_info._table_name = f"{HIVE_METASTORE_NAME}.{table_info.table_name}"
+         table_info._table_name = _replace_catalog_name(
+             table_info.table_name, catalog_name_override
+         )
+     for udf_info in standardized_feature_spec.function_infos:
+         udf_info._udf_name = _replace_catalog_name(
+             udf_info.udf_name, catalog_name_override
+         )
+     return standardized_feature_spec
+
+
+ # Reformat a 3L table name to 2L for tables in the local metastore. This is used when interacting with the catalog
+ # client and when serializing a workspace-local feature spec for scoring.
+ def reformat_full_table_name(full_table_name: str) -> str:
+     if not _is_three_level_name(full_table_name):
+         raise _invalid_names_error({full_table_name}, "table")
+     catalog, schema, table = full_table_name.split(".")
+     if catalog in LOCAL_METASTORE_NAMES:
+         return f"{schema}.{table}"
+     return full_table_name
+
+
+ # Reformat table names in feature_spec with reformat_full_table_name
+ def get_feature_spec_with_reformat_full_table_names(
+     feature_spec: FeatureSpec,
+ ) -> FeatureSpec:
+     column_info_table_names = [
+         column_info.table_name for column_info in feature_spec.feature_column_infos
+     ]
+     table_info_table_names = [
+         table_info.table_name for table_info in feature_spec.table_infos
+     ]
+     _check_qualified_table_names(set(column_info_table_names))
+     _check_qualified_table_names(set(table_info_table_names))
+     invalid_table_names = list(
+         filter(lambda name: not _is_three_level_name(name), column_info_table_names)
+     ) + list(
+         filter(lambda name: not _is_three_level_name(name), table_info_table_names)
+     )
+     if len(invalid_table_names) > 0:
+         raise _invalid_names_error(set(invalid_table_names), "table")
+     standardized_feature_spec = copy.deepcopy(feature_spec)
+     for column_info in standardized_feature_spec.feature_column_infos:
+         column_info._table_name = reformat_full_table_name(column_info.table_name)
+     for table_info in standardized_feature_spec.table_infos:
+         table_info._table_name = reformat_full_table_name(table_info.table_name)
+     return standardized_feature_spec
+
+
+ def _invalid_names_error(invalid_names: Set[str], entity_type: str) -> ValueError:
+     return ValueError(
+         f"Invalid {entity_type} name{'s' if len(invalid_names) > 1 else ''} '{', '.join(invalid_names)}'."
+     )
+
+
+ def _is_qualified_entity_name(name) -> bool:
+     return isinstance(name, str) and (
+         _is_single_level_name(name)
+         or _is_two_level_name(name)
+         or _is_three_level_name(name)
+     )
+
+
+ def _is_single_level_name(name) -> bool:
+     return (
+         isinstance(name, str)
+         and re.match(SINGLE_LEVEL_NAMESPACE_REGEX, name) is not None
+     )
+
+
+ def _is_two_level_name(name) -> bool:
+     return (
+         isinstance(name, str) and re.match(TWO_LEVEL_NAMESPACE_REGEX, name) is not None
+     )
+
+
+ def _is_three_level_name(name) -> bool:
+     return (
+         isinstance(name, str)
+         and re.match(THREE_LEVEL_NAMESPACE_REGEX, name) is not None
+     )
+
+
+ def unsupported_api_error_uc(api_name):
+     return ValueError(f"{api_name} is not supported for Unity Catalog tables.")
+
+
+ # check if entity is in UC
+ def is_uc_entity(full_entity_name) -> bool:
+     catalog_name, schema_name, table_name = full_entity_name.split(".")
+     return not is_default_hms_table(full_entity_name)
+
+
+ def is_default_hms_table(full_table_name) -> bool:
+     catalog_name, schema_name, table_name = full_table_name.split(".")
+     return catalog_name in LOCAL_METASTORE_NAMES
+
+
+ # check if UDF names are in the correct format - 1L, 2L or 3L
+ def _check_qualified_udf_names(udf_names: Set[str]):
+     unqualified_udf_names = [
+         udf_name for udf_name in udf_names if not _is_qualified_entity_name(udf_name)
+     ]
+     if len(unqualified_udf_names) > 0:
+         raise ValueError(
+             f"UDF name{'s' if len(unqualified_udf_names) > 1 else ''} "
+             f"'{', '.join(map(str, unqualified_udf_names))}' must have the form "
+             f"<catalog_name>.<schema_name>.<udf_name>, <schema_name>.<udf_name>, "
+             f"or <udf_name> and cannot include space or forward-slash."
+         )
+
+
+ # check if table names are in the correct format - 1L, 2L or 3L
+ def _check_qualified_table_names(feature_table_names: Set[str]):
+     unqualified_table_names = list(
+         filter(
+             lambda table_name: not _is_qualified_entity_name(table_name),
+             feature_table_names,
+         )
+     )
+     if len(unqualified_table_names) > 0:
+         raise ValueError(
+             f"Feature table name{'s' if len(unqualified_table_names) > 1 else ''} "
+             f"'{', '.join(map(str, unqualified_table_names))}' must have the form "
+             f"<catalog_name>.<schema_name>.<table_name>, <database_name>.<table_name>, "
+             f"or <table_name> and cannot include space or forward-slash."
+         )
+
+
+ # For APIs like create_training_set and score_batch, the tables must all be in a
+ # UC catalog (shareable across workspaces) or all in the default HMS (intended to be used only in the current workspace).
+ # Check that all tables are either in UC or in the default HMS.
+ def _verify_all_tables_are_either_in_uc_or_in_hms(
+     table_names: Set[str], current_catalog: str, current_schema: str
+ ):
+     full_table_names = [
+         get_full_table_name(table_name, current_catalog, current_schema)
+         for table_name in table_names
+     ]
+     is_valid = all(
+         [is_uc_entity(full_table_name) for full_table_name in full_table_names]
+     ) or all(
+         [is_default_hms_table(full_table_name) for full_table_name in full_table_names]
+     )
+     if not is_valid:
+         raise ValueError(
+             f"Feature table names '{', '.join(table_names)}' "
+             f"must all be in UC or the local default hive metastore. "
+             f"Mixing feature tables from two different storage locations is not allowed."
+         )
+
+
+ # For APIs like create_training_set with FeatureFunctions, only UC UDFs are supported.
+ def _verify_all_udfs_in_uc(
+     udf_names: Set[str], current_catalog: str, current_schema: str
+ ):
+     full_udf_names = [
+         get_full_udf_name(udf_name, current_catalog, current_schema)
+         for udf_name in udf_names
+     ]
+     is_valid = all([is_uc_entity(full_udf_name) for full_udf_name in full_udf_names])
+     if not is_valid:
+         raise ValueError(f"UDFs must all be in Unity Catalog.")
@@ -0,0 +1,252 @@
+ import os
+ from datetime import datetime, timezone
+ from functools import wraps
+ from typing import Any, Dict, List, Optional
+ from urllib.parse import urlparse
+
+ import mlflow
+ from mlflow.exceptions import RestException
+ from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository
+ from mlflow.utils import databricks_utils
+
+
+
+ def enable_if(condition):
+     """
+     A decorator that conditionally enables a function based on a condition.
+     If the condition is not truthy, calling the function raises a NotImplementedError.
+
+     :param condition: A callable that returns a truthy or falsy value.
+     """
+
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             if not condition():
+                 raise NotImplementedError
+             return func(*args, **kwargs)
+
+         return wrapper
+
+     return decorator
+
+
+ def as_list(obj, default=None):
+     if not obj:
+         return default
+     elif isinstance(obj, list):
+         return obj
+     else:
+         return [obj]
+
+
+ def as_directory(path):
+     if path.endswith("/"):
+         return path
+     return f"{path}/"
+
+
+ def is_empty(target: str):
+     return target is None or len(target.strip()) == 0
+
+
+ class _NoDbutilsError(Exception):
+     pass
+
+
+ def _get_dbutils():
+     try:
+         import IPython
+
+         ip_shell = IPython.get_ipython()
+         if ip_shell is None:
+             raise _NoDbutilsError
+         return ip_shell.ns_table["user_global"]["dbutils"]
+     except ImportError:
+         raise _NoDbutilsError
+     except KeyError:
+         raise _NoDbutilsError
+
+
+
+ def utc_timestamp_ms_from_iso_datetime_string(date_string: str) -> int:
+     # Python uses seconds for its time granularity, so we multiply by 1000 to convert to milliseconds.
+     # The Feature Store backend returns timestamps in milliseconds, so this allows for direct comparisons.
+     dt = datetime.fromisoformat(date_string)
+     utc_dt = dt.replace(tzinfo=timezone.utc)
+     return 1000 * utc_dt.timestamp()
+
+
+ def pip_depependency_pinned_major_version(pip_package_name, major_version):
+     """
+     Generate a pip dependency string that is pinned to a major version, for example: "databricks-feature-lookup==0.*"
+     """
+     return f"{pip_package_name}=={major_version}.*"
+
+
+ def add_mlflow_pip_depependency(conda_env, pip_package_name):
+     """
+     Add a new pip dependency to the conda environment taken from the raw MLflow model. This method should only be
+     called for conda environments created by MLflow rather than for generic conda environments, because it assumes
+     the conda environment already contains pip as a dependency. In the case of MLflow models, this is a safe
+     assumption because MLflow always needs to add "mlflow" to the conda environment's pip dependencies.
+
+     This is idempotent and will not add a pip package that is already present in the list of pip packages.
+     """
+     if pip_package_name is None or len(pip_package_name) == 0:
+         raise ValueError(
+             "Unexpected input: missing or empty pip_package_name parameter"
+         )
+
+     found_pip_dependency = False
+     if conda_env is not None:
+         for dep in conda_env["dependencies"]:
+             if isinstance(dep, dict) and "pip" in dep:
+                 found_pip_dependency = True
+                 pip_deps = dep["pip"]
+                 if pip_package_name not in pip_deps:
+                     pip_deps.append(pip_package_name)
+         # Fail early rather than at model inference time
+         if "dependencies" in conda_env and not found_pip_dependency:
+             raise ValueError(
+                 "Unexpected input: mlflow conda_env did not contain pip as a dependency"
+             )
+
+
+ def download_model_artifacts(model_uri, dir):
+     """
+     Downloads model artifacts from model_uri to dir. Intended for use only with Feature Store packaged models.
+
+     :param model_uri: The location, in URI format, of a model. Must be either in the model registry
+         (``models:/<model_name>/<model_version>``, ``models:/<model_name>/<stage>``) or the MLflow
+         artifact store (``runs:/<mlflow_run_id>/run-relative/path/to/model``).
+     :param dir: Location to place downloaded model artifacts.
+     """
+     if not is_artifact_uri(model_uri):
+         raise ValueError(
+             f"Invalid model URI '{model_uri}'. "
+             f"Use ``models:/<model_name>/<version_number>`` or "
+             f"``runs:/<mlflow_run_id>/run-relative/path/to/model``."
+         )
+
+     try:
+         repo = get_artifact_repository(model_uri)
+     except RestException as e:
+         raise ValueError(f"The model at '{model_uri}' does not exist.", e)
+
+     artifact_path = os.path.join(mlflow.pyfunc.DATA, MODEL_DATA_PATH_ROOT)
+     if len(repo.list_artifacts(artifact_path)) == 0:
+         raise ValueError(
+             f"No suitable model found at '{model_uri}'. Either no model exists in this "
+             f"artifact location or an existing model was not packaged with Feature Store metadata. "
+             f"Only models logged by FeatureStoreClient.log_model can be used in inference."
+         )
+
+     return repo.download_artifacts(artifact_path="", dst_path=dir)
+
+
+ def validate_params_non_empty(params: Dict[str, Any], expected_params: List[str]):
+     """
+     Validate that none of the expected parameters are empty, otherwise raise a ValueError
+     for the first encountered empty parameter.
+
+     Tested with the following param types:
+
+     - str
+     - Dict
+     - List
+
+     :param params: A dictionary of param names -> param values, for example as returned by locals()
+     :param expected_params: List of params to check as non-empty
+     """
+     for expected_param in expected_params:
+         if expected_param not in params:
+             raise ValueError(
+                 f'Internal error: expected parameter "{expected_param}" not found in params dictionary'
+             )
+         param_value = params[expected_param]
+         if not param_value:
+             raise ValueError(f'Parameter "{expected_param}" cannot be empty')
+
+
+ def is_in_databricks_job():
+     """
+     Overrides the behavior of the mlflow databricks_utils.is_in_databricks_job() to account for the fact that
+     some jobs have a job_id but no run_id, for example one-time job runs.
+     """
+     try:
+         return databricks_utils.get_job_id() is not None
+     except Exception:
+         return False
+
+
+ def get_workspace_url() -> Optional[str]:
+     """
+     Overrides the behavior of mlflow.utils.databricks_utils.get_workspace_url(),
+     as get_workspace_url does not always return URLs with defined schemes.
+
+     TODO (ML-32050): Refactor this implementation to mlflow, and bump minimum required mlflow version.
+     """
+     workspace_url = databricks_utils.get_workspace_url()
+     if workspace_url and not urlparse(workspace_url).scheme:
+         workspace_url = "https://" + workspace_url
+     return workspace_url
+
+
+ def is_in_databricks_env():
+     """
+     Determine if we are running in a Databricks environment (DBR, MLR, DLT, DCS, MLflow Projects, Run Cmd 1.2 API, etc).
+
+     If any invoked methods raise an exception, swallow the exception and return False out of an abundance of caution.
+     """
+     try:
+         return (
+             is_in_databricks_job()
+             or databricks_utils.is_in_databricks_notebook()
+             or databricks_utils.is_in_databricks_runtime()
+         )
+     except Exception:
+         return False
+
+
+ def sanitize_identifier(identifier: str):
+     """
+     Sanitize and wrap an identifier with backquotes. For example, "a`b" becomes "`a``b`".
+     Use this function to sanitize identifiers such as column names in SQL and PySpark.
+     """
+     return f"`{identifier.replace('`', '``')}`"
+
+
+ def sanitize_identifiers(identifiers: List[str]):
+     """
+     Sanitize and wrap the identifiers in a list with backquotes.
+     """
+     return [sanitize_identifier(i) for i in identifiers]
+
+
+ def sanitize_multi_level_name(multi_level_name: str):
+     """
+     Sanitize a multi-level name (such as a Unity Catalog table name) by sanitizing each segment
+     and joining the results. For example, "ca+t.fo`o.ba$r" becomes "`ca+t`.`fo``o`.`ba$r`".
+     """
+     segments = multi_level_name.split(".")
+     return ".".join(sanitize_identifiers(segments))
+
+
+ def unsanitize_identifier(identifier: str):
+     """
+     Unsanitize an identifier. Useful when we get a possibly sanitized identifier from Spark or
+     somewhere else, but we need an unsanitized one.
+     Note: This function does not check the correctness of the identifier passed in. e.g. `foo``
+     is not a valid sanitized identifier. When given such invalid input, this function returns
+     invalid output.
+     """
+     if len(identifier) >= 2 and identifier[0] == "`" and identifier[-1] == "`":
+         return identifier[1:-1].replace("``", "`")
+     else:
+         return identifier
+
+
+ # Strings containing \ or ' can break SQL statements, so escape them.
+ def escape_sql_string(input_str: str) -> str:
+     return input_str.replace("\\", "\\\\").replace("'", "\\'")
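The identifier-sanitization helpers above follow Spark's backquote-escaping convention. The following is a short usage sketch of the expected round-trip behavior; it is illustrative only and assumes this hunk is feature_store/utils/utils.py, per the RECORD below.

from feature_store.utils.utils import (
    sanitize_identifier,
    sanitize_multi_level_name,
    unsanitize_identifier,
    escape_sql_string,
)

# Backquotes inside an identifier are doubled, then the whole identifier is wrapped.
assert sanitize_identifier("a`b") == "`a``b`"
assert unsanitize_identifier("`a``b`") == "a`b"
# Each dot-separated segment of a multi-level name is sanitized independently.
assert sanitize_multi_level_name("ca+t.fo`o.ba$r") == "`ca+t`.`fo``o`.`ba$r`"
# Backslashes and single quotes are escaped before embedding a string in SQL.
assert escape_sql_string("it's") == "it\\'s"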
@@ -0,0 +1,55 @@
+ import logging
+ from typing import List, Union
+
+ from pyspark.sql import DataFrame
+
+ _logger = logging.getLogger(__name__)
+
+
+ def standardize_checkpoint_location(checkpoint_location):
+     if checkpoint_location is None:
+         return checkpoint_location
+     checkpoint_location = checkpoint_location.strip()
+     if checkpoint_location == "":
+         checkpoint_location = None
+     return checkpoint_location
+
+
+ def _is_spark_connect_data_frame(df):
+     # We cannot directly reference pyspark.sql.connect.dataframe.DataFrame, as it requires Spark 3.4, which
+     # is not installed on DBR 12.2 and earlier. Instead, we string match on the type.
+     return (
+         type(df).__name__ == "DataFrame"
+         and type(df).__module__ == "pyspark.sql.connect.dataframe"
+     )
+
+
+ def check_dataframe_type(df):
+     """
+     Check if df is a PySpark DataFrame, otherwise raise an error.
+     """
+     if not (isinstance(df, DataFrame) or _is_spark_connect_data_frame(df)):
+         raise ValueError(
+             f"Unsupported DataFrame type: {type(df)}. DataFrame must be a PySpark DataFrame."
+         )
+
+
+ def check_kwargs_empty(the_kwargs, method_name):
+     if len(the_kwargs) != 0:
+         raise TypeError(
+             f"{method_name}() got unexpected keyword argument(s): {list(the_kwargs.keys())}"
+         )
+
+
+ def check_duplicate_keys(keys: Union[str, List[str]], key_name: str) -> None:
+     """
+     Check for duplicate keys. Raise an error if there are duplicates.
+     """
+     if keys and isinstance(keys, list):
+         seen = set()
+         for k in keys:
+             if k in seen:
+                 raise ValueError(
+                     f"Found duplicated key '{k}' in {key_name}. {key_name} must be unique."
+                 )
+             seen.add(k)
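A brief sketch of how these validation helpers behave (illustrative only; it assumes this hunk is feature_store/utils/validation_utils.py per the RECORD below, and the key and method names are hypothetical).

from feature_store.utils.validation_utils import (
    check_duplicate_keys,
    check_kwargs_empty,
    standardize_checkpoint_location,
)

# Blank checkpoint locations are normalized to None; real paths pass through.
assert standardize_checkpoint_location("   ") is None
assert standardize_checkpoint_location("/tmp/ckpt") == "/tmp/ckpt"

# Duplicate keys in a key list are rejected with a ValueError.
check_duplicate_keys(["user_id", "ts"], "primary_keys")  # passes silently
try:
    check_duplicate_keys(["user_id", "user_id"], "primary_keys")
except ValueError as err:
    print(err)  # Found duplicated key 'user_id' in primary_keys. ...

# Leftover keyword arguments raise a TypeError naming the offending method.
check_kwargs_empty({}, "create_training_set")  # passes silently
try:
    check_kwargs_empty({"unexpected": 1}, "create_training_set")
except TypeError as err:
    print(err)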
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: wedata-feature-engineering
- Version: 0.1.2
+ Version: 0.1.4
  Summary: Wedata Feature Engineering Library
  Home-page:
  Author: meahqian
@@ -24,7 +24,18 @@ feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
  feature_store/spark_client/spark_client.py,sha256=vd-NCE9IGC0Ygqr-QSVY0teuWsQSkq_BFV4Mn6xMMNU,11578
  feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  feature_store/training_set_client/training_set_client.py,sha256=Aa80xVXVE1KBdgplL9qqR8ftD5A5r2pfBttAhmySrB0,6696
- wedata_feature_engineering-0.1.2.dist-info/METADATA,sha256=IALf_mmflM-eRTOIOqVDJg5OoVVfLXBUHofIdC1T_wI,493
- wedata_feature_engineering-0.1.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- wedata_feature_engineering-0.1.2.dist-info/top_level.txt,sha256=15761LgVdJ7tJWbdlYk0EZ560G9k6C4TE42dfLx8d0I,14
- wedata_feature_engineering-0.1.2.dist-info/RECORD,,
+ feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ feature_store/utils/common_utils.py,sha256=ck8pJYeN6vrcZmTrcnmIOOJWzYZaY3ZjvRSVme4tplo,3314
+ feature_store/utils/feature_lookup_utils.py,sha256=iILSP4AFHXrjNTuId6mT7wtMFAsZejyxThr_mZHPRF4,22330
+ feature_store/utils/feature_spec_utils.py,sha256=jeWzEhmkVW-bMRySMx_5grepHAlLquMhYxpbbiaJR-g,11582
+ feature_store/utils/feature_utils.py,sha256=8KhlkWax3KAi_xRnStVPlhCxeUHO08VW2fmT9jN8QUs,2761
+ feature_store/utils/schema_utils.py,sha256=8NhNUsF4Z6UtmzFeaVBnmb7xut0LqZepK3M27PSEpfE,4484
+ feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZsa-DjxJeTc4KUadtg,6475
+ feature_store/utils/training_set_utils.py,sha256=V5yW-XQ9in7gNOo4xsWy7txnSw_Z9Zxm4mV7MQmrWnk,22466
+ feature_store/utils/uc_utils.py,sha256=ets7YlrAtkhW9kKyYajDNo6iZasBIhFyxUT2MOyLuV8,10767
+ feature_store/utils/utils.py,sha256=T6dOUX3oOYRsbvXyTIElFZ20kNO92KMYPUCrqY5eomE,8953
+ feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
+ wedata_feature_engineering-0.1.4.dist-info/METADATA,sha256=uwmHZ4fVVcncF5YH_p3kUG24D377eKLraAcOlx-KU5o,493
+ wedata_feature_engineering-0.1.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ wedata_feature_engineering-0.1.4.dist-info/top_level.txt,sha256=15761LgVdJ7tJWbdlYk0EZ560G9k6C4TE42dfLx8d0I,14
+ wedata_feature_engineering-0.1.4.dist-info/RECORD,,