tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.

Files changed (64) hide show
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
@@ -0,0 +1,352 @@
1
+ """
2
+ General-purpose utility functions.
3
+ """
4
+ import os
5
+ from collections import Counter
6
+ from datetime import datetime, timezone
7
+ from functools import wraps
8
+ from typing import Any, Dict, List, Optional
9
+ from urllib.parse import urlparse
10
+
11
+ from mlflow.exceptions import RestException
12
+ from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository
13
+ from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
14
+ from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
15
+ from mlflow.utils import databricks_utils
16
+
17
+ from wedata.feature_store.constants import constants
18
+ from wedata.feature_store.constants.constants import MODEL_DATA_PATH_ROOT
19
+ from pyspark.sql import SparkSession
20
+
21
+ import logging
22
+
23
+ # Configure logging (optional; adjust to the actual deployment)
24
+ logging.basicConfig(level=logging.ERROR)
25
+
26
+
27
def validate_table_name(name: str):
    """
    Check that a feature table name is a bare, single-level identifier.

    Qualified names such as ``<catalog>.<schema>.<table>`` are rejected.

    Args:
        name: Table name to validate.

    Raises:
        ValueError: If the name is empty or not a string, contains a dot,
            does not start with a letter, or contains anything other than
            letters, digits and underscores.
    """
    if not (name and isinstance(name, str)):
        raise ValueError("Table name must be a non-empty string")

    if "." in name:
        raise ValueError("Feature table name only supports single table name, cannot contain dots (e.g. <catalog>.<schema>.<table>)")

    leading_char = name[0]
    if not leading_char.isalpha():
        raise ValueError("Table name must start with a letter")

    for ch in name:
        if ch != '_' and not ch.isalnum():
            raise ValueError("Table name can only contain letters, numbers and underscores")
45
+
46
+
47
def build_full_table_name(table_name: str, database_name: Optional[str] = None) -> str:
    """
    Build a qualified table name of the form ``<database>.<table>``.

    The database is taken from ``database_name`` when it is truthy, otherwise
    from the ``WEDATA_DEFAULT_FEATURE_STORE_DATABASE`` environment variable.

    Args:
        table_name: Table name to qualify.
        database_name: Optional database name that overrides the environment default.

    Returns:
        The qualified name ``<database>.<table>``.

    Raises:
        RuntimeError: If neither ``database_name`` nor the environment variable
            provides a database name.
    """
    resolved_database = database_name or os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")

    if not resolved_database:
        logging.error("The current user has not configured a default feature database. Please contact the manager account to configure it.")
        raise RuntimeError("Feature store is not configured! Please contact the main account to configure it.")

    logging.info("feature database:{}".format(resolved_database))

    return f"{resolved_database}.{table_name}"
72
+
73
+
74
def enable_if(condition):
    """
    Build a decorator that gates a function behind a runtime condition.

    ``condition`` is evaluated on every call of the decorated function. When
    it returns a falsy value the call raises ``NotImplementedError``;
    otherwise the call is forwarded unchanged.

    :param condition: A zero-argument callable returning a truthy or falsy value.
    """

    def decorator(func):
        @wraps(func)
        def guarded(*args, **kwargs):
            if condition():
                return func(*args, **kwargs)
            raise NotImplementedError

        return guarded

    return decorator
92
+
93
+
94
def is_empty(target: str):
    """Return True when ``target`` is None or contains only whitespace."""
    if target is None:
        return True
    return not target.strip()
96
+
97
+
98
class _NoDbutilsError(Exception):
    """Signals that no ``dbutils`` object could be obtained from the IPython shell."""
100
+
101
+
102
def _get_dbutils():
    """
    Fetch the ``dbutils`` object from the active IPython shell's user namespace.

    Returns:
        The object stored under ``ns_table["user_global"]["dbutils"]`` of the
        current IPython shell.

    Raises:
        _NoDbutilsError: If IPython cannot be imported, no IPython shell is
            active, or the namespace lookup fails with ``KeyError``.
    """
    try:
        import IPython

        shell = IPython.get_ipython()
        if shell is None:
            raise _NoDbutilsError
        return shell.ns_table["user_global"]["dbutils"]
    except (ImportError, KeyError):
        # Both failure modes mean the same thing to callers: dbutils is unavailable.
        raise _NoDbutilsError
114
+
115
+
116
def utc_timestamp_ms_from_iso_datetime_string(date_string: str) -> int:
    """
    Convert an ISO-8601 date/time string to a UTC epoch timestamp in milliseconds.

    Args:
        date_string: A string accepted by ``datetime.fromisoformat``.

    Returns:
        Milliseconds since the Unix epoch, truncated to an integer.

    Raises:
        ValueError: If ``date_string`` is not a valid ISO format string.
    """
    dt = datetime.fromisoformat(date_string)
    if dt.utcoffset() is None:
        # Naive input: interpret the wall-clock value as UTC.
        dt = dt.replace(tzinfo=timezone.utc)
    # Aware input keeps its own offset; replacing tzinfo would silently
    # discard it and shift the resulting instant.
    return int(1000 * dt.timestamp())
120
+
121
+
122
def pip_depependency_pinned_major_version(pip_package_name, major_version):
    """
    Build a pip requirement string pinned to a major version.

    Example result: "databricks-feature-lookup==0.*"
    """
    return "{}=={}.*".format(pip_package_name, major_version)
127
+
128
def pip_depependency_pinned_version(pip_package_name, version):
    """
    Build a pip requirement string pinned to an exact version.

    Example result: "databricks-feature-lookup==0.1.2"
    """
    return "{}=={}".format(pip_package_name, version)
133
+
134
+
135
def add_mlflow_pip_depependency(conda_env, pip_package_name):
    """
    Append a pip requirement to the conda environment of a raw MLflow model.

    The requirement is added in place to every ``{"pip": [...]}`` entry in
    ``conda_env["dependencies"]`` that does not already list it. A
    ``conda_env`` of None is accepted and left untouched.

    Raises:
        ValueError: If ``pip_package_name`` is None or empty, or if the
            environment's dependencies contain no pip section.
    """
    if pip_package_name is None or len(pip_package_name) == 0:
        raise ValueError(
            "Unexpected input: missing or empty pip_package_name parameter"
        )

    if conda_env is None:
        return

    pip_sections = [
        entry["pip"]
        for entry in conda_env["dependencies"]
        if isinstance(entry, dict) and "pip" in entry
    ]
    for requirements in pip_sections:
        if pip_package_name not in requirements:
            requirements.append(pip_package_name)

    if not pip_sections and "dependencies" in conda_env:
        raise ValueError(
            "Unexpected input: mlflow conda_env did not contain pip as a dependency"
        )
156
+
157
+
158
def download_model_artifacts(model_uri, dir):
    """
    Download the model artifacts at ``model_uri`` into ``dir``.

    Args:
        model_uri: A ``models:/`` or ``runs:/`` MLflow URI.
        dir: Destination path passed to the repository as ``dst_path``.

    Returns:
        The value returned by the artifact repository's ``download_artifacts``.

    Raises:
        ValueError: If ``model_uri`` is not a models/runs URI, if resolving the
            artifact repository fails with ``RestException``, or if nothing is
            listed under ``artifacts/<MODEL_DATA_PATH_ROOT>`` for the model.
    """
    if not is_artifact_uri(model_uri):
        raise ValueError(
            f"Invalid model URI '{model_uri}'. "
            f"Use ``models:/<model_name>/<version_number>`` or "
            f"``runs:/<mlflow_run_id>/run-relative/path/to/model``."
        )

    try:
        repo = get_artifact_repository(model_uri)
    except RestException as e:
        # Chain the cause instead of stuffing it into the exception args, so
        # str(error) stays a readable message and the traceback keeps the origin.
        raise ValueError(f"The model at '{model_uri}' does not exist.") from e

    # The model must carry the feature-store metadata directory to be usable.
    artifact_path = os.path.join("artifacts", MODEL_DATA_PATH_ROOT)
    if len(repo.list_artifacts(artifact_path)) == 0:
        raise ValueError(
            f"No suitable model found at '{model_uri}'. Either no model exists in this "
            f"artifact location or an existing model was not packaged with Feature Store metadata. "
            f"Only models logged by FeatureStoreClient.log_model can be used in inference."
        )

    return repo.download_artifacts(artifact_path="", dst_path=dir)
183
+
184
+
185
def validate_params_non_empty(params: Dict[str, Any], expected_params: List[str]):
    """
    Ensure every expected parameter is present in ``params`` and truthy.

    Raises:
        ValueError: If an expected key is missing from ``params`` or its
            value is falsy.
    """
    for key in expected_params:
        if key in params:
            if params[key]:
                continue
            raise ValueError(f'Parameter "{key}" cannot be empty')
        raise ValueError(
            f'Internal error: expected parameter "{key}" not found in params dictionary'
        )
197
+
198
+
199
def get_workspace_url() -> Optional[str]:
    """
    Return the workspace URL reported by ``mlflow.utils.databricks_utils``,
    prefixing ``https://`` when the reported value has no URL scheme.

    A falsy value from the underlying call is returned unchanged.
    """
    url = databricks_utils.get_workspace_url()
    if not url:
        return url
    if urlparse(url).scheme:
        return url
    return "https://" + url
207
+
208
+
209
def is_artifact_uri(uri):
    """
    Tell whether ``uri`` is recognised as an MLflow models URI or runs URI.

    The URI may point at the model itself, a subdirectory, or an artifact file.
    """
    models_match = ModelsArtifactRepository.is_models_uri(uri)
    if models_match:
        return models_match
    return RunsArtifactRepository.is_runs_uri(uri)
217
+
218
+
219
def as_list(obj, default=None):
    """
    Normalise ``obj`` to a list.

    Falsy values yield ``default``; lists are returned as-is (same object);
    anything else is wrapped in a single-element list.
    """
    if obj:
        return obj if isinstance(obj, list) else [obj]
    return default
226
+
227
+
228
def get_duplicates(elements: List[Any]) -> List[Any]:
    """
    Return the elements that occur more than once, ordered by first appearance.
    """
    occurrences = Counter(elements)
    return [item for item, seen in occurrences.items() if seen > 1]
238
+
239
+
240
def validate_strings_unique(strings: List[str], error_template: str):
    """
    Raise ``ValueError`` when ``strings`` contains duplicates.

    The duplicates are single-quoted, joined with ", " and substituted into
    ``error_template`` via ``str.format`` to build the error message.
    """
    repeated = get_duplicates(strings)
    if not repeated:
        return
    quoted = ", ".join("'{}'".format(s) for s in repeated)
    raise ValueError(error_template.format(quoted))
249
+
250
+
251
def sanitize_identifier(identifier: str):
    """
    Wrap an identifier in backquotes, doubling any backquote it contains.

    For example, "a`b" becomes "`a``b`". Intended for identifiers such as
    column names used in SQL and PySpark.
    """
    escaped = identifier.replace("`", "``")
    return "`" + escaped + "`"
257
+
258
+
259
def sanitize_identifiers(identifiers: List[str]):
    """
    Apply ``sanitize_identifier`` to each identifier and return the results as a list.
    """
    return list(map(sanitize_identifier, identifiers))
264
+
265
+
266
def sanitize_multi_level_name(multi_level_name: str):
    """
    Sanitize a dot-separated name (such as a Unity Catalog table name) one
    segment at a time and re-join the segments with dots.

    For example, "ca+t.fo`o.ba$r" becomes "`ca+t`.`fo``o`.`ba$r`".
    """
    return ".".join(sanitize_identifiers(multi_level_name.split(".")))
273
+
274
+
275
def unsanitize_identifier(identifier: str):
    """
    Strip the surrounding backquotes from an identifier and collapse doubled
    backquotes, when the identifier is backquote-wrapped.

    Identifiers that are not wrapped in backquotes are returned unchanged.
    Note: the input is not validated. For example, `foo`` is not a valid
    sanitized identifier; given such input the output is equally invalid.
    """
    wrapped = (
        len(identifier) >= 2
        and identifier.startswith("`")
        and identifier.endswith("`")
    )
    if not wrapped:
        return identifier
    inner = identifier[1:-1]
    return inner.replace("``", "`")
287
+
288
+
289
+ # strings containing \ or ' can break sql statements, so escape them.
290
def escape_sql_string(input_str: str) -> str:
    """Backslash-escape every backslash and single quote in ``input_str``."""
    escapes = str.maketrans({"\\": "\\\\", "'": "\\'"})
    return input_str.translate(escapes)
292
+
293
+
294
def get_unique_list_order(elements: List[Any]) -> List[Any]:
    """
    Drop repeated elements while keeping the order of first appearance.
    """
    first_seen = {}
    for item in elements:
        first_seen.setdefault(item, None)
    return [*first_seen]
299
+
300
+
301
def validate_database(database_name):
    """
    Confirm that a database name is available.

    When ``database_name`` is None, the environment variable named by
    ``constants.WEDATA_DEFAULT_FEATURE_STORE_DATABASE`` is consulted instead.

    Returns:
        True when a non-None database name was found.

    Raises:
        ValueError: If neither the argument nor the environment variable is set.
    """
    resolved = database_name
    if resolved is None:
        resolved = os.environ.get(constants.WEDATA_DEFAULT_FEATURE_STORE_DATABASE)
    if resolved is not None:
        return True
    raise ValueError("Database_name variable or default database is not set.")
307
+
308
+
309
def check_package_version(package_name, expected_version, op="=="):
    """
    Check whether the installed version of a package satisfies a requirement.

    Args:
        package_name: Distribution name to look up.
        expected_version: Version to compare against, e.g. "3.5.5".
        op: Comparison operator, one of '==', '!=', '>', '<', '>=', '<=', '~='.
            Surrounding whitespace is ignored. Defaults to "==".

    Returns:
        A tuple ``(found, matched, installed_version)``:
        ``(True, True, installed_version)`` when the requirement is met,
        ``(True, False, installed_version)`` when it is not, and
        ``(False, False, None)`` when the package is not installed.

    Raises:
        ValueError: If ``op`` is not one of the supported operators.
    """
    import importlib.metadata
    import operator

    from packaging import version
    from packaging.specifiers import SpecifierSet

    # Explicit dispatch table instead of eval(): only known comparisons run,
    # and no caller-supplied text is ever executed as code.
    comparators = {
        "==": operator.eq,
        "!=": operator.ne,
        ">": operator.gt,
        "<": operator.lt,
        ">=": operator.ge,
        "<=": operator.le,
    }

    try:
        installed_version = importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        return False, False, None

    normalized_op = op.strip() if isinstance(op, str) else None

    if normalized_op == "~=":
        # "~=" (compatible release) is not a Python operator, so it is
        # evaluated through packaging's specifier machinery.
        matched = SpecifierSet(f"~={expected_version}").contains(
            installed_version, prereleases=True
        )
    elif normalized_op in comparators:
        installed = version.parse(installed_version)
        expected = version.parse(expected_version)
        matched = comparators[normalized_op](installed, expected)
    else:
        raise ValueError(f"Invalid op: {op}. need be in ['==', '>', '<', '>=', '<=', '!=', '~=']")

    return True, matched, installed_version
337
+
338
+
339
def check_spark_table_exists(spark_client: "SparkSession", full_table_name: str) -> bool:
    """
    Report whether a table exists, using the Spark catalog API when available.

    ``spark_client.catalog.tableExists`` is tried first. If that raises
    ``AttributeError``, the check falls back to a ``SHOW TABLES ... LIKE``
    query.

    Args:
        spark_client: Object exposing ``catalog.tableExists`` and/or ``sql``.
        full_table_name: Table name with one, two or three dot-separated parts.

    Returns:
        The result of ``tableExists`` or, on the fallback path, whether the
        ``SHOW TABLES`` query returned at least one row.
    """
    try:
        return spark_client.catalog.tableExists(full_table_name)
    except AttributeError:
        parts = full_table_name.split(".")
        if len(parts) == 2:
            query = f"SHOW TABLES IN {parts[0]} LIKE '{parts[1]}'"
        elif len(parts) == 3:
            # Only the schema and table parts are used; the leading part is dropped.
            query = f"SHOW TABLES IN {parts[1]} LIKE '{parts[2]}'"
        else:
            query = f"SHOW TABLES LIKE '{full_table_name}'"
        # Lazy %-style argument: the message needs a placeholder for the query.
        logging.debug("check table sql: ======= %s", query)
        return spark_client.sql(query).count() > 0
@@ -0,0 +1,86 @@
1
+ import os
2
+
3
+
4
class EnvironmentError(Exception):
    """
    Raised when a required environment variable is not configured.

    NOTE: within this module the name shadows Python's built-in
    ``EnvironmentError`` (an alias of ``OSError``); this class derives
    directly from ``Exception``.
    """
6
+
7
+
8
def get_project_id() -> str:
    """
    Read the current project ID from the ``WEDATA_PROJECT_ID`` environment variable.

    Returns:
        str: The project ID.

    Raises:
        EnvironmentError: If ``WEDATA_PROJECT_ID`` is unset or empty.
    """
    project_id = os.environ.get("WEDATA_PROJECT_ID")
    if not project_id:
        raise EnvironmentError("environment variable WEDATA_PROJECT_ID is not set, please check environment configuration")
    return project_id
22
+
23
+
24
def get_cloud_secret() -> (str, str):
    """
    Read the temporary cloud credentials from the environment.

    Returns:
        tuple: ``(secret_id, secret_key)`` taken from
        ``WEDATA_CLOUD_TEMP_SECRET_ID`` and ``WEDATA_CLOUD_TEMP_SECRET_KEY``;
        an entry is None when its variable is unset.
    """
    return (
        os.environ.get("WEDATA_CLOUD_TEMP_SECRET_ID"),
        os.environ.get("WEDATA_CLOUD_TEMP_SECRET_KEY"),
    )
34
+
35
+
36
def get_region() -> str:
    """
    Return the current region, preferring ``DLC_REGION`` over ``EMR_REGION``.

    Raises:
        EnvironmentError: If neither variable holds a non-empty value.
    """
    region = os.environ.get("DLC_REGION") or os.environ.get("EMR_REGION")
    if region:
        return region
    raise EnvironmentError("environment variable DLC_REGION or EMR_REGION is not set, "
                           "please check environment configuration")
47
+
48
+
49
def get_database_name(database_name: str) -> str:
    """
    Resolve the database name to use.

    Args:
        database_name: Explicit database name; used when truthy.

    Returns:
        str: ``database_name`` if truthy, otherwise the value of the
        ``WEDATA_DEFAULT_FEATURE_STORE_DATABASE`` environment variable.

    Raises:
        EnvironmentError: If ``database_name`` is falsy and the environment
            variable is unset or empty.
    """
    resolved = database_name or os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
    if resolved:
        return resolved
    raise EnvironmentError("environment variable WEDATA_DEFAULT_FEATURE_STORE_DATABASE is not set, "
                           "please check environment configuration")
69
+
70
+
71
def get_engine_name() -> str:
    """
    Read the engine name from the ``KERNEL_ENGINE`` environment variable.

    Raises:
        EnvironmentError: If ``KERNEL_ENGINE`` is unset or empty.
    """
    engine_name = os.environ.get("KERNEL_ENGINE")
    if not engine_name:
        raise EnvironmentError("environment variable KERNEL_ENGINE is not set, please check environment configuration")
    return engine_name
79
+
80
+
81
def get_engine_type() -> str:
    """
    Classify the engine: "DLC" when ``DLC_REGION`` holds a non-empty value,
    "EMR" otherwise.
    """
    if os.environ.get("DLC_REGION"):
        return "DLC"
    return "EMR"
86
+