wedata-feature-engineering 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5}/PKG-INFO +1 -1
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5}/setup.py +8 -2
- {wedata-feature-engineering-0.1.3/feature_store → wedata-feature-engineering-0.1.5/wedata}/__init__.py +1 -1
- wedata-feature-engineering-0.1.5/wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/__init__.py +0 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/common_utils.py +96 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/feature_lookup_utils.py +570 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/feature_utils.py +73 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/schema_utils.py +117 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/topological_sort.py +158 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/training_set_utils.py +580 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/uc_utils.py +281 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/utils.py +252 -0
- wedata-feature-engineering-0.1.5/wedata/feature_store/utils/validation_utils.py +55 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5}/wedata_feature_engineering.egg-info/PKG-INFO +1 -1
- wedata-feature-engineering-0.1.5/wedata_feature_engineering.egg-info/SOURCES.txt +45 -0
- wedata-feature-engineering-0.1.5/wedata_feature_engineering.egg-info/top_level.txt +1 -0
- wedata-feature-engineering-0.1.3/wedata_feature_engineering.egg-info/SOURCES.txt +0 -33
- wedata-feature-engineering-0.1.3/wedata_feature_engineering.egg-info/top_level.txt +0 -1
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5}/README.md +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5}/setup.cfg +0 -0
- {wedata-feature-engineering-0.1.3/feature_store/constants → wedata-feature-engineering-0.1.5/wedata/feature_store}/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/client.py +0 -0
- {wedata-feature-engineering-0.1.3/feature_store/entities → wedata-feature-engineering-0.1.5/wedata/feature_store/constants}/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/constants/constants.py +0 -0
- {wedata-feature-engineering-0.1.3/feature_store/feature_table_client → wedata-feature-engineering-0.1.5/wedata/feature_store/entities}/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/data_type.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/environment_variables.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature_column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature_function.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature_lookup.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature_spec.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature_spec_constants.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature_table.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/feature_table_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/function_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/on_demand_column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/source_data_column_info.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/entities/training_set.py +0 -0
- {wedata-feature-engineering-0.1.3/feature_store/spark_client → wedata-feature-engineering-0.1.5/wedata/feature_store/feature_table_client}/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/feature_table_client/feature_table_client.py +0 -0
- {wedata-feature-engineering-0.1.3/feature_store/training_set_client → wedata-feature-engineering-0.1.5/wedata/feature_store/spark_client}/__init__.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/spark_client/spark_client.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5/wedata}/feature_store/training_set_client/training_set_client.py +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5}/wedata_feature_engineering.egg-info/dependency_links.txt +0 -0
- {wedata-feature-engineering-0.1.3 → wedata-feature-engineering-0.1.5}/wedata_feature_engineering.egg-info/requires.txt +0 -0
@@ -1,9 +1,15 @@
|
|
1
1
|
from setuptools import setup, find_packages
|
2
|
+
import os
|
3
|
+
|
4
|
+
# Dynamically read version from wedata/__init__.py
|
5
|
+
version = {}
|
6
|
+
with open(os.path.join(os.path.dirname(__file__), 'wedata', '__init__.py')) as f:
|
7
|
+
exec(f.read(), version)
|
2
8
|
|
3
9
|
setup(
|
4
10
|
name="wedata-feature-engineering",
|
5
|
-
version="
|
6
|
-
packages=find_packages(
|
11
|
+
version=version["__version__"],
|
12
|
+
packages=find_packages(include=['wedata', 'wedata.*']),
|
7
13
|
install_requires=[
|
8
14
|
'pyspark>=3.0.0',
|
9
15
|
'delta-spark>=1.0.0',
|
File without changes
|
File without changes
|
@@ -0,0 +1,96 @@
|
|
1
|
+
"""
|
2
|
+
通用工具函数
|
3
|
+
"""
|
4
|
+
|
5
|
+
from collections import Counter
|
6
|
+
from typing import Any, List
|
7
|
+
|
8
|
+
from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
|
9
|
+
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
|
10
|
+
|
11
|
+
|
12
|
+
def is_artifact_uri(uri):
    """
    Tell whether ``uri`` is recognised as an MLflow models URI or an MLflow runs URI.

    The URI itself may be a model URI, a model URI plus a subdirectory, or a
    model URI plus the path to an artifact file.

    :param uri: The artifact URI to inspect.
    :return: The result of ``ModelsArtifactRepository.is_models_uri(uri)`` when
        it is truthy, otherwise the result of
        ``RunsArtifactRepository.is_runs_uri(uri)``.
    """
    # Check the models scheme first; the runs check is only evaluated when
    # the models check is falsy (same short-circuit order as a plain `or`).
    models_match = ModelsArtifactRepository.is_models_uri(uri)
    return models_match or RunsArtifactRepository.is_runs_uri(uri)
|
20
|
+
|
21
|
+
def as_list(obj, default=None):
    """
    Normalise ``obj`` into a list.

    :param obj: The value to normalise.
    :param default: Value returned when ``obj`` is falsy (``None``, an empty
        list, an empty string, ``0``, ...).
    :return: ``default`` when ``obj`` is falsy; ``obj`` itself (not a copy)
        when it is already a list; otherwise a new single-element list
        containing ``obj``.
    """
    if obj:
        # Existing lists are passed through untouched; anything else is wrapped.
        return obj if isinstance(obj, list) else [obj]
    return default
|
28
|
+
|
29
|
+
def get_duplicates(elements: List[Any]) -> List[Any]:
    """
    Collect the elements that occur more than once in ``elements``.

    :param elements: Hashable items to inspect.
    :return: Each duplicated element exactly once, ordered by first appearance
        in ``elements``.
    """
    # Counter keeps keys in first-insertion order, so iterating its items
    # yields the duplicates in the order they were first seen.
    tally = Counter(elements)
    return [item for item, occurrences in tally.items() if occurrences > 1]
|
39
|
+
|
40
|
+
def validate_strings_unique(strings: List[str], error_template: str):
    """
    Ensure that no string occurs more than once in ``strings``.

    :param strings: The strings to check.
    :param error_template: A ``str.format`` template receiving one positional
        argument: the duplicates, each wrapped in single quotes and joined
        with ", ".
    :raises ValueError: If at least one string is duplicated; the message is
        ``error_template`` formatted with the duplicates.
    """
    repeated = get_duplicates(strings)
    if not repeated:
        return
    quoted = [f"'{s}'" for s in repeated]
    raise ValueError(error_template.format(", ".join(quoted)))
|
49
|
+
|
50
|
+
def sanitize_identifier(identifier: str):
    """
    Quote an identifier with backquotes, doubling any backquote it contains.

    For example, "a`b" becomes "`a``b`". Intended for identifiers such as
    column names used in SQL and PySpark.

    :param identifier: The raw identifier.
    :return: The backquote-wrapped, escaped identifier.
    """
    escaped = identifier.replace("`", "``")
    return "`" + escaped + "`"
|
56
|
+
|
57
|
+
|
58
|
+
def sanitize_identifiers(identifiers: List[str]):
    """
    Apply ``sanitize_identifier`` to every entry of ``identifiers``.

    :param identifiers: The raw identifiers.
    :return: A new list with each identifier backquote-wrapped, in the same order.
    """
    quoted = []
    for name in identifiers:
        quoted.append(sanitize_identifier(name))
    return quoted
|
63
|
+
|
64
|
+
|
65
|
+
def sanitize_multi_level_name(multi_level_name: str):
    """
    Quote every dot-separated segment of a multi-level name (such as a Unity
    Catalog table name) and join the segments back together with ".".

    For example, "ca+t.fo`o.ba$r" becomes "`ca+t`.`fo``o`.`ba$r`".

    :param multi_level_name: The dotted name; it is split on every ".".
    :return: The dotted name with each segment backquote-wrapped.
    """
    quoted_parts = sanitize_identifiers(multi_level_name.split("."))
    return ".".join(quoted_parts)
|
72
|
+
|
73
|
+
|
74
|
+
def unsanitize_identifier(identifier: str):
    """
    Strip the surrounding backquotes from an identifier and collapse doubled
    backquotes inside it.

    Useful when an identifier obtained from Spark or elsewhere may already be
    sanitized but the raw form is needed.

    Note: the input is not validated. A malformed sanitized identifier such as
    `foo`` yields an equally malformed result.

    :param identifier: A possibly backquote-wrapped identifier.
    :return: The unwrapped identifier, or ``identifier`` unchanged when it is
        not wrapped in backquotes on both ends.
    """
    # Anything shorter than two characters cannot carry both delimiters.
    if len(identifier) < 2 or identifier[0] != "`" or identifier[-1] != "`":
        return identifier
    inner = identifier[1:-1]
    return inner.replace("``", "`")
|
86
|
+
|
87
|
+
|
88
|
+
# A backslash or single quote inside a string can break a SQL statement, so
# both are prefixed with a backslash.
def escape_sql_string(input_str: str) -> str:
    """
    Backslash-escape backslashes and single quotes in ``input_str``.

    :param input_str: The raw string.
    :return: The string with every "\\" doubled and every "'" preceded by "\\".
    """
    # Backslashes must be doubled first; otherwise the backslashes added in
    # front of quotes would themselves be doubled.
    backslashes_doubled = input_str.replace("\\", "\\\\")
    return backslashes_doubled.replace("'", "\\'")
|
91
|
+
|
92
|
+
def get_unique_list_order(elements: List[Any]) -> List[Any]:
    """
    Remove repeated elements while keeping first-appearance order.

    :param elements: Hashable items, possibly containing repeats.
    :return: A new list holding each distinct element once, in the order it
        first occurs in ``elements``.
    """
    # A dict remembers insertion order, so its keys act as an ordered set.
    first_seen = {}
    for item in elements:
        first_seen.setdefault(item, None)
    return list(first_seen)
|