wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wedata/__init__.py +6 -0
- wedata/feature_store/__init__.py +0 -0
- wedata/feature_store/client.py +169 -0
- wedata/feature_store/constants/__init__.py +0 -0
- wedata/feature_store/constants/constants.py +28 -0
- wedata/feature_store/entities/__init__.py +0 -0
- wedata/feature_store/entities/column_info.py +117 -0
- wedata/feature_store/entities/data_type.py +92 -0
- wedata/feature_store/entities/environment_variables.py +55 -0
- wedata/feature_store/entities/feature.py +53 -0
- wedata/feature_store/entities/feature_column_info.py +64 -0
- wedata/feature_store/entities/feature_function.py +55 -0
- wedata/feature_store/entities/feature_lookup.py +179 -0
- wedata/feature_store/entities/feature_spec.py +454 -0
- wedata/feature_store/entities/feature_spec_constants.py +25 -0
- wedata/feature_store/entities/feature_table.py +164 -0
- wedata/feature_store/entities/feature_table_info.py +40 -0
- wedata/feature_store/entities/function_info.py +184 -0
- wedata/feature_store/entities/on_demand_column_info.py +44 -0
- wedata/feature_store/entities/source_data_column_info.py +21 -0
- wedata/feature_store/entities/training_set.py +134 -0
- wedata/feature_store/feature_table_client/__init__.py +0 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +313 -0
- wedata/feature_store/spark_client/__init__.py +0 -0
- wedata/feature_store/spark_client/spark_client.py +286 -0
- wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata/feature_store/training_set_client/training_set_client.py +196 -0
- wedata/feature_store/utils/__init__.py +0 -0
- wedata/feature_store/utils/common_utils.py +96 -0
- wedata/feature_store/utils/feature_lookup_utils.py +570 -0
- wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata/feature_store/utils/feature_utils.py +73 -0
- wedata/feature_store/utils/schema_utils.py +117 -0
- wedata/feature_store/utils/topological_sort.py +158 -0
- wedata/feature_store/utils/training_set_utils.py +580 -0
- wedata/feature_store/utils/uc_utils.py +281 -0
- wedata/feature_store/utils/utils.py +252 -0
- wedata/feature_store/utils/validation_utils.py +55 -0
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.5.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.5.dist-info/RECORD +79 -0
- wedata_feature_engineering-0.1.5.dist-info/top_level.txt +1 -0
- wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
- wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
- {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.5.dist-info}/WHEEL +0 -0
wedata/feature_store/training_set_client/training_set_client.py
@@ -0,0 +1,196 @@
+import json
+import logging
+import os
+from collections import defaultdict
+from types import ModuleType
+from typing import Any, Dict, List, Optional, Set, Union
+
+import mlflow
+import yaml
+from mlflow.models import Model, ModelSignature
+from mlflow.utils.file_utils import TempDir, YamlSafeDumper, read_yaml
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import struct
+
+from feature_store.entities.feature_column_info import FeatureColumnInfo
+from feature_store.entities.feature_function import FeatureFunction
+from feature_store.entities.feature_lookup import FeatureLookup
+from feature_store.entities.feature_spec import FeatureSpec
+from feature_store.entities.training_set import TrainingSet
+from feature_store.spark_client.spark_client import SparkClient
+
+from feature_store.constants.constants import (
+    _NO_RESULT_TYPE_PASSED,
+    _PREBUILT_ENV_URI,
+    _USE_SPARK_NATIVE_JOIN,
+    _WARN,
+    MODEL_DATA_PATH_ROOT,
+    PREDICTION_COLUMN_NAME,
+)
+
+from feature_store.utils import common_utils, training_set_utils
+from feature_store.utils.feature_spec_utils import convert_to_yaml_string
+
+_logger = logging.getLogger(__name__)
+
+FEATURE_SPEC_GRAPH_MAX_COLUMN_INFO = 1000
+
+
+class TrainingSetClient:
+    def __init__(
+        self,
+        spark_client: SparkClient
+    ):
+        self._spark_client = spark_client
+
+    def create_training_set(
+        self,
+        feature_spec: FeatureSpec,
+        feature_column_infos: List[FeatureColumnInfo],
+        label_names: List[str],
+        df: DataFrame,
+        ft_metadata: training_set_utils._FeatureTableMetadata,
+        kwargs,
+    ):
+        uc_function_infos = training_set_utils.get_uc_function_infos(
+            self._spark_client,
+            {odci.udf_name for odci in feature_spec.on_demand_column_infos},
+        )
+
+        # TODO(divyagupta-db): Move validation from _validate_join_feature_data in feature_lookup_utils.py
+        #  to a helper function called here and in score_batch.
+
+        # Add consumer of each feature and instrument as final step
+        consumer_feature_table_map = defaultdict(list)
+        for feature in feature_column_infos:
+            consumer_feature_table_map[feature.table_name].append(feature.feature_name)
+        consumed_udf_names = [f.udf_name for f in feature_spec.function_infos]
+
+        # Spark query planning is known to cause spark driver to crash if there are many feature tables to PiT join.
+        # See https://docs.google.com/document/d/1EyA4vvlWikTJMeinsLkxmRAVNlXoF1eqoZElOdqlWyY/edit
+        # So we disable native join by default.
+        training_set_utils.warn_if_non_photon_for_native_spark(
+            kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
+        )
+        return TrainingSet(
+            feature_spec,
+            df,
+            label_names,
+            ft_metadata.feature_table_metadata_map,
+            ft_metadata.feature_table_data_map,
+            uc_function_infos,
+            kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
+        )
+
+    def create_training_set_from_feature_lookups(
+        self,
+        df: DataFrame,
+        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
+        label: Union[str, List[str], None],
+        exclude_columns: List[str],
+        **kwargs,
+    ) -> TrainingSet:
+
+        # Split the requested features into feature lookups and feature functions
+        features = feature_lookups
+        feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+        feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+        # If no label is provided, initialize label_names as an empty list
+        label_names = common_utils.as_list(label, [])
+        del label
+
+        # Validate the DataFrame and the labels
+        training_set_utils.verify_df_and_labels(df, label_names, exclude_columns)
+
+        # Fetch the feature table metadata
+        ft_metadata = training_set_utils.get_table_metadata(
+            self._spark_client,
+            {fl.table_name for fl in feature_lookups}
+        )
+
+        column_infos = training_set_utils.get_column_infos(
+            feature_lookups,
+            feature_functions,
+            ft_metadata,
+            df_columns=df.columns,
+            label_names=label_names,
+        )
+
+        training_set_utils.validate_column_infos(
+            self._spark_client,
+            ft_metadata,
+            column_infos.source_data_column_infos,
+            column_infos.feature_column_infos,
+            column_infos.on_demand_column_infos,
+            label_names,
+        )
+
+        # Build feature_spec locally for comparison with the feature spec yaml generated by the
+        # FeatureStore backend. This will be removed once the migration is validated.
+        feature_spec = training_set_utils.build_feature_spec(
+            feature_lookups,
+            ft_metadata,
+            column_infos,
+            exclude_columns
+        )
+
+        return self.create_training_set(
+            feature_spec,
+            column_infos.feature_column_infos,
+            label_names,
+            df,
+            ft_metadata,
+            kwargs=kwargs,
+        )
+
+
+
+
+
+    def create_feature_spec(
+        self,
+        name: str,
+        features: List[Union[FeatureLookup, FeatureFunction]],
+        sparkClient: SparkClient,
+        exclude_columns: List[str] = [],
+    ) -> FeatureSpec:
+
+        feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
+        feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
+
+        # A maximum of 100 FeatureFunctions is supported
+        if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
+            raise ValueError(
+                f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
+            )
+
+        # Get feature table metadata and column infos
+        ft_metadata = training_set_utils.get_table_metadata(
+            self._spark_client,
+            {fl.table_name for fl in feature_lookups}
+        )
+        column_infos = training_set_utils.get_column_infos(
+            feature_lookups,
+            feature_functions,
+            ft_metadata,
+        )
+
+        column_infos = training_set_utils.add_inferred_source_columns(column_infos)
+
+        training_set_utils.validate_column_infos(
+            self._spark_client,
+            ft_metadata,
+            column_infos.source_data_column_infos,
+            column_infos.feature_column_infos,
+            column_infos.on_demand_column_infos,
+        )
+
+        feature_spec = training_set_utils.build_feature_spec(
+            feature_lookups,
+            ft_metadata,
+            column_infos,
+            exclude_columns
+        )
+
+        return feature_spec
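For orientation, a minimal usage sketch of the client defined above follows. It is illustrative only: it assumes an active SparkSession, that SparkClient wraps that session, that FeatureLookup accepts Databricks-style table_name/lookup_key/feature_names arguments, and that TrainingSet exposes a load_df() method; the table, column, and label names are invented, and the import prefix may need to be wedata. depending on how the installed wheel resolves its packages.

# Illustrative sketch only -- names and constructor signatures below are assumptions, not part of this diff.
from pyspark.sql import SparkSession

from feature_store.entities.feature_lookup import FeatureLookup
from feature_store.spark_client.spark_client import SparkClient
from feature_store.training_set_client.training_set_client import TrainingSetClient

spark = SparkSession.builder.getOrCreate()
spark_client = SparkClient(spark)           # assumption: SparkClient wraps an active SparkSession
client = TrainingSetClient(spark_client)

lookups = [
    FeatureLookup(                          # assumption: Databricks-style FeatureLookup arguments
        table_name="demo_db.user_features", # hypothetical feature table
        lookup_key="user_id",
        feature_names=["age", "purchases_30d"],
    )
]

# Hypothetical label DataFrame holding the lookup key and the label column
label_df = spark.createDataFrame([(1, 0), (2, 1)], ["user_id", "clicked"])

training_set = client.create_training_set_from_feature_lookups(
    df=label_df,
    feature_lookups=lookups,
    label="clicked",
    exclude_columns=[],
)
train_df = training_set.load_df()           # assumption: TrainingSet exposes load_df()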
wedata/feature_store/utils/common_utils.py
@@ -0,0 +1,96 @@
+"""
+Common utility functions.
+"""
+
+from collections import Counter
+from typing import Any, List
+
+from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
+from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
+
+
+def is_artifact_uri(uri):
+    """
+    Checks whether the artifact URI is associated with an MLflow model or run.
+    The actual URI can be a model URI, model URI + subdirectory, or model URI + path to artifact file.
+    """
+    return ModelsArtifactRepository.is_models_uri(
+        uri
+    ) or RunsArtifactRepository.is_runs_uri(uri)
+
+def as_list(obj, default=None):
+    if not obj:
+        return default
+    elif isinstance(obj, list):
+        return obj
+    else:
+        return [obj]
+
+def get_duplicates(elements: List[Any]) -> List[Any]:
+    """
+    Returns duplicate elements in the order they first appear.
+    """
+    element_counts = Counter(elements)
+    duplicates = []
+    for e in element_counts.keys():
+        if element_counts[e] > 1:
+            duplicates.append(e)
+    return duplicates
+
+def validate_strings_unique(strings: List[str], error_template: str):
+    """
+    Validates that all strings are unique; otherwise raises a ValueError with the error template and duplicates.
+    Passes single-quoted, comma-delimited duplicates to the error template.
+    """
+    duplicate_strings = get_duplicates(strings)
+    if duplicate_strings:
+        duplicates_formatted = ", ".join([f"'{s}'" for s in duplicate_strings])
+        raise ValueError(error_template.format(duplicates_formatted))
+
+def sanitize_identifier(identifier: str):
+    """
+    Sanitize and wrap an identifier with backquotes. For example, "a`b" becomes "`a``b`".
+    Use this function to sanitize identifiers such as column names in SQL and PySpark.
+    """
+    return f"`{identifier.replace('`', '``')}`"
+
+
+def sanitize_identifiers(identifiers: List[str]):
+    """
+    Sanitize and wrap the identifiers in a list with backquotes.
+    """
+    return [sanitize_identifier(i) for i in identifiers]
+
+
+def sanitize_multi_level_name(multi_level_name: str):
+    """
+    Sanitize a multi-level name (such as a Unity Catalog table name) by sanitizing each segment
+    and joining the results. For example, "ca+t.fo`o.ba$r" becomes "`ca+t`.`fo``o`.`ba$r`".
+    """
+    segments = multi_level_name.split(".")
+    return ".".join(sanitize_identifiers(segments))
+
+
+def unsanitize_identifier(identifier: str):
+    """
+    Unsanitize an identifier. Useful when we get a possibly sanitized identifier from Spark or
+    somewhere else, but we need an unsanitized one.
+    Note: This function does not check the correctness of the identifier passed in. e.g. `foo``
+    is not a valid sanitized identifier. When given such invalid input, this function returns
+    invalid output.
+    """
+    if len(identifier) >= 2 and identifier[0] == "`" and identifier[-1] == "`":
+        return identifier[1:-1].replace("``", "`")
+    else:
+        return identifier
+
+
+# Strings containing \ or ' can break SQL statements, so escape them.
+def escape_sql_string(input_str: str) -> str:
+    return input_str.replace("\\", "\\\\").replace("'", "\\'")
+
+def get_unique_list_order(elements: List[Any]) -> List[Any]:
+    """
+    Returns unique elements in the order they first appear.
+    """
+    return list(dict.fromkeys(elements))
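The helpers above are self-contained, so their behavior can be checked directly; the expected values below follow from the function bodies in this hunk. The import path is an assumption based on the wheel layout (top-level package wedata).

# Sanity checks for common_utils; expected values are derived from the code above.
from wedata.feature_store.utils.common_utils import (  # path assumed from the wheel layout
    get_duplicates,
    get_unique_list_order,
    sanitize_identifier,
    sanitize_multi_level_name,
    unsanitize_identifier,
)

assert sanitize_identifier("a`b") == "`a``b`"                    # backquotes doubled, then wrapped
assert sanitize_multi_level_name("ca+t.fo`o.ba$r") == "`ca+t`.`fo``o`.`ba$r`"
assert unsanitize_identifier("`a``b`") == "a`b"                  # inverse of sanitize_identifier
assert get_duplicates(["a", "b", "a", "c", "b"]) == ["a", "b"]   # duplicates, first-appearance order
assert get_unique_list_order(["a", "b", "a", "c"]) == ["a", "b", "c"]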