tencent-wedata-feature-engineering-dev 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic.
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
- tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
- wedata/__init__.py +9 -0
- wedata/feature_store/__init__.py +0 -0
- wedata/feature_store/client.py +462 -0
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +86 -0
- wedata/feature_store/cloud_sdk_client/models.py +686 -0
- wedata/feature_store/cloud_sdk_client/utils.py +32 -0
- wedata/feature_store/common/__init__.py +0 -0
- wedata/feature_store/common/protos/__init__.py +0 -0
- wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +48 -0
- wedata/feature_store/constants/__init__.py +0 -0
- wedata/feature_store/constants/constants.py +59 -0
- wedata/feature_store/constants/engine_types.py +34 -0
- wedata/feature_store/entities/__init__.py +0 -0
- wedata/feature_store/entities/column_info.py +138 -0
- wedata/feature_store/entities/environment_variables.py +55 -0
- wedata/feature_store/entities/feature.py +53 -0
- wedata/feature_store/entities/feature_column_info.py +72 -0
- wedata/feature_store/entities/feature_function.py +55 -0
- wedata/feature_store/entities/feature_lookup.py +200 -0
- wedata/feature_store/entities/feature_spec.py +489 -0
- wedata/feature_store/entities/feature_spec_constants.py +25 -0
- wedata/feature_store/entities/feature_table.py +111 -0
- wedata/feature_store/entities/feature_table_info.py +49 -0
- wedata/feature_store/entities/function_info.py +90 -0
- wedata/feature_store/entities/on_demand_column_info.py +57 -0
- wedata/feature_store/entities/source_data_column_info.py +24 -0
- wedata/feature_store/entities/training_set.py +135 -0
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +482 -0
- wedata/feature_store/feature_table_client/__init__.py +0 -0
- wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
- wedata/feature_store/mlflow_model.py +17 -0
- wedata/feature_store/spark_client/__init__.py +0 -0
- wedata/feature_store/spark_client/spark_client.py +289 -0
- wedata/feature_store/training_set_client/__init__.py +0 -0
- wedata/feature_store/training_set_client/training_set_client.py +572 -0
- wedata/feature_store/utils/__init__.py +0 -0
- wedata/feature_store/utils/common_utils.py +352 -0
- wedata/feature_store/utils/env_utils.py +86 -0
- wedata/feature_store/utils/feature_lookup_utils.py +564 -0
- wedata/feature_store/utils/feature_spec_utils.py +286 -0
- wedata/feature_store/utils/feature_utils.py +73 -0
- wedata/feature_store/utils/on_demand_utils.py +107 -0
- wedata/feature_store/utils/schema_utils.py +117 -0
- wedata/feature_store/utils/signature_utils.py +202 -0
- wedata/feature_store/utils/topological_sort.py +158 -0
- wedata/feature_store/utils/training_set_utils.py +579 -0
- wedata/feature_store/utils/uc_utils.py +296 -0
- wedata/feature_store/utils/validation_utils.py +79 -0
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +448 -0
- wedata/tempo/intervals.py +1331 -0
- wedata/tempo/io.py +61 -0
- wedata/tempo/ml.py +129 -0
- wedata/tempo/resample.py +318 -0
- wedata/tempo/tsdf.py +1720 -0
- wedata/tempo/utils.py +254 -0
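
A wheel is an ordinary zip archive, so the listing above can be reproduced locally. The sketch below is not part of this release; the wheel filename is an assumption based on the header at the top of this page.

```python
# Minimal sketch: list the files inside the wheel to reproduce the listing above.
# The filename is assumed from the release header; adjust the path as needed.
import zipfile

WHEEL_PATH = "tencent_wedata_feature_engineering_dev-0.1.0-py3-none-any.whl"

with zipfile.ZipFile(WHEEL_PATH) as whl:  # a .whl is a standard zip archive
    for info in sorted(whl.infolist(), key=lambda i: i.filename):
        print(f"{info.filename}  ({info.file_size} bytes)")
```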
wedata/feature_store/utils/signature_utils.py

@@ -0,0 +1,202 @@

```python
import logging
from typing import Any, Dict, Optional

import mlflow
from mlflow.models import ModelSignature
from mlflow.types import ColSpec
from mlflow.types import DataType as MlflowDataType
from mlflow.types import ParamSchema, Schema

from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
from wedata.feature_store.entities.feature_spec import FeatureSpec
from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo

_logger = logging.getLogger(__name__)

# Some types (array, map, decimal, timestamp_ntz) are unsupported due to MLflow signatures
# lacking any equivalent types. We thus cannot construct a ColSpec for any column
# that uses these types.
SUPPORTED_TYPE_MAP = {
    "smallint": MlflowDataType.integer,  # Upcast to integer
    "int": MlflowDataType.integer,
    "bigint": MlflowDataType.long,
    "float": MlflowDataType.float,
    "double": MlflowDataType.double,
    "boolean": MlflowDataType.boolean,
    "date": MlflowDataType.datetime,
    "timestamp": MlflowDataType.datetime,
    "string": MlflowDataType.string,
    "binary": MlflowDataType.binary,
}


def is_unsupported_type(type_str: str):
    return type_str not in SUPPORTED_TYPE_MAP


def convert_spark_data_type_to_mlflow_signature_type(spark_type):
    return SUPPORTED_TYPE_MAP.get(spark_type)


def get_input_schema_from_feature_spec(feature_spec: FeatureSpec) -> Schema:
    """
    Produces an MLflow signature schema from a feature spec.
    Source data columns are marked as required inputs and feature columns
    (both lookups and on-demand features) are marked as optional inputs.

    :param feature_spec: FeatureSpec object with datatypes for each column.
    """
    # If we're missing any data types for any column, we are likely dealing with a
    # malformed feature spec and should halt signature construction.
    if any([ci.data_type is None for ci in feature_spec.column_infos]):
        raise Exception("Training set does not contain column data types.")

    source_data_cols = [
        ci
        for ci in feature_spec.column_infos
        if isinstance(ci.info, SourceDataColumnInfo)
    ]
    # Don't create signature if any source data columns (required) are of complex types.
    if any(
        [
            ci.data_type is None or is_unsupported_type(ci.data_type)
            for ci in source_data_cols
        ]
    ):
        raise Exception(
            "Input DataFrame contains column data types not supported by "
            "MLflow model signatures."
        )
    required_input_colspecs = [
        ColSpec(
            convert_spark_data_type_to_mlflow_signature_type(ci.data_type),
            ci.info.output_name,
            required=True,
        )
        for ci in source_data_cols
    ]
    feature_cols = [
        ci
        for ci in feature_spec.column_infos
        if isinstance(ci.info, (FeatureColumnInfo, OnDemandColumnInfo))
    ]
    unsupported_feature_cols = [
        ci for ci in feature_cols if is_unsupported_type(ci.data_type)
    ]
    optional_input_colspecs = [
        ColSpec(
            convert_spark_data_type_to_mlflow_signature_type(ci.data_type),
            ci.output_name,
            required=True,
        )
        for ci in feature_cols
        if not is_unsupported_type(ci.data_type)
    ]
    if unsupported_feature_cols:
        feat_string = ", ".join(
            [f"{ci.output_name} ({ci.data_type})" for ci in unsupported_feature_cols]
        )
        _logger.warning(
            f"The following features will not be included in the input schema because their"
            f" data types are not supported by MLflow model signatures: {feat_string}. "
            f"These features cannot be overridden during model serving."
        )

    return Schema(optional_input_colspecs)


def get_output_schema_from_labels(label_type_map: Optional[Dict[str, str]]) -> Schema:
    """
    Produces an MLflow signature schema from the provided label type map.
    :param label_type_map: Map label column name -> data type
    """
    if not label_type_map:
        raise Exception("Training set does not contain a label.")
    if any([is_unsupported_type(dtype) for dtype in label_type_map.values()]):
        raise Exception(
            "Labels are of data types not supported by MLflow model signatures."
        )
    else:
        output_colspecs = [
            ColSpec(
                convert_spark_data_type_to_mlflow_signature_type(spark_type),
                col_name,
                required=True,
            )
            for col_name, spark_type in label_type_map.items()
        ]
        return Schema(output_colspecs)


def get_mlflow_signature_from_feature_spec(
    feature_spec: FeatureSpec,
    label_type_map: Optional[Dict[str, str]],
    override_output_schema: Optional[Schema],
    params: Optional[Dict[str, Any]] = None,
) -> Optional[ModelSignature]:
    """
    Produce an MLflow signature from a feature spec and label type map.
    Source data columns are marked as required inputs and feature columns
    (both lookups and on-demand features) are marked as optional inputs.

    Reads output types from the cached label -> datatype map in the training set.
    If override_output_schema is provided, it will always be used as the output schema.

    :param feature_spec: FeatureSpec object with datatypes for each column.
    :param label_type_map: Map of label column name -> datatype
    :param override_output_schema: User-provided output schema to use if provided.
    """
    kwargs = {}
    kwargs["inputs"] = get_input_schema_from_feature_spec(feature_spec)
    try:
        output_schema = override_output_schema or get_output_schema_from_labels(
            label_type_map
        )
        kwargs["outputs"] = output_schema
    except Exception as e:
        _logger.warning(f"Could not infer an output schema: {e}")

    if params:
        try:
            from mlflow.types.utils import _infer_param_schema

            kwargs["params"] = _infer_param_schema(params)
        except Exception as e:
            _logger.warning(f"Could not infer params schema: {e}")

    return mlflow.models.ModelSignature(**kwargs)


def drop_signature_inputs_and_invalid_params(signature):
    """
    Drop ModelSignature inputs field and invalid params from params field.
    This is useful for feature store model's raw_model.
    Feature store model's input schema does not apply to raw_model's input,
    so we drop the inputs field of raw_model's signature.
    Feature store model's result_type param enables setting and overriding
    a default result_type for predictions, but this interferes with params
    passed to MLflow's predict function, so we drop result_type from
    the params field of raw_model's signature.

    :param signature: ModelSignature object.
    """
    if signature:
        outputs_schema = signature.outputs
        params_schema = signature.params if hasattr(signature, "params") else None
        try:
            # Only for mlflow>=2.6.0 ModelSignature contains params attribute
            if params_schema:
                updated_params_schema = ParamSchema(
                    [param for param in params_schema if param.name != "result_type"]
                )
                return ModelSignature(
                    outputs=outputs_schema, params=updated_params_schema
                )
            if outputs_schema:
                return ModelSignature(outputs=outputs_schema)
        except TypeError:
            _logger.warning(
                "ModelSignature without inputs is not supported, please upgrade "
                "mlflow >= 2.7.0 to use the feature."
            )
```
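For orientation, a minimal usage sketch of the helpers above (not part of the wheel). It assumes mlflow is installed and the package is importable; the behavior noted in the comments follows from SUPPORTED_TYPE_MAP and get_output_schema_from_labels as published.

```python
# Minimal sketch (not part of the package) exercising the type-mapping helpers above.
from wedata.feature_store.utils.signature_utils import (
    convert_spark_data_type_to_mlflow_signature_type,
    get_output_schema_from_labels,
    is_unsupported_type,
)

# Spark SQL type names are translated to MLflow signature types via SUPPORTED_TYPE_MAP.
print(convert_spark_data_type_to_mlflow_signature_type("bigint"))    # DataType.long
print(convert_spark_data_type_to_mlflow_signature_type("smallint"))  # upcast to DataType.integer

# Types without an MLflow equivalent (array, map, decimal, timestamp_ntz) are unsupported.
print(is_unsupported_type("decimal(10,2)"))  # True

# Labels with supported types yield an output Schema; unsupported label types raise.
output_schema = get_output_schema_from_labels({"label": "double"})
print(output_schema)  # a Schema with one required double column named 'label'
```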
wedata/feature_store/utils/topological_sort.py

@@ -0,0 +1,158 @@

```python
from collections import defaultdict, deque
from queue import PriorityQueue
from typing import Callable, Dict, Hashable, List, Optional

__all__ = ["find_cycle", "topological_sort"]


class _NodeInfo:
    def __init__(self, node):
        self.node = node
        # number of non-processed predecessors.
        self.n_blockers = 0
        # list of nodes that depend on this node.
        self.successors = []


def find_cycle(
    node_dependencies: Dict[Hashable, List[Hashable]]
) -> Optional[List[Hashable]]:
    """
    Finds a cycle in the node_dependencies graph. Returns a list of node(s) that forms a cycle or
    None if no cycle can be found.
    :param node_dependencies: A dict with hashable objects as keys and their list of dependency
                              nodes as values.
    """
    # A stack used to perform DFS on the graph.
    stack = deque()
    # Another stack storing the path from root to current node in DFS. Used to detect cycle.
    backtrack_stack = []
    # A set of nodes that no cycle can be found starting from these nodes.
    resolved = set()
    # Create a copy of the dependency graph with defaultdict for convenience.
    default_dependency = defaultdict(list)
    default_dependency.update(node_dependencies)

    # Perform DFS on every node in the graph.
    for node in node_dependencies.keys():
        if node in resolved:
            # Skip a node if it's already resolved.
            continue
        # DFS from the node
        stack.append(node)
        while stack:
            top = stack[-1]
            if top not in backtrack_stack:
                # First time visiting this node. There will be a second visit after the dependencies
                # are resolved if it has dependencies.
                backtrack_stack.append(top)
            # If not expanded after traversing the dependencies, meaning there is no dependency or
            # all dependencies are resolved.
            expanded = False
            for depend in default_dependency[top]:
                if depend in backtrack_stack:
                    # found a cycle
                    index = backtrack_stack.index(depend)
                    return backtrack_stack[index:]
                if depend in resolved:
                    continue
                # Only adding node to stack. backtrack_stack only contains nodes in the current DFS
                # path.
                stack.append(depend)
                expanded = True
            if not expanded:
                stack.pop()
                resolved.add(top)
                backtrack_stack.pop()
    return None


def _all_items_in_queue_should_be_grouped(
    queue: PriorityQueue, should_be_grouped: Callable
) -> bool:
    temp = []
    should_group = True
    # note: avoid using queue.qsize() because it's not guaranteed to be accurate.
    while not queue.empty():
        k, node = queue.get()
        temp.append((k, node))
        if not should_be_grouped(node):
            should_group = False
    for item in temp:
        queue.put(item)
    return should_group


def topological_sort(
    node_dependencies: Dict[Hashable, List[Hashable]],
    key: Callable = None,
    should_be_grouped: Callable = None,
) -> List[Hashable]:
    """
    Topological sort the given node_dependencies graph. Returns a sorted list of nodes.
    :param node_dependencies: A dict with hashable objects as keys and their list of dependency
                              nodes as values.
    :param key: a Callable that returns a sort key when called with a hashable object. The key is
                used to break ties in topological sorting. An object with smaller key is added
                to the result list first.
    :raises ValueError if a cycle is found in the graph.
    """
    # Calling a dedicated find_cycle function to be able to give a detailed error message.
    cycle = find_cycle(node_dependencies)
    if cycle is not None:
        raise ValueError(
            "Following nodes form a cycle: ",
            cycle,
            ". Please resolve any circular dependencies before calling Feature Store.",
        )

    # A priority-queue storing the nodes whose dependency has been resolved.
    # priority is determined by the given key function.
    ready_queue = PriorityQueue()
    # Map from node to _NodeInfo.
    nodes = {}
    if key is None:
        key = hash  # use the built-in hash function by default
    if should_be_grouped is None:
        should_be_grouped = lambda _: False
    # Perform Kahn's algorithm by traversing the graph starting from nodes without dependency.
    # Node is removed from its successors' dependency once resolved. And node whose dependency gets
    # all resolved is added to the priority queue.
    for node, dependencies in node_dependencies.items():
        # Initialize the graph to topologically sort based on the input node_dependencies.
        # All nodes, its successors and number of predecessors should be populated.
        if node not in nodes:
            nodes[node] = _NodeInfo(node)
        for dependency in dependencies:
            if dependency not in nodes:
                nodes[dependency] = _NodeInfo(dependency)
            nodes[dependency].successors.append(node)
        if len(dependencies):
            nodes[node].n_blockers = len(dependencies)
    # Initialize the ready_queue to start traversing the graph from nodes without any dependencies.
    for node, node_info in nodes.items():
        if node_info.n_blockers == 0:
            ready_queue.put((key(node), node))
    # At the end of the algorithm, result_list will have a topologically sorted list of nodes.
    result_list = []

    def process_nodes(node_buffer, queue):
        for node in node_buffer:
            result_list.append(node)
            for successor in nodes[node].successors:
                s_info = nodes[successor]
                s_info.n_blockers -= 1
                if s_info.n_blockers == 0:
                    queue.put((key(successor), successor))

    while not ready_queue.empty():
        if _all_items_in_queue_should_be_grouped(ready_queue, should_be_grouped):
            batch_buffer = []
            while not ready_queue.empty():
                _, node = ready_queue.get()
                batch_buffer.append(node)
            process_nodes(batch_buffer, ready_queue)
        else:
            _, node = ready_queue.get()
            process_nodes([node], ready_queue)
    return result_list
```
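A minimal usage sketch of the two public helpers above (not part of the wheel), assuming the package is importable; the expected results in the comments follow from the code as published.

```python
# Minimal sketch (not part of the package) showing find_cycle and topological_sort.
from wedata.feature_store.utils.topological_sort import find_cycle, topological_sort

# Each key lists the nodes it depends on: "b" needs "a", "c" needs "a" and "b".
deps = {"a": [], "b": ["a"], "c": ["a", "b"]}
print(topological_sort(deps, key=str))  # ['a', 'b', 'c'] -- dependencies first, ties broken by key

# A circular dependency is surfaced by find_cycle and rejected by topological_sort.
print(find_cycle({"x": ["y"], "y": ["x"]}))  # ['x', 'y']
try:
    topological_sort({"x": ["y"], "y": ["x"]})
except ValueError as err:
    print(err)  # "Following nodes form a cycle: ..."
```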