mlrun 1.3.2rc1__py3-none-any.whl → 1.3.2rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/api/api/deps.py +14 -1
- mlrun/api/api/endpoints/frontend_spec.py +0 -2
- mlrun/api/api/endpoints/functions.py +15 -27
- mlrun/api/api/endpoints/grafana_proxy.py +435 -74
- mlrun/api/api/endpoints/healthz.py +5 -18
- mlrun/api/api/endpoints/model_endpoints.py +33 -37
- mlrun/api/api/utils.py +6 -13
- mlrun/api/crud/__init__.py +14 -16
- mlrun/api/crud/logs.py +5 -7
- mlrun/api/crud/model_monitoring/__init__.py +2 -2
- mlrun/api/crud/model_monitoring/model_endpoint_store.py +847 -0
- mlrun/api/crud/model_monitoring/model_endpoints.py +105 -328
- mlrun/api/crud/pipelines.py +2 -3
- mlrun/api/db/sqldb/models/models_mysql.py +52 -19
- mlrun/api/db/sqldb/models/models_sqlite.py +52 -19
- mlrun/api/db/sqldb/session.py +19 -26
- mlrun/api/schemas/__init__.py +2 -0
- mlrun/api/schemas/constants.py +0 -13
- mlrun/api/schemas/frontend_spec.py +0 -1
- mlrun/api/schemas/model_endpoints.py +38 -195
- mlrun/api/schemas/schedule.py +2 -2
- mlrun/api/utils/clients/log_collector.py +5 -0
- mlrun/builder.py +9 -41
- mlrun/config.py +1 -76
- mlrun/data_types/__init__.py +1 -6
- mlrun/data_types/data_types.py +1 -3
- mlrun/datastore/__init__.py +2 -9
- mlrun/datastore/sources.py +20 -25
- mlrun/datastore/store_resources.py +1 -1
- mlrun/datastore/targets.py +34 -67
- mlrun/datastore/utils.py +4 -26
- mlrun/db/base.py +2 -4
- mlrun/db/filedb.py +5 -13
- mlrun/db/httpdb.py +32 -64
- mlrun/db/sqldb.py +2 -4
- mlrun/errors.py +0 -5
- mlrun/execution.py +0 -2
- mlrun/feature_store/api.py +8 -24
- mlrun/feature_store/feature_set.py +6 -28
- mlrun/feature_store/feature_vector.py +0 -2
- mlrun/feature_store/ingestion.py +11 -8
- mlrun/feature_store/retrieval/base.py +43 -271
- mlrun/feature_store/retrieval/dask_merger.py +153 -55
- mlrun/feature_store/retrieval/job.py +3 -12
- mlrun/feature_store/retrieval/local_merger.py +130 -48
- mlrun/feature_store/retrieval/spark_merger.py +125 -126
- mlrun/features.py +2 -7
- mlrun/model_monitoring/constants.py +6 -48
- mlrun/model_monitoring/helpers.py +35 -118
- mlrun/model_monitoring/model_monitoring_batch.py +260 -293
- mlrun/model_monitoring/stream_processing_fs.py +253 -220
- mlrun/platforms/iguazio.py +0 -33
- mlrun/projects/project.py +72 -34
- mlrun/runtimes/base.py +0 -5
- mlrun/runtimes/daskjob.py +0 -2
- mlrun/runtimes/function.py +3 -29
- mlrun/runtimes/kubejob.py +15 -39
- mlrun/runtimes/local.py +45 -7
- mlrun/runtimes/mpijob/abstract.py +0 -2
- mlrun/runtimes/mpijob/v1.py +0 -2
- mlrun/runtimes/pod.py +0 -2
- mlrun/runtimes/remotesparkjob.py +0 -2
- mlrun/runtimes/serving.py +0 -6
- mlrun/runtimes/sparkjob/abstract.py +2 -39
- mlrun/runtimes/sparkjob/spark3job.py +0 -2
- mlrun/serving/__init__.py +1 -2
- mlrun/serving/routers.py +35 -35
- mlrun/serving/server.py +12 -22
- mlrun/serving/states.py +30 -162
- mlrun/serving/v2_serving.py +10 -13
- mlrun/utils/clones.py +1 -1
- mlrun/utils/model_monitoring.py +96 -122
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/METADATA +27 -23
- {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/RECORD +79 -92
- mlrun/api/crud/model_monitoring/grafana.py +0 -427
- mlrun/datastore/spark_udf.py +0 -40
- mlrun/model_monitoring/__init__.py +0 -44
- mlrun/model_monitoring/common.py +0 -112
- mlrun/model_monitoring/model_endpoint.py +0 -141
- mlrun/model_monitoring/stores/__init__.py +0 -106
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -448
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -23
- mlrun/model_monitoring/stores/models/base.py +0 -18
- mlrun/model_monitoring/stores/models/mysql.py +0 -100
- mlrun/model_monitoring/stores/models/sqlite.py +0 -98
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -375
- mlrun/utils/db.py +0 -52
- {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/WHEEL +0 -0
- {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.3.2rc1.dist-info → mlrun-1.3.2rc2.dist-info}/top_level.txt +0 -0

mlrun/feature_store/retrieval/dask_merger.py

@@ -20,6 +20,7 @@ from dask.distributed import Client

 import mlrun

+from ..feature_vector import OfflineVectorResponse
 from .base import BaseMerger

@@ -31,6 +32,139 @@ class DaskFeatureMerger(BaseMerger):
         self.client = engine_args.get("dask_client")
         self._dask_cluster_uri = engine_args.get("dask_cluster_uri")

+    def _generate_vector(
+        self,
+        entity_rows,
+        entity_timestamp_column,
+        feature_set_objects,
+        feature_set_fields,
+        start_time=None,
+        end_time=None,
+        query=None,
+    ):
+        if "index" not in self._index_columns:
+            self._append_drop_column("index")
+
+        # init the dask client if needed
+        if not self.client:
+            if self._dask_cluster_uri:
+                function = mlrun.import_function(self._dask_cluster_uri)
+                self.client = function.client
+            else:
+                self.client = Client()
+
+        # load dataframes
+        feature_sets = []
+        dfs = []
+        keys = (
+            []
+        )  # the struct of key is [[[],[]], ..] So that each record indicates which way the corresponding
+        # featureset is connected to the previous one, and within each record the left keys are indicated in index 0
+        # and the right keys in index 1, this keys will be the keys that will be used in this join
+        all_columns = []
+
+        fs_link_list = self._create_linked_relation_list(
+            feature_set_objects, feature_set_fields
+        )
+
+        for node in fs_link_list:
+            name = node.name
+            feature_set = feature_set_objects[name]
+            feature_sets.append(feature_set)
+            columns = feature_set_fields[name]
+            column_names = [name for name, alias in columns]
+
+            for col in node.data["save_cols"]:
+                if col not in column_names:
+                    self._append_drop_column(col)
+            column_names += node.data["save_cols"]
+
+            df = feature_set.to_dataframe(
+                columns=column_names,
+                df_module=dd,
+                start_time=start_time,
+                end_time=end_time,
+                time_column=entity_timestamp_column,
+                index=False,
+            )
+
+            df = df.reset_index()
+            column_names += node.data["save_index"]
+            node.data["save_cols"] += node.data["save_index"]
+            entity_timestamp_column_list = (
+                [entity_timestamp_column]
+                if entity_timestamp_column
+                else feature_set.spec.timestamp_key
+            )
+            if entity_timestamp_column_list:
+                column_names += entity_timestamp_column_list
+                node.data["save_cols"] += entity_timestamp_column_list
+
+            df = df.persist()
+
+            # rename columns to be unique for each feature set
+            rename_col_dict = {
+                col: f"{col}_{name}"
+                for col in column_names
+                if col not in node.data["save_cols"]
+            }
+            df = df.rename(
+                columns=rename_col_dict,
+            )
+
+            dfs.append(df)
+            del df
+
+            keys.append([node.data["left_keys"], node.data["right_keys"]])
+
+            # update alias according to the unique column name
+            new_columns = []
+            for col, alias in columns:
+                if col in rename_col_dict and alias:
+                    new_columns.append((rename_col_dict[col], alias))
+                elif col in rename_col_dict and not alias:
+                    new_columns.append((rename_col_dict[col], col))
+                else:
+                    new_columns.append((col, alias))
+            all_columns.append(new_columns)
+            self._update_alias(
+                dictionary={name: alias for name, alias in new_columns if alias}
+            )
+
+        self.merge(
+            entity_df=entity_rows,
+            entity_timestamp_column=entity_timestamp_column,
+            featuresets=feature_sets,
+            featureset_dfs=dfs,
+            keys=keys,
+            all_columns=all_columns,
+        )
+
+        self._result_df = self._result_df.drop(
+            columns=self._drop_columns, errors="ignore"
+        )
+
+        # renaming all columns according to self._alias
+        self._result_df = self._result_df.rename(
+            columns=self._alias,
+        )
+
+        if self.vector.status.label_column:
+            self._result_df = self._result_df.dropna(
+                subset=[self.vector.status.label_column]
+            )
+        # filter joined data frame by the query param
+        if query:
+            self._result_df = self._result_df.query(query)
+
+        if self._drop_indexes:
+            self._result_df = self._reset_index(self._result_df)
+        else:
+            self._result_df = self._set_indexes(self._result_df)
+        self._write_to_target()
+
+        return OfflineVectorResponse(self)
+
     def _reset_index(self, df):
         to_drop = df.index.name is None
         df = df.reset_index(drop=to_drop)
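
The column bookkeeping in `_generate_vector` above is easier to see in isolation: every non-key column is renamed to `<column>_<feature set>` so that identically named features from different feature sets cannot collide, and the unique names are mapped back to the requested aliases after the join. A minimal standalone pandas sketch of that pattern follows; the feature-set names, columns, and aliases are hypothetical, and this is not mlrun's internal wiring.

import pandas as pd

# Hypothetical feature-set frames that share a column name ("value").
stocks = pd.DataFrame({"ticker": ["A", "B"], "value": [1.0, 2.0]})
quotes = pd.DataFrame({"ticker": ["A", "B"], "value": [10.0, 20.0]})

def uniquify(df, fs_name, join_keys):
    # Same idea as rename_col_dict: suffix every non-key column with the
    # feature-set name so identically named features cannot collide.
    rename_col_dict = {c: f"{c}_{fs_name}" for c in df.columns if c not in join_keys}
    return df.rename(columns=rename_col_dict), rename_col_dict

stocks_u, stocks_renames = uniquify(stocks, "stocks", ["ticker"])
quotes_u, quotes_renames = uniquify(quotes, "quotes", ["ticker"])

merged = stocks_u.merge(quotes_u, on="ticker")

# Afterwards the unique names are mapped back to the user-requested aliases,
# mirroring what _update_alias() accumulates into self._alias.
alias = {stocks_renames["value"]: "stock_value", quotes_renames["value"]: "quote_value"}
print(merged.rename(columns=alias))
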
@@ -44,13 +178,27 @@ class DaskFeatureMerger(BaseMerger):
         featureset_df,
         left_keys: list,
         right_keys: list,
+        columns: list,
     ):

+        entity_df = self._reset_index(entity_df)
+        entity_df = (
+            entity_df
+            if entity_timestamp_column not in entity_df
+            else entity_df.set_index(entity_timestamp_column, drop=True)
+        )
+        featureset_df = self._reset_index(featureset_df)
+        featureset_df = (
+            featureset_df
+            if entity_timestamp_column not in featureset_df
+            else featureset_df.set_index(entity_timestamp_column, drop=True)
+        )
+
         merged_df = merge_asof(
             entity_df,
             featureset_df,
-
-
+            left_index=True,
+            right_index=True,
             left_by=left_keys or None,
             right_by=right_keys or None,
             suffixes=("", f"_{featureset.metadata.name}_"),
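
The `left_index=True` / `right_index=True` change above means the as-of join now matches on the timestamp index of both frames, grouped by the entity keys. A minimal pandas sketch of that join shape (dask's `merge_asof` mirrors the pandas signature); the column names and timestamps are hypothetical.

import pandas as pd

entity_df = pd.DataFrame(
    {"time": pd.to_datetime(["2023-01-01 10:00", "2023-01-01 11:00"]),
     "ticker": ["A", "A"]}
).set_index("time")

featureset_df = pd.DataFrame(
    {"time": pd.to_datetime(["2023-01-01 09:55", "2023-01-01 10:45"]),
     "ticker": ["A", "A"],
     "price": [100.0, 101.0]}
).set_index("time")

# Match each entity row with the most recent feature row at or before it,
# joining on the index (the timestamp) and grouping by the entity key.
merged = pd.merge_asof(
    entity_df,
    featureset_df,
    left_index=True,
    right_index=True,
    left_by="ticker",
    right_by="ticker",
    suffixes=("", "_stocks_"),
)
print(merged)
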
@@ -69,6 +217,7 @@ class DaskFeatureMerger(BaseMerger):
         featureset_df,
         left_keys: list,
         right_keys: list,
+        columns: list,
     ):

         fs_name = featureset.metadata.name
@@ -92,56 +241,5 @@ class DaskFeatureMerger(BaseMerger):

     def get_df(self, to_pandas=True):
         if to_pandas and hasattr(self._result_df, "dask"):
-
-
-        df = self._result_df
-        self._set_indexes(df)
-        return df
-
-    def _create_engine_env(self):
-        if "index" not in self._index_columns:
-            self._append_drop_column("index")
-
-        # init the dask client if needed
-        if not self.client:
-            if self._dask_cluster_uri:
-                function = mlrun.import_function(self._dask_cluster_uri)
-                self.client = function.client
-            else:
-                self.client = Client()
-
-    def _get_engine_df(
-        self,
-        feature_set,
-        feature_set_name,
-        column_names=None,
-        start_time=None,
-        end_time=None,
-        entity_timestamp_column=None,
-    ):
-        df = feature_set.to_dataframe(
-            columns=column_names,
-            df_module=dd,
-            start_time=start_time,
-            end_time=end_time,
-            time_column=entity_timestamp_column,
-            index=False,
-        )
-
-        return self._reset_index(df).persist()
-
-    def _rename_columns_and_select(self, df, rename_col_dict, columns=None):
-        return df.rename(
-            columns=rename_col_dict,
-        )
-
-    def _drop_columns_from_result(self):
-        self._result_df = self._result_df.drop(
-            columns=self._drop_columns, errors="ignore"
-        )
-
-    def _filter(self, query):
-        self._result_df = self._result_df.query(query)
-
-    def _order_by(self, order_by_active):
-        self._result_df.sort_values(by=order_by_active)
+            return self._result_df.compute()
+        return self._result_df
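
The reworked `get_df` relies on dask collections being lazy until `.compute()` materializes them as pandas, and on the `hasattr(obj, "dask")` check as a cheap way to detect a dask collection. A tiny standalone illustration, independent of mlrun:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"x": range(6), "y": range(6)})
ddf = dd.from_pandas(pdf, npartitions=2)  # lazy dask collection

print(hasattr(ddf, "dask"), hasattr(pdf, "dask"))  # True False
print(type(ddf.compute()))  # <class 'pandas.core.frame.DataFrame'>
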

mlrun/feature_store/retrieval/job.py

@@ -39,7 +39,6 @@ def run_merge_job(
     with_indexes=None,
     query=None,
     join_type="inner",
-    order_by=None,
 ):
     name = vector.metadata.name
     if not target or not hasattr(target, "to_dict"):
@@ -104,7 +103,6 @@ def run_merge_job(
             "with_indexes": with_indexes,
             "query": query,
             "join_type": join_type,
-            "order_by": order_by,
             "engine_args": engine_args,
         },
         inputs={"entity_rows": entity_rows},
@@ -149,18 +147,12 @@ class RemoteVectorResponse:
         :param df_module: optional, py module used to create the DataFrame (e.g. pd, dd, cudf, ..)
         :param kwargs: extended DataItem.as_df() args
         """
-
         file_format = kwargs.get("format")
         if not file_format:
             file_format = self.run.status.results["target"]["kind"]
-
+        return mlrun.get_dataitem(self.target_uri).as_df(
             columns=columns, df_module=df_module, format=file_format, **kwargs
         )
-        if self.vector.spec.with_indexes:
-            df.set_index(
-                list(self.vector.spec.entity_fields.keys()), inplace=True, drop=True
-            )
-        return df

     @property
     def target_uri(self):
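
The simplified `to_dataframe` now just delegates to the data-item layer. Roughly the same read a user could issue by hand against the materialized target; the store URI below is hypothetical, while the `columns`/`format` keywords mirror the call in the diff:

import mlrun

# Hypothetical parquet target produced by the merge job.
target_uri = "v3io:///projects/my-proj/featurevectors/my-vector.parquet"

df = mlrun.get_dataitem(target_uri).as_df(
    columns=None,      # or an explicit subset of columns
    format="parquet",  # the real code falls back to the run's target kind
)
print(df.head())
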
@@ -174,8 +166,7 @@ import mlrun
 import mlrun.feature_store.retrieval
 from mlrun.datastore.targets import get_target_driver
 def merge_handler(context, vector_uri, target, entity_rows=None,
-                  timestamp_column=None, drop_columns=None, with_indexes=None, query=None, join_type='inner',
-                  engine_args=None, order_by=None):
+                  timestamp_column=None, drop_columns=None, with_indexes=None, query=None, join_type='inner', engine_args=None):
     vector = context.get_store_resource(vector_uri)
     store_target = get_target_driver(target, vector)
     entity_timestamp_column = timestamp_column or vector.spec.timestamp_field
@@ -185,7 +176,7 @@ def merge_handler(context, vector_uri, target, entity_rows=None,
     context.logger.info(f"starting vector merge task to {vector.uri}")
     merger = mlrun.feature_store.retrieval.{{{engine}}}(vector, **(engine_args or {}))
     merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns, with_indexes=with_indexes,
-                 query=query, join_type=join_type
+                 query=query, join_type=join_type)

     target = vector.status.targets[store_target.name].to_dict()
     context.log_result('feature_vector', vector.uri)
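
`{{{engine}}}` in the handler template above is a placeholder that is rendered to a concrete merger class name before the job runs; it is not valid Python as written. A hypothetical sketch of that substitution step (the helper and the use of `str.replace` are assumptions, not mlrun's actual implementation):

# Hypothetical illustration only: render the {{{engine}}} placeholder in the
# handler template to a concrete merger class before submitting the job.
HANDLER_TEMPLATE = (
    "merger = mlrun.feature_store.retrieval.{{{engine}}}"
    "(vector, **(engine_args or {}))"
)

def render_handler(engine_class_name: str) -> str:
    # str.replace here is an assumption about the mechanism, not mlrun's code.
    return HANDLER_TEMPLATE.replace("{{{engine}}}", engine_class_name)

print(render_handler("DaskFeatureMerger"))
print(render_handler("LocalFeatureMerger"))
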

mlrun/feature_store/retrieval/local_merger.py

@@ -16,6 +16,7 @@ import re

 import pandas as pd

+from ..feature_vector import OfflineVectorResponse
 from .base import BaseMerger

@@ -25,6 +26,133 @@ class LocalFeatureMerger(BaseMerger):
     def __init__(self, vector, **engine_args):
         super().__init__(vector, **engine_args)

+    def _generate_vector(
+        self,
+        entity_rows,
+        entity_timestamp_column,
+        feature_set_objects,
+        feature_set_fields,
+        start_time=None,
+        end_time=None,
+        query=None,
+    ):
+
+        feature_sets = []
+        dfs = []
+        keys = (
+            []
+        )  # the struct of key is [[[],[]], ..] So that each record indicates which way the corresponding
+        # featureset is connected to the previous one, and within each record the left keys are indicated in index 0
+        # and the right keys in index 1, this keys will be the keys that will be used in this join
+        all_columns = []
+
+        fs_link_list = self._create_linked_relation_list(
+            feature_set_objects, feature_set_fields
+        )
+
+        for node in fs_link_list:
+            name = node.name
+            feature_set = feature_set_objects[name]
+            feature_sets.append(feature_set)
+            columns = feature_set_fields[name]
+            column_names = [name for name, alias in columns]
+
+            for col in node.data["save_cols"]:
+                if col not in column_names:
+                    self._append_drop_column(col)
+            column_names += node.data["save_cols"]
+
+            # handling case where there are multiple feature sets and user creates vector where entity_timestamp_
+            # column is from a specific feature set (can't be entity timestamp)
+            if (
+                entity_timestamp_column in column_names
+                or feature_set.spec.timestamp_key == entity_timestamp_column
+            ):
+                df = feature_set.to_dataframe(
+                    columns=column_names,
+                    start_time=start_time,
+                    end_time=end_time,
+                    time_column=entity_timestamp_column,
+                )
+            else:
+                df = feature_set.to_dataframe(
+                    columns=column_names,
+                    time_column=entity_timestamp_column,
+                )
+            if df.index.names[0]:
+                df.reset_index(inplace=True)
+            column_names += node.data["save_index"]
+            node.data["save_cols"] += node.data["save_index"]
+            entity_timestamp_column_list = (
+                [entity_timestamp_column]
+                if entity_timestamp_column
+                else feature_set.spec.timestamp_key
+            )
+            if entity_timestamp_column_list:
+                column_names += entity_timestamp_column_list
+                node.data["save_cols"] += entity_timestamp_column_list
+            # rename columns to be unique for each feature set
+            rename_col_dict = {
+                col: f"{col}_{name}"
+                for col in column_names
+                if col not in node.data["save_cols"]
+            }
+            df.rename(
+                columns=rename_col_dict,
+                inplace=True,
+            )
+
+            dfs.append(df)
+            keys.append([node.data["left_keys"], node.data["right_keys"]])
+
+            # update alias according to the unique column name
+            new_columns = []
+            for col, alias in columns:
+                if col in rename_col_dict and alias:
+                    new_columns.append((rename_col_dict[col], alias))
+                elif col in rename_col_dict and not alias:
+                    new_columns.append((rename_col_dict[col], col))
+                else:
+                    new_columns.append((col, alias))
+            all_columns.append(new_columns)
+            self._update_alias(
+                dictionary={name: alias for name, alias in new_columns if alias}
+            )
+
+        self.merge(
+            entity_df=entity_rows,
+            entity_timestamp_column=entity_timestamp_column,
+            featuresets=feature_sets,
+            featureset_dfs=dfs,
+            keys=keys,
+            all_columns=all_columns,
+        )
+
+        self._result_df.drop(columns=self._drop_columns, inplace=True, errors="ignore")
+
+        # renaming all columns according to self._alias
+        self._result_df.rename(
+            columns=self._alias,
+            inplace=True,
+        )
+        if self.vector.status.label_column:
+            self._result_df.dropna(
+                subset=[self.vector.status.label_column],
+                inplace=True,
+            )
+        # filter joined data frame by the query param
+        if query:
+            self._result_df.query(query, inplace=True)
+
+        if self._drop_indexes:
+            self._result_df.reset_index(drop=True, inplace=True)
+        else:
+            self._set_indexes(self._result_df)

+        self._write_to_target()
+
+        return OfflineVectorResponse(self)
+
     def _asof_join(
         self,
         entity_df,
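
The tail of `_generate_vector` (drop rows with a missing label, then apply the optional `query` filter) maps onto two standard pandas calls. A self-contained sketch with hypothetical column names:

import pandas as pd

result_df = pd.DataFrame(
    {"feature_a": [1, 2, 3, 4],
     "feature_b": [10.0, 20.0, 30.0, 40.0],
     "label": [0.0, None, 1.0, 0.0]}
)

label_column = "label"
query = "feature_b >= 20"

# Drop rows where the label is missing (mirrors dropna(subset=[label_column])).
result_df = result_df.dropna(subset=[label_column])

# Apply the user-supplied filter expression (mirrors DataFrame.query(query)).
if query:
    result_df = result_df.query(query)

print(result_df)
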
@@ -33,6 +161,7 @@ class LocalFeatureMerger(BaseMerger):
         featureset_df,
         left_keys: list,
         right_keys: list,
+        columns: list,
     ):

         indexes = None
@@ -84,6 +213,7 @@ class LocalFeatureMerger(BaseMerger):
         featureset_df,
         left_keys: list,
         right_keys: list,
+        columns: list,
     ):
         fs_name = featureset.metadata.name
         merged_df = pd.merge(
@@ -98,51 +228,3 @@ class LocalFeatureMerger(BaseMerger):
             if re.findall(f"_{fs_name}_$", col):
                 self._append_drop_column(col)
         return merged_df
-
-    def _create_engine_env(self):
-        pass
-
-    def _get_engine_df(
-        self,
-        feature_set,
-        feature_set_name,
-        column_names=None,
-        start_time=None,
-        end_time=None,
-        entity_timestamp_column=None,
-    ):
-        # handling case where there are multiple feature sets and user creates vector where entity_timestamp_
-        # column is from a specific feature set (can't be entity timestamp)
-        if (
-            entity_timestamp_column in column_names
-            or feature_set.spec.timestamp_key == entity_timestamp_column
-        ):
-            df = feature_set.to_dataframe(
-                columns=column_names,
-                start_time=start_time,
-                end_time=end_time,
-                time_column=entity_timestamp_column,
-            )
-        else:
-            df = feature_set.to_dataframe(
-                columns=column_names,
-                time_column=entity_timestamp_column,
-            )
-        if df.index.names[0]:
-            df.reset_index(inplace=True)
-        return df
-
-    def _rename_columns_and_select(self, df, rename_col_dict, columns=None):
-        df.rename(
-            columns=rename_col_dict,
-            inplace=True,
-        )
-
-    def _drop_columns_from_result(self):
-        self._result_df.drop(columns=self._drop_columns, inplace=True, errors="ignore")
-
-    def _filter(self, query):
-        self._result_df.query(query, inplace=True)
-
-    def _order_by(self, order_by_active):
-        self._result_df.sort_values(by=order_by_active, ignore_index=True, inplace=True)