mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +3 -41
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/frontend_spec.py +2 -1
- mlrun/api/api/endpoints/functions.py +95 -59
- mlrun/api/api/endpoints/grafana_proxy.py +9 -9
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/model_endpoints.py +3 -2
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/submit.py +2 -1
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +3 -4
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +6 -2
- mlrun/api/crud/feature_store.py +5 -0
- mlrun/api/crud/model_monitoring/__init__.py +1 -0
- mlrun/api/crud/model_monitoring/deployment.py +497 -0
- mlrun/api/crud/model_monitoring/grafana.py +96 -42
- mlrun/api/crud/model_monitoring/helpers.py +159 -0
- mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +6 -11
- mlrun/api/crud/projects.py +2 -2
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/init_db.py +2 -4
- mlrun/api/db/session.py +1 -1
- mlrun/api/db/sqldb/db.py +129 -31
- mlrun/api/db/sqldb/models/models_mysql.py +15 -1
- mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
- mlrun/api/launcher.py +38 -6
- mlrun/api/main.py +3 -2
- mlrun/api/rundb/__init__.py +13 -0
- mlrun/{db → api/rundb}/sqldb.py +36 -84
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/schemas/__init__.py +17 -6
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/clients/iguazio.py +33 -33
- mlrun/api/utils/clients/nuclio.py +2 -2
- mlrun/api/utils/periodic.py +9 -2
- mlrun/api/utils/projects/follower.py +14 -7
- mlrun/api/utils/projects/leader.py +2 -1
- mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
- mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
- mlrun/api/utils/runtimes/__init__.py +14 -0
- mlrun/api/utils/runtimes/nuclio.py +43 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +5 -1
- mlrun/api/utils/singletons/project_member.py +4 -1
- mlrun/api/utils/singletons/scheduler.py +1 -1
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +4 -4
- mlrun/artifacts/manager.py +2 -3
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/common/db/__init__.py +14 -0
- mlrun/common/helpers.py +37 -0
- mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
- mlrun/common/model_monitoring/helpers.py +69 -0
- mlrun/common/schemas/__init__.py +13 -1
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/function.py +17 -0
- mlrun/common/schemas/model_monitoring/__init__.py +48 -0
- mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
- mlrun/common/schemas/model_monitoring/grafana.py +55 -0
- mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
- mlrun/common/schemas/notification.py +1 -0
- mlrun/common/schemas/object.py +4 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/regex.py +1 -1
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +45 -42
- mlrun/datastore/__init__.py +21 -0
- mlrun/datastore/base.py +1 -1
- mlrun/datastore/datastore.py +9 -0
- mlrun/datastore/dbfs_store.py +168 -0
- mlrun/datastore/helpers.py +18 -0
- mlrun/datastore/sources.py +1 -0
- mlrun/datastore/store_resources.py +2 -5
- mlrun/datastore/v3io.py +1 -2
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -20
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/common.py +2 -1
- mlrun/feature_store/feature_set.py +1 -11
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +22 -16
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +57 -53
- mlrun/launcher/client.py +5 -4
- mlrun/launcher/factory.py +24 -13
- mlrun/launcher/local.py +6 -6
- mlrun/launcher/remote.py +4 -4
- mlrun/lists.py +0 -11
- mlrun/model.py +11 -17
- mlrun/model_monitoring/__init__.py +2 -22
- mlrun/model_monitoring/features_drift_table.py +1 -1
- mlrun/model_monitoring/helpers.py +22 -210
- mlrun/model_monitoring/model_endpoint.py +1 -1
- mlrun/model_monitoring/model_monitoring_batch.py +127 -50
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +16 -11
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
- mlrun/model_monitoring/stores/models/mysql.py +47 -29
- mlrun/model_monitoring/stores/models/sqlite.py +47 -29
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
- mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
- mlrun/model_monitoring/tracking_policy.py +104 -0
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/package/packagers/numpy_packagers.py +1 -1
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +53 -159
- mlrun/projects/project.py +10 -37
- mlrun/render.py +1 -1
- mlrun/run.py +8 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +29 -1249
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/funcdoc.py +0 -9
- mlrun/runtimes/function.py +25 -29
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +28 -18
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +14 -6
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/runtimes/utils.py +0 -26
- mlrun/serving/routers.py +7 -7
- mlrun/serving/server.py +11 -8
- mlrun/serving/states.py +7 -1
- mlrun/serving/v2_serving.py +6 -6
- mlrun/utils/helpers.py +23 -42
- mlrun/utils/notifications/notification/__init__.py +4 -0
- mlrun/utils/notifications/notification/webhook.py +61 -0
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- mlrun/utils/model_monitoring.py +0 -249
- /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
--- a/mlrun/feature_store/retrieval/base.py
+++ b/mlrun/feature_store/retrieval/base.py
@@ -16,13 +16,12 @@ import abc
 import typing
 from datetime import datetime
 
-import dask.dataframe as dd
 import pandas as pd
 
 import mlrun
 from mlrun.datastore.targets import CSVTarget, ParquetTarget
 from mlrun.feature_store.feature_set import FeatureSet
-from mlrun.feature_store.feature_vector import Feature
+from mlrun.feature_store.feature_vector import Feature, JoinGraph
 
 from ...utils import logger, str_to_timestamp
 from ..feature_vector import OfflineVectorResponse
@@ -42,6 +41,7 @@ class BaseMerger(abc.ABC):
     def __init__(self, vector, **engine_args):
         self._relation = dict()
         self._join_type = "inner"
+        self._default_join_type = "default_join"
         self.vector = vector
 
         self._result_df = None
@@ -196,21 +196,34 @@ class BaseMerger(abc.ABC):
         )  # the struct of key is [[[],[]], ..] So that each record indicates which way the corresponding
         # featureset is connected to the previous one, and within each record the left keys are indicated in index 0
         # and the right keys in index 1, this keys will be the keys that will be used in this join
+        join_types = []
 
-        …
+        entity_rows_keys = (
+            list(entity_rows.columns) if entity_rows is not None else None
         )
+        join_graph = self._get_graph(
+            feature_set_objects, feature_set_fields, entity_rows_keys
+        )
+        if entity_rows_keys:
+            entity_rows = self._convert_entity_rows_to_engine_df(entity_rows)
+            dfs.append(entity_rows)
+            keys.append([[], []])
+            feature_sets.append(None)
+            join_types.append(None)
 
         filtered = False
-        for …
-            name = …
+        for step in join_graph.steps:
+            name = step.right_feature_set_name
             feature_set = feature_set_objects[name]
+            saved_columns_for_relation = list(
+                self.vector.get_feature_set_relations(feature_set).keys()
+            )
             feature_sets.append(feature_set)
             columns = feature_set_fields[name]
             self._origin_alias.update({name: alias for name, alias in columns})
             column_names = [name for name, _ in columns]
 
-            for column in …
+            for column in saved_columns_for_relation:
                 if column not in column_names:
                     column_names.append(column)
                     if column not in self._index_columns:
@@ -247,19 +260,19 @@ class BaseMerger(abc.ABC):
                 time_column,
             )
 
-            column_names += node.data["save_index"]
-            node.data["save_cols"] += node.data["save_index"]
             fs_entities_and_timestamp = list(feature_set.spec.entities.keys())
+            column_names += fs_entities_and_timestamp
+            saved_columns_for_relation += fs_entities_and_timestamp
             if feature_set.spec.timestamp_key:
                 column_names.append(feature_set.spec.timestamp_key)
-                …
+                saved_columns_for_relation.append(feature_set.spec.timestamp_key)
                 fs_entities_and_timestamp.append(feature_set.spec.timestamp_key)
 
             # rename columns to be unique for each feature set and select if needed
             rename_col_dict = {
                 column: f"{column}_{name}"
                 for column in column_names
-                if column not in …
+                if column not in saved_columns_for_relation
             }
             df_temp = self._rename_columns_and_select(
                 df,
@@ -274,7 +287,8 @@ class BaseMerger(abc.ABC):
             dfs.append(df)
             del df
 
-            keys.append([…
+            keys.append([step.left_keys, step.right_keys])
+            join_types.append([step.join_type, step.asof_join])
 
         # update alias according to the unique column name
         new_columns = []
@@ -293,31 +307,13 @@ class BaseMerger(abc.ABC):
                 "start_time and end_time can only be provided in conjunction with "
                 "a timestamp column, or when the at least one feature_set has a timestamp key"
             )
-        # convert pandas entity_rows to spark\dask DF if needed
-        if (
-            entity_rows is not None
-            and not hasattr(entity_rows, "rdd")
-            and self.engine == "spark"
-        ):
-            entity_rows = self.spark.createDataFrame(entity_rows)
-        elif (
-            entity_rows is not None
-            and not hasattr(entity_rows, "dask")
-            and self.engine == "dask"
-        ):
-            entity_rows = dd.from_pandas(
-                entity_rows, npartitions=len(entity_rows.columns)
-            )
-
         # join the feature data frames
         result_timestamp = self.merge(
-            …
-            entity_timestamp_column=entity_timestamp_column
-            if entity_rows is not None
-            else None,
+            entity_timestamp_column=entity_timestamp_column,
             featuresets=feature_sets,
             featureset_dfs=dfs,
             keys=keys,
+            join_types=join_types,
         )
 
         all_columns = None
@@ -386,41 +382,46 @@ class BaseMerger(abc.ABC):
 
     def merge(
         self,
-        entity_df,
         entity_timestamp_column: str,
         featuresets: list,
         featureset_dfs: list,
         keys: list = None,
+        join_types: list = None,
     ):
         """join the entities and feature set features into a result dataframe"""
-        …
-        for featureset, featureset_df, lr_key in zip(featuresets, featureset_dfs, keys):
-            if featureset.spec.timestamp_key and entity_timestamp_column:
+
+        merged_df = featureset_dfs.pop(0)
+        featureset = featuresets.pop(0)
+        keys.pop(0)
+        join_types.pop(0)
+
+        if not entity_timestamp_column and featureset:
+            entity_timestamp_column = featureset.spec.timestamp_key
+
+        for featureset, featureset_df, lr_key, join_type in zip(
+            featuresets, featureset_dfs, keys, join_types
+        ):
+            join_type, as_of = join_type
+            if (
+                featureset.spec.timestamp_key
+                and entity_timestamp_column
+                and join_type == self._default_join_type
+            ):
                 merge_func = self._asof_join
-            …
+            elif join_type == self._default_join_type:
+                merge_func = self._join
+            elif join_type != self._default_join_type and not as_of:
+                self._join_type = join_type
                 merge_func = self._join
+            else:
+                self._join_type = join_type
+                merge_func = self._asof_join
 
             merged_df = merge_func(
                 merged_df,
                 entity_timestamp_column,
-                featureset,
+                featureset.metadata.name,
+                featureset.spec.timestamp_key,
                 featureset_df,
                 lr_key[0],
                 lr_key[1],
@@ -441,7 +442,8 @@ class BaseMerger(abc.ABC):
         self,
         entity_df,
         entity_timestamp_column: str,
-        …
+        featureset_name: str,
+        featureset_timstamp: str,
         featureset_df,
         left_keys: list,
         right_keys: list,
@@ -452,7 +454,8 @@ class BaseMerger(abc.ABC):
         self,
         entity_df,
         entity_timestamp_column: str,
-        …
+        featureset_name: str,
+        featureset_timestamp: str,
         featureset_df,
         left_keys: list,
         right_keys: list,
@@ -480,10 +483,42 @@ class BaseMerger(abc.ABC):
         size = CSVTarget(path=target_path).write_dataframe(self._result_df, **kw)
         return size
 
+    def _get_graph(
+        self, feature_set_objects, feature_set_fields, entity_rows_keys=None
+    ):
+        join_graph = self.vector.spec.join_graph
+        if not join_graph:
+            fs_link_list = self._create_linked_relation_list(
+                feature_set_objects, feature_set_fields, entity_rows_keys
+            )
+            join_graph = None
+            for i, node in enumerate(fs_link_list):
+                if node.name != self._entity_rows_node_name and join_graph is None:
+                    join_graph = JoinGraph(first_feature_set=node.name)
+                elif node.name == self._entity_rows_node_name:
+                    continue
+                else:
+                    join_graph.inner(other_operand=node.name)
+
+                last_step = join_graph.steps[-1]
+                last_step.join_type = self._default_join_type
+                last_step.left_keys = node.left_keys
+                last_step.right_keys = node.right_keys
+        else:
+            join_graph._init_all_join_keys(feature_set_objects, self.vector)
+        return join_graph
+
     class _Node:
-        def __init__(…
+        def __init__(
+            self,
+            name: str,
+            order: int,
+            left_keys: typing.List[str] = None,
+            right_keys: typing.List[str] = None,
+        ):
             self.name = name
-            self.…
+            self.left_keys = left_keys if left_keys is not None else []
+            self.right_keys = right_keys if right_keys is not None else []
             # order of this feature_set in the original list
             self.order = order
             self.next = None
@@ -495,7 +530,9 @@ class BaseMerger(abc.ABC):
             return self.name == other.name
 
         def __copy__(self):
-            return BaseMerger._Node(…
+            return BaseMerger._Node(
+                self.name, self.order, self.left_keys, self.right_keys
+            )
 
     class _LinkedList:
         def __init__(self, head=None):
@@ -565,9 +602,6 @@ class BaseMerger(abc.ABC):
             node = self.find_node(other_head.name)
             if node is None:
                 return
-            for col in other_head.data["save_cols"]:
-                if col not in node.data["save_cols"]:
-                    node.data["save_cols"].append(col)
             for other_node in other_iter:
                 if self.find_node(other_node.name) is None:
                     while node is not None and other_node.order > node.order:
@@ -587,24 +621,24 @@ class BaseMerger(abc.ABC):
             head=BaseMerger._Node(
                 name=feature_set_names[0],
                 order=0,
-                data={
-                    "left_keys": [],
-                    "right_keys": [],
-                    "save_cols": [],
-                    "save_index": [],
-                },
             )
         )
         relation_linked_lists = []
         feature_set_entity_list_dict = {
             name: feature_set_objects[name].spec.entities for name in feature_set_names
         }
-        …
-            name: list(…
+        relation_val_list = {
+            name: list(
+                self.vector.get_feature_set_relations(
+                    feature_set_objects[name]
+                ).values()
+            )
             for name in feature_set_names
         }
-        …
-            name: list(…
+        relation_key_list = {
+            name: list(
+                self.vector.get_feature_set_relations(feature_set_objects[name]).keys()
+            )
             for name in feature_set_names
         }
@@ -612,12 +646,6 @@ class BaseMerger(abc.ABC):
             relations = BaseMerger._LinkedList()
             main_node = BaseMerger._Node(
                 name,
-                data={
-                    "left_keys": [],
-                    "right_keys": [],
-                    "save_cols": [],
-                    "save_index": [],
-                },
                 order=order,
             )
             relations.add_first(main_node)
@@ -629,8 +657,8 @@ class BaseMerger(abc.ABC):
             name_head = linked_list_relation.head.name
             feature_set_in_entity_list = feature_set_entity_list_dict[fs_name_in]
             feature_set_in_entity_list_names = list(feature_set_in_entity_list.keys())
-            entity_relation_list = …
-            col_relation_list = …
+            entity_relation_list = relation_val_list[name_head]
+            col_relation_list = relation_key_list[name_head]
             curr_col_relation_list = list(
                 map(
                     lambda ent: (
@@ -649,18 +677,11 @@ class BaseMerger(abc.ABC):
                 linked_list_relation.add_last(
                     BaseMerger._Node(
                         fs_name_in,
-                        …
-                            "right_keys": feature_set_in_entity_list_names,
-                            "save_cols": [],
-                            "save_index": [],
-                        },
+                        left_keys=curr_col_relation_list,
+                        right_keys=feature_set_in_entity_list_names,
                         order=name_in_order,
                     )
                 )
-                linked_list_relation.head.data["save_cols"].extend(
-                    curr_col_relation_list
-                )
             elif name_in_order > head_order and sorted(
                 feature_set_in_entity_list_names
             ) == sorted(feature_set_entity_list_dict[name_head].keys()):
@@ -669,16 +690,11 @@ class BaseMerger(abc.ABC):
                 linked_list_relation.add_last(
                     BaseMerger._Node(
                         fs_name_in,
-                        …
-                            "right_keys": keys,
-                            "save_cols": [],
-                            "save_index": keys,
-                        },
+                        left_keys=keys,
+                        right_keys=keys,
                         order=name_in_order,
                     )
                 )
-                linked_list_relation.head.data["save_index"] = keys
             return linked_list_relation
 
         def _build_entity_rows_relation(entity_rows_relation, fs_name, fs_order):
@@ -692,16 +708,11 @@ class BaseMerger(abc.ABC):
             entity_rows_relation.add_last(
                 BaseMerger._Node(
                     fs_name,
-                    …
-                        "right_keys": keys,
-                        "save_cols": [],
-                        "save_index": keys,
-                    },
+                    left_keys=keys,
+                    right_keys=keys,
                     order=fs_order,
                 )
             )
-            entity_rows_relation.head.data["save_index"] = keys
 
         if entity_rows_keys is not None:
             entity_rows_linked_relation = _create_relation(
@@ -805,3 +816,6 @@ class BaseMerger(abc.ABC):
         :param order_by_active: list of names to sort by.
         """
         raise NotImplementedError
+
+    def _convert_entity_rows_to_engine_df(self, entity_rows):
+        raise NotImplementedError
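The net effect of the base.py changes above: the join plan is no longer carried in per-node `data` dicts on a linked list but in an explicit `JoinGraph`, and `merge()` now dispatches per step between `_join` and `_asof_join` using the `"default_join"` sentinel. A minimal sketch of the new objects, using only names that appear in this diff (the feature set names are invented for illustration):

from mlrun.feature_store.feature_vector import JoinGraph

# What _get_graph() synthesizes when vector.spec.join_graph is empty:
# start from the first feature set, then inner-join each remaining one.
graph = JoinGraph(first_feature_set="transactions")
graph.inner(other_operand="user_events")

# merge() consumes exactly these per-step fields to pick a merge function:
for step in graph.steps:
    print(
        step.right_feature_set_name,  # feature set joined in at this step
        step.left_keys,  # join keys on the accumulated frame
        step.right_keys,  # join keys on the incoming frame
        step.join_type,  # explicit type, or the "default_join" sentinel
        step.asof_join,  # whether an explicit type should still merge as-of
    )

When the sentinel is left in place, the old heuristic survives: timestamped feature sets get an as-of join and everything else a plain join; an explicit `join_type` from a user-supplied graph instead overrides `self._join_type`.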
--- a/mlrun/feature_store/retrieval/dask_merger.py
+++ b/mlrun/feature_store/retrieval/dask_merger.py
@@ -41,8 +41,9 @@ class DaskFeatureMerger(BaseMerger):
         self,
         entity_df,
         entity_timestamp_column: str,
-        …
+        featureset_name: str,
+        featureset_timestamp: str,
+        featureset_df: list,
         left_keys: list,
         right_keys: list,
     ):
@@ -53,20 +54,20 @@ class DaskFeatureMerger(BaseMerger):
             sort_partition, timestamp=entity_timestamp_column
         )
         featureset_df = featureset_df.map_partitions(
-            sort_partition, timestamp=…
+            sort_partition, timestamp=featureset_timestamp
         )
 
         merged_df = merge_asof(
             entity_df,
             featureset_df,
             left_on=entity_timestamp_column,
-            right_on=…
+            right_on=featureset_timestamp,
             left_by=left_keys or None,
             right_by=right_keys or None,
-            suffixes=("", f"_{…
+            suffixes=("", f"_{featureset_name}_"),
         )
         for col in merged_df.columns:
-            if re.findall(f"_{…
+            if re.findall(f"_{featureset_name}_$", col):
                 self._append_drop_column(col)
 
         return merged_df
@@ -75,23 +76,23 @@ class DaskFeatureMerger(BaseMerger):
         self,
         entity_df,
         entity_timestamp_column: str,
-        …
+        featureset_name,
+        featureset_timestamp,
         featureset_df,
         left_keys: list,
         right_keys: list,
     ):
 
-        fs_name = featureset.metadata.name
         merged_df = merge(
             entity_df,
             featureset_df,
             how=self._join_type,
             left_on=left_keys,
             right_on=right_keys,
-            suffixes=("", f"_{…
+            suffixes=("", f"_{featureset_name}_"),
         )
         for col in merged_df.columns:
-            if re.findall(f"_{…
+            if re.findall(f"_{featureset_name}_$", col):
                 self._append_drop_column(col)
         return merged_df
 
@@ -155,3 +156,9 @@ class DaskFeatureMerger(BaseMerger):
 
     def _order_by(self, order_by_active):
         self._result_df.sort_values(by=order_by_active)
+
+    def _convert_entity_rows_to_engine_df(self, entity_rows):
+        if entity_rows is not None and not hasattr(entity_rows, "dask"):
+            return dd.from_pandas(entity_rows, npartitions=len(entity_rows.columns))
+
+        return entity_rows
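The spark/dask special-casing deleted from base.py (hunk `@@ -293,31 +307,13 @@`) resurfaces here as a per-engine hook, `_convert_entity_rows_to_engine_df`. A standalone sketch of what the Dask override does, runnable without mlrun (the function name is a stand-in for the method):

import dask.dataframe as dd
import pandas as pd

def convert_entity_rows_to_dask(entity_rows):
    # Mirrors DaskFeatureMerger._convert_entity_rows_to_engine_df: plain pandas
    # frames get partitioned; anything already exposing a dask task graph
    # (the `.dask` attribute) passes through untouched.
    if entity_rows is not None and not hasattr(entity_rows, "dask"):
        return dd.from_pandas(entity_rows, npartitions=len(entity_rows.columns))
    return entity_rows

pdf = pd.DataFrame({"user": [1, 2], "ts": pd.to_datetime(["2023-01-01", "2023-01-02"])})
ddf = convert_entity_rows_to_dask(pdf)  # becomes a dask DataFrame
assert convert_entity_rows_to_dask(ddf) is ddf  # dask input is returned as-is

Note that base.py now calls the hook only when `entity_rows` was actually supplied, so engines that need no conversion (the local merger below) can simply return the frame.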
--- a/mlrun/feature_store/retrieval/job.py
+++ b/mlrun/feature_store/retrieval/job.py
@@ -62,9 +62,12 @@ def run_merge_job(
     function = run_config.to_function(kind, merger.get_default_image(kind))
 
     # Avoid overriding a handler that was provided by the user
-    # The user shouldn't have to provide a handler, but we leave this option open just in case
     if not run_config.handler:
         function.with_code(body=default_code)
+    else:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            "get_offline_features does not support run_config with a handler"
+        )
 
     function.metadata.project = vector.metadata.project
     function.metadata.name = function.metadata.name or name
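This one is a behavioral tightening rather than a refactor: a `run_config` that names a handler used to silently skip the injection of the default merge code, and now it fails fast. Roughly, a call like the following would raise after this change (hypothetical project and vector URI; `RunConfig` here is assumed to be the feature store run configuration class mlrun exposes):

import mlrun
from mlrun.feature_store import RunConfig, get_offline_features

run_config = RunConfig(handler="my_handler")  # any user handler is now rejected
try:
    get_offline_features("store://feature-vectors/proj/my-vector", run_config=run_config)
except mlrun.errors.MLRunInvalidArgumentError as err:
    print(err)  # get_offline_features does not support run_config with a handler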
--- a/mlrun/feature_store/retrieval/local_merger.py
+++ b/mlrun/feature_store/retrieval/local_merger.py
@@ -30,45 +30,42 @@ class LocalFeatureMerger(BaseMerger):
         self,
         entity_df,
         entity_timestamp_column: str,
-        …
+        featureset_name,
+        featureset_timstamp,
+        featureset_df: list,
         left_keys: list,
         right_keys: list,
     ):
 
-        indexes = None
-        if not right_keys:
-            indexes = list(featureset.spec.entities.keys())
         index_col_not_in_entity = "index" not in entity_df.columns
         index_col_not_in_featureset = "index" not in featureset_df.columns
         entity_df[entity_timestamp_column] = pd.to_datetime(
             entity_df[entity_timestamp_column]
         )
-        featureset_df[…
-            featureset_df[…
+        featureset_df[featureset_timstamp] = pd.to_datetime(
+            featureset_df[featureset_timstamp]
         )
         entity_df.sort_values(by=entity_timestamp_column, inplace=True)
-        featureset_df.sort_values(by=…
+        featureset_df.sort_values(by=featureset_timstamp, inplace=True)
 
         merged_df = pd.merge_asof(
             entity_df,
             featureset_df,
             left_on=entity_timestamp_column,
-            right_on=…
-            by=indexes,
+            right_on=featureset_timstamp,
             left_by=left_keys or None,
             right_by=right_keys or None,
-            suffixes=("", f"_{…
+            suffixes=("", f"_{featureset_name}_"),
         )
         for col in merged_df.columns:
-            if re.findall(f"_{…
+            if re.findall(f"_{featureset_name}_$", col):
                 self._append_drop_column(col)
         # Undo indexing tricks for asof merge
         # to return the correct indexes and not
         # overload `index` columns
         if (
-            …
-            and "index" not in …
+            "index" not in left_keys
+            and "index" not in right_keys
             and index_col_not_in_entity
             and index_col_not_in_featureset
             and "index" in merged_df.columns
@@ -80,22 +77,22 @@ class LocalFeatureMerger(BaseMerger):
         self,
         entity_df,
         entity_timestamp_column: str,
-        …
+        featureset_name,
+        featureset_timestamp,
         featureset_df,
         left_keys: list,
         right_keys: list,
     ):
-        fs_name = featureset.metadata.name
         merged_df = pd.merge(
             entity_df,
             featureset_df,
             how=self._join_type,
             left_on=left_keys,
             right_on=right_keys,
-            suffixes=("", f"_{…
+            suffixes=("", f"_{featureset_name}_"),
         )
         for col in merged_df.columns:
-            if re.findall(f"_{…
+            if re.findall(f"_{featureset_name}_$", col):
                 self._append_drop_column(col)
         return merged_df
 
@@ -135,3 +132,6 @@ class LocalFeatureMerger(BaseMerger):
 
     def _order_by(self, order_by_active):
         self._result_df.sort_values(by=order_by_active, ignore_index=True, inplace=True)
+
+    def _convert_entity_rows_to_engine_df(self, entity_rows):
+        return entity_rows
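One pattern is shared by all three mergers above: duplicated right-hand columns are suffixed with `_<featureset_name>_`, and every column matching that suffix is queued for removal via `_append_drop_column`, so the accumulated frame's copy of a duplicated column always wins. A self-contained pandas illustration of the trick (frame contents invented):

import re

import pandas as pd

left = pd.DataFrame({"user": [1, 2], "amount": [10.0, 20.0]})
right = pd.DataFrame({"user": [1, 2], "amount": [99.0, 98.0], "score": [0.1, 0.9]})

name = "stats"  # stand-in for featureset_name
merged = pd.merge(left, right, how="inner", on="user", suffixes=("", f"_{name}_"))

# Same regex the mergers use: only the right frame's duplicate ("amount_stats_")
# matches; the left frame's bare "amount" survives.
to_drop = [col for col in merged.columns if re.findall(f"_{name}_$", col)]
print(merged.drop(columns=to_drop))  # columns: user, amount, score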