mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +3 -41
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/frontend_spec.py +2 -1
- mlrun/api/api/endpoints/functions.py +95 -59
- mlrun/api/api/endpoints/grafana_proxy.py +9 -9
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/model_endpoints.py +3 -2
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/submit.py +2 -1
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +3 -4
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +6 -2
- mlrun/api/crud/feature_store.py +5 -0
- mlrun/api/crud/model_monitoring/__init__.py +1 -0
- mlrun/api/crud/model_monitoring/deployment.py +497 -0
- mlrun/api/crud/model_monitoring/grafana.py +96 -42
- mlrun/api/crud/model_monitoring/helpers.py +159 -0
- mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +6 -11
- mlrun/api/crud/projects.py +2 -2
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/init_db.py +2 -4
- mlrun/api/db/session.py +1 -1
- mlrun/api/db/sqldb/db.py +129 -31
- mlrun/api/db/sqldb/models/models_mysql.py +15 -1
- mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
- mlrun/api/launcher.py +38 -6
- mlrun/api/main.py +3 -2
- mlrun/api/rundb/__init__.py +13 -0
- mlrun/{db → api/rundb}/sqldb.py +36 -84
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/schemas/__init__.py +17 -6
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/clients/iguazio.py +33 -33
- mlrun/api/utils/clients/nuclio.py +2 -2
- mlrun/api/utils/periodic.py +9 -2
- mlrun/api/utils/projects/follower.py +14 -7
- mlrun/api/utils/projects/leader.py +2 -1
- mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
- mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
- mlrun/api/utils/runtimes/__init__.py +14 -0
- mlrun/api/utils/runtimes/nuclio.py +43 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +5 -1
- mlrun/api/utils/singletons/project_member.py +4 -1
- mlrun/api/utils/singletons/scheduler.py +1 -1
- mlrun/artifacts/base.py +6 -6
- mlrun/artifacts/dataset.py +4 -4
- mlrun/artifacts/manager.py +2 -3
- mlrun/artifacts/model.py +2 -2
- mlrun/artifacts/plots.py +8 -8
- mlrun/common/db/__init__.py +14 -0
- mlrun/common/helpers.py +37 -0
- mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
- mlrun/common/model_monitoring/helpers.py +69 -0
- mlrun/common/schemas/__init__.py +13 -1
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/function.py +17 -0
- mlrun/common/schemas/model_monitoring/__init__.py +48 -0
- mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
- mlrun/common/schemas/model_monitoring/grafana.py +55 -0
- mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
- mlrun/common/schemas/notification.py +1 -0
- mlrun/common/schemas/object.py +4 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/regex.py +1 -1
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +45 -42
- mlrun/datastore/__init__.py +21 -0
- mlrun/datastore/base.py +1 -1
- mlrun/datastore/datastore.py +9 -0
- mlrun/datastore/dbfs_store.py +168 -0
- mlrun/datastore/helpers.py +18 -0
- mlrun/datastore/sources.py +1 -0
- mlrun/datastore/store_resources.py +2 -5
- mlrun/datastore/v3io.py +1 -2
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -20
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/common.py +2 -1
- mlrun/feature_store/feature_set.py +1 -11
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +22 -16
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +57 -53
- mlrun/launcher/client.py +5 -4
- mlrun/launcher/factory.py +24 -13
- mlrun/launcher/local.py +6 -6
- mlrun/launcher/remote.py +4 -4
- mlrun/lists.py +0 -11
- mlrun/model.py +11 -17
- mlrun/model_monitoring/__init__.py +2 -22
- mlrun/model_monitoring/features_drift_table.py +1 -1
- mlrun/model_monitoring/helpers.py +22 -210
- mlrun/model_monitoring/model_endpoint.py +1 -1
- mlrun/model_monitoring/model_monitoring_batch.py +127 -50
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +16 -11
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
- mlrun/model_monitoring/stores/models/mysql.py +47 -29
- mlrun/model_monitoring/stores/models/sqlite.py +47 -29
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
- mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
- mlrun/model_monitoring/tracking_policy.py +104 -0
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/package/packagers/numpy_packagers.py +1 -1
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +53 -159
- mlrun/projects/project.py +10 -37
- mlrun/render.py +1 -1
- mlrun/run.py +8 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +29 -1249
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/funcdoc.py +0 -9
- mlrun/runtimes/function.py +25 -29
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +28 -18
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +14 -6
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/runtimes/utils.py +0 -26
- mlrun/serving/routers.py +7 -7
- mlrun/serving/server.py +11 -8
- mlrun/serving/states.py +7 -1
- mlrun/serving/v2_serving.py +6 -6
- mlrun/utils/helpers.py +23 -42
- mlrun/utils/notifications/notification/__init__.py +4 -0
- mlrun/utils/notifications/notification/webhook.py +61 -0
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- mlrun/utils/model_monitoring.py +0 -249
- /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/runtimes/nuclio.py
CHANGED
|
@@ -16,7 +16,8 @@ import json
|
|
|
16
16
|
import os
|
|
17
17
|
import socket
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
import mlrun.db
|
|
20
|
+
|
|
20
21
|
from ..errors import err_to_str
|
|
21
22
|
from ..execution import MLClientCtx
|
|
22
23
|
from ..model import RunTemplate
|
|
@@ -60,7 +61,7 @@ def nuclio_jobs_handler(context, event):
|
|
|
60
61
|
status_code=400,
|
|
61
62
|
)
|
|
62
63
|
|
|
63
|
-
out = get_or_set_dburl()
|
|
64
|
+
out = mlrun.db.get_or_set_dburl()
|
|
64
65
|
if out:
|
|
65
66
|
context.logger.info(f"logging run results to: {out}")
|
|
66
67
|
|
mlrun/runtimes/pod.py
CHANGED
|
@@ -20,7 +20,6 @@ from enum import Enum
|
|
|
20
20
|
import dotenv
|
|
21
21
|
import kfp.dsl
|
|
22
22
|
import kubernetes.client as k8s_client
|
|
23
|
-
from deprecated import deprecated
|
|
24
23
|
|
|
25
24
|
import mlrun.errors
|
|
26
25
|
import mlrun.utils.regex
|
|
@@ -360,15 +359,25 @@ class KubeResourceSpec(FunctionSpec):
|
|
|
360
359
|
patch: bool = False,
|
|
361
360
|
):
|
|
362
361
|
"""
|
|
363
|
-
|
|
364
|
-
|
|
362
|
+
Set pod cpu/memory/gpu limits (max values)
|
|
363
|
+
|
|
364
|
+
:param mem: set limit for memory e.g. '500M', '2G', etc.
|
|
365
|
+
:param cpu: set limit for cpu e.g. '0.5', '2', etc.
|
|
366
|
+
:param gpus: set limit for gpu
|
|
367
|
+
:param gpu_type: set gpu type e.g. "nvidia.com/gpu"
|
|
368
|
+
:param patch: by default it overrides the whole limits section,
|
|
369
|
+
if you wish to patch specific resources use `patch=True`
|
|
365
370
|
"""
|
|
366
371
|
self._verify_and_set_limits("resources", mem, cpu, gpus, gpu_type, patch=patch)
|
|
367
372
|
|
|
368
373
|
def with_requests(self, mem: str = None, cpu: str = None, patch: bool = False):
|
|
369
374
|
"""
|
|
370
|
-
|
|
371
|
-
|
|
375
|
+
Set requested (desired) pod cpu/memory resources
|
|
376
|
+
|
|
377
|
+
:param mem: set request for memory e.g. '200M', '1G', etc.
|
|
378
|
+
:param cpu: set request for cpu e.g. '0.1', '1', etc.
|
|
379
|
+
:param patch: by default it overrides the whole requests section,
|
|
380
|
+
if you wish to patch specific resources use `patch=True`
|
|
372
381
|
"""
|
|
373
382
|
self._verify_and_set_requests("resources", mem, cpu, patch)
|
|
374
383
|
|
|
@@ -1004,15 +1013,6 @@ class KubeResource(BaseRuntime):
|
|
|
1004
1013
|
self.set_env(name, value)
|
|
1005
1014
|
return self
|
|
1006
1015
|
|
|
1007
|
-
# TODO: Remove in 1.5.0
|
|
1008
|
-
@deprecated(
|
|
1009
|
-
version="1.3.0",
|
|
1010
|
-
reason="'Job gpus' will be removed in 1.5.0, use 'with_limits' instead",
|
|
1011
|
-
category=FutureWarning,
|
|
1012
|
-
)
|
|
1013
|
-
def gpus(self, gpus, gpu_type="nvidia.com/gpu"):
|
|
1014
|
-
update_in(self.spec.resources, ["limits", gpu_type], gpus)
|
|
1015
|
-
|
|
1016
1016
|
def set_image_pull_configuration(
|
|
1017
1017
|
self, image_pull_policy: str = None, image_pull_secret_name: str = None
|
|
1018
1018
|
):
|
|
@@ -1041,15 +1041,25 @@ class KubeResource(BaseRuntime):
|
|
|
1041
1041
|
patch: bool = False,
|
|
1042
1042
|
):
|
|
1043
1043
|
"""
|
|
1044
|
-
|
|
1045
|
-
|
|
1044
|
+
Set pod cpu/memory/gpu limits (max values)
|
|
1045
|
+
|
|
1046
|
+
:param mem: set limit for memory e.g. '500M', '2G', etc.
|
|
1047
|
+
:param cpu: set limit for cpu e.g. '0.5', '2', etc.
|
|
1048
|
+
:param gpus: set limit for gpu
|
|
1049
|
+
:param gpu_type: set gpu type e.g. "nvidia.com/gpu"
|
|
1050
|
+
:param patch: by default it overrides the whole limits section,
|
|
1051
|
+
if you wish to patch specific resources use `patch=True`
|
|
1046
1052
|
"""
|
|
1047
1053
|
self.spec.with_limits(mem, cpu, gpus, gpu_type, patch=patch)
|
|
1048
1054
|
|
|
1049
1055
|
def with_requests(self, mem: str = None, cpu: str = None, patch: bool = False):
|
|
1050
1056
|
"""
|
|
1051
|
-
|
|
1052
|
-
|
|
1057
|
+
Set requested (desired) pod cpu/memory resources
|
|
1058
|
+
|
|
1059
|
+
:param mem: set request for memory e.g. '200M', '1G', etc.
|
|
1060
|
+
:param cpu: set request for cpu e.g. '0.1', '1', etc.
|
|
1061
|
+
:param patch: by default it overrides the whole requests section,
|
|
1062
|
+
if you wish to patch specific resources use `patch=True`
|
|
1053
1063
|
"""
|
|
1054
1064
|
self.spec.with_requests(mem, cpu, patch=patch)
|
|
1055
1065
|
|
mlrun/runtimes/remotesparkjob.py
CHANGED
|
@@ -21,8 +21,7 @@ from mlrun.config import config
|
|
|
21
21
|
|
|
22
22
|
from ..model import RunObject
|
|
23
23
|
from ..platforms.iguazio import mount_v3io, mount_v3iod
|
|
24
|
-
from .
|
|
25
|
-
from .kubejob import KubejobRuntime, KubeRuntimeHandler
|
|
24
|
+
from .kubejob import KubejobRuntime
|
|
26
25
|
from .pod import KubeResourceSpec
|
|
27
26
|
|
|
28
27
|
|
|
@@ -210,18 +209,5 @@ class RemoteSparkRuntime(KubejobRuntime):
|
|
|
210
209
|
)
|
|
211
210
|
|
|
212
211
|
|
|
213
|
-
class RemoteSparkRuntimeHandler(KubeRuntimeHandler):
|
|
214
|
-
kind = "remote-spark"
|
|
215
|
-
class_modes = {RuntimeClassMode.run: "remote-spark"}
|
|
216
|
-
|
|
217
|
-
@staticmethod
|
|
218
|
-
def _are_resources_coupled_to_run_object() -> bool:
|
|
219
|
-
return True
|
|
220
|
-
|
|
221
|
-
@staticmethod
|
|
222
|
-
def _get_object_label_selector(object_id: str) -> str:
|
|
223
|
-
return f"mlrun/uid={object_id}"
|
|
224
|
-
|
|
225
|
-
|
|
226
212
|
def igz_spark_pre_hook():
|
|
227
213
|
run(["/bin/bash", "/etc/config/v3io/spark-job-init.sh"])
|
mlrun/runtimes/serving.py
CHANGED
|
@@ -22,6 +22,7 @@ from nuclio import KafkaTrigger
|
|
|
22
22
|
|
|
23
23
|
import mlrun
|
|
24
24
|
import mlrun.common.schemas
|
|
25
|
+
from mlrun.model_monitoring.tracking_policy import TrackingPolicy
|
|
25
26
|
|
|
26
27
|
from ..datastore import parse_kafka_url
|
|
27
28
|
from ..model import ObjectList
|
|
@@ -36,7 +37,7 @@ from ..serving.states import (
|
|
|
36
37
|
new_remote_endpoint,
|
|
37
38
|
params_to_step,
|
|
38
39
|
)
|
|
39
|
-
from ..utils import get_caller_globals, logger,
|
|
40
|
+
from ..utils import get_caller_globals, logger, set_paths
|
|
40
41
|
from .function import NuclioSpec, RemoteRuntime
|
|
41
42
|
from .function_reference import FunctionReference
|
|
42
43
|
|
|
@@ -146,7 +147,6 @@ class ServingSpec(NuclioSpec):
|
|
|
146
147
|
add_templated_ingress_host_mode=None,
|
|
147
148
|
clone_target_dir=None,
|
|
148
149
|
):
|
|
149
|
-
|
|
150
150
|
super().__init__(
|
|
151
151
|
command=command,
|
|
152
152
|
args=args,
|
|
@@ -304,7 +304,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
304
304
|
batch: int = None,
|
|
305
305
|
sample: int = None,
|
|
306
306
|
stream_args: dict = None,
|
|
307
|
-
tracking_policy: Union[
|
|
307
|
+
tracking_policy: Union[TrackingPolicy, dict] = None,
|
|
308
308
|
):
|
|
309
309
|
"""set tracking parameters:
|
|
310
310
|
|
|
@@ -334,9 +334,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
334
334
|
if tracking_policy:
|
|
335
335
|
if isinstance(tracking_policy, dict):
|
|
336
336
|
# Convert tracking policy dictionary into `model_monitoring.TrackingPolicy` object
|
|
337
|
-
self.spec.tracking_policy =
|
|
338
|
-
tracking_policy
|
|
339
|
-
)
|
|
337
|
+
self.spec.tracking_policy = TrackingPolicy.from_dict(tracking_policy)
|
|
340
338
|
else:
|
|
341
339
|
# Tracking_policy is already a `model_monitoring.TrackingPolicy` object
|
|
342
340
|
self.spec.tracking_policy = tracking_policy
|
|
@@ -476,6 +474,16 @@ class ServingRuntime(RemoteRuntime):
|
|
|
476
474
|
child_function = self._spec.function_refs[function_name]
|
|
477
475
|
trigger_args = stream.trigger_args or {}
|
|
478
476
|
|
|
477
|
+
if mlrun.mlconf.is_explicit_ack():
|
|
478
|
+
trigger_args["explicit_ack_mode"] = trigger_args.get(
|
|
479
|
+
"explicit_ack_mode", "explicitOnly"
|
|
480
|
+
)
|
|
481
|
+
extra_attributes = trigger_args.get("extra_attributes", {})
|
|
482
|
+
trigger_args["extra_attributes"] = extra_attributes
|
|
483
|
+
extra_attributes["workerAllocationMode"] = extra_attributes.get(
|
|
484
|
+
"workerAllocationMode", "static"
|
|
485
|
+
)
|
|
486
|
+
|
|
479
487
|
if (
|
|
480
488
|
stream.path.startswith("kafka://")
|
|
481
489
|
or "kafka_bootstrap_servers" in stream.options
|
|
mlrun/runtimes/sparkjob/abstract.py
CHANGED
|
@@ -14,21 +14,15 @@
|
|
|
14
14
|
import os.path
|
|
15
15
|
import typing
|
|
16
16
|
from copy import deepcopy
|
|
17
|
-
from datetime import datetime
|
|
18
|
-
from typing import Dict, Optional, Tuple
|
|
19
17
|
|
|
20
18
|
from kubernetes import client
|
|
21
19
|
from kubernetes.client.rest import ApiException
|
|
22
|
-
from sqlalchemy.orm import Session
|
|
23
20
|
|
|
21
|
+
import mlrun.db
|
|
24
22
|
import mlrun.errors
|
|
25
23
|
import mlrun.utils.regex
|
|
26
|
-
from mlrun.api.db.base import DBInterface
|
|
27
24
|
from mlrun.config import config
|
|
28
|
-
from mlrun.db import get_run_db
|
|
29
25
|
from mlrun.errors import err_to_str
|
|
30
|
-
from mlrun.runtimes.base import BaseRuntimeHandler
|
|
31
|
-
from mlrun.runtimes.constants import RunStates, SparkApplicationStates
|
|
32
26
|
|
|
33
27
|
from ...execution import MLClientCtx
|
|
34
28
|
from ...model import RunObject
|
|
@@ -41,7 +35,7 @@ from ...utils import (
|
|
|
41
35
|
verify_field_regex,
|
|
42
36
|
verify_list_and_update_in,
|
|
43
37
|
)
|
|
44
|
-
from ..base import RunError
|
|
38
|
+
from ..base import RunError
|
|
45
39
|
from ..kubejob import KubejobRuntime
|
|
46
40
|
from ..pod import KubeResourceSpec
|
|
47
41
|
from ..utils import get_item_name, get_k8s
|
|
@@ -242,7 +236,7 @@ class AbstractSparkRuntime(KubejobRuntime):
|
|
|
242
236
|
sj.with_driver_requests(cpu=1, mem="512m")
|
|
243
237
|
|
|
244
238
|
sj.deploy()
|
|
245
|
-
get_run_db().delete_function(name=sj.metadata.name)
|
|
239
|
+
mlrun.db.get_run_db().delete_function(name=sj.metadata.name)
|
|
246
240
|
|
|
247
241
|
def _is_using_gpu(self):
|
|
248
242
|
driver_limits = self.spec.driver_resources.get("limits")
|
|
@@ -292,7 +286,7 @@ class AbstractSparkRuntime(KubejobRuntime):
|
|
|
292
286
|
:return True if the function is ready (deployed)
|
|
293
287
|
"""
|
|
294
288
|
# connect will populate the config from the server config
|
|
295
|
-
get_run_db()
|
|
289
|
+
mlrun.db.get_run_db()
|
|
296
290
|
if not self.spec.build.base_image:
|
|
297
291
|
self.spec.build.base_image = self._default_image
|
|
298
292
|
return super().deploy(
|
|
@@ -841,124 +835,3 @@ with ctx:
|
|
|
841
835
|
@spec.setter
|
|
842
836
|
def spec(self, spec):
|
|
843
837
|
raise NotImplementedError()
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
class SparkRuntimeHandler(BaseRuntimeHandler):
|
|
847
|
-
kind = "spark"
|
|
848
|
-
class_modes = {
|
|
849
|
-
RuntimeClassMode.run: "spark",
|
|
850
|
-
}
|
|
851
|
-
|
|
852
|
-
def _resolve_crd_object_status_info(
|
|
853
|
-
self, db: DBInterface, db_session: Session, crd_object
|
|
854
|
-
) -> Tuple[bool, Optional[datetime], Optional[str]]:
|
|
855
|
-
state = crd_object.get("status", {}).get("applicationState", {}).get("state")
|
|
856
|
-
in_terminal_state = state in SparkApplicationStates.terminal_states()
|
|
857
|
-
desired_run_state = SparkApplicationStates.spark_application_state_to_run_state(
|
|
858
|
-
state
|
|
859
|
-
)
|
|
860
|
-
completion_time = None
|
|
861
|
-
if in_terminal_state:
|
|
862
|
-
if crd_object.get("status", {}).get("terminationTime"):
|
|
863
|
-
completion_time = datetime.fromisoformat(
|
|
864
|
-
crd_object.get("status", {})
|
|
865
|
-
.get("terminationTime")
|
|
866
|
-
.replace("Z", "+00:00")
|
|
867
|
-
)
|
|
868
|
-
else:
|
|
869
|
-
last_submission_attempt_time = crd_object.get("status", {}).get(
|
|
870
|
-
"lastSubmissionAttemptTime"
|
|
871
|
-
)
|
|
872
|
-
if last_submission_attempt_time:
|
|
873
|
-
last_submission_attempt_time = last_submission_attempt_time.replace(
|
|
874
|
-
"Z", "+00:00"
|
|
875
|
-
)
|
|
876
|
-
completion_time = datetime.fromisoformat(
|
|
877
|
-
last_submission_attempt_time
|
|
878
|
-
)
|
|
879
|
-
return in_terminal_state, completion_time, desired_run_state
|
|
880
|
-
|
|
881
|
-
def _update_ui_url(
|
|
882
|
-
self,
|
|
883
|
-
db: DBInterface,
|
|
884
|
-
db_session: Session,
|
|
885
|
-
project: str,
|
|
886
|
-
uid: str,
|
|
887
|
-
crd_object,
|
|
888
|
-
run: Dict = None,
|
|
889
|
-
):
|
|
890
|
-
app_state = (
|
|
891
|
-
crd_object.get("status", {}).get("applicationState", {}).get("state")
|
|
892
|
-
)
|
|
893
|
-
state = SparkApplicationStates.spark_application_state_to_run_state(app_state)
|
|
894
|
-
ui_url = None
|
|
895
|
-
if state == RunStates.running:
|
|
896
|
-
ui_url = (
|
|
897
|
-
crd_object.get("status", {})
|
|
898
|
-
.get("driverInfo", {})
|
|
899
|
-
.get("webUIIngressAddress")
|
|
900
|
-
)
|
|
901
|
-
db_ui_url = run.get("status", {}).get("ui_url")
|
|
902
|
-
if db_ui_url == ui_url:
|
|
903
|
-
return
|
|
904
|
-
run.setdefault("status", {})["ui_url"] = ui_url
|
|
905
|
-
db.store_run(db_session, run, uid, project)
|
|
906
|
-
|
|
907
|
-
@staticmethod
|
|
908
|
-
def _are_resources_coupled_to_run_object() -> bool:
|
|
909
|
-
return True
|
|
910
|
-
|
|
911
|
-
@staticmethod
|
|
912
|
-
def _get_object_label_selector(object_id: str) -> str:
|
|
913
|
-
return f"mlrun/uid={object_id}"
|
|
914
|
-
|
|
915
|
-
@staticmethod
|
|
916
|
-
def _get_main_runtime_resource_label_selector() -> str:
|
|
917
|
-
"""
|
|
918
|
-
There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
|
|
919
|
-
we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
|
|
920
|
-
:return: the label selector
|
|
921
|
-
"""
|
|
922
|
-
return "spark-role=driver"
|
|
923
|
-
|
|
924
|
-
@staticmethod
|
|
925
|
-
def _get_crd_info() -> Tuple[str, str, str]:
|
|
926
|
-
return (
|
|
927
|
-
AbstractSparkRuntime.group,
|
|
928
|
-
AbstractSparkRuntime.version,
|
|
929
|
-
AbstractSparkRuntime.plural,
|
|
930
|
-
)
|
|
931
|
-
|
|
932
|
-
def _delete_extra_resources(
|
|
933
|
-
self,
|
|
934
|
-
db: DBInterface,
|
|
935
|
-
db_session: Session,
|
|
936
|
-
namespace: str,
|
|
937
|
-
deleted_resources: typing.List[Dict],
|
|
938
|
-
label_selector: str = None,
|
|
939
|
-
force: bool = False,
|
|
940
|
-
grace_period: int = None,
|
|
941
|
-
):
|
|
942
|
-
"""
|
|
943
|
-
Handling config maps deletion
|
|
944
|
-
"""
|
|
945
|
-
uids = []
|
|
946
|
-
for crd_dict in deleted_resources:
|
|
947
|
-
uid = crd_dict["metadata"].get("labels", {}).get("mlrun/uid", None)
|
|
948
|
-
uids.append(uid)
|
|
949
|
-
|
|
950
|
-
config_maps = get_k8s().v1api.list_namespaced_config_map(
|
|
951
|
-
namespace, label_selector=label_selector
|
|
952
|
-
)
|
|
953
|
-
for config_map in config_maps.items:
|
|
954
|
-
try:
|
|
955
|
-
uid = config_map.metadata.labels.get("mlrun/uid", None)
|
|
956
|
-
if force or uid in uids:
|
|
957
|
-
get_k8s().v1api.delete_namespaced_config_map(
|
|
958
|
-
config_map.metadata.name, namespace
|
|
959
|
-
)
|
|
960
|
-
logger.info(f"Deleted config map: {config_map.metadata.name}")
|
|
961
|
-
except ApiException as exc:
|
|
962
|
-
# ignore error if config map is already removed
|
|
963
|
-
if exc.status != 404:
|
|
964
|
-
raise
|
mlrun/runtimes/utils.py
CHANGED
|
@@ -24,11 +24,9 @@ import pandas as pd
|
|
|
24
24
|
from kubernetes import client
|
|
25
25
|
|
|
26
26
|
import mlrun
|
|
27
|
-
import mlrun.api.utils.builder
|
|
28
27
|
import mlrun.common.constants
|
|
29
28
|
import mlrun.common.schemas
|
|
30
29
|
import mlrun.utils.regex
|
|
31
|
-
from mlrun.api.utils.clients import nuclio
|
|
32
30
|
from mlrun.errors import err_to_str
|
|
33
31
|
from mlrun.frameworks.parallel_coordinates import gen_pcp_plot
|
|
34
32
|
from mlrun.runtimes.constants import MPIJobCRDVersions
|
|
@@ -62,7 +60,6 @@ global_context = _ContextStore()
|
|
|
62
60
|
|
|
63
61
|
|
|
64
62
|
cached_mpijob_crd_version = None
|
|
65
|
-
cached_nuclio_version = None
|
|
66
63
|
|
|
67
64
|
|
|
68
65
|
# resolve mpijob runtime according to the mpi-operator's supported crd-version
|
|
@@ -119,29 +116,6 @@ def resolve_spark_operator_version():
|
|
|
119
116
|
raise ValueError("Failed to resolve spark operator's version")
|
|
120
117
|
|
|
121
118
|
|
|
122
|
-
# if nuclio version specified on mlrun config set it likewise,
|
|
123
|
-
# if not specified, get it from nuclio api client
|
|
124
|
-
# since this is a heavy operation (sending requests to API), and it's unlikely that the version
|
|
125
|
-
# will change - cache it (this means if we upgrade nuclio, we need to restart mlrun to re-fetch the new version)
|
|
126
|
-
def resolve_nuclio_version():
|
|
127
|
-
global cached_nuclio_version
|
|
128
|
-
|
|
129
|
-
if not cached_nuclio_version:
|
|
130
|
-
|
|
131
|
-
# config override everything
|
|
132
|
-
nuclio_version = config.nuclio_version
|
|
133
|
-
if not nuclio_version and config.nuclio_dashboard_url:
|
|
134
|
-
try:
|
|
135
|
-
nuclio_client = nuclio.Client()
|
|
136
|
-
nuclio_version = nuclio_client.get_dashboard_version()
|
|
137
|
-
except Exception as exc:
|
|
138
|
-
logger.warning("Failed to resolve nuclio version", exc=err_to_str(exc))
|
|
139
|
-
|
|
140
|
-
cached_nuclio_version = nuclio_version
|
|
141
|
-
|
|
142
|
-
return cached_nuclio_version
|
|
143
|
-
|
|
144
|
-
|
|
145
119
|
def calc_hash(func, tag=""):
|
|
146
120
|
# remove tag, hash, date from calculation
|
|
147
121
|
tag = tag or func.metadata.tag
|
mlrun/serving/routers.py
CHANGED
|
@@ -25,10 +25,10 @@ import numpy as np
|
|
|
25
25
|
|
|
26
26
|
import mlrun
|
|
27
27
|
import mlrun.common.model_monitoring
|
|
28
|
-
import mlrun.common.schemas
|
|
29
|
-
|
|
30
|
-
from mlrun.utils import logger, now_date, parse_versioned_object_uri
|
|
28
|
+
import mlrun.common.schemas.model_monitoring
|
|
29
|
+
from mlrun.utils import logger, now_date
|
|
31
30
|
|
|
31
|
+
from ..common.helpers import parse_versioned_object_uri
|
|
32
32
|
from ..config import config
|
|
33
33
|
from .server import GraphServer
|
|
34
34
|
from .utils import RouterToDict, _extract_input_data, _update_result_body
|
|
@@ -1066,13 +1066,13 @@ def _init_endpoint_record(
|
|
|
1066
1066
|
project=project, kind="stream"
|
|
1067
1067
|
),
|
|
1068
1068
|
active=True,
|
|
1069
|
-
monitoring_mode=mlrun.common.model_monitoring.ModelMonitoringMode.enabled
|
|
1069
|
+
monitoring_mode=mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled
|
|
1070
1070
|
if voting_ensemble.context.server.track_models
|
|
1071
|
-
else mlrun.common.model_monitoring.ModelMonitoringMode.disabled,
|
|
1071
|
+
else mlrun.common.schemas.model_monitoring.ModelMonitoringMode.disabled,
|
|
1072
1072
|
),
|
|
1073
1073
|
status=mlrun.common.schemas.ModelEndpointStatus(
|
|
1074
1074
|
children=list(voting_ensemble.routes.keys()),
|
|
1075
|
-
endpoint_type=mlrun.common.model_monitoring.EndpointType.ROUTER,
|
|
1075
|
+
endpoint_type=mlrun.common.schemas.model_monitoring.EndpointType.ROUTER,
|
|
1076
1076
|
children_uids=children_uids,
|
|
1077
1077
|
),
|
|
1078
1078
|
)
|
|
@@ -1091,7 +1091,7 @@ def _init_endpoint_record(
|
|
|
1091
1091
|
project=project, endpoint_id=model_endpoint
|
|
1092
1092
|
)
|
|
1093
1093
|
current_endpoint.status.endpoint_type = (
|
|
1094
|
-
mlrun.common.model_monitoring.EndpointType.LEAF_EP
|
|
1094
|
+
mlrun.common.schemas.model_monitoring.EndpointType.LEAF_EP
|
|
1095
1095
|
)
|
|
1096
1096
|
db.create_model_endpoint(
|
|
1097
1097
|
project=project,
|
mlrun/serving/server.py
CHANGED
|
@@ -23,17 +23,19 @@ import uuid
|
|
|
23
23
|
from typing import Optional, Union
|
|
24
24
|
|
|
25
25
|
import mlrun
|
|
26
|
-
import mlrun.
|
|
27
|
-
|
|
26
|
+
import mlrun.common.helpers
|
|
27
|
+
import mlrun.model_monitoring
|
|
28
28
|
from mlrun.config import config
|
|
29
29
|
from mlrun.errors import err_to_str
|
|
30
30
|
from mlrun.secrets import SecretsStore
|
|
31
31
|
|
|
32
|
+
from ..common.helpers import parse_versioned_object_uri
|
|
33
|
+
from ..common.schemas.model_monitoring.constants import FileTargetKind
|
|
32
34
|
from ..datastore import get_stream_pusher
|
|
33
35
|
from ..datastore.store_resources import ResourceCache
|
|
34
36
|
from ..errors import MLRunInvalidArgumentError
|
|
35
37
|
from ..model import ModelObj
|
|
36
|
-
from ..utils import get_caller_globals
|
|
38
|
+
from ..utils import get_caller_globals
|
|
37
39
|
from .states import RootFlowStep, RouterStep, get_function, graph_root_setter
|
|
38
40
|
from .utils import (
|
|
39
41
|
event_id_key,
|
|
@@ -48,7 +50,6 @@ class _StreamContext:
|
|
|
48
50
|
that will be used for pushing the events from the nuclio model serving function"""
|
|
49
51
|
|
|
50
52
|
def __init__(self, enabled: bool, parameters: dict, function_uri: str):
|
|
51
|
-
|
|
52
53
|
"""
|
|
53
54
|
Initialize _StreamContext object.
|
|
54
55
|
:param enabled: A boolean indication for applying the stream context
|
|
@@ -71,7 +72,7 @@ class _StreamContext:
|
|
|
71
72
|
function_uri, config.default_project
|
|
72
73
|
)
|
|
73
74
|
|
|
74
|
-
stream_uri = mlrun.
|
|
75
|
+
stream_uri = mlrun.model_monitoring.get_stream_path(project=project)
|
|
75
76
|
|
|
76
77
|
if log_stream:
|
|
77
78
|
# Update the stream path to the log stream value
|
|
@@ -467,6 +468,8 @@ class GraphContext:
|
|
|
467
468
|
self.logger = nuclio_context.logger
|
|
468
469
|
self.Response = nuclio_context.Response
|
|
469
470
|
self.worker_id = nuclio_context.worker_id
|
|
471
|
+
if hasattr(nuclio_context, "platform"):
|
|
472
|
+
self.platform = nuclio_context.platform
|
|
470
473
|
elif not logger:
|
|
471
474
|
self.logger = mlrun.utils.helpers.logger
|
|
472
475
|
|
|
@@ -483,7 +486,7 @@ class GraphContext:
|
|
|
483
486
|
@property
|
|
484
487
|
def project(self):
|
|
485
488
|
"""current project name (for the current function)"""
|
|
486
|
-
project, _, _, _ = mlrun.
|
|
489
|
+
project, _, _, _ = mlrun.common.helpers.parse_versioned_object_uri(
|
|
487
490
|
self._server.function_uri
|
|
488
491
|
)
|
|
489
492
|
return project
|
|
@@ -521,13 +524,13 @@ class GraphContext:
|
|
|
521
524
|
"""
|
|
522
525
|
if "://" in name:
|
|
523
526
|
return name
|
|
524
|
-
project, uri, tag, _ = mlrun.
|
|
527
|
+
project, uri, tag, _ = mlrun.common.helpers.parse_versioned_object_uri(
|
|
525
528
|
self._server.function_uri
|
|
526
529
|
)
|
|
527
530
|
if name.startswith("."):
|
|
528
531
|
name = f"{uri}-{name[1:]}"
|
|
529
532
|
else:
|
|
530
|
-
project, name, tag, _ = mlrun.
|
|
533
|
+
project, name, tag, _ = mlrun.common.helpers.parse_versioned_object_uri(
|
|
531
534
|
name, project
|
|
532
535
|
)
|
|
533
536
|
(
|
mlrun/serving/states.py
CHANGED
|
@@ -21,6 +21,8 @@ from copy import copy, deepcopy
|
|
|
21
21
|
from inspect import getfullargspec, signature
|
|
22
22
|
from typing import Union
|
|
23
23
|
|
|
24
|
+
import mlrun
|
|
25
|
+
|
|
24
26
|
from ..config import config
|
|
25
27
|
from ..datastore import get_stream_pusher
|
|
26
28
|
from ..datastore.utils import parse_kafka_url
|
|
@@ -1512,5 +1514,9 @@ def _init_async_objects(context, steps):
|
|
|
1512
1514
|
wait_for_result = True
|
|
1513
1515
|
|
|
1514
1516
|
source_args = context.get_param("source_args", {})
|
|
1515
|
-
default_source = storey.SyncEmitSource(
|
|
1517
|
+
default_source = storey.SyncEmitSource(
|
|
1518
|
+
context=context,
|
|
1519
|
+
explicit_ack=mlrun.mlconf.is_explicit_ack(),
|
|
1520
|
+
**source_args,
|
|
1521
|
+
)
|
|
1516
1522
|
return default_source, wait_for_result
|
mlrun/serving/v2_serving.py
CHANGED
|
@@ -17,13 +17,13 @@ import time
|
|
|
17
17
|
import traceback
|
|
18
18
|
from typing import Dict, Union
|
|
19
19
|
|
|
20
|
-
import mlrun
|
|
21
20
|
import mlrun.common.model_monitoring
|
|
22
|
-
import mlrun.common.schemas
|
|
21
|
+
import mlrun.common.schemas.model_monitoring
|
|
23
22
|
from mlrun.artifacts import ModelArtifact # noqa: F401
|
|
24
23
|
from mlrun.config import config
|
|
25
|
-
from mlrun.utils import logger, now_date
|
|
24
|
+
from mlrun.utils import logger, now_date
|
|
26
25
|
|
|
26
|
+
from ..common.helpers import parse_versioned_object_uri
|
|
27
27
|
from .server import GraphServer
|
|
28
28
|
from .utils import StepToDict, _extract_input_data, _update_result_body
|
|
29
29
|
|
|
@@ -516,12 +516,12 @@ def _init_endpoint_record(
|
|
|
516
516
|
project=project, kind="stream"
|
|
517
517
|
),
|
|
518
518
|
active=True,
|
|
519
|
-
monitoring_mode=mlrun.common.model_monitoring.ModelMonitoringMode.enabled
|
|
519
|
+
monitoring_mode=mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled
|
|
520
520
|
if model.context.server.track_models
|
|
521
|
-
else mlrun.common.model_monitoring.ModelMonitoringMode.disabled,
|
|
521
|
+
else mlrun.common.schemas.model_monitoring.ModelMonitoringMode.disabled,
|
|
522
522
|
),
|
|
523
523
|
status=mlrun.common.schemas.ModelEndpointStatus(
|
|
524
|
-
endpoint_type=mlrun.common.model_monitoring.EndpointType.NODE_EP
|
|
524
|
+
endpoint_type=mlrun.common.schemas.model_monitoring.EndpointType.NODE_EP
|
|
525
525
|
),
|
|
526
526
|
)
|
|
527
527
|
|