mlrun 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +1 -40
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/functions.py +6 -1
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +1 -1
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +3 -0
- mlrun/api/crud/model_monitoring/deployment.py +36 -7
- mlrun/api/crud/model_monitoring/grafana.py +1 -1
- mlrun/api/crud/model_monitoring/helpers.py +32 -2
- mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +4 -9
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/sqldb/db.py +97 -16
- mlrun/api/launcher.py +26 -7
- mlrun/api/main.py +3 -4
- mlrun/{mlutils → api/rundb}/__init__.py +2 -6
- mlrun/{db → api/rundb}/sqldb.py +35 -83
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +4 -0
- mlrun/artifacts/manager.py +1 -2
- mlrun/common/schemas/__init__.py +6 -0
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +11 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +42 -40
- mlrun/datastore/sources.py +1 -1
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -19
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/feature_set.py +0 -10
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +21 -15
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +3 -3
- mlrun/launcher/client.py +3 -2
- mlrun/launcher/factory.py +16 -13
- mlrun/lists.py +0 -11
- mlrun/model.py +9 -15
- mlrun/model_monitoring/helpers.py +15 -25
- mlrun/model_monitoring/model_monitoring_batch.py +72 -4
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +15 -9
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
- mlrun/model_monitoring/stream_processing.py +181 -29
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +57 -158
- mlrun/projects/project.py +6 -32
- mlrun/render.py +1 -1
- mlrun/run.py +2 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +26 -1241
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/function.py +16 -5
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +0 -10
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +1 -1
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/serving/states.py +1 -1
- mlrun/utils/db.py +0 -2
- mlrun/utils/helpers.py +19 -13
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/runtimes/daskjob.py
CHANGED
|
@@ -15,33 +15,25 @@ import datetime
|
|
|
15
15
|
import inspect
|
|
16
16
|
import socket
|
|
17
17
|
import time
|
|
18
|
-
import typing
|
|
19
18
|
from os import environ
|
|
20
|
-
from typing import Dict, List, Optional, Union
|
|
21
|
-
|
|
22
|
-
from deprecated import deprecated
|
|
23
|
-
from kubernetes.client.rest import ApiException
|
|
24
|
-
from sqlalchemy.orm import Session
|
|
25
19
|
|
|
26
20
|
import mlrun.common.schemas
|
|
27
21
|
import mlrun.errors
|
|
28
22
|
import mlrun.k8s_utils
|
|
29
23
|
import mlrun.utils
|
|
30
24
|
import mlrun.utils.regex
|
|
31
|
-
from mlrun.api.db.base import DBInterface
|
|
32
25
|
from mlrun.errors import err_to_str
|
|
33
|
-
from mlrun.runtimes.base import BaseRuntimeHandler
|
|
34
26
|
|
|
35
27
|
from ..config import config
|
|
36
28
|
from ..execution import MLClientCtx
|
|
37
29
|
from ..model import RunObject
|
|
38
30
|
from ..render import ipython_display
|
|
39
31
|
from ..utils import logger, normalize_name, update_in
|
|
40
|
-
from .base import FunctionStatus
|
|
32
|
+
from .base import FunctionStatus
|
|
41
33
|
from .kubejob import KubejobRuntime
|
|
42
34
|
from .local import exec_from_params, load_module
|
|
43
35
|
from .pod import KubeResourceSpec, kube_resource_spec_to_pod_spec
|
|
44
|
-
from .utils import RunError, get_func_selector,
|
|
36
|
+
from .utils import RunError, get_func_selector, get_resource_labels, log_std
|
|
45
37
|
|
|
46
38
|
|
|
47
39
|
def get_dask_resource():
|
|
@@ -406,16 +398,6 @@ class DaskCluster(KubejobRuntime):
|
|
|
406
398
|
show_on_failure=show_on_failure,
|
|
407
399
|
)
|
|
408
400
|
|
|
409
|
-
# TODO: Remove in 1.5.0
|
|
410
|
-
@deprecated(
|
|
411
|
-
version="1.3.0",
|
|
412
|
-
reason="'Dask gpus' will be removed in 1.5.0, use 'with_scheduler_limits' / 'with_worker_limits' instead",
|
|
413
|
-
category=FutureWarning,
|
|
414
|
-
)
|
|
415
|
-
def gpus(self, gpus, gpu_type="nvidia.com/gpu"):
|
|
416
|
-
update_in(self.spec.scheduler_resources, ["limits", gpu_type], gpus)
|
|
417
|
-
update_in(self.spec.worker_resources, ["limits", gpu_type], gpus)
|
|
418
|
-
|
|
419
401
|
def with_limits(
|
|
420
402
|
self,
|
|
421
403
|
mem=None,
|
|
@@ -686,181 +668,3 @@ def get_obj_status(selector=None, namespace=None):
|
|
|
686
668
|
f"found dask function {pod.metadata.name} in non ready state ({status})"
|
|
687
669
|
)
|
|
688
670
|
return status
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
class DaskRuntimeHandler(BaseRuntimeHandler):
|
|
692
|
-
kind = "dask"
|
|
693
|
-
class_modes = {RuntimeClassMode.run: "dask"}
|
|
694
|
-
|
|
695
|
-
# Dask runtime resources are per function (and not per run).
|
|
696
|
-
# It means that monitoring runtime resources state doesn't say anything about the run state.
|
|
697
|
-
# Therefore dask run monitoring is done completely by the SDK, so overriding the monitoring method with no logic
|
|
698
|
-
def monitor_runs(
|
|
699
|
-
self, db: DBInterface, db_session: Session, leader_session: Optional[str] = None
|
|
700
|
-
):
|
|
701
|
-
return
|
|
702
|
-
|
|
703
|
-
@staticmethod
|
|
704
|
-
def _get_object_label_selector(object_id: str) -> str:
|
|
705
|
-
return f"mlrun/function={object_id}"
|
|
706
|
-
|
|
707
|
-
@staticmethod
|
|
708
|
-
def resolve_object_id(
|
|
709
|
-
run: dict,
|
|
710
|
-
) -> typing.Optional[str]:
|
|
711
|
-
"""
|
|
712
|
-
Resolves the object ID from the run object.
|
|
713
|
-
In dask runtime, the object ID is the function name.
|
|
714
|
-
:param run: run object
|
|
715
|
-
:return: function name
|
|
716
|
-
"""
|
|
717
|
-
|
|
718
|
-
function = run.get("spec", {}).get("function", None)
|
|
719
|
-
if function:
|
|
720
|
-
|
|
721
|
-
# a dask run's function field is in the format <project-name>/<function-name>@<run-uid>
|
|
722
|
-
# we only want the function name
|
|
723
|
-
project_and_function = function.split("@")[0]
|
|
724
|
-
return project_and_function.split("/")[-1]
|
|
725
|
-
|
|
726
|
-
return None
|
|
727
|
-
|
|
728
|
-
def _enrich_list_resources_response(
|
|
729
|
-
self,
|
|
730
|
-
response: Union[
|
|
731
|
-
mlrun.common.schemas.RuntimeResources,
|
|
732
|
-
mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
|
|
733
|
-
mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
|
|
734
|
-
],
|
|
735
|
-
namespace: str,
|
|
736
|
-
label_selector: str = None,
|
|
737
|
-
group_by: Optional[
|
|
738
|
-
mlrun.common.schemas.ListRuntimeResourcesGroupByField
|
|
739
|
-
] = None,
|
|
740
|
-
) -> Union[
|
|
741
|
-
mlrun.common.schemas.RuntimeResources,
|
|
742
|
-
mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
|
|
743
|
-
mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
|
|
744
|
-
]:
|
|
745
|
-
"""
|
|
746
|
-
Handling listing service resources
|
|
747
|
-
"""
|
|
748
|
-
enrich_needed = self._validate_if_enrich_is_needed_by_group_by(group_by)
|
|
749
|
-
if not enrich_needed:
|
|
750
|
-
return response
|
|
751
|
-
services = get_k8s().v1api.list_namespaced_service(
|
|
752
|
-
namespace, label_selector=label_selector
|
|
753
|
-
)
|
|
754
|
-
service_resources = []
|
|
755
|
-
for service in services.items:
|
|
756
|
-
service_resources.append(
|
|
757
|
-
mlrun.common.schemas.RuntimeResource(
|
|
758
|
-
name=service.metadata.name, labels=service.metadata.labels
|
|
759
|
-
)
|
|
760
|
-
)
|
|
761
|
-
return self._enrich_service_resources_in_response(
|
|
762
|
-
response, service_resources, group_by
|
|
763
|
-
)
|
|
764
|
-
|
|
765
|
-
def _build_output_from_runtime_resources(
|
|
766
|
-
self,
|
|
767
|
-
response: Union[
|
|
768
|
-
mlrun.common.schemas.RuntimeResources,
|
|
769
|
-
mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
|
|
770
|
-
mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
|
|
771
|
-
],
|
|
772
|
-
runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
|
|
773
|
-
group_by: Optional[
|
|
774
|
-
mlrun.common.schemas.ListRuntimeResourcesGroupByField
|
|
775
|
-
] = None,
|
|
776
|
-
):
|
|
777
|
-
enrich_needed = self._validate_if_enrich_is_needed_by_group_by(group_by)
|
|
778
|
-
if not enrich_needed:
|
|
779
|
-
return response
|
|
780
|
-
service_resources = []
|
|
781
|
-
for runtime_resources in runtime_resources_list:
|
|
782
|
-
if runtime_resources.service_resources:
|
|
783
|
-
service_resources += runtime_resources.service_resources
|
|
784
|
-
return self._enrich_service_resources_in_response(
|
|
785
|
-
response, service_resources, group_by
|
|
786
|
-
)
|
|
787
|
-
|
|
788
|
-
def _validate_if_enrich_is_needed_by_group_by(
|
|
789
|
-
self,
|
|
790
|
-
group_by: Optional[
|
|
791
|
-
mlrun.common.schemas.ListRuntimeResourcesGroupByField
|
|
792
|
-
] = None,
|
|
793
|
-
) -> bool:
|
|
794
|
-
# Dask runtime resources are per function (and not per job) therefore, when grouping by job we're simply
|
|
795
|
-
# omitting the dask runtime resources
|
|
796
|
-
if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.job:
|
|
797
|
-
return False
|
|
798
|
-
elif group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project:
|
|
799
|
-
return True
|
|
800
|
-
elif group_by is not None:
|
|
801
|
-
raise NotImplementedError(
|
|
802
|
-
f"Provided group by field is not supported. group_by={group_by}"
|
|
803
|
-
)
|
|
804
|
-
return True
|
|
805
|
-
|
|
806
|
-
def _enrich_service_resources_in_response(
|
|
807
|
-
self,
|
|
808
|
-
response: Union[
|
|
809
|
-
mlrun.common.schemas.RuntimeResources,
|
|
810
|
-
mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
|
|
811
|
-
mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
|
|
812
|
-
],
|
|
813
|
-
service_resources: List[mlrun.common.schemas.RuntimeResource],
|
|
814
|
-
group_by: Optional[
|
|
815
|
-
mlrun.common.schemas.ListRuntimeResourcesGroupByField
|
|
816
|
-
] = None,
|
|
817
|
-
):
|
|
818
|
-
if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project:
|
|
819
|
-
for service_resource in service_resources:
|
|
820
|
-
self._add_resource_to_grouped_by_project_resources_response(
|
|
821
|
-
response, "service_resources", service_resource
|
|
822
|
-
)
|
|
823
|
-
else:
|
|
824
|
-
response.service_resources = service_resources
|
|
825
|
-
return response
|
|
826
|
-
|
|
827
|
-
def _delete_extra_resources(
|
|
828
|
-
self,
|
|
829
|
-
db: DBInterface,
|
|
830
|
-
db_session: Session,
|
|
831
|
-
namespace: str,
|
|
832
|
-
deleted_resources: List[Dict],
|
|
833
|
-
label_selector: str = None,
|
|
834
|
-
force: bool = False,
|
|
835
|
-
grace_period: int = None,
|
|
836
|
-
):
|
|
837
|
-
"""
|
|
838
|
-
Handling services deletion
|
|
839
|
-
"""
|
|
840
|
-
if grace_period is None:
|
|
841
|
-
grace_period = config.runtime_resources_deletion_grace_period
|
|
842
|
-
service_names = []
|
|
843
|
-
for pod_dict in deleted_resources:
|
|
844
|
-
dask_component = (
|
|
845
|
-
pod_dict["metadata"].get("labels", {}).get("dask.org/component")
|
|
846
|
-
)
|
|
847
|
-
cluster_name = (
|
|
848
|
-
pod_dict["metadata"].get("labels", {}).get("dask.org/cluster-name")
|
|
849
|
-
)
|
|
850
|
-
if dask_component == "scheduler" and cluster_name:
|
|
851
|
-
service_names.append(cluster_name)
|
|
852
|
-
|
|
853
|
-
services = get_k8s().v1api.list_namespaced_service(
|
|
854
|
-
namespace, label_selector=label_selector
|
|
855
|
-
)
|
|
856
|
-
for service in services.items:
|
|
857
|
-
try:
|
|
858
|
-
if force or service.metadata.name in service_names:
|
|
859
|
-
get_k8s().v1api.delete_namespaced_service(
|
|
860
|
-
service.metadata.name, namespace
|
|
861
|
-
)
|
|
862
|
-
logger.info(f"Deleted service: {service.metadata.name}")
|
|
863
|
-
except ApiException as exc:
|
|
864
|
-
# ignore error if service is already removed
|
|
865
|
-
if exc.status != 404:
|
|
866
|
-
raise
|
mlrun/runtimes/function.py
CHANGED
|
@@ -28,11 +28,12 @@ from kubernetes import client
|
|
|
28
28
|
from nuclio.deploy import find_dashboard_url, get_deploy_status
|
|
29
29
|
from nuclio.triggers import V3IOStreamTrigger
|
|
30
30
|
|
|
31
|
+
import mlrun.db
|
|
31
32
|
import mlrun.errors
|
|
32
33
|
import mlrun.k8s_utils
|
|
33
34
|
import mlrun.utils
|
|
35
|
+
import mlrun.utils.helpers
|
|
34
36
|
from mlrun.common.schemas import AuthInfo
|
|
35
|
-
from mlrun.db import RunDBError
|
|
36
37
|
|
|
37
38
|
from ..config import config as mlconf
|
|
38
39
|
from ..errors import err_to_str
|
|
@@ -488,6 +489,16 @@ class RemoteRuntime(KubeResource):
|
|
|
488
489
|
endpoint = None
|
|
489
490
|
if "://" in stream_path:
|
|
490
491
|
endpoint, stream_path = parse_path(stream_path, suffix="")
|
|
492
|
+
|
|
493
|
+
# verify v3io stream trigger name is valid
|
|
494
|
+
mlrun.utils.helpers.validate_v3io_stream_consumer_group(group)
|
|
495
|
+
|
|
496
|
+
consumer_group = kwargs.pop("consumerGroup", None)
|
|
497
|
+
if consumer_group:
|
|
498
|
+
logger.warning(
|
|
499
|
+
"consumerGroup kwargs value is ignored. use group argument instead"
|
|
500
|
+
)
|
|
501
|
+
|
|
491
502
|
container, path = split_path(stream_path)
|
|
492
503
|
shards = shards or 1
|
|
493
504
|
extra_attributes = extra_attributes or {}
|
|
@@ -603,7 +614,7 @@ class RemoteRuntime(KubeResource):
|
|
|
603
614
|
text, last_log_timestamp = db.get_builder_status(
|
|
604
615
|
self, last_log_timestamp=last_log_timestamp, verbose=verbose
|
|
605
616
|
)
|
|
606
|
-
except RunDBError:
|
|
617
|
+
except mlrun.db.RunDBError:
|
|
607
618
|
raise ValueError("function or deploy process not found")
|
|
608
619
|
state = self.status.state
|
|
609
620
|
if text:
|
|
@@ -714,7 +725,7 @@ class RemoteRuntime(KubeResource):
|
|
|
714
725
|
text, last_log_timestamp = self._get_db().get_builder_status(
|
|
715
726
|
self, last_log_timestamp=last_log_timestamp, verbose=verbose
|
|
716
727
|
)
|
|
717
|
-
except RunDBError:
|
|
728
|
+
except mlrun.db.RunDBError:
|
|
718
729
|
if raise_on_exception:
|
|
719
730
|
return "", "", None
|
|
720
731
|
raise ValueError("function or deploy process not found")
|
|
@@ -725,8 +736,8 @@ class RemoteRuntime(KubeResource):
|
|
|
725
736
|
runtime_env = {
|
|
726
737
|
"MLRUN_DEFAULT_PROJECT": self.metadata.project or mlconf.default_project,
|
|
727
738
|
}
|
|
728
|
-
if
|
|
729
|
-
runtime_env["MLRUN_DBPATH"] =
|
|
739
|
+
if mlconf.httpdb.api_url:
|
|
740
|
+
runtime_env["MLRUN_DBPATH"] = mlconf.httpdb.api_url
|
|
730
741
|
if mlconf.namespace:
|
|
731
742
|
runtime_env["MLRUN_NAMESPACE"] = mlconf.namespace
|
|
732
743
|
if self.metadata.credentials.access_key:
|
mlrun/runtimes/kubejob.py
CHANGED
|
@@ -20,15 +20,14 @@ from kubernetes import client
|
|
|
20
20
|
from kubernetes.client.rest import ApiException
|
|
21
21
|
|
|
22
22
|
import mlrun.common.schemas
|
|
23
|
+
import mlrun.db
|
|
23
24
|
import mlrun.errors
|
|
24
|
-
from mlrun.runtimes.base import BaseRuntimeHandler
|
|
25
25
|
|
|
26
|
-
from ..db import RunDBError
|
|
27
26
|
from ..errors import err_to_str
|
|
28
27
|
from ..kfpops import build_op
|
|
29
28
|
from ..model import RunObject
|
|
30
29
|
from ..utils import get_in, logger
|
|
31
|
-
from .base import RunError
|
|
30
|
+
from .base import RunError
|
|
32
31
|
from .pod import KubeResource, kube_resource_spec_to_pod_spec
|
|
33
32
|
from .utils import get_k8s
|
|
34
33
|
|
|
@@ -257,7 +256,7 @@ class KubejobRuntime(KubeResource):
|
|
|
257
256
|
offset = 0
|
|
258
257
|
try:
|
|
259
258
|
text, _ = db.get_builder_status(self, 0, logs=logs)
|
|
260
|
-
except RunDBError:
|
|
259
|
+
except mlrun.db.RunDBError:
|
|
261
260
|
raise ValueError("function or build process not found")
|
|
262
261
|
|
|
263
262
|
def print_log(text):
|
|
@@ -360,10 +359,8 @@ class KubejobRuntime(KubeResource):
|
|
|
360
359
|
|
|
361
360
|
if self.spec.clone_target_dir:
|
|
362
361
|
workdir = workdir or ""
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
# workdir.removeprefix("./")
|
|
366
|
-
workdir = workdir[2:]
|
|
362
|
+
workdir = workdir.removeprefix("./")
|
|
363
|
+
|
|
367
364
|
return os.path.join(self.spec.clone_target_dir, workdir)
|
|
368
365
|
|
|
369
366
|
return workdir
|
|
@@ -390,24 +387,3 @@ def func_to_pod(image, runtime, extra_env, command, args, workdir):
|
|
|
390
387
|
]
|
|
391
388
|
|
|
392
389
|
return pod_spec
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
class KubeRuntimeHandler(BaseRuntimeHandler):
|
|
396
|
-
kind = "job"
|
|
397
|
-
class_modes = {RuntimeClassMode.run: "job", RuntimeClassMode.build: "build"}
|
|
398
|
-
|
|
399
|
-
@staticmethod
|
|
400
|
-
def _expect_pods_without_uid() -> bool:
|
|
401
|
-
"""
|
|
402
|
-
builder pods are handled as part of this runtime handler - they are not coupled to run object, therefore they
|
|
403
|
-
don't have the uid in their labels
|
|
404
|
-
"""
|
|
405
|
-
return True
|
|
406
|
-
|
|
407
|
-
@staticmethod
|
|
408
|
-
def _are_resources_coupled_to_run_object() -> bool:
|
|
409
|
-
return True
|
|
410
|
-
|
|
411
|
-
@staticmethod
|
|
412
|
-
def _get_object_label_selector(object_id: str) -> str:
|
|
413
|
-
return f"mlrun/uid={object_id}"
|
|
@@ -14,5 +14,5 @@
|
|
|
14
14
|
|
|
15
15
|
# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
|
|
16
16
|
|
|
17
|
-
from .v1 import MpiRuntimeV1
|
|
18
|
-
from .v1alpha1 import MpiRuntimeV1Alpha1
|
|
17
|
+
from .v1 import MpiRuntimeV1
|
|
18
|
+
from .v1alpha1 import MpiRuntimeV1Alpha1
|
|
@@ -25,7 +25,7 @@ from mlrun.model import RunObject
|
|
|
25
25
|
from mlrun.runtimes.kubejob import KubejobRuntime
|
|
26
26
|
from mlrun.runtimes.pod import KubeResourceSpec
|
|
27
27
|
from mlrun.runtimes.utils import RunError, get_k8s
|
|
28
|
-
from mlrun.utils import get_in, logger
|
|
28
|
+
from mlrun.utils import get_in, logger, update_in
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class MPIResourceSpec(KubeResourceSpec):
|
|
@@ -138,6 +138,15 @@ class AbstractMPIJobRuntime(KubejobRuntime, abc.ABC):
|
|
|
138
138
|
def _get_crd_info() -> typing.Tuple[str, str, str]:
|
|
139
139
|
pass
|
|
140
140
|
|
|
141
|
+
@staticmethod
|
|
142
|
+
def _get_run_completion_updates(run: dict) -> dict:
|
|
143
|
+
|
|
144
|
+
# TODO: add a 'workers' section in run objects state, each worker will update its state while
|
|
145
|
+
# the run state will be resolved by the server.
|
|
146
|
+
# update the run object state if empty so that it won't default to 'created' state
|
|
147
|
+
update_in(run, "status.state", "running", append=False, replace=False)
|
|
148
|
+
return {}
|
|
149
|
+
|
|
141
150
|
def _pretty_print_jobs(self, items: typing.List):
|
|
142
151
|
print(f"{'status':10} {'name':20} {'start':21} end")
|
|
143
152
|
for i in items:
|
mlrun/runtimes/mpijob/v1.py
CHANGED
|
@@ -13,17 +13,13 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import typing
|
|
15
15
|
from copy import deepcopy
|
|
16
|
-
from datetime import datetime
|
|
17
16
|
|
|
18
17
|
from kubernetes import client
|
|
19
|
-
from sqlalchemy.orm import Session
|
|
20
18
|
|
|
21
19
|
import mlrun.runtimes.pod
|
|
22
|
-
from mlrun.api.db.base import DBInterface
|
|
23
20
|
from mlrun.config import config as mlconf
|
|
24
21
|
from mlrun.execution import MLClientCtx
|
|
25
22
|
from mlrun.model import RunObject
|
|
26
|
-
from mlrun.runtimes.base import BaseRuntimeHandler, RunStates, RuntimeClassMode
|
|
27
23
|
from mlrun.runtimes.constants import MPIJobCRDVersions, MPIJobV1CleanPodPolicies
|
|
28
24
|
from mlrun.runtimes.mpijob.abstract import AbstractMPIJobRuntime, MPIResourceSpec
|
|
29
25
|
from mlrun.utils import get_in, update_in
|
|
@@ -318,75 +314,3 @@ class MpiRuntimeV1(AbstractMPIJobRuntime):
|
|
|
318
314
|
MpiRuntimeV1.crd_version,
|
|
319
315
|
MpiRuntimeV1.crd_plural,
|
|
320
316
|
)
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
class MpiV1RuntimeHandler(BaseRuntimeHandler):
|
|
324
|
-
kind = "mpijob"
|
|
325
|
-
class_modes = {
|
|
326
|
-
RuntimeClassMode.run: "mpijob",
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
def _resolve_crd_object_status_info(
|
|
330
|
-
self, db: DBInterface, db_session: Session, crd_object
|
|
331
|
-
) -> typing.Tuple[bool, typing.Optional[datetime], typing.Optional[str]]:
|
|
332
|
-
"""
|
|
333
|
-
https://github.com/kubeflow/mpi-operator/blob/master/pkg/apis/kubeflow/v1/types.go#L29
|
|
334
|
-
https://github.com/kubeflow/common/blob/master/pkg/apis/common/v1/types.go#L55
|
|
335
|
-
"""
|
|
336
|
-
launcher_status = (
|
|
337
|
-
crd_object.get("status", {}).get("replicaStatuses", {}).get("Launcher", {})
|
|
338
|
-
)
|
|
339
|
-
# the launcher status also has running property, but it's empty for
|
|
340
|
-
# short period after the creation, so we're
|
|
341
|
-
# checking terminal state by the completion time existence
|
|
342
|
-
in_terminal_state = (
|
|
343
|
-
crd_object.get("status", {}).get("completionTime", None) is not None
|
|
344
|
-
)
|
|
345
|
-
desired_run_state = RunStates.running
|
|
346
|
-
completion_time = None
|
|
347
|
-
if in_terminal_state:
|
|
348
|
-
completion_time = datetime.fromisoformat(
|
|
349
|
-
crd_object.get("status", {})
|
|
350
|
-
.get("completionTime")
|
|
351
|
-
.replace("Z", "+00:00")
|
|
352
|
-
)
|
|
353
|
-
desired_run_state = (
|
|
354
|
-
RunStates.error
|
|
355
|
-
if launcher_status.get("failed", 0) > 0
|
|
356
|
-
else RunStates.completed
|
|
357
|
-
)
|
|
358
|
-
return in_terminal_state, completion_time, desired_run_state
|
|
359
|
-
|
|
360
|
-
@staticmethod
|
|
361
|
-
def _are_resources_coupled_to_run_object() -> bool:
|
|
362
|
-
return True
|
|
363
|
-
|
|
364
|
-
@staticmethod
|
|
365
|
-
def _get_object_label_selector(object_id: str) -> str:
|
|
366
|
-
return f"mlrun/uid={object_id}"
|
|
367
|
-
|
|
368
|
-
@staticmethod
|
|
369
|
-
def _get_main_runtime_resource_label_selector() -> str:
|
|
370
|
-
"""
|
|
371
|
-
There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
|
|
372
|
-
we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
|
|
373
|
-
:return: the label selector
|
|
374
|
-
"""
|
|
375
|
-
return "mpi-job-role=launcher"
|
|
376
|
-
|
|
377
|
-
@staticmethod
|
|
378
|
-
def _get_run_completion_updates(run: dict) -> dict:
|
|
379
|
-
|
|
380
|
-
# TODO: add a 'workers' section in run objects state, each worker will update its state while
|
|
381
|
-
# the run state will be resolved by the server.
|
|
382
|
-
# update the run object state if empty so that it won't default to 'created' state
|
|
383
|
-
update_in(run, "status.state", "running", append=False, replace=False)
|
|
384
|
-
return {}
|
|
385
|
-
|
|
386
|
-
@staticmethod
|
|
387
|
-
def _get_crd_info() -> typing.Tuple[str, str, str]:
|
|
388
|
-
return (
|
|
389
|
-
MpiRuntimeV1.crd_group,
|
|
390
|
-
MpiRuntimeV1.crd_version,
|
|
391
|
-
MpiRuntimeV1.crd_plural,
|
|
392
|
-
)
|
|
@@ -13,18 +13,14 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import typing
|
|
15
15
|
from copy import deepcopy
|
|
16
|
-
from datetime import datetime
|
|
17
16
|
|
|
18
17
|
from kubernetes import client
|
|
19
|
-
from sqlalchemy.orm import Session
|
|
20
18
|
|
|
21
19
|
import mlrun.runtimes.pod
|
|
22
|
-
from mlrun.api.db.base import DBInterface
|
|
23
20
|
from mlrun.config import config as mlconf
|
|
24
21
|
from mlrun.execution import MLClientCtx
|
|
25
22
|
from mlrun.model import RunObject
|
|
26
|
-
from mlrun.runtimes.
|
|
27
|
-
from mlrun.runtimes.constants import MPIJobCRDVersions, MPIJobV1Alpha1States
|
|
23
|
+
from mlrun.runtimes.constants import MPIJobCRDVersions
|
|
28
24
|
from mlrun.runtimes.mpijob.abstract import AbstractMPIJobRuntime
|
|
29
25
|
from mlrun.utils import get_in, update_in
|
|
30
26
|
|
|
@@ -156,72 +152,3 @@ class MpiRuntimeV1Alpha1(AbstractMPIJobRuntime):
|
|
|
156
152
|
MpiRuntimeV1Alpha1.crd_version,
|
|
157
153
|
MpiRuntimeV1Alpha1.crd_plural,
|
|
158
154
|
)
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
class MpiV1Alpha1RuntimeHandler(BaseRuntimeHandler):
|
|
162
|
-
kind = "mpijob"
|
|
163
|
-
class_modes = {
|
|
164
|
-
RuntimeClassMode.run: "mpijob",
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
def _resolve_crd_object_status_info(
|
|
168
|
-
self, db: DBInterface, db_session: Session, crd_object
|
|
169
|
-
) -> typing.Tuple[bool, typing.Optional[datetime], typing.Optional[str]]:
|
|
170
|
-
"""
|
|
171
|
-
https://github.com/kubeflow/mpi-operator/blob/master/pkg/apis/kubeflow/v1alpha1/types.go#L115
|
|
172
|
-
"""
|
|
173
|
-
launcher_status = crd_object.get("status", {}).get("launcherStatus", "")
|
|
174
|
-
in_terminal_state = launcher_status in MPIJobV1Alpha1States.terminal_states()
|
|
175
|
-
desired_run_state = MPIJobV1Alpha1States.mpijob_state_to_run_state(
|
|
176
|
-
launcher_status
|
|
177
|
-
)
|
|
178
|
-
completion_time = None
|
|
179
|
-
if in_terminal_state:
|
|
180
|
-
completion_time = datetime.fromisoformat(
|
|
181
|
-
crd_object.get("status", {})
|
|
182
|
-
.get("completionTime")
|
|
183
|
-
.replace("Z", "+00:00")
|
|
184
|
-
)
|
|
185
|
-
desired_run_state = {
|
|
186
|
-
"Succeeded": RunStates.completed,
|
|
187
|
-
"Failed": RunStates.error,
|
|
188
|
-
}[launcher_status]
|
|
189
|
-
return in_terminal_state, completion_time, desired_run_state
|
|
190
|
-
|
|
191
|
-
@staticmethod
|
|
192
|
-
def _are_resources_coupled_to_run_object() -> bool:
|
|
193
|
-
return True
|
|
194
|
-
|
|
195
|
-
@staticmethod
|
|
196
|
-
def _get_object_label_selector(object_id: str) -> str:
|
|
197
|
-
return f"mlrun/uid={object_id}"
|
|
198
|
-
|
|
199
|
-
@staticmethod
|
|
200
|
-
def _get_main_runtime_resource_label_selector() -> str:
|
|
201
|
-
"""
|
|
202
|
-
There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
|
|
203
|
-
we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
|
|
204
|
-
:return: the label selector
|
|
205
|
-
"""
|
|
206
|
-
return "mpi_role_type=launcher"
|
|
207
|
-
|
|
208
|
-
@staticmethod
|
|
209
|
-
def _get_run_completion_updates(run: dict) -> dict:
|
|
210
|
-
|
|
211
|
-
# TODO: add a 'workers' section in run objects state, each worker will update its state while
|
|
212
|
-
# the run state will be resolved by the server.
|
|
213
|
-
# update the run object state if empty so that it won't default to 'created' state
|
|
214
|
-
update_in(run, "status.state", "running", append=False, replace=False)
|
|
215
|
-
return {}
|
|
216
|
-
|
|
217
|
-
@staticmethod
|
|
218
|
-
def _get_crd_info() -> typing.Tuple[str, str, str]:
|
|
219
|
-
return (
|
|
220
|
-
MpiRuntimeV1Alpha1.crd_group,
|
|
221
|
-
MpiRuntimeV1Alpha1.crd_version,
|
|
222
|
-
MpiRuntimeV1Alpha1.crd_plural,
|
|
223
|
-
)
|
|
224
|
-
|
|
225
|
-
@staticmethod
|
|
226
|
-
def _get_crd_object_status(crd_object) -> str:
|
|
227
|
-
return crd_object.get("status", {}).get("launcherStatus", "")
|
mlrun/runtimes/nuclio.py
CHANGED
|
@@ -16,7 +16,8 @@ import json
|
|
|
16
16
|
import os
|
|
17
17
|
import socket
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
import mlrun.db
|
|
20
|
+
|
|
20
21
|
from ..errors import err_to_str
|
|
21
22
|
from ..execution import MLClientCtx
|
|
22
23
|
from ..model import RunTemplate
|
|
@@ -60,7 +61,7 @@ def nuclio_jobs_handler(context, event):
|
|
|
60
61
|
status_code=400,
|
|
61
62
|
)
|
|
62
63
|
|
|
63
|
-
out = get_or_set_dburl()
|
|
64
|
+
out = mlrun.db.get_or_set_dburl()
|
|
64
65
|
if out:
|
|
65
66
|
context.logger.info(f"logging run results to: {out}")
|
|
66
67
|
|
mlrun/runtimes/pod.py
CHANGED
|
@@ -20,7 +20,6 @@ from enum import Enum
|
|
|
20
20
|
import dotenv
|
|
21
21
|
import kfp.dsl
|
|
22
22
|
import kubernetes.client as k8s_client
|
|
23
|
-
from deprecated import deprecated
|
|
24
23
|
|
|
25
24
|
import mlrun.errors
|
|
26
25
|
import mlrun.utils.regex
|
|
@@ -1014,15 +1013,6 @@ class KubeResource(BaseRuntime):
|
|
|
1014
1013
|
self.set_env(name, value)
|
|
1015
1014
|
return self
|
|
1016
1015
|
|
|
1017
|
-
# TODO: Remove in 1.5.0
|
|
1018
|
-
@deprecated(
|
|
1019
|
-
version="1.3.0",
|
|
1020
|
-
reason="'Job gpus' will be removed in 1.5.0, use 'with_limits' instead",
|
|
1021
|
-
category=FutureWarning,
|
|
1022
|
-
)
|
|
1023
|
-
def gpus(self, gpus, gpu_type="nvidia.com/gpu"):
|
|
1024
|
-
update_in(self.spec.resources, ["limits", gpu_type], gpus)
|
|
1025
|
-
|
|
1026
1016
|
def set_image_pull_configuration(
|
|
1027
1017
|
self, image_pull_policy: str = None, image_pull_secret_name: str = None
|
|
1028
1018
|
):
|
mlrun/runtimes/remotesparkjob.py
CHANGED
|
@@ -21,8 +21,7 @@ from mlrun.config import config
|
|
|
21
21
|
|
|
22
22
|
from ..model import RunObject
|
|
23
23
|
from ..platforms.iguazio import mount_v3io, mount_v3iod
|
|
24
|
-
from .
|
|
25
|
-
from .kubejob import KubejobRuntime, KubeRuntimeHandler
|
|
24
|
+
from .kubejob import KubejobRuntime
|
|
26
25
|
from .pod import KubeResourceSpec
|
|
27
26
|
|
|
28
27
|
|
|
@@ -210,18 +209,5 @@ class RemoteSparkRuntime(KubejobRuntime):
|
|
|
210
209
|
)
|
|
211
210
|
|
|
212
211
|
|
|
213
|
-
class RemoteSparkRuntimeHandler(KubeRuntimeHandler):
|
|
214
|
-
kind = "remote-spark"
|
|
215
|
-
class_modes = {RuntimeClassMode.run: "remote-spark"}
|
|
216
|
-
|
|
217
|
-
@staticmethod
|
|
218
|
-
def _are_resources_coupled_to_run_object() -> bool:
|
|
219
|
-
return True
|
|
220
|
-
|
|
221
|
-
@staticmethod
|
|
222
|
-
def _get_object_label_selector(object_id: str) -> str:
|
|
223
|
-
return f"mlrun/uid={object_id}"
|
|
224
|
-
|
|
225
|
-
|
|
226
212
|
def igz_spark_pre_hook():
|
|
227
213
|
run(["/bin/bash", "/etc/config/v3io/spark-job-init.sh"])
|
mlrun/runtimes/serving.py
CHANGED
|
@@ -474,7 +474,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
474
474
|
child_function = self._spec.function_refs[function_name]
|
|
475
475
|
trigger_args = stream.trigger_args or {}
|
|
476
476
|
|
|
477
|
-
if mlrun.mlconf.
|
|
477
|
+
if mlrun.mlconf.is_explicit_ack():
|
|
478
478
|
trigger_args["explicit_ack_mode"] = trigger_args.get(
|
|
479
479
|
"explicit_ack_mode", "explicitOnly"
|
|
480
480
|
)
|