mlrun 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -35
- mlrun/__main__.py +1 -40
- mlrun/api/api/api.py +6 -0
- mlrun/api/api/endpoints/feature_store.py +0 -4
- mlrun/api/api/endpoints/files.py +14 -2
- mlrun/api/api/endpoints/functions.py +6 -1
- mlrun/api/api/endpoints/logs.py +17 -3
- mlrun/api/api/endpoints/pipelines.py +1 -5
- mlrun/api/api/endpoints/projects.py +88 -0
- mlrun/api/api/endpoints/runs.py +48 -6
- mlrun/api/api/endpoints/workflows.py +355 -0
- mlrun/api/api/utils.py +1 -1
- mlrun/api/crud/__init__.py +1 -0
- mlrun/api/crud/client_spec.py +3 -0
- mlrun/api/crud/model_monitoring/deployment.py +36 -7
- mlrun/api/crud/model_monitoring/grafana.py +1 -1
- mlrun/api/crud/model_monitoring/helpers.py +32 -2
- mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
- mlrun/api/crud/notifications.py +9 -4
- mlrun/api/crud/pipelines.py +4 -9
- mlrun/api/crud/runtime_resources.py +4 -3
- mlrun/api/crud/secrets.py +21 -0
- mlrun/api/crud/workflows.py +352 -0
- mlrun/api/db/base.py +16 -1
- mlrun/api/db/sqldb/db.py +97 -16
- mlrun/api/launcher.py +26 -7
- mlrun/api/main.py +3 -4
- mlrun/{mlutils → api/rundb}/__init__.py +2 -6
- mlrun/{db → api/rundb}/sqldb.py +35 -83
- mlrun/api/runtime_handlers/__init__.py +56 -0
- mlrun/api/runtime_handlers/base.py +1247 -0
- mlrun/api/runtime_handlers/daskjob.py +209 -0
- mlrun/api/runtime_handlers/kubejob.py +37 -0
- mlrun/api/runtime_handlers/mpijob.py +147 -0
- mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
- mlrun/api/runtime_handlers/sparkjob.py +148 -0
- mlrun/api/utils/builder.py +1 -4
- mlrun/api/utils/clients/chief.py +14 -0
- mlrun/api/utils/scheduler.py +98 -15
- mlrun/api/utils/singletons/db.py +4 -0
- mlrun/artifacts/manager.py +1 -2
- mlrun/common/schemas/__init__.py +6 -0
- mlrun/common/schemas/auth.py +4 -1
- mlrun/common/schemas/client_spec.py +1 -1
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +11 -0
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/schemas/runs.py +1 -8
- mlrun/common/schemas/schedule.py +1 -8
- mlrun/common/schemas/workflow.py +54 -0
- mlrun/config.py +42 -40
- mlrun/datastore/sources.py +1 -1
- mlrun/db/__init__.py +4 -68
- mlrun/db/base.py +12 -0
- mlrun/db/factory.py +65 -0
- mlrun/db/httpdb.py +175 -19
- mlrun/db/nopdb.py +4 -2
- mlrun/execution.py +4 -2
- mlrun/feature_store/__init__.py +1 -0
- mlrun/feature_store/api.py +1 -2
- mlrun/feature_store/feature_set.py +0 -10
- mlrun/feature_store/feature_vector.py +340 -2
- mlrun/feature_store/ingestion.py +5 -10
- mlrun/feature_store/retrieval/base.py +118 -104
- mlrun/feature_store/retrieval/dask_merger.py +17 -10
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/local_merger.py +18 -18
- mlrun/feature_store/retrieval/spark_merger.py +21 -14
- mlrun/feature_store/retrieval/storey_merger.py +21 -15
- mlrun/kfpops.py +3 -9
- mlrun/launcher/base.py +3 -3
- mlrun/launcher/client.py +3 -2
- mlrun/launcher/factory.py +16 -13
- mlrun/lists.py +0 -11
- mlrun/model.py +9 -15
- mlrun/model_monitoring/helpers.py +15 -25
- mlrun/model_monitoring/model_monitoring_batch.py +72 -4
- mlrun/model_monitoring/prometheus.py +219 -0
- mlrun/model_monitoring/stores/__init__.py +15 -9
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
- mlrun/model_monitoring/stream_processing.py +181 -29
- mlrun/package/packager.py +6 -8
- mlrun/package/packagers/default_packager.py +121 -10
- mlrun/platforms/__init__.py +0 -2
- mlrun/platforms/iguazio.py +0 -56
- mlrun/projects/pipelines.py +57 -158
- mlrun/projects/project.py +6 -32
- mlrun/render.py +1 -1
- mlrun/run.py +2 -124
- mlrun/runtimes/__init__.py +6 -42
- mlrun/runtimes/base.py +26 -1241
- mlrun/runtimes/daskjob.py +2 -198
- mlrun/runtimes/function.py +16 -5
- mlrun/runtimes/kubejob.py +5 -29
- mlrun/runtimes/mpijob/__init__.py +2 -2
- mlrun/runtimes/mpijob/abstract.py +10 -1
- mlrun/runtimes/mpijob/v1.py +0 -76
- mlrun/runtimes/mpijob/v1alpha1.py +1 -74
- mlrun/runtimes/nuclio.py +3 -2
- mlrun/runtimes/pod.py +0 -10
- mlrun/runtimes/remotesparkjob.py +1 -15
- mlrun/runtimes/serving.py +1 -1
- mlrun/runtimes/sparkjob/__init__.py +0 -1
- mlrun/runtimes/sparkjob/abstract.py +4 -131
- mlrun/serving/states.py +1 -1
- mlrun/utils/db.py +0 -2
- mlrun/utils/helpers.py +19 -13
- mlrun/utils/notifications/notification_pusher.py +5 -25
- mlrun/utils/regex.py +7 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
- mlrun/mlutils/data.py +0 -160
- mlrun/mlutils/models.py +0 -78
- mlrun/mlutils/plots.py +0 -902
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
import typing
|
|
16
|
+
from typing import Dict, List, Optional, Union
|
|
17
|
+
|
|
18
|
+
from kubernetes.client.rest import ApiException
|
|
19
|
+
from sqlalchemy.orm import Session
|
|
20
|
+
|
|
21
|
+
import mlrun.common.schemas
|
|
22
|
+
import mlrun.errors
|
|
23
|
+
import mlrun.k8s_utils
|
|
24
|
+
import mlrun.utils
|
|
25
|
+
import mlrun.utils.regex
|
|
26
|
+
from mlrun.api.db.base import DBInterface
|
|
27
|
+
from mlrun.api.runtime_handlers.base import BaseRuntimeHandler
|
|
28
|
+
from mlrun.config import config
|
|
29
|
+
from mlrun.runtimes.base import RuntimeClassMode
|
|
30
|
+
from mlrun.runtimes.utils import get_k8s
|
|
31
|
+
from mlrun.utils import logger
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DaskRuntimeHandler(BaseRuntimeHandler):
    """
    Runtime handler for the dask runtime.

    Dask runtime resources (pods and the scheduler service) belong to a function
    rather than to a single run, which is why several base-handler hooks are
    specialized here.
    """

    kind = "dask"
    class_modes = {RuntimeClassMode.run: "dask"}

    def monitor_runs(
        self, db: DBInterface, db_session: Session, leader_session: Optional[str] = None
    ):
        # Dask runtime resources are per function (and not per run), so monitoring
        # runtime resources state says nothing about the run state. Dask run
        # monitoring is done completely by the SDK, hence this intentional no-op.
        return

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # for dask the object id is the function name rather than the run uid
        return f"mlrun/function={object_id}"

    @staticmethod
    def resolve_object_id(
        run: dict,
    ) -> typing.Optional[str]:
        """
        Resolves the object ID from the run object.
        In dask runtime, the object ID is the function name.

        :param run: run object
        :return: function name, or None when the run references no function
        """
        function_ref = run.get("spec", {}).get("function", None)
        if not function_ref:
            return None

        # a dask run's function field is in the format <project-name>/<function-name>@<run-uid>;
        # drop the uid suffix and keep only the function name
        without_uid = function_ref.split("@")[0]
        return without_uid.split("/")[-1]

    def _enrich_list_resources_response(
        self,
        response: Union[
            mlrun.common.schemas.RuntimeResources,
            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
        ],
        namespace: str,
        label_selector: str = None,
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ) -> Union[
        mlrun.common.schemas.RuntimeResources,
        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
    ]:
        """
        Handling listing service resources
        """
        if not self._validate_if_enrich_is_needed_by_group_by(group_by):
            return response
        services = get_k8s().v1api.list_namespaced_service(
            namespace, label_selector=label_selector
        )
        service_resources = [
            mlrun.common.schemas.RuntimeResource(
                name=service.metadata.name, labels=service.metadata.labels
            )
            for service in services.items
        ]
        return self._enrich_service_resources_in_response(
            response, service_resources, group_by
        )

    def _build_output_from_runtime_resources(
        self,
        response: Union[
            mlrun.common.schemas.RuntimeResources,
            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
        ],
        runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ):
        # collect service resources from the given runtime resources and merge
        # them into the response (when the requested grouping allows it)
        if not self._validate_if_enrich_is_needed_by_group_by(group_by):
            return response
        service_resources = []
        for runtime_resources in runtime_resources_list:
            service_resources.extend(runtime_resources.service_resources or [])
        return self._enrich_service_resources_in_response(
            response, service_resources, group_by
        )

    def _validate_if_enrich_is_needed_by_group_by(
        self,
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ) -> bool:
        # Dask runtime resources are per function (and not per job) therefore, when
        # grouping by job we're simply omitting the dask runtime resources
        if group_by is None:
            return True
        if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.job:
            return False
        if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project:
            return True
        raise NotImplementedError(
            f"Provided group by field is not supported. group_by={group_by}"
        )

    def _enrich_service_resources_in_response(
        self,
        response: Union[
            mlrun.common.schemas.RuntimeResources,
            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
        ],
        service_resources: List[mlrun.common.schemas.RuntimeResource],
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ):
        # when not grouping by project, service resources are attached flat on the
        # response; otherwise each one is routed into its project's bucket
        if group_by != mlrun.common.schemas.ListRuntimeResourcesGroupByField.project:
            response.service_resources = service_resources
            return response
        for service_resource in service_resources:
            self._add_resource_to_grouped_by_project_resources_response(
                response, "service_resources", service_resource
            )
        return response

    def _delete_extra_resources(
        self,
        db: DBInterface,
        db_session: Session,
        namespace: str,
        deleted_resources: List[Dict],
        label_selector: str = None,
        force: bool = False,
        grace_period: int = None,
    ):
        """
        Handling services deletion
        """
        if grace_period is None:
            grace_period = config.runtime_resources_deletion_grace_period

        # a deleted scheduler pod marks its cluster's service for deletion -
        # the service name equals the dask cluster name
        service_names = []
        for pod_dict in deleted_resources:
            pod_labels = pod_dict["metadata"].get("labels", {})
            cluster_name = pod_labels.get("dask.org/cluster-name")
            if pod_labels.get("dask.org/component") == "scheduler" and cluster_name:
                service_names.append(cluster_name)

        services = get_k8s().v1api.list_namespaced_service(
            namespace, label_selector=label_selector
        )
        for service in services.items:
            try:
                if force or service.metadata.name in service_names:
                    get_k8s().v1api.delete_namespaced_service(
                        service.metadata.name, namespace
                    )
                    logger.info(f"Deleted service: {service.metadata.name}")
            except ApiException as exc:
                # ignore error if service is already removed
                if exc.status != 404:
                    raise
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from mlrun.api.runtime_handlers.base import BaseRuntimeHandler
|
|
16
|
+
from mlrun.runtimes.base import RuntimeClassMode
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class KubeRuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for plain kubernetes jobs (including image builds)."""

    kind = "job"
    class_modes = {RuntimeClassMode.run: "job", RuntimeClassMode.build: "build"}

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        # each job pod belongs to exactly one run object
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # the object id for jobs is the run uid
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _expect_pods_without_uid() -> bool:
        """
        builder pods are handled as part of this runtime handler - they are not
        coupled to a run object, therefore they don't have the uid in their labels
        """
        return True
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import typing
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
|
|
18
|
+
from sqlalchemy.orm import Session
|
|
19
|
+
|
|
20
|
+
from mlrun.api.db.base import DBInterface
|
|
21
|
+
from mlrun.api.runtime_handlers import BaseRuntimeHandler
|
|
22
|
+
from mlrun.runtimes.base import RuntimeClassMode
|
|
23
|
+
from mlrun.runtimes.constants import MPIJobV1Alpha1States, RunStates
|
|
24
|
+
from mlrun.runtimes.mpijob import MpiRuntimeV1, MpiRuntimeV1Alpha1
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class MpiV1Alpha1RuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for MPIJob CRDs of the legacy v1alpha1 API."""

    kind = "mpijob"
    class_modes = {
        RuntimeClassMode.run: "mpijob",
    }

    def _resolve_crd_object_status_info(
        self, db: DBInterface, db_session: Session, crd_object
    ) -> typing.Tuple[bool, typing.Optional[datetime], typing.Optional[str]]:
        """
        https://github.com/kubeflow/mpi-operator/blob/master/pkg/apis/kubeflow/v1alpha1/types.go#L115
        """
        status = crd_object.get("status", {})
        launcher_status = status.get("launcherStatus", "")
        in_terminal_state = launcher_status in MPIJobV1Alpha1States.terminal_states()
        desired_run_state = MPIJobV1Alpha1States.mpijob_state_to_run_state(
            launcher_status
        )
        completion_time = None
        if in_terminal_state:
            completion_time = datetime.fromisoformat(
                status.get("completionTime").replace("Z", "+00:00")
            )
            # map the terminal launcher status to the corresponding run state
            desired_run_state = {
                "Succeeded": RunStates.completed,
                "Failed": RunStates.error,
            }[launcher_status]
        return in_terminal_state, completion_time, desired_run_state

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _get_main_runtime_resource_label_selector() -> str:
        """
        There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
        we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
        :return: the label selector
        """
        return "mpi_role_type=launcher"

    @staticmethod
    def _get_crd_info() -> typing.Tuple[str, str, str]:
        # (group, version, plural) of the v1alpha1 MPIJob CRD
        return (
            MpiRuntimeV1Alpha1.crd_group,
            MpiRuntimeV1Alpha1.crd_version,
            MpiRuntimeV1Alpha1.crd_plural,
        )

    @staticmethod
    def _get_crd_object_status(crd_object) -> str:
        return crd_object.get("status", {}).get("launcherStatus", "")
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class MpiV1RuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for MPIJob CRDs of the v1 API."""

    kind = "mpijob"
    class_modes = {
        RuntimeClassMode.run: "mpijob",
    }

    def _resolve_crd_object_status_info(
        self, db: DBInterface, db_session: Session, crd_object
    ) -> typing.Tuple[bool, typing.Optional[datetime], typing.Optional[str]]:
        """
        https://github.com/kubeflow/mpi-operator/blob/master/pkg/apis/kubeflow/v1/types.go#L29
        https://github.com/kubeflow/common/blob/master/pkg/apis/common/v1/types.go#L55
        """
        status = crd_object.get("status", {})
        launcher_status = status.get("replicaStatuses", {}).get("Launcher", {})
        # the launcher status also has running property, but it's empty for
        # short period after the creation, so we're
        # checking terminal state by the completion time existence
        in_terminal_state = status.get("completionTime", None) is not None
        desired_run_state = RunStates.running
        completion_time = None
        if in_terminal_state:
            completion_time = datetime.fromisoformat(
                status.get("completionTime").replace("Z", "+00:00")
            )
            launcher_failed = launcher_status.get("failed", 0) > 0
            desired_run_state = (
                RunStates.error if launcher_failed else RunStates.completed
            )
        return in_terminal_state, completion_time, desired_run_state

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _get_main_runtime_resource_label_selector() -> str:
        """
        There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
        we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
        :return: the label selector
        """
        return "mpi-job-role=launcher"

    @staticmethod
    def _get_crd_info() -> typing.Tuple[str, str, str]:
        # (group, version, plural) of the v1 MPIJob CRD
        return (
            MpiRuntimeV1.crd_group,
            MpiRuntimeV1.crd_version,
            MpiRuntimeV1.crd_plural,
        )
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from mlrun.api.runtime_handlers.kubejob import KubeRuntimeHandler
|
|
16
|
+
from mlrun.runtimes.base import RuntimeClassMode
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class RemoteSparkRuntimeHandler(KubeRuntimeHandler):
    """Runtime handler for remote-spark runs, which run as regular kube jobs."""

    kind = "remote-spark"
    class_modes = {RuntimeClassMode.run: "remote-spark"}

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # the object id is the run uid, same as for plain kube jobs
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        # each remote-spark pod belongs to exactly one run object
        return True
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Copyright 2023 Iguazio
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import typing
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
from typing import Dict, Optional, Tuple
|
|
17
|
+
|
|
18
|
+
from kubernetes.client.rest import ApiException
|
|
19
|
+
from sqlalchemy.orm import Session
|
|
20
|
+
|
|
21
|
+
from mlrun.api.db.base import DBInterface
|
|
22
|
+
from mlrun.api.runtime_handlers.base import BaseRuntimeHandler
|
|
23
|
+
from mlrun.runtimes.base import RuntimeClassMode
|
|
24
|
+
from mlrun.runtimes.constants import RunStates, SparkApplicationStates
|
|
25
|
+
from mlrun.runtimes.sparkjob.abstract import AbstractSparkRuntime
|
|
26
|
+
from mlrun.runtimes.utils import get_k8s
|
|
27
|
+
from mlrun.utils import logger
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SparkRuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for SparkApplication CRDs (spark operator)."""

    kind = "spark"
    class_modes = {
        RuntimeClassMode.run: "spark",
    }

    def _resolve_crd_object_status_info(
        self, db: DBInterface, db_session: Session, crd_object
    ) -> Tuple[bool, Optional[datetime], Optional[str]]:
        # derive (terminal?, completion time, desired run state) from the
        # SparkApplication's applicationState
        status = crd_object.get("status", {})
        state = status.get("applicationState", {}).get("state")
        in_terminal_state = state in SparkApplicationStates.terminal_states()
        desired_run_state = SparkApplicationStates.spark_application_state_to_run_state(
            state
        )
        completion_time = None
        if in_terminal_state:
            termination_time = status.get("terminationTime")
            if termination_time:
                completion_time = datetime.fromisoformat(
                    termination_time.replace("Z", "+00:00")
                )
            else:
                # no termination time - fall back to the last submission attempt time
                last_submission_attempt_time = status.get("lastSubmissionAttemptTime")
                if last_submission_attempt_time:
                    completion_time = datetime.fromisoformat(
                        last_submission_attempt_time.replace("Z", "+00:00")
                    )
        return in_terminal_state, completion_time, desired_run_state

    def _update_ui_url(
        self,
        db: DBInterface,
        db_session: Session,
        project: str,
        uid: str,
        crd_object,
        run: Dict = None,
    ):
        # mirror the spark driver's web UI ingress address into the run status
        app_state = (
            crd_object.get("status", {}).get("applicationState", {}).get("state")
        )
        state = SparkApplicationStates.spark_application_state_to_run_state(app_state)
        ui_url = None
        if state == RunStates.running:
            ui_url = (
                crd_object.get("status", {})
                .get("driverInfo", {})
                .get("webUIIngressAddress")
            )
        # skip the DB write when nothing changed
        if run.get("status", {}).get("ui_url") == ui_url:
            return
        run.setdefault("status", {})["ui_url"] = ui_url
        db.store_run(db_session, run, uid, project)

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _get_main_runtime_resource_label_selector() -> str:
        """
        There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
        we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
        :return: the label selector
        """
        return "spark-role=driver"

    @staticmethod
    def _get_crd_info() -> Tuple[str, str, str]:
        # (group, version, plural) of the SparkApplication CRD
        return (
            AbstractSparkRuntime.group,
            AbstractSparkRuntime.version,
            AbstractSparkRuntime.plural,
        )

    def _delete_extra_resources(
        self,
        db: DBInterface,
        db_session: Session,
        namespace: str,
        deleted_resources: typing.List[Dict],
        label_selector: str = None,
        force: bool = False,
        grace_period: int = None,
    ):
        """
        Handling config maps deletion
        """
        # the uids of the deleted CRDs mark which config maps may also be removed
        deleted_uids = [
            crd_dict["metadata"].get("labels", {}).get("mlrun/uid", None)
            for crd_dict in deleted_resources
        ]

        config_maps = get_k8s().v1api.list_namespaced_config_map(
            namespace, label_selector=label_selector
        )
        for config_map in config_maps.items:
            try:
                uid = config_map.metadata.labels.get("mlrun/uid", None)
                if force or uid in deleted_uids:
                    get_k8s().v1api.delete_namespaced_config_map(
                        config_map.metadata.name, namespace
                    )
                    logger.info(f"Deleted config map: {config_map.metadata.name}")
            except ApiException as exc:
                # ignore error if config map is already removed
                if exc.status != 404:
                    raise
|
mlrun/api/utils/builder.py
CHANGED
|
@@ -420,10 +420,7 @@ def build_image(
|
|
|
420
420
|
# use a temp dir for permissions and set it as the workdir
|
|
421
421
|
tmpdir = tempfile.mkdtemp()
|
|
422
422
|
relative_workdir = runtime.spec.clone_target_dir or ""
|
|
423
|
-
|
|
424
|
-
# TODO: use 'removeprefix' when we drop python 3.7 support
|
|
425
|
-
# relative_workdir.removeprefix("./")
|
|
426
|
-
relative_workdir = relative_workdir[2:]
|
|
423
|
+
relative_workdir = relative_workdir.removeprefix("./")
|
|
427
424
|
|
|
428
425
|
runtime.spec.clone_target_dir = path.join(tmpdir, "mlrun", relative_workdir)
|
|
429
426
|
|
mlrun/api/utils/clients/chief.py
CHANGED
|
@@ -98,6 +98,20 @@ class Client(
|
|
|
98
98
|
"DELETE", f"projects/{project}/schedules/{name}", request
|
|
99
99
|
)
|
|
100
100
|
|
|
101
|
+
async def submit_workflow(
|
|
102
|
+
self,
|
|
103
|
+
project: str,
|
|
104
|
+
name: str,
|
|
105
|
+
request: fastapi.Request,
|
|
106
|
+
json: dict,
|
|
107
|
+
) -> fastapi.Response:
|
|
108
|
+
"""
|
|
109
|
+
Workflow schedules are running only on chief
|
|
110
|
+
"""
|
|
111
|
+
return await self._proxy_request_to_chief(
|
|
112
|
+
"POST", f"projects/{project}/workflows/{name}/submit", request, json
|
|
113
|
+
)
|
|
114
|
+
|
|
101
115
|
async def delete_schedules(
|
|
102
116
|
self, project: str, request: fastapi.Request
|
|
103
117
|
) -> fastapi.Response:
|