mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; details are available on the package registry's advisory page for this release.

Files changed (184)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +3 -41
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/frontend_spec.py +2 -1
  7. mlrun/api/api/endpoints/functions.py +95 -59
  8. mlrun/api/api/endpoints/grafana_proxy.py +9 -9
  9. mlrun/api/api/endpoints/logs.py +17 -3
  10. mlrun/api/api/endpoints/model_endpoints.py +3 -2
  11. mlrun/api/api/endpoints/pipelines.py +1 -5
  12. mlrun/api/api/endpoints/projects.py +88 -0
  13. mlrun/api/api/endpoints/runs.py +48 -6
  14. mlrun/api/api/endpoints/submit.py +2 -1
  15. mlrun/api/api/endpoints/workflows.py +355 -0
  16. mlrun/api/api/utils.py +3 -4
  17. mlrun/api/crud/__init__.py +1 -0
  18. mlrun/api/crud/client_spec.py +6 -2
  19. mlrun/api/crud/feature_store.py +5 -0
  20. mlrun/api/crud/model_monitoring/__init__.py +1 -0
  21. mlrun/api/crud/model_monitoring/deployment.py +497 -0
  22. mlrun/api/crud/model_monitoring/grafana.py +96 -42
  23. mlrun/api/crud/model_monitoring/helpers.py +159 -0
  24. mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
  25. mlrun/api/crud/notifications.py +9 -4
  26. mlrun/api/crud/pipelines.py +6 -11
  27. mlrun/api/crud/projects.py +2 -2
  28. mlrun/api/crud/runtime_resources.py +4 -3
  29. mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
  30. mlrun/api/crud/secrets.py +21 -0
  31. mlrun/api/crud/workflows.py +352 -0
  32. mlrun/api/db/base.py +16 -1
  33. mlrun/api/db/init_db.py +2 -4
  34. mlrun/api/db/session.py +1 -1
  35. mlrun/api/db/sqldb/db.py +129 -31
  36. mlrun/api/db/sqldb/models/models_mysql.py +15 -1
  37. mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
  38. mlrun/api/launcher.py +38 -6
  39. mlrun/api/main.py +3 -2
  40. mlrun/api/rundb/__init__.py +13 -0
  41. mlrun/{db → api/rundb}/sqldb.py +36 -84
  42. mlrun/api/runtime_handlers/__init__.py +56 -0
  43. mlrun/api/runtime_handlers/base.py +1247 -0
  44. mlrun/api/runtime_handlers/daskjob.py +209 -0
  45. mlrun/api/runtime_handlers/kubejob.py +37 -0
  46. mlrun/api/runtime_handlers/mpijob.py +147 -0
  47. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  48. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  49. mlrun/api/schemas/__init__.py +17 -6
  50. mlrun/api/utils/builder.py +1 -4
  51. mlrun/api/utils/clients/chief.py +14 -0
  52. mlrun/api/utils/clients/iguazio.py +33 -33
  53. mlrun/api/utils/clients/nuclio.py +2 -2
  54. mlrun/api/utils/periodic.py +9 -2
  55. mlrun/api/utils/projects/follower.py +14 -7
  56. mlrun/api/utils/projects/leader.py +2 -1
  57. mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
  58. mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
  59. mlrun/api/utils/runtimes/__init__.py +14 -0
  60. mlrun/api/utils/runtimes/nuclio.py +43 -0
  61. mlrun/api/utils/scheduler.py +98 -15
  62. mlrun/api/utils/singletons/db.py +5 -1
  63. mlrun/api/utils/singletons/project_member.py +4 -1
  64. mlrun/api/utils/singletons/scheduler.py +1 -1
  65. mlrun/artifacts/base.py +6 -6
  66. mlrun/artifacts/dataset.py +4 -4
  67. mlrun/artifacts/manager.py +2 -3
  68. mlrun/artifacts/model.py +2 -2
  69. mlrun/artifacts/plots.py +8 -8
  70. mlrun/common/db/__init__.py +14 -0
  71. mlrun/common/helpers.py +37 -0
  72. mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
  73. mlrun/common/model_monitoring/helpers.py +69 -0
  74. mlrun/common/schemas/__init__.py +13 -1
  75. mlrun/common/schemas/auth.py +4 -1
  76. mlrun/common/schemas/client_spec.py +1 -1
  77. mlrun/common/schemas/function.py +17 -0
  78. mlrun/common/schemas/model_monitoring/__init__.py +48 -0
  79. mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
  80. mlrun/common/schemas/model_monitoring/grafana.py +55 -0
  81. mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
  82. mlrun/common/schemas/notification.py +1 -0
  83. mlrun/common/schemas/object.py +4 -0
  84. mlrun/common/schemas/project.py +1 -0
  85. mlrun/common/schemas/regex.py +1 -1
  86. mlrun/common/schemas/runs.py +1 -8
  87. mlrun/common/schemas/schedule.py +1 -8
  88. mlrun/common/schemas/workflow.py +54 -0
  89. mlrun/config.py +45 -42
  90. mlrun/datastore/__init__.py +21 -0
  91. mlrun/datastore/base.py +1 -1
  92. mlrun/datastore/datastore.py +9 -0
  93. mlrun/datastore/dbfs_store.py +168 -0
  94. mlrun/datastore/helpers.py +18 -0
  95. mlrun/datastore/sources.py +1 -0
  96. mlrun/datastore/store_resources.py +2 -5
  97. mlrun/datastore/v3io.py +1 -2
  98. mlrun/db/__init__.py +4 -68
  99. mlrun/db/base.py +12 -0
  100. mlrun/db/factory.py +65 -0
  101. mlrun/db/httpdb.py +175 -20
  102. mlrun/db/nopdb.py +4 -2
  103. mlrun/execution.py +4 -2
  104. mlrun/feature_store/__init__.py +1 -0
  105. mlrun/feature_store/api.py +1 -2
  106. mlrun/feature_store/common.py +2 -1
  107. mlrun/feature_store/feature_set.py +1 -11
  108. mlrun/feature_store/feature_vector.py +340 -2
  109. mlrun/feature_store/ingestion.py +5 -10
  110. mlrun/feature_store/retrieval/base.py +118 -104
  111. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  112. mlrun/feature_store/retrieval/job.py +4 -1
  113. mlrun/feature_store/retrieval/local_merger.py +18 -18
  114. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  115. mlrun/feature_store/retrieval/storey_merger.py +22 -16
  116. mlrun/kfpops.py +3 -9
  117. mlrun/launcher/base.py +57 -53
  118. mlrun/launcher/client.py +5 -4
  119. mlrun/launcher/factory.py +24 -13
  120. mlrun/launcher/local.py +6 -6
  121. mlrun/launcher/remote.py +4 -4
  122. mlrun/lists.py +0 -11
  123. mlrun/model.py +11 -17
  124. mlrun/model_monitoring/__init__.py +2 -22
  125. mlrun/model_monitoring/features_drift_table.py +1 -1
  126. mlrun/model_monitoring/helpers.py +22 -210
  127. mlrun/model_monitoring/model_endpoint.py +1 -1
  128. mlrun/model_monitoring/model_monitoring_batch.py +127 -50
  129. mlrun/model_monitoring/prometheus.py +219 -0
  130. mlrun/model_monitoring/stores/__init__.py +16 -11
  131. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
  132. mlrun/model_monitoring/stores/models/mysql.py +47 -29
  133. mlrun/model_monitoring/stores/models/sqlite.py +47 -29
  134. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
  135. mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
  136. mlrun/model_monitoring/tracking_policy.py +104 -0
  137. mlrun/package/packager.py +6 -8
  138. mlrun/package/packagers/default_packager.py +121 -10
  139. mlrun/package/packagers/numpy_packagers.py +1 -1
  140. mlrun/platforms/__init__.py +0 -2
  141. mlrun/platforms/iguazio.py +0 -56
  142. mlrun/projects/pipelines.py +53 -159
  143. mlrun/projects/project.py +10 -37
  144. mlrun/render.py +1 -1
  145. mlrun/run.py +8 -124
  146. mlrun/runtimes/__init__.py +6 -42
  147. mlrun/runtimes/base.py +29 -1249
  148. mlrun/runtimes/daskjob.py +2 -198
  149. mlrun/runtimes/funcdoc.py +0 -9
  150. mlrun/runtimes/function.py +25 -29
  151. mlrun/runtimes/kubejob.py +5 -29
  152. mlrun/runtimes/local.py +1 -1
  153. mlrun/runtimes/mpijob/__init__.py +2 -2
  154. mlrun/runtimes/mpijob/abstract.py +10 -1
  155. mlrun/runtimes/mpijob/v1.py +0 -76
  156. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  157. mlrun/runtimes/nuclio.py +3 -2
  158. mlrun/runtimes/pod.py +28 -18
  159. mlrun/runtimes/remotesparkjob.py +1 -15
  160. mlrun/runtimes/serving.py +14 -6
  161. mlrun/runtimes/sparkjob/__init__.py +0 -1
  162. mlrun/runtimes/sparkjob/abstract.py +4 -131
  163. mlrun/runtimes/utils.py +0 -26
  164. mlrun/serving/routers.py +7 -7
  165. mlrun/serving/server.py +11 -8
  166. mlrun/serving/states.py +7 -1
  167. mlrun/serving/v2_serving.py +6 -6
  168. mlrun/utils/helpers.py +23 -42
  169. mlrun/utils/notifications/notification/__init__.py +4 -0
  170. mlrun/utils/notifications/notification/webhook.py +61 -0
  171. mlrun/utils/notifications/notification_pusher.py +5 -25
  172. mlrun/utils/regex.py +7 -2
  173. mlrun/utils/version/version.json +2 -2
  174. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
  175. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
  176. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  177. mlrun/mlutils/data.py +0 -160
  178. mlrun/mlutils/models.py +0 -78
  179. mlrun/mlutils/plots.py +0 -902
  180. mlrun/utils/model_monitoring.py +0 -249
  181. /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
  182. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  183. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  184. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,209 @@
1
+ # Copyright 2023 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ import typing
16
+ from typing import Dict, List, Optional, Union
17
+
18
+ from kubernetes.client.rest import ApiException
19
+ from sqlalchemy.orm import Session
20
+
21
+ import mlrun.common.schemas
22
+ import mlrun.errors
23
+ import mlrun.k8s_utils
24
+ import mlrun.utils
25
+ import mlrun.utils.regex
26
+ from mlrun.api.db.base import DBInterface
27
+ from mlrun.api.runtime_handlers.base import BaseRuntimeHandler
28
+ from mlrun.config import config
29
+ from mlrun.runtimes.base import RuntimeClassMode
30
+ from mlrun.runtimes.utils import get_k8s
31
+ from mlrun.utils import logger
32
+
33
+
34
class DaskRuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for the dask runtime.

    Dask k8s resources (pods and the scheduler service) are created per
    *function* rather than per run, which drives most of the special-casing
    in this handler.
    """

    kind = "dask"
    class_modes = {RuntimeClassMode.run: "dask"}

    def monitor_runs(
        self, db: DBInterface, db_session: Session, leader_session: Optional[str] = None
    ):
        # Dask runtime resources are per function (and not per run).
        # It means that monitoring runtime resources state doesn't say anything
        # about the run state. Therefore dask run monitoring is done completely
        # by the SDK, so this override is intentionally a no-op.
        return

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # resources are labeled by function name, not by run uid (see class doc)
        return f"mlrun/function={object_id}"

    @staticmethod
    def resolve_object_id(
        run: dict,
    ) -> typing.Optional[str]:
        """
        Resolves the object ID from the run object.
        In dask runtime, the object ID is the function name.

        :param run: run object
        :return: function name, or None when the run has no function reference
        """
        function = run.get("spec", {}).get("function", None)
        if not function:
            return None

        # a dask run's function field is in the format
        # <project-name>/<function-name>@<run-uid> - we only want the function name
        project_and_function, _, _ = function.partition("@")
        return project_and_function.rsplit("/", 1)[-1]

    def _enrich_list_resources_response(
        self,
        response: Union[
            mlrun.common.schemas.RuntimeResources,
            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
        ],
        namespace: str,
        label_selector: str = None,
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ) -> Union[
        mlrun.common.schemas.RuntimeResources,
        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
    ]:
        """
        Handling listing service resources
        """
        if not self._validate_if_enrich_is_needed_by_group_by(group_by):
            return response
        k8s_services = get_k8s().v1api.list_namespaced_service(
            namespace, label_selector=label_selector
        )
        # translate k8s service objects into mlrun runtime resources
        service_resources = [
            mlrun.common.schemas.RuntimeResource(
                name=k8s_service.metadata.name, labels=k8s_service.metadata.labels
            )
            for k8s_service in k8s_services.items
        ]
        return self._enrich_service_resources_in_response(
            response, service_resources, group_by
        )

    def _build_output_from_runtime_resources(
        self,
        response: Union[
            mlrun.common.schemas.RuntimeResources,
            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
        ],
        runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ):
        """Aggregate service resources from the given list into the response."""
        if not self._validate_if_enrich_is_needed_by_group_by(group_by):
            return response
        collected_service_resources = []
        for runtime_resources in runtime_resources_list:
            if runtime_resources.service_resources:
                collected_service_resources.extend(
                    runtime_resources.service_resources
                )
        return self._enrich_service_resources_in_response(
            response, collected_service_resources, group_by
        )

    def _validate_if_enrich_is_needed_by_group_by(
        self,
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ) -> bool:
        """Return whether service resources should be added for this grouping."""
        # Dask runtime resources are per function (and not per job) therefore, when
        # grouping by job we're simply omitting the dask runtime resources
        if group_by is None:
            return True
        if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.job:
            return False
        if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project:
            return True
        raise NotImplementedError(
            f"Provided group by field is not supported. group_by={group_by}"
        )

    def _enrich_service_resources_in_response(
        self,
        response: Union[
            mlrun.common.schemas.RuntimeResources,
            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
        ],
        service_resources: List[mlrun.common.schemas.RuntimeResource],
        group_by: Optional[
            mlrun.common.schemas.ListRuntimeResourcesGroupByField
        ] = None,
    ):
        """Attach the given service resources to the (possibly grouped) response."""
        if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project:
            for service_resource in service_resources:
                self._add_resource_to_grouped_by_project_resources_response(
                    response, "service_resources", service_resource
                )
        else:
            response.service_resources = service_resources
        return response

    def _delete_extra_resources(
        self,
        db: DBInterface,
        db_session: Session,
        namespace: str,
        deleted_resources: List[Dict],
        label_selector: str = None,
        force: bool = False,
        grace_period: int = None,
    ):
        """
        Handling services deletion
        """
        if grace_period is None:
            grace_period = config.runtime_resources_deletion_grace_period

        # a deleted scheduler pod means its whole cluster is going away, so the
        # matching scheduler service should be removed as well
        scheduler_cluster_names = []
        for pod_dict in deleted_resources:
            pod_labels = pod_dict["metadata"].get("labels", {})
            cluster_name = pod_labels.get("dask.org/cluster-name")
            if pod_labels.get("dask.org/component") == "scheduler" and cluster_name:
                scheduler_cluster_names.append(cluster_name)

        services = get_k8s().v1api.list_namespaced_service(
            namespace, label_selector=label_selector
        )
        for service in services.items:
            try:
                if force or service.metadata.name in scheduler_cluster_names:
                    get_k8s().v1api.delete_namespaced_service(
                        service.metadata.name, namespace
                    )
                    logger.info(f"Deleted service: {service.metadata.name}")
            except ApiException as exc:
                # ignore error if service is already removed
                if exc.status != 404:
                    raise
@@ -0,0 +1,37 @@
1
+ # Copyright 2023 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from mlrun.api.runtime_handlers.base import BaseRuntimeHandler
16
+ from mlrun.runtimes.base import RuntimeClassMode
17
+
18
+
19
class KubeRuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for the plain kubernetes job runtime.

    Also responsible for builder pods, which are created by this handler's
    runtime but are not tied to any specific run object.
    """

    kind = "job"
    class_modes = {RuntimeClassMode.run: "job", RuntimeClassMode.build: "build"}

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        # job pods belong to a specific run and are tracked through it
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # run pods are labeled with the run uid
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _expect_pods_without_uid() -> bool:
        """
        builder pods are handled as part of this runtime handler - they are not coupled to run object, therefore they
        don't have the uid in their labels
        """
        return True
@@ -0,0 +1,147 @@
1
+ # Copyright 2023 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import typing
16
+ from datetime import datetime
17
+
18
+ from sqlalchemy.orm import Session
19
+
20
+ from mlrun.api.db.base import DBInterface
21
+ from mlrun.api.runtime_handlers import BaseRuntimeHandler
22
+ from mlrun.runtimes.base import RuntimeClassMode
23
+ from mlrun.runtimes.constants import MPIJobV1Alpha1States, RunStates
24
+ from mlrun.runtimes.mpijob import MpiRuntimeV1, MpiRuntimeV1Alpha1
25
+
26
+
27
class MpiV1Alpha1RuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for MPIJob CRDs of the legacy v1alpha1 API."""

    kind = "mpijob"
    class_modes = {
        RuntimeClassMode.run: "mpijob",
    }

    def _resolve_crd_object_status_info(
        self, db: DBInterface, db_session: Session, crd_object
    ) -> typing.Tuple[bool, typing.Optional[datetime], typing.Optional[str]]:
        """
        https://github.com/kubeflow/mpi-operator/blob/master/pkg/apis/kubeflow/v1alpha1/types.go#L115
        """
        status = crd_object.get("status", {})
        launcher_status = status.get("launcherStatus", "")
        in_terminal_state = launcher_status in MPIJobV1Alpha1States.terminal_states()
        desired_run_state = MPIJobV1Alpha1States.mpijob_state_to_run_state(
            launcher_status
        )
        completion_time = None
        if in_terminal_state:
            # completionTime is RFC3339 with a trailing Z - normalize it for
            # datetime.fromisoformat
            completion_time = datetime.fromisoformat(
                status.get("completionTime").replace("Z", "+00:00")
            )
            # NOTE(review): raises KeyError for any terminal launcher status other
            # than Succeeded/Failed - assumed those are the only terminal states
            desired_run_state = {
                "Succeeded": RunStates.completed,
                "Failed": RunStates.error,
            }[launcher_status]
        return in_terminal_state, completion_time, desired_run_state

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        # MPIJob resources belong to a specific run
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # resources are labeled with the run uid
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _get_main_runtime_resource_label_selector() -> str:
        """
        There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
        we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
        :return: the label selector
        """
        return "mpi_role_type=launcher"

    @staticmethod
    def _get_crd_info() -> typing.Tuple[str, str, str]:
        # (group, version, plural) identifying the v1alpha1 MPIJob CRD
        return (
            MpiRuntimeV1Alpha1.crd_group,
            MpiRuntimeV1Alpha1.crd_version,
            MpiRuntimeV1Alpha1.crd_plural,
        )

    @staticmethod
    def _get_crd_object_status(crd_object) -> str:
        # v1alpha1 exposes a flat launcherStatus string
        return crd_object.get("status", {}).get("launcherStatus", "")
87
class MpiV1RuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for MPIJob CRDs of the v1 API."""

    kind = "mpijob"
    class_modes = {
        RuntimeClassMode.run: "mpijob",
    }

    def _resolve_crd_object_status_info(
        self, db: DBInterface, db_session: Session, crd_object
    ) -> typing.Tuple[bool, typing.Optional[datetime], typing.Optional[str]]:
        """
        https://github.com/kubeflow/mpi-operator/blob/master/pkg/apis/kubeflow/v1/types.go#L29
        https://github.com/kubeflow/common/blob/master/pkg/apis/common/v1/types.go#L55
        """
        status = crd_object.get("status", {})
        launcher_status = status.get("replicaStatuses", {}).get("Launcher", {})
        # the launcher status also has running property, but it's empty for
        # short period after the creation, so we're
        # checking terminal state by the completion time existence
        completion_timestamp = status.get("completionTime", None)
        in_terminal_state = completion_timestamp is not None
        desired_run_state = RunStates.running
        completion_time = None
        if in_terminal_state:
            # normalize the RFC3339 trailing Z for datetime.fromisoformat
            completion_time = datetime.fromisoformat(
                completion_timestamp.replace("Z", "+00:00")
            )
            launcher_failed = launcher_status.get("failed", 0) > 0
            desired_run_state = (
                RunStates.error if launcher_failed else RunStates.completed
            )
        return in_terminal_state, completion_time, desired_run_state

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        # MPIJob resources belong to a specific run
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # resources are labeled with the run uid
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _get_main_runtime_resource_label_selector() -> str:
        """
        There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
        we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
        :return: the label selector
        """
        return "mpi-job-role=launcher"

    @staticmethod
    def _get_crd_info() -> typing.Tuple[str, str, str]:
        # (group, version, plural) identifying the v1 MPIJob CRD
        return (
            MpiRuntimeV1.crd_group,
            MpiRuntimeV1.crd_version,
            MpiRuntimeV1.crd_plural,
        )
@@ -0,0 +1,29 @@
1
+ # Copyright 2023 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from mlrun.api.runtime_handlers.kubejob import KubeRuntimeHandler
16
+ from mlrun.runtimes.base import RuntimeClassMode
17
+
18
+
19
class RemoteSparkRuntimeHandler(KubeRuntimeHandler):
    """Runtime handler for the remote-spark runtime.

    Behaves like the kube job handler - resources are regular run pods.
    """

    kind = "remote-spark"
    class_modes = {RuntimeClassMode.run: "remote-spark"}

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # pods are labeled with the run uid
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        # pods belong to a specific run, same as the base kube job handler
        return True
@@ -0,0 +1,148 @@
1
+ # Copyright 2023 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import typing
15
+ from datetime import datetime
16
+ from typing import Dict, Optional, Tuple
17
+
18
+ from kubernetes.client.rest import ApiException
19
+ from sqlalchemy.orm import Session
20
+
21
+ from mlrun.api.db.base import DBInterface
22
+ from mlrun.api.runtime_handlers.base import BaseRuntimeHandler
23
+ from mlrun.runtimes.base import RuntimeClassMode
24
+ from mlrun.runtimes.constants import RunStates, SparkApplicationStates
25
+ from mlrun.runtimes.sparkjob.abstract import AbstractSparkRuntime
26
+ from mlrun.runtimes.utils import get_k8s
27
+ from mlrun.utils import logger
28
+
29
+
30
class SparkRuntimeHandler(BaseRuntimeHandler):
    """Runtime handler for SparkApplication CRDs (spark-on-k8s operator)."""

    kind = "spark"
    class_modes = {
        RuntimeClassMode.run: "spark",
    }

    def _resolve_crd_object_status_info(
        self, db: DBInterface, db_session: Session, crd_object
    ) -> Tuple[bool, Optional[datetime], Optional[str]]:
        """Derive (in_terminal_state, completion_time, run_state) from the CRD status."""
        status = crd_object.get("status", {})
        state = status.get("applicationState", {}).get("state")
        in_terminal_state = state in SparkApplicationStates.terminal_states()
        desired_run_state = SparkApplicationStates.spark_application_state_to_run_state(
            state
        )
        completion_time = None
        if in_terminal_state:
            # timestamps are RFC3339 with a trailing Z - normalize for fromisoformat
            if status.get("terminationTime"):
                completion_time = datetime.fromisoformat(
                    status.get("terminationTime").replace("Z", "+00:00")
                )
            else:
                # fall back to the last submission attempt time when the operator
                # did not report a termination time
                last_submission_attempt_time = status.get("lastSubmissionAttemptTime")
                if last_submission_attempt_time:
                    completion_time = datetime.fromisoformat(
                        last_submission_attempt_time.replace("Z", "+00:00")
                    )
        return in_terminal_state, completion_time, desired_run_state

    def _update_ui_url(
        self,
        db: DBInterface,
        db_session: Session,
        project: str,
        uid: str,
        crd_object,
        run: Dict = None,
    ):
        """Sync the Spark driver web UI ingress address into the run's status.ui_url.

        Stores the run back to the DB only when the URL actually changed.
        """
        if run is None:
            # fix: the default value previously caused run.get(...) below to raise
            # AttributeError; with no run object there is nothing to compare or update
            return
        app_state = (
            crd_object.get("status", {}).get("applicationState", {}).get("state")
        )
        state = SparkApplicationStates.spark_application_state_to_run_state(app_state)
        ui_url = None
        if state == RunStates.running:
            ui_url = (
                crd_object.get("status", {})
                .get("driverInfo", {})
                .get("webUIIngressAddress")
            )
        db_ui_url = run.get("status", {}).get("ui_url")
        if db_ui_url == ui_url:
            # avoid a redundant DB write when nothing changed
            return
        run.setdefault("status", {})["ui_url"] = ui_url
        db.store_run(db_session, run, uid, project)

    @staticmethod
    def _are_resources_coupled_to_run_object() -> bool:
        # SparkApplication resources belong to a specific run
        return True

    @staticmethod
    def _get_object_label_selector(object_id: str) -> str:
        # resources are labeled with the run uid
        return f"mlrun/uid={object_id}"

    @staticmethod
    def _get_main_runtime_resource_label_selector() -> str:
        """
        There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
        we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
        :return: the label selector
        """
        return "spark-role=driver"

    @staticmethod
    def _get_crd_info() -> Tuple[str, str, str]:
        # (group, version, plural) identifying the SparkApplication CRD
        return (
            AbstractSparkRuntime.group,
            AbstractSparkRuntime.version,
            AbstractSparkRuntime.plural,
        )

    def _delete_extra_resources(
        self,
        db: DBInterface,
        db_session: Session,
        namespace: str,
        deleted_resources: typing.List[Dict],
        label_selector: str = None,
        force: bool = False,
        grace_period: int = None,
    ):
        """
        Handling config maps deletion
        """
        # config maps are coupled to the deleted CRDs through the run uid label
        uids = []
        for crd_dict in deleted_resources:
            uid = crd_dict["metadata"].get("labels", {}).get("mlrun/uid", None)
            uids.append(uid)

        config_maps = get_k8s().v1api.list_namespaced_config_map(
            namespace, label_selector=label_selector
        )
        for config_map in config_maps.items:
            try:
                uid = config_map.metadata.labels.get("mlrun/uid", None)
                if force or uid in uids:
                    get_k8s().v1api.delete_namespaced_config_map(
                        config_map.metadata.name, namespace
                    )
                    logger.info(f"Deleted config map: {config_map.metadata.name}")
            except ApiException as exc:
                # ignore error if config map is already removed
                if exc.status != 404:
                    raise
@@ -36,6 +36,7 @@ import mlrun.common.schemas.function as old_function
36
36
  import mlrun.common.schemas.http as old_http
37
37
  import mlrun.common.schemas.k8s as old_k8s
38
38
  import mlrun.common.schemas.memory_reports as old_memory_reports
39
+ import mlrun.common.schemas.model_monitoring.grafana
39
40
  import mlrun.common.schemas.object as old_object
40
41
  import mlrun.common.schemas.pipeline as old_pipeline
41
42
  import mlrun.common.schemas.project as old_project
@@ -163,13 +164,23 @@ MostCommonObjectTypesReport = DeprecationHelper(
163
164
  ObjectTypeReport = DeprecationHelper(mlrun.common.schemas.ObjectTypeReport)
164
165
  Features = DeprecationHelper(mlrun.common.schemas.Features)
165
166
  FeatureValues = DeprecationHelper(mlrun.common.schemas.FeatureValues)
166
- GrafanaColumn = DeprecationHelper(mlrun.common.schemas.GrafanaColumn)
167
- GrafanaDataPoint = DeprecationHelper(mlrun.common.schemas.GrafanaDataPoint)
168
- GrafanaNumberColumn = DeprecationHelper(mlrun.common.schemas.GrafanaNumberColumn)
169
- GrafanaStringColumn = DeprecationHelper(mlrun.common.schemas.GrafanaStringColumn)
170
- GrafanaTable = DeprecationHelper(mlrun.common.schemas.GrafanaTable)
167
+ GrafanaColumn = DeprecationHelper(
168
+ mlrun.common.schemas.model_monitoring.grafana.GrafanaColumn
169
+ )
170
+ GrafanaDataPoint = DeprecationHelper(
171
+ mlrun.common.schemas.model_monitoring.grafana.GrafanaDataPoint
172
+ )
173
+ GrafanaNumberColumn = DeprecationHelper(
174
+ mlrun.common.schemas.model_monitoring.grafana.GrafanaNumberColumn
175
+ )
176
+ GrafanaStringColumn = DeprecationHelper(
177
+ mlrun.common.schemas.model_monitoring.grafana.GrafanaStringColumn
178
+ )
179
+ GrafanaTable = DeprecationHelper(
180
+ mlrun.common.schemas.model_monitoring.grafana.GrafanaTable
181
+ )
171
182
  GrafanaTimeSeriesTarget = DeprecationHelper(
172
- mlrun.common.schemas.GrafanaTimeSeriesTarget
183
+ mlrun.common.schemas.model_monitoring.grafana.GrafanaTimeSeriesTarget
173
184
  )
174
185
  ModelEndpoint = DeprecationHelper(mlrun.common.schemas.ModelEndpoint)
175
186
  ModelEndpointList = DeprecationHelper(mlrun.common.schemas.ModelEndpointList)
@@ -420,10 +420,7 @@ def build_image(
420
420
  # use a temp dir for permissions and set it as the workdir
421
421
  tmpdir = tempfile.mkdtemp()
422
422
  relative_workdir = runtime.spec.clone_target_dir or ""
423
- if relative_workdir.startswith("./"):
424
- # TODO: use 'removeprefix' when we drop python 3.7 support
425
- # relative_workdir.removeprefix("./")
426
- relative_workdir = relative_workdir[2:]
423
+ relative_workdir = relative_workdir.removeprefix("./")
427
424
 
428
425
  runtime.spec.clone_target_dir = path.join(tmpdir, "mlrun", relative_workdir)
429
426
 
@@ -98,6 +98,20 @@ class Client(
98
98
  "DELETE", f"projects/{project}/schedules/{name}", request
99
99
  )
100
100
 
101
    async def submit_workflow(
        self,
        project: str,
        name: str,
        request: fastapi.Request,
        json: dict,
    ) -> fastapi.Response:
        """
        Workflow schedules are running only on chief, so workflow submission is
        proxied to the chief instance rather than handled locally.

        :param project: project name the workflow belongs to
        :param name:    workflow name
        :param request: the original incoming request to forward
        :param json:    request body to send to the chief
        :return: the chief's response
        """
        return await self._proxy_request_to_chief(
            "POST", f"projects/{project}/workflows/{name}/submit", request, json
        )
114
+
101
115
  async def delete_schedules(
102
116
  self, project: str, request: fastapi.Request
103
117
  ) -> fastapi.Response: