mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (184)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +3 -41
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/frontend_spec.py +2 -1
  7. mlrun/api/api/endpoints/functions.py +95 -59
  8. mlrun/api/api/endpoints/grafana_proxy.py +9 -9
  9. mlrun/api/api/endpoints/logs.py +17 -3
  10. mlrun/api/api/endpoints/model_endpoints.py +3 -2
  11. mlrun/api/api/endpoints/pipelines.py +1 -5
  12. mlrun/api/api/endpoints/projects.py +88 -0
  13. mlrun/api/api/endpoints/runs.py +48 -6
  14. mlrun/api/api/endpoints/submit.py +2 -1
  15. mlrun/api/api/endpoints/workflows.py +355 -0
  16. mlrun/api/api/utils.py +3 -4
  17. mlrun/api/crud/__init__.py +1 -0
  18. mlrun/api/crud/client_spec.py +6 -2
  19. mlrun/api/crud/feature_store.py +5 -0
  20. mlrun/api/crud/model_monitoring/__init__.py +1 -0
  21. mlrun/api/crud/model_monitoring/deployment.py +497 -0
  22. mlrun/api/crud/model_monitoring/grafana.py +96 -42
  23. mlrun/api/crud/model_monitoring/helpers.py +159 -0
  24. mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
  25. mlrun/api/crud/notifications.py +9 -4
  26. mlrun/api/crud/pipelines.py +6 -11
  27. mlrun/api/crud/projects.py +2 -2
  28. mlrun/api/crud/runtime_resources.py +4 -3
  29. mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
  30. mlrun/api/crud/secrets.py +21 -0
  31. mlrun/api/crud/workflows.py +352 -0
  32. mlrun/api/db/base.py +16 -1
  33. mlrun/api/db/init_db.py +2 -4
  34. mlrun/api/db/session.py +1 -1
  35. mlrun/api/db/sqldb/db.py +129 -31
  36. mlrun/api/db/sqldb/models/models_mysql.py +15 -1
  37. mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
  38. mlrun/api/launcher.py +38 -6
  39. mlrun/api/main.py +3 -2
  40. mlrun/api/rundb/__init__.py +13 -0
  41. mlrun/{db → api/rundb}/sqldb.py +36 -84
  42. mlrun/api/runtime_handlers/__init__.py +56 -0
  43. mlrun/api/runtime_handlers/base.py +1247 -0
  44. mlrun/api/runtime_handlers/daskjob.py +209 -0
  45. mlrun/api/runtime_handlers/kubejob.py +37 -0
  46. mlrun/api/runtime_handlers/mpijob.py +147 -0
  47. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  48. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  49. mlrun/api/schemas/__init__.py +17 -6
  50. mlrun/api/utils/builder.py +1 -4
  51. mlrun/api/utils/clients/chief.py +14 -0
  52. mlrun/api/utils/clients/iguazio.py +33 -33
  53. mlrun/api/utils/clients/nuclio.py +2 -2
  54. mlrun/api/utils/periodic.py +9 -2
  55. mlrun/api/utils/projects/follower.py +14 -7
  56. mlrun/api/utils/projects/leader.py +2 -1
  57. mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
  58. mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
  59. mlrun/api/utils/runtimes/__init__.py +14 -0
  60. mlrun/api/utils/runtimes/nuclio.py +43 -0
  61. mlrun/api/utils/scheduler.py +98 -15
  62. mlrun/api/utils/singletons/db.py +5 -1
  63. mlrun/api/utils/singletons/project_member.py +4 -1
  64. mlrun/api/utils/singletons/scheduler.py +1 -1
  65. mlrun/artifacts/base.py +6 -6
  66. mlrun/artifacts/dataset.py +4 -4
  67. mlrun/artifacts/manager.py +2 -3
  68. mlrun/artifacts/model.py +2 -2
  69. mlrun/artifacts/plots.py +8 -8
  70. mlrun/common/db/__init__.py +14 -0
  71. mlrun/common/helpers.py +37 -0
  72. mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
  73. mlrun/common/model_monitoring/helpers.py +69 -0
  74. mlrun/common/schemas/__init__.py +13 -1
  75. mlrun/common/schemas/auth.py +4 -1
  76. mlrun/common/schemas/client_spec.py +1 -1
  77. mlrun/common/schemas/function.py +17 -0
  78. mlrun/common/schemas/model_monitoring/__init__.py +48 -0
  79. mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
  80. mlrun/common/schemas/model_monitoring/grafana.py +55 -0
  81. mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
  82. mlrun/common/schemas/notification.py +1 -0
  83. mlrun/common/schemas/object.py +4 -0
  84. mlrun/common/schemas/project.py +1 -0
  85. mlrun/common/schemas/regex.py +1 -1
  86. mlrun/common/schemas/runs.py +1 -8
  87. mlrun/common/schemas/schedule.py +1 -8
  88. mlrun/common/schemas/workflow.py +54 -0
  89. mlrun/config.py +45 -42
  90. mlrun/datastore/__init__.py +21 -0
  91. mlrun/datastore/base.py +1 -1
  92. mlrun/datastore/datastore.py +9 -0
  93. mlrun/datastore/dbfs_store.py +168 -0
  94. mlrun/datastore/helpers.py +18 -0
  95. mlrun/datastore/sources.py +1 -0
  96. mlrun/datastore/store_resources.py +2 -5
  97. mlrun/datastore/v3io.py +1 -2
  98. mlrun/db/__init__.py +4 -68
  99. mlrun/db/base.py +12 -0
  100. mlrun/db/factory.py +65 -0
  101. mlrun/db/httpdb.py +175 -20
  102. mlrun/db/nopdb.py +4 -2
  103. mlrun/execution.py +4 -2
  104. mlrun/feature_store/__init__.py +1 -0
  105. mlrun/feature_store/api.py +1 -2
  106. mlrun/feature_store/common.py +2 -1
  107. mlrun/feature_store/feature_set.py +1 -11
  108. mlrun/feature_store/feature_vector.py +340 -2
  109. mlrun/feature_store/ingestion.py +5 -10
  110. mlrun/feature_store/retrieval/base.py +118 -104
  111. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  112. mlrun/feature_store/retrieval/job.py +4 -1
  113. mlrun/feature_store/retrieval/local_merger.py +18 -18
  114. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  115. mlrun/feature_store/retrieval/storey_merger.py +22 -16
  116. mlrun/kfpops.py +3 -9
  117. mlrun/launcher/base.py +57 -53
  118. mlrun/launcher/client.py +5 -4
  119. mlrun/launcher/factory.py +24 -13
  120. mlrun/launcher/local.py +6 -6
  121. mlrun/launcher/remote.py +4 -4
  122. mlrun/lists.py +0 -11
  123. mlrun/model.py +11 -17
  124. mlrun/model_monitoring/__init__.py +2 -22
  125. mlrun/model_monitoring/features_drift_table.py +1 -1
  126. mlrun/model_monitoring/helpers.py +22 -210
  127. mlrun/model_monitoring/model_endpoint.py +1 -1
  128. mlrun/model_monitoring/model_monitoring_batch.py +127 -50
  129. mlrun/model_monitoring/prometheus.py +219 -0
  130. mlrun/model_monitoring/stores/__init__.py +16 -11
  131. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
  132. mlrun/model_monitoring/stores/models/mysql.py +47 -29
  133. mlrun/model_monitoring/stores/models/sqlite.py +47 -29
  134. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
  135. mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
  136. mlrun/model_monitoring/tracking_policy.py +104 -0
  137. mlrun/package/packager.py +6 -8
  138. mlrun/package/packagers/default_packager.py +121 -10
  139. mlrun/package/packagers/numpy_packagers.py +1 -1
  140. mlrun/platforms/__init__.py +0 -2
  141. mlrun/platforms/iguazio.py +0 -56
  142. mlrun/projects/pipelines.py +53 -159
  143. mlrun/projects/project.py +10 -37
  144. mlrun/render.py +1 -1
  145. mlrun/run.py +8 -124
  146. mlrun/runtimes/__init__.py +6 -42
  147. mlrun/runtimes/base.py +29 -1249
  148. mlrun/runtimes/daskjob.py +2 -198
  149. mlrun/runtimes/funcdoc.py +0 -9
  150. mlrun/runtimes/function.py +25 -29
  151. mlrun/runtimes/kubejob.py +5 -29
  152. mlrun/runtimes/local.py +1 -1
  153. mlrun/runtimes/mpijob/__init__.py +2 -2
  154. mlrun/runtimes/mpijob/abstract.py +10 -1
  155. mlrun/runtimes/mpijob/v1.py +0 -76
  156. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  157. mlrun/runtimes/nuclio.py +3 -2
  158. mlrun/runtimes/pod.py +28 -18
  159. mlrun/runtimes/remotesparkjob.py +1 -15
  160. mlrun/runtimes/serving.py +14 -6
  161. mlrun/runtimes/sparkjob/__init__.py +0 -1
  162. mlrun/runtimes/sparkjob/abstract.py +4 -131
  163. mlrun/runtimes/utils.py +0 -26
  164. mlrun/serving/routers.py +7 -7
  165. mlrun/serving/server.py +11 -8
  166. mlrun/serving/states.py +7 -1
  167. mlrun/serving/v2_serving.py +6 -6
  168. mlrun/utils/helpers.py +23 -42
  169. mlrun/utils/notifications/notification/__init__.py +4 -0
  170. mlrun/utils/notifications/notification/webhook.py +61 -0
  171. mlrun/utils/notifications/notification_pusher.py +5 -25
  172. mlrun/utils/regex.py +7 -2
  173. mlrun/utils/version/version.json +2 -2
  174. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
  175. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
  176. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  177. mlrun/mlutils/data.py +0 -160
  178. mlrun/mlutils/models.py +0 -78
  179. mlrun/mlutils/plots.py +0 -902
  180. mlrun/utils/model_monitoring.py +0 -249
  181. /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
  182. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  183. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  184. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/api/runtime_handlers/base.py
@@ -0,0 +1,1247 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import traceback
+from abc import ABC, abstractmethod
+from datetime import datetime, timedelta, timezone
+from typing import Dict, List, Optional, Tuple, Union
+
+from kubernetes.client.rest import ApiException
+from sqlalchemy.orm import Session
+
+import mlrun.common.schemas
+import mlrun.errors
+import mlrun.launcher.factory
+import mlrun.utils.helpers
+import mlrun.utils.notifications
+import mlrun.utils.regex
+from mlrun.api.constants import LogSources
+from mlrun.api.db.base import DBInterface
+from mlrun.config import config
+from mlrun.errors import err_to_str
+from mlrun.runtimes import RuntimeClassMode
+from mlrun.runtimes.constants import PodPhases, RunStates
+from mlrun.runtimes.utils import get_k8s
+from mlrun.utils import logger, now_date
+
+
+class BaseRuntimeHandler(ABC):
+    # setting here to allow tests to override
+    kind = "base"
+    class_modes: Dict[RuntimeClassMode, str] = {}
+    wait_for_deletion_interval = 10
+
+    @staticmethod
+    @abstractmethod
+    def _get_object_label_selector(object_id: str) -> str:
+        """
+        Should return the label selector to get only resources of a specific object (with id object_id)
+        """
+        pass
+
+    def _should_collect_logs(self) -> bool:
+        """
+        There are some runtimes for which we don't collect logs using the log collector
+        :return: whether logs should be collected for this runtime
+        """
+        return True
+
+    def _get_possible_mlrun_class_label_values(
+        self, class_mode: Union[RuntimeClassMode, str] = None
+    ) -> List[str]:
+        """
+        Should return the possible values of the mlrun/class label for runtime resources that are of this runtime
+        handler kind
+        """
+        if not class_mode:
+            return list(self.class_modes.values())
+        class_mode = self.class_modes.get(class_mode, None)
+        return [class_mode] if class_mode else []
+
+    def list_resources(
+        self,
+        project: str,
+        object_id: Optional[str] = None,
+        label_selector: str = None,
+        group_by: Optional[
+            mlrun.common.schemas.ListRuntimeResourcesGroupByField
+        ] = None,
+    ) -> Union[
+        mlrun.common.schemas.RuntimeResources,
+        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
+    ]:
+        # We currently don't support removing runtime resources in non k8s env
+        if not get_k8s().is_running_inside_kubernetes_cluster():
+            return {}
+        namespace = get_k8s().resolve_namespace()
+        label_selector = self.resolve_label_selector(project, object_id, label_selector)
+        pods = self._list_pods(namespace, label_selector)
+        pod_resources = self._build_pod_resources(pods)
+        crd_objects = self._list_crd_objects(namespace, label_selector)
+        crd_resources = self._build_crd_resources(crd_objects)
+        response = self._build_list_resources_response(
+            pod_resources, crd_resources, group_by
+        )
+        response = self._enrich_list_resources_response(
+            response, namespace, label_selector, group_by
+        )
+        return response
+
+    def build_output_from_runtime_resources(
+        self,
+        runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
+        group_by: Optional[
+            mlrun.common.schemas.ListRuntimeResourcesGroupByField
+        ] = None,
+    ):
+        pod_resources = []
+        crd_resources = []
+        for runtime_resources in runtime_resources_list:
+            pod_resources += runtime_resources.pod_resources
+            crd_resources += runtime_resources.crd_resources
+        response = self._build_list_resources_response(
+            pod_resources, crd_resources, group_by
+        )
+        response = self._build_output_from_runtime_resources(
+            response, runtime_resources_list, group_by
+        )
+        return response
+
+    def delete_resources(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        label_selector: str = None,
+        force: bool = False,
+        grace_period: int = None,
+    ):
+        if grace_period is None:
+            grace_period = config.runtime_resources_deletion_grace_period
+        # We currently don't support removing runtime resources in non k8s env
+        if not get_k8s().is_running_inside_kubernetes_cluster():
+            return
+        namespace = get_k8s().resolve_namespace()
+        label_selector = self.resolve_label_selector("*", label_selector=label_selector)
+        crd_group, crd_version, crd_plural = self._get_crd_info()
+        if crd_group and crd_version and crd_plural:
+            deleted_resources = self._delete_crd_resources(
+                db,
+                db_session,
+                namespace,
+                label_selector,
+                force,
+                grace_period,
+            )
+        else:
+            deleted_resources = self._delete_pod_resources(
+                db,
+                db_session,
+                namespace,
+                label_selector,
+                force,
+                grace_period,
+            )
+        self._delete_extra_resources(
+            db,
+            db_session,
+            namespace,
+            deleted_resources,
+            label_selector,
+            force,
+            grace_period,
+        )
+
+    def delete_runtime_object_resources(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        object_id: str,
+        label_selector: str = None,
+        force: bool = False,
+        grace_period: int = None,
+    ):
+        if grace_period is None:
+            grace_period = config.runtime_resources_deletion_grace_period
+        label_selector = self._add_object_label_selector_if_needed(
+            object_id, label_selector
+        )
+        self.delete_resources(db, db_session, label_selector, force, grace_period)
+
+    def monitor_runs(self, db: DBInterface, db_session: Session):
+        namespace = get_k8s().resolve_namespace()
+        label_selector = self._get_default_label_selector()
+        crd_group, crd_version, crd_plural = self._get_crd_info()
+        runtime_resource_is_crd = False
+        if crd_group and crd_version and crd_plural:
+            runtime_resource_is_crd = True
+            runtime_resources = self._list_crd_objects(namespace, label_selector)
+        else:
+            runtime_resources = self._list_pods(namespace, label_selector)
+        project_run_uid_map = self._list_runs_for_monitoring(db, db_session)
+        # project -> uid -> {"name": <runtime-resource-name>}
+        run_runtime_resources_map = {}
+        for runtime_resource in runtime_resources:
+            project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
+            run_runtime_resources_map.setdefault(project, {})
+            run_runtime_resources_map.get(project).update({uid: {"name": name}})
+            try:
+                self._monitor_runtime_resource(
+                    db,
+                    db_session,
+                    project_run_uid_map,
+                    runtime_resource,
+                    runtime_resource_is_crd,
+                    namespace,
+                    project,
+                    uid,
+                    name,
+                )
+            except Exception as exc:
+                logger.warning(
+                    "Failed monitoring runtime resource. Continuing",
+                    runtime_resource_name=runtime_resource["metadata"]["name"],
+                    project_name=project,
+                    namespace=namespace,
+                    exc=err_to_str(exc),
+                    traceback=traceback.format_exc(),
+                )
+        for project, runs in project_run_uid_map.items():
+            if runs:
+                for run_uid, run in runs.items():
+                    try:
+                        if not run:
+                            run = db.read_run(db_session, run_uid, project)
+                        if self.kind == run.get("metadata", {}).get("labels", {}).get(
+                            "kind", ""
+                        ):
+                            self._ensure_run_not_stuck_on_non_terminal_state(
+                                db,
+                                db_session,
+                                project,
+                                run_uid,
+                                run,
+                                run_runtime_resources_map,
+                            )
+                    except Exception as exc:
+                        logger.warning(
+                            "Failed ensuring run not stuck. Continuing",
+                            run_uid=run_uid,
+                            run=run,
+                            project=project,
+                            exc=err_to_str(exc),
+                            traceback=traceback.format_exc(),
+                        )
+
+    def _ensure_run_not_stuck_on_non_terminal_state(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        project: str,
+        run_uid: str,
+        run: dict = None,
+        run_runtime_resources_map: dict = None,
+    ):
+        """
+        Ensuring that a run does not become trapped in a non-terminal state as a result of not finding a
+        corresponding k8s resource.
+        This can occur when a node is evicted or preempted, causing the resources to be removed from the resource
+        listing when the final state recorded in the database is non-terminal.
+        This will have a significant impact on scheduled jobs, since they will not be created until the
+        previous run reaches a terminal state (because of the concurrency limit)
+        """
+        now = now_date()
+        db_run_state = run.get("status", {}).get("state")
+        if not db_run_state:
+            # we are setting the run state to a terminal state to avoid log spamming, this is mainly sanity as we are
+            # setting state to runs when storing new runs.
+            logger.info(
+                "Runs monitoring found a run without state, updating to a terminal state",
+                project=project,
+                uid=run_uid,
+                db_run_state=db_run_state,
+                now=now,
+            )
+            run.setdefault("status", {})["state"] = RunStates.error
+            run.setdefault("status", {})["last_update"] = now.isoformat()
+            db.store_run(db_session, run, run_uid, project)
+            return
+        if db_run_state in RunStates.non_terminal_states():
+            if run_runtime_resources_map and run_uid in run_runtime_resources_map.get(
+                project, {}
+            ):
+                # if a resource was found there is no need to continue
+                return
+            last_update_str = run.get("status", {}).get("last_update")
+            debounce_period = (
+                config.resolve_runs_monitoring_missing_runtime_resources_debouncing_interval()
+            )
+            if last_update_str is None:
+                logger.info(
+                    "Runs monitoring found run in non-terminal state without last update time set, "
+                    "updating last update time to now, to be able to evaluate next time if something changed",
+                    project=project,
+                    uid=run_uid,
+                    db_run_state=db_run_state,
+                    now=now,
+                    debounce_period=debounce_period,
+                )
+                run.setdefault("status", {})["last_update"] = now.isoformat()
+                db.store_run(db_session, run, run_uid, project)
+                return
+
+            if datetime.fromisoformat(last_update_str) > now - timedelta(
+                seconds=debounce_period
+            ):
+                # we are setting non-terminal states to runs before the run is actually applied to k8s, meaning there is
+                # a timeframe where the run exists and no runtime resources exist and it's ok, therefore we're applying
+                # a debounce period before setting the state to error
+                logger.warning(
+                    "Monitoring did not discover a runtime resource that corresponded to a run in a "
+                    "non-terminal state, but the record was recently updated. Debouncing",
+                    project=project,
+                    uid=run_uid,
+                    db_run_state=db_run_state,
+                    last_update=datetime.fromisoformat(last_update_str),
+                    now=now,
+                    debounce_period=debounce_period,
+                )
+            else:
+                logger.info(
+                    "Updating run state", run_uid=run_uid, run_state=RunStates.error
+                )
+                run.setdefault("status", {})["state"] = RunStates.error
+                run.setdefault("status", {})[
+                    "reason"
+                ] = "A runtime resource related to this run could not be found"
+                run.setdefault("status", {})["last_update"] = now.isoformat()
+                db.store_run(db_session, run, run_uid, project)
+
+    def _add_object_label_selector_if_needed(
+        self,
+        object_id: Optional[str] = None,
+        label_selector: Optional[str] = None,
+    ):
+        if object_id:
+            object_label_selector = self._get_object_label_selector(object_id)
+            if label_selector:
+                label_selector = ",".join([object_label_selector, label_selector])
+            else:
+                label_selector = object_label_selector
+        return label_selector
+
+    @staticmethod
+    def _get_main_runtime_resource_label_selector() -> str:
+        """
+        There are some runtimes which might have multiple k8s resources attached to one runtime, in this case
+        we don't want to pull logs from all of them but rather only from the "driver"/"launcher" etc.
+        :return: the label selector
+        """
+        return ""
+
+    def _enrich_list_resources_response(
+        self,
+        response: Union[
+            mlrun.common.schemas.RuntimeResources,
+            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
+        ],
+        namespace: str,
+        label_selector: str = None,
+        group_by: Optional[
+            mlrun.common.schemas.ListRuntimeResourcesGroupByField
+        ] = None,
+    ) -> Union[
+        mlrun.common.schemas.RuntimeResources,
+        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
+    ]:
+        """
+        Override this to list resources other than pods or CRDs (which are handled by the base class)
+        """
+        return response
+
+    def _build_output_from_runtime_resources(
+        self,
+        response: Union[
+            mlrun.common.schemas.RuntimeResources,
+            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
+        ],
+        runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
+        group_by: Optional[
+            mlrun.common.schemas.ListRuntimeResourcesGroupByField
+        ] = None,
+    ):
+        """
+        Override this to add runtime resources other than pods or CRDs (which are handled by the base class) to the
+        output
+        """
+        return response
+
+    def _delete_extra_resources(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        namespace: str,
+        deleted_resources: List[Dict],
+        label_selector: str = None,
+        force: bool = False,
+        grace_period: int = None,
+    ):
+        """
+        Override this to handle deletion of resources other than pods or CRDs (which are handled by the base class)
+        Note that this is happening after the deletion of the CRDs or the pods
+        Note to add this at the beginning:
+        if grace_period is None:
+            grace_period = config.runtime_resources_deletion_grace_period
+        """
+        pass
+
+    def _resolve_crd_object_status_info(
+        self, db: DBInterface, db_session: Session, crd_object
+    ) -> Tuple[bool, Optional[datetime], Optional[str]]:
+        """
+        Override this if the runtime has CRD resources.
+        :return: Tuple with:
+        1. bool determining whether the crd object is in terminal state
+        2. datetime of when the crd object got into terminal state (only when the crd object is in terminal state)
+        3. the desired run state matching the crd object state
+        """
+        return False, None, None
+
+    def _update_ui_url(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        project: str,
+        uid: str,
+        crd_object,
+        run: Dict = None,
+    ):
+        """
+        Update the UI URL for relevant jobs.
+        """
+        pass
+
+    def _resolve_pod_status_info(
+        self, db: DBInterface, db_session: Session, pod: Dict
+    ) -> Tuple[bool, Optional[datetime], Optional[str]]:
+        """
+        :return: Tuple with:
+        1. bool determining whether the pod is in terminal state
+        2. datetime of when the pod got into terminal state (only when the pod is in terminal state)
+        3. the run state matching the pod state
+        """
+        in_terminal_state = pod["status"]["phase"] in PodPhases.terminal_phases()
+        run_state = PodPhases.pod_phase_to_run_state(pod["status"]["phase"])
+        last_container_completion_time = None
+        if in_terminal_state:
+            for container_status in pod["status"].get("container_statuses", []):
+                if container_status.get("state", {}).get("terminated"):
+                    container_completion_time = container_status["state"][
+                        "terminated"
+                    ].get("finished_at")
+
+                    # take the latest completion time
+                    if (
+                        not last_container_completion_time
+                        or last_container_completion_time < container_completion_time
+                    ):
+                        last_container_completion_time = container_completion_time
+
+        return in_terminal_state, last_container_completion_time, run_state
+
+    def _get_default_label_selector(
+        self, class_mode: Union[RuntimeClassMode, str] = None
+    ) -> str:
+        """
+        Override this to add a default label selector
+        """
+        class_values = self._get_possible_mlrun_class_label_values(class_mode)
+        if not class_values:
+            return ""
+        if len(class_values) == 1:
+            return f"mlrun/class={class_values[0]}"
+        return f"mlrun/class in ({', '.join(class_values)})"
+
+    @staticmethod
+    def _get_crd_info() -> Tuple[str, str, str]:
+        """
+        Override this if the runtime has CRD resources. This should return the CRD info:
+        crd group, crd version, crd plural
+        """
+        return "", "", ""
+
+    @staticmethod
+    def _are_resources_coupled_to_run_object() -> bool:
+        """
+        Some resources are tightly coupled to the mlrun Run object; for example, for each Run of a Function of the
+        job kind a kubernetes job is generated, whereas a Function of the daskjob kind generates a dask cluster,
+        and every Run is executed using this cluster, i.e. no resources are created per Run.
+        This function should return true for runtimes in which Runs are coupled to the underlying resources, and
+        therefore aspects of the Run (like its state) should be taken into consideration on resources deletion
+        """
+        return False
+
+    @staticmethod
+    def _expect_pods_without_uid() -> bool:
+        return False
+
+    def _list_pods(self, namespace: str, label_selector: str = None) -> List:
+        pods = get_k8s().list_pods(namespace, selector=label_selector)
+        # when we work with custom objects (list_namespaced_custom_object) it's always a dict, so to be able to
+        # generalize code working on a runtime resource (either a custom object or a pod) we transform pods to dicts
+        pods = [pod.to_dict() for pod in pods]
+        return pods
+
+    def _list_crd_objects(self, namespace: str, label_selector: str = None) -> List:
+        crd_group, crd_version, crd_plural = self._get_crd_info()
+        crd_objects = []
+        if crd_group and crd_version and crd_plural:
+            try:
+                crd_objects = get_k8s().crdapi.list_namespaced_custom_object(
+                    crd_group,
+                    crd_version,
+                    namespace,
+                    crd_plural,
+                    label_selector=label_selector,
+                )
+            except ApiException as exc:
+                # ignore error if crd is not defined
+                if exc.status != 404:
+                    raise
+            else:
+                crd_objects = crd_objects["items"]
+        return crd_objects
+
+    def resolve_label_selector(
+        self,
+        project: str,
+        object_id: Optional[str] = None,
+        label_selector: Optional[str] = None,
+        class_mode: Union[RuntimeClassMode, str] = None,
+        with_main_runtime_resource_label_selector: bool = False,
+    ) -> str:
+        default_label_selector = self._get_default_label_selector(class_mode=class_mode)
+
+        if label_selector:
+            label_selector = ",".join([default_label_selector, label_selector])
+        else:
+            label_selector = default_label_selector
+
+        if project and project != "*":
+            label_selector = ",".join([label_selector, f"mlrun/project={project}"])
+
+        label_selector = self._add_object_label_selector_if_needed(
+            object_id, label_selector
+        )
+
+        if with_main_runtime_resource_label_selector:
+            main_runtime_resource_label_selector = (
+                self._get_main_runtime_resource_label_selector()
+            )
+            if main_runtime_resource_label_selector:
+                label_selector = ",".join(
+                    [label_selector, main_runtime_resource_label_selector]
+                )
+
+        return label_selector
+
+    @staticmethod
+    def resolve_object_id(
+        run: dict,
+    ) -> Optional[str]:
+        """
+        Get the object id from the run object
+        Override this if the object id is not the run uid
+        :param run: run object
+        :return: object id
+        """
+        return run.get("metadata", {}).get("uid", None)
+
+    def _wait_for_pods_deletion(
+        self,
+        namespace: str,
+        deleted_pods: List[Dict],
+        label_selector: str = None,
+    ):
+        deleted_pod_names = [pod_dict["metadata"]["name"] for pod_dict in deleted_pods]
+
+        def _verify_pods_removed():
+            pods = get_k8s().v1api.list_namespaced_pod(
+                namespace, label_selector=label_selector
+            )
+            existing_pod_names = [pod.metadata.name for pod in pods.items]
+            still_in_deletion_pods = set(existing_pod_names).intersection(
+                deleted_pod_names
+            )
+            if still_in_deletion_pods:
+                raise RuntimeError(
+                    f"Pods are still in deletion process: {still_in_deletion_pods}"
+                )
+
+        if deleted_pod_names:
+            timeout = 180
+            logger.debug(
+                "Waiting for pods deletion",
+                timeout=timeout,
+                interval=self.wait_for_deletion_interval,
+            )
+            mlrun.utils.retry_until_successful(
+                self.wait_for_deletion_interval,
+                timeout,
+                logger,
+                True,
+                _verify_pods_removed,
+            )
+
+    def _wait_for_crds_underlying_pods_deletion(
+        self,
+        deleted_crds: List[Dict],
+        label_selector: str = None,
+    ):
+        # we're using here the run identifier as the common ground to identify which pods are relevant to which CRD, so
+        # if they are not coupled we are not able to wait - simply return
+        # NOTE - there are surely smarter ways to do this, without depending on the run object, but as of writing this
+        # none of the runtimes using CRDs are like that, so not handling it now
+        if not self._are_resources_coupled_to_run_object():
+            return
+
+        def _verify_crds_underlying_pods_removed():
+            project_uid_crd_map = {}
+            for crd in deleted_crds:
+                project, uid, _ = self._resolve_runtime_resource_run(crd)
+                if not uid or not project:
+                    logger.warning(
+                        "Could not resolve run uid from crd. Skipping waiting for pods deletion",
+                        crd=crd,
+                    )
+                    continue
+                project_uid_crd_map.setdefault(project, {})[uid] = crd["metadata"][
+                    "name"
+                ]
+            still_in_deletion_crds_to_pod_names = {}
+            jobs_runtime_resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput = self.list_resources(
+                "*",
+                label_selector=label_selector,
+                group_by=mlrun.common.schemas.ListRuntimeResourcesGroupByField.job,
+            )
+            for project, project_jobs in jobs_runtime_resources.items():
+                if project not in project_uid_crd_map:
+                    continue
+                for job_uid, job_runtime_resources in jobs_runtime_resources[
+                    project
+                ].items():
+                    if job_uid not in project_uid_crd_map[project]:
+                        continue
+                    if job_runtime_resources.pod_resources:
+                        still_in_deletion_crds_to_pod_names[
+                            project_uid_crd_map[project][job_uid]
+                        ] = [
+                            pod_resource.name
+                            for pod_resource in job_runtime_resources.pod_resources
+                        ]
+            if still_in_deletion_crds_to_pod_names:
+                raise RuntimeError(
+                    f"CRD underlying pods are still in deletion process: {still_in_deletion_crds_to_pod_names}"
+                )
+
+        if deleted_crds:
+            timeout = 180
+            logger.debug(
+                "Waiting for CRDs underlying pods deletion",
+                timeout=timeout,
+                interval=self.wait_for_deletion_interval,
+            )
+            mlrun.utils.retry_until_successful(
+                self.wait_for_deletion_interval,
+                timeout,
+                logger,
+                True,
+                _verify_crds_underlying_pods_removed,
+            )
+
+    def _delete_pod_resources(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        namespace: str,
+        label_selector: str = None,
+        force: bool = False,
+        grace_period: int = None,
+    ) -> List[Dict]:
+        if grace_period is None:
+            grace_period = config.runtime_resources_deletion_grace_period
+        pods = get_k8s().v1api.list_namespaced_pod(
+            namespace, label_selector=label_selector
+        )
+        deleted_pods = []
+        for pod in pods.items:
+            pod_dict = pod.to_dict()
+
+            # best effort - don't let one failure in pod deletion cut the whole operation
+            try:
+                (
+                    in_terminal_state,
+                    last_update,
+                    run_state,
+                ) = self._resolve_pod_status_info(db, db_session, pod_dict)
+                if not force:
+                    if not in_terminal_state:
+                        continue
+
+                    # give some grace period if we have last update time
+                    now = datetime.now(timezone.utc)
+                    if (
+                        last_update is not None
+                        and last_update + timedelta(seconds=float(grace_period)) > now
+                    ):
+                        continue
+
+                # if resources are tightly coupled to the run object - we want to perform some actions on the run object
+                # before deleting them
+                if self._are_resources_coupled_to_run_object():
+                    try:
+                        self._pre_deletion_runtime_resource_run_actions(
+                            db, db_session, pod_dict, run_state
+                        )
+                    except Exception as exc:
+                        # Don't prevent the deletion for failure in the pre deletion run actions
+                        logger.warning(
+                            "Failure in pod run pre-deletion actions. Continuing",
+                            exc=repr(exc),
+                            pod_name=pod.metadata.name,
+                        )
+
+                get_k8s().delete_pod(pod.metadata.name, namespace)
+                deleted_pods.append(pod_dict)
+            except Exception as exc:
+                logger.warning(
+                    f"Cleanup failed processing pod {pod.metadata.name}: {repr(exc)}. Continuing"
+                )
+        # TODO: don't wait for pods to be deleted, client should poll the deletion status
+        self._wait_for_pods_deletion(namespace, deleted_pods, label_selector)
+        return deleted_pods
+
+    def _delete_crd_resources(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        namespace: str,
+        label_selector: str = None,
+        force: bool = False,
+        grace_period: int = None,
+    ) -> List[Dict]:
+        if grace_period is None:
+            grace_period = config.runtime_resources_deletion_grace_period
+        crd_group, crd_version, crd_plural = self._get_crd_info()
+        deleted_crds = []
+        try:
+            crd_objects = get_k8s().crdapi.list_namespaced_custom_object(
+                crd_group,
+                crd_version,
+                namespace,
+                crd_plural,
+                label_selector=label_selector,
+            )
+        except ApiException as exc:
+            # ignore error if crd is not defined
+            if exc.status != 404:
+                raise
+        else:
+            for crd_object in crd_objects["items"]:
+                # best effort - don't let one failure in crd deletion cut the whole operation
+                try:
+                    (
+                        in_terminal_state,
+                        last_update,
+                        desired_run_state,
+                    ) = self._resolve_crd_object_status_info(db, db_session, crd_object)
+                    if not force:
+                        if not in_terminal_state:
+                            continue
+
+                        # give some grace period if we have last update time
+                        now = datetime.now(timezone.utc)
+                        if (
+                            last_update is not None
+                            and last_update + timedelta(seconds=float(grace_period))
+                            > now
+                        ):
+                            continue
+
+                    # if resources are tightly coupled to the run object - we want to perform some actions on the run
+                    # object before deleting them
+                    if self._are_resources_coupled_to_run_object():
+                        try:
+                            self._pre_deletion_runtime_resource_run_actions(
+                                db,
+                                db_session,
+                                crd_object,
+                                desired_run_state,
+                            )
+                        except Exception as exc:
+                            # Don't prevent the deletion for failure in the pre deletion run actions
+                            logger.warning(
+                                "Failure in crd object run pre-deletion actions. Continuing",
+                                exc=err_to_str(exc),
+                                crd_object_name=crd_object["metadata"]["name"],
+                            )
+
+                    get_k8s().delete_crd(
+                        crd_object["metadata"]["name"],
+                        crd_group,
+                        crd_version,
+                        crd_plural,
+                        namespace,
+                    )
+                    deleted_crds.append(crd_object)
+                except Exception:
+                    exc = traceback.format_exc()
+                    crd_object_name = crd_object["metadata"]["name"]
+                    logger.warning(
+                        f"Cleanup failed processing CRD object {crd_object_name}: {err_to_str(exc)}. Continuing"
+                    )
+        self._wait_for_crds_underlying_pods_deletion(deleted_crds, label_selector)
+        return deleted_crds
+
+    def _pre_deletion_runtime_resource_run_actions(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        runtime_resource: Dict,
+        run_state: str,
+    ):
+        project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
+
+        # if we cannot resolve the related run there is nothing to do
+        if not uid:
+            if not self._expect_pods_without_uid():
+                logger.warning(
+                    "Could not resolve run uid from runtime resource. Skipping pre-deletion actions",
+                    runtime_resource=runtime_resource,
+                )
+                raise ValueError("Could not resolve run uid from runtime resource")
+            else:
+                return
+
+        logger.info(
+            "Performing pre-deletion actions before cleaning up runtime resources",
+            project=project,
+            uid=uid,
+        )
+
+        self._ensure_run_state(db, db_session, project, uid, name, run_state)
+
+        self._ensure_run_logs_collected(db, db_session, project, uid)
+
+    def _is_runtime_resource_run_in_terminal_state(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        runtime_resource: Dict,
+    ) -> Tuple[bool, Optional[datetime]]:
+        """
+        A runtime can have different underlying resources (like pods or CRDs) - to generalize we call it a runtime
+        resource. This function will verify whether the Run object related to this runtime resource is in a terminal
+        state. This is useful in order to determine whether an object can be removed. For example, a kubejob's pod
+        might be in completed state, but we would like to verify that the run is completed as well to verify the logs
+        were collected before we're removing the pod.
+
+        :returns: bool determining whether the run is in terminal state, and the last update time if it exists
+        """
+        project, uid, _ = self._resolve_runtime_resource_run(runtime_resource)
+
+        # if no uid, assume in terminal state
+        if not uid:
+            return True, None
+
+        run = db.read_run(db_session, uid, project)
+        last_update = None
+        last_update_str = run.get("status", {}).get("last_update")
+        if last_update_str is not None:
+            last_update = datetime.fromisoformat(last_update_str)
+
+        if run.get("status", {}).get("state") not in RunStates.terminal_states():
+            return False, last_update
+
+        return True, last_update
+
+    def _list_runs_for_monitoring(
+        self, db: DBInterface, db_session: Session, states: list = None
+    ):
+        runs = db.list_runs(db_session, project="*", states=states)
+        project_run_uid_map = {}
+        run_with_missing_data = []
+        duplicated_runs = []
+        for run in runs:
+            project = run.get("metadata", {}).get("project")
+            uid = run.get("metadata", {}).get("uid")
+            if not uid or not project:
+                run_with_missing_data.append(run.get("metadata", {}))
+                continue
+            current_run = project_run_uid_map.setdefault(project, {}).get(uid)
+
+            # sanity
+            if current_run:
+                duplicated_runs.append(
+                    {
+                        "monitored_run": current_run.get("metadata"),
+                        "duplicated_run": run.get("metadata"),
+                    }
+                )
+                continue
+
+            project_run_uid_map[project][uid] = run
+
+        # If there are duplications or runs with missing data it probably won't be fixed
+        # Monitoring is running periodically and we don't want to log on every problem we find, which would spam the
+        # log, so we're aggregating the problems and logging only once per aggregation
+        if duplicated_runs:
+            logger.warning(
+                "Found duplicated runs (same uid). Heuristically monitoring the first one found",
+                duplicated_runs=duplicated_runs,
+            )
+
+        if run_with_missing_data:
+            logger.warning(
+                "Found runs with missing data. They will not be monitored",
+                run_with_missing_data=run_with_missing_data,
+            )
+
+        return project_run_uid_map
+
+    def _monitor_runtime_resource(
+        self,
+        db: DBInterface,
+        db_session: Session,
+        project_run_uid_map: Dict,
+        runtime_resource: Dict,
+        runtime_resource_is_crd: bool,
+        namespace: str,
+        project: str = None,
+        uid: str = None,
+        name: str = None,
+    ):
+        if not project and not uid and not name:
+            project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
+        if not project or not uid:
+            # Currently any build pod won't have a UID and therefore will cause this log message to be printed, which
+            # spams the log
+            # TODO: uncomment the log message when the builder becomes a kind / starts having a UID
+            # logger.warning(
+            #     "Could not resolve run project or uid from runtime resource, can not monitor run. Continuing",
+            #     project=project,
+            #     uid=uid,
+            #     runtime_resource_name=runtime_resource["metadata"]["name"],
+            #     namespace=namespace,
+            # )
+            return
+        run = project_run_uid_map.get(project, {}).get(uid)
+        if runtime_resource_is_crd:
+            (
+                _,
+                _,
+                run_state,
+            ) = self._resolve_crd_object_status_info(db, db_session, runtime_resource)
+        else:
+            (
+                _,
+                _,
+                run_state,
+            ) = self._resolve_pod_status_info(db, db_session, runtime_resource)
+        self._update_ui_url(db, db_session, project, uid, runtime_resource, run)
+        _, updated_run_state = self._ensure_run_state(
+            db,
+            db_session,
+            project,
+            uid,
+            name,
+            run_state,
+            run,
+            search_run=False,
+        )
+        if updated_run_state in RunStates.terminal_states():
+            self._ensure_run_logs_collected(db, db_session, project, uid)
+
+    def _build_list_resources_response(
+        self,
+        pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
+        crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
+        group_by: Optional[
+            mlrun.common.schemas.ListRuntimeResourcesGroupByField
+        ] = None,
+    ) -> Union[
+        mlrun.common.schemas.RuntimeResources,
+        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
+    ]:
+        if crd_resources is None:
+            crd_resources = []
+        if pod_resources is None:
+            pod_resources = []
+
+        if group_by is None:
+            return mlrun.common.schemas.RuntimeResources(
+                crd_resources=crd_resources, pod_resources=pod_resources
+            )
+        else:
+            if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.job:
+                return self._build_grouped_by_job_list_resources_response(
+                    pod_resources, crd_resources
+                )
+            elif (
+                group_by
+                == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project
+            ):
+                return self._build_grouped_by_project_list_resources_response(
+                    pod_resources, crd_resources
+                )
+            else:
+                raise NotImplementedError(
+                    f"Provided group by field is not supported. group_by={group_by}"
+                )
+
+    def _build_grouped_by_project_list_resources_response(
+        self,
+        pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
+        crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
+    ) -> mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput:
+        resources = {}
+        for pod_resource in pod_resources:
+            self._add_resource_to_grouped_by_project_resources_response(
+                resources, "pod_resources", pod_resource
+            )
+        for crd_resource in crd_resources:
+            self._add_resource_to_grouped_by_project_resources_response(
+                resources, "crd_resources", crd_resource
+            )
+        return resources
+
+    def _build_grouped_by_job_list_resources_response(
+        self,
+        pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
+        crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
+    ) -> mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput:
+        resources = {}
+        for pod_resource in pod_resources:
+            self._add_resource_to_grouped_by_job_resources_response(
+                resources, "pod_resources", pod_resource
+            )
+        for crd_resource in crd_resources:
+            self._add_resource_to_grouped_by_job_resources_response(
+                resources, "crd_resources", crd_resource
+            )
+        return resources
+
+    def _add_resource_to_grouped_by_project_resources_response(
+        self,
+        resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+        resource_field_name: str,
+        resource: mlrun.common.schemas.RuntimeResource,
+    ):
+        if "mlrun/class" in resource.labels:
+            project = resource.labels.get("mlrun/project", "")
+            mlrun_class = resource.labels["mlrun/class"]
+            kind = self._resolve_kind_from_class(mlrun_class)
+            self._add_resource_to_grouped_by_field_resources_response(
+                project, kind, resources, resource_field_name, resource
+            )
+
+    def _add_resource_to_grouped_by_job_resources_response(
+        self,
+        resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+        resource_field_name: str,
+        resource: mlrun.common.schemas.RuntimeResource,
+    ):
+        if "mlrun/uid" in resource.labels:
+            project = resource.labels.get("mlrun/project", config.default_project)
+            uid = resource.labels["mlrun/uid"]
+            self._add_resource_to_grouped_by_field_resources_response(
+                project, uid, resources, resource_field_name, resource
+            )
+
+    @staticmethod
+    def _add_resource_to_grouped_by_field_resources_response(
+        first_field_value: str,
+        second_field_value: str,
+        resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
+        resource_field_name: str,
+        resource: mlrun.common.schemas.RuntimeResource,
+    ):
+        if first_field_value not in resources:
+            resources[first_field_value] = {}
+        if second_field_value not in resources[first_field_value]:
+            resources[first_field_value][
+                second_field_value
+            ] = mlrun.common.schemas.RuntimeResources(
+                pod_resources=[], crd_resources=[]
+            )
+        if not getattr(
+            resources[first_field_value][second_field_value], resource_field_name
+        ):
+            setattr(
+                resources[first_field_value][second_field_value],
+                resource_field_name,
+                [],
+            )
+        getattr(
+            resources[first_field_value][second_field_value], resource_field_name
+        ).append(resource)
+
+    @staticmethod
+    def _resolve_kind_from_class(mlrun_class: str) -> str:
+        class_to_kind_map = {}
+        for kind in mlrun.runtimes.RuntimeKinds.runtime_with_handlers():
+            runtime_handler = mlrun.api.runtime_handlers.get_runtime_handler(kind)
+            class_values = runtime_handler._get_possible_mlrun_class_label_values()
+            for value in class_values:
+                class_to_kind_map[value] = kind
+        return class_to_kind_map[mlrun_class]
+
+    @staticmethod
+    def _get_run_label_selector(project: str, run_uid: str):
+        return f"mlrun/project={project},mlrun/uid={run_uid}"
+
+    @staticmethod
+    def _ensure_run_logs_collected(
+        db: DBInterface, db_session: Session, project: str, uid: str
+    ):
+        # import here to avoid circular imports
+        import mlrun.api.crud as crud
+
+        log_file_exists, _ = crud.Logs().log_file_exists_for_run_uid(project, uid)
+        if not log_file_exists:
+            # this stays for now for backwards compatibility in case we would not use the log collector but rather
+            # the legacy method to pull logs
+            logs_from_k8s = crud.Logs()._get_logs_legacy_method(
+                db_session, project, uid, source=LogSources.K8S
+            )
+            if logs_from_k8s:
+                logger.info("Storing run logs", project=project, uid=uid)
+                crud.Logs().store_log(logs_from_k8s, project, uid, append=False)
+
+    @staticmethod
+    def _ensure_run_state(
+        db: DBInterface,
+        db_session: Session,
+        project: str,
+        uid: str,
+        name: str,
+        run_state: str,
+        run: Dict = None,
+        search_run: bool = True,
+    ) -> Tuple[bool, str]:
+        if run is None:
+            run = {}
+        if search_run:
+            try:
+                run = db.read_run(db_session, uid, project)
+            except mlrun.errors.MLRunNotFoundError:
+                run = {}
+        if not run:
+            logger.warning(
+                "Run not found. A new run will be created",
+                project=project,
+                uid=uid,
+                desired_run_state=run_state,
+                search_run=search_run,
+            )
+            run = {"metadata": {"project": project, "name": name, "uid": uid}}
+        db_run_state = run.get("status", {}).get("state")
+        if db_run_state:
+            if db_run_state == run_state:
+                return False, run_state
+            # if the current run state is terminal and different than the desired - log
+            if db_run_state in RunStates.terminal_states():
+                # This can happen when the SDK running in the user's Run updates the Run's state to terminal, but
+                # the API monitoring (here) is executed before it exits, while the runtime resource is still running
+                if run_state not in RunStates.terminal_states():
+                    now = datetime.now(timezone.utc)
+                    last_update_str = run.get("status", {}).get("last_update")
+                    if last_update_str is not None:
+                        last_update = datetime.fromisoformat(last_update_str)
+                        debounce_period = config.runs_monitoring_interval
+                        if last_update > now - timedelta(
+                            seconds=float(debounce_period)
+                        ):
+                            logger.warning(
+                                "Monitoring found non-terminal state on runtime resource but the record was "
+                                "recently updated to terminal state. Debouncing",
+                                project=project,
+                                uid=uid,
+                                db_run_state=db_run_state,
+                                run_state=run_state,
+                                last_update=last_update,
+                                now=now,
+                                debounce_period=debounce_period,
+                            )
+                            return False, run_state
+
+                logger.warning(
+                    "Run record has terminal state but monitoring found a different state on the runtime resource. Changing",
+                    project=project,
+                    uid=uid,
+                    db_run_state=db_run_state,
+                    run_state=run_state,
+                )
+
+        logger.info("Updating run state", run_state=run_state)
+        run.setdefault("status", {})["state"] = run_state
+        run.setdefault("status", {})["last_update"] = now_date().isoformat()
+        db.store_run(db_session, run, uid, project)
+
+        return True, run_state
+
+    @staticmethod
+    def _resolve_runtime_resource_run(runtime_resource: Dict) -> Tuple[str, str, str]:
+        project = (
+            runtime_resource.get("metadata", {}).get("labels", {}).get("mlrun/project")
+        )
+        if not project:
+            project = config.default_project
+        uid = runtime_resource.get("metadata", {}).get("labels", {}).get("mlrun/uid")
+        name = (
+            runtime_resource.get("metadata", {})
+            .get("labels", {})
+            .get("mlrun/name", "no-name")
+        )
+        return project, uid, name
+
+    @staticmethod
+    def _build_pod_resources(pods) -> List[mlrun.common.schemas.RuntimeResource]:
+        pod_resources = []
+        for pod in pods:
+            pod_resources.append(
+                mlrun.common.schemas.RuntimeResource(
+                    name=pod["metadata"]["name"],
+                    labels=pod["metadata"]["labels"],
+                    status=pod["status"],
+                )
+            )
+        return pod_resources
+
+    @staticmethod
+    def _build_crd_resources(
+        custom_objects,
+    ) -> List[mlrun.common.schemas.RuntimeResource]:
+        crd_resources = []
+        for custom_object in custom_objects:
+            crd_resources.append(
+                mlrun.common.schemas.RuntimeResource(
+                    name=custom_object["metadata"]["name"],
+                    labels=custom_object["metadata"]["labels"],
+                    status=custom_object.get("status", {}),
+                )
+            )
+        return crd_resources
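
For orientation, the new mlrun/api/runtime_handlers/base.py shown above appears to centralize the pod/CRD listing, deletion, and run-monitoring logic that previously lived in mlrun/runtimes/base.py (note the matching -1249 there). A minimal sketch of how a concrete handler could plug into this base class follows; the handler name is hypothetical, the import paths are the ones introduced in this diff, and only hooks actually defined above are used:

    # Illustrative sketch only, not part of the release contents.
    # Assumes the mlrun 1.5.0rc2 module layout introduced in this diff.
    from mlrun.api.runtime_handlers.base import BaseRuntimeHandler
    from mlrun.runtimes import RuntimeClassMode


    class ExampleJobRuntimeHandler(BaseRuntimeHandler):  # hypothetical name
        # value(s) the handler's k8s resources carry in the "mlrun/class" label
        kind = "job"
        class_modes = {RuntimeClassMode.run: "job"}

        @staticmethod
        def _get_object_label_selector(object_id: str) -> str:
            # the base class combines this with the default "mlrun/class" selector
            return f"mlrun/uid={object_id}"

        @staticmethod
        def _are_resources_coupled_to_run_object() -> bool:
            # each run owns its k8s resources, so the pre-deletion run actions
            # (ensuring run state, collecting logs) should execute before cleanup
            return True

With such a handler, following the resolve_label_selector logic above, list_resources("my-project", object_id="abc") would list pods and CRDs using the selector "mlrun/uid=abc,mlrun/class=job,mlrun/project=my-project".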