mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (184)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +3 -41
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/frontend_spec.py +2 -1
  7. mlrun/api/api/endpoints/functions.py +95 -59
  8. mlrun/api/api/endpoints/grafana_proxy.py +9 -9
  9. mlrun/api/api/endpoints/logs.py +17 -3
  10. mlrun/api/api/endpoints/model_endpoints.py +3 -2
  11. mlrun/api/api/endpoints/pipelines.py +1 -5
  12. mlrun/api/api/endpoints/projects.py +88 -0
  13. mlrun/api/api/endpoints/runs.py +48 -6
  14. mlrun/api/api/endpoints/submit.py +2 -1
  15. mlrun/api/api/endpoints/workflows.py +355 -0
  16. mlrun/api/api/utils.py +3 -4
  17. mlrun/api/crud/__init__.py +1 -0
  18. mlrun/api/crud/client_spec.py +6 -2
  19. mlrun/api/crud/feature_store.py +5 -0
  20. mlrun/api/crud/model_monitoring/__init__.py +1 -0
  21. mlrun/api/crud/model_monitoring/deployment.py +497 -0
  22. mlrun/api/crud/model_monitoring/grafana.py +96 -42
  23. mlrun/api/crud/model_monitoring/helpers.py +159 -0
  24. mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
  25. mlrun/api/crud/notifications.py +9 -4
  26. mlrun/api/crud/pipelines.py +6 -11
  27. mlrun/api/crud/projects.py +2 -2
  28. mlrun/api/crud/runtime_resources.py +4 -3
  29. mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
  30. mlrun/api/crud/secrets.py +21 -0
  31. mlrun/api/crud/workflows.py +352 -0
  32. mlrun/api/db/base.py +16 -1
  33. mlrun/api/db/init_db.py +2 -4
  34. mlrun/api/db/session.py +1 -1
  35. mlrun/api/db/sqldb/db.py +129 -31
  36. mlrun/api/db/sqldb/models/models_mysql.py +15 -1
  37. mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
  38. mlrun/api/launcher.py +38 -6
  39. mlrun/api/main.py +3 -2
  40. mlrun/api/rundb/__init__.py +13 -0
  41. mlrun/{db → api/rundb}/sqldb.py +36 -84
  42. mlrun/api/runtime_handlers/__init__.py +56 -0
  43. mlrun/api/runtime_handlers/base.py +1247 -0
  44. mlrun/api/runtime_handlers/daskjob.py +209 -0
  45. mlrun/api/runtime_handlers/kubejob.py +37 -0
  46. mlrun/api/runtime_handlers/mpijob.py +147 -0
  47. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  48. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  49. mlrun/api/schemas/__init__.py +17 -6
  50. mlrun/api/utils/builder.py +1 -4
  51. mlrun/api/utils/clients/chief.py +14 -0
  52. mlrun/api/utils/clients/iguazio.py +33 -33
  53. mlrun/api/utils/clients/nuclio.py +2 -2
  54. mlrun/api/utils/periodic.py +9 -2
  55. mlrun/api/utils/projects/follower.py +14 -7
  56. mlrun/api/utils/projects/leader.py +2 -1
  57. mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
  58. mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
  59. mlrun/api/utils/runtimes/__init__.py +14 -0
  60. mlrun/api/utils/runtimes/nuclio.py +43 -0
  61. mlrun/api/utils/scheduler.py +98 -15
  62. mlrun/api/utils/singletons/db.py +5 -1
  63. mlrun/api/utils/singletons/project_member.py +4 -1
  64. mlrun/api/utils/singletons/scheduler.py +1 -1
  65. mlrun/artifacts/base.py +6 -6
  66. mlrun/artifacts/dataset.py +4 -4
  67. mlrun/artifacts/manager.py +2 -3
  68. mlrun/artifacts/model.py +2 -2
  69. mlrun/artifacts/plots.py +8 -8
  70. mlrun/common/db/__init__.py +14 -0
  71. mlrun/common/helpers.py +37 -0
  72. mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
  73. mlrun/common/model_monitoring/helpers.py +69 -0
  74. mlrun/common/schemas/__init__.py +13 -1
  75. mlrun/common/schemas/auth.py +4 -1
  76. mlrun/common/schemas/client_spec.py +1 -1
  77. mlrun/common/schemas/function.py +17 -0
  78. mlrun/common/schemas/model_monitoring/__init__.py +48 -0
  79. mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
  80. mlrun/common/schemas/model_monitoring/grafana.py +55 -0
  81. mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
  82. mlrun/common/schemas/notification.py +1 -0
  83. mlrun/common/schemas/object.py +4 -0
  84. mlrun/common/schemas/project.py +1 -0
  85. mlrun/common/schemas/regex.py +1 -1
  86. mlrun/common/schemas/runs.py +1 -8
  87. mlrun/common/schemas/schedule.py +1 -8
  88. mlrun/common/schemas/workflow.py +54 -0
  89. mlrun/config.py +45 -42
  90. mlrun/datastore/__init__.py +21 -0
  91. mlrun/datastore/base.py +1 -1
  92. mlrun/datastore/datastore.py +9 -0
  93. mlrun/datastore/dbfs_store.py +168 -0
  94. mlrun/datastore/helpers.py +18 -0
  95. mlrun/datastore/sources.py +1 -0
  96. mlrun/datastore/store_resources.py +2 -5
  97. mlrun/datastore/v3io.py +1 -2
  98. mlrun/db/__init__.py +4 -68
  99. mlrun/db/base.py +12 -0
  100. mlrun/db/factory.py +65 -0
  101. mlrun/db/httpdb.py +175 -20
  102. mlrun/db/nopdb.py +4 -2
  103. mlrun/execution.py +4 -2
  104. mlrun/feature_store/__init__.py +1 -0
  105. mlrun/feature_store/api.py +1 -2
  106. mlrun/feature_store/common.py +2 -1
  107. mlrun/feature_store/feature_set.py +1 -11
  108. mlrun/feature_store/feature_vector.py +340 -2
  109. mlrun/feature_store/ingestion.py +5 -10
  110. mlrun/feature_store/retrieval/base.py +118 -104
  111. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  112. mlrun/feature_store/retrieval/job.py +4 -1
  113. mlrun/feature_store/retrieval/local_merger.py +18 -18
  114. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  115. mlrun/feature_store/retrieval/storey_merger.py +22 -16
  116. mlrun/kfpops.py +3 -9
  117. mlrun/launcher/base.py +57 -53
  118. mlrun/launcher/client.py +5 -4
  119. mlrun/launcher/factory.py +24 -13
  120. mlrun/launcher/local.py +6 -6
  121. mlrun/launcher/remote.py +4 -4
  122. mlrun/lists.py +0 -11
  123. mlrun/model.py +11 -17
  124. mlrun/model_monitoring/__init__.py +2 -22
  125. mlrun/model_monitoring/features_drift_table.py +1 -1
  126. mlrun/model_monitoring/helpers.py +22 -210
  127. mlrun/model_monitoring/model_endpoint.py +1 -1
  128. mlrun/model_monitoring/model_monitoring_batch.py +127 -50
  129. mlrun/model_monitoring/prometheus.py +219 -0
  130. mlrun/model_monitoring/stores/__init__.py +16 -11
  131. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
  132. mlrun/model_monitoring/stores/models/mysql.py +47 -29
  133. mlrun/model_monitoring/stores/models/sqlite.py +47 -29
  134. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
  135. mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
  136. mlrun/model_monitoring/tracking_policy.py +104 -0
  137. mlrun/package/packager.py +6 -8
  138. mlrun/package/packagers/default_packager.py +121 -10
  139. mlrun/package/packagers/numpy_packagers.py +1 -1
  140. mlrun/platforms/__init__.py +0 -2
  141. mlrun/platforms/iguazio.py +0 -56
  142. mlrun/projects/pipelines.py +53 -159
  143. mlrun/projects/project.py +10 -37
  144. mlrun/render.py +1 -1
  145. mlrun/run.py +8 -124
  146. mlrun/runtimes/__init__.py +6 -42
  147. mlrun/runtimes/base.py +29 -1249
  148. mlrun/runtimes/daskjob.py +2 -198
  149. mlrun/runtimes/funcdoc.py +0 -9
  150. mlrun/runtimes/function.py +25 -29
  151. mlrun/runtimes/kubejob.py +5 -29
  152. mlrun/runtimes/local.py +1 -1
  153. mlrun/runtimes/mpijob/__init__.py +2 -2
  154. mlrun/runtimes/mpijob/abstract.py +10 -1
  155. mlrun/runtimes/mpijob/v1.py +0 -76
  156. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  157. mlrun/runtimes/nuclio.py +3 -2
  158. mlrun/runtimes/pod.py +28 -18
  159. mlrun/runtimes/remotesparkjob.py +1 -15
  160. mlrun/runtimes/serving.py +14 -6
  161. mlrun/runtimes/sparkjob/__init__.py +0 -1
  162. mlrun/runtimes/sparkjob/abstract.py +4 -131
  163. mlrun/runtimes/utils.py +0 -26
  164. mlrun/serving/routers.py +7 -7
  165. mlrun/serving/server.py +11 -8
  166. mlrun/serving/states.py +7 -1
  167. mlrun/serving/v2_serving.py +6 -6
  168. mlrun/utils/helpers.py +23 -42
  169. mlrun/utils/notifications/notification/__init__.py +4 -0
  170. mlrun/utils/notifications/notification/webhook.py +61 -0
  171. mlrun/utils/notifications/notification_pusher.py +5 -25
  172. mlrun/utils/regex.py +7 -2
  173. mlrun/utils/version/version.json +2 -2
  174. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
  175. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
  176. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  177. mlrun/mlutils/data.py +0 -160
  178. mlrun/mlutils/models.py +0 -78
  179. mlrun/mlutils/plots.py +0 -902
  180. mlrun/utils/model_monitoring.py +0 -249
  181. /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
  182. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  183. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  184. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
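Note on the largest structural change in this diff: the server-side runtime handlers move out of mlrun/runtimes/base.py (-1249 lines, see the diff below) into the new mlrun/api/runtime_handlers/ package (entries 42-48 above). A minimal sketch of the resulting import move, assuming the class names are kept unchanged in the new package:

    # 1.4.0rc25: the handler base class lived next to the client runtime classes
    from mlrun.runtimes.base import BaseRuntimeHandler

    # 1.5.0rc2: handlers are server-side only (assumed new location, per the file list above)
    from mlrun.api.runtime_handlers import BaseRuntimeHandler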
mlrun/runtimes/base.py CHANGED
@@ -15,35 +15,26 @@ import enum
  import getpass
  import http
  import re
- import traceback
  import warnings
- from abc import ABC, abstractmethod
  from base64 import b64encode
- from datetime import datetime, timedelta, timezone
  from os import environ
- from typing import Callable, Dict, List, Optional, Tuple, Union
+ from typing import Callable, Dict, List, Optional, Union

  import requests.exceptions
  from deprecated import deprecated
- from kubernetes.client.rest import ApiException
  from nuclio.build import mlrun_footer
- from sqlalchemy.orm import Session

- import mlrun.api.db.sqldb.session
- import mlrun.api.utils.singletons.db
  import mlrun.common.schemas
+ import mlrun.db
  import mlrun.errors
  import mlrun.launcher.factory
  import mlrun.utils.helpers
  import mlrun.utils.notifications
  import mlrun.utils.regex
- from mlrun.api.constants import LogSources
- from mlrun.api.db.base import DBInterface
  from mlrun.utils.helpers import generate_object_uri, verify_field_regex

  from ..config import config
  from ..datastore import store_manager
- from ..db import RunDBError, get_or_set_dburl, get_run_db
  from ..errors import err_to_str
  from ..kfpops import mlrun_op
  from ..lists import RunList
@@ -58,9 +49,8 @@ from ..utils import (
      now_date,
      update_in,
  )
- from .constants import PodPhases, RunStates
  from .funcdoc import update_function_entry_points
- from .utils import RunError, calc_hash, get_k8s
+ from .utils import RunError, calc_hash

  spec_fields = [
      "command",
@@ -118,7 +108,6 @@ class FunctionSpec(ModelObj):
          disable_auto_mount=False,
          clone_target_dir=None,
      ):
-
          self.command = command or ""
          self.image = image or ""
          self.mode = mode
@@ -230,14 +219,16 @@ class BaseRuntime(ModelObj):
          )

      def _ensure_run_db(self):
-         self.spec.rundb = self.spec.rundb or get_or_set_dburl()
+         self.spec.rundb = self.spec.rundb or mlrun.db.get_or_set_dburl()

      def _get_db(self):
          # TODO: remove this function and use the launcher db instead
          self._ensure_run_db()
          if not self._db_conn:
              if self.spec.rundb:
-                 self._db_conn = get_run_db(self.spec.rundb, secrets=self._secrets)
+                 self._db_conn = mlrun.db.get_run_db(
+                     self.spec.rundb, secrets=self._secrets
+                 )
          return self._db_conn

      # This function is different than the auto_mount function, as it mounts to runtimes based on the configuration.
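The `_get_db` hunk above keeps the lazy, cached DB connection and only swaps module-level imports for the fully qualified `mlrun.db` namespace. For reference, a minimal usage sketch of the two helpers involved:

    import mlrun.db

    # resolve the configured run-DB URL, falling back to the default when unset
    db_url = mlrun.db.get_or_set_dburl()
    # open a connection to the run DB (a `secrets` keyword is also accepted)
    db = mlrun.db.get_run_db(db_url)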
@@ -296,6 +287,7 @@ class BaseRuntime(ModelObj):
          param_file_secrets: Optional[Dict[str, str]] = None,
          notifications: Optional[List[mlrun.model.Notification]] = None,
          returns: Optional[List[Union[str, Dict[str, str]]]] = None,
+         **launcher_kwargs,
      ) -> RunObject:
          """
          Run a local or remote task.
@@ -342,8 +334,8 @@

          :return: Run context object (RunObject) with run metadata, results and status
          """
-         launcher = mlrun.launcher.factory.LauncherFactory.create_launcher(
-             self._is_remote, local
+         launcher = mlrun.launcher.factory.LauncherFactory().create_launcher(
+             self._is_remote, local=local, **launcher_kwargs
          )
          return launcher.launch(
              runtime=self,
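Two related API changes meet in this hunk: `BaseRuntime.run` now accepts `**launcher_kwargs` (added in the -296/+287 hunk above) and forwards them, and `create_launcher` is now called on a `LauncherFactory` instance rather than as a static call, with `local` passed by keyword. A sketch of the call-site difference, assuming the returned launcher still exposes `launch`:

    # 1.4.0rc25 style: static call, positional `local`
    launcher = LauncherFactory.create_launcher(self._is_remote, local)

    # 1.5.0rc2 style: instance call, keyword arguments forwarded
    launcher = LauncherFactory().create_launcher(
        self._is_remote, local=local, **launcher_kwargs
    )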
@@ -376,7 +368,7 @@
              iter = task.metadata.iteration
              try:
                  return self._get_db().read_run(uid, project, iter=iter)
-             except RunDBError:
+             except mlrun.db.RunDBError:
                  return None
          if task:
              return task.to_dict()
@@ -575,10 +567,10 @@

          elif not was_none and last_state != "completed":
              try:
-                 runtime_handler = mlrun.runtimes.get_runtime_handler(kind)
-                 updates = runtime_handler._get_run_completion_updates(resp)
+                 runtime_cls = mlrun.runtimes.get_runtime_class(kind)
+                 updates = runtime_cls._get_run_completion_updates(resp)
              except KeyError:
-                 updates = BaseRuntimeHandler._get_run_completion_updates(resp)
+                 updates = self._get_run_completion_updates(resp)

              uid = get_in(resp, "metadata.uid")
              logger.debug(
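The completion-updates lookup above now dispatches on the client-side runtime class (`get_runtime_class`) instead of the server-side handler, falling back to the runtime's own method when the kind is unknown. The shape of that dispatch, sketched with a hypothetical `RUNTIME_CLASSES` registry standing in for mlrun's real kind-to-class mapping:

    RUNTIME_CLASSES = {}  # hypothetical kind -> runtime class registry

    def resolve_completion_updates(kind: str, resp: dict, runtime) -> dict:
        try:
            runtime_cls = RUNTIME_CLASSES[kind]  # unknown kinds raise KeyError
            return runtime_cls._get_run_completion_updates(resp)
        except KeyError:
            # fall back to the method now defined on BaseRuntime itself
            return runtime._get_run_completion_updates(resp)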
@@ -606,6 +598,19 @@
          matches = re.findall(mlrun.utils.regex.pipeline_param[0], self.to_json())
          return bool(matches)

+     @staticmethod
+     def _get_run_completion_updates(run: dict) -> dict:
+         """
+         Get the required updates for the run object when it's completed and update the run object state
+         Override this if the run completion is not resolved by a single execution
+         """
+         updates = {
+             "status.last_update": now_date().isoformat(),
+             "status.state": "completed",
+         }
+         update_in(run, "status.state", "completed")
+         return updates
+
      def full_image_path(
          self, image=None, client_version: str = None, client_python_version: str = None
      ):
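The new `_get_run_completion_updates` returns a flat dict of dotted update paths for the DB layer and also mutates the in-memory run via `update_in`, mlrun's helper for setting nested dict keys by dotted path. A small illustration of the helper's behavior:

    from mlrun.utils import update_in

    run = {"metadata": {"uid": "abc"}}
    update_in(run, "status.state", "completed")
    # run == {"metadata": {"uid": "abc"}, "status": {"state": "completed"}}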
@@ -846,7 +851,7 @@
          but because we allow the user to set 'spec.image' for usability purposes,
          we need to check whether this is a built image or it requires to be built on top.
          """
-         launcher = mlrun.launcher.factory.LauncherFactory.create_launcher(
+         launcher = mlrun.launcher.factory.LauncherFactory().create_launcher(
              is_remote=self._is_remote
          )
          launcher.prepare_image_for_deploy(self)
@@ -880,7 +885,7 @@
          return self

      def save(self, tag="", versioned=False, refresh=False) -> str:
-         launcher = mlrun.launcher.factory.LauncherFactory.create_launcher(
+         launcher = mlrun.launcher.factory.LauncherFactory().create_launcher(
              is_remote=self._is_remote
          )
          return launcher.save_function(
@@ -913,1228 +918,3 @@
              if "default" in p:
                  line += f", default={p['default']}"
              print(" " + line)
-
-
- class BaseRuntimeHandler(ABC):
-     # setting here to allow tests to override
-     kind = "base"
-     class_modes: Dict[RuntimeClassMode, str] = {}
-     wait_for_deletion_interval = 10
-
-     @staticmethod
-     @abstractmethod
-     def _get_object_label_selector(object_id: str) -> str:
-         """
-         Should return the label selector to get only resources of a specific object (with id object_id)
-         """
-         pass
-
-     def _should_collect_logs(self) -> bool:
-         """
-         There are some runtimes which we don't collect logs for using the log collector
-         :return: whether it should collect log for it
-         """
-         return True
-
-     def _get_possible_mlrun_class_label_values(
-         self, class_mode: Union[RuntimeClassMode, str] = None
-     ) -> List[str]:
-         """
-         Should return the possible values of the mlrun/class label for runtime resources that are of this runtime
-         handler kind
-         """
-         if not class_mode:
-             return list(self.class_modes.values())
-         class_mode = self.class_modes.get(class_mode, None)
-         return [class_mode] if class_mode else []
-
-     def list_resources(
-         self,
-         project: str,
-         object_id: Optional[str] = None,
-         label_selector: str = None,
-         group_by: Optional[
-             mlrun.common.schemas.ListRuntimeResourcesGroupByField
-         ] = None,
-     ) -> Union[
-         mlrun.common.schemas.RuntimeResources,
-         mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-         mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-     ]:
-         # We currently don't support removing runtime resources in non k8s env
-         if not get_k8s().is_running_inside_kubernetes_cluster():
-             return {}
-         namespace = get_k8s().resolve_namespace()
-         label_selector = self.resolve_label_selector(project, object_id, label_selector)
-         pods = self._list_pods(namespace, label_selector)
-         pod_resources = self._build_pod_resources(pods)
-         crd_objects = self._list_crd_objects(namespace, label_selector)
-         crd_resources = self._build_crd_resources(crd_objects)
-         response = self._build_list_resources_response(
-             pod_resources, crd_resources, group_by
-         )
-         response = self._enrich_list_resources_response(
-             response, namespace, label_selector, group_by
-         )
-         return response
-
-     def build_output_from_runtime_resources(
-         self,
-         runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
-         group_by: Optional[
-             mlrun.common.schemas.ListRuntimeResourcesGroupByField
-         ] = None,
-     ):
-         pod_resources = []
-         crd_resources = []
-         for runtime_resources in runtime_resources_list:
-             pod_resources += runtime_resources.pod_resources
-             crd_resources += runtime_resources.crd_resources
-         response = self._build_list_resources_response(
-             pod_resources, crd_resources, group_by
-         )
-         response = self._build_output_from_runtime_resources(
-             response, runtime_resources_list, group_by
-         )
-         return response
-
-     def delete_resources(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         label_selector: str = None,
-         force: bool = False,
-         grace_period: int = None,
-     ):
-         if grace_period is None:
-             grace_period = config.runtime_resources_deletion_grace_period
-         # We currently don't support removing runtime resources in non k8s env
-         if not get_k8s().is_running_inside_kubernetes_cluster():
-             return
-         namespace = get_k8s().resolve_namespace()
-         label_selector = self.resolve_label_selector("*", label_selector=label_selector)
-         crd_group, crd_version, crd_plural = self._get_crd_info()
-         if crd_group and crd_version and crd_plural:
-             deleted_resources = self._delete_crd_resources(
-                 db,
-                 db_session,
-                 namespace,
-                 label_selector,
-                 force,
-                 grace_period,
-             )
-         else:
-             deleted_resources = self._delete_pod_resources(
-                 db,
-                 db_session,
-                 namespace,
-                 label_selector,
-                 force,
-                 grace_period,
-             )
-         self._delete_extra_resources(
-             db,
-             db_session,
-             namespace,
-             deleted_resources,
-             label_selector,
-             force,
-             grace_period,
-         )
-
-     def delete_runtime_object_resources(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         object_id: str,
-         label_selector: str = None,
-         force: bool = False,
-         grace_period: int = None,
-     ):
-         if grace_period is None:
-             grace_period = config.runtime_resources_deletion_grace_period
-         label_selector = self._add_object_label_selector_if_needed(
-             object_id, label_selector
-         )
-         self.delete_resources(db, db_session, label_selector, force, grace_period)
-
-     def monitor_runs(self, db: DBInterface, db_session: Session):
-         namespace = get_k8s().resolve_namespace()
-         label_selector = self._get_default_label_selector()
-         crd_group, crd_version, crd_plural = self._get_crd_info()
-         runtime_resource_is_crd = False
-         if crd_group and crd_version and crd_plural:
-             runtime_resource_is_crd = True
-             runtime_resources = self._list_crd_objects(namespace, label_selector)
-         else:
-             runtime_resources = self._list_pods(namespace, label_selector)
-         project_run_uid_map = self._list_runs_for_monitoring(db, db_session)
-         # project -> uid -> {"name": <runtime-resource-name>}
-         run_runtime_resources_map = {}
-         for runtime_resource in runtime_resources:
-             project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
-             run_runtime_resources_map.setdefault(project, {})
-             run_runtime_resources_map.get(project).update({uid: {"name": name}})
-             try:
-                 self._monitor_runtime_resource(
-                     db,
-                     db_session,
-                     project_run_uid_map,
-                     runtime_resource,
-                     runtime_resource_is_crd,
-                     namespace,
-                     project,
-                     uid,
-                     name,
-                 )
-             except Exception as exc:
-                 logger.warning(
-                     "Failed monitoring runtime resource. Continuing",
-                     runtime_resource_name=runtime_resource["metadata"]["name"],
-                     namespace=namespace,
-                     exc=err_to_str(exc),
-                     traceback=traceback.format_exc(),
-                 )
-         for project, runs in project_run_uid_map.items():
-             if runs:
-                 for run_uid, run in runs.items():
-                     try:
-                         if not run:
-                             run = db.read_run(db_session, run_uid, project)
-                         if self.kind == run.get("metadata", {}).get("labels", {}).get(
-                             "kind", ""
-                         ):
-                             self._ensure_run_not_stuck_on_non_terminal_state(
-                                 db,
-                                 db_session,
-                                 project,
-                                 run_uid,
-                                 run,
-                                 run_runtime_resources_map,
-                             )
-                     except Exception as exc:
-                         logger.warning(
-                             "Failed ensuring run not stuck. Continuing",
-                             run_uid=run_uid,
-                             run=run,
-                             project=project,
-                             exc=err_to_str(exc),
-                             traceback=traceback.format_exc(),
-                         )
-
-     def _ensure_run_not_stuck_on_non_terminal_state(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         project: str,
-         run_uid: str,
-         run: dict = None,
-         run_runtime_resources_map: dict = None,
-     ):
-         """
-         Ensuring that a run does not become trapped in a non-terminal state as a result of not finding
-         corresponding k8s resource.
-         This can occur when a node is evicted or preempted, causing the resources to be removed from the resource
-         listing when the final state recorded in the database is non-terminal.
-         This will have a significant impact on scheduled jobs, since they will not be created until the
-         previous run reaches a terminal state (because of concurrency limit)
-         """
-         now = now_date()
-         db_run_state = run.get("status", {}).get("state")
-         if not db_run_state:
-             # we are setting the run state to a terminal state to avoid log spamming, this is mainly sanity as we are
-             # setting state to runs when storing new runs.
-             logger.info(
-                 "Runs monitoring found a run without state, updating to a terminal state",
-                 project=project,
-                 uid=run_uid,
-                 db_run_state=db_run_state,
-                 now=now,
-             )
-             run.setdefault("status", {})["state"] = RunStates.error
-             run.setdefault("status", {})["last_update"] = now.isoformat()
-             db.store_run(db_session, run, run_uid, project)
-             return
-         if db_run_state in RunStates.non_terminal_states():
-             if run_runtime_resources_map and run_uid in run_runtime_resources_map.get(
-                 project, {}
-             ):
-                 # if found resource there is no need to continue
-                 return
-             last_update_str = run.get("status", {}).get("last_update")
-             debounce_period = (
-                 config.resolve_runs_monitoring_missing_runtime_resources_debouncing_interval()
-             )
-             if last_update_str is None:
-                 logger.info(
-                     "Runs monitoring found run in non-terminal state without last update time set, "
-                     "updating last update time to now, to be able to evaluate next time if something changed",
-                     project=project,
-                     uid=run_uid,
-                     db_run_state=db_run_state,
-                     now=now,
-                     debounce_period=debounce_period,
-                 )
-                 run.setdefault("status", {})["last_update"] = now.isoformat()
-                 db.store_run(db_session, run, run_uid, project)
-                 return
-
-             if datetime.fromisoformat(last_update_str) > now - timedelta(
-                 seconds=debounce_period
-             ):
-                 # we are setting non-terminal states to runs before the run is actually applied to k8s, meaning there is
-                 # a timeframe where the run exists and no runtime resources exist and it's ok, therefore we're applying
-                 # a debounce period before setting the state to error
-                 logger.warning(
-                     "Monitoring did not discover a runtime resource that corresponded to a run in a "
-                     "non-terminal state. but record has recently updated. Debouncing",
-                     project=project,
-                     uid=run_uid,
-                     db_run_state=db_run_state,
-                     last_update=datetime.fromisoformat(last_update_str),
-                     now=now,
-                     debounce_period=debounce_period,
-                 )
-             else:
-                 logger.info(
-                     "Updating run state", run_uid=run_uid, run_state=RunStates.error
-                 )
-                 run.setdefault("status", {})["state"] = RunStates.error
-                 run.setdefault("status", {})[
-                     "reason"
-                 ] = "A runtime resource related to this run could not be found"
-                 run.setdefault("status", {})["last_update"] = now.isoformat()
-                 db.store_run(db_session, run, run_uid, project)
-
-     def _add_object_label_selector_if_needed(
-         self,
-         object_id: Optional[str] = None,
-         label_selector: Optional[str] = None,
-     ):
-         if object_id:
-             object_label_selector = self._get_object_label_selector(object_id)
-             if label_selector:
-                 label_selector = ",".join([object_label_selector, label_selector])
-             else:
-                 label_selector = object_label_selector
-         return label_selector
-
-     @staticmethod
-     def _get_main_runtime_resource_label_selector() -> str:
-         """
-         There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
-         we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
-         :return: the label selector
-         """
-         return ""
-
-     def _enrich_list_resources_response(
-         self,
-         response: Union[
-             mlrun.common.schemas.RuntimeResources,
-             mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-             mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-         ],
-         namespace: str,
-         label_selector: str = None,
-         group_by: Optional[
-             mlrun.common.schemas.ListRuntimeResourcesGroupByField
-         ] = None,
-     ) -> Union[
-         mlrun.common.schemas.RuntimeResources,
-         mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-         mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-     ]:
-         """
-         Override this to list resources other then pods or CRDs (which are handled by the base class)
-         """
-         return response
-
-     def _build_output_from_runtime_resources(
-         self,
-         response: Union[
-             mlrun.common.schemas.RuntimeResources,
-             mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-             mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-         ],
-         runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
-         group_by: Optional[
-             mlrun.common.schemas.ListRuntimeResourcesGroupByField
-         ] = None,
-     ):
-         """
-         Override this to add runtime resources other than pods or CRDs (which are handled by the base class) to the
-         output
-         """
-         return response
-
-     def _delete_extra_resources(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         namespace: str,
-         deleted_resources: List[Dict],
-         label_selector: str = None,
-         force: bool = False,
-         grace_period: int = None,
-     ):
-         """
-         Override this to handle deletion of resources other than pods or CRDs (which are handled by the base class)
-         Note that this is happening after the deletion of the CRDs or the pods
-         Note to add this at the beginning:
-         if grace_period is None:
-             grace_period = config.runtime_resources_deletion_grace_period
-         """
-         pass
-
-     def _resolve_crd_object_status_info(
-         self, db: DBInterface, db_session: Session, crd_object
-     ) -> Tuple[bool, Optional[datetime], Optional[str]]:
-         """
-         Override this if the runtime has CRD resources.
-         :return: Tuple with:
-         1. bool determining whether the crd object is in terminal state
-         2. datetime of when the crd object got into terminal state (only when the crd object in terminal state)
-         3. the desired run state matching the crd object state
-         """
-         return False, None, None
-
-     def _update_ui_url(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         project: str,
-         uid: str,
-         crd_object,
-         run: Dict = None,
-     ):
-         """
-         Update the UI URL for relevant jobs.
-         """
-         pass
-
-     def _resolve_pod_status_info(
-         self, db: DBInterface, db_session: Session, pod: Dict
-     ) -> Tuple[bool, Optional[datetime], Optional[str]]:
-         """
-         :return: Tuple with:
-         1. bool determining whether the pod is in terminal state
-         2. datetime of when the pod got into terminal state (only when the pod in terminal state)
-         3. the run state matching the pod state
-         """
-         in_terminal_state = pod["status"]["phase"] in PodPhases.terminal_phases()
-         run_state = PodPhases.pod_phase_to_run_state(pod["status"]["phase"])
-         last_container_completion_time = None
-         if in_terminal_state:
-             for container_status in pod["status"].get("container_statuses", []):
-                 if container_status.get("state", {}).get("terminated"):
-                     container_completion_time = container_status["state"][
-                         "terminated"
-                     ].get("finished_at")
-
-                     # take latest completion time
-                     if (
-                         not last_container_completion_time
-                         or last_container_completion_time < container_completion_time
-                     ):
-                         last_container_completion_time = container_completion_time
-
-         return in_terminal_state, last_container_completion_time, run_state
-
-     def _get_default_label_selector(
-         self, class_mode: Union[RuntimeClassMode, str] = None
-     ) -> str:
-         """
-         Override this to add a default label selector
-         """
-         class_values = self._get_possible_mlrun_class_label_values(class_mode)
-         if not class_values:
-             return ""
-         if len(class_values) == 1:
-             return f"mlrun/class={class_values[0]}"
-         return f"mlrun/class in ({', '.join(class_values)})"
-
-     @staticmethod
-     def _get_run_completion_updates(run: dict) -> dict:
-         """
-         Get the required updates for the run object when it's completed and update the run object state
-         Override this if the run completion is not resolved by a single execution
-         """
-         updates = {
-             "status.last_update": now_date().isoformat(),
-             "status.state": "completed",
-         }
-         update_in(run, "status.state", "completed")
-         return updates
-
-     @staticmethod
-     def _get_crd_info() -> Tuple[str, str, str]:
-         """
-         Override this if the runtime has CRD resources. this should return the CRD info:
-         crd group, crd version, crd plural
-         """
-         return "", "", ""
-
-     @staticmethod
-     def _are_resources_coupled_to_run_object() -> bool:
-         """
-         Some resources are tightly coupled to mlrun Run object, for example, for each Run of a Function of the job kind
-         a kubernetes job is being generated, on the opposite a Function of the daskjob kind generates a dask cluster,
-         and every Run is being executed using this cluster, i.e. no resources are created for the Run.
-         This function should return true for runtimes in which Run are coupled to the underlying resources and therefore
-         aspects of the Run (like its state) should be taken into consideration on resources deletion
-         """
-         return False
-
-     @staticmethod
-     def _expect_pods_without_uid() -> bool:
-         return False
-
-     def _list_pods(self, namespace: str, label_selector: str = None) -> List:
-         pods = get_k8s().list_pods(namespace, selector=label_selector)
-         # when we work with custom objects (list_namespaced_custom_object) it's always a dict, to be able to generalize
-         # code working on runtime resource (either a custom object or a pod) we're transforming to dicts
-         pods = [pod.to_dict() for pod in pods]
-         return pods
-
-     def _list_crd_objects(self, namespace: str, label_selector: str = None) -> List:
-         crd_group, crd_version, crd_plural = self._get_crd_info()
-         crd_objects = []
-         if crd_group and crd_version and crd_plural:
-             try:
-                 crd_objects = get_k8s().crdapi.list_namespaced_custom_object(
-                     crd_group,
-                     crd_version,
-                     namespace,
-                     crd_plural,
-                     label_selector=label_selector,
-                 )
-             except ApiException as exc:
-                 # ignore error if crd is not defined
-                 if exc.status != 404:
-                     raise
-             else:
-                 crd_objects = crd_objects["items"]
-         return crd_objects
-
-     def resolve_label_selector(
-         self,
-         project: str,
-         object_id: Optional[str] = None,
-         label_selector: Optional[str] = None,
-         class_mode: Union[RuntimeClassMode, str] = None,
-         with_main_runtime_resource_label_selector: bool = False,
-     ) -> str:
-         default_label_selector = self._get_default_label_selector(class_mode=class_mode)
-
-         if label_selector:
-             label_selector = ",".join([default_label_selector, label_selector])
-         else:
-             label_selector = default_label_selector
-
-         if project and project != "*":
-             label_selector = ",".join([label_selector, f"mlrun/project={project}"])
-
-         label_selector = self._add_object_label_selector_if_needed(
-             object_id, label_selector
-         )
-
-         if with_main_runtime_resource_label_selector:
-             main_runtime_resource_label_selector = (
-                 self._get_main_runtime_resource_label_selector()
-             )
-             if main_runtime_resource_label_selector:
-                 label_selector = ",".join(
-                     [label_selector, main_runtime_resource_label_selector]
-                 )
-
-         return label_selector
-
-     @staticmethod
-     def resolve_object_id(
-         run: dict,
-     ) -> Optional[str]:
-         """
-         Get the object id from the run object
-         Override this if the object id is not the run uid
-         :param run: run object
-         :return: object id
-         """
-         return run.get("metadata", {}).get("uid", None)
-
-     def _wait_for_pods_deletion(
-         self,
-         namespace: str,
-         deleted_pods: List[Dict],
-         label_selector: str = None,
-     ):
-         deleted_pod_names = [pod_dict["metadata"]["name"] for pod_dict in deleted_pods]
-
-         def _verify_pods_removed():
-             pods = get_k8s().v1api.list_namespaced_pod(
-                 namespace, label_selector=label_selector
-             )
-             existing_pod_names = [pod.metadata.name for pod in pods.items]
-             still_in_deletion_pods = set(existing_pod_names).intersection(
-                 deleted_pod_names
-             )
-             if still_in_deletion_pods:
-                 raise RuntimeError(
-                     f"Pods are still in deletion process: {still_in_deletion_pods}"
-                 )
-
-         if deleted_pod_names:
-             timeout = 180
-             logger.debug(
-                 "Waiting for pods deletion",
-                 timeout=timeout,
-                 interval=self.wait_for_deletion_interval,
-             )
-             mlrun.utils.retry_until_successful(
-                 self.wait_for_deletion_interval,
-                 timeout,
-                 logger,
-                 True,
-                 _verify_pods_removed,
-             )
-
-     def _wait_for_crds_underlying_pods_deletion(
-         self,
-         deleted_crds: List[Dict],
-         label_selector: str = None,
-     ):
-         # we're using here the run identifier as the common ground to identify which pods are relevant to which CRD, so
-         # if they are not coupled we are not able to wait - simply return
-         # NOTE - there are surely smarter ways to do this, without depending on the run object, but as of writing this
-         # none of the runtimes using CRDs are like that, so not handling it now
-         if not self._are_resources_coupled_to_run_object():
-             return
-
-         def _verify_crds_underlying_pods_removed():
-             project_uid_crd_map = {}
-             for crd in deleted_crds:
-                 project, uid, _ = self._resolve_runtime_resource_run(crd)
-                 if not uid or not project:
-                     logger.warning(
-                         "Could not resolve run uid from crd. Skipping waiting for pods deletion",
-                         crd=crd,
-                     )
-                     continue
-                 project_uid_crd_map.setdefault(project, {})[uid] = crd["metadata"][
-                     "name"
-                 ]
-             still_in_deletion_crds_to_pod_names = {}
-             jobs_runtime_resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput = self.list_resources(
-                 "*",
-                 label_selector=label_selector,
-                 group_by=mlrun.common.schemas.ListRuntimeResourcesGroupByField.job,
-             )
-             for project, project_jobs in jobs_runtime_resources.items():
-                 if project not in project_uid_crd_map:
-                     continue
-                 for job_uid, job_runtime_resources in jobs_runtime_resources[
-                     project
-                 ].items():
-                     if job_uid not in project_uid_crd_map[project]:
-                         continue
-                     if job_runtime_resources.pod_resources:
-                         still_in_deletion_crds_to_pod_names[
-                             project_uid_crd_map[project][job_uid]
-                         ] = [
-                             pod_resource.name
-                             for pod_resource in job_runtime_resources.pod_resources
-                         ]
-             if still_in_deletion_crds_to_pod_names:
-                 raise RuntimeError(
-                     f"CRD underlying pods are still in deletion process: {still_in_deletion_crds_to_pod_names}"
-                 )
-
-         if deleted_crds:
-             timeout = 180
-             logger.debug(
-                 "Waiting for CRDs underlying pods deletion",
-                 timeout=timeout,
-                 interval=self.wait_for_deletion_interval,
-             )
-             mlrun.utils.retry_until_successful(
-                 self.wait_for_deletion_interval,
-                 timeout,
-                 logger,
-                 True,
-                 _verify_crds_underlying_pods_removed,
-             )
-
-     def _delete_pod_resources(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         namespace: str,
-         label_selector: str = None,
-         force: bool = False,
-         grace_period: int = None,
-     ) -> List[Dict]:
-         if grace_period is None:
-             grace_period = config.runtime_resources_deletion_grace_period
-         pods = get_k8s().v1api.list_namespaced_pod(
-             namespace, label_selector=label_selector
-         )
-         deleted_pods = []
-         for pod in pods.items:
-             pod_dict = pod.to_dict()
-
-             # best effort - don't let one failure in pod deletion to cut the whole operation
-             try:
-                 (
-                     in_terminal_state,
-                     last_update,
-                     run_state,
-                 ) = self._resolve_pod_status_info(db, db_session, pod_dict)
-                 if not force:
-                     if not in_terminal_state:
-                         continue
-
-                     # give some grace period if we have last update time
-                     now = datetime.now(timezone.utc)
-                     if (
-                         last_update is not None
-                         and last_update + timedelta(seconds=float(grace_period)) > now
-                     ):
-                         continue
-
-                 # if resources are tightly coupled to the run object - we want to perform some actions on the run object
-                 # before deleting them
-                 if self._are_resources_coupled_to_run_object():
-                     try:
-                         self._pre_deletion_runtime_resource_run_actions(
-                             db, db_session, pod_dict, run_state
-                         )
-                     except Exception as exc:
-                         # Don't prevent the deletion for failure in the pre deletion run actions
-                         logger.warning(
-                             "Failure in pod run pre-deletion actions. Continuing",
-                             exc=repr(exc),
-                             pod_name=pod.metadata.name,
-                         )
-
-                 get_k8s().delete_pod(pod.metadata.name, namespace)
-                 deleted_pods.append(pod_dict)
-             except Exception as exc:
-                 logger.warning(
-                     f"Cleanup failed processing pod {pod.metadata.name}: {repr(exc)}. Continuing"
-                 )
-         # TODO: don't wait for pods to be deleted, client should poll the deletion status
-         self._wait_for_pods_deletion(namespace, deleted_pods, label_selector)
-         return deleted_pods
-
-     def _delete_crd_resources(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         namespace: str,
-         label_selector: str = None,
-         force: bool = False,
-         grace_period: int = None,
-     ) -> List[Dict]:
-         if grace_period is None:
-             grace_period = config.runtime_resources_deletion_grace_period
-         crd_group, crd_version, crd_plural = self._get_crd_info()
-         deleted_crds = []
-         try:
-             crd_objects = get_k8s().crdapi.list_namespaced_custom_object(
-                 crd_group,
-                 crd_version,
-                 namespace,
-                 crd_plural,
-                 label_selector=label_selector,
-             )
-         except ApiException as exc:
-             # ignore error if crd is not defined
-             if exc.status != 404:
-                 raise
-         else:
-             for crd_object in crd_objects["items"]:
-                 # best effort - don't let one failure in pod deletion to cut the whole operation
-                 try:
-                     (
-                         in_terminal_state,
-                         last_update,
-                         desired_run_state,
-                     ) = self._resolve_crd_object_status_info(db, db_session, crd_object)
-                     if not force:
-                         if not in_terminal_state:
-                             continue
-
-                         # give some grace period if we have last update time
-                         now = datetime.now(timezone.utc)
-                         if (
-                             last_update is not None
-                             and last_update + timedelta(seconds=float(grace_period))
-                             > now
-                         ):
-                             continue
-
-                     # if resources are tightly coupled to the run object - we want to perform some actions on the run
-                     # object before deleting them
-                     if self._are_resources_coupled_to_run_object():
-
-                         try:
-                             self._pre_deletion_runtime_resource_run_actions(
-                                 db,
-                                 db_session,
-                                 crd_object,
-                                 desired_run_state,
-                             )
-                         except Exception as exc:
-                             # Don't prevent the deletion for failure in the pre deletion run actions
-                             logger.warning(
-                                 "Failure in crd object run pre-deletion actions. Continuing",
-                                 exc=err_to_str(exc),
-                                 crd_object_name=crd_object["metadata"]["name"],
-                             )
-
-                     get_k8s().delete_crd(
-                         crd_object["metadata"]["name"],
-                         crd_group,
-                         crd_version,
-                         crd_plural,
-                         namespace,
-                     )
-                     deleted_crds.append(crd_object)
-                 except Exception:
-                     exc = traceback.format_exc()
-                     crd_object_name = crd_object["metadata"]["name"]
-                     logger.warning(
-                         f"Cleanup failed processing CRD object {crd_object_name}: {err_to_str(exc)}. Continuing"
-                     )
-         self._wait_for_crds_underlying_pods_deletion(deleted_crds, label_selector)
-         return deleted_crds
-
-     def _pre_deletion_runtime_resource_run_actions(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         runtime_resource: Dict,
-         run_state: str,
-     ):
-         project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
-
-         # if cannot resolve related run nothing to do
-         if not uid:
-             if not self._expect_pods_without_uid():
-                 logger.warning(
-                     "Could not resolve run uid from runtime resource. Skipping pre-deletion actions",
-                     runtime_resource=runtime_resource,
-                 )
-                 raise ValueError("Could not resolve run uid from runtime resource")
-             else:
-                 return
-
-         logger.info(
-             "Performing pre-deletion actions before cleaning up runtime resources",
-             project=project,
-             uid=uid,
-         )
-
-         self._ensure_run_state(db, db_session, project, uid, name, run_state)
-
-         self._ensure_run_logs_collected(db, db_session, project, uid)
-
-     def _is_runtime_resource_run_in_terminal_state(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         runtime_resource: Dict,
-     ) -> Tuple[bool, Optional[datetime]]:
-         """
-         A runtime can have different underlying resources (like pods or CRDs) - to generalize we call it runtime
-         resource. This function will verify whether the Run object related to this runtime resource is in transient
-         state. This is useful in order to determine whether an object can be removed. for example, a kubejob's pod
-         might be in completed state, but we would like to verify that the run is completed as well to verify the logs
-         were collected before we're removing the pod.
-
-         :returns: bool determining whether the run in terminal state, and the last update time if it exists
-         """
-         project, uid, _ = self._resolve_runtime_resource_run(runtime_resource)
-
-         # if no uid, assume in terminal state
-         if not uid:
-             return True, None
-
-         run = db.read_run(db_session, uid, project)
-         last_update = None
-         last_update_str = run.get("status", {}).get("last_update")
-         if last_update_str is not None:
-             last_update = datetime.fromisoformat(last_update_str)
-
-         if run.get("status", {}).get("state") not in RunStates.terminal_states():
-             return False, last_update
-
-         return True, last_update
-
-     def _list_runs_for_monitoring(
-         self, db: DBInterface, db_session: Session, states: list = None
-     ):
-         runs = db.list_runs(db_session, project="*", states=states)
-         project_run_uid_map = {}
-         run_with_missing_data = []
-         duplicated_runs = []
-         for run in runs:
-             project = run.get("metadata", {}).get("project")
-             uid = run.get("metadata", {}).get("uid")
-             if not uid or not project:
-                 run_with_missing_data.append(run.get("metadata", {}))
-                 continue
-             current_run = project_run_uid_map.setdefault(project, {}).get(uid)
-
-             # sanity
-             if current_run:
-                 duplicated_runs = {
-                     "monitored_run": current_run.get(["metadata"]),
-                     "duplicated_run": run.get(["metadata"]),
-                 }
-                 continue
-
-             project_run_uid_map[project][uid] = run
-
-         # If there are duplications or runs with missing data it probably won't be fixed
-         # Monitoring is running periodically and we don't want to log on every problem we found which will spam the log
-         # so we're aggregating the problems and logging only once per aggregation
-         if duplicated_runs:
-             logger.warning(
-                 "Found duplicated runs (same uid). Heuristically monitoring the first one found",
-                 duplicated_runs=duplicated_runs,
-             )
-
-         if run_with_missing_data:
-             logger.warning(
-                 "Found runs with missing data. They will not be monitored",
-                 run_with_missing_data=run_with_missing_data,
-             )
-
-         return project_run_uid_map
-
-     def _monitor_runtime_resource(
-         self,
-         db: DBInterface,
-         db_session: Session,
-         project_run_uid_map: Dict,
-         runtime_resource: Dict,
-         runtime_resource_is_crd: bool,
-         namespace: str,
-         project: str = None,
-         uid: str = None,
-         name: str = None,
-     ):
-         if not project and not uid and not name:
-             project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
-         if not project or not uid:
-             # Currently any build pod won't have UID and therefore will cause this log message to be printed which
-             # spams the log
-             # TODO: uncomment the log message when builder become a kind / starts having a UID
-             # logger.warning(
-             #     "Could not resolve run project or uid from runtime resource, can not monitor run. Continuing",
-             #     project=project,
-             #     uid=uid,
-             #     runtime_resource_name=runtime_resource["metadata"]["name"],
-             #     namespace=namespace,
-             # )
-             return
-         run = project_run_uid_map.get(project, {}).get(uid)
-         if runtime_resource_is_crd:
-             (
-                 _,
-                 _,
-                 run_state,
-             ) = self._resolve_crd_object_status_info(db, db_session, runtime_resource)
-         else:
-             (
-                 _,
-                 _,
-                 run_state,
-             ) = self._resolve_pod_status_info(db, db_session, runtime_resource)
-         self._update_ui_url(db, db_session, project, uid, runtime_resource, run)
-         _, updated_run_state = self._ensure_run_state(
-             db,
-             db_session,
-             project,
-             uid,
-             name,
-             run_state,
-             run,
-             search_run=False,
-         )
-         if updated_run_state in RunStates.terminal_states():
-             self._ensure_run_logs_collected(db, db_session, project, uid)
-
-     def _build_list_resources_response(
-         self,
-         pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-         crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-         group_by: Optional[
-             mlrun.common.schemas.ListRuntimeResourcesGroupByField
-         ] = None,
-     ) -> Union[
-         mlrun.common.schemas.RuntimeResources,
-         mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-         mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-     ]:
-         if crd_resources is None:
-             crd_resources = []
-         if pod_resources is None:
-             pod_resources = []
-
-         if group_by is None:
-             return mlrun.common.schemas.RuntimeResources(
-                 crd_resources=crd_resources, pod_resources=pod_resources
-             )
-         else:
-             if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.job:
-                 return self._build_grouped_by_job_list_resources_response(
-                     pod_resources, crd_resources
-                 )
-             elif (
-                 group_by
-                 == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project
-             ):
-                 return self._build_grouped_by_project_list_resources_response(
-                     pod_resources, crd_resources
-                 )
-             else:
-                 raise NotImplementedError(
-                     f"Provided group by field is not supported. group_by={group_by}"
-                 )
-
-     def _build_grouped_by_project_list_resources_response(
-         self,
-         pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-         crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-     ) -> mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput:
-         resources = {}
-         for pod_resource in pod_resources:
-             self._add_resource_to_grouped_by_project_resources_response(
-                 resources, "pod_resources", pod_resource
-             )
-         for crd_resource in crd_resources:
-             self._add_resource_to_grouped_by_project_resources_response(
-                 resources, "crd_resources", crd_resource
-             )
-         return resources
-
-     def _build_grouped_by_job_list_resources_response(
-         self,
-         pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-         crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-     ) -> mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput:
-         resources = {}
-         for pod_resource in pod_resources:
-             self._add_resource_to_grouped_by_job_resources_response(
-                 resources, "pod_resources", pod_resource
-             )
-         for crd_resource in crd_resources:
-             self._add_resource_to_grouped_by_job_resources_response(
-                 resources, "crd_resources", crd_resource
-             )
-         return resources
-
-     def _add_resource_to_grouped_by_project_resources_response(
-         self,
-         resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-         resource_field_name: str,
-         resource: mlrun.common.schemas.RuntimeResource,
-     ):
-         if "mlrun/class" in resource.labels:
-             project = resource.labels.get("mlrun/project", "")
-             mlrun_class = resource.labels["mlrun/class"]
-             kind = self._resolve_kind_from_class(mlrun_class)
-             self._add_resource_to_grouped_by_field_resources_response(
-                 project, kind, resources, resource_field_name, resource
-             )
-
-     def _add_resource_to_grouped_by_job_resources_response(
-         self,
-         resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-         resource_field_name: str,
-         resource: mlrun.common.schemas.RuntimeResource,
-     ):
-         if "mlrun/uid" in resource.labels:
-             project = resource.labels.get("mlrun/project", config.default_project)
-             uid = resource.labels["mlrun/uid"]
-             self._add_resource_to_grouped_by_field_resources_response(
-                 project, uid, resources, resource_field_name, resource
-             )
-
-     @staticmethod
-     def _add_resource_to_grouped_by_field_resources_response(
-         first_field_value: str,
-         second_field_value: str,
-         resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-         resource_field_name: str,
-         resource: mlrun.common.schemas.RuntimeResource,
-     ):
-         if first_field_value not in resources:
-             resources[first_field_value] = {}
-         if second_field_value not in resources[first_field_value]:
-             resources[first_field_value][
-                 second_field_value
-             ] = mlrun.common.schemas.RuntimeResources(
-                 pod_resources=[], crd_resources=[]
-             )
-         if not getattr(
-             resources[first_field_value][second_field_value], resource_field_name
-         ):
-             setattr(
-                 resources[first_field_value][second_field_value],
-                 resource_field_name,
-                 [],
-             )
-         getattr(
-             resources[first_field_value][second_field_value], resource_field_name
-         ).append(resource)
-
-     @staticmethod
-     def _resolve_kind_from_class(mlrun_class: str) -> str:
-         class_to_kind_map = {}
-         for kind in mlrun.runtimes.RuntimeKinds.runtime_with_handlers():
-             runtime_handler = mlrun.runtimes.get_runtime_handler(kind)
-             class_values = runtime_handler._get_possible_mlrun_class_label_values()
-             for value in class_values:
-                 class_to_kind_map[value] = kind
-         return class_to_kind_map[mlrun_class]
-
-     @staticmethod
-     def _get_run_label_selector(project: str, run_uid: str):
-         return f"mlrun/project={project},mlrun/uid={run_uid}"
-
-     @staticmethod
-     def _ensure_run_logs_collected(
-         db: DBInterface, db_session: Session, project: str, uid: str
-     ):
-         # import here to avoid circular imports
-         import mlrun.api.crud as crud
-
-         log_file_exists, _ = crud.Logs().log_file_exists_for_run_uid(project, uid)
-         if not log_file_exists:
-             # this stays for now for backwards compatibility in case we would not use the log collector but rather
-             # the legacy method to pull logs
-             logs_from_k8s = crud.Logs()._get_logs_legacy_method(
-                 db_session, project, uid, source=LogSources.K8S
-             )
-             if logs_from_k8s:
-                 logger.info("Storing run logs", project=project, uid=uid)
-                 crud.Logs().store_log(logs_from_k8s, project, uid, append=False)
-
-     @staticmethod
-     def _ensure_run_state(
-         db: DBInterface,
-         db_session: Session,
-         project: str,
-         uid: str,
-         name: str,
-         run_state: str,
-         run: Dict = None,
-         search_run: bool = True,
-     ) -> Tuple[bool, str]:
-         if run is None:
-             run = {}
-         if search_run:
-             try:
-                 run = db.read_run(db_session, uid, project)
-             except mlrun.errors.MLRunNotFoundError:
-                 run = {}
-         if not run:
-             logger.warning(
-                 "Run not found. A new run will be created",
-                 project=project,
-                 uid=uid,
-                 desired_run_state=run_state,
-                 search_run=search_run,
-             )
-             run = {"metadata": {"project": project, "name": name, "uid": uid}}
-         db_run_state = run.get("status", {}).get("state")
-         if db_run_state:
-             if db_run_state == run_state:
-                 return False, run_state
-             # if the current run state is terminal and different than the desired - log
-             if db_run_state in RunStates.terminal_states():
-
-                 # This can happen when the SDK running in the user's Run updates the Run's state to terminal, but
-                 # before it exits, when the runtime resource is still running, the API monitoring (here) is executed
-                 if run_state not in RunStates.terminal_states():
-                     now = datetime.now(timezone.utc)
-                     last_update_str = run.get("status", {}).get("last_update")
-                     if last_update_str is not None:
-                         last_update = datetime.fromisoformat(last_update_str)
-                         debounce_period = config.runs_monitoring_interval
-                         if last_update > now - timedelta(
-                             seconds=float(debounce_period)
-                         ):
-                             logger.warning(
-                                 "Monitoring found non-terminal state on runtime resource but record has recently "
-                                 "updated to terminal state. Debouncing",
-                                 project=project,
-                                 uid=uid,
-                                 db_run_state=db_run_state,
-                                 run_state=run_state,
-                                 last_update=last_update,
-                                 now=now,
-                                 debounce_period=debounce_period,
-                             )
-                             return False, run_state
-
-                 logger.warning(
-                     "Run record has terminal state but monitoring found different state on runtime resource. Changing",
-                     project=project,
-                     uid=uid,
-                     db_run_state=db_run_state,
-                     run_state=run_state,
-                 )
-
-         logger.info("Updating run state", run_state=run_state)
-         run.setdefault("status", {})["state"] = run_state
-         run.setdefault("status", {})["last_update"] = now_date().isoformat()
-         db.store_run(db_session, run, uid, project)
-
-         return True, run_state
-
-     @staticmethod
-     def _resolve_runtime_resource_run(runtime_resource: Dict) -> Tuple[str, str, str]:
-         project = (
-             runtime_resource.get("metadata", {}).get("labels", {}).get("mlrun/project")
-         )
-         if not project:
-             project = config.default_project
-         uid = runtime_resource.get("metadata", {}).get("labels", {}).get("mlrun/uid")
-         name = (
-             runtime_resource.get("metadata", {})
-             .get("labels", {})
-             .get("mlrun/name", "no-name")
-         )
-         return project, uid, name
-
-     @staticmethod
-     def _build_pod_resources(pods) -> List[mlrun.common.schemas.RuntimeResource]:
-         pod_resources = []
-         for pod in pods:
-             pod_resources.append(
-                 mlrun.common.schemas.RuntimeResource(
-                     name=pod["metadata"]["name"],
-                     labels=pod["metadata"]["labels"],
-                     status=pod["status"],
-                 )
-             )
-         return pod_resources
-
-     @staticmethod
-     def _build_crd_resources(
-         custom_objects,
-     ) -> List[mlrun.common.schemas.RuntimeResource]:
-         crd_resources = []
-         for custom_object in custom_objects:
-             crd_resources.append(
-                 mlrun.common.schemas.RuntimeResource(
-                     name=custom_object["metadata"]["name"],
-                     labels=custom_object["metadata"]["labels"],
-                     status=custom_object.get("status", {}),
-                 )
-             )
-         return crd_resources
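The `BaseRuntimeHandler` machinery removed above (pod/CRD listing, grace-period deletion, and run-state monitoring) is relocated rather than dropped: per the file list, it reappears server-side as mlrun/api/runtime_handlers/base.py (+1247 lines), with the per-kind handlers (daskjob, kubejob, mpijob, remotesparkjob, sparkjob) split into sibling modules.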