mlrun 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (119)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +1 -40
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/functions.py +6 -1
  7. mlrun/api/api/endpoints/logs.py +17 -3
  8. mlrun/api/api/endpoints/pipelines.py +1 -5
  9. mlrun/api/api/endpoints/projects.py +88 -0
  10. mlrun/api/api/endpoints/runs.py +48 -6
  11. mlrun/api/api/endpoints/workflows.py +355 -0
  12. mlrun/api/api/utils.py +1 -1
  13. mlrun/api/crud/__init__.py +1 -0
  14. mlrun/api/crud/client_spec.py +3 -0
  15. mlrun/api/crud/model_monitoring/deployment.py +36 -7
  16. mlrun/api/crud/model_monitoring/grafana.py +1 -1
  17. mlrun/api/crud/model_monitoring/helpers.py +32 -2
  18. mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
  19. mlrun/api/crud/notifications.py +9 -4
  20. mlrun/api/crud/pipelines.py +4 -9
  21. mlrun/api/crud/runtime_resources.py +4 -3
  22. mlrun/api/crud/secrets.py +21 -0
  23. mlrun/api/crud/workflows.py +352 -0
  24. mlrun/api/db/base.py +16 -1
  25. mlrun/api/db/sqldb/db.py +97 -16
  26. mlrun/api/launcher.py +26 -7
  27. mlrun/api/main.py +3 -4
  28. mlrun/{mlutils → api/rundb}/__init__.py +2 -6
  29. mlrun/{db → api/rundb}/sqldb.py +35 -83
  30. mlrun/api/runtime_handlers/__init__.py +56 -0
  31. mlrun/api/runtime_handlers/base.py +1247 -0
  32. mlrun/api/runtime_handlers/daskjob.py +209 -0
  33. mlrun/api/runtime_handlers/kubejob.py +37 -0
  34. mlrun/api/runtime_handlers/mpijob.py +147 -0
  35. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  36. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  37. mlrun/api/utils/builder.py +1 -4
  38. mlrun/api/utils/clients/chief.py +14 -0
  39. mlrun/api/utils/scheduler.py +98 -15
  40. mlrun/api/utils/singletons/db.py +4 -0
  41. mlrun/artifacts/manager.py +1 -2
  42. mlrun/common/schemas/__init__.py +6 -0
  43. mlrun/common/schemas/auth.py +4 -1
  44. mlrun/common/schemas/client_spec.py +1 -1
  45. mlrun/common/schemas/model_monitoring/__init__.py +1 -0
  46. mlrun/common/schemas/model_monitoring/constants.py +11 -0
  47. mlrun/common/schemas/project.py +1 -0
  48. mlrun/common/schemas/runs.py +1 -8
  49. mlrun/common/schemas/schedule.py +1 -8
  50. mlrun/common/schemas/workflow.py +54 -0
  51. mlrun/config.py +42 -40
  52. mlrun/datastore/sources.py +1 -1
  53. mlrun/db/__init__.py +4 -68
  54. mlrun/db/base.py +12 -0
  55. mlrun/db/factory.py +65 -0
  56. mlrun/db/httpdb.py +175 -19
  57. mlrun/db/nopdb.py +4 -2
  58. mlrun/execution.py +4 -2
  59. mlrun/feature_store/__init__.py +1 -0
  60. mlrun/feature_store/api.py +1 -2
  61. mlrun/feature_store/feature_set.py +0 -10
  62. mlrun/feature_store/feature_vector.py +340 -2
  63. mlrun/feature_store/ingestion.py +5 -10
  64. mlrun/feature_store/retrieval/base.py +118 -104
  65. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  66. mlrun/feature_store/retrieval/job.py +4 -1
  67. mlrun/feature_store/retrieval/local_merger.py +18 -18
  68. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  69. mlrun/feature_store/retrieval/storey_merger.py +21 -15
  70. mlrun/kfpops.py +3 -9
  71. mlrun/launcher/base.py +3 -3
  72. mlrun/launcher/client.py +3 -2
  73. mlrun/launcher/factory.py +16 -13
  74. mlrun/lists.py +0 -11
  75. mlrun/model.py +9 -15
  76. mlrun/model_monitoring/helpers.py +15 -25
  77. mlrun/model_monitoring/model_monitoring_batch.py +72 -4
  78. mlrun/model_monitoring/prometheus.py +219 -0
  79. mlrun/model_monitoring/stores/__init__.py +15 -9
  80. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
  81. mlrun/model_monitoring/stream_processing.py +181 -29
  82. mlrun/package/packager.py +6 -8
  83. mlrun/package/packagers/default_packager.py +121 -10
  84. mlrun/platforms/__init__.py +0 -2
  85. mlrun/platforms/iguazio.py +0 -56
  86. mlrun/projects/pipelines.py +57 -158
  87. mlrun/projects/project.py +6 -32
  88. mlrun/render.py +1 -1
  89. mlrun/run.py +2 -124
  90. mlrun/runtimes/__init__.py +6 -42
  91. mlrun/runtimes/base.py +26 -1241
  92. mlrun/runtimes/daskjob.py +2 -198
  93. mlrun/runtimes/function.py +16 -5
  94. mlrun/runtimes/kubejob.py +5 -29
  95. mlrun/runtimes/mpijob/__init__.py +2 -2
  96. mlrun/runtimes/mpijob/abstract.py +10 -1
  97. mlrun/runtimes/mpijob/v1.py +0 -76
  98. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  99. mlrun/runtimes/nuclio.py +3 -2
  100. mlrun/runtimes/pod.py +0 -10
  101. mlrun/runtimes/remotesparkjob.py +1 -15
  102. mlrun/runtimes/serving.py +1 -1
  103. mlrun/runtimes/sparkjob/__init__.py +0 -1
  104. mlrun/runtimes/sparkjob/abstract.py +4 -131
  105. mlrun/serving/states.py +1 -1
  106. mlrun/utils/db.py +0 -2
  107. mlrun/utils/helpers.py +19 -13
  108. mlrun/utils/notifications/notification_pusher.py +5 -25
  109. mlrun/utils/regex.py +7 -2
  110. mlrun/utils/version/version.json +2 -2
  111. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
  112. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
  113. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  114. mlrun/mlutils/data.py +0 -160
  115. mlrun/mlutils/models.py +0 -78
  116. mlrun/mlutils/plots.py +0 -902
  117. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  118. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  119. {mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/runtimes/base.py CHANGED
@@ -15,33 +15,26 @@ import enum
 import getpass
 import http
 import re
-import traceback
 import warnings
-from abc import ABC, abstractmethod
 from base64 import b64encode
-from datetime import datetime, timedelta, timezone
 from os import environ
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Union

 import requests.exceptions
 from deprecated import deprecated
-from kubernetes.client.rest import ApiException
 from nuclio.build import mlrun_footer
-from sqlalchemy.orm import Session

 import mlrun.common.schemas
+import mlrun.db
 import mlrun.errors
 import mlrun.launcher.factory
 import mlrun.utils.helpers
 import mlrun.utils.notifications
 import mlrun.utils.regex
-from mlrun.api.constants import LogSources
-from mlrun.api.db.base import DBInterface
 from mlrun.utils.helpers import generate_object_uri, verify_field_regex

 from ..config import config
 from ..datastore import store_manager
-from ..db import RunDBError, get_or_set_dburl, get_run_db
 from ..errors import err_to_str
 from ..kfpops import mlrun_op
 from ..lists import RunList
@@ -56,9 +49,8 @@ from ..utils import (
     now_date,
     update_in,
 )
-from .constants import PodPhases, RunStates
 from .funcdoc import update_function_entry_points
-from .utils import RunError, calc_hash, get_k8s
+from .utils import RunError, calc_hash

 spec_fields = [
     "command",
@@ -227,14 +219,16 @@ class BaseRuntime(ModelObj):
         )

     def _ensure_run_db(self):
-        self.spec.rundb = self.spec.rundb or get_or_set_dburl()
+        self.spec.rundb = self.spec.rundb or mlrun.db.get_or_set_dburl()

     def _get_db(self):
         # TODO: remove this function and use the launcher db instead
         self._ensure_run_db()
         if not self._db_conn:
             if self.spec.rundb:
-                self._db_conn = get_run_db(self.spec.rundb, secrets=self._secrets)
+                self._db_conn = mlrun.db.get_run_db(
+                    self.spec.rundb, secrets=self._secrets
+                )
         return self._db_conn

     # This function is different than the auto_mount function, as it mounts to runtimes based on the configuration.
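
For context, this hunk replaces the bare get_or_set_dburl/get_run_db imports with module-qualified calls through mlrun.db. A minimal sketch of the same lookup pattern from client code, assuming only the two calls shown in the hunk (the fallback URL and the run identifiers are illustrative, not taken from this diff):

    import mlrun.db

    # resolve the run DB URL (MLRUN_DBPATH or a previously set value),
    # falling back to an illustrative default, then open a client on it -
    # the same two steps BaseRuntime._ensure_run_db/_get_db perform above
    db_url = mlrun.db.get_or_set_dburl("http://mlrun-api:8080")
    db = mlrun.db.get_run_db(db_url)
    run = db.read_run("my-run-uid", project="default")  # hypothetical uid/project
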
@@ -293,6 +287,7 @@ class BaseRuntime(ModelObj):
         param_file_secrets: Optional[Dict[str, str]] = None,
         notifications: Optional[List[mlrun.model.Notification]] = None,
         returns: Optional[List[Union[str, Dict[str, str]]]] = None,
+        **launcher_kwargs,
     ) -> RunObject:
         """
         Run a local or remote task.
@@ -340,7 +335,7 @@
         :return: Run context object (RunObject) with run metadata, results and status
         """
         launcher = mlrun.launcher.factory.LauncherFactory().create_launcher(
-            self._is_remote, local=local
+            self._is_remote, local=local, **launcher_kwargs
         )
         return launcher.launch(
             runtime=self,
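
Together with the previous hunk, this threads arbitrary keyword arguments from BaseRuntime.run() through to the launcher factory, so new launcher options can be added without touching run()'s signature. A self-contained toy illustration of the pass-through pattern (the function bodies and the option name are hypothetical, not mlrun API):

    def create_launcher(is_remote, local=False, **kwargs):
        # stand-in for LauncherFactory().create_launcher: every extra
        # keyword forwarded by run() arrives here unchanged
        return {"is_remote": is_remote, "local": local, **kwargs}

    def run(local=False, **launcher_kwargs):
        return create_launcher(False, local=local, **launcher_kwargs)

    print(run(local=True, some_launcher_option=1))
    # {'is_remote': False, 'local': True, 'some_launcher_option': 1}
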
@@ -373,7 +368,7 @@
             iter = task.metadata.iteration
             try:
                 return self._get_db().read_run(uid, project, iter=iter)
-            except RunDBError:
+            except mlrun.db.RunDBError:
                 return None
         if task:
             return task.to_dict()
@@ -572,10 +567,10 @@

         elif not was_none and last_state != "completed":
             try:
-                runtime_handler = mlrun.runtimes.get_runtime_handler(kind)
-                updates = runtime_handler._get_run_completion_updates(resp)
+                runtime_cls = mlrun.runtimes.get_runtime_class(kind)
+                updates = runtime_cls._get_run_completion_updates(resp)
             except KeyError:
-                updates = BaseRuntimeHandler._get_run_completion_updates(resp)
+                updates = self._get_run_completion_updates(resp)

         uid = get_in(resp, "metadata.uid")
         logger.debug(
@@ -603,6 +598,19 @@
         matches = re.findall(mlrun.utils.regex.pipeline_param[0], self.to_json())
         return bool(matches)

+    @staticmethod
+    def _get_run_completion_updates(run: dict) -> dict:
+        """
+        Get the required updates for the run object when it's completed and update the run object state
+        Override this if the run completion is not resolved by a single execution
+        """
+        updates = {
+            "status.last_update": now_date().isoformat(),
+            "status.state": "completed",
+        }
+        update_in(run, "status.state", "completed")
+        return updates
+
     def full_image_path(
         self, image=None, client_version: str = None, client_python_version: str = None
     ):
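
The new method returns its updates as dotted-path keys and also patches the run dict in place via update_in (imported from ..utils in this file). A small sketch of those dotted-path semantics, assuming update_in's behavior of walking nested dicts by the dotted key:

    from mlrun.utils import update_in

    run = {"metadata": {"uid": "abc"}, "status": {"state": "running"}}
    update_in(run, "status.state", "completed")  # walks the dotted path
    assert run["status"]["state"] == "completed"
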
@@ -910,1226 +918,3 @@
             if "default" in p:
                 line += f", default={p['default']}"
             print(" " + line)
-
-
-class BaseRuntimeHandler(ABC):
-    # setting here to allow tests to override
-    kind = "base"
-    class_modes: Dict[RuntimeClassMode, str] = {}
-    wait_for_deletion_interval = 10
-
-    @staticmethod
-    @abstractmethod
-    def _get_object_label_selector(object_id: str) -> str:
-        """
-        Should return the label selector to get only resources of a specific object (with id object_id)
-        """
-        pass
-
-    def _should_collect_logs(self) -> bool:
-        """
-        There are some runtimes which we don't collect logs for using the log collector
-        :return: whether it should collect log for it
-        """
-        return True
-
-    def _get_possible_mlrun_class_label_values(
-        self, class_mode: Union[RuntimeClassMode, str] = None
-    ) -> List[str]:
-        """
-        Should return the possible values of the mlrun/class label for runtime resources that are of this runtime
-        handler kind
-        """
-        if not class_mode:
-            return list(self.class_modes.values())
-        class_mode = self.class_modes.get(class_mode, None)
-        return [class_mode] if class_mode else []
-
-    def list_resources(
-        self,
-        project: str,
-        object_id: Optional[str] = None,
-        label_selector: str = None,
-        group_by: Optional[
-            mlrun.common.schemas.ListRuntimeResourcesGroupByField
-        ] = None,
-    ) -> Union[
-        mlrun.common.schemas.RuntimeResources,
-        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-    ]:
-        # We currently don't support removing runtime resources in non k8s env
-        if not get_k8s().is_running_inside_kubernetes_cluster():
-            return {}
-        namespace = get_k8s().resolve_namespace()
-        label_selector = self.resolve_label_selector(project, object_id, label_selector)
-        pods = self._list_pods(namespace, label_selector)
-        pod_resources = self._build_pod_resources(pods)
-        crd_objects = self._list_crd_objects(namespace, label_selector)
-        crd_resources = self._build_crd_resources(crd_objects)
-        response = self._build_list_resources_response(
-            pod_resources, crd_resources, group_by
-        )
-        response = self._enrich_list_resources_response(
-            response, namespace, label_selector, group_by
-        )
-        return response
-
-    def build_output_from_runtime_resources(
-        self,
-        runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
-        group_by: Optional[
-            mlrun.common.schemas.ListRuntimeResourcesGroupByField
-        ] = None,
-    ):
-        pod_resources = []
-        crd_resources = []
-        for runtime_resources in runtime_resources_list:
-            pod_resources += runtime_resources.pod_resources
-            crd_resources += runtime_resources.crd_resources
-        response = self._build_list_resources_response(
-            pod_resources, crd_resources, group_by
-        )
-        response = self._build_output_from_runtime_resources(
-            response, runtime_resources_list, group_by
-        )
-        return response
-
-    def delete_resources(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        label_selector: str = None,
-        force: bool = False,
-        grace_period: int = None,
-    ):
-        if grace_period is None:
-            grace_period = config.runtime_resources_deletion_grace_period
-        # We currently don't support removing runtime resources in non k8s env
-        if not get_k8s().is_running_inside_kubernetes_cluster():
-            return
-        namespace = get_k8s().resolve_namespace()
-        label_selector = self.resolve_label_selector("*", label_selector=label_selector)
-        crd_group, crd_version, crd_plural = self._get_crd_info()
-        if crd_group and crd_version and crd_plural:
-            deleted_resources = self._delete_crd_resources(
-                db,
-                db_session,
-                namespace,
-                label_selector,
-                force,
-                grace_period,
-            )
-        else:
-            deleted_resources = self._delete_pod_resources(
-                db,
-                db_session,
-                namespace,
-                label_selector,
-                force,
-                grace_period,
-            )
-        self._delete_extra_resources(
-            db,
-            db_session,
-            namespace,
-            deleted_resources,
-            label_selector,
-            force,
-            grace_period,
-        )
-
-    def delete_runtime_object_resources(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        object_id: str,
-        label_selector: str = None,
-        force: bool = False,
-        grace_period: int = None,
-    ):
-        if grace_period is None:
-            grace_period = config.runtime_resources_deletion_grace_period
-        label_selector = self._add_object_label_selector_if_needed(
-            object_id, label_selector
-        )
-        self.delete_resources(db, db_session, label_selector, force, grace_period)
-
-    def monitor_runs(self, db: DBInterface, db_session: Session):
-        namespace = get_k8s().resolve_namespace()
-        label_selector = self._get_default_label_selector()
-        crd_group, crd_version, crd_plural = self._get_crd_info()
-        runtime_resource_is_crd = False
-        if crd_group and crd_version and crd_plural:
-            runtime_resource_is_crd = True
-            runtime_resources = self._list_crd_objects(namespace, label_selector)
-        else:
-            runtime_resources = self._list_pods(namespace, label_selector)
-        project_run_uid_map = self._list_runs_for_monitoring(db, db_session)
-        # project -> uid -> {"name": <runtime-resource-name>}
-        run_runtime_resources_map = {}
-        for runtime_resource in runtime_resources:
-            project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
-            run_runtime_resources_map.setdefault(project, {})
-            run_runtime_resources_map.get(project).update({uid: {"name": name}})
-            try:
-                self._monitor_runtime_resource(
-                    db,
-                    db_session,
-                    project_run_uid_map,
-                    runtime_resource,
-                    runtime_resource_is_crd,
-                    namespace,
-                    project,
-                    uid,
-                    name,
-                )
-            except Exception as exc:
-                logger.warning(
-                    "Failed monitoring runtime resource. Continuing",
-                    runtime_resource_name=runtime_resource["metadata"]["name"],
-                    namespace=namespace,
-                    exc=err_to_str(exc),
-                    traceback=traceback.format_exc(),
-                )
-        for project, runs in project_run_uid_map.items():
-            if runs:
-                for run_uid, run in runs.items():
-                    try:
-                        if not run:
-                            run = db.read_run(db_session, run_uid, project)
-                        if self.kind == run.get("metadata", {}).get("labels", {}).get(
-                            "kind", ""
-                        ):
-                            self._ensure_run_not_stuck_on_non_terminal_state(
-                                db,
-                                db_session,
-                                project,
-                                run_uid,
-                                run,
-                                run_runtime_resources_map,
-                            )
-                    except Exception as exc:
-                        logger.warning(
-                            "Failed ensuring run not stuck. Continuing",
-                            run_uid=run_uid,
-                            run=run,
-                            project=project,
-                            exc=err_to_str(exc),
-                            traceback=traceback.format_exc(),
-                        )
-
-    def _ensure_run_not_stuck_on_non_terminal_state(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        project: str,
-        run_uid: str,
-        run: dict = None,
-        run_runtime_resources_map: dict = None,
-    ):
-        """
-        Ensuring that a run does not become trapped in a non-terminal state as a result of not finding
-        corresponding k8s resource.
-        This can occur when a node is evicted or preempted, causing the resources to be removed from the resource
-        listing when the final state recorded in the database is non-terminal.
-        This will have a significant impact on scheduled jobs, since they will not be created until the
-        previous run reaches a terminal state (because of concurrency limit)
-        """
-        now = now_date()
-        db_run_state = run.get("status", {}).get("state")
-        if not db_run_state:
-            # we are setting the run state to a terminal state to avoid log spamming, this is mainly sanity as we are
-            # setting state to runs when storing new runs.
-            logger.info(
-                "Runs monitoring found a run without state, updating to a terminal state",
-                project=project,
-                uid=run_uid,
-                db_run_state=db_run_state,
-                now=now,
-            )
-            run.setdefault("status", {})["state"] = RunStates.error
-            run.setdefault("status", {})["last_update"] = now.isoformat()
-            db.store_run(db_session, run, run_uid, project)
-            return
-        if db_run_state in RunStates.non_terminal_states():
-            if run_runtime_resources_map and run_uid in run_runtime_resources_map.get(
-                project, {}
-            ):
-                # if found resource there is no need to continue
-                return
-            last_update_str = run.get("status", {}).get("last_update")
-            debounce_period = (
-                config.resolve_runs_monitoring_missing_runtime_resources_debouncing_interval()
-            )
-            if last_update_str is None:
-                logger.info(
-                    "Runs monitoring found run in non-terminal state without last update time set, "
-                    "updating last update time to now, to be able to evaluate next time if something changed",
-                    project=project,
-                    uid=run_uid,
-                    db_run_state=db_run_state,
-                    now=now,
-                    debounce_period=debounce_period,
-                )
-                run.setdefault("status", {})["last_update"] = now.isoformat()
-                db.store_run(db_session, run, run_uid, project)
-                return
-
-            if datetime.fromisoformat(last_update_str) > now - timedelta(
-                seconds=debounce_period
-            ):
-                # we are setting non-terminal states to runs before the run is actually applied to k8s, meaning there is
-                # a timeframe where the run exists and no runtime resources exist and it's ok, therefore we're applying
-                # a debounce period before setting the state to error
-                logger.warning(
-                    "Monitoring did not discover a runtime resource that corresponded to a run in a "
-                    "non-terminal state. but record has recently updated. Debouncing",
-                    project=project,
-                    uid=run_uid,
-                    db_run_state=db_run_state,
-                    last_update=datetime.fromisoformat(last_update_str),
-                    now=now,
-                    debounce_period=debounce_period,
-                )
-            else:
-                logger.info(
-                    "Updating run state", run_uid=run_uid, run_state=RunStates.error
-                )
-                run.setdefault("status", {})["state"] = RunStates.error
-                run.setdefault("status", {})[
-                    "reason"
-                ] = "A runtime resource related to this run could not be found"
-                run.setdefault("status", {})["last_update"] = now.isoformat()
-                db.store_run(db_session, run, run_uid, project)
-
-    def _add_object_label_selector_if_needed(
-        self,
-        object_id: Optional[str] = None,
-        label_selector: Optional[str] = None,
-    ):
-        if object_id:
-            object_label_selector = self._get_object_label_selector(object_id)
-            if label_selector:
-                label_selector = ",".join([object_label_selector, label_selector])
-            else:
-                label_selector = object_label_selector
-        return label_selector
-
-    @staticmethod
-    def _get_main_runtime_resource_label_selector() -> str:
-        """
-        There are some runtimes which might have multiple k8s resources attached to a one runtime, in this case
-        we don't want to pull logs from all but rather only for the "driver"/"launcher" etc
-        :return: the label selector
-        """
-        return ""
-
-    def _enrich_list_resources_response(
-        self,
-        response: Union[
-            mlrun.common.schemas.RuntimeResources,
-            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-        ],
-        namespace: str,
-        label_selector: str = None,
-        group_by: Optional[
-            mlrun.common.schemas.ListRuntimeResourcesGroupByField
-        ] = None,
-    ) -> Union[
-        mlrun.common.schemas.RuntimeResources,
-        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-    ]:
-        """
-        Override this to list resources other then pods or CRDs (which are handled by the base class)
-        """
-        return response
-
-    def _build_output_from_runtime_resources(
-        self,
-        response: Union[
-            mlrun.common.schemas.RuntimeResources,
-            mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-            mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-        ],
-        runtime_resources_list: List[mlrun.common.schemas.RuntimeResources],
-        group_by: Optional[
-            mlrun.common.schemas.ListRuntimeResourcesGroupByField
-        ] = None,
-    ):
-        """
-        Override this to add runtime resources other than pods or CRDs (which are handled by the base class) to the
-        output
-        """
-        return response
-
-    def _delete_extra_resources(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        namespace: str,
-        deleted_resources: List[Dict],
-        label_selector: str = None,
-        force: bool = False,
-        grace_period: int = None,
-    ):
-        """
-        Override this to handle deletion of resources other than pods or CRDs (which are handled by the base class)
-        Note that this is happening after the deletion of the CRDs or the pods
-        Note to add this at the beginning:
-        if grace_period is None:
-            grace_period = config.runtime_resources_deletion_grace_period
-        """
-        pass
-
-    def _resolve_crd_object_status_info(
-        self, db: DBInterface, db_session: Session, crd_object
-    ) -> Tuple[bool, Optional[datetime], Optional[str]]:
-        """
-        Override this if the runtime has CRD resources.
-        :return: Tuple with:
-        1. bool determining whether the crd object is in terminal state
-        2. datetime of when the crd object got into terminal state (only when the crd object in terminal state)
-        3. the desired run state matching the crd object state
-        """
-        return False, None, None
-
-    def _update_ui_url(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        project: str,
-        uid: str,
-        crd_object,
-        run: Dict = None,
-    ):
-        """
-        Update the UI URL for relevant jobs.
-        """
-        pass
-
-    def _resolve_pod_status_info(
-        self, db: DBInterface, db_session: Session, pod: Dict
-    ) -> Tuple[bool, Optional[datetime], Optional[str]]:
-        """
-        :return: Tuple with:
-        1. bool determining whether the pod is in terminal state
-        2. datetime of when the pod got into terminal state (only when the pod in terminal state)
-        3. the run state matching the pod state
-        """
-        in_terminal_state = pod["status"]["phase"] in PodPhases.terminal_phases()
-        run_state = PodPhases.pod_phase_to_run_state(pod["status"]["phase"])
-        last_container_completion_time = None
-        if in_terminal_state:
-            for container_status in pod["status"].get("container_statuses", []):
-                if container_status.get("state", {}).get("terminated"):
-                    container_completion_time = container_status["state"][
-                        "terminated"
-                    ].get("finished_at")
-
-                    # take latest completion time
-                    if (
-                        not last_container_completion_time
-                        or last_container_completion_time < container_completion_time
-                    ):
-                        last_container_completion_time = container_completion_time
-
-        return in_terminal_state, last_container_completion_time, run_state
-
-    def _get_default_label_selector(
-        self, class_mode: Union[RuntimeClassMode, str] = None
-    ) -> str:
-        """
-        Override this to add a default label selector
-        """
-        class_values = self._get_possible_mlrun_class_label_values(class_mode)
-        if not class_values:
-            return ""
-        if len(class_values) == 1:
-            return f"mlrun/class={class_values[0]}"
-        return f"mlrun/class in ({', '.join(class_values)})"
-
-    @staticmethod
-    def _get_run_completion_updates(run: dict) -> dict:
-        """
-        Get the required updates for the run object when it's completed and update the run object state
-        Override this if the run completion is not resolved by a single execution
-        """
-        updates = {
-            "status.last_update": now_date().isoformat(),
-            "status.state": "completed",
-        }
-        update_in(run, "status.state", "completed")
-        return updates
-
-    @staticmethod
-    def _get_crd_info() -> Tuple[str, str, str]:
-        """
-        Override this if the runtime has CRD resources. this should return the CRD info:
-        crd group, crd version, crd plural
-        """
-        return "", "", ""
-
-    @staticmethod
-    def _are_resources_coupled_to_run_object() -> bool:
-        """
-        Some resources are tightly coupled to mlrun Run object, for example, for each Run of a Function of the job kind
-        a kubernetes job is being generated, on the opposite a Function of the daskjob kind generates a dask cluster,
-        and every Run is being executed using this cluster, i.e. no resources are created for the Run.
-        This function should return true for runtimes in which Run are coupled to the underlying resources and therefore
-        aspects of the Run (like its state) should be taken into consideration on resources deletion
-        """
-        return False
-
-    @staticmethod
-    def _expect_pods_without_uid() -> bool:
-        return False
-
-    def _list_pods(self, namespace: str, label_selector: str = None) -> List:
-        pods = get_k8s().list_pods(namespace, selector=label_selector)
-        # when we work with custom objects (list_namespaced_custom_object) it's always a dict, to be able to generalize
-        # code working on runtime resource (either a custom object or a pod) we're transforming to dicts
-        pods = [pod.to_dict() for pod in pods]
-        return pods
-
-    def _list_crd_objects(self, namespace: str, label_selector: str = None) -> List:
-        crd_group, crd_version, crd_plural = self._get_crd_info()
-        crd_objects = []
-        if crd_group and crd_version and crd_plural:
-            try:
-                crd_objects = get_k8s().crdapi.list_namespaced_custom_object(
-                    crd_group,
-                    crd_version,
-                    namespace,
-                    crd_plural,
-                    label_selector=label_selector,
-                )
-            except ApiException as exc:
-                # ignore error if crd is not defined
-                if exc.status != 404:
-                    raise
-            else:
-                crd_objects = crd_objects["items"]
-        return crd_objects
-
-    def resolve_label_selector(
-        self,
-        project: str,
-        object_id: Optional[str] = None,
-        label_selector: Optional[str] = None,
-        class_mode: Union[RuntimeClassMode, str] = None,
-        with_main_runtime_resource_label_selector: bool = False,
-    ) -> str:
-        default_label_selector = self._get_default_label_selector(class_mode=class_mode)
-
-        if label_selector:
-            label_selector = ",".join([default_label_selector, label_selector])
-        else:
-            label_selector = default_label_selector
-
-        if project and project != "*":
-            label_selector = ",".join([label_selector, f"mlrun/project={project}"])
-
-        label_selector = self._add_object_label_selector_if_needed(
-            object_id, label_selector
-        )
-
-        if with_main_runtime_resource_label_selector:
-            main_runtime_resource_label_selector = (
-                self._get_main_runtime_resource_label_selector()
-            )
-            if main_runtime_resource_label_selector:
-                label_selector = ",".join(
-                    [label_selector, main_runtime_resource_label_selector]
-                )
-
-        return label_selector
-
-    @staticmethod
-    def resolve_object_id(
-        run: dict,
-    ) -> Optional[str]:
-        """
-        Get the object id from the run object
-        Override this if the object id is not the run uid
-        :param run: run object
-        :return: object id
-        """
-        return run.get("metadata", {}).get("uid", None)
-
-    def _wait_for_pods_deletion(
-        self,
-        namespace: str,
-        deleted_pods: List[Dict],
-        label_selector: str = None,
-    ):
-        deleted_pod_names = [pod_dict["metadata"]["name"] for pod_dict in deleted_pods]
-
-        def _verify_pods_removed():
-            pods = get_k8s().v1api.list_namespaced_pod(
-                namespace, label_selector=label_selector
-            )
-            existing_pod_names = [pod.metadata.name for pod in pods.items]
-            still_in_deletion_pods = set(existing_pod_names).intersection(
-                deleted_pod_names
-            )
-            if still_in_deletion_pods:
-                raise RuntimeError(
-                    f"Pods are still in deletion process: {still_in_deletion_pods}"
-                )
-
-        if deleted_pod_names:
-            timeout = 180
-            logger.debug(
-                "Waiting for pods deletion",
-                timeout=timeout,
-                interval=self.wait_for_deletion_interval,
-            )
-            mlrun.utils.retry_until_successful(
-                self.wait_for_deletion_interval,
-                timeout,
-                logger,
-                True,
-                _verify_pods_removed,
-            )
-
-    def _wait_for_crds_underlying_pods_deletion(
-        self,
-        deleted_crds: List[Dict],
-        label_selector: str = None,
-    ):
-        # we're using here the run identifier as the common ground to identify which pods are relevant to which CRD, so
-        # if they are not coupled we are not able to wait - simply return
-        # NOTE - there are surely smarter ways to do this, without depending on the run object, but as of writing this
-        # none of the runtimes using CRDs are like that, so not handling it now
-        if not self._are_resources_coupled_to_run_object():
-            return
-
-        def _verify_crds_underlying_pods_removed():
-            project_uid_crd_map = {}
-            for crd in deleted_crds:
-                project, uid, _ = self._resolve_runtime_resource_run(crd)
-                if not uid or not project:
-                    logger.warning(
-                        "Could not resolve run uid from crd. Skipping waiting for pods deletion",
-                        crd=crd,
-                    )
-                    continue
-                project_uid_crd_map.setdefault(project, {})[uid] = crd["metadata"][
-                    "name"
-                ]
-            still_in_deletion_crds_to_pod_names = {}
-            jobs_runtime_resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput = self.list_resources(
-                "*",
-                label_selector=label_selector,
-                group_by=mlrun.common.schemas.ListRuntimeResourcesGroupByField.job,
-            )
-            for project, project_jobs in jobs_runtime_resources.items():
-                if project not in project_uid_crd_map:
-                    continue
-                for job_uid, job_runtime_resources in jobs_runtime_resources[
-                    project
-                ].items():
-                    if job_uid not in project_uid_crd_map[project]:
-                        continue
-                    if job_runtime_resources.pod_resources:
-                        still_in_deletion_crds_to_pod_names[
-                            project_uid_crd_map[project][job_uid]
-                        ] = [
-                            pod_resource.name
-                            for pod_resource in job_runtime_resources.pod_resources
-                        ]
-            if still_in_deletion_crds_to_pod_names:
-                raise RuntimeError(
-                    f"CRD underlying pods are still in deletion process: {still_in_deletion_crds_to_pod_names}"
-                )
-
-        if deleted_crds:
-            timeout = 180
-            logger.debug(
-                "Waiting for CRDs underlying pods deletion",
-                timeout=timeout,
-                interval=self.wait_for_deletion_interval,
-            )
-            mlrun.utils.retry_until_successful(
-                self.wait_for_deletion_interval,
-                timeout,
-                logger,
-                True,
-                _verify_crds_underlying_pods_removed,
-            )
-
-    def _delete_pod_resources(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        namespace: str,
-        label_selector: str = None,
-        force: bool = False,
-        grace_period: int = None,
-    ) -> List[Dict]:
-        if grace_period is None:
-            grace_period = config.runtime_resources_deletion_grace_period
-        pods = get_k8s().v1api.list_namespaced_pod(
-            namespace, label_selector=label_selector
-        )
-        deleted_pods = []
-        for pod in pods.items:
-            pod_dict = pod.to_dict()
-
-            # best effort - don't let one failure in pod deletion to cut the whole operation
-            try:
-                (
-                    in_terminal_state,
-                    last_update,
-                    run_state,
-                ) = self._resolve_pod_status_info(db, db_session, pod_dict)
-                if not force:
-                    if not in_terminal_state:
-                        continue
-
-                    # give some grace period if we have last update time
-                    now = datetime.now(timezone.utc)
-                    if (
-                        last_update is not None
-                        and last_update + timedelta(seconds=float(grace_period)) > now
-                    ):
-                        continue
-
-                # if resources are tightly coupled to the run object - we want to perform some actions on the run object
-                # before deleting them
-                if self._are_resources_coupled_to_run_object():
-                    try:
-                        self._pre_deletion_runtime_resource_run_actions(
-                            db, db_session, pod_dict, run_state
-                        )
-                    except Exception as exc:
-                        # Don't prevent the deletion for failure in the pre deletion run actions
-                        logger.warning(
-                            "Failure in pod run pre-deletion actions. Continuing",
-                            exc=repr(exc),
-                            pod_name=pod.metadata.name,
-                        )
-
-                get_k8s().delete_pod(pod.metadata.name, namespace)
-                deleted_pods.append(pod_dict)
-            except Exception as exc:
-                logger.warning(
-                    f"Cleanup failed processing pod {pod.metadata.name}: {repr(exc)}. Continuing"
-                )
-        # TODO: don't wait for pods to be deleted, client should poll the deletion status
-        self._wait_for_pods_deletion(namespace, deleted_pods, label_selector)
-        return deleted_pods
-
-    def _delete_crd_resources(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        namespace: str,
-        label_selector: str = None,
-        force: bool = False,
-        grace_period: int = None,
-    ) -> List[Dict]:
-        if grace_period is None:
-            grace_period = config.runtime_resources_deletion_grace_period
-        crd_group, crd_version, crd_plural = self._get_crd_info()
-        deleted_crds = []
-        try:
-            crd_objects = get_k8s().crdapi.list_namespaced_custom_object(
-                crd_group,
-                crd_version,
-                namespace,
-                crd_plural,
-                label_selector=label_selector,
-            )
-        except ApiException as exc:
-            # ignore error if crd is not defined
-            if exc.status != 404:
-                raise
-        else:
-            for crd_object in crd_objects["items"]:
-                # best effort - don't let one failure in pod deletion to cut the whole operation
-                try:
-                    (
-                        in_terminal_state,
-                        last_update,
-                        desired_run_state,
-                    ) = self._resolve_crd_object_status_info(db, db_session, crd_object)
-                    if not force:
-                        if not in_terminal_state:
-                            continue
-
-                        # give some grace period if we have last update time
-                        now = datetime.now(timezone.utc)
-                        if (
-                            last_update is not None
-                            and last_update + timedelta(seconds=float(grace_period))
-                            > now
-                        ):
-                            continue
-
-                    # if resources are tightly coupled to the run object - we want to perform some actions on the run
-                    # object before deleting them
-                    if self._are_resources_coupled_to_run_object():
-                        try:
-                            self._pre_deletion_runtime_resource_run_actions(
-                                db,
-                                db_session,
-                                crd_object,
-                                desired_run_state,
-                            )
-                        except Exception as exc:
-                            # Don't prevent the deletion for failure in the pre deletion run actions
-                            logger.warning(
-                                "Failure in crd object run pre-deletion actions. Continuing",
-                                exc=err_to_str(exc),
-                                crd_object_name=crd_object["metadata"]["name"],
-                            )
-
-                    get_k8s().delete_crd(
-                        crd_object["metadata"]["name"],
-                        crd_group,
-                        crd_version,
-                        crd_plural,
-                        namespace,
-                    )
-                    deleted_crds.append(crd_object)
-                except Exception:
-                    exc = traceback.format_exc()
-                    crd_object_name = crd_object["metadata"]["name"]
-                    logger.warning(
-                        f"Cleanup failed processing CRD object {crd_object_name}: {err_to_str(exc)}. Continuing"
-                    )
-        self._wait_for_crds_underlying_pods_deletion(deleted_crds, label_selector)
-        return deleted_crds
-
-    def _pre_deletion_runtime_resource_run_actions(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        runtime_resource: Dict,
-        run_state: str,
-    ):
-        project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
-
-        # if cannot resolve related run nothing to do
-        if not uid:
-            if not self._expect_pods_without_uid():
-                logger.warning(
-                    "Could not resolve run uid from runtime resource. Skipping pre-deletion actions",
-                    runtime_resource=runtime_resource,
-                )
-                raise ValueError("Could not resolve run uid from runtime resource")
-            else:
-                return
-
-        logger.info(
-            "Performing pre-deletion actions before cleaning up runtime resources",
-            project=project,
-            uid=uid,
-        )
-
-        self._ensure_run_state(db, db_session, project, uid, name, run_state)
-
-        self._ensure_run_logs_collected(db, db_session, project, uid)
-
-    def _is_runtime_resource_run_in_terminal_state(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        runtime_resource: Dict,
-    ) -> Tuple[bool, Optional[datetime]]:
-        """
-        A runtime can have different underlying resources (like pods or CRDs) - to generalize we call it runtime
-        resource. This function will verify whether the Run object related to this runtime resource is in transient
-        state. This is useful in order to determine whether an object can be removed. for example, a kubejob's pod
-        might be in completed state, but we would like to verify that the run is completed as well to verify the logs
-        were collected before we're removing the pod.
-
-        :returns: bool determining whether the run in terminal state, and the last update time if it exists
-        """
-        project, uid, _ = self._resolve_runtime_resource_run(runtime_resource)
-
-        # if no uid, assume in terminal state
-        if not uid:
-            return True, None
-
-        run = db.read_run(db_session, uid, project)
-        last_update = None
-        last_update_str = run.get("status", {}).get("last_update")
-        if last_update_str is not None:
-            last_update = datetime.fromisoformat(last_update_str)
-
-        if run.get("status", {}).get("state") not in RunStates.terminal_states():
-            return False, last_update
-
-        return True, last_update
-
-    def _list_runs_for_monitoring(
-        self, db: DBInterface, db_session: Session, states: list = None
-    ):
-        runs = db.list_runs(db_session, project="*", states=states)
-        project_run_uid_map = {}
-        run_with_missing_data = []
-        duplicated_runs = []
-        for run in runs:
-            project = run.get("metadata", {}).get("project")
-            uid = run.get("metadata", {}).get("uid")
-            if not uid or not project:
-                run_with_missing_data.append(run.get("metadata", {}))
-                continue
-            current_run = project_run_uid_map.setdefault(project, {}).get(uid)
-
-            # sanity
-            if current_run:
-                duplicated_runs = {
-                    "monitored_run": current_run.get(["metadata"]),
-                    "duplicated_run": run.get(["metadata"]),
-                }
-                continue
-
-            project_run_uid_map[project][uid] = run
-
-        # If there are duplications or runs with missing data it probably won't be fixed
-        # Monitoring is running periodically and we don't want to log on every problem we found which will spam the log
-        # so we're aggregating the problems and logging only once per aggregation
-        if duplicated_runs:
-            logger.warning(
-                "Found duplicated runs (same uid). Heuristically monitoring the first one found",
-                duplicated_runs=duplicated_runs,
-            )
-
-        if run_with_missing_data:
-            logger.warning(
-                "Found runs with missing data. They will not be monitored",
-                run_with_missing_data=run_with_missing_data,
-            )
-
-        return project_run_uid_map
-
-    def _monitor_runtime_resource(
-        self,
-        db: DBInterface,
-        db_session: Session,
-        project_run_uid_map: Dict,
-        runtime_resource: Dict,
-        runtime_resource_is_crd: bool,
-        namespace: str,
-        project: str = None,
-        uid: str = None,
-        name: str = None,
-    ):
-        if not project and not uid and not name:
-            project, uid, name = self._resolve_runtime_resource_run(runtime_resource)
-        if not project or not uid:
-            # Currently any build pod won't have UID and therefore will cause this log message to be printed which
-            # spams the log
-            # TODO: uncomment the log message when builder become a kind / starts having a UID
-            # logger.warning(
-            #     "Could not resolve run project or uid from runtime resource, can not monitor run. Continuing",
-            #     project=project,
-            #     uid=uid,
-            #     runtime_resource_name=runtime_resource["metadata"]["name"],
-            #     namespace=namespace,
-            # )
-            return
-        run = project_run_uid_map.get(project, {}).get(uid)
-        if runtime_resource_is_crd:
-            (
-                _,
-                _,
-                run_state,
-            ) = self._resolve_crd_object_status_info(db, db_session, runtime_resource)
-        else:
-            (
-                _,
-                _,
-                run_state,
-            ) = self._resolve_pod_status_info(db, db_session, runtime_resource)
-        self._update_ui_url(db, db_session, project, uid, runtime_resource, run)
-        _, updated_run_state = self._ensure_run_state(
-            db,
-            db_session,
-            project,
-            uid,
-            name,
-            run_state,
-            run,
-            search_run=False,
-        )
-        if updated_run_state in RunStates.terminal_states():
-            self._ensure_run_logs_collected(db, db_session, project, uid)
-
-    def _build_list_resources_response(
-        self,
-        pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-        crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-        group_by: Optional[
-            mlrun.common.schemas.ListRuntimeResourcesGroupByField
-        ] = None,
-    ) -> Union[
-        mlrun.common.schemas.RuntimeResources,
-        mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-        mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput,
-    ]:
-        if crd_resources is None:
-            crd_resources = []
-        if pod_resources is None:
-            pod_resources = []
-
-        if group_by is None:
-            return mlrun.common.schemas.RuntimeResources(
-                crd_resources=crd_resources, pod_resources=pod_resources
-            )
-        else:
-            if group_by == mlrun.common.schemas.ListRuntimeResourcesGroupByField.job:
-                return self._build_grouped_by_job_list_resources_response(
-                    pod_resources, crd_resources
-                )
-            elif (
-                group_by
-                == mlrun.common.schemas.ListRuntimeResourcesGroupByField.project
-            ):
-                return self._build_grouped_by_project_list_resources_response(
-                    pod_resources, crd_resources
-                )
-            else:
-                raise NotImplementedError(
-                    f"Provided group by field is not supported. group_by={group_by}"
-                )
-
-    def _build_grouped_by_project_list_resources_response(
-        self,
-        pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-        crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-    ) -> mlrun.common.schemas.GroupedByProjectRuntimeResourcesOutput:
-        resources = {}
-        for pod_resource in pod_resources:
-            self._add_resource_to_grouped_by_project_resources_response(
-                resources, "pod_resources", pod_resource
-            )
-        for crd_resource in crd_resources:
-            self._add_resource_to_grouped_by_project_resources_response(
-                resources, "crd_resources", crd_resource
-            )
-        return resources
-
-    def _build_grouped_by_job_list_resources_response(
-        self,
-        pod_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-        crd_resources: List[mlrun.common.schemas.RuntimeResource] = None,
-    ) -> mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput:
-        resources = {}
-        for pod_resource in pod_resources:
-            self._add_resource_to_grouped_by_job_resources_response(
-                resources, "pod_resources", pod_resource
-            )
-        for crd_resource in crd_resources:
-            self._add_resource_to_grouped_by_job_resources_response(
-                resources, "crd_resources", crd_resource
-            )
-        return resources
-
-    def _add_resource_to_grouped_by_project_resources_response(
-        self,
-        resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-        resource_field_name: str,
-        resource: mlrun.common.schemas.RuntimeResource,
-    ):
-        if "mlrun/class" in resource.labels:
-            project = resource.labels.get("mlrun/project", "")
-            mlrun_class = resource.labels["mlrun/class"]
-            kind = self._resolve_kind_from_class(mlrun_class)
-            self._add_resource_to_grouped_by_field_resources_response(
-                project, kind, resources, resource_field_name, resource
-            )
-
-    def _add_resource_to_grouped_by_job_resources_response(
-        self,
-        resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-        resource_field_name: str,
-        resource: mlrun.common.schemas.RuntimeResource,
-    ):
-        if "mlrun/uid" in resource.labels:
-            project = resource.labels.get("mlrun/project", config.default_project)
-            uid = resource.labels["mlrun/uid"]
-            self._add_resource_to_grouped_by_field_resources_response(
-                project, uid, resources, resource_field_name, resource
-            )
-
-    @staticmethod
-    def _add_resource_to_grouped_by_field_resources_response(
-        first_field_value: str,
-        second_field_value: str,
-        resources: mlrun.common.schemas.GroupedByJobRuntimeResourcesOutput,
-        resource_field_name: str,
-        resource: mlrun.common.schemas.RuntimeResource,
-    ):
-        if first_field_value not in resources:
-            resources[first_field_value] = {}
-        if second_field_value not in resources[first_field_value]:
-            resources[first_field_value][
-                second_field_value
-            ] = mlrun.common.schemas.RuntimeResources(
-                pod_resources=[], crd_resources=[]
-            )
-        if not getattr(
-            resources[first_field_value][second_field_value], resource_field_name
-        ):
-            setattr(
-                resources[first_field_value][second_field_value],
-                resource_field_name,
-                [],
-            )
-        getattr(
-            resources[first_field_value][second_field_value], resource_field_name
-        ).append(resource)
-
-    @staticmethod
-    def _resolve_kind_from_class(mlrun_class: str) -> str:
-        class_to_kind_map = {}
-        for kind in mlrun.runtimes.RuntimeKinds.runtime_with_handlers():
-            runtime_handler = mlrun.runtimes.get_runtime_handler(kind)
-            class_values = runtime_handler._get_possible_mlrun_class_label_values()
-            for value in class_values:
-                class_to_kind_map[value] = kind
-        return class_to_kind_map[mlrun_class]
-
-    @staticmethod
-    def _get_run_label_selector(project: str, run_uid: str):
-        return f"mlrun/project={project},mlrun/uid={run_uid}"
-
-    @staticmethod
-    def _ensure_run_logs_collected(
-        db: DBInterface, db_session: Session, project: str, uid: str
-    ):
-        # import here to avoid circular imports
-        import mlrun.api.crud as crud
-
-        log_file_exists, _ = crud.Logs().log_file_exists_for_run_uid(project, uid)
-        if not log_file_exists:
-            # this stays for now for backwards compatibility in case we would not use the log collector but rather
-            # the legacy method to pull logs
-            logs_from_k8s = crud.Logs()._get_logs_legacy_method(
-                db_session, project, uid, source=LogSources.K8S
-            )
-            if logs_from_k8s:
-                logger.info("Storing run logs", project=project, uid=uid)
-                crud.Logs().store_log(logs_from_k8s, project, uid, append=False)
-
-    @staticmethod
-    def _ensure_run_state(
-        db: DBInterface,
-        db_session: Session,
-        project: str,
-        uid: str,
-        name: str,
-        run_state: str,
-        run: Dict = None,
-        search_run: bool = True,
-    ) -> Tuple[bool, str]:
-        if run is None:
-            run = {}
-        if search_run:
-            try:
-                run = db.read_run(db_session, uid, project)
-            except mlrun.errors.MLRunNotFoundError:
-                run = {}
-        if not run:
-            logger.warning(
-                "Run not found. A new run will be created",
-                project=project,
-                uid=uid,
-                desired_run_state=run_state,
-                search_run=search_run,
-            )
-            run = {"metadata": {"project": project, "name": name, "uid": uid}}
-        db_run_state = run.get("status", {}).get("state")
-        if db_run_state:
-            if db_run_state == run_state:
-                return False, run_state
-            # if the current run state is terminal and different than the desired - log
-            if db_run_state in RunStates.terminal_states():
-                # This can happen when the SDK running in the user's Run updates the Run's state to terminal, but
-                # before it exits, when the runtime resource is still running, the API monitoring (here) is executed
-                if run_state not in RunStates.terminal_states():
-                    now = datetime.now(timezone.utc)
-                    last_update_str = run.get("status", {}).get("last_update")
-                    if last_update_str is not None:
-                        last_update = datetime.fromisoformat(last_update_str)
-                        debounce_period = config.runs_monitoring_interval
-                        if last_update > now - timedelta(
-                            seconds=float(debounce_period)
-                        ):
-                            logger.warning(
-                                "Monitoring found non-terminal state on runtime resource but record has recently "
-                                "updated to terminal state. Debouncing",
-                                project=project,
-                                uid=uid,
-                                db_run_state=db_run_state,
-                                run_state=run_state,
-                                last_update=last_update,
-                                now=now,
-                                debounce_period=debounce_period,
-                            )
-                            return False, run_state
-
-                logger.warning(
-                    "Run record has terminal state but monitoring found different state on runtime resource. Changing",
-                    project=project,
-                    uid=uid,
-                    db_run_state=db_run_state,
-                    run_state=run_state,
-                )
-
-        logger.info("Updating run state", run_state=run_state)
-        run.setdefault("status", {})["state"] = run_state
-        run.setdefault("status", {})["last_update"] = now_date().isoformat()
-        db.store_run(db_session, run, uid, project)
-
-        return True, run_state
-
-    @staticmethod
-    def _resolve_runtime_resource_run(runtime_resource: Dict) -> Tuple[str, str, str]:
-        project = (
-            runtime_resource.get("metadata", {}).get("labels", {}).get("mlrun/project")
-        )
-        if not project:
-            project = config.default_project
-        uid = runtime_resource.get("metadata", {}).get("labels", {}).get("mlrun/uid")
-        name = (
-            runtime_resource.get("metadata", {})
-            .get("labels", {})
-            .get("mlrun/name", "no-name")
-        )
-        return project, uid, name
-
-    @staticmethod
-    def _build_pod_resources(pods) -> List[mlrun.common.schemas.RuntimeResource]:
-        pod_resources = []
-        for pod in pods:
-            pod_resources.append(
-                mlrun.common.schemas.RuntimeResource(
-                    name=pod["metadata"]["name"],
-                    labels=pod["metadata"]["labels"],
-                    status=pod["status"],
-                )
-            )
-        return pod_resources
-
-    @staticmethod
-    def _build_crd_resources(
-        custom_objects,
-    ) -> List[mlrun.common.schemas.RuntimeResource]:
-        crd_resources = []
-        for custom_object in custom_objects:
-            crd_resources.append(
-                mlrun.common.schemas.RuntimeResource(
-                    name=custom_object["metadata"]["name"],
-                    labels=custom_object["metadata"]["labels"],
-                    status=custom_object.get("status", {}),
-                )
-            )
-        return crd_resources
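
The removed BaseRuntimeHandler (which, per the file list above, now lives under mlrun/api/runtime_handlers/) composed Kubernetes label selectors by comma-joining clauses. A standalone sketch of that composition, simplified from resolve_label_selector in the removed code; the concrete class and uid selectors are stand-ins for the per-runtime overrides:

    def resolve_label_selector(project, object_id=None, label_selector=None):
        # start from the runtime's class selector, then narrow by the caller's
        # selector, the project, and the object id - mirroring the removed method
        selector = "mlrun/class=job"  # stand-in for _get_default_label_selector()
        if label_selector:
            selector = ",".join([selector, label_selector])
        if project and project != "*":
            selector = ",".join([selector, f"mlrun/project={project}"])
        if object_id:
            # stand-in for the abstract _get_object_label_selector(object_id)
            selector = ",".join([selector, f"mlrun/uid={object_id}"])
        return selector

    print(resolve_label_selector("iris", object_id="abc123"))
    # -> mlrun/class=job,mlrun/project=iris,mlrun/uid=abc123
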